├── .circleci └── config.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── azure-pipelines.yml ├── docs ├── .conf.py.swp ├── Makefile ├── api_docs │ ├── drifter_ml.classification_tests.rst │ ├── drifter_ml.columnar_tests.rst │ ├── drifter_ml.regression_tests.rst │ ├── drifter_ml.rst │ ├── drifter_ml.structural_tests.rst │ ├── drifter_ml.timeseries_tests.rst │ └── modules.rst ├── classification-tests.rst ├── conf.py ├── designing-your-own-tests.rst ├── index.rst ├── introduction.rst ├── make.bat ├── project-setup.rst └── regression-tests.rst ├── drifter_ml ├── __init__.py ├── classification_tests │ ├── __init__.py │ └── classification_tests.py ├── columnar_tests │ ├── __init__.py │ └── columnar_tests.py ├── regression_tests │ ├── __init__.py │ └── regression_tests.py ├── structural_tests │ ├── __init__.py │ └── structural_tests.py └── timeseries_tests │ ├── __init__.py │ └── timeseries_tests.py ├── example_models └── static_examples │ ├── data.csv │ ├── example_model.py │ ├── example_tests.py │ ├── keras_example.py │ ├── model.joblib │ ├── model1.joblib │ ├── model_metadata.json │ ├── prototype_test_framework.py │ └── random_file.py ├── experiments └── simple_example.ipynb ├── requirements.txt ├── setup.py └── tests ├── test_classification_tests.py ├── test_columnar_tests.py ├── test_regression_tests.py └── test_structural_tests.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # 5 | version: 2 6 | jobs: 7 | build: 8 | docker: 9 | # specify the version you desire here 10 | # use `-browsers` prefix for selenium tests, e.g. `3.6.1-browsers` 11 | - image: circleci/python:3.6.1 12 | 13 | # Specify service dependencies here if necessary 14 | # CircleCI maintains a library of pre-built images 15 | # documented at https://circleci.com/docs/2.0/circleci-images/ 16 | # - image: circleci/postgres:9.4 17 | 18 | working_directory: ~/repo 19 | 20 | steps: 21 | - checkout 22 | 23 | # Download and cache dependencies 24 | - restore_cache: 25 | keys: 26 | - v1-dependencies-{{ checksum "requirements.txt" }} 27 | # fallback to using the latest cache if no exact match is found 28 | - v1-dependencies- 29 | 30 | - run: 31 | name: install dependencies 32 | command: | 33 | python3 -m venv venv 34 | . venv/bin/activate 35 | pip install -r requirements.txt 36 | 37 | - save_cache: 38 | paths: 39 | - ./venv 40 | key: v1-dependencies-{{ checksum "requirements.txt" }} 41 | 42 | # run tests! 43 | # this example uses Django's built-in test-runner 44 | # other common Python testing frameworks include pytest and nose 45 | # https://pytest.org 46 | # https://nose.readthedocs.io 47 | - run: 48 | name: Install python 49 | command: | 50 | python3 -m venv venv 51 | . venv/bin/activate 52 | python -m pip install pytest --user 53 | - run: 54 | name: Install library 55 | command: | 56 | python3 -m venv venv 57 | . venv/bin/activate 58 | python setup.py install 59 | - run: 60 | name: run tests 61 | command: | 62 | . venv/bin/activate 63 | python -m pytest tests 64 | - run: 65 | name: Install codecov 66 | command: | 67 | . 
venv/bin/activate 68 | python -m pip install codecov pytest-cov 69 | cd tests && pytest --cov-report xml --cov=drifter_ml ./* 70 | - run: 71 | name: run codecov 72 | command: | 73 | curl -s https://codecov.io/bash | bash -s - -t 6875d787-f809-4e64-909b-c672e8845796 74 | - store_artifacts: 75 | path: ./tests/htmlcov 76 | destination: ./tests/htmlcov 77 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | _static 3 | _templates 4 | *pyc 5 | *py~ 6 | __pycache__ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Eric Schles 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include *.py 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ML Testing 2 | 3 | The goal of this module is to create a flexible and easy to use module for testing machine learning models, specifically those in scikit-learn. 
4 | 5 | The tests will be readable enough that anyone can extend them to other frameworks and APIs with the major notions kept the same, but more or less the ideas will be extended, no work will be taken in this library to extend passed the scikit-learn API. 6 | 7 | You can [read the docs](https://drifter-ml.readthedocs.io/en/latest/) for a more detailed explaination. 8 | 9 | [A video explaining the idea behind the framework](https://www.youtube.com/watch?v=bZtdnFVAfbs&t=3s) 10 | 11 | [![Documentation Status](https://readthedocs.org/projects/drifter-ml/badge/?version=latest)](https://drifter-ml.readthedocs.io/en/latest/?badge=latest) 12 | [![CircleCI](https://circleci.com/gh/EricSchles/drifter_ml.svg?style=svg)](https://circleci.com/gh/EricSchles/drifter_ml) 13 | [![Version Number](https://img.shields.io/pypi/v/drifter-ml.svg)](https://pypi.org/project/drifter-ml/) 14 | [![Downloads Per Month](https://img.shields.io/pypi/dm/drifter-ml.svg)](https://pypi.org/project/drifter-ml/) 15 | [![codecov](https://codecov.io/gh/EricSchles/drifter_ml/branch/master/graph/badge.svg)](https://codecov.io/gh/EricSchles/drifter_ml) 16 | 17 | ## Tests Covered 18 | 19 | * Testing Against Metrics 20 | * Classification Tests 21 | * Rule Based Testing: 22 | * precision lower boundary 23 | * recall lower boundary 24 | * f1 score lower boundary 25 | * AUC lower boundary 26 | * precision lower boundary per class 27 | * recall lower boundary per class 28 | * f1 score lower boundary per class 29 | * AUC lower boundary per class 30 | * Decision Based Testing: 31 | * precision fold below average 32 | * recall fold below average 33 | * f1 fold below average 34 | * AUC fold below average 35 | * precision fold below average per class 36 | * recall fold below average per class 37 | * f1 fold below average per class 38 | * AUC fold below average per class 39 | * Against New Predictions 40 | * proportion of predictions per class 41 | * class imbalance tests 42 | * probability distribution similarity tests 43 | * calibration tests 44 | * environmental impact tests 45 | * [energyusage](https://pypi.org/project/energyusage/) upper bound test 46 | * Regression Tests 47 | * Rule Based Testing: 48 | * Mean Squared Error upper boundary 49 | * Median Absolute Error upper boundary 50 | * Decision Based Testing: 51 | * Mean Squared Error fold above average 52 | * Median Absolute Error fold above average 53 | * Testing Against Run Time Performance 54 | * prediction run time for simulated samples of size X 55 | * Testing Against Input Data 56 | * percentage of correct imputes for any columns requiring imputation 57 | * dataset testing - http://www.vldb.org/pvldb/vol11/p1781-schelter.pdf 58 | * Memoryful Tests 59 | * cluster testing - this is about the overall structure of the data 60 | If the number of clusters increases or decreases substantially that 61 | should be an indicator that the data has changed enough that things 62 | should possibly be rerun 63 | * correlation testing - this is about ensuring that the correlation for a given column with previous data collected in the past does not change very much. If the data does change then the model should possibly be rerun. 64 | * shape testing - this is about ensuring the general shape of for the given column does not change much over time. The idea here is the same as the correlation tests. 65 | 66 | ## Possible Issues 67 | 68 | Some known issues with this, any machine learning tests are going to require human interaction because of type 1 and type 2 error for statistical tests. 
Additionally, one simply needs to interrogate models from a lot of angles. It can't be from just one angle. So please use with care! 69 | 70 | ## Future Features 71 | 72 | * cross validation score testing 73 | * add custom loss function 74 | * add custom accuracy function 75 | * add these tests: https://www.datasciencecentral.com/profiles/blogs/a-plethora-of-original-underused-statistical-tests 76 | * clustering for classification 77 | * Unsupervised and semi supervised tests 78 | * verify similarity in clusters to similarity in labels 79 | * generate a small representative set of labels and then propagate other labels 80 | 81 | 82 | ## References 83 | 84 | * https://dzone.com/articles/quality-assurancetesting-the-machine-learning-mode 85 | * https://medium.com/datadriveninvestor/how-to-perform-quality-assurance-for-ml-models-cef77bbbcfb 86 | * Explaination of UAT: https://www.techopedia.com/definition/3887/user-acceptance-testing-uat 87 | * https://mice.cs.columbia.edu/getTechreport.php?techreportID=419&format=pdf 88 | * https://www.xenonstack.com/blog/unit-testing-tdd-bdd-deep-machine-learning/ -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | # Python package 2 | # Create and test a Python package on multiple Python versions. 3 | # Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: 4 | # https://docs.microsoft.com/azure/devops/pipelines/languages/python 5 | 6 | trigger: 7 | - master 8 | 9 | jobs: 10 | 11 | - job: 'Test' 12 | pool: 13 | vmImage: 'Ubuntu-16.04' 14 | strategy: 15 | matrix: 16 | Python36: 17 | python.version: '3.6' 18 | Python37: 19 | python.version: '3.7' 20 | maxParallel: 4 21 | 22 | steps: 23 | - task: UsePythonVersion@0 24 | inputs: 25 | versionSpec: '$(python.version)' 26 | architecture: 'x64' 27 | 28 | - script: python -m pip install --upgrade pip && pip install -r requirements.txt 29 | displayName: 'Install dependencies' 30 | 31 | - script: python setup.py install 32 | displayName: 'Install library' 33 | 34 | - script: | 35 | pip install pytest 36 | pytest tests --doctest-modules --junitxml=junit/test-results.xml 37 | displayName: 'pytest' 38 | 39 | - task: PublishTestResults@2 40 | inputs: 41 | testResultsFiles: '**/test-results.xml' 42 | testRunTitle: 'Python $(python.version)' 43 | condition: succeededOrFailed() 44 | 45 | - job: 'Publish' 46 | dependsOn: 'Test' 47 | pool: 48 | vmImage: 'Ubuntu-16.04' 49 | 50 | steps: 51 | - task: UsePythonVersion@0 52 | inputs: 53 | versionSpec: '3.x' 54 | architecture: 'x64' 55 | 56 | - script: python setup.py sdist 57 | displayName: 'Build sdist' 58 | -------------------------------------------------------------------------------- /docs/.conf.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EricSchles/drifter_ml/198a2e4a0b6310765e551f3122ff0ea8b04ed900/docs/.conf.py.swp -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 
11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/api_docs/drifter_ml.classification_tests.rst: -------------------------------------------------------------------------------- 1 | drifter\_ml.classification\_tests package 2 | ========================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | drifter\_ml.classification\_tests.classification\_tests module 8 | -------------------------------------------------------------- 9 | 10 | .. automodule:: drifter_ml.classification_tests.classification_tests 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: drifter_ml.classification_tests 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/api_docs/drifter_ml.columnar_tests.rst: -------------------------------------------------------------------------------- 1 | drifter\_ml.columnar\_tests package 2 | =================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | drifter\_ml.columnar\_tests.columnar\_tests module 8 | -------------------------------------------------- 9 | 10 | .. automodule:: drifter_ml.columnar_tests.columnar_tests 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: drifter_ml.columnar_tests 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/api_docs/drifter_ml.regression_tests.rst: -------------------------------------------------------------------------------- 1 | drifter\_ml.regression\_tests package 2 | ===================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | drifter\_ml.regression\_tests.regression\_tests module 8 | ------------------------------------------------------ 9 | 10 | .. automodule:: drifter_ml.regression_tests.regression_tests 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: drifter_ml.regression_tests 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/api_docs/drifter_ml.rst: -------------------------------------------------------------------------------- 1 | drifter\_ml package 2 | =================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | drifter_ml.classification_tests 10 | drifter_ml.columnar_tests 11 | drifter_ml.regression_tests 12 | drifter_ml.structural_tests 13 | drifter_ml.timeseries_tests 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. 
automodule:: drifter_ml 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/api_docs/drifter_ml.structural_tests.rst: -------------------------------------------------------------------------------- 1 | drifter\_ml.structural\_tests package 2 | ===================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | drifter\_ml.structural\_tests.structural\_tests module 8 | ------------------------------------------------------ 9 | 10 | .. automodule:: drifter_ml.structural_tests.structural_tests 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: drifter_ml.structural_tests 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/api_docs/drifter_ml.timeseries_tests.rst: -------------------------------------------------------------------------------- 1 | drifter\_ml.timeseries\_tests package 2 | ===================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | drifter\_ml.timeseries\_tests.timeseries\_tests module 8 | ------------------------------------------------------ 9 | 10 | .. automodule:: drifter_ml.timeseries_tests.timeseries_tests 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: drifter_ml.timeseries_tests 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/api_docs/modules.rst: -------------------------------------------------------------------------------- 1 | drifter_ml 2 | ========== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | drifter_ml 8 | -------------------------------------------------------------------------------- /docs/classification-tests.rst: -------------------------------------------------------------------------------- 1 | #################### 2 | Classification Tests 3 | #################### 4 | 5 | The goal of the following set of tests is to accomplish some monitoring goals: 6 | 7 | 1. Establish baselines for model performance in production per class 8 | 9 | 2. Establish maximum processing time for various volumes of data, through the statistical model 10 | 11 | 3. Ensure that the current model in production is the best available model according to a set of predefined measures 12 | 13 | Let's look at each of these classes of tests now. 14 | 15 | 16 | Lower Bound Classification Measures 17 | =================================== 18 | 19 | Each of the following examples ensures that your classifier meets a minimum criteria, which should be decided based on the need of your use-case. One simple way to do this is to define failure by how many dollars it will cost you. 20 | 21 | Precision, Recall and F1 score are great tools for ensuring your classifier optimizes for minimal misclassification, however you define it. 22 | 23 | That is why they are basis of the set of tests found below. 
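To make the intent concrete before diving into the library, here is a rough sketch of what a per-class lower boundary check boils down to, written directly against scikit-learn's metrics; the helper name and the boundaries passed to it are purely illustrative and are not part of drifter_ml's API::

    from sklearn.metrics import precision_recall_fscore_support

    def precision_lower_boundary_per_class(y_true, y_pred, lower_boundaries):
        # per-class precision, recall and f1 for the classes we care about
        classes = sorted(lower_boundaries.keys())
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, labels=classes
        )
        # the check fails if any class falls below its configured boundary
        return all(p >= lower_boundaries[klass]
                   for klass, p in zip(classes, precision))

The tests that follow wrap this kind of check up for you, running it against a trained model and a labeled dataset.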
24 | 25 | 26 | Classifier Test Example - Model Metrics 27 | ======================================= 28 | 29 | Suppose you had the following model:: 30 | 31 | from sklearn import tree 32 | import pandas as pd 33 | import numpy as np 34 | import joblib 35 | 36 | df = pd.DataFrame() 37 | for _ in range(1000): 38 | a = np.random.normal(0, 1) 39 | b = np.random.normal(0, 3) 40 | c = np.random.normal(12, 4) 41 | if a + b + c > 11: 42 | target = 1 43 | else: 44 | target = 0 45 | df = df.append({ 46 | "A": a, 47 | "B": b, 48 | "C": c, 49 | "target": target 50 | }, ignore_index=True) 51 | 52 | clf = tree.DecisionTreeClassifier() 53 | X = df[["A", "B", "C"]] 54 | clf.fit(X, df["target"]) 55 | joblib.dump(clf, "model.joblib") 56 | df.to_csv("data.csv") 57 | 58 | We could write the following set of tests to ensure this model does well:: 59 | 60 | from drifter_ml.classification_tests import ClassificationTests 61 | import joblib 62 | import pandas as pd 63 | 64 | def test_precision(): 65 | df = pd.read_csv("data.csv") 66 | column_names = ["A", "B", "C"] 67 | target_name = "target" 68 | clf = joblib.load("model.joblib") 69 | 70 | test_suite = ClassificationTests(clf, 71 | df, target_name, column_names) 72 | classes = list(df.target.unique()) 73 | assert test_suite.precision_lower_boundary_per_class( 74 | {klass: 0.9 for klass in classes} 75 | ) 76 | 77 | def test_recall(): 78 | df = pd.read_csv("data.csv") 79 | column_names = ["A", "B", "C"] 80 | target_name = "target" 81 | clf = joblib.load("model.joblib") 82 | 83 | test_suite = ClassificationTests(clf, 84 | df, target_name, column_names) 85 | classes = list(df.target.unique()) 86 | assert test_suite.recall_lower_boundary_per_class( 87 | {klass: 0.9 for klass in classes} 88 | ) 89 | 90 | def test_f1(): 91 | df = pd.read_csv("data.csv") 92 | column_names = ["A", "B", "C"] 93 | target_name = "target" 94 | clf = joblib.load("model.joblib") 95 | 96 | test_suite = ClassificationTests(clf, 97 | df, target_name, column_names) 98 | classes = list(df.target.unique()) 99 | assert test_suite.f1_lower_boundary_per_class( 100 | {klass: 0.9 for klass in classes} 101 | ) 102 | 103 | 104 | Or you could simply write one test for all three:: 105 | 106 | from drifter_ml.classification_tests import ClassificationTests 107 | import joblib 108 | import pandas as pd 109 | 110 | def test_precision_recall_f1(): 111 | df = pd.read_csv("data.csv") 112 | column_names = ["A", "B", "C"] 113 | target_name = "target" 114 | clf = joblib.load("model.joblib") 115 | 116 | test_suite = ClassificationTests(clf, 117 | df, target_name, column_names) 118 | classes = list(df.target.unique()) 119 | assert test_suite.classifier_testing_per_class( 120 | {klass: 0.9 for klass in classes}, 121 | {klass: 0.9 for klass in classes}, 122 | {klass: 0.9 for klass in classes} 123 | ) 124 | 125 | Regardless of which test you choose, you get complete flexibility to ensure your model always meets the minimum criteria so that your costs are minimized, given constraints. 126 | 127 | 128 | Classifier Test Example - Model Speed 129 | ===================================== 130 | 131 | Additionally, you can test to ensure your classifier performs, even under load. 
Assume we have the same model as before:: 132 | 133 | from sklearn import tree 134 | import pandas as pd 135 | import numpy as np 136 | import joblib 137 | 138 | df = pd.DataFrame() 139 | for _ in range(1000): 140 | a = np.random.normal(0, 1) 141 | b = np.random.normal(0, 3) 142 | c = np.random.normal(12, 4) 143 | if a + b + c > 11: 144 | target = 1 145 | else: 146 | target = 0 147 | df = df.append({ 148 | "A": a, 149 | "B": b, 150 | "C": c, 151 | "target": target 152 | }, ignore_index=True) 153 | 154 | clf = tree.DecisionTreeClassifier() 155 | X = df[["A", "B", "C"]] 156 | clf.fit(X, df["target"]) 157 | joblib.dump(clf, "model.joblib") 158 | df.to_csv("data.csv") 159 | 160 | Now we test to ensure the model predicts new labels within our constraints:: 161 | 162 | from drifter_ml.classification_tests import ClassificationTests 163 | import joblib 164 | import pandas as pd 165 | 166 | def test_precision_recall_f1_speed(): 167 | df = pd.read_csv("data.csv") 168 | column_names = ["A", "B", "C"] 169 | target_name = "target" 170 | clf = joblib.load("model.joblib") 171 | 172 | test_suite = ClassificationTests(clf, 173 | df, target_name, column_names) 174 | sample_sizes = [i for i in range(100, 1000, 100)] 175 | max_run_times = [100 for _ in range(len(sample_sizes))] 176 | assert test_suite.run_time_stress_test( 177 | sample_sizes, max_run_times 178 | ) 179 | 180 | This test ensures that, for simulated samples of 100 up to 900 elements, the model never takes longer than the maximum run time we set for each sample size. 181 | 182 | Cross Validation Based Testing 183 | ============================== 184 | 185 | In the last section we asked questions of our model with respect to a lower boundary, both for various model measures and for run time. Now, armed with cross validation, we can ask questions about sections of our dataset, to ensure that the measures we found are an accurate representation across the dataset, rather than one global metric over the entire dataset. Just to make sure we are all on the same page: cross validation breaks the dataset into unique samples, called folds; each fold is used in turn as the test sample while all the other folds are used for training, the score for each validation fold is recorded, and then the model is discarded. For more information and a detailed introduction see https://machinelearningmastery.com/k-fold-cross-validation/. 186 | 187 | The advantage of checking our model in this way is that it is now less likely that the model is just memorizing the training data, and more likely that it will generalize to other examples. This happens because the model scores are tested on a more limited dataset and also because "k" samples, the tuning parameter in cross validation, are tested to ensure the model performance is consistent. 188 | 189 | This also yields some advantages for testing, because now we can verify that our lower boundary precision, recall or f1 score holds across many folds, rather than being one global lower bound which may not be true on some subset of the data. This gives us more confidence in our model's overall efficacy, but it also requires that we have enough data to ensure our model can learn something. 190 | 191 | Sadly I could find no good rules of thumb, but I'd say you need at least something like 1000 data points per fold, and it's probably best to never go above 20 folds unless your dataset is truly massive, like in the gigabytes.
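If you want to look at the per-fold scores that this style of testing reasons about, scikit-learn's ``cross_val_score`` will show them to you directly; the choice of 10 folds and the ``precision`` scorer below are illustrative, not requirements::

    from sklearn import tree
    from sklearn.model_selection import cross_val_score
    import pandas as pd

    df = pd.read_csv("data.csv")
    X = df[["A", "B", "C"]]
    y = df["target"]

    clf = tree.DecisionTreeClassifier()
    # one precision score per fold - these per-fold values are what the
    # cross validation tests below compare against a lower boundary
    fold_scores = cross_val_score(clf, X, y, cv=10, scoring="precision")
    print(fold_scores, fold_scores.mean())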
192 | 193 | 194 | Classifier Test Example - Cross Validation Lower Bound Precision 195 | ================================================================ 196 | 197 | This example won't be that different from what you've seen before, except now we can tune on the number of folds to include. Let's spice things up by using a keras classifier instead of a scikit learn one:: 198 | 199 | from keras.models import Sequential 200 | from keras.layers import Dense 201 | from keras.wrappers.scikit_learn import KerasClassifier 202 | import pandas as pd 203 | import numpy as np 204 | import joblib 205 | 206 | # Function to create model, required for KerasClassifier 207 | def create_model(): 208 | # create model 209 | model = Sequential() 210 | model.add(Dense(12, input_dim=3, activation='relu')) 211 | model.add(Dense(8, activation='relu')) 212 | model.add(Dense(1, activation='sigmoid')) 213 | # Compile model 214 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 215 | return model 216 | 217 | # fix random seed for reproducibility 218 | df = pd.DataFrame() 219 | for _ in range(1000): 220 | a = np.random.normal(0, 1) 221 | b = np.random.normal(0, 3) 222 | c = np.random.normal(12, 4) 223 | if a + b + c > 11: 224 | target = 1 225 | else: 226 | target = 0 227 | df = df.append({ 228 | "A": a, 229 | "B": b, 230 | "C": c, 231 | "target": target 232 | }, ignore_index=True) 233 | 234 | # split into input (X) and output (Y) variables 235 | # create model 236 | clf = KerasClassifier(build_fn=create_model, epochs=150, batch_size=10, verbose=0) 237 | X = df[["A", "B", "C"]] 238 | clf.fit(X, df["target"]) 239 | joblib.dump(clf, "model.joblib") 240 | df.to_csv("data.csv") 241 | 242 | Now that we have the model and data saved, let's write the test:: 243 | 244 | from drifter_ml.classification_tests import ClassificationTests 245 | import joblib 246 | import pandas as pd 247 | 248 | def test_cv_precision_lower_boundary(): 249 | df = pd.read_csv("data.csv") 250 | column_names = ["A", "B", "C"] 251 | target_name = "target" 252 | clf = joblib.load("model.joblib") 253 | 254 | test_suite = ClassificationTests(clf, 255 | df, target_name, column_names) 256 | lower_boundary = 0.9 257 | test_suite.cross_val_precision_lower_boundary( 258 | lower_boundary 259 | ) 260 | 261 | There are a few things to notice here: 262 | 263 | 1. The set up didn't change - we train the model the same way, we store the model the same way, we pass the model in the same way. 264 | 265 | 2. We aren't specifying percision per class - we will see examples of tests like that below, but because of the added stringency of limiting our training set, as well as training it across several samples of the dataset, sometimes called folds, we now don't need to specify as much granularity. What we are really testing here is somewhat different - we want to make sure no samples of the dataset form significantly worse than the average. What we are really looking for is anomalous samples of the data, that the model does much worse on. Because any training set is just a sample, if a given subsample does much worse than others, then we need to ask the question - is this given subsample representative of a pattern we may see in the future? Is it truly an anamoly? If it's not, that's usually a strong indicator that our model needs some work. 
266 | 267 | Classifier Test Example - Cross Validation Average 268 | =================================================== 269 | 270 | In the above example we test to ensure that none of the folds fall below a precision of 0.9 per fold. But what if we only care whether one of the folds does significantly worse than the others, and don't actually care whether all the folds meet the minimum criteria? After all, some level of any model measure is determined by how much data you train it on. It could be the case that we are right on the edge of having enough labeled data to train the model for all the imperative cases, but not enough to really ensure 90% precision, recall or some other measure. If that is the case, then we could simply look to see if any of the folds does significantly worse than some notion of centrality, which could be a red flag on its own. 271 | 272 | Here we can set an allowed deviance from the center for precision, recall or f1 score. If a given fold falls outside that deviance from centrality then we believe some intervention needs to be taken. Let's look at an example:: 273 | 274 | from sklearn import tree 275 | import pandas as pd 276 | import numpy as np 277 | import joblib 278 | 279 | df = pd.DataFrame() 280 | for _ in range(1000): 281 | a = np.random.normal(0, 1) 282 | b = np.random.normal(0, 3) 283 | c = np.random.normal(12, 4) 284 | if a + b + c > 11: 285 | target = 1 286 | else: 287 | target = 0 288 | df = df.append({ 289 | "A": a, 290 | "B": b, 291 | "C": c, 292 | "target": target 293 | }, ignore_index=True) 294 | 295 | clf = tree.DecisionTreeClassifier() 296 | X = df[["A", "B", "C"]] 297 | clf.fit(X, df["target"]) 298 | joblib.dump(clf, "model.joblib") 299 | df.to_csv("data.csv") 300 | 301 | 302 | Let's see a test:: 303 | 304 | from drifter_ml.classification_tests import ClassificationTests 305 | import joblib 306 | import pandas as pd 307 | 308 | def test_cv_precision_anomaly_detection(): 309 | df = pd.read_csv("data.csv") 310 | column_names = ["A", "B", "C"] 311 | target_name = "target" 312 | clf = joblib.load("model.joblib") 313 | 314 | test_suite = ClassificationTests(clf, 315 | df, target_name, column_names) 316 | precision_tolerance = 0.2 317 | test_suite.cross_val_precision_anomaly_detection( 318 | precision_tolerance, method='mean' 319 | ) 320 | 321 | Here, instead of setting an expectation for the precision itself, we set an expectation for the deviance from the average precision. So if the average precision is 0.7 and one of the fold scores is less than 0.5, then the test fails. It's still important to have some lower boundary in place as well. However, we can be less stringent if we include this test.
A more complete test suite would likely be something like this:: 322 | 323 | from drifter_ml.classification_tests import ClassificationTests 324 | import joblib 325 | import pandas as pd 326 | 327 | def test_cv_precision_anomaly_detection(): 328 | df = pd.read_csv("data.csv") 329 | column_names = ["A", "B", "C"] 330 | target_name = "target" 331 | clf = joblib.load("model.joblib") 332 | 333 | test_suite = ClassificationTests(clf, 334 | df, target_name, column_names) 335 | precision_tolerance = 0.2 336 | test_suite.cross_val_precision_anomaly_detection( 337 | precision_tolerance, method='mean' 338 | ) 339 | 340 | def test_cv_precision_lower_boundary(): 341 | df = pd.read_csv("data.csv") 342 | column_names = ["A", "B", "C"] 343 | target_name = "target" 344 | clf = joblib.load("model.joblib") 345 | 346 | test_suite = ClassificationTests(clf, 347 | df, target_name, column_names) 348 | min_average = 0.7 349 | test_suite.cross_val_precision_avg( 350 | min_average, method='mean' 351 | ) 352 | 353 | Now we can say for sure that the precision should be at least 0.7 on average, but a given fold can fall up to 0.2 below that average before we raise an error. 354 | 355 | Classifier Test Example - Cross Validation Anomaly Detection With Spread 356 | ======================================================================== 357 | 358 | In the previous example we looked for a specific deviance; now we'll make use of some properties of statistics to define what exactly we mean by an anomalous fold. In order to do this, we'll look at deviance with respect to spread. To make this concrete, let's walk through what that means:: 359 | 360 | from drifter_ml.classification_tests import ClassificationTests 361 | import joblib 362 | import pandas as pd 363 | 364 | def test_cv_precision_anomaly_detection(): 365 | df = pd.read_csv("data.csv") 366 | column_names = ["A", "B", "C"] 367 | target_name = "target" 368 | clf = joblib.load("model.joblib") 369 | 370 | test_suite = ClassificationTests(clf, 371 | df, target_name, column_names) 372 | precision_tolerance = 0.2 373 | test_suite.cross_val_precision_anomaly_detection( 374 | precision_tolerance, method='mean' 375 | ) 376 | 377 | Before we go through what's happening, let's recall what cross validation is. The basic notion of cross validation is that random samples, called folds, are taken from the training set; each fold is held out in turn as the test set while the algorithm is trained on the remaining folds. For this reason, it is necessary that you have enough data that a pattern can be learned from it. For more information on k-fold check out this article: https://machinelearningmastery.com/k-fold-cross-validation/. 378 | 379 | As you can see we require a precision tolerance of 0.2 per fold of the cross validation. To understand how this comes into play, let's look at how cross validation anomaly detection is done generally in the library:: 380 | 381 | 1. decide on the measure of center to use 382 | 2. calculate the center (for example, the average) of all the scores (each score comes from a fold) 383 | 3. compute the list of deviances from that center 384 | 4. determine if the deviance from the center is ever greater than the tolerance 385 | 386 | So basically, this is a test for consistency on different folds of the data. If the model's performance deviates beyond the tolerance bound on any of the folds, then the test fails. This is really useful if you need your model to behave in an expected way most of the time.
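To make the four steps above concrete, here is a small sketch of the general logic; it is an illustration of the idea rather than the library's exact implementation::

    import numpy as np

    def fold_scores_within_tolerance(scores, tolerance, method="mean"):
        # 1. decide on the measure of center
        center = np.mean(scores) if method == "mean" else np.median(scores)
        # 2. and 3. compute each fold's deviance from that center
        deviances = np.abs(np.array(scores) - center)
        # 4. fail if any fold deviates by more than the tolerance
        return bool(np.all(deviances <= tolerance))

For example, fold scores of ``[0.91, 0.88, 0.67]`` with a tolerance of 0.2 pass, because no fold is more than 0.2 away from the mean of roughly 0.82.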
-------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | import os 16 | import sys 17 | sys.path.insert(0, os.path.abspath('.')) 18 | sys.path.insert(0, os.path.abspath('../')) 19 | 20 | 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = 'drifter_ml' 24 | copyright = '2019, Eric Schles' 25 | author = 'Eric Schles' 26 | 27 | # The short X.Y version 28 | version = '' 29 | # The full version, including alpha/beta/rc tags 30 | release = '0.20' 31 | 32 | 33 | # -- General configuration --------------------------------------------------- 34 | 35 | # If your documentation needs a minimal Sphinx version, state it here. 36 | # 37 | # needs_sphinx = '1.0' 38 | 39 | # Add any Sphinx extension module names here, as strings. They can be 40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 41 | # ones. 42 | extensions = [ 43 | 'sphinx.ext.autodoc', 44 | 'sphinx.ext.coverage', 45 | 'sphinx.ext.napoleon' 46 | ] 47 | 48 | # Add any paths that contain templates here, relative to this directory. 49 | templates_path = ['_templates'] 50 | 51 | # The suffix(es) of source filenames. 52 | # You can specify multiple suffix as a list of string: 53 | # 54 | # source_suffix = ['.rst', '.md'] 55 | source_suffix = '.rst' 56 | 57 | # The master toctree document. 58 | master_doc = 'index' 59 | 60 | # The language for content autogenerated by Sphinx. Refer to documentation 61 | # for a list of supported languages. 62 | # 63 | # This is also used if you do content translation via gettext catalogs. 64 | # Usually you set "language" from the command line for these cases. 65 | language = None 66 | 67 | # List of patterns, relative to source directory, that match files and 68 | # directories to ignore when looking for source files. 69 | # This pattern also affects html_static_path and html_extra_path. 70 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 71 | 72 | # The name of the Pygments (syntax highlighting) style to use. 73 | pygments_style = None 74 | 75 | 76 | # -- Options for HTML output ------------------------------------------------- 77 | 78 | # The theme to use for HTML and HTML Help pages. See the documentation for 79 | # a list of builtin themes. 80 | # 81 | html_theme = 'sphinx_rtd_theme' 82 | 83 | # Theme options are theme-specific and customize the look and feel of a theme 84 | # further. For a list of options available for each theme, see the 85 | # documentation. 86 | # 87 | # html_theme_options = {} 88 | 89 | # Add any paths that contain custom static files (such as style sheets) here, 90 | # relative to this directory. They are copied after the builtin static files, 91 | # so a file named "default.css" will overwrite the builtin "default.css". 
92 | html_static_path = ['_static'] 93 | 94 | # Custom sidebar templates, must be a dictionary that maps document names 95 | # to template names. 96 | # 97 | # The default sidebars (for documents that don't match any pattern) are 98 | # defined by theme itself. Builtin themes are using these templates by 99 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 100 | # 'searchbox.html']``. 101 | # 102 | # html_sidebars = {} 103 | 104 | 105 | # -- Options for HTMLHelp output --------------------------------------------- 106 | 107 | # Output file base name for HTML help builder. 108 | htmlhelp_basename = 'drifter_mldoc' 109 | 110 | 111 | # -- Options for LaTeX output ------------------------------------------------ 112 | 113 | latex_elements = { 114 | # The paper size ('letterpaper' or 'a4paper'). 115 | # 116 | # 'papersize': 'letterpaper', 117 | 118 | # The font size ('10pt', '11pt' or '12pt'). 119 | # 120 | # 'pointsize': '10pt', 121 | 122 | # Additional stuff for the LaTeX preamble. 123 | # 124 | # 'preamble': '', 125 | 126 | # Latex figure (float) alignment 127 | # 128 | # 'figure_align': 'htbp', 129 | } 130 | 131 | # Grouping the document tree into LaTeX files. List of tuples 132 | # (source start file, target name, title, 133 | # author, documentclass [howto, manual, or own class]). 134 | latex_documents = [ 135 | (master_doc, 'drifter_ml.tex', 'drifter\\_ml Documentation', 136 | 'Eric Schles', 'manual'), 137 | ] 138 | 139 | 140 | # -- Options for manual page output ------------------------------------------ 141 | 142 | # One entry per manual page. List of tuples 143 | # (source start file, name, description, authors, manual section). 144 | man_pages = [ 145 | (master_doc, 'drifter_ml', 'drifter_ml Documentation', 146 | [author], 1) 147 | ] 148 | 149 | 150 | # -- Options for Texinfo output ---------------------------------------------- 151 | 152 | # Grouping the document tree into Texinfo files. List of tuples 153 | # (source start file, target name, title, author, 154 | # dir menu entry, description, category) 155 | texinfo_documents = [ 156 | (master_doc, 'drifter_ml', 'drifter_ml Documentation', 157 | author, 'drifter_ml', 'One line description of project.', 158 | 'Miscellaneous'), 159 | ] 160 | 161 | 162 | # -- Options for Epub output ------------------------------------------------- 163 | 164 | # Bibliographic Dublin Core info. 165 | epub_title = project 166 | 167 | # The unique identifier of the text. This can be a ISBN number 168 | # or the project homepage. 169 | # 170 | # epub_identifier = '' 171 | 172 | # A unique identification for the text. 173 | # 174 | # epub_uid = '' 175 | 176 | # A list of files that should not be packed into the epub file. 177 | epub_exclude_files = ['search.html'] 178 | -------------------------------------------------------------------------------- /docs/designing-your-own-tests.rst: -------------------------------------------------------------------------------- 1 | ######################## 2 | Designing your own tests 3 | ######################## 4 | 5 | Before we jump into the API and all the premade tests that have been written to make your life easier, let's talk about a process for designing your own machine learning tests. The reason for doing this is important, machine learning testing is not like other software engineering tests. That's because software engineering tests are deterministic, like software engineering code ought to be. 
However, when you write tests for your data or your machine learning model, you need to account for the probabilistic nature of the code you are writing. The goal, therefore is much more fuzzy. But the process defined below should help you out. 6 | 7 | 8 | It's About Proving Or Disproving Assumptions 9 | ============================================ 10 | 11 | There are a standard set of steps to any machine learning project: 12 | 13 | 1. Exploratory Analysis 14 | 2. Data Cleaning 15 | 3. Model Evaluation 16 | 4. Productionalizing The Model 17 | 5. Monitoring The Model 18 | 19 | Machine learning tests are really about monitoring, but the big open question is, what do you monitor? 20 | 21 | Monitoring the steps you took in 1-3 above, gives at least a base line. There will likely be other things to account for and monitor once you go into production, but what you've found in evaluation will likely be helpful later. So that should inform your first set of tests. 22 | 23 | 24 | Data Monitoring Tests 25 | ====================== 26 | 27 | Specifically, we can monitor the data by: 28 | 29 | * checking to see if any descriptive statistics you found have changed substantially 30 | * checking to see if current data is correlated with previous data per column 31 | * checking to see if columns that were correlated or uncorrelated in past data remain that way 32 | * checking to see if the number of clusters in the data has changed in a meaningful way 33 | * checking to see whether the number of missing values stays consistent between new and old data, 34 | * checking to see certain monotonicity requirements between columns remain consistent 35 | 36 | It is an imperative to model the data because your model is merely a function of your data. If your data is bad or changes in some important way, your model will be useless. Also, there may be more measures you used to evaluate the data and those may become important features of whatever model you build later on. Therefore, making sure your data continues to follow the trends found previously may be of great import. Otherwise, your model might be wrong and you'd never know it. 37 | 38 | 39 | Model Monitoring Tests 40 | ======================= 41 | 42 | Additionally, we can monitor the model itself: 43 | 44 | * checking to see if the model meets all metric requirements as specified by the business use-case 45 | * checking to see if the model does better than some other test model on all measures of interest 46 | 47 | 48 | System Monitoring Tests 49 | ======================== 50 | 51 | Finally, there are also traditional tests one should run: 52 | 53 | * making sure the serialized model exists where expected 54 | * making sure the data exists where expected 55 | * making sure data can flow into the system, to the model and through it 56 | * making sure the new data matches the types you expect 57 | * making sure the model produces the types you expect 58 | * making sure new models can be deployed to the model pipeline 59 | * making sure the model can perform well under load 60 | * making sure the data can flow through fast enough to reach the model at ingress and egress 61 | 62 | These three classes of machine learning system evaluation form a minimal reference set for monitoring such a system. There are likely more tests you'll need to write, but again just to outline the process in clear terms: 63 | 64 | 1. 
Look at what you wrote when you did exploratory analysis and data cleaning, turn those into tests to make sure your data stays that way, as long as it's supposed to 65 | 66 | 2. Look at how your model performed on test and training data, turn those evaluation measures into tests to make sure your model performs as well in production 67 | 68 | 3. Make sure everything actually goes from point A (the start of your system) to point B (the end of your system). 69 | 70 | 71 | Fairness Monitoring Tests 72 | ========================== 73 | 74 | There is a fourth class of tests that are unclear regarding the ethical nature of the algorithm you are building. These tests are unfortunately poorly defined at the present moment and very context specific, so all that can be offered is an example of what one might do: 75 | 76 | Suppose you worked for a bank and were writing a piece of software that determined who gets a loan. Assuming a fair system folks from all races, genders, ages would get loans at a similar rate or would perhaps not be rejected due to race, gender, age or other factors. 77 | 78 | If when accounting for some protected variable like race, gender, or age your algorithm does something odd compared to when not accounting for race, gender, or age then your algorithm may be biased. 79 | 80 | However, this field of research is far from complete. There are some notions of testing for this, at the present moment they appear to be in need of further research and analysis. However, if possible, one should account for such a set of tests if possible, to ensure your algorithm is fair, unbiased and treats all individuals equally and fairly. -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. drifter_ml documentation master file, created by 2 | sphinx-quickstart on Thu Mar 14 07:54:18 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Drifter ML's documentation! 7 | ====================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | 12 | introduction 13 | project-setup 14 | designing-your-own-tests 15 | classification-tests 16 | regression-tests 17 | api_docs/drifter_ml.classification_tests 18 | api_docs/drifter_ml.regression_tests 19 | api_docs/drifter_ml.columnar_tests 20 | api_docs/drifter_ml.structural_tests 21 | 22 | Indices and tables 23 | ================== 24 | 25 | * :ref:`genindex` 26 | * :ref:`modindex` 27 | * :ref:`search` 28 | -------------------------------------------------------------------------------- /docs/introduction.rst: -------------------------------------------------------------------------------- 1 | ############ 2 | Introduction 3 | ############ 4 | 5 | Welcome to Drifter, a tool to help you test your machine learning models. This testing framework is broken out semantically, so you can test different aspects of your machine learning system. 
6 | 7 | The tests come in two general flavors, component tests, like this one that tests for a minimum precision per class:: 8 | 9 | from drifter_ml.classification_tests import ClassificationTests 10 | import joblib 11 | import pandas as pd 12 | 13 | def test_precision(): 14 | clf = joblib.load("random_forest.joblib") 15 | test_data = pd.read_csv("test.csv") 16 | columns = test_data.columns.tolist() 17 | columns.remove("target") 18 | clf_tests = ClassificationTests(clf, test_data, "target", columns) 19 | classes = set(test_data["target"]) 20 | precision_per_class = {klass: 0.9 for klass in classes} 21 | clf_tests.precision_lower_boundary_per_class(precision_per_class) 22 | 23 | 24 | And an entire test suite that tests for precision, recall and f1 score in one test:: 25 | 26 | from drifter_ml.classification_tests import ClassificationTests 27 | import joblib 28 | import pandas as pd 29 | 30 | def test_precision(): 31 | clf = joblib.load("random_forest.joblib") 32 | test_data = pd.read_csv("test.csv") 33 | columns = test_data.columns.tolist() 34 | columns.remove("target") 35 | clf_tests = ClassificationTests(clf, test_data, "target", columns) 36 | classes = set(test_data["target"]) 37 | precision_per_class = {klass: 0.9 for klass in classes} 38 | recall_per_class = {klass: 0.9 for klass in classes} 39 | f1_per_class = {klass: 0.9 for klass in classes} 40 | clf_tests.classifier_testing( 41 | precision_per_class, 42 | recall_per_class, 43 | f1_per_class 44 | ) 45 | 46 | 47 | The expectation at present is that all models follow the scikit learn api, which means there is an expectation of a `fit` and `predict` on all models. This may appear exclusionary, but you can infact wrap keras models with scikit-learn style objects, allowing for the same api:: 48 | 49 | from keras.models import Sequential 50 | from keras.layers import Dense 51 | from keras.wrappers.scikit_learn import KerasClassifier 52 | from sklearn.model_selection import StratifiedKFold 53 | from sklearn.model_selection import cross_val_score 54 | import numpy 55 | 56 | # Function to create model, required for KerasClassifier 57 | def create_model(): 58 | # create model 59 | model = Sequential() 60 | model.add(Dense(12, input_dim=8, activation='relu')) 61 | model.add(Dense(8, activation='relu')) 62 | model.add(Dense(1, activation='sigmoid')) 63 | # Compile model 64 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 65 | return model 66 | 67 | # fix random seed for reproducibility 68 | seed = 7 69 | numpy.random.seed(seed) 70 | # load pima indians dataset 71 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 72 | # split into input (X) and output (Y) variables 73 | X = dataset[:,0:8] 74 | Y = dataset[:,8] 75 | # create model 76 | model = KerasClassifier(build_fn=create_model, epochs=150, batch_size=10, verbose=0) 77 | # evaluate using 10-fold cross validation 78 | kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed) 79 | results = cross_val_score(model, X, Y, cv=kfold) 80 | print(results.mean()) 81 | 82 | This means that traditional machine learning and deep learning are available for testing out of the box! -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 
11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/project-setup.rst: -------------------------------------------------------------------------------- 1 | ############# 2 | Project Setup 3 | ############# 4 | 5 | Regression and Classification Tests 6 | =================================== 7 | 8 | If you are going to use regression or classification tests, you'll need to do a bit of setup. The first step is making sure you have a test set with labeled data that you can trust. It is recommended that you break your initial labeled dataset up into test and train and keep the test for both the model generation phase as well as for model monitoring throughout. 9 | 10 | A good rule of thumb is to have 70% train, and 30% test. Other splits may be ideal, depending on the needs of your project. You can setup test and train using existing tools from sklearn as follows:: 11 | 12 | from sklearn.model_selection import train_test_split 13 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) 14 | 15 | Once you have your two datasets you can train your model with the training set, as is typical:: 16 | 17 | from sklearn import tree 18 | import pandas as pd 19 | import numpy as np 20 | from sklearn.model_selection import train_test_split 21 | import joblib 22 | 23 | df = pd.DataFrame() 24 | for _ in range(5000): 25 | a = np.random.normal(0, 1) 26 | b = np.random.normal(0, 3) 27 | c = np.random.normal(12, 4) 28 | if a + b + c > 11: 29 | target = 1 30 | else: 31 | target = 0 32 | df = df.append({ 33 | "A": a, 34 | "B": b, 35 | "C": c, 36 | "target": target 37 | }, ignore_index=True) 38 | 39 | clf = tree.DecisionTreeClassifier() 40 | X = df[["A", "B", "C"]] 41 | y = df["target"] 42 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) 43 | 44 | clf.fit(X_train, y_train) 45 | joblib.dump(clf, "model.joblib") 46 | df.to_csv("data.csv") 47 | test_data = pd.DataFrame() 48 | test_data[["A", "B", "C"]] 49 | test_data["target"] = y_test 50 | test_data.to_csv("test_data.csv") 51 | 52 | Then you can test against your model before you put it into production as follows:: 53 | 54 | import joblib 55 | import pandas as pd 56 | from sklearn.metrics import f1_score 57 | 58 | clf = joblib.load("model.joblib") 59 | test_data = pd.read_csv("test_data.csv") 60 | y_pred = clf.predict(test_data[["A", "B", "C"]]) 61 | y_true = test_data["target"] 62 | print(f1_score(y_true, y_pred)) 63 | 64 | It's worth noting that one score is likely never good enough, you need to include multiple measures to ensure your model is not simply fitting towards a single measure. Assuming the measures are good enough you can move onto productionizing your model. 
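As a concrete illustration of checking multiple measures at once, you might look at precision, recall, f1 and ROC AUC together before promoting a model; the 0.9 threshold below is a placeholder for whatever your use-case actually requires::

    import joblib
    import pandas as pd
    from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

    clf = joblib.load("model.joblib")
    test_data = pd.read_csv("test_data.csv")
    X_test = test_data[["A", "B", "C"]]
    y_true = test_data["target"]
    y_pred = clf.predict(X_test)

    scores = {
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
        # probability of the positive class is what AUC needs
        "roc_auc": roc_auc_score(y_true, clf.predict_proba(X_test)[:, 1]),
    }
    print(scores)
    assert all(value >= 0.9 for value in scores.values())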
65 | 66 | Strategies For Testing Your Productionized Model 67 | ================================================ 68 | 69 | Once you've put your model into production there are a few strategies for making sure your model continues to meet your requirements: 70 | 71 | 1. Using the test set from training - Gather new data and predictions from production and then train a new classifier or regressor with the new data and new predictions. Then test against the test set you've set aside. If the measures stay approximately the same, it's possible your model is performing as expected. It's important that the new classifier has the same hyperparameters as the one in production, and that the same versions are used for all associated code that creates the new model object. 72 | 73 | 2. Generating a new test set from a process - Gather new data and new predictions from the production model. Then manually label the same set of new data, either via some human labeling process or another process you believe can generate faithful labels. Then validate the manually labeled examples against the predicted examples. If you are predicting new data a lot, I recommend taking random non-overlapping samples from the production data and labeling those. 74 | 75 | 3. Generating a new test set from a process and then doing label propagation - Gather new data and new predictions from the production model. Then manually label a small set of the new data in some manner. Make sure to have multiple people manually label the same data, until everyone agrees on the ground truth. Then generate a new set of labels via label propagation. Then have people manually check the newly propagated labels; if the propagated labels agree with the manual labels often enough, continue the label propagation process. Continue to check random non-overlapping samples until you feel satisfied, then label the remainder of the production data. 76 | 77 | 78 | Using The Test Set From Training 79 | ================================ 80 | 81 | The above description is a bit terse, so let's break it down with some example code to inform your own project setup.
First let's assume that you have some data to train on and test on:: 82 | 83 | from sklearn import tree 84 | import pandas as pd 85 | import numpy as np 86 | from sklearn.model_selection import train_test_split 87 | import joblib 88 | 89 | df = pd.DataFrame() 90 | for _ in range(5000): 91 | a = np.random.normal(0, 1) 92 | b = np.random.normal(0, 3) 93 | c = np.random.normal(12, 4) 94 | if a + b + c > 11: 95 | target = 1 96 | else: 97 | target = 0 98 | df = df.append({ 99 | "A": a, 100 | "B": b, 101 | "C": c, 102 | "target": target 103 | }, ignore_index=True) 104 | 105 | clf = tree.DecisionTreeClassifier() 106 | X = df[["A", "B", "C"]] 107 | y = df["target"] 108 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) 109 | 110 | clf.fit(X_train, y_train) 111 | joblib.dump(clf, "model.joblib") 112 | df.to_csv("data.csv") 113 | test_data = X_test.copy() 114 | test_data["target"] = y_test 115 | test_data.to_csv("test_data.csv") 116 | 117 | 118 | Next we need to test our model to make sure it's performing well enough to go into production:: 119 | 120 | import joblib 121 | import pandas as pd 122 | from sklearn.metrics import classification_report 123 | 124 | clf = joblib.load("model.joblib") 125 | test_data = pd.read_csv("test_data.csv") 126 | y_pred = clf.predict(test_data[["A", "B", "C"]]) 127 | y_true = test_data["target"] 128 | print(classification_report(y_true, y_pred)) 129 | 130 | Let's assume everything met our minimum criteria for going to production. Now we are ready to put our model into production! For this we'll need to write our test such that it makes use of the test data, our new data and our new predictions. For the purposes of the example below, assume you've been saving new data and new predictions to a csv called new_data.csv, that you have saved your production model in a file called model.joblib, and that you have test data saved to test_data.csv. Below is an example test you might write using the framework:: 131 | 132 | import joblib 133 | import pandas as pd 134 | from sklearn import tree 135 | from drifter_ml import classification_tests 136 | 137 | def generate_model_from_production_data(): 138 | new_data = pd.read_csv("new_data.csv") 139 | prod_clf = joblib.load("model.joblib") 140 | test_data = pd.read_csv("test_data.csv") 141 | return test_data, new_data, prod_clf 142 | 143 | def test_precision(): 144 | test_data, new_data, prod_clf = generate_model_from_production_data() 145 | column_names = ["A", "B", "C"] 146 | target_name = "target" 147 | test_clf = tree.DecisionTreeClassifier() 148 | test_clf.set_params(**prod_clf.get_params()) 149 | X = new_data[column_names] 150 | y = new_data[target_name] 151 | test_clf.fit(X, y) 152 | 153 | test_suite = classification_tests.ClassificationTests(test_clf, 154 | test_data, target_name, column_names) 155 | classes = list(test_data.target.unique()) 156 | lower_bound_requirement = {klass: 0.9 for klass in classes} 157 | assert test_suite.precision_lower_boundary_per_class( 158 | lower_bound_requirement 159 | ) 160 | 161 | Notice that we train on the production data and labels (in this case the target column of new_data.csv) and then test against the labels we know. Here we use the lower_bound_requirement variable to set the expectation for how well the model should do against the test set. If the labels generated by the production model train a model that performs as well on the test data as the production model did on the test set, then we have some confidence in the labels it produces.
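If you save a test like this in a file, say test_production_model.py (the filename is just for illustration), you can run it on a schedule alongside the rest of your test suite with pytest::

    python -m pytest test_production_model.py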
This is probably not the only way one could do this comparison; if you come up with something better, please share it back with the project! -------------------------------------------------------------------------------- /docs/regression-tests.rst: -------------------------------------------------------------------------------- 1 | ################# 2 | Regression Tests 3 | ################# 4 | 5 | This section will likely be the most confusing for anyone coming from classical software engineering. Here regression refers to a model that outputs a floating point number, instead of a class. The most important difference between classification and regression is that the numbers produced by regression are "real" numbers: they have magnitude, direction, a sense of scale, and so on. 6 | 7 | Classification returns a "class", which means class "1" has no ordering relationship with class "2", so you shouldn't compare classes with ordering. 8 | 9 | In any event, the regression tests break out into the following categories: 10 | 11 | 1. Establish a baseline maximum error tolerance based on a model measure 12 | 2. Establish a tolerance level for deviance from the average fold error 13 | 3. Stress testing for the speed of calculating new values 14 | 4. Comparison of the current model against new models for the above defined measures 15 | 5. Comparison of the speed of performance against new models 16 | 17 | Upper Bound Regression Metrics 18 | ============================== 19 | 20 | Each of the following examples ensures that your model meets a minimum criterion, which should be decided based on the needs of your use case. One simple way to do this is to define failure by how many dollars it will cost you for every unit amount your model is off on average. 21 | 22 | Mean Squared Error and Median Absolute Error are great tools for ensuring your regressor optimizes for least error. The scale of that error will be entirely context specific. 23 | 24 | That is why they are the basis of the set of tests found below.
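For instance, suppose being off by one unit on average costs you a known amount of money; you can turn that budget into a boundary for the tests below. The figures in this sketch are invented purely for illustration::

    # hypothetical numbers: each unit of median absolute error costs ~$50
    # per prediction, and the business can absorb at most $500 per prediction
    cost_per_unit_error = 50.0
    acceptable_cost = 500.0
    mae_boundary = acceptable_cost / cost_per_unit_error  # 10.0

The resulting boundary is the value you would pass to the upper bound tests in the next section.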
25 | 26 | Regression Test Example - Model Metrics 27 | ======================================= 28 | 29 | Suppose you had the following model:: 30 | 31 | from sklearn import linear_model 32 | import pandas as pd 33 | import numpy as np 34 | import joblib 35 | 36 | df = pd.DataFrame() 37 | for _ in range(1000): 38 | a = np.random.normal(0, 1) 39 | b = np.random.normal(0, 3) 40 | c = np.random.normal(12, 4) 41 | target = 5*a + 3*b + c 42 | df = df.append({ 43 | "A": a, 44 | "B": b, 45 | "C": c, 46 | "target": target 47 | }, ignore_index=True) 48 | 49 | reg = linear_model.LinearRegression() 50 | X = df[["A", "B", "C"]] 51 | reg.fit(X, df["target"]) 52 | joblib.dump(reg, "model.joblib") 53 | df.to_csv("data.csv") 54 | 55 | We could write the following set of tests to ensure this model does well:: 56 | 57 | from drifter_ml.regression_tests import RegressionTests 58 | import joblib 59 | import pandas as pd 60 | 61 | def test_mse(): 62 | df = pd.read_csv("data.csv") 63 | column_names = ["A", "B", "C"] 64 | target_name = "target" 65 | reg = joblib.load("model.joblib") 66 | 67 | test_suite = RegressionTests(reg, 68 | df, target_name, column_names) 69 | mse_boundary = 15 70 | assert test_suite.mse_upper_boundary(mse_boundary) 71 | 72 | def test_mae(): 73 | df = pd.read_csv("data.csv") 74 | column_names = ["A", "B", "C"] 75 | target_name = "target" 76 | reg = joblib.load("model.joblib") 77 | 78 | test_suite = RegressionTests(reg, 79 | df, target_name, column_names) 80 | mae_boundary = 10 81 | assert test_suite.mae_upper_boundary(mae_boundary) 82 | 83 | Or you could simply write one test that checks all of the upper boundaries at once, using the combined method (which also covers the trimean based errors):: 84 | 85 | from drifter_ml.regression_tests import RegressionTests 86 | import joblib 87 | import pandas as pd 88 | 89 | def test_error_upper_boundaries(): 90 | df = pd.read_csv("data.csv") 91 | column_names = ["A", "B", "C"] 92 | target_name = "target" 93 | reg = joblib.load("model.joblib") 94 | 95 | test_suite = RegressionTests(reg, 96 | df, target_name, column_names) 97 | mse_boundary, mae_boundary = 15, 10 98 | tse_boundary, tae_boundary = 15, 10 99 | assert test_suite.upper_bound_regression_testing(mse_boundary, 100 | mae_boundary, tse_boundary, tae_boundary) 101 | 102 | Regression Test Example - Model Speed 103 | ===================================== 104 | 105 | Additionally, you can test to ensure your regressor performs well, even under load.
Assume we have the same model as before:: 106 | 107 | from sklearn import linear_model 108 | import pandas as pd 109 | import numpy as np 110 | import joblib 111 | 112 | df = pd.DataFrame() 113 | for _ in range(1000): 114 | a = np.random.normal(0, 1) 115 | b = np.random.normal(0, 3) 116 | c = np.random.normal(12, 4) 117 | target = 5*a + 3*b + c 118 | df = df.append({ 119 | "A": a, 120 | "B": b, 121 | "C": c, 122 | "target": target 123 | }, ignore_index=True) 124 | 125 | reg = linear_model.LinearRegression() 126 | X = df[["A", "B", "C"]] 127 | reg.fit(X, df["target"]) 128 | joblib.dump(reg, "model.joblib") 129 | df.to_csv("data.csv") 130 | 131 | Now we test to ensure the model predicts new labels within our constraints:: 132 | 133 | from drifter_ml.regression_tests import RegressionTests 134 | import joblib 135 | import pandas as pd 136 | 137 | def test_prediction_speed(): 138 | df = pd.read_csv("data.csv") 139 | column_names = ["A", "B", "C"] 140 | target_name = "target" 141 | reg = joblib.load("model.joblib") 142 | 143 | test_suite = RegressionTests(reg, 144 | df, target_name, column_names) 145 | sample_sizes = [] 146 | max_run_times = [] 147 | for size in range(1, 100000, 100): 148 | sample_sizes.append(size) 149 | max_run_times.append(10.0) # seconds 150 | 151 | assert test_suite.run_time_stress_test( 152 | sample_sizes, max_run_times 153 | ) 154 | 155 | This test ensures that for sample sizes from 1 up to 100,000 elements, prediction never takes longer than 10 seconds per call. 156 | 157 | -------------------------------------------------------------------------------- /drifter_ml/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.25' 2 | 3 | from .classification_tests import classification_tests 4 | from .columnar_tests import columnar_tests 5 | from .regression_tests import regression_tests 6 | from .structural_tests import structural_tests 7 | 8 | 9 | __all__ = ["classification_tests", "columnar_tests", "regression_tests", "structural_tests"] 10 | -------------------------------------------------------------------------------- /drifter_ml/classification_tests/__init__.py: -------------------------------------------------------------------------------- 1 | from .classification_tests import ClassificationTests 2 | from .classification_tests import ClassifierComparison 3 | 4 | __all__ = ["ClassificationTests", "ClassifierComparison"] 5 | -------------------------------------------------------------------------------- /drifter_ml/columnar_tests/__init__.py: -------------------------------------------------------------------------------- 1 | from .columnar_tests import DataSanitization 2 | from .columnar_tests import ColumnarData 3 | 4 | __all__ = ["DataSanitization", "ColumnarData"] 5 | -------------------------------------------------------------------------------- /drifter_ml/columnar_tests/columnar_tests.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | import numpy as np 3 | import time 4 | from scipy import stats 5 | from mlxtend.evaluate import permutation_test 6 | 7 | class DataSanitization(): 8 | def __init__(self, data): 9 | self.data = data 10 | 11 | def is_complete(self, column): 12 | return self.data[column].isnull().sum() == 0 13 | 14 | def has_completeness(self, column, threshold): 15 | return self.data[column].isnull().sum()/len(self.data) > threshold 16 | 17 | def is_unique(self, column): 18 | return len(self.data[column].unique())/len(self.data) == 1 19 | 20 | def 
has_uniqueness(self, column, threshold): 21 | return len(self.data[column].unique())/len(self.data) > threshold 22 | 23 | def is_in_range(self, column, lower_bound, upper_bound, threshold): 24 | return self.data[(self.data[column] <= upper_bound) & (self.data[column] >= lower_bound)]/len(self.data) > threshold 25 | 26 | def is_non_negative(self, column): 27 | return self.data[self.data[column] > 0] 28 | 29 | def is_less_than(self, column_one, column_two): 30 | return self.data[self.data[column_one] < self.data[column_two]].all() 31 | 32 | class ColumnarData(): 33 | def __init__(self, historical_data, new_data): 34 | self.new_data = new_data 35 | self.historical_data = historical_data 36 | 37 | def mean_similarity(self, column, tolerance=2): 38 | new_mean = float(np.mean(self.new_data[column])) 39 | old_mean = float(np.mean(self.historical_data[column])) 40 | std = float(np.std(self.historical_data[column])) 41 | upper_bound = old_mean + (std * tolerance) 42 | lower_bound = old_mean - (std * tolerance) 43 | if new_mean < lower_bound: 44 | return False 45 | elif new_mean > upper_bound: 46 | return False 47 | else: 48 | return True 49 | 50 | def median_similarity(self, column, tolerance=2): 51 | new_median = float(np.median(self.new_data[column])) 52 | old_median = float(np.median(self.historical_data[column])) 53 | iqr = float(stats.iqr(self.historical_data[column])) 54 | upper_bound = old_median + (iqr * tolerance) 55 | lower_bound = old_median - (iqr * tolerance) 56 | if new_median < lower_bound: 57 | return False 58 | elif new_median > upper_bound: 59 | return False 60 | else: 61 | return True 62 | 63 | def trimean(self, data): 64 | q1 = float(np.quantile(data, 0.25)) 65 | q3 = float(np.quantile(data, 0.75)) 66 | median = float(np.median(data)) 67 | return (q1 + 2*median + q3)/4 68 | 69 | def trimean_absolute_deviation(self, data): 70 | trimean = self.trimean(data) 71 | numerator = [abs(elem - trimean) for elem in data] 72 | return sum(numerator)/len(data) 73 | 74 | def trimean_similarity(self, column, tolerance=2): 75 | new_trimean = self.trimean(self.new_data[column]) 76 | old_trimean = self.trimean(self.historical_data[column]) 77 | tad = self.trimean_absolute_deviation(self.historical_data[column]) 78 | upper_bound = old_trimean + (tad * tolerance) 79 | lower_bound = old_trimean - (tad * tolerance) 80 | if new_trimean < lower_bound: 81 | return False 82 | if new_trimean > upper_bound: 83 | return False 84 | else: 85 | return True 86 | 87 | def is_normal(self, column): 88 | new_data_result = stats.normaltest(self.new_data[column]) 89 | historical_data_result = stats.normaltest(self.historical_data[column]) 90 | if new_data_result.pvalue > 0.05 and historical_data_result.pvalue > 0.05: 91 | return True 92 | return False 93 | 94 | def pearson_similar_correlation(self, column, 95 | correlation_lower_bound, 96 | pvalue_threshold=0.05, 97 | num_rounds=3): 98 | correlation_info = stats.pearsonr(self.new_data[column], 99 | self.historical_data[column]) 100 | p_value = permutation_test( 101 | self.new_data[column], 102 | self.historical_data[column], 103 | method="approximate", 104 | num_rounds=num_rounds, 105 | func=lambda x, y: stats.pearsonr(x, y)[0], 106 | seed=0) 107 | if p_value > pvalue_threshold: 108 | return False 109 | if correlation_info[0] < correlation_lower_bound: 110 | return False 111 | return True 112 | 113 | def spearman_similar_correlation(self, column, 114 | correlation_lower_bound, 115 | pvalue_threshold=0.05, 116 | num_rounds=3): 117 | correlation_info = 
stats.spearmanr(self.new_data[column], 118 | self.historical_data[column]) 119 | p_value = permutation_test( 120 | self.new_data[column], 121 | self.historical_data[column], 122 | method="approximate", 123 | num_rounds=num_rounds, 124 | func=lambda x, y: stats.spearmanr(x, y).correlation, 125 | seed=0) 126 | if p_value > pvalue_threshold: 127 | return False 128 | if correlation_info.correlation < correlation_lower_bound: 129 | return False 130 | return True 131 | 132 | def wilcoxon_similar_distribution(self, column, 133 | pvalue_threshold=0.05, 134 | num_rounds=3): 135 | p_value = permutation_test( 136 | self.new_data[column], 137 | self.historical_data[column], 138 | method="approximate", 139 | num_rounds=num_rounds, 140 | func=lambda x, y: stats.wilcoxon(x, y).statistic, 141 | seed=0) 142 | if p_value < pvalue_threshold: 143 | return False 144 | return True 145 | 146 | def ks_2samp_similar_distribution(self, column, 147 | pvalue_threshold=0.05, 148 | num_rounds=3): 149 | p_value = permutation_test( 150 | self.new_data[column], 151 | self.historical_data[column], 152 | method="approximate", 153 | num_rounds=num_rounds, 154 | func=lambda x, y: stats.ks_2samp(x, y).statistic, 155 | seed=0) 156 | if p_value < pvalue_threshold: 157 | return False 158 | return True 159 | 160 | def kruskal_similar_distribution(self, column, 161 | pvalue_threshold=0.05, 162 | num_rounds=3): 163 | p_value = permutation_test( 164 | self.new_data[column], 165 | self.historical_data[column], 166 | method="approximate", 167 | num_rounds=num_rounds, 168 | func=lambda x, y: stats.kruskal(x, y).statistic, 169 | seed=0) 170 | if p_value < pvalue_threshold: 171 | return False 172 | return True 173 | 174 | def mann_whitney_u_similar_distribution(self, column, 175 | pvalue_threshold=0.05, 176 | num_rounds=3): 177 | p_value = permutation_test( 178 | self.new_data[column], 179 | self.historical_data[column], 180 | method="approximate", 181 | num_rounds=num_rounds, 182 | func=lambda x, y: stats.mannwhitneyu(x, y).statistic, 183 | seed=0) 184 | 185 | if p_value < pvalue_threshold: 186 | return False 187 | return True 188 | -------------------------------------------------------------------------------- /drifter_ml/regression_tests/__init__.py: -------------------------------------------------------------------------------- 1 | from .regression_tests import RegressionTests 2 | from .regression_tests import RegressionComparison 3 | 4 | __all__ = ["RegressionTests", "RegressionComparison"] 5 | -------------------------------------------------------------------------------- /drifter_ml/regression_tests/regression_tests.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | import numpy as np 3 | import time 4 | from scipy import stats 5 | from sklearn.model_selection import cross_validate, cross_val_predict 6 | 7 | class RegressionTests(): 8 | def __init__(self, 9 | reg, 10 | test_data, 11 | target_name, 12 | column_names): 13 | self.reg = reg 14 | self.column_names = column_names 15 | self.target_name = target_name 16 | self.test_data = test_data 17 | self.y = test_data[target_name] 18 | self.X = test_data[column_names] 19 | 20 | def get_test_score(self, cross_val_dict): 21 | return list(cross_val_dict["test_score"]) 22 | 23 | def trimean(self, data): 24 | """ 25 | I'm exposing this as a public method because 26 | the trimean is not implemented in enough packages. 
27 | 28 | Formula: 29 | (25th percentile + 2*50th percentile + 75th percentile)/4 30 | 31 | Parameters 32 | ---------- 33 | data : array-like 34 | an iterable, either a list or a numpy array 35 | 36 | Returns 37 | ------- 38 | the trimean: float 39 | """ 40 | q1 = np.quantile(data, 0.25) 41 | q3 = np.quantile(data, 0.75) 42 | median = np.median(data) 43 | return (q1 + 2*median + q3)/4 44 | 45 | def trimean_absolute_deviation(self, data): 46 | """ 47 | The trimean absolute deviation is the 48 | the average distance from the trimean. 49 | 50 | Parameters 51 | ---------- 52 | data : array-like 53 | an iterable, either a list or a numpy array 54 | 55 | Returns 56 | ------- 57 | the average distance to the trimean: float 58 | """ 59 | trimean = self.trimean(data) 60 | numerator = [abs(elem - trimean) for elem in data] 61 | return sum(numerator)/len(data) 62 | 63 | def describe_scores(self, scores, method): 64 | """ 65 | Describes scores. 66 | 67 | Parameters 68 | ---------- 69 | scores : array-like 70 | the scores from the model, as a list or numpy array 71 | method : string 72 | the method to use to calculate central tendency and spread 73 | 74 | Returns 75 | ------- 76 | Returns the central tendency, and spread 77 | by method. 78 | 79 | Methods: 80 | mean: 81 | * central tendency: mean 82 | * spread: standard deviation 83 | 84 | median: 85 | * central tendency: median 86 | * spread: interquartile range 87 | 88 | trimean: 89 | * central tendency: trimean 90 | * spread: trimean absolute deviation 91 | """ 92 | if method == "mean": 93 | return np.mean(scores), np.std(scores) 94 | elif method == "median": 95 | return np.median(scores), stats.iqr(scores) 96 | elif method == "trimean": 97 | return self.trimean(scores), self.trimean_absolute_deviation(scores) 98 | 99 | def mae_cv(self, cv): 100 | """ 101 | This method performs cross-validation over median absolute error. 102 | 103 | Parameters 104 | ---------- 105 | * cv : integer 106 | The number of cross validation folds to perform 107 | 108 | Returns 109 | ------- 110 | Returns a scores of the k-fold median absolute error. 111 | """ 112 | 113 | mae = metrics.make_scorer(metrics.median_absolute_error) 114 | result = cross_validate(self.reg, self.X, 115 | self.y, cv=cv, 116 | scoring=(mae)) 117 | return self.get_test_score(result) 118 | 119 | def mse_cv(self, cv): 120 | """ 121 | This method performs cross-validation over mean squared error. 122 | 123 | Parameters 124 | ---------- 125 | * cv : integer 126 | The number of cross validation folds to perform 127 | 128 | Returns 129 | ------- 130 | Returns a scores of the k-fold mean squared error. 131 | """ 132 | mse = metrics.make_scorer(metrics.mean_squared_error) 133 | result = cross_validate(self.reg, self.X, 134 | self.y, cv=cv, 135 | scoring=(mse)) 136 | return self.get_test_score(result) 137 | 138 | def trimean_squared_error(self, y_true, y_pred, 139 | sample_weight=None, 140 | multioutput='uniform_average'): 141 | output_errors = self.trimean((y_true - y_pred) ** 2) 142 | return self.trimean(output_errors) 143 | 144 | def trimean_absolute_error(self, y_true, y_pred, 145 | sample_weight=None, 146 | multioutput='uniform_average'): 147 | output_errors = self.trimean(abs(y_true - y_pred)) 148 | return self.trimean(output_errors) 149 | 150 | def tse_cv(self, cv): 151 | """ 152 | This method performs cross-validation over trimean squared error. 
153 | 154 | Parameters 155 | ---------- 156 | * cv : integer 157 | The number of cross validation folds to perform 158 | 159 | Returns 160 | ------- 161 | Returns a scores of the k-fold trimean squared error. 162 | """ 163 | tse = metrics.make_scorer(self.trimean_squared_error) 164 | result = cross_validate(self.reg, self.X, 165 | self.y, cv=cv, 166 | scoring=(tse)) 167 | return self.get_test_score(result) 168 | 169 | def tae_cv(self, cv): 170 | """ 171 | This method performs cross-validation over trimean absolute error. 172 | 173 | Parameters 174 | ---------- 175 | * cv : integer 176 | The number of cross validation folds to perform 177 | 178 | Returns 179 | ------- 180 | Returns a scores of the k-fold trimean absolute error. 181 | """ 182 | tse = metrics.make_scorer(self.trimean_absolute_error) 183 | result = cross_validate(self.reg, self.X, 184 | self.y, cv=cv, 185 | scoring=(tse)) 186 | return self.get_test_score(result) 187 | 188 | def _cross_val_anomaly_detection(self, scores, tolerance, method='mean'): 189 | avg, _ = self.describe_scores(scores, method) 190 | deviance_from_avg = [abs(score - avg) 191 | for score in scores] 192 | for deviance in deviance_from_avg: 193 | if deviance > tolerance: 194 | return False 195 | return True 196 | 197 | def _cross_val_avg(self, scores, maximum_center_tolerance, method='mean'): 198 | avg, _ = self.describe_scores(scores, method) 199 | if avg > maximum_center_tolerance: 200 | return False 201 | return True 202 | 203 | def _cross_val_upper_boundary(self, scores, upper_boundary): 204 | for score in scores: 205 | if score > upper_boundary: 206 | return False 207 | return True 208 | 209 | def cross_val_tse_anomaly_detection(self, tolerance, cv=3, method='mean'): 210 | scores = self.tse_cv(cv) 211 | return self._cross_val_anomaly_detection(scores, tolerance, method=method) 212 | 213 | def cross_val_tse_avg(self, minimum_center_tolerance, cv=3, method='mean'): 214 | scores = self.tse_cv(cv) 215 | return self._cross_val_avg(scores, minimum_center_tolerance) 216 | 217 | def cross_val_tse_upper_boundary(self, upper_boundary, cv=3): 218 | scores = self.tse_cv(cv) 219 | return self._cross_val_upper_boundary(scores, upper_boundary) 220 | 221 | def tse_upper_boundary(self, upper_boundary): 222 | y_pred = self.reg.predict(self.X) 223 | if self.trimean_squared_error(self.y, y_pred) > upper_boundary: 224 | return False 225 | return True 226 | 227 | def cross_val_tae_anomaly_detection(self, tolerance, cv=3, method='mean'): 228 | scores = self.tae_cv(cv) 229 | return self._cross_val_anomaly_detection(scores, tolerance, method=method) 230 | 231 | def cross_val_tae_avg(self, minimum_center_tolerance, cv=3, method='mean'): 232 | scores = self.tae_cv(cv) 233 | return self._cross_val_avg(scores, minimum_center_tolerance) 234 | 235 | def cross_val_tae_upper_boundary(self, upper_boundary, cv=3): 236 | scores = self.tae_cv(cv) 237 | return self._cross_val_upper_boundary(scores, upper_boundary) 238 | 239 | def tae_upper_boundary(self, upper_boundary): 240 | y_pred = self.reg.predict(self.X) 241 | if self.trimean_absolute_error(self.y, y_pred) > upper_boundary: 242 | return False 243 | return True 244 | 245 | def cross_val_mse_anomaly_detection(self, tolerance, cv=3, method='mean'): 246 | scores = self.mse_cv(cv) 247 | return self._cross_val_anomaly_detection(scores, tolerance, method=method) 248 | 249 | def cross_val_mse_avg(self, minimum_center_tolerance, cv=3, method='mean'): 250 | scores = self.mse_cv(cv) 251 | return self._cross_val_avg(scores, 
minimum_center_tolerance) 252 | 253 | def cross_val_mse_upper_boundary(self, upper_boundary, cv=3): 254 | scores = self.mse_cv(cv) 255 | return self._cross_val_upper_boundary(scores, upper_boundary) 256 | 257 | def mse_upper_boundary(self, upper_boundary): 258 | y_pred = self.reg.predict(self.X) 259 | if metrics.mean_squared_error(self.y, y_pred) > upper_boundary: 260 | return False 261 | return True 262 | 263 | def cross_val_mae_anomaly_detection(self, tolerance, cv=3, method='mean'): 264 | scores = self.mae_cv(cv) 265 | return self._cross_val_anomaly_detection(scores, tolerance, method=method) 266 | 267 | def cross_val_mae_avg(self, minimum_center_tolerance, cv=3, method='mean'): 268 | scores = self.mae_cv(cv) 269 | return self._cross_val_avg(scores, minimum_center_tolerance, method=method) 270 | 271 | def cross_val_mae_upper_boundary(self, upper_boundary, cv=3): 272 | scores = self.mae_cv(cv) 273 | return self._cross_val_upper_boundary(scores, upper_boundary) 274 | 275 | def mae_upper_boundary(self, upper_boundary): 276 | y_pred = self.reg.predict(self.X) 277 | if metrics.median_absolute_error(self.y, y_pred) > upper_boundary: 278 | return False 279 | return True 280 | 281 | def upper_bound_regression_testing(self, 282 | mse_upper_boundary, 283 | mae_upper_boundary, 284 | tse_upper_boundary, 285 | tae_upper_boundary): 286 | mse_test = self.mse_upper_boundary(mse_upper_boundary) 287 | mae_test = self.mae_upper_boundary(mae_upper_boundary) 288 | tse_test = self.tse_upper_boundary(tse_upper_boundary) 289 | tae_test = self.tae_upper_boundary(tae_upper_boundary) 290 | if mse_test and mae_test and tse_test and tae_test: 291 | return True 292 | else: 293 | return False 294 | 295 | def run_time_stress_test(self, sample_sizes, max_run_times): 296 | for index, sample_size in enumerate(sample_sizes): 297 | max_run_time = max_run_times[index] 298 | data = self.X.sample(sample_size, replace=True) 299 | start_time = time.time() 300 | self.reg.predict(data) 301 | model_run_time = time.time() - start_time 302 | if model_run_time > max_run_time: 303 | return False 304 | return True 305 | 306 | class RegressionComparison(): 307 | def __init__(self, 308 | reg_one, 309 | reg_two, 310 | test_data, 311 | target_name, 312 | column_names): 313 | self.reg_one = reg_one 314 | self.reg_two = reg_two 315 | self.column_names = column_names 316 | self.target_name = target_name 317 | self.test_data = test_data 318 | self.y = test_data[target_name] 319 | self.X = test_data[column_names] 320 | 321 | def two_model_prediction_run_time_stress_test(self, sample_sizes): 322 | for sample_size in sample_sizes: 323 | data = self.X.sample(sample_size, replace=True) 324 | start_time = time.time() 325 | self.reg_one.predict(data) 326 | model_one_run_time = time.time() - start_time 327 | start_time = time.time() 328 | self.reg_two.predict(data) 329 | model_two_run_time = time.time() - start_time 330 | # we assume model one should be faster than model two 331 | if model_one_run_time > model_two_run_time: 332 | return False 333 | return True 334 | 335 | def cross_val_mse_result(self, reg, cv=3): 336 | y_pred = cross_val_predict(reg, self.X, self.y) 337 | return metrics.mean_squared_error(self.y, y_pred) 338 | 339 | def cross_val_mae_result(self, reg, cv=3): 340 | y_pred = cross_val_predict(reg, self.X, self.y) 341 | return metrics.median_absolute_error(self.y, y_pred) 342 | 343 | def mse_result(self, reg): 344 | y_pred = reg.predict(self.X) 345 | return metrics.mean_squared_error(self.y, y_pred) 346 | 347 | def mae_result(self, 
reg): 348 | y_pred = reg.predict(self.X) 349 | return metrics.median_absolute_error(self.y, y_pred) 350 | 351 | def cv_two_model_regression_testing(self, cv=3): 352 | mse_one_test = self.cross_val_mse_result(self.reg_one, cv=cv) 353 | mae_one_test = self.cross_val_mae_result(self.reg_one, cv=cv) 354 | mse_two_test = self.cross_val_mse_result(self.reg_two, cv=cv) 355 | mae_two_test = self.cross_val_mae_result(self.reg_two, cv=cv) 356 | if mse_one_test < mse_two_test and mae_one_test < mae_two_test: 357 | return True 358 | else: 359 | return False 360 | 361 | def two_model_regression_testing(self): 362 | mse_one_test = self.mse_result(self.reg_one) 363 | mae_one_test = self.mae_result(self.reg_one) 364 | mse_two_test = self.mse_result(self.reg_two) 365 | mae_two_test = self.mae_result(self.reg_two) 366 | if mse_one_test < mse_two_test and mae_one_test < mae_two_test: 367 | return True 368 | else: 369 | return False 370 | -------------------------------------------------------------------------------- /drifter_ml/structural_tests/__init__.py: -------------------------------------------------------------------------------- 1 | from .structural_tests import StructuralData 2 | 3 | __all__ =["StructuralData"] 4 | -------------------------------------------------------------------------------- /drifter_ml/structural_tests/structural_tests.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | import time 3 | from sklearn import neighbors 4 | from scipy import stats 5 | from sklearn.model_selection import cross_val_score 6 | from sklearn import cluster 7 | 8 | class KmeansClustering(): 9 | def __init__(self, 10 | new_data, 11 | historical_data, 12 | column_names, 13 | target_name): 14 | self.column_names = column_names 15 | self.target_name = target_name 16 | self.new_data = new_data 17 | self.historical_data = historical_data 18 | 19 | def kmeans_clusters(self, n_clusters, data): 20 | k_means = cluster.KMeans(n_clusters=n_clusters) 21 | k_means.fit(data) 22 | return k_means.predict(data) 23 | 24 | def kmeans_scorer(self, metric, min_similarity): 25 | for k in range(2, 12): 26 | new_data = self.new_data[self.column_names] 27 | historical_data = self.historical_data[self.column_names] 28 | new_data_clusters = self.kmeans_clusters(k, new_data) 29 | historical_data_clusters = self.kmeans_clusters(k, historical_data) 30 | score = metric( 31 | new_data_clusters, historical_data_clusters) 32 | if score < min_similarity: 33 | return False 34 | return True 35 | 36 | def mutual_info_kmeans_scorer(self, min_similarity): 37 | return self.kmeans_scorer( 38 | metrics.adjusted_mutual_info_score, 39 | min_similarity 40 | ) 41 | 42 | def adjusted_rand_kmeans_scorer(self, min_similarity): 43 | return self.kmeans_scorer( 44 | metrics.adjusted_rand_score, 45 | min_similarity 46 | ) 47 | 48 | def completeness_kmeans_scorer(self, min_similarity): 49 | return self.kmeans_scorer( 50 | metrics.completeness_score, 51 | min_similarity 52 | ) 53 | 54 | def fowlkes_mallows_kmeans_scorer(self, min_similarity): 55 | return self.kmeans_scorer( 56 | metrics.fowlkes_mallows_score, 57 | min_similarity 58 | ) 59 | 60 | def homogeneity_kmeans_scorer(self, min_similarity): 61 | return self.kmeans_scorer( 62 | metrics.homogeneity_score, 63 | min_similarity 64 | ) 65 | 66 | def v_measure_kmeans_scorer(self, min_similarity): 67 | return self.kmeans_scorer( 68 | metrics.v_measure_score, 69 | min_similarity 70 | ) 71 | 72 | def unsupervised_kmeans_score_clustering(self, 
min_similarity): 73 | return all([ 74 | self.v_measure_kmeans_scorer(min_similarity), 75 | self.homogeneity_kmeans_scorer(min_similarity), 76 | self.fowlkes_mallows_kmeans_scorer(min_similarity), 77 | self.completeness_kmeans_scorer(min_similarity), 78 | self.adjusted_rand_kmeans_scorer(min_similarity), 79 | self.mutual_info_kmeans_scorer(min_similarity), 80 | ]) 81 | 82 | class DBscanClustering(): 83 | def __init__(self, 84 | new_data, 85 | historical_data, 86 | column_names, 87 | target_name): 88 | self.column_names = column_names 89 | self.target_name = target_name 90 | self.new_data = new_data 91 | self.historical_data = historical_data 92 | 93 | def dbscan_clusters(self, data): 94 | dbscan = cluster.DBSCAN() 95 | return dbscan.fit_predict(data) 96 | 97 | def dbscan_scorer(self, metric, min_similarity): 98 | for k in range(2, 12): 99 | new_data = self.new_data[self.column_names] 100 | historical_data = self.historical_data[self.column_names] 101 | new_data_clusters = self.dbscan_clusters(new_data) 102 | historical_data_clusters = self.dbscan_clusters(historical_data) 103 | score = metric( 104 | new_data_clusters, historical_data_clusters) 105 | if score < min_similarity: 106 | return False 107 | return True 108 | 109 | def mutual_info_dbscan_scorer(self, min_similarity): 110 | return self.dbscan_scorer( 111 | metrics.adjusted_mutual_info_score, 112 | min_similarity 113 | ) 114 | 115 | def adjusted_rand_dbscan_scorer(self, min_similarity): 116 | return self.dbscan_scorer( 117 | metrics.adjusted_rand_score, 118 | min_similarity 119 | ) 120 | 121 | def completeness_dbscan_scorer(self, min_similarity): 122 | return self.dbscan_scorer( 123 | metrics.completeness_score, 124 | min_similarity 125 | ) 126 | 127 | def fowlkes_mallows_dbscan_scorer(self, min_similarity): 128 | return self.dbscan_scorer( 129 | metrics.fowlkes_mallows_score, 130 | min_similarity 131 | ) 132 | 133 | def homogeneity_dbscan_scorer(self, min_similarity): 134 | return self.dbscan_scorer( 135 | metrics.homogeneity_score, 136 | min_similarity 137 | ) 138 | 139 | def v_measure_dbscan_scorer(self, min_similarity): 140 | return self.dbscan_scorer( 141 | metrics.v_measure_score, 142 | min_similarity 143 | ) 144 | 145 | def unsupervised_dbscan_score_clustering(self, min_similarity): 146 | return all([ 147 | self.v_measure_dbscan_scorer(min_similarity), 148 | self.homogeneity_dbscan_scorer(min_similarity), 149 | self.fowlkes_mallows_dbscan_scorer(min_similarity), 150 | self.completeness_dbscan_scorer(min_similarity), 151 | self.adjusted_rand_dbscan_scorer(min_similarity), 152 | self.mutual_info_dbscan_scorer(min_similarity), 153 | ]) 154 | 155 | class KnnClustering(): 156 | def __init__(self, 157 | new_data, 158 | historical_data, 159 | column_names, 160 | target_name): 161 | self.column_names = column_names 162 | self.target_name = target_name 163 | self.new_data = new_data 164 | self.historical_data = historical_data 165 | 166 | def reg_supervised_clustering(self, data): 167 | k_measures = [] 168 | X = data[self.column_names] 169 | y = data[self.target_name] 170 | for k in range(2, 12): 171 | knn = neighbors.KNeighborsRegressor(n_neighbors=k) 172 | knn.fit(X, y) 173 | y_pred = knn.predict(X) 174 | k_measures.append((k, metrics.mean_squared_error(y, y_pred))) 175 | sorted_k_measures = sorted(k_measures, key=lambda t:t[1]) 176 | lowest_mse = sorted_k_measures[0] 177 | best_k = lowest_mse[0] 178 | return best_k 179 | 180 | def reg_supervised_similar_clustering(self, absolute_distance): 181 | historical_k = 
self.reg_supervised_clustering(self.historical_data) 182 | new_k = self.reg_supervised_clustering(self.new_data) 183 | if abs(historical_k - new_k) > absolute_distance: 184 | return False 185 | else: 186 | return True 187 | 188 | def cls_supervised_clustering(self, data): 189 | k_measures = [] 190 | X = data[self.column_names] 191 | y = data[self.target_name] 192 | for k in range(2, 12): 193 | knn = neighbors.KNeighborsClassifier(n_neighbors=k) 194 | knn.fit(X, y) 195 | y_pred = knn.predict(X) 196 | k_measures.append((k, metrics.mean_squared_error(y, y_pred))) 197 | sorted_k_measures = sorted(k_measures, key=lambda t:t[1]) 198 | lowest_mse = sorted_k_measures[0] 199 | best_k = lowest_mse[0] 200 | return best_k 201 | 202 | def cls_supervised_similar_clustering(self, absolute_distance): 203 | historical_k = self.cls_supervised_clustering(self.historical_data) 204 | new_k = self.cls_supervised_clustering(self.new_data) 205 | if abs(historical_k - new_k) > absolute_distance: 206 | return False 207 | else: 208 | return True 209 | 210 | class StructuralData(KnnClustering, 211 | DBscanClustering, 212 | KmeansClustering): 213 | def __init__(self, 214 | new_data, 215 | historical_data, 216 | column_names, 217 | target_name): 218 | self.column_names = column_names 219 | self.target_name = target_name 220 | self.new_data = new_data 221 | self.historical_data = historical_data 222 | 223 | -------------------------------------------------------------------------------- /drifter_ml/timeseries_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EricSchles/drifter_ml/198a2e4a0b6310765e551f3122ff0ea8b04ed900/drifter_ml/timeseries_tests/__init__.py -------------------------------------------------------------------------------- /drifter_ml/timeseries_tests/timeseries_tests.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The goal of this model is to test for model drift: Does the model behave the same 3 | way over time? 4 | 5 | Is the model and data consistent over time? 6 | 7 | We can think of this through the following questions, 8 | 9 | * do the same inputs produce the same outputs, over time? 10 | * how sensitive is the model to input data? 11 | * what is the distribution of predictions over time? 12 | * what are the marginal distributions of the data over time? 13 | * As the marginal distributions change, how much does the distribution of predictions change, over time? 14 | * how stable is the distribution of predictions over time? (for regression) 15 | * how stable are the percentages per class of the predictions over time? (for classification) 16 | * how likely are certain predictions over time? (for classification) 17 | * how likely are certain ranges of predictions over time? (for regression) 18 | 19 | * how much data do we expect to be misclassified over time? (for classification) 20 | * precision 21 | * recall 22 | * f1 score 23 | * how much error do we expect over time? (for regression) 24 | * mean squared error 25 | * median absolute error 26 | * trimean absolute error 27 | * how many outliers do we expect in the data over time? (using various techniques) 28 | * how likely is it the data is drawn from the same distribution over a given time frame? (using distribution similarity tests) 29 | * how sensitive is the model to changes in each marginal variable over time? 
(regression and classification) IE, if we change each variable while holding all others constant, how many values do we need to change to produce a significant change in the prediction (significant increase in the output for regression) or change of class for classification? 30 | * how sensitive is the model to the marginal effects of n variables? (with the above set up) where n varies from 1 to the total number of variables in the data 31 | * how do various feature selection algorithms change on the data over time? aka which features are statistically significant over time? 32 | * how much of the data is missing over time? 33 | ''' 34 | from backtester import metrics as bt_metrics 35 | import pandas as pd 36 | import datetime 37 | 38 | class TimeSeriesClassificationTests: 39 | """ 40 | The general goal of this class is to test 41 | classification algorithms over time. 42 | The class expects the following parameters: 43 | 44 | * descriptors : arraylike 45 | A set of descriptions of a model. This ought to 46 | be a classification metric like precision, recall, or 47 | f1-score or a loss like log loss. 48 | 49 | * timestamps : arraylike 50 | A set of timestamps associated with the descriptors. 51 | this will be important for some of the metrics used. 52 | Each element should be of time datetime.datetime. 53 | 54 | The way in which classification algorithms is assessed via 55 | hypothesis tests and time series metrics. The time series 56 | metrics come to us from backtester, another framework I developed. 57 | Each timeseries metric is standard where the expectation is 58 | that data is compared against a forecast. 59 | A simple moving average is used for the forecast model to make 60 | sure the only thing we are trying to capture is how much the model 61 | has changed recently. 62 | 63 | For this reason, the number of lag periods is very important. If you 64 | set this number too low, you may think everything is fine, when in fact 65 | things are actually changing quiet rapidly. If you set the number of lags 66 | too long, then you may capture bugs from the last anomaly, and thus won't 67 | capture the next. 68 | 69 | A good rule of thumb is to set the number of lags for a week, assuming everything 70 | has been fine. And set it for 5 periods after the last bug, to assess normality. 71 | 72 | It may make sense to initialize multiple instances of the class, if 73 | you want to capture things at different levels of granularity. 74 | """ 75 | def __init__(self, descriptors, timestamps, lags=10): 76 | self.descriptors = discriptors 77 | self.timestamps = timestamps 78 | self.lags = lags 79 | self.series = self._generate_series() 80 | 81 | def _generate_series(self): 82 | return pd.Series( 83 | data = self.descriptors, 84 | index = self.timestamps 85 | ) 86 | 87 | def _apply_metric(self, metric, forecast_start, max_error): 88 | y_true = series[forecast_start:] 89 | y_pred = self.series.rolling(window=self.lags).mean() 90 | y_pred = y_pred[forecast_start:] 91 | error = metric( 92 | y_true, y_pred 93 | ) 94 | return error < max_error 95 | 96 | def root_mean_squared_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 97 | """ 98 | The root mean squared error is a standard metric for 99 | assessing error in a regression problem. It lends itself 100 | naturally to the forecast context because of its application 101 | of a euclidean metric as well as taking of the average. 102 | 103 | An average is especially advantegous due to its sensitivity 104 | to outliers. 
105 | 106 | Parameters 107 | ---------- 108 | * forecast_start : datetime.datetime 109 | The starting timestamp to begin the forecast. 110 | Observations of the descriptor after the start time will be checked. 111 | Special care should be given when choosing the start forecast. 112 | 113 | * max_error: float 114 | The maximum allowed error or tolerance of the forecast. 115 | If we are dealing with a score function like f1-score 116 | it is imperative that we set max_error below 1.0. 117 | 118 | Return 119 | ------ 120 | True if the root mean squared error of 121 | the forecast and actual error is below the max_error. 122 | False otherwise 123 | """ 124 | return self._apply_metric( 125 | bt_metric.root_mean_squared_error, 126 | forecast_start, max_error 127 | ) 128 | 129 | def normalized_root_mean_squared_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 130 | """ 131 | The normalized root mean squared error takes into account scale. 132 | It is not recommended that the normalized root mean squared error 133 | be used if your descriptor is a score, since those are already bounded 134 | between (0.0, 1.0). If you are dealing with a loss function, then 135 | the normalized root mean squared error may be advantegous as sense of 136 | scale is removed. 137 | 138 | Since there is no standard convention for how to normalize the choice 139 | of max - min of the observations is used as a choice for normalization. 140 | 141 | Parameters 142 | ---------- 143 | * forecast_start : datetime.datetime 144 | The starting timestamp to begin the forecast. 145 | Observations of the descriptor after the start time will be checked. 146 | Special care should be given when choosing the start forecast. 147 | 148 | * max_error: float 149 | The maximum allowed error or tolerance of the forecast. 150 | If we are dealing with a score function like f1-score 151 | it is imperative that we set max_error below 1.0. 152 | 153 | Return 154 | ------ 155 | True if the normalized root mean squared error of 156 | the forecast and actual error is below the max_error. 157 | False otherwise 158 | """ 159 | return self._apply_metric( 160 | bt_metric.normalized_root_mean_squared_error, 161 | forecast_start, max_error 162 | ) 163 | 164 | def mean_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 165 | """ 166 | Perhaps the most naive metric I could think of, mean error 167 | is simply the average error of the forecast against the 168 | observations. 169 | 170 | As a result, this measure will be sensitive to outliers, which may 171 | be advantegous for assessing deviance quickly and obviously. 172 | 173 | Parameters 174 | ---------- 175 | * forecast_start : datetime.datetime 176 | The starting timestamp to begin the forecast. 177 | Observations of the descriptor after the start time will be checked. 178 | Special care should be given when choosing the start forecast. 179 | 180 | * max_error: float 181 | The maximum allowed error or tolerance of the forecast. 182 | If we are dealing with a score function like f1-score 183 | it is imperative that we set max_error below 1.0. 184 | 185 | Return 186 | ------ 187 | True if the mean error of the forecast 188 | and actual error is below the max_error. 
189 | False otherwise 190 | """ 191 | return self._apply_metric( 192 | bt_metric.mean_error, 193 | forecast_start, max_error 194 | ) 195 | 196 | def mean_absolute_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 197 | """ 198 | Perhaps one of the most naive metrics out there, mean absolute error 199 | is simply the average of the absolute value of the error of the forecast against the 200 | observations. 201 | 202 | It ought to be the same as mean error, because score functions are bounded to the 203 | range (0.0, 1.0) and loss functions should never be negative. That said 204 | it is always possible something went wrong. It therefore might be useful 205 | to run mean absolute error and mean error with the same parameters. If 206 | one passes and the other fails, this will be a good signal that something is 207 | wrong with your set up. 208 | 209 | Parameters 210 | ---------- 211 | * forecast_start : datetime.datetime 212 | The starting timestamp to begin the forecast. 213 | Observations of the descriptor after the start time will be checked. 214 | Special care should be given when choosing the start forecast. 215 | 216 | * max_error: float 217 | The maximum allowed error or tolerance of the forecast. 218 | If we are dealing with a score function like f1-score 219 | it is imperative that we set max_error below 1.0. 220 | 221 | Return 222 | ------ 223 | True if the mean absolute error of the forecast 224 | and actual error is below the max_error. 225 | False otherwise 226 | """ 227 | return self._apply_metric( 228 | bt_metric.mean_absolute_error, 229 | forecast_start, max_error 230 | ) 231 | 232 | def median_absolute_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 233 | """ 234 | The median absolute error is an interesting metric to look at. It ignores outliers, 235 | so it may be used as an expectation of normalcy without the outliers. Comparing 236 | median absolute error and mean absolute error might give a sense of how much outliers 237 | are effecting centrality. 238 | 239 | Parameters 240 | ---------- 241 | * forecast_start : datetime.datetime 242 | The starting timestamp to begin the forecast. 243 | Observations of the descriptor after the start time will be checked. 244 | Special care should be given when choosing the start forecast. 245 | 246 | * max_error: float 247 | The maximum allowed error or tolerance of the forecast. 248 | If we are dealing with a score function like f1-score 249 | it is imperative that we set max_error below 1.0. 250 | 251 | Return 252 | ------ 253 | True if the median absolute error of the forecast 254 | and actual error is below the max_error. 255 | False otherwise 256 | """ 257 | return self._apply_metric( 258 | bt_metric.median_absolute_error, 259 | forecast_start, max_error 260 | ) 261 | 262 | def variance_absolute_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 263 | """ 264 | The variance absolute error gives us a sense of the variance in our error. This way 265 | we can directly interrogate variability in our absolute error. And we can set boundaries 266 | for the maximum boundary on deviances from our forecast. 267 | 268 | Parameters 269 | ---------- 270 | * forecast_start : datetime.datetime 271 | The starting timestamp to begin the forecast. 272 | Observations of the descriptor after the start time will be checked. 273 | Special care should be given when choosing the start forecast. 
274 | 275 | * max_error: float 276 | The maximum allowed error or tolerance of the forecast. 277 | If we are dealing with a score function like f1-score 278 | it is imperative that we set max_error below 1.0. 279 | 280 | Return 281 | ------ 282 | True if the variance absolute error of the forecast 283 | and actual error is below the max_error. 284 | False otherwise 285 | """ 286 | return self._apply_metric( 287 | bt_metric.median_absolute_error, 288 | forecast_start, max_error 289 | ) 290 | 291 | def mean_squared_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 292 | """ 293 | The mean squared error is a canonical measure of error. It overstates large deviations 294 | of individual examples while marginalizing the effect size of any deviances of deviations 295 | smaller than one. Because the mean is used, large values are overstated, thus individual 296 | large deviations will tend to become apparent. For the mean squared error to be small, 297 | therefore no extreme deviances must exist. However relatively small deviances across 298 | many or even all samples will be understated. 299 | 300 | Parameters 301 | ---------- 302 | * forecast_start : datetime.datetime 303 | The starting timestamp to begin the forecast. 304 | Observations of the descriptor after the start time will be checked. 305 | Special care should be given when choosing the start forecast. 306 | 307 | * max_error: float 308 | The maximum allowed error or tolerance of the forecast. 309 | If we are dealing with a score function like f1-score 310 | it is imperative that we set max_error below 1.0. 311 | 312 | Return 313 | ------ 314 | True if the mean squared error of the forecast 315 | and actual error is below the max_error. 316 | False otherwise 317 | """ 318 | return self._apply_metric( 319 | bt_metric.mean_squared_error, 320 | forecast_start, max_error 321 | ) 322 | 323 | def mean_squared_log_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 324 | """ 325 | The mean squared log error is a variant on mean squared error. Mean squared log error 326 | measures the relative difference between the true and predicted values. 327 | It over penalizes underestimates, cases where the predicted value is less than 328 | the true value, more than it penalizes overestimates, cases where the predicted 329 | value is more than the true value. This is because it's a MSLE is a ratio of the two. 330 | 331 | This measure is especially useful if you want to check if your prediction is smaller 332 | than your actual timeseries. Therefore it is very useful for accuracy and less 333 | useful for error metrics. 334 | 335 | Parameters 336 | ---------- 337 | * forecast_start : datetime.datetime 338 | The starting timestamp to begin the forecast. 339 | Observations of the descriptor after the start time will be checked. 340 | Special care should be given when choosing the start forecast. 341 | 342 | * max_error: float 343 | The maximum allowed error or tolerance of the forecast. 344 | If we are dealing with a score function like f1-score 345 | it is imperative that we set max_error below 1.0. 346 | 347 | Return 348 | ------ 349 | True if the mean squared error of the forecast 350 | and actual error is below the max_error. 
351 | False otherwise 352 | """ 353 | return self._apply_metric( 354 | bt_metric.mean_squared_log_error, 355 | forecast_start, max_error 356 | ) 357 | 358 | def root_mean_squared_log_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 359 | """ 360 | The root mean squared log error is a variant on mean squared error. 361 | Root mean squared log error measures the relative difference between 362 | the true and predicted values. It over penalizes underestimates, cases 363 | where the predicted value is less than the true value, more than it 364 | penalizes overestimates, cases where the predicted value is more than the true value. 365 | This is because it's a RMSLE is a ratio of the two. However unlike the MSLE 366 | by taking the root the penalization is diminished making this closer in measure 367 | to something like the mean squared error in terms of direction. 368 | 369 | Parameters 370 | ---------- 371 | * forecast_start : datetime.datetime 372 | The starting timestamp to begin the forecast. 373 | Observations of the descriptor after the start time will be checked. 374 | Special care should be given when choosing the start forecast. 375 | 376 | * max_error: float 377 | The maximum allowed error or tolerance of the forecast. 378 | If we are dealing with a score function like f1-score 379 | it is imperative that we set max_error below 1.0. 380 | 381 | Return 382 | ------ 383 | True if the mean squared error of the forecast 384 | and actual error is below the max_error. 385 | False otherwise 386 | """ 387 | return self._apply_metric( 388 | bt_metric.root_mean_squared_log_error, 389 | forecast_start, max_error 390 | ) 391 | 392 | 393 | # iqr_absolute_error 394 | # geometric_mean_absolute_error 395 | # mean_percentage_error 396 | # mean_absolute_percentage_error 397 | # median_absolute_percentage_error 398 | # symmetric_mean_absolute_percentage_error 399 | # symmetric_median_absolute_percentage_error 400 | # mean_arctangent_absolute_percentage_error 401 | # mean_absolute_scaled_error 402 | # normalized_absolute_error 403 | # normalized_absolute_percentage_error 404 | # root_mean_squared_percentage_error 405 | # root_median_squared_percentage_error 406 | # root_mean_squared_scaled_error 407 | # integral_normalized_root_squared_error 408 | # root_relative_squared_error 409 | # mean_relative_error 410 | # relative_absolute_error 411 | # mean_relative_absolute_error 412 | # median_relative_absolute_error 413 | # geometric_mean_relative_absolute_error 414 | # mean_bounded_relative_absolute_error 415 | # unscaled_mean_bounded_relative_absolute_error 416 | # mean_directional_accuracy 417 | 418 | -------------------------------------------------------------------------------- /example_models/static_examples/example_model.py: -------------------------------------------------------------------------------- 1 | from sklearn import tree 2 | from sklearn import ensemble 3 | from sklearn import model_selection 4 | import pandas as pd 5 | import numpy as np 6 | import joblib 7 | import code 8 | import json 9 | 10 | df = pd.DataFrame() 11 | for _ in range(1000): 12 | a = np.random.normal(0, 1) 13 | b = np.random.normal(0, 3) 14 | c = np.random.normal(12, 4) 15 | if a + b + c > 11: 16 | target = 1 17 | else: 18 | target = 0 19 | df = df.append({ 20 | "A": a, 21 | "B": b, 22 | "C": c, 23 | "target": target 24 | }, ignore_index=True) 25 | 26 | clf1 = tree.DecisionTreeClassifier() 27 | clf2 = ensemble.RandomForestClassifier() 28 | X = df[["A", "B", "C"]] 29 | clf1.fit(X, df["target"]) 30 | 
clf2.fit(X, df["target"]) 31 | #code.interact(local=locals()) 32 | joblib.dump(clf1, "model1.joblib") 33 | joblib.dump(clf2, "model1.joblib") 34 | json.dump({ 35 | "column_names": ["A", "B", "C"], 36 | "target_name": "target" 37 | }, open("model_metadata.json", "w")) 38 | df.to_csv("data.csv") 39 | -------------------------------------------------------------------------------- /example_models/static_examples/example_tests.py: -------------------------------------------------------------------------------- 1 | from drifter_ml import classification_tests 2 | import joblib 3 | import pandas as pd 4 | import code 5 | 6 | def test(): 7 | df = pd.read_csv("data.csv") 8 | column_names = ["A", "B", "C"] 9 | target_name = "target" 10 | clf = joblib.load("model1.joblib") 11 | 12 | test_suite = classification_tests.ClassificationTests(clf, 13 | df, 14 | target_name, 15 | column_names) 16 | classes = list(df.target.unique()) 17 | assert test_suite.classifier_testing( 18 | {klass: 0.9 for klass in classes}, 19 | {klass: 0.9 for klass in classes}, 20 | {klass: 0.9 for klass in classes} 21 | ) 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /example_models/static_examples/keras_example.py: -------------------------------------------------------------------------------- 1 | from keras.models import Sequential 2 | from keras.layers import Dense 3 | from keras.wrappers.scikit_learn import KerasClassifier 4 | import pandas as pd 5 | import numpy as np 6 | import joblib 7 | 8 | # Function to create model, required for KerasClassifier 9 | def create_model(): 10 | # create model 11 | model = Sequential() 12 | model.add(Dense(12, input_dim=3, activation='relu')) 13 | model.add(Dense(8, activation='relu')) 14 | model.add(Dense(1, activation='sigmoid')) 15 | # Compile model 16 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 17 | return model 18 | 19 | # fix random seed for reproducibility 20 | df = pd.DataFrame() 21 | for _ in range(1000): 22 | a = np.random.normal(0, 1) 23 | b = np.random.normal(0, 3) 24 | c = np.random.normal(12, 4) 25 | if a + b + c > 11: 26 | target = 1 27 | else: 28 | target = 0 29 | df = df.append({ 30 | "A": a, 31 | "B": b, 32 | "C": c, 33 | "target": target 34 | }, ignore_index=True) 35 | 36 | # split into input (X) and output (Y) variables 37 | # create model 38 | clf = KerasClassifier(build_fn=create_model, epochs=150, batch_size=10, verbose=0) 39 | X = df[["A", "B", "C"]] 40 | clf.fit(X, df["target"]) 41 | joblib.dump(clf, "model.joblib") 42 | df.to_csv("data.csv") 43 | -------------------------------------------------------------------------------- /example_models/static_examples/model.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EricSchles/drifter_ml/198a2e4a0b6310765e551f3122ff0ea8b04ed900/example_models/static_examples/model.joblib -------------------------------------------------------------------------------- /example_models/static_examples/model1.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EricSchles/drifter_ml/198a2e4a0b6310765e551f3122ff0ea8b04ed900/example_models/static_examples/model1.joblib -------------------------------------------------------------------------------- /example_models/static_examples/model_metadata.json: -------------------------------------------------------------------------------- 1 | {"column_names": ["A", "B", 
"C"], "target_name": "target"} -------------------------------------------------------------------------------- /example_models/static_examples/prototype_test_framework.py: -------------------------------------------------------------------------------- 1 | import joblib 2 | import json 3 | from sklearn import metrics 4 | import numpy as np 5 | import time 6 | from sklearn import neighbors 7 | from scipy import stats 8 | from sklearn.model_selection import cross_val_score 9 | 10 | # classification tests 11 | class ModelClassificationTestSuite(): 12 | def __init__(self, clf_name, clf_metadata, data_filename): 13 | clf, metadata, colum_names, target_name, test_data = self.get_parameters( 14 | clf_name, clf_metadata, data_filename) 15 | self.clf = clf 16 | self.data_filename 17 | self.metadata = metadata 18 | self.column_names = column_names 19 | self.target_name = target_name 20 | self.test_data = test_data 21 | self.y = test_data[target_name] 22 | self.X = test_data[column_names] 23 | self.classes = set(self.y) 24 | 25 | # potentially include hyper parameters from the model 26 | # algorithm could be stored in metadata 27 | def get_parameters(self, clf_name, clf_metadata, data_filename): 28 | clf = joblib.load(clf_name) 29 | metadata = json.load(open(clf_metadata, "r")) 30 | column_names = metadata["column_names"] 31 | target_name = metadata["target_name"] 32 | test_data = pd.read_csv(data_name) 33 | return clf, metadata, column_names, target_name, test_data 34 | 35 | def precision_lower_boundary_per_class(self, lower_boundary): 36 | y_pred = self.clf.predict(self.X) 37 | for class_info in lower_boundary["per_class"]: 38 | klass = class_info["class"] 39 | y_pred_class = np.take(y_pred, self.y[self.y == klass].index, axis=0) 40 | y_class = self.y[self.y == klass] 41 | if metrics.precision_score(y_class, y_pred_class) < class_info["precision_score"]: 42 | return False 43 | return True 44 | 45 | def recall_lower_boundary_per_class(self, lower_boundary): 46 | y_pred = self.clf.predict(self.X) 47 | for class_info in lower_boundary["per_class"]: 48 | klass = class_info["class"] 49 | y_pred_class = np.take(y_pred, self.y[self.y == klass].index, axis=0) 50 | y_class = self.y[self.y == klass] 51 | if metrics.recall_score(y_class, y_pred_class) < class_info["recall_score"]: 52 | return False 53 | return True 54 | 55 | def f1_lower_boundary_per_class(self, clf, test_data, target_name, column_names, lower_boundary): 56 | y_pred = self.clf.predict(self.X) 57 | for class_info in lower_boundary["per_class"]: 58 | klass = class_info["class"] 59 | y_pred_class = np.take(y_pred, self.y[self.y == klass].index, axis=0) 60 | y_class = self.y[self.y == klass] 61 | if metrics.f1_score(y_class, y_pred_class) < class_info["f1_score"]: 62 | return False 63 | return True 64 | 65 | def classifier_testing(self, precision_lower_boundary, recall_lower_boundary, f1_lower_boundary): 66 | precision_test = self.precision_lower_boundary_per_class(precision_lower_boundary) 67 | recall_test = self.recall_lower_boundary_per_class(recall_lower_boundary) 68 | f1_test = self.f1_lower_boundary_per_class(f1_lower_boundary) 69 | if precision_test and recall_test and f1_test: 70 | return True 71 | else: 72 | return False 73 | 74 | def run_time_stress_test(self, performance_boundary): 75 | for performance_info in performance_boundary: 76 | n = int(performance_info["sample_size"]) 77 | max_run_time = float(performance_info["max_run_time"]) 78 | data = self.X.sample(n, replace=True) 79 | start_time = time.time() 80 | self.clf.predict(data) 
81 | model_run_time = time.time() - start_time 82 | if model_run_time > run_time: 83 | return False 84 | return True 85 | 86 | # post training - 87 | # todo: add model metric outside of some standard deviation 88 | # for many models 89 | # is the model non-empty 90 | # is the model deserializable 91 | 92 | # test against training and scoring 93 | 94 | class ModelRegressionTestSuite(): 95 | def __init__(self, reg_name, reg_metadata, data_filename): 96 | reg, reg_metadata, colum_names, target_name, test_data = self.get_parameters( 97 | reg_name, reg_metadata, data_filename) 98 | self.reg = reg 99 | self.data_filename 100 | self.metadata = metadata 101 | self.column_names = column_names 102 | self.target_name = target_name 103 | self.test_data = test_data 104 | self.y = test_data[target_name] 105 | self.X = test_data[column_names] 106 | 107 | def get_parameters(self, reg_name, reg_metadata, data_filename): 108 | reg = joblib.load(reg_name) 109 | metadata = json.load(open(reg_metadata, "r")) 110 | column_names = metadata["column_names"] 111 | target_name = metadata["target_name"] 112 | test_data = pd.read_csv(data_name) 113 | return reg, metadata, column_names, target_name, test_data 114 | 115 | def mse_upper_boundary(upper_boundary): 116 | y_pred = self.reg.predict(self.X) 117 | if metrics.mean_squared_error(self.y, y_pred) > upper_boundary: 118 | return False 119 | return True 120 | 121 | def mae_upper_boundary(upper_boundary): 122 | y_pred = self.reg.predict(self.X) 123 | if metrics.median_absolute_error(self.y, y_pred) > upper_boundary: 124 | return False 125 | return True 126 | 127 | def regression_testing(mse_upper_boundary, mae_upper_boundary): 128 | mse_test = self.mse_upper_boundary(mse_upper_boundary) 129 | mae_test = self.mae_upper_boundary(mae_upper_boundary) 130 | if mse_test and mae_test: 131 | return True 132 | else: 133 | return False 134 | 135 | def run_time_stress_test(self, performance_boundary): 136 | for performance_info in performance_boundary: 137 | n = int(performance_info["sample_size"]) 138 | max_run_time = float(performance_info["max_run_time"]) 139 | data = self.X.sample(n, replace=True) 140 | start_time = time.time() 141 | self.reg.predict(data) 142 | model_run_time = time.time() - start_time 143 | if model_run_time > run_time: 144 | return False 145 | return True 146 | 147 | class ClassifierComparison(): 148 | def __init__(self, clf_one_name, clf_one_metadata, clf_two_name, clf_two_metadata, data_filename): 149 | clf_one, metadata_one, colum_names, target_name, test_data = self.get_parameters( 150 | clf_one_name, clf_one_metadata, data_filename) 151 | clf_two, metadata_two, colum_names, target_name, test_data = self.get_parameters( 152 | clf_two_name, clf_two_metadata, data_filename) 153 | self.clf_one = clf_one 154 | self.clf_two = clf_two 155 | self.data_filename 156 | self.metadata_one = metadata_one 157 | self.metadata_two = metadata_two 158 | self.column_names = column_names 159 | self.target_name = target_name 160 | self.test_data = test_data 161 | self.y = test_data[target_name] 162 | self.X = test_data[column_names] 163 | self.classes = set(self.y) 164 | 165 | def two_model_prediction_run_time_stress_test(self, performance_boundary): 166 | for performance_info in performance_boundary: 167 | n = int(performance_info["sample_size"]) 168 | data = self.X.sample(n, replace=True) 169 | start_time = time.time() 170 | self.clf_one.predict(data) 171 | model_one_run_time = time.time() - start_time 172 | start_time = time.time() 173 | self.clf_two.predict(data) 174 | 
model_two_run_time = time.time() - start_time 175 | # we assume model one should be faster than model two 176 | if model_one_run_time > model_two_run_time: 177 | return False 178 | return True 179 | 180 | def precision_per_class(self, clf, test_data, target_name, column_names): 181 | y = test_data[target_name] 182 | classes = set(y) 183 | y_pred = clf.predict(test_data[column_names]) 184 | precision = {} 185 | for klass in classes: 186 | y_pred_class = np.take(y_pred, y[y == klass].index, axis=0) 187 | y_class = y[y == klass] 188 | precision[klass] = metrics.precision_score(y_class, y_pred_class) 189 | return precision 190 | 191 | def recall_per_class(self, clf, test_data, target_name, column_names): 192 | y = test_data[target_name] 193 | classes = set(y) 194 | y_pred = clf.predict(test_data[column_names]) 195 | recall = {} 196 | for klass in classes: 197 | y_pred_class = np.take(y_pred, y[y == klass].index, axis=0) 198 | y_class = y[y == klass] 199 | recall[klass] = metrics.recall_score(y_class, y_pred_class) 200 | return recall 201 | 202 | def f1_per_class(self, clf, test_data, target_name, column_names): 203 | y = test_data[target_name] 204 | classes = set(y) 205 | y_pred = clf.predict(test_data[column_names]) 206 | f1 = {} 207 | for klass in classes: 208 | y_pred_class = np.take(y_pred, y[y == klass].index, axis=0) 209 | y_class = y[y == klass] 210 | f1[klass] = metrics.f1_score(y_class, y_pred_class) 211 | return f1 212 | 213 | def two_model_classifier_testing(self): 214 | precision_one_test = self.precision_per_class(self.clf_one) 215 | recall_one_test = self.recall_per_class(self.clf_one) 216 | f1_one_test = self.f1_per_class(self.clf_one) 217 | precision_two_test = precision_per_class(self.clf_two) 218 | recall_two_test = recall_per_class(self.clf_two) 219 | f1_two_test = f1_per_class(self.clf_two) 220 | 221 | precision_result = precision_one_test > precision_two_test 222 | recall_result = recall_one_test > recall_two_test 223 | f1_result = f1_one_test > f1_two_test 224 | if precision_result and recall_result and f1_result: 225 | return True 226 | else: 227 | return False 228 | 229 | class RegressionComparison(): 230 | def __init__(self, reg_one_name, reg_one_metadata, reg_two_name, reg_two_metadata, data_filename): 231 | reg_one, metadata_one, colum_names, target_name, test_data = self.get_parameters( 232 | reg_one_name, reg_one_metadata, data_filename) 233 | reg_two, metadata_two, colum_names, target_name, test_data = self.get_parameters( 234 | reg_two_name, reg_two_metadata, data_filename) 235 | self.reg_one = reg_one 236 | self.reg_two = reg_two 237 | self.data_filename 238 | self.metadata_one = metadata_one 239 | self.metadata_two = metadata_two 240 | self.column_names = column_names 241 | self.target_name = target_name 242 | self.test_data = test_data 243 | self.y = test_data[target_name] 244 | self.X = test_data[column_names] 245 | 246 | def two_model_prediction_run_time_stress_test(self, performance_boundary): 247 | for performance_info in performance_boundary: 248 | n = int(performance_info["sample_size"]) 249 | data = self.X.sample(n, replace=True) 250 | start_time = time.time() 251 | self.reg_one.predict(data) 252 | model_one_run_time = time.time() - start_time 253 | start_time = time.time() 254 | self.reg_two.predict(data) 255 | model_two_run_time = time.time() - start_time 256 | # we assume model one should be faster than model two 257 | if model_one_run_time > model_two_run_time: 258 | return False 259 | return True 260 | 261 | def mse_result(self, reg): 262 | y_pred 
= reg.predict(self.X) 263 | return metrics.mean_squared_error(self.y, y_pred) 264 | 265 | def mae_result(self, reg): 266 | y_pred = reg.predict(self.X) 267 | return metrics.median_absolute_error(self.y, y_pred) 268 | 269 | def two_model_regression_testing(self): 270 | mse_one_test = self.mse_result(self.reg_one) 271 | mae_one_test = self.mae_result(self.reg_one) 272 | mse_two_test = self.mse_result(self.reg_two) 273 | mae_two_test = self.mae_result(self.reg_two) 274 | if mse_one_test < mse_two_test and mae_one_test < mae_two_test: 275 | return True 276 | else: 277 | return False 278 | 279 | # data tests 280 | class DataSanitization(): 281 | def __init__(self, data_filename): 282 | self.data_filename 283 | self.data = pd.read_csv(data_filename) 284 | 285 | def is_complete(self, column): 286 | return self.data[column].isnull().sum() == 0 287 | 288 | def has_completeness(self, column, threshold): 289 | return self.data[column].isnull().sum()/len(self.data) > threshold 290 | 291 | def is_unique(self, column): 292 | return len(self.data[column].unique())/len(self.data) == 1 293 | 294 | def has_uniqueness(column, threshold): 295 | return len(self.data[column].unique())/len(self.data) > threshold 296 | 297 | def is_in_range(column, lower_bound, upper_bound, threshold): 298 | return self.data[(self.data[column] <= upper_bound) & (self.data[column] >= lower_bound)]/len(self.data) > threshold 299 | 300 | def is_non_negative(column): 301 | return self.data[self.data[column] > 0] 302 | 303 | def is_less_than(column_one, column_two): 304 | return self.data[self.data[column_one] < self.data[column_two]].all() 305 | 306 | # memoryful tests 307 | class StructuralData(): 308 | def __init__(self, metadata, data_filename): 309 | metadata, column_names, target_name, test_data = self.get_parameters( 310 | metadata, data_filename) 311 | self.data_filename 312 | self.metadata = metadata 313 | self.column_names = column_names 314 | self.target_name = target_name 315 | self.test_data = test_data 316 | self.y = test_data[target_name] 317 | self.X = test_data[column_names] 318 | 319 | def get_parameters(self, metadata, data_filename): 320 | metadata = json.load(open(clf_metadata, "r")) 321 | column_names = metadata["column_names"] 322 | target_name = metadata["target_name"] 323 | test_data = pd.read_csv(data_name) 324 | return metadata, column_names, target_name, test_data 325 | 326 | def reg_clustering(self, data, columns, target): 327 | k_measures = [] 328 | for k in range(2, 12): 329 | knn = neighbors.KNeighborsRegressor(n_neighbors=k) 330 | knn.fit(self.X, self.y) 331 | y_pred = knn.predict(self.X) 332 | k_measures.append((k, metrics.mean_squared_error(self.y, y_pred))) 333 | sorted_k_measures = sorted(k_measures, key=lambda t:t[1]) 334 | lowest_mse = sorted_k_measures[0] 335 | best_k = lowest_mse[0] 336 | return best_k 337 | 338 | def reg_similar_clustering(self, absolute_distance, new_data, historical_data, column_names, target_name): 339 | historical_k = reg_clustering(historical_data, column_names, target_name) 340 | new_k = reg_clustering(new_data, column_names, target_name) 341 | if abs(historical_k - new_k) > absolute_distance: 342 | return False 343 | else: 344 | return True 345 | 346 | # this was never updated 347 | def cls_clustering(self): 348 | k_measures = [] 349 | for k in range(2, 12): 350 | knn = neighbors.KNeighborsRegressor(n_neighbors=k) 351 | knn.fit(self.X, self.y) 352 | y_pred = knn.predict(self.X) 353 | k_measures.append((k, metrics.mean_squared_error(self.y, y_pred))) 354 | 
sorted_k_measures = sorted(k_measures, key=lambda t:t[1]) 355 | lowest_mse = sorted_k_measures[0] 356 | best_k = lowest_mse[0] 357 | return best_k 358 | 359 | def cls_similiar_clustering(absolute_distance, new_data, historical_data, column_names, target_name): 360 | historical_k = cls_clustering(historical_data, column_names, target_name) 361 | new_k = cls_clustering(new_data, column_names, target_name) 362 | if abs(historical_k - new_k) > absolute_distance: 363 | return False 364 | else: 365 | return True 366 | 367 | # this needs work 368 | class ColumnarData(): 369 | def similiar_correlation(correlation_lower_bound, new_data, historical_data, column_names, pvalue_threshold=0.05): 370 | for column_name in column_names: 371 | correlation_info = stats.spearmanr(new_data[column_name], historical_data[column_name]) 372 | if correlation_info.pvalue > pvalue_threshold: 373 | return False 374 | if correlation_info.correlation < correlation_lower_bound: 375 | return False 376 | return True 377 | 378 | def similiar_distribution(new_data, historical_data, column_names, pvalue_threshold=0.05): 379 | for column_name in column_names: 380 | distribution_info = stats.ks_2samp(new_data[column_name], historical_data[column_name]) 381 | if correlation_info.pvalue < pvalue_threshold: 382 | return False 383 | return True 384 | 385 | # does the preprocessing break? 386 | # does the model build? 387 | # does the model meet some threshold? 388 | # add memoryful tests for measures over time (like over several days) 389 | -------------------------------------------------------------------------------- /example_models/static_examples/random_file.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import random 3 | import code 4 | 5 | Card = collections.namedtuple('Card', ['rank', 'suit']) 6 | 7 | class FrenchDeck: 8 | ranks = [str(n) for n in range(2, 11)] + list('JQKA') 9 | suits = 'spades diamonds clubs hearts'.split() 10 | 11 | def __init__(self): 12 | self._cards = [Card(rank, suit) for suit in self.suits 13 | for rank in self.ranks] 14 | 15 | def __len__(self): 16 | return len(self._cards) 17 | 18 | def __getitem__(self, position): 19 | return self._cards[position] 20 | 21 | if __name__ == '__main__': 22 | deck = FrenchDeck() 23 | card = random.choice(deck) 24 | code.interact(local=locals()) 25 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | scikit-learn 3 | scipy 4 | numpy 5 | pandas 6 | mlxtend 7 | energyusage 8 | backtester 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from setuptools import setup 3 | 4 | # The directory containing this file 5 | HERE = pathlib.Path(__file__).parent 6 | 7 | # The text of the README file 8 | README = (HERE / "README.md").read_text() 9 | 10 | # This call to setup() does all the work 11 | setup( 12 | name="drifter_ml", 13 | version="0.25", 14 | description="Testing for models confirming to the scikit-learn api", 15 | long_description=README, 16 | long_description_content_type="text/markdown", 17 | url="https://github.com/EricSchles/drifter_ml", 18 | author="Eric Schles", 19 | author_email="ericschles@gmail.com", 20 | license="MIT", 21 | classifiers=[ 22 | "License :: OSI Approved :: MIT License", 23 | "Programming 
Language :: Python :: 3", 24 | "Programming Language :: Python :: 3.6", 25 | "Programming Language :: Python :: 3.7", 26 | ], 27 | packages=["drifter_ml", 'drifter_ml.classification_tests', 'drifter_ml.columnar_tests', 28 | 'drifter_ml.regression_tests', 'drifter_ml.structural_tests'], 29 | include_package_data=True, 30 | install_requires=["sklearn", "scipy", "numpy", 31 | "statsmodels", "mlxtend", "pytest", 32 | "energyusage", "backtester"], 33 | ) 34 | -------------------------------------------------------------------------------- /tests/test_classification_tests.py: -------------------------------------------------------------------------------- 1 | from drifter_ml import classification_tests 2 | from sklearn import tree 3 | from sklearn import ensemble 4 | from sklearn import model_selection 5 | import numpy as np 6 | import pandas as pd 7 | import random 8 | 9 | def generate_binary_classification_data_and_models(): 10 | df = pd.DataFrame() 11 | for _ in range(1000): 12 | a = np.random.normal(0, 1) 13 | b = np.random.normal(0, 3) 14 | c = np.random.normal(12, 4) 15 | target = random.choice([0, 1]) 16 | df = df.append({ 17 | "A": a, 18 | "B": b, 19 | "C": c, 20 | "target": target 21 | }, ignore_index=True) 22 | 23 | clf1 = tree.DecisionTreeClassifier() 24 | clf2 = ensemble.RandomForestClassifier() 25 | column_names = ["A", "B", "C"] 26 | target_name = "target" 27 | X = df[column_names] 28 | clf1.fit(X, df[target_name]) 29 | clf2.fit(X, df[target_name]) 30 | return df, column_names, target_name, clf1, clf2 31 | 32 | def generate_multiclass_classification_data_and_models(): 33 | df = pd.DataFrame() 34 | for _ in range(1000): 35 | a = np.random.normal(0, 1) 36 | b = np.random.normal(0, 3) 37 | c = np.random.normal(12, 4) 38 | target = random.choice([0, 1, 2]) 39 | df = df.append({ 40 | "A": a, 41 | "B": b, 42 | "C": c, 43 | "target": target 44 | }, ignore_index=True) 45 | 46 | clf1 = tree.DecisionTreeClassifier() 47 | clf2 = ensemble.RandomForestClassifier() 48 | column_names = ["A", "B", "C"] 49 | target_name = "target" 50 | X = df[column_names] 51 | clf1.fit(X, df[target_name]) 52 | clf2.fit(X, df[target_name]) 53 | return df, column_names, target_name, clf1, clf2 54 | 55 | def test_precision_recall_f1_binary(): 56 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 57 | test_suite = classification_tests.ClassificationTests(clf, 58 | df, 59 | target_name, 60 | column_names) 61 | try: 62 | classes = list(df[target_name].unique()) 63 | test_suite.classifier_testing_per_class( 64 | {klass: 0.1 for klass in classes}, 65 | {klass: 0.1 for klass in classes}, 66 | {klass: 0.1 for klass in classes} 67 | ) 68 | assert True 69 | except: 70 | assert False 71 | 72 | def test_precision_recall_f1_multiclass(): 73 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 74 | test_suite = classification_tests.ClassificationTests(clf, 75 | df, 76 | target_name, 77 | column_names) 78 | try: 79 | classes = list(df[target_name].unique()) 80 | test_suite.classifier_testing_per_class( 81 | {klass: 0.1 for klass in classes}, 82 | {klass: 0.1 for klass in classes}, 83 | {klass: 0.1 for klass in classes}, 84 | average="micro" 85 | ) 86 | assert True 87 | except: 88 | assert False 89 | 90 | def test_roc_auc_cv_binary(): 91 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 92 | test_suite = classification_tests.ClassificationTests(clf, 93 | df, 94 | target_name, 95 | column_names) 96 | try: 97 | 
roc_auc_scores = test_suite.roc_auc_cv(3) 98 | assert isinstance(roc_auc_scores, list) 99 | assert isinstance(roc_auc_scores[0], float) 100 | assert len(roc_auc_scores) == 3 101 | except ValueError: 102 | assert True 103 | 104 | def test_f1_cv_binary(): 105 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 106 | test_suite = classification_tests.ClassificationTests(clf, 107 | df, 108 | target_name, 109 | column_names) 110 | f1_scores = test_suite.f1_cv(3) 111 | assert isinstance(f1_scores, list) 112 | assert isinstance(f1_scores[0], float) 113 | assert len(f1_scores) == 3 114 | 115 | def test_f1_cv_multiclass(): 116 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 117 | test_suite = classification_tests.ClassificationTests(clf, 118 | df, 119 | target_name, 120 | column_names) 121 | f1_scores = test_suite.f1_cv(3) 122 | assert isinstance(f1_scores, list) 123 | assert isinstance(f1_scores[0], float) 124 | assert len(f1_scores) == 3 125 | 126 | def test_recall_cv_binary(): 127 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 128 | test_suite = classification_tests.ClassificationTests(clf, 129 | df, 130 | target_name, 131 | column_names) 132 | recall_scores = test_suite.recall_cv(3) 133 | assert isinstance(recall_scores, list) 134 | assert isinstance(recall_scores[0], float) 135 | assert len(recall_scores) == 3 136 | 137 | def test_recall_cv_multiclass(): 138 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 139 | test_suite = classification_tests.ClassificationTests(clf, 140 | df, 141 | target_name, 142 | column_names) 143 | recall_scores = test_suite.recall_cv(3) 144 | assert isinstance(recall_scores, list) 145 | assert isinstance(recall_scores[0], float) 146 | assert len(recall_scores) == 3 147 | 148 | def test_precision_cv_binary(): 149 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 150 | test_suite = classification_tests.ClassificationTests(clf, 151 | df, 152 | target_name, 153 | column_names) 154 | precision_scores = test_suite.precision_cv(3) 155 | assert isinstance(precision_scores, list) 156 | assert isinstance(precision_scores[0], float) 157 | assert len(precision_scores) == 3 158 | 159 | def test_precision_cv_multiclass(): 160 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 161 | test_suite = classification_tests.ClassificationTests(clf, 162 | df, 163 | target_name, 164 | column_names) 165 | precision_scores = test_suite.precision_cv(3) 166 | assert isinstance(precision_scores, list) 167 | assert isinstance(precision_scores[0], float) 168 | assert len(precision_scores) == 3 169 | 170 | def test_precision_metric(): 171 | fixed_metrics = classification_tests.FixedClassificationMetrics() 172 | assert 1.0 == fixed_metrics.precision_score([0,0,0], [0,0,0]) 173 | 174 | def test_recall_metric(): 175 | fixed_metrics = classification_tests.FixedClassificationMetrics() 176 | assert 1.0 == fixed_metrics.recall_score([0,0,0], [0,0,0]) 177 | 178 | def test_f1_metric(): 179 | fixed_metrics = classification_tests.FixedClassificationMetrics() 180 | assert 1.0 == fixed_metrics.f1_score([0,0,0], [0,0,0]) 181 | 182 | def test_cross_val_per_class_percision_anomaly_detection_binary(): 183 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 184 | test_suite = classification_tests.ClassificationTests(clf, 
185 | df, 186 | target_name, 187 | column_names) 188 | try: 189 | tolerance = 1 190 | test_suite.cross_val_per_class_precision_anomaly_detection(tolerance) 191 | assert True 192 | except: 193 | assert False 194 | 195 | def test_cross_val_per_class_percision_anomaly_detection_multiclass(): 196 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 197 | test_suite = classification_tests.ClassificationTests(clf, 198 | df, 199 | target_name, 200 | column_names) 201 | try: 202 | tolerance = 1 203 | test_suite.cross_val_per_class_precision_anomaly_detection(tolerance, average="micro") 204 | assert True 205 | except: 206 | assert False 207 | 208 | def test_cross_val_per_class_recall_anomaly_detection_binary(): 209 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 210 | test_suite = classification_tests.ClassificationTests(clf, 211 | df, 212 | target_name, 213 | column_names) 214 | try: 215 | tolerance = 1 216 | test_suite.cross_val_per_class_recall_anomaly_detection(tolerance) 217 | assert True 218 | except: 219 | assert False 220 | 221 | def test_cross_val_per_class_recall_anomaly_detection_multiclass(): 222 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 223 | test_suite = classification_tests.ClassificationTests(clf, 224 | df, 225 | target_name, 226 | column_names) 227 | try: 228 | tolerance = 1 229 | test_suite.cross_val_per_class_recall_anomaly_detection(tolerance, average="micro") 230 | assert True 231 | except: 232 | assert False 233 | 234 | def test_cross_val_per_class_f1_anomaly_detection_binary(): 235 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 236 | test_suite = classification_tests.ClassificationTests(clf, 237 | df, 238 | target_name, 239 | column_names) 240 | try: 241 | tolerance = 1 242 | test_suite.cross_val_per_class_f1_anomaly_detection(tolerance) 243 | assert True 244 | except: 245 | assert False 246 | 247 | def test_cross_val_per_class_f1_anomaly_detection_multiclass(): 248 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 249 | test_suite = classification_tests.ClassificationTests(clf, 250 | df, 251 | target_name, 252 | column_names) 253 | try: 254 | tolerance = 1 255 | test_suite.cross_val_per_class_f1_anomaly_detection(tolerance, average="micro") 256 | assert True 257 | except: 258 | assert False 259 | 260 | def test_cross_val_per_class_roc_auc_anomaly_detection_binary(): 261 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 262 | test_suite = classification_tests.ClassificationTests(clf, 263 | df, 264 | target_name, 265 | column_names) 266 | try: 267 | tolerance = 1 268 | print(test_suite.is_binary()) 269 | test_suite.cross_val_per_class_roc_auc_anomaly_detection(tolerance) 270 | assert True 271 | except: 272 | assert False 273 | 274 | def test_cross_val_precision_anomaly_detection_binary(): 275 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 276 | test_suite = classification_tests.ClassificationTests(clf, 277 | df, 278 | target_name, 279 | column_names) 280 | try: 281 | tolerance = 1 282 | test_suite.cross_val_precision_anomaly_detection(tolerance) 283 | assert True 284 | except: 285 | assert False 286 | 287 | def test_cross_val_precision_anomaly_detection_multiclass(): 288 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 289 | 
test_suite = classification_tests.ClassificationTests(clf, 290 | df, 291 | target_name, 292 | column_names) 293 | try: 294 | tolerance = 1 295 | test_suite.cross_val_precision_anomaly_detection(tolerance, average="micro") 296 | assert True 297 | except: 298 | assert False 299 | 300 | def test_cross_val_recall_anomaly_detection_binary(): 301 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 302 | test_suite = classification_tests.ClassificationTests(clf, 303 | df, 304 | target_name, 305 | column_names) 306 | try: 307 | tolerance = 1 308 | test_suite.cross_val_recall_anomaly_detection(tolerance) 309 | assert True 310 | except: 311 | assert False 312 | 313 | def test_cross_val_recall_anomaly_detection_multiclass(): 314 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 315 | test_suite = classification_tests.ClassificationTests(clf, 316 | df, 317 | target_name, 318 | column_names) 319 | try: 320 | tolerance = 1 321 | test_suite.cross_val_recall_anomaly_detection(tolerance, average="micro") 322 | assert True 323 | except: 324 | assert False 325 | 326 | def test_cross_val_f1_anomaly_detection_binary(): 327 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 328 | test_suite = classification_tests.ClassificationTests(clf, 329 | df, 330 | target_name, 331 | column_names) 332 | try: 333 | tolerance = 1 334 | test_suite.cross_val_f1_anomaly_detection(tolerance) 335 | assert True 336 | except: 337 | assert False 338 | 339 | def test_cross_val_f1_anomaly_detection_mutliclass(): 340 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 341 | test_suite = classification_tests.ClassificationTests(clf, 342 | df, 343 | target_name, 344 | column_names) 345 | try: 346 | tolerance = 1 347 | test_suite.cross_val_f1_anomaly_detection(tolerance, average="micro") 348 | assert True 349 | except: 350 | assert False 351 | 352 | def test_cross_val_roc_auc_anomaly_detection_binary(): 353 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 354 | test_suite = classification_tests.ClassificationTests(clf, 355 | df, 356 | target_name, 357 | column_names) 358 | try: 359 | tolerance = 1 360 | test_suite.cross_val_roc_auc_anomaly_detection(tolerance) 361 | assert True 362 | except: 363 | assert False 364 | 365 | def test_cross_val_precision_avg_binary(): 366 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 367 | test_suite = classification_tests.ClassificationTests(clf, 368 | df, 369 | target_name, 370 | column_names) 371 | try: 372 | avg = 0.1 373 | test_suite.cross_val_precision_avg(avg) 374 | assert True 375 | except: 376 | assert False 377 | 378 | def test_cross_val_precision_avg_mutliclass(): 379 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 380 | test_suite = classification_tests.ClassificationTests(clf, 381 | df, 382 | target_name, 383 | column_names) 384 | try: 385 | avg = 0.1 386 | test_suite.cross_val_precision_avg(avg, average="micro") 387 | assert True 388 | except: 389 | assert False 390 | 391 | def test_cross_val_recall_avg_binary(): 392 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 393 | test_suite = classification_tests.ClassificationTests(clf, 394 | df, 395 | target_name, 396 | column_names) 397 | try: 398 | avg = 0.1 399 | test_suite.cross_val_recall_avg(avg) 400 | assert 
True 401 | except: 402 | assert False 403 | 404 | def test_cross_val_recall_avg_multiclass(): 405 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 406 | test_suite = classification_tests.ClassificationTests(clf, 407 | df, 408 | target_name, 409 | column_names) 410 | try: 411 | avg = 0.1 412 | test_suite.cross_val_recall_avg(avg, average="micro") 413 | assert True 414 | except: 415 | assert False 416 | 417 | def test_cross_val_f1_avg_binary(): 418 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 419 | test_suite = classification_tests.ClassificationTests(clf, 420 | df, 421 | target_name, 422 | column_names) 423 | try: 424 | avg = 0.1 425 | test_suite.cross_val_f1_avg(avg) 426 | assert True 427 | except: 428 | assert False 429 | 430 | def test_cross_val_f1_avg_multiclass(): 431 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 432 | test_suite = classification_tests.ClassificationTests(clf, 433 | df, 434 | target_name, 435 | column_names) 436 | try: 437 | avg = 0.1 438 | test_suite.cross_val_f1_avg(avg, average="micro") 439 | assert True 440 | except: 441 | assert False 442 | 443 | def test_cross_val_roc_auc_avg_binary(): 444 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 445 | test_suite = classification_tests.ClassificationTests(clf, 446 | df, 447 | target_name, 448 | column_names) 449 | try: 450 | avg = 0.1 451 | test_suite.cross_val_roc_auc_avg(avg) 452 | assert True 453 | except: 454 | assert False 455 | 456 | def test_spread_cross_val_precision_anomaly_detection_binary(): 457 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 458 | test_suite = classification_tests.ClassificationTests(clf, 459 | df, 460 | target_name, 461 | column_names) 462 | try: 463 | tolerance = 1 464 | test_suite.spread_cross_val_precision_anomaly_detection(tolerance) 465 | test_suite.spread_cross_val_precision_anomaly_detection(tolerance, method="median") 466 | test_suite.spread_cross_val_precision_anomaly_detection(tolerance, method="trimean") 467 | assert True 468 | except: 469 | assert False 470 | 471 | def test_spread_cross_val_precision_anomaly_detection_multiclass(): 472 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 473 | test_suite = classification_tests.ClassificationTests(clf, 474 | df, 475 | target_name, 476 | column_names) 477 | try: 478 | tolerance = 1 479 | average = "micro" 480 | test_suite.spread_cross_val_precision_anomaly_detection(tolerance, 481 | average=average) 482 | test_suite.spread_cross_val_precision_anomaly_detection(tolerance, 483 | method="median", 484 | average=average) 485 | test_suite.spread_cross_val_precision_anomaly_detection(tolerance, 486 | method="trimean", 487 | average=average) 488 | assert True 489 | except: 490 | assert False 491 | 492 | def test_spread_cross_val_recall_anomaly_detection_binary(): 493 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 494 | test_suite = classification_tests.ClassificationTests(clf, 495 | df, 496 | target_name, 497 | column_names) 498 | try: 499 | tolerance = 1 500 | test_suite.spread_cross_val_recall_anomaly_detection(tolerance) 501 | test_suite.spread_cross_val_recall_anomaly_detection(tolerance, method="median") 502 | test_suite.spread_cross_val_recall_anomaly_detection(tolerance, method="trimean") 503 | assert True 504 | except: 505 | 
assert False 506 | 507 | def test_spread_cross_val_recall_anomaly_detection_multiclass(): 508 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 509 | test_suite = classification_tests.ClassificationTests(clf, 510 | df, 511 | target_name, 512 | column_names) 513 | try: 514 | tolerance = 1 515 | average = "micro" 516 | test_suite.spread_cross_val_recall_anomaly_detection(tolerance, 517 | average=average) 518 | test_suite.spread_cross_val_recall_anomaly_detection(tolerance, 519 | method="median", 520 | average=average) 521 | test_suite.spread_cross_val_recall_anomaly_detection(tolerance, 522 | method="trimean", 523 | average=average) 524 | assert True 525 | except: 526 | assert False 527 | 528 | def test_spread_cross_val_f1_anomaly_detection_binary(): 529 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 530 | test_suite = classification_tests.ClassificationTests(clf, 531 | df, 532 | target_name, 533 | column_names) 534 | try: 535 | tolerance = 1 536 | test_suite.spread_cross_val_f1_anomaly_detection(tolerance) 537 | test_suite.spread_cross_val_f1_anomaly_detection(tolerance, method="median") 538 | test_suite.spread_cross_val_f1_anomaly_detection(tolerance, method="trimean") 539 | assert True 540 | except: 541 | assert False 542 | 543 | def test_spread_cross_val_f1_anomaly_detection_multiclass(): 544 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 545 | test_suite = classification_tests.ClassificationTests(clf, 546 | df, 547 | target_name, 548 | column_names) 549 | try: 550 | tolerance = 1 551 | average = "micro" 552 | test_suite.spread_cross_val_f1_anomaly_detection(tolerance, 553 | average=average) 554 | test_suite.spread_cross_val_f1_anomaly_detection(tolerance, 555 | method="median", 556 | average=average) 557 | test_suite.spread_cross_val_f1_anomaly_detection(tolerance, 558 | method="trimean", 559 | average=average) 560 | assert True 561 | except: 562 | assert False 563 | 564 | def test_spread_cross_val_roc_auc_anomaly_detection_binary(): 565 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 566 | test_suite = classification_tests.ClassificationTests(clf, 567 | df, 568 | target_name, 569 | column_names) 570 | try: 571 | tolerance = 1 572 | test_suite.spread_cross_val_roc_auc_anomaly_detection(tolerance) 573 | test_suite.spread_cross_val_roc_auc_anomaly_detection(tolerance, method="median") 574 | test_suite.spread_cross_val_roc_auc_anomaly_detection(tolerance, method="trimean") 575 | assert True 576 | except: 577 | assert False 578 | 579 | def test_run_time_stress_test(): 580 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 581 | test_suite = classification_tests.ClassificationTests(clf, 582 | df, 583 | target_name, 584 | column_names) 585 | sample_sizes = [i for i in range(100, 1000, 100)] 586 | max_run_times = [100 for _ in range(len(sample_sizes))] 587 | try: 588 | test_suite.run_time_stress_test(sample_sizes, max_run_times) 589 | assert True 590 | except: 591 | assert False 592 | 593 | def test_two_model_prediction_run_time_stress_test(): 594 | df, column_names, target_name, clf1, clf2 = generate_binary_classification_data_and_models() 595 | test_suite = classification_tests.ClassifierComparison(clf1, 596 | clf2, 597 | df, 598 | target_name, 599 | column_names) 600 | 601 | sample_sizes = [i for i in range(100, 1000, 100)] 602 | try: 603 | 
test_suite.two_model_prediction_run_time_stress_test(sample_sizes) 604 | assert True 605 | except: 606 | assert False 607 | 608 | def test_two_model_classifier_testing_binary(): 609 | df, column_names, target_name, clf1, clf2 = generate_binary_classification_data_and_models() 610 | test_suite = classification_tests.ClassifierComparison(clf1, 611 | clf2, 612 | df, 613 | target_name, 614 | column_names) 615 | try: 616 | test_suite.two_model_classifier_testing() 617 | assert True 618 | except: 619 | assert False 620 | 621 | def test_two_model_classifier_testing_multiclass(): 622 | df, column_names, target_name, clf1, clf2 = generate_multiclass_classification_data_and_models() 623 | test_suite = classification_tests.ClassifierComparison(clf1, 624 | clf2, 625 | df, 626 | target_name, 627 | column_names) 628 | try: 629 | test_suite.two_model_classifier_testing(average="micro") 630 | assert True 631 | except: 632 | assert False 633 | 634 | def test_cross_val_two_model_classifier_testing_binary(): 635 | df, column_names, target_name, clf1, clf2 = generate_binary_classification_data_and_models() 636 | test_suite = classification_tests.ClassifierComparison(clf1, 637 | clf2, 638 | df, 639 | target_name, 640 | column_names) 641 | try: 642 | test_suite.cross_val_two_model_classifier_testing() 643 | assert True 644 | except: 645 | assert False 646 | 647 | def test_cross_val_two_model_classifier_testing_multiclass(): 648 | df, column_names, target_name, clf1, clf2 = generate_multiclass_classification_data_and_models() 649 | test_suite = classification_tests.ClassifierComparison(clf1, 650 | clf2, 651 | df, 652 | target_name, 653 | column_names) 654 | try: 655 | test_suite.cross_val_two_model_classifier_testing(average="micro") 656 | assert True 657 | except: 658 | assert False 659 | 660 | def test_cross_val_two_model_classifier_testing_binary(): 661 | df, column_names, target_name, clf1, clf2 = generate_binary_classification_data_and_models() 662 | test_suite = classification_tests.ClassifierComparison(clf1, 663 | clf2, 664 | df, 665 | target_name, 666 | column_names) 667 | try: 668 | test_suite.cross_val_per_class_two_model_classifier_testing() 669 | assert True 670 | except: 671 | assert False 672 | 673 | def test_cross_val_two_model_classifier_testing_multiclass(): 674 | df, column_names, target_name, clf1, clf2 = generate_multiclass_classification_data_and_models() 675 | test_suite = classification_tests.ClassifierComparison(clf1, 676 | clf2, 677 | df, 678 | target_name, 679 | column_names) 680 | try: 681 | test_suite.cross_val_per_class_two_model_classifier_testing(average="micro") 682 | assert True 683 | except: 684 | assert False 685 | -------------------------------------------------------------------------------- /tests/test_columnar_tests.py: -------------------------------------------------------------------------------- 1 | from drifter_ml import columnar_tests 2 | import numpy as np 3 | import pandas as pd 4 | 5 | def generate_data(): 6 | new_data = pd.DataFrame() 7 | historical_data = pd.DataFrame() 8 | new_data["similar_normal"] = np.random.normal(0, 10, size=1000) 9 | historical_data["similar_normal"] = np.random.normal(0, 10, size=1000) 10 | new_data["different_normal"] = np.random.normal(1000, 250, size=1000) 11 | historical_data["different_normal"] = np.random.normal(5, 17, size=1000) 12 | new_data["random"] = np.random.random(size=1000) 13 | historical_data["random"] = np.random.random(size=1000) 14 | new_data["similar_gamma"] = np.random.gamma(1, 2, size=1000) 15 | 
historical_data["similar_gamma"] = np.random.gamma(1, 2, size=1000) 16 | new_data["different_gamma"] = np.random.gamma(7.5, 0, size=1000) 17 | historical_data["different_gamma"] = np.random.gamma(2, 4, size=1000) 18 | return new_data, historical_data 19 | 20 | def test_mean_similarity(): 21 | new_data, historical_data = generate_data() 22 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 23 | try: 24 | test_suite.mean_similarity("similar_normal") 25 | assert True 26 | except: 27 | assert False 28 | 29 | def test_median_similarity(): 30 | new_data, historical_data = generate_data() 31 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 32 | try: 33 | test_suite.median_similarity("similar_normal") 34 | assert True 35 | except: 36 | assert False 37 | 38 | 39 | def test_trimean_similarity(): 40 | new_data, historical_data = generate_data() 41 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 42 | try: 43 | test_suite.trimean_similarity("similar_normal") 44 | assert True 45 | except: 46 | assert False 47 | 48 | def test_is_normal(): 49 | new_data, historical_data = generate_data() 50 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 51 | try: 52 | test_suite.is_normal("similar_normal") 53 | assert True 54 | except: 55 | assert False 56 | 57 | def test_pearson_similar_correlation(): 58 | new_data, historical_data = generate_data() 59 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 60 | correlation_lower_bound = 0.3 61 | try: 62 | test_suite.pearson_similar_correlation("similar_normal", correlation_lower_bound) 63 | assert True 64 | except: 65 | assert False 66 | 67 | def test_spearman_similar_correlation(): 68 | new_data, historical_data = generate_data() 69 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 70 | correlation_lower_bound = 0.3 71 | try: 72 | test_suite.spearman_similar_correlation("similar_normal", correlation_lower_bound) 73 | assert True 74 | except: 75 | assert False 76 | 77 | def test_wilcoxon_similar_distribution(): 78 | new_data, historical_data = generate_data() 79 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 80 | try: 81 | test_suite.wilcoxon_similar_distribution("similar_normal") 82 | assert True 83 | except: 84 | assert False 85 | 86 | def test_ks_2samp_similar_distribution(): 87 | new_data, historical_data = generate_data() 88 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 89 | try: 90 | test_suite.ks_2samp_similar_distribution("similar_normal") 91 | assert True 92 | except: 93 | assert False 94 | 95 | def test_kruskal_similar_distribution(): 96 | new_data, historical_data = generate_data() 97 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 98 | try: 99 | test_suite.kruskal_similar_distribution("similar_normal") 100 | assert True 101 | except: 102 | assert False 103 | 104 | def test_mann_whitney_u_similar_distribution(): 105 | new_data, historical_data = generate_data() 106 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 107 | try: 108 | test_suite.mann_whitney_u_similar_distribution("similar_normal") 109 | assert True 110 | except: 111 | assert False 112 | -------------------------------------------------------------------------------- /tests/test_regression_tests.py: -------------------------------------------------------------------------------- 1 | from drifter_ml import regression_tests 2 | from sklearn import tree 3 | from sklearn import ensemble 4 | from 
sklearn import model_selection 5 | import numpy as np 6 | import pandas as pd 7 | 8 | def generate_regression_data_and_models(): 9 | df = pd.DataFrame() 10 | for _ in range(1000): 11 | a = np.random.normal(0, 1) 12 | b = np.random.normal(0, 3) 13 | c = np.random.normal(12, 4) 14 | target = a + b + c 15 | df = df.append({ 16 | "A": a, 17 | "B": b, 18 | "C": c, 19 | "target": target 20 | }, ignore_index=True) 21 | 22 | reg1 = tree.DecisionTreeRegressor() 23 | reg2 = ensemble.RandomForestRegressor() 24 | column_names = ["A", "B", "C"] 25 | target_name = "target" 26 | X = df[column_names] 27 | reg1.fit(X, df[target_name]) 28 | reg2.fit(X, df[target_name]) 29 | return df, column_names, target_name, reg1, reg2 30 | 31 | def test_regression_basic(): 32 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 33 | test_suite = regression_tests.RegressionTests(reg, 34 | df, 35 | target_name, 36 | column_names) 37 | try: 38 | mse_upper_boundary = 10000 39 | mae_upper_boundary = 10000 40 | tse_upper_boundary = 10000 41 | tae_upper_boundary = 10000 42 | test_suite.upper_bound_regression_testing( 43 | mse_upper_boundary, 44 | mae_upper_boundary, 45 | tse_upper_boundary, 46 | tae_upper_boundary 47 | ) 48 | assert True 49 | except: 50 | assert False 51 | 52 | def test_cross_val_mse_anomaly_detection(): 53 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 54 | test_suite = regression_tests.RegressionTests(reg, 55 | df, 56 | target_name, 57 | column_names) 58 | try: 59 | mse_tolerance = 10000 60 | test_suite.cross_val_mse_anomaly_detection( 61 | mse_tolerance 62 | ) 63 | assert True 64 | except: 65 | assert False 66 | 67 | def test_cross_val_tse_anomaly_detection(): 68 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 69 | test_suite = regression_tests.RegressionTests(reg, 70 | df, 71 | target_name, 72 | column_names) 73 | try: 74 | tse_tolerance = 10000 75 | test_suite.cross_val_tse_anomaly_detection( 76 | tse_tolerance 77 | ) 78 | assert True 79 | except: 80 | assert False 81 | 82 | def test_cross_val_mae_anomaly_detection(): 83 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 84 | test_suite = regression_tests.RegressionTests(reg, 85 | df, 86 | target_name, 87 | column_names) 88 | try: 89 | 90 | mae_tolerance = 10000 91 | test_suite.cross_val_mae_anomaly_detection( 92 | mae_tolerance 93 | ) 94 | assert True 95 | except: 96 | assert False 97 | 98 | def test_cross_val_tae_anomaly_detection(): 99 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 100 | test_suite = regression_tests.RegressionTests(reg, 101 | df, 102 | target_name, 103 | column_names) 104 | try: 105 | 106 | tae_tolerance = 10000 107 | test_suite.cross_val_tae_anomaly_detection( 108 | tae_tolerance 109 | ) 110 | assert True 111 | except: 112 | assert False 113 | 114 | def test_cross_val_mse_avg(): 115 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 116 | test_suite = regression_tests.RegressionTests(reg, 117 | df, 118 | target_name, 119 | column_names) 120 | try: 121 | mse_avg = 100 122 | test_suite.cross_val_mse_avg( 123 | mse_avg 124 | ) 125 | assert True 126 | except: 127 | assert False 128 | 129 | def test_cross_val_tse_avg(): 130 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 131 | test_suite = regression_tests.RegressionTests(reg, 132 | df, 133 | target_name, 134 | column_names) 135 | try: 136 | tse_avg = 100 137 | 
test_suite.cross_val_tse_avg( 138 | tse_avg 139 | ) 140 | assert True 141 | except: 142 | assert False 143 | 144 | def test_cross_val_mae_avg(): 145 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 146 | test_suite = regression_tests.RegressionTests(reg, 147 | df, 148 | target_name, 149 | column_names) 150 | try: 151 | mae_avg = 100 152 | test_suite.cross_val_mae_avg( 153 | mae_avg 154 | ) 155 | assert True 156 | except: 157 | assert False 158 | 159 | def test_cross_val_tae_avg(): 160 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 161 | test_suite = regression_tests.RegressionTests(reg, 162 | df, 163 | target_name, 164 | column_names) 165 | try: 166 | tae_avg = 100 167 | test_suite.cross_val_tae_avg( 168 | tae_avg 169 | ) 170 | assert True 171 | except: 172 | assert False 173 | 174 | def test_run_time_stress_test(): 175 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 176 | test_suite = regression_tests.RegressionTests(reg, 177 | df, 178 | target_name, 179 | column_names) 180 | 181 | sample_sizes = [i for i in range(100, 1000, 100)] 182 | max_run_times = [100 for _ in range(len(sample_sizes))] 183 | try: 184 | test_suite.run_time_stress_test( 185 | sample_sizes, max_run_times 186 | ) 187 | assert True 188 | except: 189 | assert False 190 | 191 | def test_two_model_prediction_run_time_stress_test(): 192 | df, column_names, target_name, reg1, reg2 = generate_regression_data_and_models() 193 | test_suite = regression_tests.RegressionComparison(reg1, 194 | reg2, 195 | df, 196 | target_name, 197 | column_names) 198 | sample_sizes = [i for i in range(100, 1000, 100)] 199 | try: 200 | test_suite.two_model_prediction_run_time_stress_test( 201 | sample_sizes 202 | ) 203 | assert True 204 | except: 205 | assert False 206 | 207 | def test_cv_two_model_regression_testing(): 208 | df, column_names, target_name, reg1, reg2 = generate_regression_data_and_models() 209 | test_suite = regression_tests.RegressionComparison(reg1, 210 | reg2, 211 | df, 212 | target_name, 213 | column_names) 214 | try: 215 | test_suite.cv_two_model_regression_testing() 216 | assert True 217 | except: 218 | assert False 219 | 220 | def test_two_model_regression_testing(): 221 | df, column_names, target_name, reg1, reg2 = generate_regression_data_and_models() 222 | test_suite = regression_tests.RegressionComparison(reg1, 223 | reg2, 224 | df, 225 | target_name, 226 | column_names) 227 | try: 228 | test_suite.two_model_regression_testing() 229 | assert True 230 | except: 231 | assert False 232 | -------------------------------------------------------------------------------- /tests/test_structural_tests.py: -------------------------------------------------------------------------------- 1 | from drifter_ml import structural_tests 2 | import numpy as np 3 | import pandas as pd 4 | 5 | def generate_classification_data_and_models(): 6 | new_data = pd.DataFrame() 7 | for _ in range(1000): 8 | a = np.random.normal(0, 1) 9 | b = np.random.normal(0, 3) 10 | c = np.random.normal(12, 4) 11 | if a + b + c > 11: 12 | target = 1 13 | else: 14 | target = 0 15 | new_data = new_data.append({ 16 | "A": a, 17 | "B": b, 18 | "C": c, 19 | "target": target 20 | }, ignore_index=True) 21 | 22 | historical_data = pd.DataFrame() 23 | for _ in range(1000): 24 | a = np.random.normal(0, 1) 25 | b = np.random.normal(0, 3) 26 | c = np.random.normal(12, 4) 27 | if a + b + c > 11: 28 | target = 1 29 | else: 30 | target = 0 31 | historical_data = 
historical_data.append({ 32 | "A": a, 33 | "B": b, 34 | "C": c, 35 | "target": target 36 | }, ignore_index=True) 37 | 38 | column_names = ["A", "B", "C"] 39 | target_name = "target" 40 | return new_data, historical_data, column_names, target_name 41 | 42 | def generate_regression_data_and_models(): 43 | new_data = pd.DataFrame() 44 | for _ in range(1000): 45 | a = np.random.normal(0, 1) 46 | b = np.random.normal(0, 3) 47 | c = np.random.normal(12, 4) 48 | target = a + b + c 49 | new_data = new_data.append({ 50 | "A": a, 51 | "B": b, 52 | "C": c, 53 | "target": target 54 | }, ignore_index=True) 55 | 56 | historical_data = pd.DataFrame() 57 | for _ in range(1000): 58 | a = np.random.normal(0, 1) 59 | b = np.random.normal(0, 3) 60 | c = np.random.normal(12, 4) 61 | target = a + b + c 62 | historical_data = historical_data.append({ 63 | "A": a, 64 | "B": b, 65 | "C": c, 66 | "target": target 67 | }, ignore_index=True) 68 | 69 | column_names = ["A", "B", "C"] 70 | target_name = "target" 71 | return new_data, historical_data, column_names, target_name 72 | 73 | def generate_unsupervised_data(): 74 | new_data = pd.DataFrame() 75 | historical_data = pd.DataFrame() 76 | new_data["similar_normal"] = np.random.normal(0, 10, size=1000) 77 | historical_data["similar_normal"] = np.random.normal(0, 10, size=1000) 78 | new_data["different_normal"] = np.random.normal(1000, 250, size=1000) 79 | historical_data["different_normal"] = np.random.normal(5, 17, size=1000) 80 | new_data["random"] = np.random.random(size=1000) 81 | historical_data["random"] = np.random.random(size=1000) 82 | new_data["similar_gamma"] = np.random.gamma(1, 2, size=1000) 83 | historical_data["similar_gamma"] = np.random.gamma(1, 2, size=1000) 84 | new_data["different_gamma"] = np.random.gamma(7.5, 0, size=1000) 85 | historical_data["different_gamma"] = np.random.gamma(2, 4, size=1000) 86 | return new_data, historical_data 87 | 88 | def test_mutual_info_kmeans_scorer(): 89 | new_data, historical_data = generate_unsupervised_data() 90 | columns = ["similar_normal", "different_normal", 91 | "similar_gamma", "different_gamma"] 92 | target = '' 93 | test_suite = structural_tests.StructuralData(new_data, 94 | historical_data, 95 | columns, 96 | target) 97 | try: 98 | min_similarity = 0.5 99 | test_suite.mutual_info_kmeans_scorer(min_similarity) 100 | assert True 101 | except: 102 | assert False 103 | 104 | def test_adjusted_rand_kmeans_scorer(): 105 | new_data, historical_data = generate_unsupervised_data() 106 | columns = ["similar_normal", "different_normal", 107 | "similar_gamma", "different_gamma"] 108 | target = '' 109 | test_suite = structural_tests.StructuralData(new_data, 110 | historical_data, 111 | columns, 112 | target) 113 | try: 114 | min_similarity = 0.5 115 | test_suite.adjusted_rand_kmeans_scorer(min_similarity) 116 | assert True 117 | except: 118 | assert False 119 | 120 | def test_completeness_kmeans_scorer(): 121 | new_data, historical_data = generate_unsupervised_data() 122 | columns = ["similar_normal", "different_normal", 123 | "similar_gamma", "different_gamma"] 124 | target = '' 125 | test_suite = structural_tests.StructuralData(new_data, 126 | historical_data, 127 | columns, 128 | target) 129 | try: 130 | min_similarity = 0.5 131 | test_suite.completeness_kmeans_scorer(min_similarity) 132 | assert True 133 | except: 134 | assert False 135 | 136 | def test_fowlkes_mallows_kmeans_scorer(): 137 | new_data, historical_data = generate_unsupervised_data() 138 | columns = ["similar_normal", "different_normal", 139 | 
"similar_gamma", "different_gamma"] 140 | target = '' 141 | test_suite = structural_tests.StructuralData(new_data, 142 | historical_data, 143 | columns, 144 | target) 145 | try: 146 | min_similarity = 0.5 147 | test_suite.fowlkes_mallows_kmeans_scorer(min_similarity) 148 | assert True 149 | except: 150 | assert False 151 | 152 | def test_homogeneity_kmeans_scorer(): 153 | new_data, historical_data = generate_unsupervised_data() 154 | columns = ["similar_normal", "different_normal", 155 | "similar_gamma", "different_gamma"] 156 | target = '' 157 | test_suite = structural_tests.StructuralData(new_data, 158 | historical_data, 159 | columns, 160 | target) 161 | try: 162 | min_similarity = 0.5 163 | test_suite.homogeneity_kmeans_scorer(min_similarity) 164 | assert True 165 | except: 166 | assert False 167 | 168 | def test_v_measure_kmeans_scorer(): 169 | new_data, historical_data = generate_unsupervised_data() 170 | columns = ["similar_normal", "different_normal", 171 | "similar_gamma", "different_gamma"] 172 | target = '' 173 | test_suite = structural_tests.StructuralData(new_data, 174 | historical_data, 175 | columns, 176 | target) 177 | try: 178 | min_similarity = 0.5 179 | test_suite.v_measure_kmeans_scorer(min_similarity) 180 | assert True 181 | except: 182 | assert False 183 | 184 | def test_mutual_info_dbscan_scorer(): 185 | new_data, historical_data = generate_unsupervised_data() 186 | columns = ["similar_normal", "different_normal", 187 | "similar_gamma", "different_gamma"] 188 | target = '' 189 | test_suite = structural_tests.StructuralData(new_data, 190 | historical_data, 191 | columns, 192 | target) 193 | try: 194 | min_similarity = 0.5 195 | test_suite.mutual_info_dbscan_scorer(min_similarity) 196 | assert True 197 | except: 198 | assert False 199 | 200 | def test_adjusted_rand_dbscan_scorer(): 201 | new_data, historical_data = generate_unsupervised_data() 202 | columns = ["similar_normal", "different_normal", 203 | "similar_gamma", "different_gamma"] 204 | target = '' 205 | test_suite = structural_tests.StructuralData(new_data, 206 | historical_data, 207 | columns, 208 | target) 209 | try: 210 | min_similarity = 0.5 211 | test_suite.adjusted_rand_dbscan_scorer(min_similarity) 212 | assert True 213 | except: 214 | assert False 215 | 216 | def test_completeness_dbscan_scorer(): 217 | new_data, historical_data = generate_unsupervised_data() 218 | columns = ["similar_normal", "different_normal", 219 | "similar_gamma", "different_gamma"] 220 | target = '' 221 | test_suite = structural_tests.StructuralData(new_data, 222 | historical_data, 223 | columns, 224 | target) 225 | try: 226 | min_similarity = 0.5 227 | test_suite.completeness_dbscan_scorer(min_similarity) 228 | assert True 229 | except: 230 | assert False 231 | 232 | def test_fowlkes_mallows_dbscan_scorer(): 233 | new_data, historical_data = generate_unsupervised_data() 234 | columns = ["similar_normal", "different_normal", 235 | "similar_gamma", "different_gamma"] 236 | target = '' 237 | test_suite = structural_tests.StructuralData(new_data, 238 | historical_data, 239 | columns, 240 | target) 241 | try: 242 | min_similarity = 0.5 243 | test_suite.fowlkes_mallows_dbscan_scorer(min_similarity) 244 | assert True 245 | except: 246 | assert False 247 | 248 | def test_homogeneity_dbscan_scorer(): 249 | new_data, historical_data = generate_unsupervised_data() 250 | columns = ["similar_normal", "different_normal", 251 | "similar_gamma", "different_gamma"] 252 | target = '' 253 | test_suite = structural_tests.StructuralData(new_data, 
254 | historical_data, 255 | columns, 256 | target) 257 | try: 258 | min_similarity = 0.5 259 | test_suite.homogeneity_dbscan_scorer(min_similarity) 260 | assert True 261 | except: 262 | assert False 263 | 264 | def test_v_measure_dbscan_scorer(): 265 | new_data, historical_data = generate_unsupervised_data() 266 | columns = ["similar_normal", "different_normal", 267 | "similar_gamma", "different_gamma"] 268 | target = '' 269 | test_suite = structural_tests.StructuralData(new_data, 270 | historical_data, 271 | columns, 272 | target) 273 | try: 274 | min_similarity = 0.5 275 | test_suite.v_measure_dbscan_scorer(min_similarity) 276 | assert True 277 | except: 278 | assert False 279 | 280 | def test_reg_supervised_similar_clustering(): 281 | new_data, historical_data, column_names, target_name = generate_regression_data_and_models() 282 | 283 | test_suite = structural_tests.StructuralData(new_data, 284 | historical_data, 285 | column_names, 286 | target_name) 287 | try: 288 | absolute_distance = 2 289 | test_suite.reg_supervised_similar_clustering(absolute_distance) 290 | assert True 291 | except: 292 | assert False 293 | 294 | def test_reg_supervised_similar_clustering(): 295 | new_data, historical_data, column_names, target_name = generate_classification_data_and_models() 296 | test_suite = structural_tests.StructuralData(new_data, 297 | historical_data, 298 | column_names, 299 | target_name) 300 | try: 301 | absolute_distance = 2 302 | test_suite.cls_supervised_similar_clustering(absolute_distance) 303 | assert True 304 | except: 305 | assert False 306 | 307 | --------------------------------------------------------------------------------
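Taken together, the example and test modules above all follow one recipe: fit a model, wrap it and its evaluation DataFrame in a drifter_ml test-suite class, then assert that chosen metrics stay inside explicit boundaries. A condensed, self-contained sketch of that recipe follows; it mirrors the data generator in tests/test_classification_tests.py and the classifier_testing call in example_models/static_examples/example_tests.py, and the 0.9 thresholds are purely illustrative, not recommended defaults.

from drifter_ml import classification_tests
from sklearn import tree
import numpy as np
import pandas as pd

# synthetic data in the same shape the repo's generators produce: three
# numeric features and a binary target derived from their sum
df = pd.DataFrame({
    "A": np.random.normal(0, 1, 1000),
    "B": np.random.normal(0, 3, 1000),
    "C": np.random.normal(12, 4, 1000),
})
df["target"] = (df[["A", "B", "C"]].sum(axis=1) > 11).astype(int)

column_names = ["A", "B", "C"]
target_name = "target"
clf = tree.DecisionTreeClassifier()
clf.fit(df[column_names], df[target_name])

# wrap the fitted model plus its evaluation data in a test-suite object
test_suite = classification_tests.ClassificationTests(clf,
                                                      df,
                                                      target_name,
                                                      column_names)

def test_classifier_meets_lower_bounds():
    # per-class lower bounds on precision, recall and f1, as in example_tests.py
    classes = list(df[target_name].unique())
    assert test_suite.classifier_testing(
        {klass: 0.9 for klass in classes},
        {klass: 0.9 for klass in classes},
        {klass: 0.9 for klass in classes}
    )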