├── .circleci └── config.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── azure-pipelines.yml ├── docs ├── .conf.py.swp ├── Makefile ├── api_docs │ ├── drifter_ml.classification_tests.rst │ ├── drifter_ml.columnar_tests.rst │ ├── drifter_ml.regression_tests.rst │ ├── drifter_ml.rst │ ├── drifter_ml.structural_tests.rst │ ├── drifter_ml.timeseries_tests.rst │ └── modules.rst ├── classification-tests.rst ├── conf.py ├── designing-your-own-tests.rst ├── index.rst ├── introduction.rst ├── make.bat ├── project-setup.rst └── regression-tests.rst ├── drifter_ml ├── __init__.py ├── classification_tests │ ├── __init__.py │ └── classification_tests.py ├── columnar_tests │ ├── __init__.py │ └── columnar_tests.py ├── regression_tests │ ├── __init__.py │ └── regression_tests.py ├── structural_tests │ ├── __init__.py │ └── structural_tests.py └── timeseries_tests │ ├── __init__.py │ └── timeseries_tests.py ├── example_models └── static_examples │ ├── data.csv │ ├── example_model.py │ ├── example_tests.py │ ├── keras_example.py │ ├── model.joblib │ ├── model1.joblib │ ├── model_metadata.json │ ├── prototype_test_framework.py │ └── random_file.py ├── experiments └── simple_example.ipynb ├── requirements.txt ├── setup.py └── tests ├── test_classification_tests.py ├── test_columnar_tests.py ├── test_regression_tests.py └── test_structural_tests.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # 5 | version: 2 6 | jobs: 7 | build: 8 | docker: 9 | # specify the version you desire here 10 | # use `-browsers` prefix for selenium tests, e.g. `3.6.1-browsers` 11 | - image: circleci/python:3.6.1 12 | 13 | # Specify service dependencies here if necessary 14 | # CircleCI maintains a library of pre-built images 15 | # documented at https://circleci.com/docs/2.0/circleci-images/ 16 | # - image: circleci/postgres:9.4 17 | 18 | working_directory: ~/repo 19 | 20 | steps: 21 | - checkout 22 | 23 | # Download and cache dependencies 24 | - restore_cache: 25 | keys: 26 | - v1-dependencies-{{ checksum "requirements.txt" }} 27 | # fallback to using the latest cache if no exact match is found 28 | - v1-dependencies- 29 | 30 | - run: 31 | name: install dependencies 32 | command: | 33 | python3 -m venv venv 34 | . venv/bin/activate 35 | pip install -r requirements.txt 36 | 37 | - save_cache: 38 | paths: 39 | - ./venv 40 | key: v1-dependencies-{{ checksum "requirements.txt" }} 41 | 42 | # run tests! 43 | # this example uses Django's built-in test-runner 44 | # other common Python testing frameworks include pytest and nose 45 | # https://pytest.org 46 | # https://nose.readthedocs.io 47 | - run: 48 | name: Install python 49 | command: | 50 | python3 -m venv venv 51 | . venv/bin/activate 52 | python -m pip install pytest --user 53 | - run: 54 | name: Install library 55 | command: | 56 | python3 -m venv venv 57 | . venv/bin/activate 58 | python setup.py install 59 | - run: 60 | name: run tests 61 | command: | 62 | . venv/bin/activate 63 | python -m pytest tests 64 | - run: 65 | name: Install codecov 66 | command: | 67 | . 
venv/bin/activate 68 | python -m pip install codecov pytest-cov 69 | cd tests && pytest --cov-report xml --cov=drifter_ml ./* 70 | - run: 71 | name: run codecov 72 | command: | 73 | curl -s https://codecov.io/bash | bash -s - -t 6875d787-f809-4e64-909b-c672e8845796 74 | - store_artifacts: 75 | path: ./tests/htmlcov 76 | destination: ./tests/htmlcov 77 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | _static 3 | _templates 4 | *pyc 5 | *py~ 6 | __pycache__ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Eric Schles 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include *.py 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ML Testing 2 | 3 | The goal of this module is to create a flexible and easy to use module for testing machine learning models, specifically those in scikit-learn. 
4 | 5 | The tests will be readable enough that anyone can extend them to other frameworks and APIs with the major notions kept the same, but more or less the ideas will be extended, no work will be taken in this library to extend passed the scikit-learn API. 6 | 7 | You can [read the docs](https://drifter-ml.readthedocs.io/en/latest/) for a more detailed explaination. 8 | 9 | [A video explaining the idea behind the framework](https://www.youtube.com/watch?v=bZtdnFVAfbs&t=3s) 10 | 11 | [![Documentation Status](https://readthedocs.org/projects/drifter-ml/badge/?version=latest)](https://drifter-ml.readthedocs.io/en/latest/?badge=latest) 12 | [![CircleCI](https://circleci.com/gh/EricSchles/drifter_ml.svg?style=svg)](https://circleci.com/gh/EricSchles/drifter_ml) 13 | [![Version Number](https://img.shields.io/pypi/v/drifter-ml.svg)](https://pypi.org/project/drifter-ml/) 14 | [![Downloads Per Month](https://img.shields.io/pypi/dm/drifter-ml.svg)](https://pypi.org/project/drifter-ml/) 15 | [![codecov](https://codecov.io/gh/EricSchles/drifter_ml/branch/master/graph/badge.svg)](https://codecov.io/gh/EricSchles/drifter_ml) 16 | 17 | ## Tests Covered 18 | 19 | * Testing Against Metrics 20 | * Classification Tests 21 | * Rule Based Testing: 22 | * precision lower boundary 23 | * recall lower boundary 24 | * f1 score lower boundary 25 | * AUC lower boundary 26 | * precision lower boundary per class 27 | * recall lower boundary per class 28 | * f1 score lower boundary per class 29 | * AUC lower boundary per class 30 | * Decision Based Testing: 31 | * precision fold below average 32 | * recall fold below average 33 | * f1 fold below average 34 | * AUC fold below average 35 | * precision fold below average per class 36 | * recall fold below average per class 37 | * f1 fold below average per class 38 | * AUC fold below average per class 39 | * Against New Predictions 40 | * proportion of predictions per class 41 | * class imbalance tests 42 | * probability distribution similarity tests 43 | * calibration tests 44 | * environmental impact tests 45 | * [energyusage](https://pypi.org/project/energyusage/) upper bound test 46 | * Regression Tests 47 | * Rule Based Testing: 48 | * Mean Squared Error upper boundary 49 | * Median Absolute Error upper boundary 50 | * Decision Based Testing: 51 | * Mean Squared Error fold above average 52 | * Median Absolute Error fold above average 53 | * Testing Against Run Time Performance 54 | * prediction run time for simulated samples of size X 55 | * Testing Against Input Data 56 | * percentage of correct imputes for any columns requiring imputation 57 | * dataset testing - http://www.vldb.org/pvldb/vol11/p1781-schelter.pdf 58 | * Memoryful Tests 59 | * cluster testing - this is about the overall structure of the data 60 | If the number of clusters increases or decreases substantially that 61 | should be an indicator that the data has changed enough that things 62 | should possibly be rerun 63 | * correlation testing - this is about ensuring that the correlation for a given column with previous data collected in the past does not change very much. If the data does change then the model should possibly be rerun. 64 | * shape testing - this is about ensuring the general shape of for the given column does not change much over time. The idea here is the same as the correlation tests. 65 | 66 | ## Possible Issues 67 | 68 | Some known issues with this, any machine learning tests are going to require human interaction because of type 1 and type 2 error for statistical tests. 
Additionally, one simply needs to interrogate models from a lot of angles. It can't be from just one angle. So please use with care! 69 | 70 | ## Future Features 71 | 72 | * cross validation score testing 73 | * add custom loss function 74 | * add custom accuracy function 75 | * add these tests: https://www.datasciencecentral.com/profiles/blogs/a-plethora-of-original-underused-statistical-tests 76 | * clustering for classification 77 | * Unsupervised and semi supervised tests 78 | * verify similarity in clusters to similarity in labels 79 | * generate a small representative set of labels and then propagate other labels 80 | 81 | 82 | ## References 83 | 84 | * https://dzone.com/articles/quality-assurancetesting-the-machine-learning-mode 85 | * https://medium.com/datadriveninvestor/how-to-perform-quality-assurance-for-ml-models-cef77bbbcfb 86 | * Explaination of UAT: https://www.techopedia.com/definition/3887/user-acceptance-testing-uat 87 | * https://mice.cs.columbia.edu/getTechreport.php?techreportID=419&format=pdf 88 | * https://www.xenonstack.com/blog/unit-testing-tdd-bdd-deep-machine-learning/ -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | # Python package 2 | # Create and test a Python package on multiple Python versions. 3 | # Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: 4 | # https://docs.microsoft.com/azure/devops/pipelines/languages/python 5 | 6 | trigger: 7 | - master 8 | 9 | jobs: 10 | 11 | - job: 'Test' 12 | pool: 13 | vmImage: 'Ubuntu-16.04' 14 | strategy: 15 | matrix: 16 | Python36: 17 | python.version: '3.6' 18 | Python37: 19 | python.version: '3.7' 20 | maxParallel: 4 21 | 22 | steps: 23 | - task: UsePythonVersion@0 24 | inputs: 25 | versionSpec: '$(python.version)' 26 | architecture: 'x64' 27 | 28 | - script: python -m pip install --upgrade pip && pip install -r requirements.txt 29 | displayName: 'Install dependencies' 30 | 31 | - script: python setup.py install 32 | displayName: 'Install library' 33 | 34 | - script: | 35 | pip install pytest 36 | pytest tests --doctest-modules --junitxml=junit/test-results.xml 37 | displayName: 'pytest' 38 | 39 | - task: PublishTestResults@2 40 | inputs: 41 | testResultsFiles: '**/test-results.xml' 42 | testRunTitle: 'Python $(python.version)' 43 | condition: succeededOrFailed() 44 | 45 | - job: 'Publish' 46 | dependsOn: 'Test' 47 | pool: 48 | vmImage: 'Ubuntu-16.04' 49 | 50 | steps: 51 | - task: UsePythonVersion@0 52 | inputs: 53 | versionSpec: '3.x' 54 | architecture: 'x64' 55 | 56 | - script: python setup.py sdist 57 | displayName: 'Build sdist' 58 | -------------------------------------------------------------------------------- /docs/.conf.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EricSchles/drifter_ml/198a2e4a0b6310765e551f3122ff0ea8b04ed900/docs/.conf.py.swp -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 
11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/api_docs/drifter_ml.classification_tests.rst: -------------------------------------------------------------------------------- 1 | drifter\_ml.classification\_tests package 2 | ========================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | drifter\_ml.classification\_tests.classification\_tests module 8 | -------------------------------------------------------------- 9 | 10 | .. automodule:: drifter_ml.classification_tests.classification_tests 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: drifter_ml.classification_tests 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/api_docs/drifter_ml.columnar_tests.rst: -------------------------------------------------------------------------------- 1 | drifter\_ml.columnar\_tests package 2 | =================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | drifter\_ml.columnar\_tests.columnar\_tests module 8 | -------------------------------------------------- 9 | 10 | .. automodule:: drifter_ml.columnar_tests.columnar_tests 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: drifter_ml.columnar_tests 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/api_docs/drifter_ml.regression_tests.rst: -------------------------------------------------------------------------------- 1 | drifter\_ml.regression\_tests package 2 | ===================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | drifter\_ml.regression\_tests.regression\_tests module 8 | ------------------------------------------------------ 9 | 10 | .. automodule:: drifter_ml.regression_tests.regression_tests 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: drifter_ml.regression_tests 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/api_docs/drifter_ml.rst: -------------------------------------------------------------------------------- 1 | drifter\_ml package 2 | =================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | drifter_ml.classification_tests 10 | drifter_ml.columnar_tests 11 | drifter_ml.regression_tests 12 | drifter_ml.structural_tests 13 | drifter_ml.timeseries_tests 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. 
automodule:: drifter_ml 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/api_docs/drifter_ml.structural_tests.rst: -------------------------------------------------------------------------------- 1 | drifter\_ml.structural\_tests package 2 | ===================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | drifter\_ml.structural\_tests.structural\_tests module 8 | ------------------------------------------------------ 9 | 10 | .. automodule:: drifter_ml.structural_tests.structural_tests 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: drifter_ml.structural_tests 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/api_docs/drifter_ml.timeseries_tests.rst: -------------------------------------------------------------------------------- 1 | drifter\_ml.timeseries\_tests package 2 | ===================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | drifter\_ml.timeseries\_tests.timeseries\_tests module 8 | ------------------------------------------------------ 9 | 10 | .. automodule:: drifter_ml.timeseries_tests.timeseries_tests 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: drifter_ml.timeseries_tests 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/api_docs/modules.rst: -------------------------------------------------------------------------------- 1 | drifter_ml 2 | ========== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | drifter_ml 8 | -------------------------------------------------------------------------------- /docs/classification-tests.rst: -------------------------------------------------------------------------------- 1 | #################### 2 | Classification Tests 3 | #################### 4 | 5 | The goal of the following set of tests is to accomplish some monitoring goals: 6 | 7 | 1. Establish baselines for model performance in production per class 8 | 9 | 2. Establish maximum processing time for various volumes of data, through the statistical model 10 | 11 | 3. Ensure that the current model in production is the best available model according to a set of predefined measures 12 | 13 | Let's look at each of these classes of tests now. 14 | 15 | 16 | Lower Bound Classification Measures 17 | =================================== 18 | 19 | Each of the following examples ensures that your classifier meets a minimum criteria, which should be decided based on the need of your use-case. One simple way to do this is to define failure by how many dollars it will cost you. 20 | 21 | Precision, Recall and F1 score are great tools for ensuring your classifier optimizes for minimal misclassification, however you define it. 22 | 23 | That is why they are basis of the set of tests found below. 
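To make the intent concrete before diving into the library, here is a rough sketch of what a per-class lower boundary check boils down to, written directly against scikit-learn's metrics; the helper name and the boundaries passed to it are purely illustrative and are not part of drifter_ml's API::

    from sklearn.metrics import precision_recall_fscore_support

    def precision_lower_boundary_per_class(y_true, y_pred, lower_boundaries):
        # per-class precision, recall and f1 for the classes we care about
        classes = sorted(lower_boundaries.keys())
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, labels=classes
        )
        # the check fails if any class falls below its configured boundary
        return all(p >= lower_boundaries[klass]
                   for klass, p in zip(classes, precision))

The tests that follow wrap this kind of check up for you, running it against a trained model and a labeled dataset.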
24 | 25 | 26 | Classifier Test Example - Model Metrics 27 | ======================================= 28 | 29 | Suppose you had the following model:: 30 | 31 | from sklearn import tree 32 | import pandas as pd 33 | import numpy as np 34 | import joblib 35 | 36 | df = pd.DataFrame() 37 | for _ in range(1000): 38 | a = np.random.normal(0, 1) 39 | b = np.random.normal(0, 3) 40 | c = np.random.normal(12, 4) 41 | if a + b + c > 11: 42 | target = 1 43 | else: 44 | target = 0 45 | df = df.append({ 46 | "A": a, 47 | "B": b, 48 | "C": c, 49 | "target": target 50 | }, ignore_index=True) 51 | 52 | clf = tree.DecisionTreeClassifier() 53 | X = df[["A", "B", "C"]] 54 | clf.fit(X, df["target"]) 55 | joblib.dump(clf, "model.joblib") 56 | df.to_csv("data.csv") 57 | 58 | We could write the following set of tests to ensure this model does well:: 59 | 60 | from drifter_ml.classification_tests import ClassificationTests 61 | import joblib 62 | import pandas as pd 63 | 64 | def test_precision(): 65 | df = pd.read_csv("data.csv") 66 | column_names = ["A", "B", "C"] 67 | target_name = "target" 68 | clf = joblib.load("model.joblib") 69 | 70 | test_suite = ClassificationTests(clf, 71 | df, target_name, column_names) 72 | classes = list(df.target.unique()) 73 | assert test_suite.precision_lower_boundary_per_class( 74 | {klass: 0.9 for klass in classes} 75 | ) 76 | 77 | def test_recall(): 78 | df = pd.read_csv("data.csv") 79 | column_names = ["A", "B", "C"] 80 | target_name = "target" 81 | clf = joblib.load("model.joblib") 82 | 83 | test_suite = ClassificationTests(clf, 84 | df, target_name, column_names) 85 | classes = list(df.target.unique()) 86 | assert test_suite.recall_lower_boundary_per_class( 87 | {klass: 0.9 for klass in classes} 88 | ) 89 | 90 | def test_f1(): 91 | df = pd.read_csv("data.csv") 92 | column_names = ["A", "B", "C"] 93 | target_name = "target" 94 | clf = joblib.load("model.joblib") 95 | 96 | test_suite = ClassificationTests(clf, 97 | df, target_name, column_names) 98 | classes = list(df.target.unique()) 99 | assert test_suite.f1_lower_boundary_per_class( 100 | {klass: 0.9 for klass in classes} 101 | ) 102 | 103 | 104 | Or you could simply write one test for all three:: 105 | 106 | from drifter_ml.classification_tests import ClassificationTests 107 | import joblib 108 | import pandas as pd 109 | 110 | def test_precision_recall_f1(): 111 | df = pd.read_csv("data.csv") 112 | column_names = ["A", "B", "C"] 113 | target_name = "target" 114 | clf = joblib.load("model.joblib") 115 | 116 | test_suite = ClassificationTests(clf, 117 | df, target_name, column_names) 118 | classes = list(df.target.unique()) 119 | assert test_suite.classifier_testing_per_class( 120 | {klass: 0.9 for klass in classes}, 121 | {klass: 0.9 for klass in classes}, 122 | {klass: 0.9 for klass in classes} 123 | ) 124 | 125 | Regardless of which test you choose, you get complete flexibility to ensure your model always meets the minimum criteria so that your costs are minimized, given constraints. 126 | 127 | 128 | Classifier Test Example - Model Speed 129 | ===================================== 130 | 131 | Additionally, you can test to ensure your classifier performs, even under load. 
Assume we have the same model as before:: 132 | 133 | from sklearn import tree 134 | import pandas as pd 135 | import numpy as np 136 | import joblib 137 | 138 | df = pd.DataFrame() 139 | for _ in range(1000): 140 | a = np.random.normal(0, 1) 141 | b = np.random.normal(0, 3) 142 | c = np.random.normal(12, 4) 143 | if a + b + c > 11: 144 | target = 1 145 | else: 146 | target = 0 147 | df = df.append({ 148 | "A": a, 149 | "B": b, 150 | "C": c, 151 | "target": target 152 | }, ignore_index=True) 153 | 154 | clf = tree.DecisionTreeClassifier() 155 | X = df[["A", "B", "C"]] 156 | clf.fit(X, df["target"]) 157 | joblib.dump(clf, "model.joblib") 158 | df.to_csv("data.csv") 159 | 160 | Now we test to ensure the model predicts new labels within our constraints:: 161 | 162 | from drifter_ml.classification_tests import ClassificationTests 163 | import joblib 164 | import pandas as pd 165 | 166 | def test_precision_recall_f1_speed(): 167 | df = pd.read_csv("data.csv") 168 | column_names = ["A", "B", "C"] 169 | target_name = "target" 170 | clf = joblib.load("model.joblib") 171 | 172 | test_suite = ClassificationTests(clf, 173 | df, target_name, column_names) 174 | sample_sizes = [i for i in range(100, 1000, 100)] 175 | max_run_times = [100 for _ in range(len(sample_sizes))] 176 | assert test_suite.run_time_stress_test( 177 | sample_sizes, max_run_times 178 | ) 179 | 180 | This test ensures that, for simulated samples of 100 up to 900 elements, the model never takes longer than the maximum run time we set for each sample size. 181 | 182 | Cross Validation Based Testing 183 | ============================== 184 | 185 | In the last section we asked questions of our model with respect to a lower boundary, both for various model measures and for run time. Now, armed with cross validation, we can ask questions about sections of our dataset, to ensure that the measures we found are an accurate representation across the dataset, rather than one global metric over the entire dataset. Just to make sure we are all on the same page: cross validation breaks the dataset into unique samples, called folds; each fold is used in turn as the test sample while all the other folds are used for training, the score for each validation fold is recorded, and then the model is discarded. For more information and a detailed introduction see https://machinelearningmastery.com/k-fold-cross-validation/. 186 | 187 | The advantage of checking our model in this way is that it is now less likely that the model is just memorizing the training data, and more likely that it will generalize to other examples. This happens because the model scores are tested on a more limited dataset and also because "k" samples, the tuning parameter in cross validation, are tested to ensure the model performance is consistent. 188 | 189 | This also yields some advantages for testing, because now we can verify that our lower boundary precision, recall or f1 score holds across many folds, rather than being one global lower bound which may not be true on some subset of the data. This gives us more confidence in our model's overall efficacy, but it also requires that we have enough data to ensure our model can learn something. 190 | 191 | Sadly I could find no good rules of thumb, but I'd say you need at least something like 1000 data points per fold, and it's probably best to never go above 20 folds unless your dataset is truly massive, like in the gigabytes.
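If you want to look at the per-fold scores that this style of testing reasons about, scikit-learn's ``cross_val_score`` will show them to you directly; the choice of 10 folds and the ``precision`` scorer below are illustrative, not requirements::

    from sklearn import tree
    from sklearn.model_selection import cross_val_score
    import pandas as pd

    df = pd.read_csv("data.csv")
    X = df[["A", "B", "C"]]
    y = df["target"]

    clf = tree.DecisionTreeClassifier()
    # one precision score per fold - these per-fold values are what the
    # cross validation tests below compare against a lower boundary
    fold_scores = cross_val_score(clf, X, y, cv=10, scoring="precision")
    print(fold_scores, fold_scores.mean())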
192 | 193 | 194 | Classifier Test Example - Cross Validation Lower Bound Precision 195 | ================================================================ 196 | 197 | This example won't be that different from what you've seen before, except now we can tune on the number of folds to include. Let's spice things up by using a keras classifier instead of a scikit learn one:: 198 | 199 | from keras.models import Sequential 200 | from keras.layers import Dense 201 | from keras.wrappers.scikit_learn import KerasClassifier 202 | import pandas as pd 203 | import numpy as np 204 | import joblib 205 | 206 | # Function to create model, required for KerasClassifier 207 | def create_model(): 208 | # create model 209 | model = Sequential() 210 | model.add(Dense(12, input_dim=3, activation='relu')) 211 | model.add(Dense(8, activation='relu')) 212 | model.add(Dense(1, activation='sigmoid')) 213 | # Compile model 214 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 215 | return model 216 | 217 | # fix random seed for reproducibility 218 | df = pd.DataFrame() 219 | for _ in range(1000): 220 | a = np.random.normal(0, 1) 221 | b = np.random.normal(0, 3) 222 | c = np.random.normal(12, 4) 223 | if a + b + c > 11: 224 | target = 1 225 | else: 226 | target = 0 227 | df = df.append({ 228 | "A": a, 229 | "B": b, 230 | "C": c, 231 | "target": target 232 | }, ignore_index=True) 233 | 234 | # split into input (X) and output (Y) variables 235 | # create model 236 | clf = KerasClassifier(build_fn=create_model, epochs=150, batch_size=10, verbose=0) 237 | X = df[["A", "B", "C"]] 238 | clf.fit(X, df["target"]) 239 | joblib.dump(clf, "model.joblib") 240 | df.to_csv("data.csv") 241 | 242 | Now that we have the model and data saved, let's write the test:: 243 | 244 | from drifter_ml.classification_tests import ClassificationTests 245 | import joblib 246 | import pandas as pd 247 | 248 | def test_cv_precision_lower_boundary(): 249 | df = pd.read_csv("data.csv") 250 | column_names = ["A", "B", "C"] 251 | target_name = "target" 252 | clf = joblib.load("model.joblib") 253 | 254 | test_suite = ClassificationTests(clf, 255 | df, target_name, column_names) 256 | lower_boundary = 0.9 257 | test_suite.cross_val_precision_lower_boundary( 258 | lower_boundary 259 | ) 260 | 261 | There are a few things to notice here: 262 | 263 | 1. The set up didn't change - we train the model the same way, we store the model the same way, we pass the model in the same way. 264 | 265 | 2. We aren't specifying percision per class - we will see examples of tests like that below, but because of the added stringency of limiting our training set, as well as training it across several samples of the dataset, sometimes called folds, we now don't need to specify as much granularity. What we are really testing here is somewhat different - we want to make sure no samples of the dataset form significantly worse than the average. What we are really looking for is anomalous samples of the data, that the model does much worse on. Because any training set is just a sample, if a given subsample does much worse than others, then we need to ask the question - is this given subsample representative of a pattern we may see in the future? Is it truly an anamoly? If it's not, that's usually a strong indicator that our model needs some work. 
266 | 267 | Classifier Test Example - Cross Validation Average 268 | =================================================== 269 | 270 | In the above example we test to ensure that none of the folds fall below a precision of 0.9 per fold. But what if we only care whether one of the folds does significantly worse than the others, and don't actually care whether all the folds meet the minimum criteria? After all, some level of any model measure is determined by how much data you train it on. It could be the case that we are right on the edge of having enough labeled data to train the model for all the imperative cases, but not enough to really ensure 90% precision, recall or some other measure. If that is the case, then we could simply look to see if any of the folds does significantly worse than some notion of centrality, which could be a red flag on its own. 271 | 272 | Here we can set an allowed deviance from the center for precision, recall or f1 score. If a given fold falls outside that deviance from centrality then we believe some intervention needs to be taken. Let's look at an example:: 273 | 274 | from sklearn import tree 275 | import pandas as pd 276 | import numpy as np 277 | import joblib 278 | 279 | df = pd.DataFrame() 280 | for _ in range(1000): 281 | a = np.random.normal(0, 1) 282 | b = np.random.normal(0, 3) 283 | c = np.random.normal(12, 4) 284 | if a + b + c > 11: 285 | target = 1 286 | else: 287 | target = 0 288 | df = df.append({ 289 | "A": a, 290 | "B": b, 291 | "C": c, 292 | "target": target 293 | }, ignore_index=True) 294 | 295 | clf = tree.DecisionTreeClassifier() 296 | X = df[["A", "B", "C"]] 297 | clf.fit(X, df["target"]) 298 | joblib.dump(clf, "model.joblib") 299 | df.to_csv("data.csv") 300 | 301 | 302 | Let's see a test:: 303 | 304 | from drifter_ml.classification_tests import ClassificationTests 305 | import joblib 306 | import pandas as pd 307 | 308 | def test_cv_precision_anomaly_detection(): 309 | df = pd.read_csv("data.csv") 310 | column_names = ["A", "B", "C"] 311 | target_name = "target" 312 | clf = joblib.load("model.joblib") 313 | 314 | test_suite = ClassificationTests(clf, 315 | df, target_name, column_names) 316 | precision_tolerance = 0.2 317 | test_suite.cross_val_precision_anomaly_detection( 318 | precision_tolerance, method='mean' 319 | ) 320 | 321 | Here, instead of setting an expectation for the precision itself, we set an expectation for the deviance from the average precision. So if the average precision is 0.7 and one of the fold scores is less than 0.5, then the test fails. It's still important to have some lower boundary in place as well. However, we can be less stringent if we include this test.
A more complete test suite would likely be something like this:: 322 | 323 | from drifter_ml.classification_tests import ClassificationTests 324 | import joblib 325 | import pandas as pd 326 | 327 | def test_cv_precision_anomaly_detection(): 328 | df = pd.read_csv("data.csv") 329 | column_names = ["A", "B", "C"] 330 | target_name = "target" 331 | clf = joblib.load("model.joblib") 332 | 333 | test_suite = ClassificationTests(clf, 334 | df, target_name, column_names) 335 | precision_tolerance = 0.2 336 | test_suite.cross_val_precision_anomaly_detection( 337 | precision_tolerance, method='mean' 338 | ) 339 | 340 | def test_cv_precision_lower_boundary(): 341 | df = pd.read_csv("data.csv") 342 | column_names = ["A", "B", "C"] 343 | target_name = "target" 344 | clf = joblib.load("model.joblib") 345 | 346 | test_suite = ClassificationTests(clf, 347 | df, target_name, column_names) 348 | min_average = 0.7 349 | test_suite.cross_val_precision_avg( 350 | min_average, method='mean' 351 | ) 352 | 353 | Now we can say for sure that the precision should be at least 0.7 on average, but a given fold can fall up to 0.2 below that average before we raise an error. 354 | 355 | Classifier Test Example - Cross Validation Anomaly Detection With Spread 356 | ======================================================================== 357 | 358 | In the previous example we looked for a specific deviance; now we'll make use of some properties of statistics to define what exactly we mean by an anomalous fold. In order to do this, we'll look at deviance with respect to spread. To make this concrete, let's walk through what that means:: 359 | 360 | from drifter_ml.classification_tests import ClassificationTests 361 | import joblib 362 | import pandas as pd 363 | 364 | def test_cv_precision_anomaly_detection(): 365 | df = pd.read_csv("data.csv") 366 | column_names = ["A", "B", "C"] 367 | target_name = "target" 368 | clf = joblib.load("model.joblib") 369 | 370 | test_suite = ClassificationTests(clf, 371 | df, target_name, column_names) 372 | precision_tolerance = 0.2 373 | test_suite.cross_val_precision_anomaly_detection( 374 | precision_tolerance, method='mean' 375 | ) 376 | 377 | Before we go through what's happening, let's recall what cross validation is. The basic notion of cross validation is that random samples, called folds, are taken from the training set; each fold is held out in turn as the test set while the algorithm is trained on the remaining folds. For this reason, it is necessary that you have enough data that a pattern can be learned from it. For more information on k-fold check out this article: https://machinelearningmastery.com/k-fold-cross-validation/. 378 | 379 | As you can see we require a precision tolerance of 0.2 per fold of the cross validation. To understand how this comes into play, let's look at how cross validation anomaly detection is done generally in the library:: 380 | 381 | 1. decide on the measure of center to use 382 | 2. calculate the center (for example, the average) of all the scores (each score comes from a fold) 383 | 3. compute the list of deviances from that center 384 | 4. determine if the deviance from the center is ever greater than the tolerance 385 | 386 | So basically, this is a test for consistency on different folds of the data. If the model's performance deviates beyond the tolerance bound on any of the folds, then the test fails. This is really useful if you need your model to behave in an expected way most of the time.
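To make the four steps above concrete, here is a small sketch of the general logic; it is an illustration of the idea rather than the library's exact implementation::

    import numpy as np

    def fold_scores_within_tolerance(scores, tolerance, method="mean"):
        # 1. decide on the measure of center
        center = np.mean(scores) if method == "mean" else np.median(scores)
        # 2. and 3. compute each fold's deviance from that center
        deviances = np.abs(np.array(scores) - center)
        # 4. fail if any fold deviates by more than the tolerance
        return bool(np.all(deviances <= tolerance))

For example, fold scores of ``[0.91, 0.88, 0.67]`` with a tolerance of 0.2 pass, because no fold is more than 0.2 away from the mean of roughly 0.82.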
-------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | import os 16 | import sys 17 | sys.path.insert(0, os.path.abspath('.')) 18 | sys.path.insert(0, os.path.abspath('../')) 19 | 20 | 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = 'drifter_ml' 24 | copyright = '2019, Eric Schles' 25 | author = 'Eric Schles' 26 | 27 | # The short X.Y version 28 | version = '' 29 | # The full version, including alpha/beta/rc tags 30 | release = '0.20' 31 | 32 | 33 | # -- General configuration --------------------------------------------------- 34 | 35 | # If your documentation needs a minimal Sphinx version, state it here. 36 | # 37 | # needs_sphinx = '1.0' 38 | 39 | # Add any Sphinx extension module names here, as strings. They can be 40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 41 | # ones. 42 | extensions = [ 43 | 'sphinx.ext.autodoc', 44 | 'sphinx.ext.coverage', 45 | 'sphinx.ext.napoleon' 46 | ] 47 | 48 | # Add any paths that contain templates here, relative to this directory. 49 | templates_path = ['_templates'] 50 | 51 | # The suffix(es) of source filenames. 52 | # You can specify multiple suffix as a list of string: 53 | # 54 | # source_suffix = ['.rst', '.md'] 55 | source_suffix = '.rst' 56 | 57 | # The master toctree document. 58 | master_doc = 'index' 59 | 60 | # The language for content autogenerated by Sphinx. Refer to documentation 61 | # for a list of supported languages. 62 | # 63 | # This is also used if you do content translation via gettext catalogs. 64 | # Usually you set "language" from the command line for these cases. 65 | language = None 66 | 67 | # List of patterns, relative to source directory, that match files and 68 | # directories to ignore when looking for source files. 69 | # This pattern also affects html_static_path and html_extra_path. 70 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 71 | 72 | # The name of the Pygments (syntax highlighting) style to use. 73 | pygments_style = None 74 | 75 | 76 | # -- Options for HTML output ------------------------------------------------- 77 | 78 | # The theme to use for HTML and HTML Help pages. See the documentation for 79 | # a list of builtin themes. 80 | # 81 | html_theme = 'sphinx_rtd_theme' 82 | 83 | # Theme options are theme-specific and customize the look and feel of a theme 84 | # further. For a list of options available for each theme, see the 85 | # documentation. 86 | # 87 | # html_theme_options = {} 88 | 89 | # Add any paths that contain custom static files (such as style sheets) here, 90 | # relative to this directory. They are copied after the builtin static files, 91 | # so a file named "default.css" will overwrite the builtin "default.css". 
92 | html_static_path = ['_static'] 93 | 94 | # Custom sidebar templates, must be a dictionary that maps document names 95 | # to template names. 96 | # 97 | # The default sidebars (for documents that don't match any pattern) are 98 | # defined by theme itself. Builtin themes are using these templates by 99 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 100 | # 'searchbox.html']``. 101 | # 102 | # html_sidebars = {} 103 | 104 | 105 | # -- Options for HTMLHelp output --------------------------------------------- 106 | 107 | # Output file base name for HTML help builder. 108 | htmlhelp_basename = 'drifter_mldoc' 109 | 110 | 111 | # -- Options for LaTeX output ------------------------------------------------ 112 | 113 | latex_elements = { 114 | # The paper size ('letterpaper' or 'a4paper'). 115 | # 116 | # 'papersize': 'letterpaper', 117 | 118 | # The font size ('10pt', '11pt' or '12pt'). 119 | # 120 | # 'pointsize': '10pt', 121 | 122 | # Additional stuff for the LaTeX preamble. 123 | # 124 | # 'preamble': '', 125 | 126 | # Latex figure (float) alignment 127 | # 128 | # 'figure_align': 'htbp', 129 | } 130 | 131 | # Grouping the document tree into LaTeX files. List of tuples 132 | # (source start file, target name, title, 133 | # author, documentclass [howto, manual, or own class]). 134 | latex_documents = [ 135 | (master_doc, 'drifter_ml.tex', 'drifter\\_ml Documentation', 136 | 'Eric Schles', 'manual'), 137 | ] 138 | 139 | 140 | # -- Options for manual page output ------------------------------------------ 141 | 142 | # One entry per manual page. List of tuples 143 | # (source start file, name, description, authors, manual section). 144 | man_pages = [ 145 | (master_doc, 'drifter_ml', 'drifter_ml Documentation', 146 | [author], 1) 147 | ] 148 | 149 | 150 | # -- Options for Texinfo output ---------------------------------------------- 151 | 152 | # Grouping the document tree into Texinfo files. List of tuples 153 | # (source start file, target name, title, author, 154 | # dir menu entry, description, category) 155 | texinfo_documents = [ 156 | (master_doc, 'drifter_ml', 'drifter_ml Documentation', 157 | author, 'drifter_ml', 'One line description of project.', 158 | 'Miscellaneous'), 159 | ] 160 | 161 | 162 | # -- Options for Epub output ------------------------------------------------- 163 | 164 | # Bibliographic Dublin Core info. 165 | epub_title = project 166 | 167 | # The unique identifier of the text. This can be a ISBN number 168 | # or the project homepage. 169 | # 170 | # epub_identifier = '' 171 | 172 | # A unique identification for the text. 173 | # 174 | # epub_uid = '' 175 | 176 | # A list of files that should not be packed into the epub file. 177 | epub_exclude_files = ['search.html'] 178 | -------------------------------------------------------------------------------- /docs/designing-your-own-tests.rst: -------------------------------------------------------------------------------- 1 | ######################## 2 | Designing your own tests 3 | ######################## 4 | 5 | Before we jump into the API and all the premade tests that have been written to make your life easier, let's talk about a process for designing your own machine learning tests. The reason for doing this is important, machine learning testing is not like other software engineering tests. That's because software engineering tests are deterministic, like software engineering code ought to be. 
However, when you write tests for your data or your machine learning model, you need to account for the probabilistic nature of the code you are writing. The goal, therefore is much more fuzzy. But the process defined below should help you out. 6 | 7 | 8 | It's About Proving Or Disproving Assumptions 9 | ============================================ 10 | 11 | There are a standard set of steps to any machine learning project: 12 | 13 | 1. Exploratory Analysis 14 | 2. Data Cleaning 15 | 3. Model Evaluation 16 | 4. Productionalizing The Model 17 | 5. Monitoring The Model 18 | 19 | Machine learning tests are really about monitoring, but the big open question is, what do you monitor? 20 | 21 | Monitoring the steps you took in 1-3 above, gives at least a base line. There will likely be other things to account for and monitor once you go into production, but what you've found in evaluation will likely be helpful later. So that should inform your first set of tests. 22 | 23 | 24 | Data Monitoring Tests 25 | ====================== 26 | 27 | Specifically, we can monitor the data by: 28 | 29 | * checking to see if any descriptive statistics you found have changed substantially 30 | * checking to see if current data is correlated with previous data per column 31 | * checking to see if columns that were correlated or uncorrelated in past data remain that way 32 | * checking to see if the number of clusters in the data has changed in a meaningful way 33 | * checking to see whether the number of missing values stays consistent between new and old data, 34 | * checking to see certain monotonicity requirements between columns remain consistent 35 | 36 | It is an imperative to model the data because your model is merely a function of your data. If your data is bad or changes in some important way, your model will be useless. Also, there may be more measures you used to evaluate the data and those may become important features of whatever model you build later on. Therefore, making sure your data continues to follow the trends found previously may be of great import. Otherwise, your model might be wrong and you'd never know it. 37 | 38 | 39 | Model Monitoring Tests 40 | ======================= 41 | 42 | Additionally, we can monitor the model itself: 43 | 44 | * checking to see if the model meets all metric requirements as specified by the business use-case 45 | * checking to see if the model does better than some other test model on all measures of interest 46 | 47 | 48 | System Monitoring Tests 49 | ======================== 50 | 51 | Finally, there are also traditional tests one should run: 52 | 53 | * making sure the serialized model exists where expected 54 | * making sure the data exists where expected 55 | * making sure data can flow into the system, to the model and through it 56 | * making sure the new data matches the types you expect 57 | * making sure the model produces the types you expect 58 | * making sure new models can be deployed to the model pipeline 59 | * making sure the model can perform well under load 60 | * making sure the data can flow through fast enough to reach the model at ingress and egress 61 | 62 | These three classes of machine learning system evaluation form a minimal reference set for monitoring such a system. There are likely more tests you'll need to write, but again just to outline the process in clear terms: 63 | 64 | 1. 
Look at what you wrote when you did exploratory analysis and data cleaning, turn those into tests to make sure your data stays that way, as long as it's supposed to 65 | 66 | 2. Look at how your model performed on test and training data, turn those evaluation measures into tests to make sure your model performs as well in production 67 | 68 | 3. Make sure everything actually goes from point A (the start of your system) to point B (the end of your system). 69 | 70 | 71 | Fairness Monitoring Tests 72 | ========================== 73 | 74 | There is a fourth class of tests that are unclear regarding the ethical nature of the algorithm you are building. These tests are unfortunately poorly defined at the present moment and very context specific, so all that can be offered is an example of what one might do: 75 | 76 | Suppose you worked for a bank and were writing a piece of software that determined who gets a loan. Assuming a fair system folks from all races, genders, ages would get loans at a similar rate or would perhaps not be rejected due to race, gender, age or other factors. 77 | 78 | If when accounting for some protected variable like race, gender, or age your algorithm does something odd compared to when not accounting for race, gender, or age then your algorithm may be biased. 79 | 80 | However, this field of research is far from complete. There are some notions of testing for this, at the present moment they appear to be in need of further research and analysis. However, if possible, one should account for such a set of tests if possible, to ensure your algorithm is fair, unbiased and treats all individuals equally and fairly. -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. drifter_ml documentation master file, created by 2 | sphinx-quickstart on Thu Mar 14 07:54:18 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Drifter ML's documentation! 7 | ====================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | 12 | introduction 13 | project-setup 14 | designing-your-own-tests 15 | classification-tests 16 | regression-tests 17 | api_docs/drifter_ml.classification_tests 18 | api_docs/drifter_ml.regression_tests 19 | api_docs/drifter_ml.columnar_tests 20 | api_docs/drifter_ml.structural_tests 21 | 22 | Indices and tables 23 | ================== 24 | 25 | * :ref:`genindex` 26 | * :ref:`modindex` 27 | * :ref:`search` 28 | -------------------------------------------------------------------------------- /docs/introduction.rst: -------------------------------------------------------------------------------- 1 | ############ 2 | Introduction 3 | ############ 4 | 5 | Welcome to Drifter, a tool to help you test your machine learning models. This testing framework is broken out semantically, so you can test different aspects of your machine learning system. 
6 | 7 | The tests come in two general flavors, component tests, like this one that tests for a minimum precision per class:: 8 | 9 | from drifter_ml.classification_tests import ClassificationTests 10 | import joblib 11 | import pandas as pd 12 | 13 | def test_precision(): 14 | clf = joblib.load("random_forest.joblib") 15 | test_data = pd.read_csv("test.csv") 16 | columns = test_data.columns.tolist() 17 | columns.remove("target") 18 | clf_tests = ClassificationTests(clf, test_data, "target", columns) 19 | classes = set(test_data["target"]) 20 | precision_per_class = {klass: 0.9 for klass in classes} 21 | clf_tests.precision_lower_boundary_per_class(precision_per_class) 22 | 23 | 24 | And an entire test suite that tests for precision, recall and f1 score in one test:: 25 | 26 | from drifter_ml.classification_tests import ClassificationTests 27 | import joblib 28 | import pandas as pd 29 | 30 | def test_precision(): 31 | clf = joblib.load("random_forest.joblib") 32 | test_data = pd.read_csv("test.csv") 33 | columns = test_data.columns.tolist() 34 | columns.remove("target") 35 | clf_tests = ClassificationTests(clf, test_data, "target", columns) 36 | classes = set(test_data["target"]) 37 | precision_per_class = {klass: 0.9 for klass in classes} 38 | recall_per_class = {klass: 0.9 for klass in classes} 39 | f1_per_class = {klass: 0.9 for klass in classes} 40 | clf_tests.classifier_testing( 41 | precision_per_class, 42 | recall_per_class, 43 | f1_per_class 44 | ) 45 | 46 | 47 | The expectation at present is that all models follow the scikit learn api, which means there is an expectation of a `fit` and `predict` on all models. This may appear exclusionary, but you can infact wrap keras models with scikit-learn style objects, allowing for the same api:: 48 | 49 | from keras.models import Sequential 50 | from keras.layers import Dense 51 | from keras.wrappers.scikit_learn import KerasClassifier 52 | from sklearn.model_selection import StratifiedKFold 53 | from sklearn.model_selection import cross_val_score 54 | import numpy 55 | 56 | # Function to create model, required for KerasClassifier 57 | def create_model(): 58 | # create model 59 | model = Sequential() 60 | model.add(Dense(12, input_dim=8, activation='relu')) 61 | model.add(Dense(8, activation='relu')) 62 | model.add(Dense(1, activation='sigmoid')) 63 | # Compile model 64 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 65 | return model 66 | 67 | # fix random seed for reproducibility 68 | seed = 7 69 | numpy.random.seed(seed) 70 | # load pima indians dataset 71 | dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",") 72 | # split into input (X) and output (Y) variables 73 | X = dataset[:,0:8] 74 | Y = dataset[:,8] 75 | # create model 76 | model = KerasClassifier(build_fn=create_model, epochs=150, batch_size=10, verbose=0) 77 | # evaluate using 10-fold cross validation 78 | kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed) 79 | results = cross_val_score(model, X, Y, cv=kfold) 80 | print(results.mean()) 81 | 82 | This means that traditional machine learning and deep learning are available for testing out of the box! -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 
11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/project-setup.rst: -------------------------------------------------------------------------------- 1 | ############# 2 | Project Setup 3 | ############# 4 | 5 | Regression and Classification Tests 6 | =================================== 7 | 8 | If you are going to use regression or classification tests, you'll need to do a bit of setup. The first step is making sure you have a test set with labeled data that you can trust. It is recommended that you break your initial labeled dataset up into test and train and keep the test for both the model generation phase as well as for model monitoring throughout. 9 | 10 | A good rule of thumb is to have 70% train, and 30% test. Other splits may be ideal, depending on the needs of your project. You can setup test and train using existing tools from sklearn as follows:: 11 | 12 | from sklearn.model_selection import train_test_split 13 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) 14 | 15 | Once you have your two datasets you can train your model with the training set, as is typical:: 16 | 17 | from sklearn import tree 18 | import pandas as pd 19 | import numpy as np 20 | from sklearn.model_selection import train_test_split 21 | import joblib 22 | 23 | df = pd.DataFrame() 24 | for _ in range(5000): 25 | a = np.random.normal(0, 1) 26 | b = np.random.normal(0, 3) 27 | c = np.random.normal(12, 4) 28 | if a + b + c > 11: 29 | target = 1 30 | else: 31 | target = 0 32 | df = df.append({ 33 | "A": a, 34 | "B": b, 35 | "C": c, 36 | "target": target 37 | }, ignore_index=True) 38 | 39 | clf = tree.DecisionTreeClassifier() 40 | X = df[["A", "B", "C"]] 41 | y = df["target"] 42 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) 43 | 44 | clf.fit(X_train, y_train) 45 | joblib.dump(clf, "model.joblib") 46 | df.to_csv("data.csv") 47 | test_data = pd.DataFrame() 48 | test_data[["A", "B", "C"]] 49 | test_data["target"] = y_test 50 | test_data.to_csv("test_data.csv") 51 | 52 | Then you can test against your model before you put it into production as follows:: 53 | 54 | import joblib 55 | import pandas as pd 56 | from sklearn.metrics import f1_score 57 | 58 | clf = joblib.load("model.joblib") 59 | test_data = pd.read_csv("test_data.csv") 60 | y_pred = clf.predict(test_data[["A", "B", "C"]]) 61 | y_true = test_data["target"] 62 | print(f1_score(y_true, y_pred)) 63 | 64 | It's worth noting that one score is likely never good enough, you need to include multiple measures to ensure your model is not simply fitting towards a single measure. Assuming the measures are good enough you can move onto productionizing your model. 
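As a concrete illustration of checking multiple measures at once, you might look at precision, recall, f1 and ROC AUC together before promoting a model; the 0.9 threshold below is a placeholder for whatever your use-case actually requires::

    import joblib
    import pandas as pd
    from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

    clf = joblib.load("model.joblib")
    test_data = pd.read_csv("test_data.csv")
    X_test = test_data[["A", "B", "C"]]
    y_true = test_data["target"]
    y_pred = clf.predict(X_test)

    scores = {
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
        # probability of the positive class is what AUC needs
        "roc_auc": roc_auc_score(y_true, clf.predict_proba(X_test)[:, 1]),
    }
    print(scores)
    assert all(value >= 0.9 for value in scores.values())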
65 | 66 | Strategies For Testing Your Productionized Model 67 | ================================================ 68 | 69 | Once you've put your model into production there are a few strategies for making sure your model continues to meet your requirements: 70 | 71 | 1. Using the test set from training - Gather new data and predictions from production and then train a new classifier or regressor with the new data and new predictions. Then test against the test set you've set aside. If the measures stay approximately the same, it's possible your model is performing as expected. It's important that the new classifier has the same hyperparameters as the one in production, and that the same versions are used for all associated code that creates the new model object. 72 | 73 | 2. Generating a new test set from a process - Gather new data and new predictions from the production model. Then manually label the same set of new data, either via some human labeling process or another process you believe can generate faithful labels. Then validate the manually labeled examples against the predicted examples. If you are predicting new data a lot, I recommend taking random non-overlapping samples from the production data and labeling those. 74 | 75 | 3. Generating a new test set from a process and then doing label propagation - Gather new data and new predictions from the production model. Then manually label a small set of the new data in some manner. Make sure to have multiple people manually label the same data, until everyone agrees on the ground truth. Then generate a new set of labels via label propagation. Then have people manually check the newly propagated labels; if the propagated labels agree with the manual labels often enough, continue the label propagation process. Continue to check random non-overlapping samples until you feel satisfied, then label the remainder of the production data. 76 | 77 | 78 | Using The Test Set From Training 79 | ================================ 80 | 81 | The above description is a bit terse, so let's break it down with some example code to inform your own project setup.
First let's assume that you have some data to train on and test on:: 82 | 83 | from sklearn import tree 84 | import pandas as pd 85 | import numpy as np 86 | from sklearn.model_selection import train_test_split 87 | import joblib 88 | 89 | df = pd.DataFrame() 90 | for _ in range(5000): 91 | a = np.random.normal(0, 1) 92 | b = np.random.normal(0, 3) 93 | c = np.random.normal(12, 4) 94 | if a + b + c > 11: 95 | target = 1 96 | else: 97 | target = 0 98 | df = df.append({ 99 | "A": a, 100 | "B": b, 101 | "C": c, 102 | "target": target 103 | }, ignore_index=True) 104 | 105 | clf = tree.DecisionTreeClassifier() 106 | X = df[["A", "B", "C"]] 107 | y = df["target"] 108 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) 109 | 110 | clf.fit(X_train, y_train) 111 | joblib.dump(clf, "model.joblib") 112 | df.to_csv("data.csv") 113 | test_data = X_test.copy() 114 | test_data["target"] = y_test 115 | test_data.to_csv("test_data.csv") 116 | 117 | 118 | Next we need to test our model to make sure it's performing well enough to go into production:: 119 | 120 | import joblib 121 | import pandas as pd 122 | from sklearn.metrics import classification_report 123 | 124 | clf = joblib.load("model.joblib") 125 | test_data = pd.read_csv("test_data.csv") 126 | y_pred = clf.predict(test_data[["A", "B", "C"]]) 127 | y_true = test_data["target"] 128 | print(classification_report(y_true, y_pred)) 129 | 130 | Let's assume everything met our minimum criteria for going to production. Now we are ready to put our model into production! For this we'll need to write our test such that it makes use of the test data, our new data and our new predictions. For the purposes of the example below, assume you've been saving new data and new predictions to a csv called new_data.csv, that you have saved your production model in a file called model.joblib, and that you have test data saved to test_data.csv. Below is an example test you might write using the framework:: 131 | 132 | import joblib 133 | import pandas as pd 134 | from sklearn import tree 135 | from drifter_ml import classification_tests 136 | 137 | def generate_model_from_production_data(): 138 | new_data = pd.read_csv("new_data.csv") 139 | prod_clf = joblib.load("model.joblib") 140 | test_data = pd.read_csv("test_data.csv") 141 | return test_data, new_data, prod_clf 142 | 143 | def test_precision(): 144 | test_data, new_data, prod_clf = generate_model_from_production_data() 145 | column_names = ["A", "B", "C"] 146 | target_name = "target" 147 | test_clf = tree.DecisionTreeClassifier() 148 | test_clf.set_params(**prod_clf.get_params()) 149 | X = new_data[column_names] 150 | y = new_data[target_name] 151 | test_clf.fit(X, y) 152 | 153 | test_suite = classification_tests.ClassificationTests(test_clf, 154 | test_data, target_name, column_names) 155 | classes = list(test_data.target.unique()) 156 | lower_bound_requirement = {klass: 0.9 for klass in classes} 157 | assert test_suite.precision_lower_boundary_per_class( 158 | lower_bound_requirement 159 | ) 160 | 161 | Notice that we train on the production data and labels (in this case the target column of new_data.csv) and then test against the labels we know. Here we use the lower_bound_requirement variable to set the expectation for how well the model should do against the test set. If the labels generated by the production model train a model that performs as well on the test data as the production model did on the test set, then we have some confidence in the labels it produces.
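If you save a test like this in a file, say test_production_model.py (the filename is just for illustration), you can run it on a schedule alongside the rest of your test suite with pytest::

    python -m pytest test_production_model.py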
This is probably not the only way one could do this comparison; if you come up with something better, please share it back with the project! -------------------------------------------------------------------------------- /docs/regression-tests.rst: -------------------------------------------------------------------------------- 1 | ################# 2 | Regression Tests 3 | ################# 4 | 5 | This section will likely be the most confusing for anyone coming from classical software engineering. Here regression refers to a model that outputs a floating point number, instead of a class. The most important difference between classification and regression is that the numbers produced by regression are "real" numbers: they have magnitude, direction, a sense of scale, and so on. 6 | 7 | Classification returns a "class", which means class "1" has no ordering relationship with class "2", so you shouldn't compare classes with ordering. 8 | 9 | In any event, the regression tests break out into the following categories: 10 | 11 | 1. Establish a baseline maximum error tolerance based on a model measure 12 | 2. Establish a tolerance level for deviance from the average fold error 13 | 3. Stress testing for the speed of calculating new values 14 | 4. Comparison of the current model against new models for the above defined measures 15 | 5. Comparison of the speed of performance against new models 16 | 17 | Upper Bound Regression Metrics 18 | ============================== 19 | 20 | Each of the following examples ensures that your model meets a minimum criterion, which should be decided based on the needs of your use case. One simple way to do this is to define failure by how many dollars it will cost you for every unit amount your model is off on average. 21 | 22 | Mean Squared Error and Median Absolute Error are great tools for ensuring your regressor optimizes for least error. The scale of that error will be entirely context specific. 23 | 24 | That is why they are the basis of the set of tests found below.
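For instance, suppose being off by one unit on average costs you a known amount of money; you can turn that budget into a boundary for the tests below. The figures in this sketch are invented purely for illustration::

    # hypothetical numbers: each unit of median absolute error costs ~$50
    # per prediction, and the business can absorb at most $500 per prediction
    cost_per_unit_error = 50.0
    acceptable_cost = 500.0
    mae_boundary = acceptable_cost / cost_per_unit_error  # 10.0

The resulting boundary is the value you would pass to the upper bound tests in the next section.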
25 | 26 | Regression Test Example - Model Metrics 27 | ======================================= 28 | 29 | Suppose you had the following model:: 30 | 31 | from sklearn import linear_model 32 | import pandas as pd 33 | import numpy as np 34 | import joblib 35 | 36 | df = pd.DataFrame() 37 | for _ in range(1000): 38 | a = np.random.normal(0, 1) 39 | b = np.random.normal(0, 3) 40 | c = np.random.normal(12, 4) 41 | target = 5*a + 3*b + c 42 | df = df.append({ 43 | "A": a, 44 | "B": b, 45 | "C": c, 46 | "target": target 47 | }, ignore_index=True) 48 | 49 | reg = linear_model.LinearRegression() 50 | X = df[["A", "B", "C"]] 51 | reg.fit(X, df["target"]) 52 | joblib.dump(reg, "model.joblib") 53 | df.to_csv("data.csv") 54 | 55 | We could write the following set of tests to ensure this model does well:: 56 | 57 | from drifter_ml.regression_tests import RegressionTests 58 | import joblib 59 | import pandas as pd 60 | 61 | def test_mse(): 62 | df = pd.read_csv("data.csv") 63 | column_names = ["A", "B", "C"] 64 | target_name = "target" 65 | reg = joblib.load("model.joblib") 66 | 67 | test_suite = RegressionTests(reg, 68 | df, target_name, column_names) 69 | mse_boundary = 15 70 | assert test_suite.mse_upper_boundary(mse_boundary) 71 | 72 | def test_mae(): 73 | df = pd.read_csv("data.csv") 74 | column_names = ["A", "B", "C"] 75 | target_name = "target" 76 | reg = joblib.load("model.joblib") 77 | 78 | test_suite = RegressionTests(reg, 79 | df, target_name, column_names) 80 | mae_boundary = 10 81 | assert test_suite.mae_upper_boundary(mae_boundary) 82 | 83 | Or you could simply write one test that checks all of the upper boundaries at once, using the combined method (which also covers the trimean based errors):: 84 | 85 | from drifter_ml.regression_tests import RegressionTests 86 | import joblib 87 | import pandas as pd 88 | 89 | def test_error_upper_boundaries(): 90 | df = pd.read_csv("data.csv") 91 | column_names = ["A", "B", "C"] 92 | target_name = "target" 93 | reg = joblib.load("model.joblib") 94 | 95 | test_suite = RegressionTests(reg, 96 | df, target_name, column_names) 97 | mse_boundary, mae_boundary = 15, 10 98 | tse_boundary, tae_boundary = 15, 10 99 | assert test_suite.upper_bound_regression_testing(mse_boundary, 100 | mae_boundary, tse_boundary, tae_boundary) 101 | 102 | Regression Test Example - Model Speed 103 | ===================================== 104 | 105 | Additionally, you can test to ensure your regressor performs well, even under load.
Assume we have the same model as before:: 106 | 107 | from sklearn import linear_model 108 | import pandas as pd 109 | import numpy as np 110 | import joblib 111 | 112 | df = pd.DataFrame() 113 | for _ in range(1000): 114 | a = np.random.normal(0, 1) 115 | b = np.random.normal(0, 3) 116 | c = np.random.normal(12, 4) 117 | target = 5*a + 3*b + c 118 | df = df.append({ 119 | "A": a, 120 | "B": b, 121 | "C": c, 122 | "target": target 123 | }, ignore_index=True) 124 | 125 | reg = linear_model.LinearRegression() 126 | X = df[["A", "B", "C"]] 127 | reg.fit(X, df["target"]) 128 | joblib.dump(reg, "model.joblib") 129 | df.to_csv("data.csv") 130 | 131 | Now we test to ensure the model predicts new labels within our constraints:: 132 | 133 | from drifter_ml.regression_tests import RegressionTests 134 | import joblib 135 | import pandas as pd 136 | 137 | def test_prediction_speed(): 138 | df = pd.read_csv("data.csv") 139 | column_names = ["A", "B", "C"] 140 | target_name = "target" 141 | reg = joblib.load("model.joblib") 142 | 143 | test_suite = RegressionTests(reg, 144 | df, target_name, column_names) 145 | sample_sizes = [] 146 | max_run_times = [] 147 | for size in range(1, 100000, 100): 148 | sample_sizes.append(size) 149 | max_run_times.append(10.0) # seconds 150 | 151 | assert test_suite.run_time_stress_test( 152 | sample_sizes, max_run_times 153 | ) 154 | 155 | This test ensures that for sample sizes from 1 up to 100,000 elements, prediction never takes longer than 10 seconds per call. 156 | 157 | -------------------------------------------------------------------------------- /drifter_ml/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.25' 2 | 3 | from .classification_tests import classification_tests 4 | from .columnar_tests import columnar_tests 5 | from .regression_tests import regression_tests 6 | from .structural_tests import structural_tests 7 | 8 | 9 | __all__ = ["classification_tests", "columnar_tests", "regression_tests", "structural_tests"] 10 | -------------------------------------------------------------------------------- /drifter_ml/classification_tests/__init__.py: -------------------------------------------------------------------------------- 1 | from .classification_tests import ClassificationTests 2 | from .classification_tests import ClassifierComparison 3 | 4 | __all__ = ["ClassificationTests", "ClassifierComparison"] 5 | -------------------------------------------------------------------------------- /drifter_ml/columnar_tests/__init__.py: -------------------------------------------------------------------------------- 1 | from .columnar_tests import DataSanitization 2 | from .columnar_tests import ColumnarData 3 | 4 | __all__ = ["DataSanitization", "ColumnarData"] 5 | -------------------------------------------------------------------------------- /drifter_ml/columnar_tests/columnar_tests.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | import numpy as np 3 | import time 4 | from scipy import stats 5 | from mlxtend.evaluate import permutation_test 6 | 7 | class DataSanitization(): 8 | def __init__(self, data): 9 | self.data = data 10 | 11 | def is_complete(self, column): 12 | return self.data[column].isnull().sum() == 0 13 | 14 | def has_completeness(self, column, threshold): 15 | return self.data[column].isnull().sum()/len(self.data) > threshold 16 | 17 | def is_unique(self, column): 18 | return len(self.data[column].unique())/len(self.data) == 1 19 | 20 | def 
has_uniqueness(self, column, threshold): 21 | return len(self.data[column].unique())/len(self.data) > threshold 22 | 23 | def is_in_range(self, column, lower_bound, upper_bound, threshold): 24 | return self.data[(self.data[column] <= upper_bound) & (self.data[column] >= lower_bound)]/len(self.data) > threshold 25 | 26 | def is_non_negative(self, column): 27 | return self.data[self.data[column] > 0] 28 | 29 | def is_less_than(self, column_one, column_two): 30 | return self.data[self.data[column_one] < self.data[column_two]].all() 31 | 32 | class ColumnarData(): 33 | def __init__(self, historical_data, new_data): 34 | self.new_data = new_data 35 | self.historical_data = historical_data 36 | 37 | def mean_similarity(self, column, tolerance=2): 38 | new_mean = float(np.mean(self.new_data[column])) 39 | old_mean = float(np.mean(self.historical_data[column])) 40 | std = float(np.std(self.historical_data[column])) 41 | upper_bound = old_mean + (std * tolerance) 42 | lower_bound = old_mean - (std * tolerance) 43 | if new_mean < lower_bound: 44 | return False 45 | elif new_mean > upper_bound: 46 | return False 47 | else: 48 | return True 49 | 50 | def median_similarity(self, column, tolerance=2): 51 | new_median = float(np.median(self.new_data[column])) 52 | old_median = float(np.median(self.historical_data[column])) 53 | iqr = float(stats.iqr(self.historical_data[column])) 54 | upper_bound = old_median + (iqr * tolerance) 55 | lower_bound = old_median - (iqr * tolerance) 56 | if new_median < lower_bound: 57 | return False 58 | elif new_median > upper_bound: 59 | return False 60 | else: 61 | return True 62 | 63 | def trimean(self, data): 64 | q1 = float(np.quantile(data, 0.25)) 65 | q3 = float(np.quantile(data, 0.75)) 66 | median = float(np.median(data)) 67 | return (q1 + 2*median + q3)/4 68 | 69 | def trimean_absolute_deviation(self, data): 70 | trimean = self.trimean(data) 71 | numerator = [abs(elem - trimean) for elem in data] 72 | return sum(numerator)/len(data) 73 | 74 | def trimean_similarity(self, column, tolerance=2): 75 | new_trimean = self.trimean(self.new_data[column]) 76 | old_trimean = self.trimean(self.historical_data[column]) 77 | tad = self.trimean_absolute_deviation(self.historical_data[column]) 78 | upper_bound = old_trimean + (tad * tolerance) 79 | lower_bound = old_trimean - (tad * tolerance) 80 | if new_trimean < lower_bound: 81 | return False 82 | if new_trimean > upper_bound: 83 | return False 84 | else: 85 | return True 86 | 87 | def is_normal(self, column): 88 | new_data_result = stats.normaltest(self.new_data[column]) 89 | historical_data_result = stats.normaltest(self.historical_data[column]) 90 | if new_data_result.pvalue > 0.05 and historical_data_result.pvalue > 0.05: 91 | return True 92 | return False 93 | 94 | def pearson_similar_correlation(self, column, 95 | correlation_lower_bound, 96 | pvalue_threshold=0.05, 97 | num_rounds=3): 98 | correlation_info = stats.pearsonr(self.new_data[column], 99 | self.historical_data[column]) 100 | p_value = permutation_test( 101 | self.new_data[column], 102 | self.historical_data[column], 103 | method="approximate", 104 | num_rounds=num_rounds, 105 | func=lambda x, y: stats.pearsonr(x, y)[0], 106 | seed=0) 107 | if p_value > pvalue_threshold: 108 | return False 109 | if correlation_info[0] < correlation_lower_bound: 110 | return False 111 | return True 112 | 113 | def spearman_similar_correlation(self, column, 114 | correlation_lower_bound, 115 | pvalue_threshold=0.05, 116 | num_rounds=3): 117 | correlation_info = 
stats.spearmanr(self.new_data[column], 118 | self.historical_data[column]) 119 | p_value = permutation_test( 120 | self.new_data[column], 121 | self.historical_data[column], 122 | method="approximate", 123 | num_rounds=num_rounds, 124 | func=lambda x, y: stats.spearmanr(x, y).correlation, 125 | seed=0) 126 | if p_value > pvalue_threshold: 127 | return False 128 | if correlation_info.correlation < correlation_lower_bound: 129 | return False 130 | return True 131 | 132 | def wilcoxon_similar_distribution(self, column, 133 | pvalue_threshold=0.05, 134 | num_rounds=3): 135 | p_value = permutation_test( 136 | self.new_data[column], 137 | self.historical_data[column], 138 | method="approximate", 139 | num_rounds=num_rounds, 140 | func=lambda x, y: stats.wilcoxon(x, y).statistic, 141 | seed=0) 142 | if p_value < pvalue_threshold: 143 | return False 144 | return True 145 | 146 | def ks_2samp_similar_distribution(self, column, 147 | pvalue_threshold=0.05, 148 | num_rounds=3): 149 | p_value = permutation_test( 150 | self.new_data[column], 151 | self.historical_data[column], 152 | method="approximate", 153 | num_rounds=num_rounds, 154 | func=lambda x, y: stats.ks_2samp(x, y).statistic, 155 | seed=0) 156 | if p_value < pvalue_threshold: 157 | return False 158 | return True 159 | 160 | def kruskal_similar_distribution(self, column, 161 | pvalue_threshold=0.05, 162 | num_rounds=3): 163 | p_value = permutation_test( 164 | self.new_data[column], 165 | self.historical_data[column], 166 | method="approximate", 167 | num_rounds=num_rounds, 168 | func=lambda x, y: stats.kruskal(x, y).statistic, 169 | seed=0) 170 | if p_value < pvalue_threshold: 171 | return False 172 | return True 173 | 174 | def mann_whitney_u_similar_distribution(self, column, 175 | pvalue_threshold=0.05, 176 | num_rounds=3): 177 | p_value = permutation_test( 178 | self.new_data[column], 179 | self.historical_data[column], 180 | method="approximate", 181 | num_rounds=num_rounds, 182 | func=lambda x, y: stats.mannwhitneyu(x, y).statistic, 183 | seed=0) 184 | 185 | if p_value < pvalue_threshold: 186 | return False 187 | return True 188 | -------------------------------------------------------------------------------- /drifter_ml/regression_tests/__init__.py: -------------------------------------------------------------------------------- 1 | from .regression_tests import RegressionTests 2 | from .regression_tests import RegressionComparison 3 | 4 | __all__ = ["RegressionTests", "RegressionComparison"] 5 | -------------------------------------------------------------------------------- /drifter_ml/regression_tests/regression_tests.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | import numpy as np 3 | import time 4 | from scipy import stats 5 | from sklearn.model_selection import cross_validate, cross_val_predict 6 | 7 | class RegressionTests(): 8 | def __init__(self, 9 | reg, 10 | test_data, 11 | target_name, 12 | column_names): 13 | self.reg = reg 14 | self.column_names = column_names 15 | self.target_name = target_name 16 | self.test_data = test_data 17 | self.y = test_data[target_name] 18 | self.X = test_data[column_names] 19 | 20 | def get_test_score(self, cross_val_dict): 21 | return list(cross_val_dict["test_score"]) 22 | 23 | def trimean(self, data): 24 | """ 25 | I'm exposing this as a public method because 26 | the trimean is not implemented in enough packages. 
27 | 28 | Formula: 29 | (25th percentile + 2*50th percentile + 75th percentile)/4 30 | 31 | Parameters 32 | ---------- 33 | data : array-like 34 | an iterable, either a list or a numpy array 35 | 36 | Returns 37 | ------- 38 | the trimean: float 39 | """ 40 | q1 = np.quantile(data, 0.25) 41 | q3 = np.quantile(data, 0.75) 42 | median = np.median(data) 43 | return (q1 + 2*median + q3)/4 44 | 45 | def trimean_absolute_deviation(self, data): 46 | """ 47 | The trimean absolute deviation is the 48 | the average distance from the trimean. 49 | 50 | Parameters 51 | ---------- 52 | data : array-like 53 | an iterable, either a list or a numpy array 54 | 55 | Returns 56 | ------- 57 | the average distance to the trimean: float 58 | """ 59 | trimean = self.trimean(data) 60 | numerator = [abs(elem - trimean) for elem in data] 61 | return sum(numerator)/len(data) 62 | 63 | def describe_scores(self, scores, method): 64 | """ 65 | Describes scores. 66 | 67 | Parameters 68 | ---------- 69 | scores : array-like 70 | the scores from the model, as a list or numpy array 71 | method : string 72 | the method to use to calculate central tendency and spread 73 | 74 | Returns 75 | ------- 76 | Returns the central tendency, and spread 77 | by method. 78 | 79 | Methods: 80 | mean: 81 | * central tendency: mean 82 | * spread: standard deviation 83 | 84 | median: 85 | * central tendency: median 86 | * spread: interquartile range 87 | 88 | trimean: 89 | * central tendency: trimean 90 | * spread: trimean absolute deviation 91 | """ 92 | if method == "mean": 93 | return np.mean(scores), np.std(scores) 94 | elif method == "median": 95 | return np.median(scores), stats.iqr(scores) 96 | elif method == "trimean": 97 | return self.trimean(scores), self.trimean_absolute_deviation(scores) 98 | 99 | def mae_cv(self, cv): 100 | """ 101 | This method performs cross-validation over median absolute error. 102 | 103 | Parameters 104 | ---------- 105 | * cv : integer 106 | The number of cross validation folds to perform 107 | 108 | Returns 109 | ------- 110 | Returns a scores of the k-fold median absolute error. 111 | """ 112 | 113 | mae = metrics.make_scorer(metrics.median_absolute_error) 114 | result = cross_validate(self.reg, self.X, 115 | self.y, cv=cv, 116 | scoring=(mae)) 117 | return self.get_test_score(result) 118 | 119 | def mse_cv(self, cv): 120 | """ 121 | This method performs cross-validation over mean squared error. 122 | 123 | Parameters 124 | ---------- 125 | * cv : integer 126 | The number of cross validation folds to perform 127 | 128 | Returns 129 | ------- 130 | Returns a scores of the k-fold mean squared error. 131 | """ 132 | mse = metrics.make_scorer(metrics.mean_squared_error) 133 | result = cross_validate(self.reg, self.X, 134 | self.y, cv=cv, 135 | scoring=(mse)) 136 | return self.get_test_score(result) 137 | 138 | def trimean_squared_error(self, y_true, y_pred, 139 | sample_weight=None, 140 | multioutput='uniform_average'): 141 | output_errors = self.trimean((y_true - y_pred) ** 2) 142 | return self.trimean(output_errors) 143 | 144 | def trimean_absolute_error(self, y_true, y_pred, 145 | sample_weight=None, 146 | multioutput='uniform_average'): 147 | output_errors = self.trimean(abs(y_true - y_pred)) 148 | return self.trimean(output_errors) 149 | 150 | def tse_cv(self, cv): 151 | """ 152 | This method performs cross-validation over trimean squared error. 
153 | 154 | Parameters 155 | ---------- 156 | * cv : integer 157 | The number of cross validation folds to perform 158 | 159 | Returns 160 | ------- 161 | Returns a scores of the k-fold trimean squared error. 162 | """ 163 | tse = metrics.make_scorer(self.trimean_squared_error) 164 | result = cross_validate(self.reg, self.X, 165 | self.y, cv=cv, 166 | scoring=(tse)) 167 | return self.get_test_score(result) 168 | 169 | def tae_cv(self, cv): 170 | """ 171 | This method performs cross-validation over trimean absolute error. 172 | 173 | Parameters 174 | ---------- 175 | * cv : integer 176 | The number of cross validation folds to perform 177 | 178 | Returns 179 | ------- 180 | Returns a scores of the k-fold trimean absolute error. 181 | """ 182 | tse = metrics.make_scorer(self.trimean_absolute_error) 183 | result = cross_validate(self.reg, self.X, 184 | self.y, cv=cv, 185 | scoring=(tse)) 186 | return self.get_test_score(result) 187 | 188 | def _cross_val_anomaly_detection(self, scores, tolerance, method='mean'): 189 | avg, _ = self.describe_scores(scores, method) 190 | deviance_from_avg = [abs(score - avg) 191 | for score in scores] 192 | for deviance in deviance_from_avg: 193 | if deviance > tolerance: 194 | return False 195 | return True 196 | 197 | def _cross_val_avg(self, scores, maximum_center_tolerance, method='mean'): 198 | avg, _ = self.describe_scores(scores, method) 199 | if avg > maximum_center_tolerance: 200 | return False 201 | return True 202 | 203 | def _cross_val_upper_boundary(self, scores, upper_boundary): 204 | for score in scores: 205 | if score > upper_boundary: 206 | return False 207 | return True 208 | 209 | def cross_val_tse_anomaly_detection(self, tolerance, cv=3, method='mean'): 210 | scores = self.tse_cv(cv) 211 | return self._cross_val_anomaly_detection(scores, tolerance, method=method) 212 | 213 | def cross_val_tse_avg(self, minimum_center_tolerance, cv=3, method='mean'): 214 | scores = self.tse_cv(cv) 215 | return self._cross_val_avg(scores, minimum_center_tolerance) 216 | 217 | def cross_val_tse_upper_boundary(self, upper_boundary, cv=3): 218 | scores = self.tse_cv(cv) 219 | return self._cross_val_upper_boundary(scores, upper_boundary) 220 | 221 | def tse_upper_boundary(self, upper_boundary): 222 | y_pred = self.reg.predict(self.X) 223 | if self.trimean_squared_error(self.y, y_pred) > upper_boundary: 224 | return False 225 | return True 226 | 227 | def cross_val_tae_anomaly_detection(self, tolerance, cv=3, method='mean'): 228 | scores = self.tae_cv(cv) 229 | return self._cross_val_anomaly_detection(scores, tolerance, method=method) 230 | 231 | def cross_val_tae_avg(self, minimum_center_tolerance, cv=3, method='mean'): 232 | scores = self.tae_cv(cv) 233 | return self._cross_val_avg(scores, minimum_center_tolerance) 234 | 235 | def cross_val_tae_upper_boundary(self, upper_boundary, cv=3): 236 | scores = self.tae_cv(cv) 237 | return self._cross_val_upper_boundary(scores, upper_boundary) 238 | 239 | def tae_upper_boundary(self, upper_boundary): 240 | y_pred = self.reg.predict(self.X) 241 | if self.trimean_absolute_error(self.y, y_pred) > upper_boundary: 242 | return False 243 | return True 244 | 245 | def cross_val_mse_anomaly_detection(self, tolerance, cv=3, method='mean'): 246 | scores = self.mse_cv(cv) 247 | return self._cross_val_anomaly_detection(scores, tolerance, method=method) 248 | 249 | def cross_val_mse_avg(self, minimum_center_tolerance, cv=3, method='mean'): 250 | scores = self.mse_cv(cv) 251 | return self._cross_val_avg(scores, 
minimum_center_tolerance) 252 | 253 | def cross_val_mse_upper_boundary(self, upper_boundary, cv=3): 254 | scores = self.mse_cv(cv) 255 | return self._cross_val_upper_boundary(scores, upper_boundary) 256 | 257 | def mse_upper_boundary(self, upper_boundary): 258 | y_pred = self.reg.predict(self.X) 259 | if metrics.mean_squared_error(self.y, y_pred) > upper_boundary: 260 | return False 261 | return True 262 | 263 | def cross_val_mae_anomaly_detection(self, tolerance, cv=3, method='mean'): 264 | scores = self.mae_cv(cv) 265 | return self._cross_val_anomaly_detection(scores, tolerance, method=method) 266 | 267 | def cross_val_mae_avg(self, minimum_center_tolerance, cv=3, method='mean'): 268 | scores = self.mae_cv(cv) 269 | return self._cross_val_avg(scores, minimum_center_tolerance, method=method) 270 | 271 | def cross_val_mae_upper_boundary(self, upper_boundary, cv=3): 272 | scores = self.mae_cv(cv) 273 | return self._cross_val_upper_boundary(scores, upper_boundary) 274 | 275 | def mae_upper_boundary(self, upper_boundary): 276 | y_pred = self.reg.predict(self.X) 277 | if metrics.median_absolute_error(self.y, y_pred) > upper_boundary: 278 | return False 279 | return True 280 | 281 | def upper_bound_regression_testing(self, 282 | mse_upper_boundary, 283 | mae_upper_boundary, 284 | tse_upper_boundary, 285 | tae_upper_boundary): 286 | mse_test = self.mse_upper_boundary(mse_upper_boundary) 287 | mae_test = self.mae_upper_boundary(mae_upper_boundary) 288 | tse_test = self.tse_upper_boundary(tse_upper_boundary) 289 | tae_test = self.tae_upper_boundary(tae_upper_boundary) 290 | if mse_test and mae_test and tse_test and tae_test: 291 | return True 292 | else: 293 | return False 294 | 295 | def run_time_stress_test(self, sample_sizes, max_run_times): 296 | for index, sample_size in enumerate(sample_sizes): 297 | max_run_time = max_run_times[index] 298 | data = self.X.sample(sample_size, replace=True) 299 | start_time = time.time() 300 | self.reg.predict(data) 301 | model_run_time = time.time() - start_time 302 | if model_run_time > max_run_time: 303 | return False 304 | return True 305 | 306 | class RegressionComparison(): 307 | def __init__(self, 308 | reg_one, 309 | reg_two, 310 | test_data, 311 | target_name, 312 | column_names): 313 | self.reg_one = reg_one 314 | self.reg_two = reg_two 315 | self.column_names = column_names 316 | self.target_name = target_name 317 | self.test_data = test_data 318 | self.y = test_data[target_name] 319 | self.X = test_data[column_names] 320 | 321 | def two_model_prediction_run_time_stress_test(self, sample_sizes): 322 | for sample_size in sample_sizes: 323 | data = self.X.sample(sample_size, replace=True) 324 | start_time = time.time() 325 | self.reg_one.predict(data) 326 | model_one_run_time = time.time() - start_time 327 | start_time = time.time() 328 | self.reg_two.predict(data) 329 | model_two_run_time = time.time() - start_time 330 | # we assume model one should be faster than model two 331 | if model_one_run_time > model_two_run_time: 332 | return False 333 | return True 334 | 335 | def cross_val_mse_result(self, reg, cv=3): 336 | y_pred = cross_val_predict(reg, self.X, self.y) 337 | return metrics.mean_squared_error(self.y, y_pred) 338 | 339 | def cross_val_mae_result(self, reg, cv=3): 340 | y_pred = cross_val_predict(reg, self.X, self.y) 341 | return metrics.median_absolute_error(self.y, y_pred) 342 | 343 | def mse_result(self, reg): 344 | y_pred = reg.predict(self.X) 345 | return metrics.mean_squared_error(self.y, y_pred) 346 | 347 | def mae_result(self, 
reg): 348 | y_pred = reg.predict(self.X) 349 | return metrics.median_absolute_error(self.y, y_pred) 350 | 351 | def cv_two_model_regression_testing(self, cv=3): 352 | mse_one_test = self.cross_val_mse_result(self.reg_one, cv=cv) 353 | mae_one_test = self.cross_val_mae_result(self.reg_one, cv=cv) 354 | mse_two_test = self.cross_val_mse_result(self.reg_two, cv=cv) 355 | mae_two_test = self.cross_val_mae_result(self.reg_two, cv=cv) 356 | if mse_one_test < mse_two_test and mae_one_test < mae_two_test: 357 | return True 358 | else: 359 | return False 360 | 361 | def two_model_regression_testing(self): 362 | mse_one_test = self.mse_result(self.reg_one) 363 | mae_one_test = self.mae_result(self.reg_one) 364 | mse_two_test = self.mse_result(self.reg_two) 365 | mae_two_test = self.mae_result(self.reg_two) 366 | if mse_one_test < mse_two_test and mae_one_test < mae_two_test: 367 | return True 368 | else: 369 | return False 370 | -------------------------------------------------------------------------------- /drifter_ml/structural_tests/__init__.py: -------------------------------------------------------------------------------- 1 | from .structural_tests import StructuralData 2 | 3 | __all__ =["StructuralData"] 4 | -------------------------------------------------------------------------------- /drifter_ml/structural_tests/structural_tests.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | import time 3 | from sklearn import neighbors 4 | from scipy import stats 5 | from sklearn.model_selection import cross_val_score 6 | from sklearn import cluster 7 | 8 | class KmeansClustering(): 9 | def __init__(self, 10 | new_data, 11 | historical_data, 12 | column_names, 13 | target_name): 14 | self.column_names = column_names 15 | self.target_name = target_name 16 | self.new_data = new_data 17 | self.historical_data = historical_data 18 | 19 | def kmeans_clusters(self, n_clusters, data): 20 | k_means = cluster.KMeans(n_clusters=n_clusters) 21 | k_means.fit(data) 22 | return k_means.predict(data) 23 | 24 | def kmeans_scorer(self, metric, min_similarity): 25 | for k in range(2, 12): 26 | new_data = self.new_data[self.column_names] 27 | historical_data = self.historical_data[self.column_names] 28 | new_data_clusters = self.kmeans_clusters(k, new_data) 29 | historical_data_clusters = self.kmeans_clusters(k, historical_data) 30 | score = metric( 31 | new_data_clusters, historical_data_clusters) 32 | if score < min_similarity: 33 | return False 34 | return True 35 | 36 | def mutual_info_kmeans_scorer(self, min_similarity): 37 | return self.kmeans_scorer( 38 | metrics.adjusted_mutual_info_score, 39 | min_similarity 40 | ) 41 | 42 | def adjusted_rand_kmeans_scorer(self, min_similarity): 43 | return self.kmeans_scorer( 44 | metrics.adjusted_rand_score, 45 | min_similarity 46 | ) 47 | 48 | def completeness_kmeans_scorer(self, min_similarity): 49 | return self.kmeans_scorer( 50 | metrics.completeness_score, 51 | min_similarity 52 | ) 53 | 54 | def fowlkes_mallows_kmeans_scorer(self, min_similarity): 55 | return self.kmeans_scorer( 56 | metrics.fowlkes_mallows_score, 57 | min_similarity 58 | ) 59 | 60 | def homogeneity_kmeans_scorer(self, min_similarity): 61 | return self.kmeans_scorer( 62 | metrics.homogeneity_score, 63 | min_similarity 64 | ) 65 | 66 | def v_measure_kmeans_scorer(self, min_similarity): 67 | return self.kmeans_scorer( 68 | metrics.v_measure_score, 69 | min_similarity 70 | ) 71 | 72 | def unsupervised_kmeans_score_clustering(self, 
min_similarity): 73 | return all([ 74 | self.v_measure_kmeans_scorer(min_similarity), 75 | self.homogeneity_kmeans_scorer(min_similarity), 76 | self.fowlkes_mallows_kmeans_scorer(min_similarity), 77 | self.completeness_kmeans_scorer(min_similarity), 78 | self.adjusted_rand_kmeans_scorer(min_similarity), 79 | self.mutual_info_kmeans_scorer(min_similarity), 80 | ]) 81 | 82 | class DBscanClustering(): 83 | def __init__(self, 84 | new_data, 85 | historical_data, 86 | column_names, 87 | target_name): 88 | self.column_names = column_names 89 | self.target_name = target_name 90 | self.new_data = new_data 91 | self.historical_data = historical_data 92 | 93 | def dbscan_clusters(self, data): 94 | dbscan = cluster.DBSCAN() 95 | return dbscan.fit_predict(data) 96 | 97 | def dbscan_scorer(self, metric, min_similarity): 98 | for k in range(2, 12): 99 | new_data = self.new_data[self.column_names] 100 | historical_data = self.historical_data[self.column_names] 101 | new_data_clusters = self.dbscan_clusters(new_data) 102 | historical_data_clusters = self.dbscan_clusters(historical_data) 103 | score = metric( 104 | new_data_clusters, historical_data_clusters) 105 | if score < min_similarity: 106 | return False 107 | return True 108 | 109 | def mutual_info_dbscan_scorer(self, min_similarity): 110 | return self.dbscan_scorer( 111 | metrics.adjusted_mutual_info_score, 112 | min_similarity 113 | ) 114 | 115 | def adjusted_rand_dbscan_scorer(self, min_similarity): 116 | return self.dbscan_scorer( 117 | metrics.adjusted_rand_score, 118 | min_similarity 119 | ) 120 | 121 | def completeness_dbscan_scorer(self, min_similarity): 122 | return self.dbscan_scorer( 123 | metrics.completeness_score, 124 | min_similarity 125 | ) 126 | 127 | def fowlkes_mallows_dbscan_scorer(self, min_similarity): 128 | return self.dbscan_scorer( 129 | metrics.fowlkes_mallows_score, 130 | min_similarity 131 | ) 132 | 133 | def homogeneity_dbscan_scorer(self, min_similarity): 134 | return self.dbscan_scorer( 135 | metrics.homogeneity_score, 136 | min_similarity 137 | ) 138 | 139 | def v_measure_dbscan_scorer(self, min_similarity): 140 | return self.dbscan_scorer( 141 | metrics.v_measure_score, 142 | min_similarity 143 | ) 144 | 145 | def unsupervised_dbscan_score_clustering(self, min_similarity): 146 | return all([ 147 | self.v_measure_dbscan_scorer(min_similarity), 148 | self.homogeneity_dbscan_scorer(min_similarity), 149 | self.fowlkes_mallows_dbscan_scorer(min_similarity), 150 | self.completeness_dbscan_scorer(min_similarity), 151 | self.adjusted_rand_dbscan_scorer(min_similarity), 152 | self.mutual_info_dbscan_scorer(min_similarity), 153 | ]) 154 | 155 | class KnnClustering(): 156 | def __init__(self, 157 | new_data, 158 | historical_data, 159 | column_names, 160 | target_name): 161 | self.column_names = column_names 162 | self.target_name = target_name 163 | self.new_data = new_data 164 | self.historical_data = historical_data 165 | 166 | def reg_supervised_clustering(self, data): 167 | k_measures = [] 168 | X = data[self.column_names] 169 | y = data[self.target_name] 170 | for k in range(2, 12): 171 | knn = neighbors.KNeighborsRegressor(n_neighbors=k) 172 | knn.fit(X, y) 173 | y_pred = knn.predict(X) 174 | k_measures.append((k, metrics.mean_squared_error(y, y_pred))) 175 | sorted_k_measures = sorted(k_measures, key=lambda t:t[1]) 176 | lowest_mse = sorted_k_measures[0] 177 | best_k = lowest_mse[0] 178 | return best_k 179 | 180 | def reg_supervised_similar_clustering(self, absolute_distance): 181 | historical_k = 
self.reg_supervised_clustering(self.historical_data) 182 | new_k = self.reg_supervised_clustering(self.new_data) 183 | if abs(historical_k - new_k) > absolute_distance: 184 | return False 185 | else: 186 | return True 187 | 188 | def cls_supervised_clustering(self, data): 189 | k_measures = [] 190 | X = data[self.column_names] 191 | y = data[self.target_name] 192 | for k in range(2, 12): 193 | knn = neighbors.KNeighborsClassifier(n_neighbors=k) 194 | knn.fit(X, y) 195 | y_pred = knn.predict(X) 196 | k_measures.append((k, metrics.mean_squared_error(y, y_pred))) 197 | sorted_k_measures = sorted(k_measures, key=lambda t:t[1]) 198 | lowest_mse = sorted_k_measures[0] 199 | best_k = lowest_mse[0] 200 | return best_k 201 | 202 | def cls_supervised_similar_clustering(self, absolute_distance): 203 | historical_k = self.cls_supervised_clustering(self.historical_data) 204 | new_k = self.cls_supervised_clustering(self.new_data) 205 | if abs(historical_k - new_k) > absolute_distance: 206 | return False 207 | else: 208 | return True 209 | 210 | class StructuralData(KnnClustering, 211 | DBscanClustering, 212 | KmeansClustering): 213 | def __init__(self, 214 | new_data, 215 | historical_data, 216 | column_names, 217 | target_name): 218 | self.column_names = column_names 219 | self.target_name = target_name 220 | self.new_data = new_data 221 | self.historical_data = historical_data 222 | 223 | -------------------------------------------------------------------------------- /drifter_ml/timeseries_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EricSchles/drifter_ml/198a2e4a0b6310765e551f3122ff0ea8b04ed900/drifter_ml/timeseries_tests/__init__.py -------------------------------------------------------------------------------- /drifter_ml/timeseries_tests/timeseries_tests.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The goal of this model is to test for model drift: Does the model behave the same 3 | way over time? 4 | 5 | Is the model and data consistent over time? 6 | 7 | We can think of this through the following questions, 8 | 9 | * do the same inputs produce the same outputs, over time? 10 | * how sensitive is the model to input data? 11 | * what is the distribution of predictions over time? 12 | * what are the marginal distributions of the data over time? 13 | * As the marginal distributions change, how much does the distribution of predictions change, over time? 14 | * how stable is the distribution of predictions over time? (for regression) 15 | * how stable are the percentages per class of the predictions over time? (for classification) 16 | * how likely are certain predictions over time? (for classification) 17 | * how likely are certain ranges of predictions over time? (for regression) 18 | 19 | * how much data do we expect to be misclassified over time? (for classification) 20 | * precision 21 | * recall 22 | * f1 score 23 | * how much error do we expect over time? (for regression) 24 | * mean squared error 25 | * median absolute error 26 | * trimean absolute error 27 | * how many outliers do we expect in the data over time? (using various techniques) 28 | * how likely is it the data is drawn from the same distribution over a given time frame? (using distribution similarity tests) 29 | * how sensitive is the model to changes in each marginal variable over time? 
(regression and classification) IE, if we change each variable while holding all others constant, how many values do we need to change to produce a significant change in the prediction (significant increase in the output for regression) or change of class for classification? 30 | * how sensitive is the model to the marginal effects of n variables? (with the above set up) where n varies from 1 to the total number of variables in the data 31 | * how do various feature selection algorithms change on the data over time? aka which features are statistically significant over time? 32 | * how much of the data is missing over time? 33 | ''' 34 | from backtester import metrics as bt_metrics 35 | import pandas as pd 36 | import datetime 37 | 38 | class TimeSeriesClassificationTests: 39 | """ 40 | The general goal of this class is to test 41 | classification algorithms over time. 42 | The class expects the following parameters: 43 | 44 | * descriptors : arraylike 45 | A set of descriptions of a model. This ought to 46 | be a classification metric like precision, recall, or 47 | f1-score or a loss like log loss. 48 | 49 | * timestamps : arraylike 50 | A set of timestamps associated with the descriptors. 51 | this will be important for some of the metrics used. 52 | Each element should be of time datetime.datetime. 53 | 54 | The way in which classification algorithms is assessed via 55 | hypothesis tests and time series metrics. The time series 56 | metrics come to us from backtester, another framework I developed. 57 | Each timeseries metric is standard where the expectation is 58 | that data is compared against a forecast. 59 | A simple moving average is used for the forecast model to make 60 | sure the only thing we are trying to capture is how much the model 61 | has changed recently. 62 | 63 | For this reason, the number of lag periods is very important. If you 64 | set this number too low, you may think everything is fine, when in fact 65 | things are actually changing quiet rapidly. If you set the number of lags 66 | too long, then you may capture bugs from the last anomaly, and thus won't 67 | capture the next. 68 | 69 | A good rule of thumb is to set the number of lags for a week, assuming everything 70 | has been fine. And set it for 5 periods after the last bug, to assess normality. 71 | 72 | It may make sense to initialize multiple instances of the class, if 73 | you want to capture things at different levels of granularity. 74 | """ 75 | def __init__(self, descriptors, timestamps, lags=10): 76 | self.descriptors = discriptors 77 | self.timestamps = timestamps 78 | self.lags = lags 79 | self.series = self._generate_series() 80 | 81 | def _generate_series(self): 82 | return pd.Series( 83 | data = self.descriptors, 84 | index = self.timestamps 85 | ) 86 | 87 | def _apply_metric(self, metric, forecast_start, max_error): 88 | y_true = series[forecast_start:] 89 | y_pred = self.series.rolling(window=self.lags).mean() 90 | y_pred = y_pred[forecast_start:] 91 | error = metric( 92 | y_true, y_pred 93 | ) 94 | return error < max_error 95 | 96 | def root_mean_squared_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 97 | """ 98 | The root mean squared error is a standard metric for 99 | assessing error in a regression problem. It lends itself 100 | naturally to the forecast context because of its application 101 | of a euclidean metric as well as taking of the average. 102 | 103 | An average is especially advantegous due to its sensitivity 104 | to outliers. 
105 | 106 | Parameters 107 | ---------- 108 | * forecast_start : datetime.datetime 109 | The starting timestamp to begin the forecast. 110 | Observations of the descriptor after the start time will be checked. 111 | Special care should be given when choosing the start forecast. 112 | 113 | * max_error: float 114 | The maximum allowed error or tolerance of the forecast. 115 | If we are dealing with a score function like f1-score 116 | it is imperative that we set max_error below 1.0. 117 | 118 | Return 119 | ------ 120 | True if the root mean squared error of 121 | the forecast and actual error is below the max_error. 122 | False otherwise 123 | """ 124 | return self._apply_metric( 125 | bt_metric.root_mean_squared_error, 126 | forecast_start, max_error 127 | ) 128 | 129 | def normalized_root_mean_squared_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 130 | """ 131 | The normalized root mean squared error takes into account scale. 132 | It is not recommended that the normalized root mean squared error 133 | be used if your descriptor is a score, since those are already bounded 134 | between (0.0, 1.0). If you are dealing with a loss function, then 135 | the normalized root mean squared error may be advantegous as sense of 136 | scale is removed. 137 | 138 | Since there is no standard convention for how to normalize the choice 139 | of max - min of the observations is used as a choice for normalization. 140 | 141 | Parameters 142 | ---------- 143 | * forecast_start : datetime.datetime 144 | The starting timestamp to begin the forecast. 145 | Observations of the descriptor after the start time will be checked. 146 | Special care should be given when choosing the start forecast. 147 | 148 | * max_error: float 149 | The maximum allowed error or tolerance of the forecast. 150 | If we are dealing with a score function like f1-score 151 | it is imperative that we set max_error below 1.0. 152 | 153 | Return 154 | ------ 155 | True if the normalized root mean squared error of 156 | the forecast and actual error is below the max_error. 157 | False otherwise 158 | """ 159 | return self._apply_metric( 160 | bt_metric.normalized_root_mean_squared_error, 161 | forecast_start, max_error 162 | ) 163 | 164 | def mean_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 165 | """ 166 | Perhaps the most naive metric I could think of, mean error 167 | is simply the average error of the forecast against the 168 | observations. 169 | 170 | As a result, this measure will be sensitive to outliers, which may 171 | be advantegous for assessing deviance quickly and obviously. 172 | 173 | Parameters 174 | ---------- 175 | * forecast_start : datetime.datetime 176 | The starting timestamp to begin the forecast. 177 | Observations of the descriptor after the start time will be checked. 178 | Special care should be given when choosing the start forecast. 179 | 180 | * max_error: float 181 | The maximum allowed error or tolerance of the forecast. 182 | If we are dealing with a score function like f1-score 183 | it is imperative that we set max_error below 1.0. 184 | 185 | Return 186 | ------ 187 | True if the mean error of the forecast 188 | and actual error is below the max_error. 
189 | False otherwise 190 | """ 191 | return self._apply_metric( 192 | bt_metric.mean_error, 193 | forecast_start, max_error 194 | ) 195 | 196 | def mean_absolute_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 197 | """ 198 | Perhaps one of the most naive metrics out there, mean absolute error 199 | is simply the average of the absolute value of the error of the forecast against the 200 | observations. 201 | 202 | It ought to be the same as mean error, because score functions are bounded to the 203 | range (0.0, 1.0) and loss functions should never be negative. That said 204 | it is always possible something went wrong. It therefore might be useful 205 | to run mean absolute error and mean error with the same parameters. If 206 | one passes and the other fails, this will be a good signal that something is 207 | wrong with your set up. 208 | 209 | Parameters 210 | ---------- 211 | * forecast_start : datetime.datetime 212 | The starting timestamp to begin the forecast. 213 | Observations of the descriptor after the start time will be checked. 214 | Special care should be given when choosing the start forecast. 215 | 216 | * max_error: float 217 | The maximum allowed error or tolerance of the forecast. 218 | If we are dealing with a score function like f1-score 219 | it is imperative that we set max_error below 1.0. 220 | 221 | Return 222 | ------ 223 | True if the mean absolute error of the forecast 224 | and actual error is below the max_error. 225 | False otherwise 226 | """ 227 | return self._apply_metric( 228 | bt_metric.mean_absolute_error, 229 | forecast_start, max_error 230 | ) 231 | 232 | def median_absolute_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 233 | """ 234 | The median absolute error is an interesting metric to look at. It ignores outliers, 235 | so it may be used as an expectation of normalcy without the outliers. Comparing 236 | median absolute error and mean absolute error might give a sense of how much outliers 237 | are effecting centrality. 238 | 239 | Parameters 240 | ---------- 241 | * forecast_start : datetime.datetime 242 | The starting timestamp to begin the forecast. 243 | Observations of the descriptor after the start time will be checked. 244 | Special care should be given when choosing the start forecast. 245 | 246 | * max_error: float 247 | The maximum allowed error or tolerance of the forecast. 248 | If we are dealing with a score function like f1-score 249 | it is imperative that we set max_error below 1.0. 250 | 251 | Return 252 | ------ 253 | True if the median absolute error of the forecast 254 | and actual error is below the max_error. 255 | False otherwise 256 | """ 257 | return self._apply_metric( 258 | bt_metric.median_absolute_error, 259 | forecast_start, max_error 260 | ) 261 | 262 | def variance_absolute_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 263 | """ 264 | The variance absolute error gives us a sense of the variance in our error. This way 265 | we can directly interrogate variability in our absolute error. And we can set boundaries 266 | for the maximum boundary on deviances from our forecast. 267 | 268 | Parameters 269 | ---------- 270 | * forecast_start : datetime.datetime 271 | The starting timestamp to begin the forecast. 272 | Observations of the descriptor after the start time will be checked. 273 | Special care should be given when choosing the start forecast. 
274 | 275 | * max_error: float 276 | The maximum allowed error or tolerance of the forecast. 277 | If we are dealing with a score function like f1-score 278 | it is imperative that we set max_error below 1.0. 279 | 280 | Return 281 | ------ 282 | True if the variance absolute error of the forecast 283 | and actual error is below the max_error. 284 | False otherwise 285 | """ 286 | return self._apply_metric( 287 | bt_metric.median_absolute_error, 288 | forecast_start, max_error 289 | ) 290 | 291 | def mean_squared_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 292 | """ 293 | The mean squared error is a canonical measure of error. It overstates large deviations 294 | of individual examples while marginalizing the effect size of any deviances of deviations 295 | smaller than one. Because the mean is used, large values are overstated, thus individual 296 | large deviations will tend to become apparent. For the mean squared error to be small, 297 | therefore no extreme deviances must exist. However relatively small deviances across 298 | many or even all samples will be understated. 299 | 300 | Parameters 301 | ---------- 302 | * forecast_start : datetime.datetime 303 | The starting timestamp to begin the forecast. 304 | Observations of the descriptor after the start time will be checked. 305 | Special care should be given when choosing the start forecast. 306 | 307 | * max_error: float 308 | The maximum allowed error or tolerance of the forecast. 309 | If we are dealing with a score function like f1-score 310 | it is imperative that we set max_error below 1.0. 311 | 312 | Return 313 | ------ 314 | True if the mean squared error of the forecast 315 | and actual error is below the max_error. 316 | False otherwise 317 | """ 318 | return self._apply_metric( 319 | bt_metric.mean_squared_error, 320 | forecast_start, max_error 321 | ) 322 | 323 | def mean_squared_log_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 324 | """ 325 | The mean squared log error is a variant on mean squared error. Mean squared log error 326 | measures the relative difference between the true and predicted values. 327 | It over penalizes underestimates, cases where the predicted value is less than 328 | the true value, more than it penalizes overestimates, cases where the predicted 329 | value is more than the true value. This is because it's a MSLE is a ratio of the two. 330 | 331 | This measure is especially useful if you want to check if your prediction is smaller 332 | than your actual timeseries. Therefore it is very useful for accuracy and less 333 | useful for error metrics. 334 | 335 | Parameters 336 | ---------- 337 | * forecast_start : datetime.datetime 338 | The starting timestamp to begin the forecast. 339 | Observations of the descriptor after the start time will be checked. 340 | Special care should be given when choosing the start forecast. 341 | 342 | * max_error: float 343 | The maximum allowed error or tolerance of the forecast. 344 | If we are dealing with a score function like f1-score 345 | it is imperative that we set max_error below 1.0. 346 | 347 | Return 348 | ------ 349 | True if the mean squared error of the forecast 350 | and actual error is below the max_error. 
351 | False otherwise 352 | """ 353 | return self._apply_metric( 354 | bt_metric.mean_squared_log_error, 355 | forecast_start, max_error 356 | ) 357 | 358 | def root_mean_squared_log_error(self, forecast_start: datetime.datetime, max_error: float) -> bool: 359 | """ 360 | The root mean squared log error is a variant on mean squared error. 361 | Root mean squared log error measures the relative difference between 362 | the true and predicted values. It over penalizes underestimates, cases 363 | where the predicted value is less than the true value, more than it 364 | penalizes overestimates, cases where the predicted value is more than the true value. 365 | This is because it's a RMSLE is a ratio of the two. However unlike the MSLE 366 | by taking the root the penalization is diminished making this closer in measure 367 | to something like the mean squared error in terms of direction. 368 | 369 | Parameters 370 | ---------- 371 | * forecast_start : datetime.datetime 372 | The starting timestamp to begin the forecast. 373 | Observations of the descriptor after the start time will be checked. 374 | Special care should be given when choosing the start forecast. 375 | 376 | * max_error: float 377 | The maximum allowed error or tolerance of the forecast. 378 | If we are dealing with a score function like f1-score 379 | it is imperative that we set max_error below 1.0. 380 | 381 | Return 382 | ------ 383 | True if the mean squared error of the forecast 384 | and actual error is below the max_error. 385 | False otherwise 386 | """ 387 | return self._apply_metric( 388 | bt_metric.root_mean_squared_log_error, 389 | forecast_start, max_error 390 | ) 391 | 392 | 393 | # iqr_absolute_error 394 | # geometric_mean_absolute_error 395 | # mean_percentage_error 396 | # mean_absolute_percentage_error 397 | # median_absolute_percentage_error 398 | # symmetric_mean_absolute_percentage_error 399 | # symmetric_median_absolute_percentage_error 400 | # mean_arctangent_absolute_percentage_error 401 | # mean_absolute_scaled_error 402 | # normalized_absolute_error 403 | # normalized_absolute_percentage_error 404 | # root_mean_squared_percentage_error 405 | # root_median_squared_percentage_error 406 | # root_mean_squared_scaled_error 407 | # integral_normalized_root_squared_error 408 | # root_relative_squared_error 409 | # mean_relative_error 410 | # relative_absolute_error 411 | # mean_relative_absolute_error 412 | # median_relative_absolute_error 413 | # geometric_mean_relative_absolute_error 414 | # mean_bounded_relative_absolute_error 415 | # unscaled_mean_bounded_relative_absolute_error 416 | # mean_directional_accuracy 417 | 418 | -------------------------------------------------------------------------------- /example_models/static_examples/example_model.py: -------------------------------------------------------------------------------- 1 | from sklearn import tree 2 | from sklearn import ensemble 3 | from sklearn import model_selection 4 | import pandas as pd 5 | import numpy as np 6 | import joblib 7 | import code 8 | import json 9 | 10 | df = pd.DataFrame() 11 | for _ in range(1000): 12 | a = np.random.normal(0, 1) 13 | b = np.random.normal(0, 3) 14 | c = np.random.normal(12, 4) 15 | if a + b + c > 11: 16 | target = 1 17 | else: 18 | target = 0 19 | df = df.append({ 20 | "A": a, 21 | "B": b, 22 | "C": c, 23 | "target": target 24 | }, ignore_index=True) 25 | 26 | clf1 = tree.DecisionTreeClassifier() 27 | clf2 = ensemble.RandomForestClassifier() 28 | X = df[["A", "B", "C"]] 29 | clf1.fit(X, df["target"]) 30 | 
clf2.fit(X, df["target"]) 31 | #code.interact(local=locals()) 32 | joblib.dump(clf1, "model1.joblib") 33 | joblib.dump(clf2, "model1.joblib") 34 | json.dump({ 35 | "column_names": ["A", "B", "C"], 36 | "target_name": "target" 37 | }, open("model_metadata.json", "w")) 38 | df.to_csv("data.csv") 39 | -------------------------------------------------------------------------------- /example_models/static_examples/example_tests.py: -------------------------------------------------------------------------------- 1 | from drifter_ml import classification_tests 2 | import joblib 3 | import pandas as pd 4 | import code 5 | 6 | def test(): 7 | df = pd.read_csv("data.csv") 8 | column_names = ["A", "B", "C"] 9 | target_name = "target" 10 | clf = joblib.load("model1.joblib") 11 | 12 | test_suite = classification_tests.ClassificationTests(clf, 13 | df, 14 | target_name, 15 | column_names) 16 | classes = list(df.target.unique()) 17 | assert test_suite.classifier_testing( 18 | {klass: 0.9 for klass in classes}, 19 | {klass: 0.9 for klass in classes}, 20 | {klass: 0.9 for klass in classes} 21 | ) 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /example_models/static_examples/keras_example.py: -------------------------------------------------------------------------------- 1 | from keras.models import Sequential 2 | from keras.layers import Dense 3 | from keras.wrappers.scikit_learn import KerasClassifier 4 | import pandas as pd 5 | import numpy as np 6 | import joblib 7 | 8 | # Function to create model, required for KerasClassifier 9 | def create_model(): 10 | # create model 11 | model = Sequential() 12 | model.add(Dense(12, input_dim=3, activation='relu')) 13 | model.add(Dense(8, activation='relu')) 14 | model.add(Dense(1, activation='sigmoid')) 15 | # Compile model 16 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 17 | return model 18 | 19 | # fix random seed for reproducibility 20 | df = pd.DataFrame() 21 | for _ in range(1000): 22 | a = np.random.normal(0, 1) 23 | b = np.random.normal(0, 3) 24 | c = np.random.normal(12, 4) 25 | if a + b + c > 11: 26 | target = 1 27 | else: 28 | target = 0 29 | df = df.append({ 30 | "A": a, 31 | "B": b, 32 | "C": c, 33 | "target": target 34 | }, ignore_index=True) 35 | 36 | # split into input (X) and output (Y) variables 37 | # create model 38 | clf = KerasClassifier(build_fn=create_model, epochs=150, batch_size=10, verbose=0) 39 | X = df[["A", "B", "C"]] 40 | clf.fit(X, df["target"]) 41 | joblib.dump(clf, "model.joblib") 42 | df.to_csv("data.csv") 43 | -------------------------------------------------------------------------------- /example_models/static_examples/model.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EricSchles/drifter_ml/198a2e4a0b6310765e551f3122ff0ea8b04ed900/example_models/static_examples/model.joblib -------------------------------------------------------------------------------- /example_models/static_examples/model1.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EricSchles/drifter_ml/198a2e4a0b6310765e551f3122ff0ea8b04ed900/example_models/static_examples/model1.joblib -------------------------------------------------------------------------------- /example_models/static_examples/model_metadata.json: -------------------------------------------------------------------------------- 1 | {"column_names": ["A", "B", 
"C"], "target_name": "target"} -------------------------------------------------------------------------------- /example_models/static_examples/prototype_test_framework.py: -------------------------------------------------------------------------------- 1 | import joblib 2 | import json 3 | from sklearn import metrics 4 | import numpy as np 5 | import time 6 | from sklearn import neighbors 7 | from scipy import stats 8 | from sklearn.model_selection import cross_val_score 9 | 10 | # classification tests 11 | class ModelClassificationTestSuite(): 12 | def __init__(self, clf_name, clf_metadata, data_filename): 13 | clf, metadata, colum_names, target_name, test_data = self.get_parameters( 14 | clf_name, clf_metadata, data_filename) 15 | self.clf = clf 16 | self.data_filename 17 | self.metadata = metadata 18 | self.column_names = column_names 19 | self.target_name = target_name 20 | self.test_data = test_data 21 | self.y = test_data[target_name] 22 | self.X = test_data[column_names] 23 | self.classes = set(self.y) 24 | 25 | # potentially include hyper parameters from the model 26 | # algorithm could be stored in metadata 27 | def get_parameters(self, clf_name, clf_metadata, data_filename): 28 | clf = joblib.load(clf_name) 29 | metadata = json.load(open(clf_metadata, "r")) 30 | column_names = metadata["column_names"] 31 | target_name = metadata["target_name"] 32 | test_data = pd.read_csv(data_name) 33 | return clf, metadata, column_names, target_name, test_data 34 | 35 | def precision_lower_boundary_per_class(self, lower_boundary): 36 | y_pred = self.clf.predict(self.X) 37 | for class_info in lower_boundary["per_class"]: 38 | klass = class_info["class"] 39 | y_pred_class = np.take(y_pred, self.y[self.y == klass].index, axis=0) 40 | y_class = self.y[self.y == klass] 41 | if metrics.precision_score(y_class, y_pred_class) < class_info["precision_score"]: 42 | return False 43 | return True 44 | 45 | def recall_lower_boundary_per_class(self, lower_boundary): 46 | y_pred = self.clf.predict(self.X) 47 | for class_info in lower_boundary["per_class"]: 48 | klass = class_info["class"] 49 | y_pred_class = np.take(y_pred, self.y[self.y == klass].index, axis=0) 50 | y_class = self.y[self.y == klass] 51 | if metrics.recall_score(y_class, y_pred_class) < class_info["recall_score"]: 52 | return False 53 | return True 54 | 55 | def f1_lower_boundary_per_class(self, clf, test_data, target_name, column_names, lower_boundary): 56 | y_pred = self.clf.predict(self.X) 57 | for class_info in lower_boundary["per_class"]: 58 | klass = class_info["class"] 59 | y_pred_class = np.take(y_pred, self.y[self.y == klass].index, axis=0) 60 | y_class = self.y[self.y == klass] 61 | if metrics.f1_score(y_class, y_pred_class) < class_info["f1_score"]: 62 | return False 63 | return True 64 | 65 | def classifier_testing(self, precision_lower_boundary, recall_lower_boundary, f1_lower_boundary): 66 | precision_test = self.precision_lower_boundary_per_class(precision_lower_boundary) 67 | recall_test = self.recall_lower_boundary_per_class(recall_lower_boundary) 68 | f1_test = self.f1_lower_boundary_per_class(f1_lower_boundary) 69 | if precision_test and recall_test and f1_test: 70 | return True 71 | else: 72 | return False 73 | 74 | def run_time_stress_test(self, performance_boundary): 75 | for performance_info in performance_boundary: 76 | n = int(performance_info["sample_size"]) 77 | max_run_time = float(performance_info["max_run_time"]) 78 | data = self.X.sample(n, replace=True) 79 | start_time = time.time() 80 | self.clf.predict(data) 
81 | model_run_time = time.time() - start_time 82 | if model_run_time > run_time: 83 | return False 84 | return True 85 | 86 | # post training - 87 | # todo: add model metric outside of some standard deviation 88 | # for many models 89 | # is the model non-empty 90 | # is the model deserializable 91 | 92 | # test against training and scoring 93 | 94 | class ModelRegressionTestSuite(): 95 | def __init__(self, reg_name, reg_metadata, data_filename): 96 | reg, reg_metadata, colum_names, target_name, test_data = self.get_parameters( 97 | reg_name, reg_metadata, data_filename) 98 | self.reg = reg 99 | self.data_filename 100 | self.metadata = metadata 101 | self.column_names = column_names 102 | self.target_name = target_name 103 | self.test_data = test_data 104 | self.y = test_data[target_name] 105 | self.X = test_data[column_names] 106 | 107 | def get_parameters(self, reg_name, reg_metadata, data_filename): 108 | reg = joblib.load(reg_name) 109 | metadata = json.load(open(reg_metadata, "r")) 110 | column_names = metadata["column_names"] 111 | target_name = metadata["target_name"] 112 | test_data = pd.read_csv(data_name) 113 | return reg, metadata, column_names, target_name, test_data 114 | 115 | def mse_upper_boundary(upper_boundary): 116 | y_pred = self.reg.predict(self.X) 117 | if metrics.mean_squared_error(self.y, y_pred) > upper_boundary: 118 | return False 119 | return True 120 | 121 | def mae_upper_boundary(upper_boundary): 122 | y_pred = self.reg.predict(self.X) 123 | if metrics.median_absolute_error(self.y, y_pred) > upper_boundary: 124 | return False 125 | return True 126 | 127 | def regression_testing(mse_upper_boundary, mae_upper_boundary): 128 | mse_test = self.mse_upper_boundary(mse_upper_boundary) 129 | mae_test = self.mae_upper_boundary(mae_upper_boundary) 130 | if mse_test and mae_test: 131 | return True 132 | else: 133 | return False 134 | 135 | def run_time_stress_test(self, performance_boundary): 136 | for performance_info in performance_boundary: 137 | n = int(performance_info["sample_size"]) 138 | max_run_time = float(performance_info["max_run_time"]) 139 | data = self.X.sample(n, replace=True) 140 | start_time = time.time() 141 | self.reg.predict(data) 142 | model_run_time = time.time() - start_time 143 | if model_run_time > run_time: 144 | return False 145 | return True 146 | 147 | class ClassifierComparison(): 148 | def __init__(self, clf_one_name, clf_one_metadata, clf_two_name, clf_two_metadata, data_filename): 149 | clf_one, metadata_one, colum_names, target_name, test_data = self.get_parameters( 150 | clf_one_name, clf_one_metadata, data_filename) 151 | clf_two, metadata_two, colum_names, target_name, test_data = self.get_parameters( 152 | clf_two_name, clf_two_metadata, data_filename) 153 | self.clf_one = clf_one 154 | self.clf_two = clf_two 155 | self.data_filename 156 | self.metadata_one = metadata_one 157 | self.metadata_two = metadata_two 158 | self.column_names = column_names 159 | self.target_name = target_name 160 | self.test_data = test_data 161 | self.y = test_data[target_name] 162 | self.X = test_data[column_names] 163 | self.classes = set(self.y) 164 | 165 | def two_model_prediction_run_time_stress_test(self, performance_boundary): 166 | for performance_info in performance_boundary: 167 | n = int(performance_info["sample_size"]) 168 | data = self.X.sample(n, replace=True) 169 | start_time = time.time() 170 | self.clf_one.predict(data) 171 | model_one_run_time = time.time() - start_time 172 | start_time = time.time() 173 | self.clf_two.predict(data) 174 | 
model_two_run_time = time.time() - start_time 175 | # we assume model one should be faster than model two 176 | if model_one_run_time > model_two_run_time: 177 | return False 178 | return True 179 | 180 | def precision_per_class(self, clf, test_data, target_name, column_names): 181 | y = test_data[target_name] 182 | classes = set(y) 183 | y_pred = clf.predict(test_data[column_names]) 184 | precision = {} 185 | for klass in classes: 186 | y_pred_class = np.take(y_pred, y[y == klass].index, axis=0) 187 | y_class = y[y == klass] 188 | precision[klass] = metrics.precision_score(y_class, y_pred_class) 189 | return precision 190 | 191 | def recall_per_class(self, clf, test_data, target_name, column_names): 192 | y = test_data[target_name] 193 | classes = set(y) 194 | y_pred = clf.predict(test_data[column_names]) 195 | recall = {} 196 | for klass in classes: 197 | y_pred_class = np.take(y_pred, y[y == klass].index, axis=0) 198 | y_class = y[y == klass] 199 | recall[klass] = metrics.recall_score(y_class, y_pred_class) 200 | return recall 201 | 202 | def f1_per_class(self, clf, test_data, target_name, column_names): 203 | y = test_data[target_name] 204 | classes = set(y) 205 | y_pred = clf.predict(test_data[column_names]) 206 | f1 = {} 207 | for klass in classes: 208 | y_pred_class = np.take(y_pred, y[y == klass].index, axis=0) 209 | y_class = y[y == klass] 210 | f1[klass] = metrics.f1_score(y_class, y_pred_class) 211 | return f1 212 | 213 | def two_model_classifier_testing(self): 214 | precision_one_test = self.precision_per_class(self.clf_one) 215 | recall_one_test = self.recall_per_class(self.clf_one) 216 | f1_one_test = self.f1_per_class(self.clf_one) 217 | precision_two_test = precision_per_class(self.clf_two) 218 | recall_two_test = recall_per_class(self.clf_two) 219 | f1_two_test = f1_per_class(self.clf_two) 220 | 221 | precision_result = precision_one_test > precision_two_test 222 | recall_result = recall_one_test > recall_two_test 223 | f1_result = f1_one_test > f1_two_test 224 | if precision_result and recall_result and f1_result: 225 | return True 226 | else: 227 | return False 228 | 229 | class RegressionComparison(): 230 | def __init__(self, reg_one_name, reg_one_metadata, reg_two_name, reg_two_metadata, data_filename): 231 | reg_one, metadata_one, colum_names, target_name, test_data = self.get_parameters( 232 | reg_one_name, reg_one_metadata, data_filename) 233 | reg_two, metadata_two, colum_names, target_name, test_data = self.get_parameters( 234 | reg_two_name, reg_two_metadata, data_filename) 235 | self.reg_one = reg_one 236 | self.reg_two = reg_two 237 | self.data_filename 238 | self.metadata_one = metadata_one 239 | self.metadata_two = metadata_two 240 | self.column_names = column_names 241 | self.target_name = target_name 242 | self.test_data = test_data 243 | self.y = test_data[target_name] 244 | self.X = test_data[column_names] 245 | 246 | def two_model_prediction_run_time_stress_test(self, performance_boundary): 247 | for performance_info in performance_boundary: 248 | n = int(performance_info["sample_size"]) 249 | data = self.X.sample(n, replace=True) 250 | start_time = time.time() 251 | self.reg_one.predict(data) 252 | model_one_run_time = time.time() - start_time 253 | start_time = time.time() 254 | self.reg_two.predict(data) 255 | model_two_run_time = time.time() - start_time 256 | # we assume model one should be faster than model two 257 | if model_one_run_time > model_two_run_time: 258 | return False 259 | return True 260 | 261 | def mse_result(self, reg): 262 | y_pred 
= reg.predict(self.X) 263 | return metrics.mean_squared_error(self.y, y_pred) 264 | 265 | def mae_result(self, reg): 266 | y_pred = reg.predict(self.X) 267 | return metrics.median_absolute_error(self.y, y_pred) 268 | 269 | def two_model_regression_testing(self): 270 | mse_one_test = self.mse_result(self.reg_one) 271 | mae_one_test = self.mae_result(self.reg_one) 272 | mse_two_test = self.mse_result(self.reg_two) 273 | mae_two_test = self.mae_result(self.reg_two) 274 | if mse_one_test < mse_two_test and mae_one_test < mae_two_test: 275 | return True 276 | else: 277 | return False 278 | 279 | # data tests 280 | class DataSanitization(): 281 | def __init__(self, data_filename): 282 | self.data_filename 283 | self.data = pd.read_csv(data_filename) 284 | 285 | def is_complete(self, column): 286 | return self.data[column].isnull().sum() == 0 287 | 288 | def has_completeness(self, column, threshold): 289 | return self.data[column].isnull().sum()/len(self.data) > threshold 290 | 291 | def is_unique(self, column): 292 | return len(self.data[column].unique())/len(self.data) == 1 293 | 294 | def has_uniqueness(column, threshold): 295 | return len(self.data[column].unique())/len(self.data) > threshold 296 | 297 | def is_in_range(column, lower_bound, upper_bound, threshold): 298 | return self.data[(self.data[column] <= upper_bound) & (self.data[column] >= lower_bound)]/len(self.data) > threshold 299 | 300 | def is_non_negative(column): 301 | return self.data[self.data[column] > 0] 302 | 303 | def is_less_than(column_one, column_two): 304 | return self.data[self.data[column_one] < self.data[column_two]].all() 305 | 306 | # memoryful tests 307 | class StructuralData(): 308 | def __init__(self, metadata, data_filename): 309 | metadata, column_names, target_name, test_data = self.get_parameters( 310 | metadata, data_filename) 311 | self.data_filename 312 | self.metadata = metadata 313 | self.column_names = column_names 314 | self.target_name = target_name 315 | self.test_data = test_data 316 | self.y = test_data[target_name] 317 | self.X = test_data[column_names] 318 | 319 | def get_parameters(self, metadata, data_filename): 320 | metadata = json.load(open(clf_metadata, "r")) 321 | column_names = metadata["column_names"] 322 | target_name = metadata["target_name"] 323 | test_data = pd.read_csv(data_name) 324 | return metadata, column_names, target_name, test_data 325 | 326 | def reg_clustering(self, data, columns, target): 327 | k_measures = [] 328 | for k in range(2, 12): 329 | knn = neighbors.KNeighborsRegressor(n_neighbors=k) 330 | knn.fit(self.X, self.y) 331 | y_pred = knn.predict(self.X) 332 | k_measures.append((k, metrics.mean_squared_error(self.y, y_pred))) 333 | sorted_k_measures = sorted(k_measures, key=lambda t:t[1]) 334 | lowest_mse = sorted_k_measures[0] 335 | best_k = lowest_mse[0] 336 | return best_k 337 | 338 | def reg_similar_clustering(self, absolute_distance, new_data, historical_data, column_names, target_name): 339 | historical_k = reg_clustering(historical_data, column_names, target_name) 340 | new_k = reg_clustering(new_data, column_names, target_name) 341 | if abs(historical_k - new_k) > absolute_distance: 342 | return False 343 | else: 344 | return True 345 | 346 | # this was never updated 347 | def cls_clustering(self): 348 | k_measures = [] 349 | for k in range(2, 12): 350 | knn = neighbors.KNeighborsRegressor(n_neighbors=k) 351 | knn.fit(self.X, self.y) 352 | y_pred = knn.predict(self.X) 353 | k_measures.append((k, metrics.mean_squared_error(self.y, y_pred))) 354 | 
sorted_k_measures = sorted(k_measures, key=lambda t:t[1]) 355 | lowest_mse = sorted_k_measures[0] 356 | best_k = lowest_mse[0] 357 | return best_k 358 | 359 | def cls_similiar_clustering(absolute_distance, new_data, historical_data, column_names, target_name): 360 | historical_k = cls_clustering(historical_data, column_names, target_name) 361 | new_k = cls_clustering(new_data, column_names, target_name) 362 | if abs(historical_k - new_k) > absolute_distance: 363 | return False 364 | else: 365 | return True 366 | 367 | # this needs work 368 | class ColumnarData(): 369 | def similiar_correlation(correlation_lower_bound, new_data, historical_data, column_names, pvalue_threshold=0.05): 370 | for column_name in column_names: 371 | correlation_info = stats.spearmanr(new_data[column_name], historical_data[column_name]) 372 | if correlation_info.pvalue > pvalue_threshold: 373 | return False 374 | if correlation_info.correlation < correlation_lower_bound: 375 | return False 376 | return True 377 | 378 | def similiar_distribution(new_data, historical_data, column_names, pvalue_threshold=0.05): 379 | for column_name in column_names: 380 | distribution_info = stats.ks_2samp(new_data[column_name], historical_data[column_name]) 381 | if correlation_info.pvalue < pvalue_threshold: 382 | return False 383 | return True 384 | 385 | # does the preprocessing break? 386 | # does the model build? 387 | # does the model meet some threshold? 388 | # add memoryful tests for measures over time (like over several days) 389 | -------------------------------------------------------------------------------- /example_models/static_examples/random_file.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import random 3 | import code 4 | 5 | Card = collections.namedtuple('Card', ['rank', 'suit']) 6 | 7 | class FrenchDeck: 8 | ranks = [str(n) for n in range(2, 11)] + list('JQKA') 9 | suits = 'spades diamonds clubs hearts'.split() 10 | 11 | def __init__(self): 12 | self._cards = [Card(rank, suit) for suit in self.suits 13 | for rank in self.ranks] 14 | 15 | def __len__(self): 16 | return len(self._cards) 17 | 18 | def __getitem__(self, position): 19 | return self._cards[position] 20 | 21 | if __name__ == '__main__': 22 | deck = FrenchDeck() 23 | card = random.choice(deck) 24 | code.interact(local=locals()) 25 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | scikit-learn 3 | scipy 4 | numpy 5 | pandas 6 | mlxtend 7 | energyusage 8 | backtester 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from setuptools import setup 3 | 4 | # The directory containing this file 5 | HERE = pathlib.Path(__file__).parent 6 | 7 | # The text of the README file 8 | README = (HERE / "README.md").read_text() 9 | 10 | # This call to setup() does all the work 11 | setup( 12 | name="drifter_ml", 13 | version="0.25", 14 | description="Testing for models confirming to the scikit-learn api", 15 | long_description=README, 16 | long_description_content_type="text/markdown", 17 | url="https://github.com/EricSchles/drifter_ml", 18 | author="Eric Schles", 19 | author_email="ericschles@gmail.com", 20 | license="MIT", 21 | classifiers=[ 22 | "License :: OSI Approved :: MIT License", 23 | "Programming 
Language :: Python :: 3", 24 | "Programming Language :: Python :: 3.6", 25 | "Programming Language :: Python :: 3.7", 26 | ], 27 | packages=["drifter_ml", 'drifter_ml.classification_tests', 'drifter_ml.columnar_tests', 28 | 'drifter_ml.regression_tests', 'drifter_ml.structural_tests'], 29 | include_package_data=True, 30 | install_requires=["sklearn", "scipy", "numpy", 31 | "statsmodels", "mlxtend", "pytest", 32 | "energyusage", "backtester"], 33 | ) 34 | -------------------------------------------------------------------------------- /tests/test_classification_tests.py: -------------------------------------------------------------------------------- 1 | from drifter_ml import classification_tests 2 | from sklearn import tree 3 | from sklearn import ensemble 4 | from sklearn import model_selection 5 | import numpy as np 6 | import pandas as pd 7 | import random 8 | 9 | def generate_binary_classification_data_and_models(): 10 | df = pd.DataFrame() 11 | for _ in range(1000): 12 | a = np.random.normal(0, 1) 13 | b = np.random.normal(0, 3) 14 | c = np.random.normal(12, 4) 15 | target = random.choice([0, 1]) 16 | df = df.append({ 17 | "A": a, 18 | "B": b, 19 | "C": c, 20 | "target": target 21 | }, ignore_index=True) 22 | 23 | clf1 = tree.DecisionTreeClassifier() 24 | clf2 = ensemble.RandomForestClassifier() 25 | column_names = ["A", "B", "C"] 26 | target_name = "target" 27 | X = df[column_names] 28 | clf1.fit(X, df[target_name]) 29 | clf2.fit(X, df[target_name]) 30 | return df, column_names, target_name, clf1, clf2 31 | 32 | def generate_multiclass_classification_data_and_models(): 33 | df = pd.DataFrame() 34 | for _ in range(1000): 35 | a = np.random.normal(0, 1) 36 | b = np.random.normal(0, 3) 37 | c = np.random.normal(12, 4) 38 | target = random.choice([0, 1, 2]) 39 | df = df.append({ 40 | "A": a, 41 | "B": b, 42 | "C": c, 43 | "target": target 44 | }, ignore_index=True) 45 | 46 | clf1 = tree.DecisionTreeClassifier() 47 | clf2 = ensemble.RandomForestClassifier() 48 | column_names = ["A", "B", "C"] 49 | target_name = "target" 50 | X = df[column_names] 51 | clf1.fit(X, df[target_name]) 52 | clf2.fit(X, df[target_name]) 53 | return df, column_names, target_name, clf1, clf2 54 | 55 | def test_precision_recall_f1_binary(): 56 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 57 | test_suite = classification_tests.ClassificationTests(clf, 58 | df, 59 | target_name, 60 | column_names) 61 | try: 62 | classes = list(df[target_name].unique()) 63 | test_suite.classifier_testing_per_class( 64 | {klass: 0.1 for klass in classes}, 65 | {klass: 0.1 for klass in classes}, 66 | {klass: 0.1 for klass in classes} 67 | ) 68 | assert True 69 | except: 70 | assert False 71 | 72 | def test_precision_recall_f1_multiclass(): 73 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 74 | test_suite = classification_tests.ClassificationTests(clf, 75 | df, 76 | target_name, 77 | column_names) 78 | try: 79 | classes = list(df[target_name].unique()) 80 | test_suite.classifier_testing_per_class( 81 | {klass: 0.1 for klass in classes}, 82 | {klass: 0.1 for klass in classes}, 83 | {klass: 0.1 for klass in classes}, 84 | average="micro" 85 | ) 86 | assert True 87 | except: 88 | assert False 89 | 90 | def test_roc_auc_cv_binary(): 91 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 92 | test_suite = classification_tests.ClassificationTests(clf, 93 | df, 94 | target_name, 95 | column_names) 96 | try: 97 | 
roc_auc_scores = test_suite.roc_auc_cv(3) 98 | assert isinstance(roc_auc_scores, list) 99 | assert isinstance(roc_auc_scores[0], float) 100 | assert len(roc_auc_scores) == 3 101 | except ValueError: 102 | assert True 103 | 104 | def test_f1_cv_binary(): 105 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 106 | test_suite = classification_tests.ClassificationTests(clf, 107 | df, 108 | target_name, 109 | column_names) 110 | f1_scores = test_suite.f1_cv(3) 111 | assert isinstance(f1_scores, list) 112 | assert isinstance(f1_scores[0], float) 113 | assert len(f1_scores) == 3 114 | 115 | def test_f1_cv_multiclass(): 116 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 117 | test_suite = classification_tests.ClassificationTests(clf, 118 | df, 119 | target_name, 120 | column_names) 121 | f1_scores = test_suite.f1_cv(3) 122 | assert isinstance(f1_scores, list) 123 | assert isinstance(f1_scores[0], float) 124 | assert len(f1_scores) == 3 125 | 126 | def test_recall_cv_binary(): 127 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 128 | test_suite = classification_tests.ClassificationTests(clf, 129 | df, 130 | target_name, 131 | column_names) 132 | recall_scores = test_suite.recall_cv(3) 133 | assert isinstance(recall_scores, list) 134 | assert isinstance(recall_scores[0], float) 135 | assert len(recall_scores) == 3 136 | 137 | def test_recall_cv_multiclass(): 138 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 139 | test_suite = classification_tests.ClassificationTests(clf, 140 | df, 141 | target_name, 142 | column_names) 143 | recall_scores = test_suite.recall_cv(3) 144 | assert isinstance(recall_scores, list) 145 | assert isinstance(recall_scores[0], float) 146 | assert len(recall_scores) == 3 147 | 148 | def test_precision_cv_binary(): 149 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 150 | test_suite = classification_tests.ClassificationTests(clf, 151 | df, 152 | target_name, 153 | column_names) 154 | precision_scores = test_suite.precision_cv(3) 155 | assert isinstance(precision_scores, list) 156 | assert isinstance(precision_scores[0], float) 157 | assert len(precision_scores) == 3 158 | 159 | def test_precision_cv_multiclass(): 160 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 161 | test_suite = classification_tests.ClassificationTests(clf, 162 | df, 163 | target_name, 164 | column_names) 165 | precision_scores = test_suite.precision_cv(3) 166 | assert isinstance(precision_scores, list) 167 | assert isinstance(precision_scores[0], float) 168 | assert len(precision_scores) == 3 169 | 170 | def test_precision_metric(): 171 | fixed_metrics = classification_tests.FixedClassificationMetrics() 172 | assert 1.0 == fixed_metrics.precision_score([0,0,0], [0,0,0]) 173 | 174 | def test_recall_metric(): 175 | fixed_metrics = classification_tests.FixedClassificationMetrics() 176 | assert 1.0 == fixed_metrics.recall_score([0,0,0], [0,0,0]) 177 | 178 | def test_f1_metric(): 179 | fixed_metrics = classification_tests.FixedClassificationMetrics() 180 | assert 1.0 == fixed_metrics.f1_score([0,0,0], [0,0,0]) 181 | 182 | def test_cross_val_per_class_percision_anomaly_detection_binary(): 183 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 184 | test_suite = classification_tests.ClassificationTests(clf, 
185 | df, 186 | target_name, 187 | column_names) 188 | try: 189 | tolerance = 1 190 | test_suite.cross_val_per_class_precision_anomaly_detection(tolerance) 191 | assert True 192 | except: 193 | assert False 194 | 195 | def test_cross_val_per_class_percision_anomaly_detection_multiclass(): 196 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 197 | test_suite = classification_tests.ClassificationTests(clf, 198 | df, 199 | target_name, 200 | column_names) 201 | try: 202 | tolerance = 1 203 | test_suite.cross_val_per_class_precision_anomaly_detection(tolerance, average="micro") 204 | assert True 205 | except: 206 | assert False 207 | 208 | def test_cross_val_per_class_recall_anomaly_detection_binary(): 209 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 210 | test_suite = classification_tests.ClassificationTests(clf, 211 | df, 212 | target_name, 213 | column_names) 214 | try: 215 | tolerance = 1 216 | test_suite.cross_val_per_class_recall_anomaly_detection(tolerance) 217 | assert True 218 | except: 219 | assert False 220 | 221 | def test_cross_val_per_class_recall_anomaly_detection_multiclass(): 222 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 223 | test_suite = classification_tests.ClassificationTests(clf, 224 | df, 225 | target_name, 226 | column_names) 227 | try: 228 | tolerance = 1 229 | test_suite.cross_val_per_class_recall_anomaly_detection(tolerance, average="micro") 230 | assert True 231 | except: 232 | assert False 233 | 234 | def test_cross_val_per_class_f1_anomaly_detection_binary(): 235 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 236 | test_suite = classification_tests.ClassificationTests(clf, 237 | df, 238 | target_name, 239 | column_names) 240 | try: 241 | tolerance = 1 242 | test_suite.cross_val_per_class_f1_anomaly_detection(tolerance) 243 | assert True 244 | except: 245 | assert False 246 | 247 | def test_cross_val_per_class_f1_anomaly_detection_multiclass(): 248 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 249 | test_suite = classification_tests.ClassificationTests(clf, 250 | df, 251 | target_name, 252 | column_names) 253 | try: 254 | tolerance = 1 255 | test_suite.cross_val_per_class_f1_anomaly_detection(tolerance, average="micro") 256 | assert True 257 | except: 258 | assert False 259 | 260 | def test_cross_val_per_class_roc_auc_anomaly_detection_binary(): 261 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 262 | test_suite = classification_tests.ClassificationTests(clf, 263 | df, 264 | target_name, 265 | column_names) 266 | try: 267 | tolerance = 1 268 | print(test_suite.is_binary()) 269 | test_suite.cross_val_per_class_roc_auc_anomaly_detection(tolerance) 270 | assert True 271 | except: 272 | assert False 273 | 274 | def test_cross_val_precision_anomaly_detection_binary(): 275 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 276 | test_suite = classification_tests.ClassificationTests(clf, 277 | df, 278 | target_name, 279 | column_names) 280 | try: 281 | tolerance = 1 282 | test_suite.cross_val_precision_anomaly_detection(tolerance) 283 | assert True 284 | except: 285 | assert False 286 | 287 | def test_cross_val_precision_anomaly_detection_multiclass(): 288 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 289 | 
test_suite = classification_tests.ClassificationTests(clf, 290 | df, 291 | target_name, 292 | column_names) 293 | try: 294 | tolerance = 1 295 | test_suite.cross_val_precision_anomaly_detection(tolerance, average="micro") 296 | assert True 297 | except: 298 | assert False 299 | 300 | def test_cross_val_recall_anomaly_detection_binary(): 301 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 302 | test_suite = classification_tests.ClassificationTests(clf, 303 | df, 304 | target_name, 305 | column_names) 306 | try: 307 | tolerance = 1 308 | test_suite.cross_val_recall_anomaly_detection(tolerance) 309 | assert True 310 | except: 311 | assert False 312 | 313 | def test_cross_val_recall_anomaly_detection_multiclass(): 314 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 315 | test_suite = classification_tests.ClassificationTests(clf, 316 | df, 317 | target_name, 318 | column_names) 319 | try: 320 | tolerance = 1 321 | test_suite.cross_val_recall_anomaly_detection(tolerance, average="micro") 322 | assert True 323 | except: 324 | assert False 325 | 326 | def test_cross_val_f1_anomaly_detection_binary(): 327 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 328 | test_suite = classification_tests.ClassificationTests(clf, 329 | df, 330 | target_name, 331 | column_names) 332 | try: 333 | tolerance = 1 334 | test_suite.cross_val_f1_anomaly_detection(tolerance) 335 | assert True 336 | except: 337 | assert False 338 | 339 | def test_cross_val_f1_anomaly_detection_mutliclass(): 340 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 341 | test_suite = classification_tests.ClassificationTests(clf, 342 | df, 343 | target_name, 344 | column_names) 345 | try: 346 | tolerance = 1 347 | test_suite.cross_val_f1_anomaly_detection(tolerance, average="micro") 348 | assert True 349 | except: 350 | assert False 351 | 352 | def test_cross_val_roc_auc_anomaly_detection_binary(): 353 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 354 | test_suite = classification_tests.ClassificationTests(clf, 355 | df, 356 | target_name, 357 | column_names) 358 | try: 359 | tolerance = 1 360 | test_suite.cross_val_roc_auc_anomaly_detection(tolerance) 361 | assert True 362 | except: 363 | assert False 364 | 365 | def test_cross_val_precision_avg_binary(): 366 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 367 | test_suite = classification_tests.ClassificationTests(clf, 368 | df, 369 | target_name, 370 | column_names) 371 | try: 372 | avg = 0.1 373 | test_suite.cross_val_precision_avg(avg) 374 | assert True 375 | except: 376 | assert False 377 | 378 | def test_cross_val_precision_avg_mutliclass(): 379 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 380 | test_suite = classification_tests.ClassificationTests(clf, 381 | df, 382 | target_name, 383 | column_names) 384 | try: 385 | avg = 0.1 386 | test_suite.cross_val_precision_avg(avg, average="micro") 387 | assert True 388 | except: 389 | assert False 390 | 391 | def test_cross_val_recall_avg_binary(): 392 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 393 | test_suite = classification_tests.ClassificationTests(clf, 394 | df, 395 | target_name, 396 | column_names) 397 | try: 398 | avg = 0.1 399 | test_suite.cross_val_recall_avg(avg) 400 | assert 
True 401 | except: 402 | assert False 403 | 404 | def test_cross_val_recall_avg_multiclass(): 405 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 406 | test_suite = classification_tests.ClassificationTests(clf, 407 | df, 408 | target_name, 409 | column_names) 410 | try: 411 | avg = 0.1 412 | test_suite.cross_val_recall_avg(avg, average="micro") 413 | assert True 414 | except: 415 | assert False 416 | 417 | def test_cross_val_f1_avg_binary(): 418 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 419 | test_suite = classification_tests.ClassificationTests(clf, 420 | df, 421 | target_name, 422 | column_names) 423 | try: 424 | avg = 0.1 425 | test_suite.cross_val_f1_avg(avg) 426 | assert True 427 | except: 428 | assert False 429 | 430 | def test_cross_val_f1_avg_multiclass(): 431 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 432 | test_suite = classification_tests.ClassificationTests(clf, 433 | df, 434 | target_name, 435 | column_names) 436 | try: 437 | avg = 0.1 438 | test_suite.cross_val_f1_avg(avg, average="micro") 439 | assert True 440 | except: 441 | assert False 442 | 443 | def test_cross_val_roc_auc_avg_binary(): 444 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 445 | test_suite = classification_tests.ClassificationTests(clf, 446 | df, 447 | target_name, 448 | column_names) 449 | try: 450 | avg = 0.1 451 | test_suite.cross_val_roc_auc_avg(avg) 452 | assert True 453 | except: 454 | assert False 455 | 456 | def test_spread_cross_val_precision_anomaly_detection_binary(): 457 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 458 | test_suite = classification_tests.ClassificationTests(clf, 459 | df, 460 | target_name, 461 | column_names) 462 | try: 463 | tolerance = 1 464 | test_suite.spread_cross_val_precision_anomaly_detection(tolerance) 465 | test_suite.spread_cross_val_precision_anomaly_detection(tolerance, method="median") 466 | test_suite.spread_cross_val_precision_anomaly_detection(tolerance, method="trimean") 467 | assert True 468 | except: 469 | assert False 470 | 471 | def test_spread_cross_val_precision_anomaly_detection_multiclass(): 472 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 473 | test_suite = classification_tests.ClassificationTests(clf, 474 | df, 475 | target_name, 476 | column_names) 477 | try: 478 | tolerance = 1 479 | average = "micro" 480 | test_suite.spread_cross_val_precision_anomaly_detection(tolerance, 481 | average=average) 482 | test_suite.spread_cross_val_precision_anomaly_detection(tolerance, 483 | method="median", 484 | average=average) 485 | test_suite.spread_cross_val_precision_anomaly_detection(tolerance, 486 | method="trimean", 487 | average=average) 488 | assert True 489 | except: 490 | assert False 491 | 492 | def test_spread_cross_val_recall_anomaly_detection_binary(): 493 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 494 | test_suite = classification_tests.ClassificationTests(clf, 495 | df, 496 | target_name, 497 | column_names) 498 | try: 499 | tolerance = 1 500 | test_suite.spread_cross_val_recall_anomaly_detection(tolerance) 501 | test_suite.spread_cross_val_recall_anomaly_detection(tolerance, method="median") 502 | test_suite.spread_cross_val_recall_anomaly_detection(tolerance, method="trimean") 503 | assert True 504 | except: 505 | 
assert False 506 | 507 | def test_spread_cross_val_recall_anomaly_detection_multiclass(): 508 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 509 | test_suite = classification_tests.ClassificationTests(clf, 510 | df, 511 | target_name, 512 | column_names) 513 | try: 514 | tolerance = 1 515 | average = "micro" 516 | test_suite.spread_cross_val_recall_anomaly_detection(tolerance, 517 | average=average) 518 | test_suite.spread_cross_val_recall_anomaly_detection(tolerance, 519 | method="median", 520 | average=average) 521 | test_suite.spread_cross_val_recall_anomaly_detection(tolerance, 522 | method="trimean", 523 | average=average) 524 | assert True 525 | except: 526 | assert False 527 | 528 | def test_spread_cross_val_f1_anomaly_detection_binary(): 529 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 530 | test_suite = classification_tests.ClassificationTests(clf, 531 | df, 532 | target_name, 533 | column_names) 534 | try: 535 | tolerance = 1 536 | test_suite.spread_cross_val_f1_anomaly_detection(tolerance) 537 | test_suite.spread_cross_val_f1_anomaly_detection(tolerance, method="median") 538 | test_suite.spread_cross_val_f1_anomaly_detection(tolerance, method="trimean") 539 | assert True 540 | except: 541 | assert False 542 | 543 | def test_spread_cross_val_f1_anomaly_detection_multiclass(): 544 | df, column_names, target_name, clf, _ = generate_multiclass_classification_data_and_models() 545 | test_suite = classification_tests.ClassificationTests(clf, 546 | df, 547 | target_name, 548 | column_names) 549 | try: 550 | tolerance = 1 551 | average = "micro" 552 | test_suite.spread_cross_val_f1_anomaly_detection(tolerance, 553 | average=average) 554 | test_suite.spread_cross_val_f1_anomaly_detection(tolerance, 555 | method="median", 556 | average=average) 557 | test_suite.spread_cross_val_f1_anomaly_detection(tolerance, 558 | method="trimean", 559 | average=average) 560 | assert True 561 | except: 562 | assert False 563 | 564 | def test_spread_cross_val_roc_auc_anomaly_detection_binary(): 565 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 566 | test_suite = classification_tests.ClassificationTests(clf, 567 | df, 568 | target_name, 569 | column_names) 570 | try: 571 | tolerance = 1 572 | test_suite.spread_cross_val_roc_auc_anomaly_detection(tolerance) 573 | test_suite.spread_cross_val_roc_auc_anomaly_detection(tolerance, method="median") 574 | test_suite.spread_cross_val_roc_auc_anomaly_detection(tolerance, method="trimean") 575 | assert True 576 | except: 577 | assert False 578 | 579 | def test_run_time_stress_test(): 580 | df, column_names, target_name, clf, _ = generate_binary_classification_data_and_models() 581 | test_suite = classification_tests.ClassificationTests(clf, 582 | df, 583 | target_name, 584 | column_names) 585 | sample_sizes = [i for i in range(100, 1000, 100)] 586 | max_run_times = [100 for _ in range(len(sample_sizes))] 587 | try: 588 | test_suite.run_time_stress_test(sample_sizes, max_run_times) 589 | assert True 590 | except: 591 | assert False 592 | 593 | def test_two_model_prediction_run_time_stress_test(): 594 | df, column_names, target_name, clf1, clf2 = generate_binary_classification_data_and_models() 595 | test_suite = classification_tests.ClassifierComparison(clf1, 596 | clf2, 597 | df, 598 | target_name, 599 | column_names) 600 | 601 | sample_sizes = [i for i in range(100, 1000, 100)] 602 | try: 603 | 
test_suite.two_model_prediction_run_time_stress_test(sample_sizes) 604 | assert True 605 | except: 606 | assert False 607 | 608 | def test_two_model_classifier_testing_binary(): 609 | df, column_names, target_name, clf1, clf2 = generate_binary_classification_data_and_models() 610 | test_suite = classification_tests.ClassifierComparison(clf1, 611 | clf2, 612 | df, 613 | target_name, 614 | column_names) 615 | try: 616 | test_suite.two_model_classifier_testing() 617 | assert True 618 | except: 619 | assert False 620 | 621 | def test_two_model_classifier_testing_multiclass(): 622 | df, column_names, target_name, clf1, clf2 = generate_multiclass_classification_data_and_models() 623 | test_suite = classification_tests.ClassifierComparison(clf1, 624 | clf2, 625 | df, 626 | target_name, 627 | column_names) 628 | try: 629 | test_suite.two_model_classifier_testing(average="micro") 630 | assert True 631 | except: 632 | assert False 633 | 634 | def test_cross_val_two_model_classifier_testing_binary(): 635 | df, column_names, target_name, clf1, clf2 = generate_binary_classification_data_and_models() 636 | test_suite = classification_tests.ClassifierComparison(clf1, 637 | clf2, 638 | df, 639 | target_name, 640 | column_names) 641 | try: 642 | test_suite.cross_val_two_model_classifier_testing() 643 | assert True 644 | except: 645 | assert False 646 | 647 | def test_cross_val_two_model_classifier_testing_multiclass(): 648 | df, column_names, target_name, clf1, clf2 = generate_multiclass_classification_data_and_models() 649 | test_suite = classification_tests.ClassifierComparison(clf1, 650 | clf2, 651 | df, 652 | target_name, 653 | column_names) 654 | try: 655 | test_suite.cross_val_two_model_classifier_testing(average="micro") 656 | assert True 657 | except: 658 | assert False 659 | 660 | def test_cross_val_two_model_classifier_testing_binary(): 661 | df, column_names, target_name, clf1, clf2 = generate_binary_classification_data_and_models() 662 | test_suite = classification_tests.ClassifierComparison(clf1, 663 | clf2, 664 | df, 665 | target_name, 666 | column_names) 667 | try: 668 | test_suite.cross_val_per_class_two_model_classifier_testing() 669 | assert True 670 | except: 671 | assert False 672 | 673 | def test_cross_val_two_model_classifier_testing_multiclass(): 674 | df, column_names, target_name, clf1, clf2 = generate_multiclass_classification_data_and_models() 675 | test_suite = classification_tests.ClassifierComparison(clf1, 676 | clf2, 677 | df, 678 | target_name, 679 | column_names) 680 | try: 681 | test_suite.cross_val_per_class_two_model_classifier_testing(average="micro") 682 | assert True 683 | except: 684 | assert False 685 | -------------------------------------------------------------------------------- /tests/test_columnar_tests.py: -------------------------------------------------------------------------------- 1 | from drifter_ml import columnar_tests 2 | import numpy as np 3 | import pandas as pd 4 | 5 | def generate_data(): 6 | new_data = pd.DataFrame() 7 | historical_data = pd.DataFrame() 8 | new_data["similar_normal"] = np.random.normal(0, 10, size=1000) 9 | historical_data["similar_normal"] = np.random.normal(0, 10, size=1000) 10 | new_data["different_normal"] = np.random.normal(1000, 250, size=1000) 11 | historical_data["different_normal"] = np.random.normal(5, 17, size=1000) 12 | new_data["random"] = np.random.random(size=1000) 13 | historical_data["random"] = np.random.random(size=1000) 14 | new_data["similar_gamma"] = np.random.gamma(1, 2, size=1000) 15 | 
historical_data["similar_gamma"] = np.random.gamma(1, 2, size=1000) 16 | new_data["different_gamma"] = np.random.gamma(7.5, 0, size=1000) 17 | historical_data["different_gamma"] = np.random.gamma(2, 4, size=1000) 18 | return new_data, historical_data 19 | 20 | def test_mean_similarity(): 21 | new_data, historical_data = generate_data() 22 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 23 | try: 24 | test_suite.mean_similarity("similar_normal") 25 | assert True 26 | except: 27 | assert False 28 | 29 | def test_median_similarity(): 30 | new_data, historical_data = generate_data() 31 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 32 | try: 33 | test_suite.median_similarity("similar_normal") 34 | assert True 35 | except: 36 | assert False 37 | 38 | 39 | def test_trimean_similarity(): 40 | new_data, historical_data = generate_data() 41 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 42 | try: 43 | test_suite.trimean_similarity("similar_normal") 44 | assert True 45 | except: 46 | assert False 47 | 48 | def test_is_normal(): 49 | new_data, historical_data = generate_data() 50 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 51 | try: 52 | test_suite.is_normal("similar_normal") 53 | assert True 54 | except: 55 | assert False 56 | 57 | def test_pearson_similar_correlation(): 58 | new_data, historical_data = generate_data() 59 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 60 | correlation_lower_bound = 0.3 61 | try: 62 | test_suite.pearson_similar_correlation("similar_normal", correlation_lower_bound) 63 | assert True 64 | except: 65 | assert False 66 | 67 | def test_spearman_similar_correlation(): 68 | new_data, historical_data = generate_data() 69 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 70 | correlation_lower_bound = 0.3 71 | try: 72 | test_suite.spearman_similar_correlation("similar_normal", correlation_lower_bound) 73 | assert True 74 | except: 75 | assert False 76 | 77 | def test_wilcoxon_similar_distribution(): 78 | new_data, historical_data = generate_data() 79 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 80 | try: 81 | test_suite.wilcoxon_similar_distribution("similar_normal") 82 | assert True 83 | except: 84 | assert False 85 | 86 | def test_ks_2samp_similar_distribution(): 87 | new_data, historical_data = generate_data() 88 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 89 | try: 90 | test_suite.ks_2samp_similar_distribution("similar_normal") 91 | assert True 92 | except: 93 | assert False 94 | 95 | def test_kruskal_similar_distribution(): 96 | new_data, historical_data = generate_data() 97 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 98 | try: 99 | test_suite.kruskal_similar_distribution("similar_normal") 100 | assert True 101 | except: 102 | assert False 103 | 104 | def test_mann_whitney_u_similar_distribution(): 105 | new_data, historical_data = generate_data() 106 | test_suite = columnar_tests.ColumnarData(new_data, historical_data) 107 | try: 108 | test_suite.mann_whitney_u_similar_distribution("similar_normal") 109 | assert True 110 | except: 111 | assert False 112 | -------------------------------------------------------------------------------- /tests/test_regression_tests.py: -------------------------------------------------------------------------------- 1 | from drifter_ml import regression_tests 2 | from sklearn import tree 3 | from sklearn import ensemble 4 | from 
sklearn import model_selection 5 | import numpy as np 6 | import pandas as pd 7 | 8 | def generate_regression_data_and_models(): 9 | df = pd.DataFrame() 10 | for _ in range(1000): 11 | a = np.random.normal(0, 1) 12 | b = np.random.normal(0, 3) 13 | c = np.random.normal(12, 4) 14 | target = a + b + c 15 | df = df.append({ 16 | "A": a, 17 | "B": b, 18 | "C": c, 19 | "target": target 20 | }, ignore_index=True) 21 | 22 | reg1 = tree.DecisionTreeRegressor() 23 | reg2 = ensemble.RandomForestRegressor() 24 | column_names = ["A", "B", "C"] 25 | target_name = "target" 26 | X = df[column_names] 27 | reg1.fit(X, df[target_name]) 28 | reg2.fit(X, df[target_name]) 29 | return df, column_names, target_name, reg1, reg2 30 | 31 | def test_regression_basic(): 32 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 33 | test_suite = regression_tests.RegressionTests(reg, 34 | df, 35 | target_name, 36 | column_names) 37 | try: 38 | mse_upper_boundary = 10000 39 | mae_upper_boundary = 10000 40 | tse_upper_boundary = 10000 41 | tae_upper_boundary = 10000 42 | test_suite.upper_bound_regression_testing( 43 | mse_upper_boundary, 44 | mae_upper_boundary, 45 | tse_upper_boundary, 46 | tae_upper_boundary 47 | ) 48 | assert True 49 | except: 50 | assert False 51 | 52 | def test_cross_val_mse_anomaly_detection(): 53 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 54 | test_suite = regression_tests.RegressionTests(reg, 55 | df, 56 | target_name, 57 | column_names) 58 | try: 59 | mse_tolerance = 10000 60 | test_suite.cross_val_mse_anomaly_detection( 61 | mse_tolerance 62 | ) 63 | assert True 64 | except: 65 | assert False 66 | 67 | def test_cross_val_tse_anomaly_detection(): 68 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 69 | test_suite = regression_tests.RegressionTests(reg, 70 | df, 71 | target_name, 72 | column_names) 73 | try: 74 | tse_tolerance = 10000 75 | test_suite.cross_val_tse_anomaly_detection( 76 | tse_tolerance 77 | ) 78 | assert True 79 | except: 80 | assert False 81 | 82 | def test_cross_val_mae_anomaly_detection(): 83 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 84 | test_suite = regression_tests.RegressionTests(reg, 85 | df, 86 | target_name, 87 | column_names) 88 | try: 89 | 90 | mae_tolerance = 10000 91 | test_suite.cross_val_mae_anomaly_detection( 92 | mae_tolerance 93 | ) 94 | assert True 95 | except: 96 | assert False 97 | 98 | def test_cross_val_tae_anomaly_detection(): 99 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 100 | test_suite = regression_tests.RegressionTests(reg, 101 | df, 102 | target_name, 103 | column_names) 104 | try: 105 | 106 | tae_tolerance = 10000 107 | test_suite.cross_val_tae_anomaly_detection( 108 | tae_tolerance 109 | ) 110 | assert True 111 | except: 112 | assert False 113 | 114 | def test_cross_val_mse_avg(): 115 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 116 | test_suite = regression_tests.RegressionTests(reg, 117 | df, 118 | target_name, 119 | column_names) 120 | try: 121 | mse_avg = 100 122 | test_suite.cross_val_mse_avg( 123 | mse_avg 124 | ) 125 | assert True 126 | except: 127 | assert False 128 | 129 | def test_cross_val_tse_avg(): 130 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 131 | test_suite = regression_tests.RegressionTests(reg, 132 | df, 133 | target_name, 134 | column_names) 135 | try: 136 | tse_avg = 100 137 | 
test_suite.cross_val_tse_avg( 138 | tse_avg 139 | ) 140 | assert True 141 | except: 142 | assert False 143 | 144 | def test_cross_val_mae_avg(): 145 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 146 | test_suite = regression_tests.RegressionTests(reg, 147 | df, 148 | target_name, 149 | column_names) 150 | try: 151 | mae_avg = 100 152 | test_suite.cross_val_mae_avg( 153 | mae_avg 154 | ) 155 | assert True 156 | except: 157 | assert False 158 | 159 | def test_cross_val_tae_avg(): 160 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 161 | test_suite = regression_tests.RegressionTests(reg, 162 | df, 163 | target_name, 164 | column_names) 165 | try: 166 | tae_avg = 100 167 | test_suite.cross_val_tae_avg( 168 | tae_avg 169 | ) 170 | assert True 171 | except: 172 | assert False 173 | 174 | def test_run_time_stress_test(): 175 | df, column_names, target_name, reg, _ = generate_regression_data_and_models() 176 | test_suite = regression_tests.RegressionTests(reg, 177 | df, 178 | target_name, 179 | column_names) 180 | 181 | sample_sizes = [i for i in range(100, 1000, 100)] 182 | max_run_times = [100 for _ in range(len(sample_sizes))] 183 | try: 184 | test_suite.run_time_stress_test( 185 | sample_sizes, max_run_times 186 | ) 187 | assert True 188 | except: 189 | assert False 190 | 191 | def test_two_model_prediction_run_time_stress_test(): 192 | df, column_names, target_name, reg1, reg2 = generate_regression_data_and_models() 193 | test_suite = regression_tests.RegressionComparison(reg1, 194 | reg2, 195 | df, 196 | target_name, 197 | column_names) 198 | sample_sizes = [i for i in range(100, 1000, 100)] 199 | try: 200 | test_suite.two_model_prediction_run_time_stress_test( 201 | sample_sizes 202 | ) 203 | assert True 204 | except: 205 | assert False 206 | 207 | def test_cv_two_model_regression_testing(): 208 | df, column_names, target_name, reg1, reg2 = generate_regression_data_and_models() 209 | test_suite = regression_tests.RegressionComparison(reg1, 210 | reg2, 211 | df, 212 | target_name, 213 | column_names) 214 | try: 215 | test_suite.cv_two_model_regression_testing() 216 | assert True 217 | except: 218 | assert False 219 | 220 | def test_two_model_regression_testing(): 221 | df, column_names, target_name, reg1, reg2 = generate_regression_data_and_models() 222 | test_suite = regression_tests.RegressionComparison(reg1, 223 | reg2, 224 | df, 225 | target_name, 226 | column_names) 227 | try: 228 | test_suite.two_model_regression_testing() 229 | assert True 230 | except: 231 | assert False 232 | -------------------------------------------------------------------------------- /tests/test_structural_tests.py: -------------------------------------------------------------------------------- 1 | from drifter_ml import structural_tests 2 | import numpy as np 3 | import pandas as pd 4 | 5 | def generate_classification_data_and_models(): 6 | new_data = pd.DataFrame() 7 | for _ in range(1000): 8 | a = np.random.normal(0, 1) 9 | b = np.random.normal(0, 3) 10 | c = np.random.normal(12, 4) 11 | if a + b + c > 11: 12 | target = 1 13 | else: 14 | target = 0 15 | new_data = new_data.append({ 16 | "A": a, 17 | "B": b, 18 | "C": c, 19 | "target": target 20 | }, ignore_index=True) 21 | 22 | historical_data = pd.DataFrame() 23 | for _ in range(1000): 24 | a = np.random.normal(0, 1) 25 | b = np.random.normal(0, 3) 26 | c = np.random.normal(12, 4) 27 | if a + b + c > 11: 28 | target = 1 29 | else: 30 | target = 0 31 | historical_data = 
historical_data.append({ 32 | "A": a, 33 | "B": b, 34 | "C": c, 35 | "target": target 36 | }, ignore_index=True) 37 | 38 | column_names = ["A", "B", "C"] 39 | target_name = "target" 40 | return new_data, historical_data, column_names, target_name 41 | 42 | def generate_regression_data_and_models(): 43 | new_data = pd.DataFrame() 44 | for _ in range(1000): 45 | a = np.random.normal(0, 1) 46 | b = np.random.normal(0, 3) 47 | c = np.random.normal(12, 4) 48 | target = a + b + c 49 | new_data = new_data.append({ 50 | "A": a, 51 | "B": b, 52 | "C": c, 53 | "target": target 54 | }, ignore_index=True) 55 | 56 | historical_data = pd.DataFrame() 57 | for _ in range(1000): 58 | a = np.random.normal(0, 1) 59 | b = np.random.normal(0, 3) 60 | c = np.random.normal(12, 4) 61 | target = a + b + c 62 | historical_data = historical_data.append({ 63 | "A": a, 64 | "B": b, 65 | "C": c, 66 | "target": target 67 | }, ignore_index=True) 68 | 69 | column_names = ["A", "B", "C"] 70 | target_name = "target" 71 | return new_data, historical_data, column_names, target_name 72 | 73 | def generate_unsupervised_data(): 74 | new_data = pd.DataFrame() 75 | historical_data = pd.DataFrame() 76 | new_data["similar_normal"] = np.random.normal(0, 10, size=1000) 77 | historical_data["similar_normal"] = np.random.normal(0, 10, size=1000) 78 | new_data["different_normal"] = np.random.normal(1000, 250, size=1000) 79 | historical_data["different_normal"] = np.random.normal(5, 17, size=1000) 80 | new_data["random"] = np.random.random(size=1000) 81 | historical_data["random"] = np.random.random(size=1000) 82 | new_data["similar_gamma"] = np.random.gamma(1, 2, size=1000) 83 | historical_data["similar_gamma"] = np.random.gamma(1, 2, size=1000) 84 | new_data["different_gamma"] = np.random.gamma(7.5, 0, size=1000) 85 | historical_data["different_gamma"] = np.random.gamma(2, 4, size=1000) 86 | return new_data, historical_data 87 | 88 | def test_mutual_info_kmeans_scorer(): 89 | new_data, historical_data = generate_unsupervised_data() 90 | columns = ["similar_normal", "different_normal", 91 | "similar_gamma", "different_gamma"] 92 | target = '' 93 | test_suite = structural_tests.StructuralData(new_data, 94 | historical_data, 95 | columns, 96 | target) 97 | try: 98 | min_similarity = 0.5 99 | test_suite.mutual_info_kmeans_scorer(min_similarity) 100 | assert True 101 | except: 102 | assert False 103 | 104 | def test_adjusted_rand_kmeans_scorer(): 105 | new_data, historical_data = generate_unsupervised_data() 106 | columns = ["similar_normal", "different_normal", 107 | "similar_gamma", "different_gamma"] 108 | target = '' 109 | test_suite = structural_tests.StructuralData(new_data, 110 | historical_data, 111 | columns, 112 | target) 113 | try: 114 | min_similarity = 0.5 115 | test_suite.adjusted_rand_kmeans_scorer(min_similarity) 116 | assert True 117 | except: 118 | assert False 119 | 120 | def test_completeness_kmeans_scorer(): 121 | new_data, historical_data = generate_unsupervised_data() 122 | columns = ["similar_normal", "different_normal", 123 | "similar_gamma", "different_gamma"] 124 | target = '' 125 | test_suite = structural_tests.StructuralData(new_data, 126 | historical_data, 127 | columns, 128 | target) 129 | try: 130 | min_similarity = 0.5 131 | test_suite.completeness_kmeans_scorer(min_similarity) 132 | assert True 133 | except: 134 | assert False 135 | 136 | def test_fowlkes_mallows_kmeans_scorer(): 137 | new_data, historical_data = generate_unsupervised_data() 138 | columns = ["similar_normal", "different_normal", 139 | 
"similar_gamma", "different_gamma"] 140 | target = '' 141 | test_suite = structural_tests.StructuralData(new_data, 142 | historical_data, 143 | columns, 144 | target) 145 | try: 146 | min_similarity = 0.5 147 | test_suite.fowlkes_mallows_kmeans_scorer(min_similarity) 148 | assert True 149 | except: 150 | assert False 151 | 152 | def test_homogeneity_kmeans_scorer(): 153 | new_data, historical_data = generate_unsupervised_data() 154 | columns = ["similar_normal", "different_normal", 155 | "similar_gamma", "different_gamma"] 156 | target = '' 157 | test_suite = structural_tests.StructuralData(new_data, 158 | historical_data, 159 | columns, 160 | target) 161 | try: 162 | min_similarity = 0.5 163 | test_suite.homogeneity_kmeans_scorer(min_similarity) 164 | assert True 165 | except: 166 | assert False 167 | 168 | def test_v_measure_kmeans_scorer(): 169 | new_data, historical_data = generate_unsupervised_data() 170 | columns = ["similar_normal", "different_normal", 171 | "similar_gamma", "different_gamma"] 172 | target = '' 173 | test_suite = structural_tests.StructuralData(new_data, 174 | historical_data, 175 | columns, 176 | target) 177 | try: 178 | min_similarity = 0.5 179 | test_suite.v_measure_kmeans_scorer(min_similarity) 180 | assert True 181 | except: 182 | assert False 183 | 184 | def test_mutual_info_dbscan_scorer(): 185 | new_data, historical_data = generate_unsupervised_data() 186 | columns = ["similar_normal", "different_normal", 187 | "similar_gamma", "different_gamma"] 188 | target = '' 189 | test_suite = structural_tests.StructuralData(new_data, 190 | historical_data, 191 | columns, 192 | target) 193 | try: 194 | min_similarity = 0.5 195 | test_suite.mutual_info_dbscan_scorer(min_similarity) 196 | assert True 197 | except: 198 | assert False 199 | 200 | def test_adjusted_rand_dbscan_scorer(): 201 | new_data, historical_data = generate_unsupervised_data() 202 | columns = ["similar_normal", "different_normal", 203 | "similar_gamma", "different_gamma"] 204 | target = '' 205 | test_suite = structural_tests.StructuralData(new_data, 206 | historical_data, 207 | columns, 208 | target) 209 | try: 210 | min_similarity = 0.5 211 | test_suite.adjusted_rand_dbscan_scorer(min_similarity) 212 | assert True 213 | except: 214 | assert False 215 | 216 | def test_completeness_dbscan_scorer(): 217 | new_data, historical_data = generate_unsupervised_data() 218 | columns = ["similar_normal", "different_normal", 219 | "similar_gamma", "different_gamma"] 220 | target = '' 221 | test_suite = structural_tests.StructuralData(new_data, 222 | historical_data, 223 | columns, 224 | target) 225 | try: 226 | min_similarity = 0.5 227 | test_suite.completeness_dbscan_scorer(min_similarity) 228 | assert True 229 | except: 230 | assert False 231 | 232 | def test_fowlkes_mallows_dbscan_scorer(): 233 | new_data, historical_data = generate_unsupervised_data() 234 | columns = ["similar_normal", "different_normal", 235 | "similar_gamma", "different_gamma"] 236 | target = '' 237 | test_suite = structural_tests.StructuralData(new_data, 238 | historical_data, 239 | columns, 240 | target) 241 | try: 242 | min_similarity = 0.5 243 | test_suite.fowlkes_mallows_dbscan_scorer(min_similarity) 244 | assert True 245 | except: 246 | assert False 247 | 248 | def test_homogeneity_dbscan_scorer(): 249 | new_data, historical_data = generate_unsupervised_data() 250 | columns = ["similar_normal", "different_normal", 251 | "similar_gamma", "different_gamma"] 252 | target = '' 253 | test_suite = structural_tests.StructuralData(new_data, 
254 | historical_data, 255 | columns, 256 | target) 257 | try: 258 | min_similarity = 0.5 259 | test_suite.homogeneity_dbscan_scorer(min_similarity) 260 | assert True 261 | except: 262 | assert False 263 | 264 | def test_v_measure_dbscan_scorer(): 265 | new_data, historical_data = generate_unsupervised_data() 266 | columns = ["similar_normal", "different_normal", 267 | "similar_gamma", "different_gamma"] 268 | target = '' 269 | test_suite = structural_tests.StructuralData(new_data, 270 | historical_data, 271 | columns, 272 | target) 273 | try: 274 | min_similarity = 0.5 275 | test_suite.v_measure_dbscan_scorer(min_similarity) 276 | assert True 277 | except: 278 | assert False 279 | 280 | def test_reg_supervised_similar_clustering(): 281 | new_data, historical_data, column_names, target_name = generate_regression_data_and_models() 282 | 283 | test_suite = structural_tests.StructuralData(new_data, 284 | historical_data, 285 | column_names, 286 | target_name) 287 | try: 288 | absolute_distance = 2 289 | test_suite.reg_supervised_similar_clustering(absolute_distance) 290 | assert True 291 | except: 292 | assert False 293 | 294 | def test_reg_supervised_similar_clustering(): 295 | new_data, historical_data, column_names, target_name = generate_classification_data_and_models() 296 | test_suite = structural_tests.StructuralData(new_data, 297 | historical_data, 298 | column_names, 299 | target_name) 300 | try: 301 | absolute_distance = 2 302 | test_suite.cls_supervised_similar_clustering(absolute_distance) 303 | assert True 304 | except: 305 | assert False 306 | 307 | --------------------------------------------------------------------------------
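Taken together, the example and test modules above all follow one recipe: fit a model, wrap it and its evaluation DataFrame in a drifter_ml test-suite class, then assert that chosen metrics stay inside explicit boundaries. A condensed, self-contained sketch of that recipe follows; it mirrors the data generator in tests/test_classification_tests.py and the classifier_testing call in example_models/static_examples/example_tests.py, and the 0.9 thresholds are purely illustrative, not recommended defaults.

from drifter_ml import classification_tests
from sklearn import tree
import numpy as np
import pandas as pd

# synthetic data in the same shape the repo's generators produce: three
# numeric features and a binary target derived from their sum
df = pd.DataFrame({
    "A": np.random.normal(0, 1, 1000),
    "B": np.random.normal(0, 3, 1000),
    "C": np.random.normal(12, 4, 1000),
})
df["target"] = (df[["A", "B", "C"]].sum(axis=1) > 11).astype(int)

column_names = ["A", "B", "C"]
target_name = "target"
clf = tree.DecisionTreeClassifier()
clf.fit(df[column_names], df[target_name])

# wrap the fitted model plus its evaluation data in a test-suite object
test_suite = classification_tests.ClassificationTests(clf,
                                                      df,
                                                      target_name,
                                                      column_names)

def test_classifier_meets_lower_bounds():
    # per-class lower bounds on precision, recall and f1, as in example_tests.py
    classes = list(df[target_name].unique())
    assert test_suite.classifier_testing(
        {klass: 0.9 for klass in classes},
        {klass: 0.9 for klass in classes},
        {klass: 0.9 for klass in classes}
    )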