├── .gitignore
├── Makefile
├── README.md
├── code-env
│   └── python
│       ├── desc.json
│       └── spec
│           └── requirements.txt
├── custom-recipes
│   ├── compare-dataset-with-model
│   │   ├── recipe.json
│   │   └── recipe.py
│   └── compare-datasets
│       ├── recipe.json
│       └── recipe.py
├── plugin.json
├── python-lib
│   ├── dku_data_drift
│   │   ├── __init__.py
│   │   ├── dataframe_helpers.py
│   │   ├── dataset_helpers.py
│   │   ├── drift_analyzer.py
│   │   ├── model_accessor.py
│   │   ├── model_drift_constants.py
│   │   ├── model_tools.py
│   │   └── preprocessing.py
│   ├── dku_tools.py
│   └── model_metadata.py
├── python-probes
│   └── drift-score
│       ├── probe.json
│       └── probe.py
├── resource
│   ├── bootstrap.min.css
│   ├── compute_model_id_choice.py
│   ├── d3.v4.min.js
│   ├── dku-helpers.js
│   └── style.css
├── tests
│   └── python
│       ├── requirements.txt
│       └── unit
│           └── test_drift_analyzer.py
└── webapps
    └── model-drift-view
        ├── app.js
        ├── backend.py
        ├── body.html
        ├── style.css
        └── webapp.json

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | unit.xml
2 | state.json
3 | *.pyc
4 | .DS_Store
5 | .idea/
6 | 
7 | # C extensions
8 | *.so
9 | 
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 | 
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 | 
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 | 
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .coverage
43 | .coverage.*
44 | .cache
45 | nosetests.xml
46 | coverage.xml
47 | *.cover
48 | .hypothesis/
49 | .pytest_cache/
50 | 
51 | # Translations
52 | *.mo
53 | *.pot
54 | 
55 | # Django stuff:
56 | *.log
57 | local_settings.py
58 | db.sqlite3
59 | 
60 | # Flask stuff:
61 | instance/
62 | .webassets-cache
63 | 
64 | # Scrapy stuff:
65 | .scrapy
66 | 
67 | # Sphinx documentation
68 | docs/_build/
69 | 
70 | # PyBuilder
71 | target/
72 | 
73 | # Jupyter Notebook
74 | .ipynb_checkpoints
75 | 
76 | # pyenv
77 | .python-version
78 | 
79 | # celery beat schedule file
80 | celerybeat-schedule
81 | 
82 | # SageMath parsed files
83 | *.sage.py
84 | 
85 | # Environments
86 | .env
87 | .venv
88 | env/
89 | venv/
90 | ENV/
91 | env.bak/
92 | venv.bak/
93 | 
94 | # Spyder project settings
95 | .spyderproject
96 | .spyproject
97 | 
98 | # Rope project settings
99 | .ropeproject
100 | 
101 | # mkdocs documentation
102 | /site
103 | 
104 | # mypy
105 | .mypy_cache/

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile variables set automatically
2 | plugin_id=`cat plugin.json | python -c "import sys, json; print(str(json.load(sys.stdin)['id']).replace('/',''))"`
3 | plugin_version=`cat plugin.json | python -c "import sys, json; print(str(json.load(sys.stdin)['version']).replace('/',''))"`
4 | archive_file_name="dss-plugin-${plugin_id}-${plugin_version}.zip"
5 | remote_url=`git config --get remote.origin.url`
6 | last_commit_id=`git rev-parse HEAD`
7 | 
8 | 
9 | plugin:
10 | 	@echo "[START] Archiving plugin to dist/ folder..."
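# json_pp acts as a JSON validity check here: the target fails early if plugin.json is malformed.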
11 | 	@cat plugin.json | json_pp > /dev/null
12 | 	@rm -rf dist
13 | 	@mkdir dist
14 | 	@echo "{\"remote_url\":\"${remote_url}\",\"last_commit_id\":\"${last_commit_id}\"}" > release_info.json
15 | 	@git archive -v -9 --format zip -o dist/${archive_file_name} HEAD
16 | 	@zip -u dist/${archive_file_name} release_info.json
17 | 	@rm release_info.json
18 | 	@echo "[SUCCESS] Archiving plugin to dist/ folder: Done!"
19 | 
20 | unit-tests:
21 | 	@echo "[START] Running unit tests..."
22 | 	@( \
23 | 		PYTHON_VERSION=`python3 -V 2>&1 | sed 's/[^0-9]*//g' | cut -c 1,2`; \
24 | 		PYTHON_VERSION_IS_CORRECT=`cat code-env/python/desc.json | python3 -c "import sys, json; print(str($$PYTHON_VERSION) in [x[-2:] for x in json.load(sys.stdin)['acceptedPythonInterpreters']]);"`; \
25 | 		if [ "$$PYTHON_VERSION_IS_CORRECT" != "True" ]; then echo "Python version $$PYTHON_VERSION is not in acceptedPythonInterpreters"; exit 1; fi; \
26 | 	)
27 | 	@( \
28 | 		python3 -m venv env/; \
29 | 		source env/bin/activate; \
30 | 		pip3 install --upgrade pip; \
31 | 		pip install --no-cache-dir -r tests/python/requirements.txt; \
32 | 		pip install --no-cache-dir -r code-env/python/spec/requirements.txt; \
33 | 		export PYTHONPATH="$(PYTHONPATH):$(PWD)/python-lib"; \
34 | 		pytest -o junit_family=xunit2 --junitxml=unit.xml tests/python/unit || true; \
35 | 		deactivate; \
36 | 	)
37 | 	@echo "[SUCCESS] Running unit tests: Done!"
38 | 
39 | integration-tests:
40 | 	@echo "[START] Running integration tests..."
41 | 	# TODO add integration tests
42 | 	@echo "[SUCCESS] Running integration tests: Done!"
43 | 
44 | tests: unit-tests integration-tests
45 | 
46 | dist-clean:
47 | 	rm -rf dist

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ⚠️ Starting with DSS version 10.0.0, this plugin is considered "Deprecated" and will be maintained only to fix critical issues. We recommend using the native [Model Evaluation Store](https://doc.dataiku.com/dss/latest/python-api/model-evaluation-stores.html) feature instead.
2 | 
3 | 
4 | # Model drift monitoring
5 | 
6 | Monitoring ML models in production is often a tedious task. You can apply a simple retraining strategy based on monitoring the model’s performance: if your AUC drops by a given percentage, retrain. Although accurate, this approach requires obtaining the ground truth for your predictions, which is not always fast, and certainly not “real time”.
7 | 
8 | Instead of waiting for the ground truth, we propose to look at the recent data the model has had to score, and statistically compare it with the data on which the model was evaluated. If these datasets are too different, the model may need to be retrained.
9 | 
10 | 
11 | ## Scope of the plugin
12 | This plugin offers a set of DSS components to monitor the input data drift of a model:
13 | * Model view: visualise the drift metrics and graph
14 | * Recipe: compute feature drift of a deployed model
15 | * Recipe: compute drift between two datasets
16 | * Custom metric: retrieve the most recent drift metric
17 | 
18 | 
19 | ## Installation and requirements
20 | 
21 | Please see our [official plugin page](https://www.dataiku.com/product/plugins/model-drift-monitoring/) for installation.
22 | 
23 | ## Changelog
24 | 
25 | **Version 3.1.5 (2022-07)**
26 | * Misc:
27 |     * Load JS packages locally to support offline DSS instances.
28 | 
29 | **Version 3.1.4 (2022-03)**
30 | * Misc:
31 |     * Add cloudpickle to code-env requirements.
32 |     * Update existing package versions.
33 | 
34 | 
35 | **Version 3.1.3 (2021-12)**
36 | * Enhancement:
37 |     * Use feature importance from tree-based regression models.
38 |     * Use a surrogate model for CalibratedClassifierCV.
39 | 
40 | **Version 3.0.0 (2020-12)**
41 | * Enhancement:
42 |     * Add a binomial test to check the reliability of the drift score.
43 |     * Improve the model view's UI.
44 | 
45 | **Version 2.0.0 (2020-06)**
46 | * New components:
47 |     * Recipe: Compute feature drift of a deployed model
48 |     * Recipe: Compute drift between two datasets
49 |     * Custom metric: Retrieve last drift metric
50 | * Enhancement:
51 |     * Add support for regression algorithms and non-tree-based algorithms
52 |     * Add riskiest features information, giving users the list of features to watch closely (i.e. features that have drifted the most and are important in the deployed model)
53 |     * Add support for partitioning
54 |     * Add support for all types of train-test split (with/without cross-validation)
55 | * Bug fixes:
56 |     * Fix bug with boolean dtype handling that led to mismatched prediction probabilities and incorrect categorical variable encoding.
57 |     * Fix bug with date dtype and Python 3.
58 | 
59 | **Version 1.0.0 (2019-12)**
60 | 
61 | * Initial release
62 | * Model view component: support for tree-based classification algorithms
63 | 
64 | You can log feature requests or issues on our [dedicated GitHub repository](https://github.com/dataiku/dss-plugin-model-drift/issues).
65 | 
66 | # License
67 | 
68 | The Model drift monitoring plugin is:
69 | 
70 | Copyright (c) 2020 Dataiku SAS
71 | Licensed under the [MIT License](LICENSE.md).

--------------------------------------------------------------------------------
/code-env/python/desc.json:
--------------------------------------------------------------------------------
1 | {
2 |     "acceptedPythonInterpreters": ["PYTHON27", "PYTHON36"],
3 |     "forceConda": false,
4 |     "installCorePackages": true,
5 |     "installJupyterSupport": true
6 | }

--------------------------------------------------------------------------------
/code-env/python/spec/requirements.txt:
--------------------------------------------------------------------------------
1 | flask==1.1.2
2 | scikit-learn>=0.20,<0.21
3 | scipy>=1.2,<1.3
4 | xgboost==0.82
5 | future==0.18.2
6 | joblib==0.14.1
7 | enum34==1.1.10
8 | statsmodels>=0.10,<0.11
9 | cloudpickle>=1.3,<1.6
10 | jinja2>=2.10,<2.11

--------------------------------------------------------------------------------
/custom-recipes/compare-dataset-with-model/recipe.json:
--------------------------------------------------------------------------------
1 | {
2 |     "meta": {
3 |         "label": "Compute feature drift of a deployed model",
4 |         "description": "Measure data drift between the original training data of a model and a new dataset.",
5 |         "icon": "icon-anchor",
6 |         "displayOrderRank": 1
7 |     },
8 | 
9 |     "selectableFromDataset": "new",
10 |     "kind": "PYTHON",
11 |     "paramsPythonSetup": "compute_model_id_choice.py",
12 |     "inputRoles" : [
13 |         {
14 |             "name": "model",
15 |             "label": "Deployed model",
16 |             "arity": "UNARY",
17 |             "required": true,
18 |             "acceptsDataset": false,
19 |             "acceptsSavedModel": true
20 |         },
21 |         {
22 |             "name": "new",
23 |             "label": "New dataset",
24 |             "description": "New dataset",
25 |             "arity": "UNARY",
26 |             "required": true,
27 |             "acceptsDataset": true
28 |         }
29 |     ],
30 | 
31 |     "outputRoles" : [
32 |         {
33 |             "name": "output_dataset",
34 |             "label": "Metrics dataset",
35 |             "description": "Drift metrics will be 
stored in this dataset",
36 |             "arity": "UNARY",
37 |             "required": true,
38 |             "acceptsDataset": true
39 |         }
40 |     ],
41 | 
42 |     "params": [
43 |         {
44 |             "name": "use_active_version",
45 |             "label": "Use the model's active version",
46 |             "type": "BOOLEAN",
47 |             "defaultValue": true
48 |         },
49 |         {
50 |             "name": "version_id",
51 |             "label": "Model version",
52 |             "type": "SELECT",
53 |             "getChoicesFromPython": true,
54 |             "visibilityCondition": "!model.use_active_version"
55 | 
56 |         },
57 |         {
58 |             "name": "metric_list",
59 |             "label": "Metrics",
60 |             "type": "MULTISELECT",
61 |             "selectChoices": [
62 |                 { "value": "drift_model_accuracy", "label": "Drift score"},
63 |                 { "value": "fugacity", "label": "Fugacity"},
64 |                 { "value": "riskiest_features", "label": "Riskiest features"},
65 |                 { "value": "feature_importance", "label": "Feature importance"}
66 |             ],
67 |             "defaultValue": ["drift_model_accuracy"]
68 |         }
69 |     ],
70 | 
71 |     "resourceKeys": []
72 | 
73 | }

--------------------------------------------------------------------------------
/custom-recipes/compare-dataset-with-model/recipe.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import datetime
3 | import logging
4 | from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
5 | from dku_tools import set_column_description, get_input_output, get_params_with_model, build_drift_metric_dataframe
6 | from model_metadata import get_train_date
7 | from dku_data_drift.drift_analyzer import DriftAnalyzer
8 | from dku_data_drift.model_accessor import ModelAccessor
9 | from dku_data_drift.dataset_helpers import get_partitioning_columns
10 | from dku_data_drift.model_drift_constants import ModelDriftConstants
11 | from model_metadata import get_model_handler
12 | 
13 | 
14 | # init logger
15 | logger = logging.getLogger(__name__)
16 | logging.basicConfig(level=logging.INFO, format='Model Drift Recipe | %(levelname)s - %(message)s')
17 | 
18 | new_dataset, model, output_dataset = get_input_output(has_model_as_second_input=True)
19 | new_df = new_dataset.get_dataframe(bool_as_str=True, limit=ModelDriftConstants.MAX_NUM_ROW)
20 | 
21 | partition_cols_new_df = get_partitioning_columns(new_dataset)
22 | if partition_cols_new_df:
23 |     new_df = new_df.drop(partition_cols_new_df, axis=1)
24 | if len(new_df.columns) == 0:
25 |     raise ValueError('Without the partition column, the dataset is empty.')
26 | 
27 | version_id, metric_list = get_params_with_model(get_recipe_config(), model)
28 | 
29 | # Access the model
30 | model_handler = get_model_handler(model=model, version_id=version_id)
31 | model_accessor = ModelAccessor(model_handler)
32 | 
33 | # Analyze the drift
34 | drifter = DriftAnalyzer(prediction_type=None)
35 | drifter.fit(new_df, model_accessor=model_accessor)
36 | 
37 | # Write the drift score and metrics
38 | timestamp = datetime.datetime.now()
39 | model_train_date = get_train_date(model.get_id(), version_id)
40 | new_df = pd.DataFrame({ModelDriftConstants.TIMESTAMP: [timestamp],
41 |                        ModelDriftConstants.MODEL_ID: [model.get_id()],
42 |                        ModelDriftConstants.VERSION_ID: [version_id],
43 |                        ModelDriftConstants.TRAIN_DATE: [model_train_date]})
44 | # specify the column order
45 | new_df = new_df[[ModelDriftConstants.TIMESTAMP, ModelDriftConstants.MODEL_ID, ModelDriftConstants.VERSION_ID, ModelDriftConstants.TRAIN_DATE]]
46 | metrics_df, column_description_dict = build_drift_metric_dataframe(drifter, metric_list, new_df, has_model_as_input=True)
47 | 
48 | 
output_dataset.write_with_schema(metrics_df) 49 | set_column_description(output_dataset, column_description_dict) -------------------------------------------------------------------------------- /custom-recipes/compare-datasets/recipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "meta": { 3 | "label": "Compute drift between two datasets", 4 | "description": "Measure data drift between two datasets having the same schema.", 5 | "icon": "icon-anchor", 6 | "displayOrderRank": 2 7 | }, 8 | 9 | "selectableFromDataset": "original", 10 | "kind": "PYTHON", 11 | "inputRoles" : [ 12 | { 13 | "name": "original", 14 | "label": "Original dataset", 15 | "description": "", 16 | "arity": "UNARY", 17 | "required": true, 18 | "acceptsDataset": true 19 | }, 20 | { 21 | "name": "new", 22 | "label": "New dataset", 23 | "description": "", 24 | "arity": "UNARY", 25 | "required": true, 26 | "acceptsDataset": true 27 | } 28 | ], 29 | 30 | "outputRoles" : [ 31 | { 32 | "name": "output_dataset", 33 | "label": "Metrics dataset", 34 | "description": "Dataset storing drift metrics", 35 | "arity": "UNARY", 36 | "required": true, 37 | "acceptsDataset": true 38 | } 39 | ], 40 | 41 | "params": [ 42 | { 43 | "name": "metric_list_without_prediction", 44 | "label": "Metrics", 45 | "type": "MULTISELECT", 46 | "selectChoices": [ 47 | { "value": "drift_model_accuracy", "label": "Drift score"}, 48 | { "value": "feature_importance", "label": "Most drifted features"} 49 | ], 50 | "defaultValue": ["drift_model_accuracy"] 51 | }, 52 | { 53 | "name": "columns_to_remove", 54 | "label": "Columns to ignore", 55 | "type": "COLUMNS", 56 | "columnRole": "original" 57 | } 58 | ], 59 | "resourceKeys": [] 60 | 61 | } 62 | -------------------------------------------------------------------------------- /custom-recipes/compare-datasets/recipe.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import datetime 3 | import logging 4 | from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config 5 | from dku_data_drift.drift_analyzer import DriftAnalyzer 6 | from dku_data_drift.dataset_helpers import get_partitioning_columns 7 | from dku_data_drift.model_drift_constants import ModelDriftConstants 8 | from dku_tools import set_column_description, get_input_output, get_params_without_model, build_drift_metric_dataframe 9 | 10 | logger = logging.getLogger(__name__) 11 | logging.basicConfig(level=logging.INFO, format='Model Drift Recipe | %(levelname)s - %(message)s') 12 | 13 | new_dataset, original_dataset, output_dataset = get_input_output() 14 | original_df = original_dataset.get_dataframe(bool_as_str=True, limit=ModelDriftConstants.MAX_NUM_ROW) 15 | new_df = new_dataset.get_dataframe(bool_as_str=True, limit=ModelDriftConstants.MAX_NUM_ROW) 16 | 17 | columns_to_remove, metric_list = get_params_without_model(get_recipe_config()) 18 | 19 | if len(columns_to_remove) != 0: 20 | to_remove_in_original = set(original_df.columns).intersection(set(columns_to_remove)) 21 | if to_remove_in_original: 22 | original_df = original_df.drop(list(to_remove_in_original), axis=1) 23 | to_remove_in_new = set(new_df.columns).intersection(set(columns_to_remove)) 24 | if to_remove_in_new: 25 | new_df = new_df.drop(list(to_remove_in_new), axis=1) 26 | 27 | # Handle partitioning 28 | partition_cols_new_df = get_partitioning_columns(new_dataset) 29 | partition_cols_original_df = get_partitioning_columns(original_dataset) 30 
| if partition_cols_original_df:
31 |     original_df = original_df.drop(partition_cols_original_df, axis=1)
32 | if partition_cols_new_df:
33 |     new_df = new_df.drop(partition_cols_new_df, axis=1)
34 | if len(new_df.columns) == 0 or len(original_df.columns) == 0:
35 |     raise ValueError('Without the partition column, at least one of the datasets is empty.')
36 | 
37 | # Analyse the drift
38 | drifter = DriftAnalyzer()
39 | drifter.fit(new_df=new_df, original_df=original_df)
40 | 
41 | # Write the drift score and metrics
42 | timestamp = datetime.datetime.now()
43 | new_df = pd.DataFrame({ModelDriftConstants.TIMESTAMP: [timestamp]})
44 | metrics_df, column_description_dict = build_drift_metric_dataframe(drifter, metric_list, new_df, has_model_as_input=False)
45 | 
46 | output_dataset.write_with_schema(metrics_df)
47 | set_column_description(output_dataset, column_description_dict)
48 | 
49 | 

--------------------------------------------------------------------------------
/plugin.json:
--------------------------------------------------------------------------------
1 | {
2 |     "id": "model-drift",
3 |     "version": "3.1.6",
4 |     "meta": {
5 |         "label": "Model drift monitoring (deprecated)",
6 |         "description": "Get insights on data drift between two (training) datasets. ⚠️ We recommend using the native feature Model Evaluation Store to compute feature drift of a deployed model.",
7 |         "author": "Dataiku (Léo Dreyfus-Schmidt, Du Phan & Thibault Desfontaines)",
8 |         "icon": "icon-anchor",
9 |         "supportLevel": "TIER2_SUPPORT",
10 |         "licenseInfo": "Apache Software License",
11 |         "url": "https://www.dataiku.com/product/plugins/model-drift/",
12 |         "tags": ["Machine Learning"]
13 |     }
14 | }

--------------------------------------------------------------------------------
/python-lib/dku_data_drift/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from dku_data_drift.drift_analyzer import DriftAnalyzer
3 | from dku_data_drift.model_accessor import ModelAccessor
4 | from dku_data_drift.model_drift_constants import ModelDriftConstants

--------------------------------------------------------------------------------
/python-lib/dku_data_drift/dataframe_helpers.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | Simple helper functions
5 | """
6 | 
7 | import logging
8 | import sys
9 | 
10 | logger = logging.getLogger(__name__)
11 | logging.basicConfig(level=logging.INFO, format='Model Drift Plugin | %(levelname)s - %(message)s')
12 | 
13 | logger.info("Python version: {}".format(sys.version))
14 | # python3 does not have basestring
15 | try:
16 |     basestring
17 | except NameError:
18 |     basestring = str
19 | 
20 | 
21 | def schema_are_compatible(df1, df2):
22 |     """
23 |     Return True if df1 and df2 have the same columns
24 |     :param df1: Pandas dataframe
25 |     :param df2: Pandas dataframe
26 |     :return: True if both dataframes have the same set of columns
27 |     """
28 |     return set(df1.columns) == set(df2.columns)
29 | 
30 | 
31 | def not_enough_data(df, min_len=1):
32 |     """
33 |     Compare the length of the dataframe to the minimum length required.
34 |     Used to check whether the drift measure is relevant.
35 |     :param df: Input dataframe
36 |     :param min_len: Minimum number of rows required
37 |     :return: True if the dataframe has fewer than min_len rows
38 |     """
39 |     return len(df) < min_len
40 | 
41 | 
42 | def nothing_to_do(stuff):
43 |     return stuff is None
44 | 
45 | 
46 | def generic_check_compute_arguments(datetime_column, groupby_columns):
47 |     """
48 |     Check the types of the column arguments (dates are always tricky to handle).
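    Raises a ValueError when an argument does not have the expected type.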
49 |     :param datetime_column:
50 |     :param groupby_columns:
51 |     :return:
52 |     """
53 |     if not isinstance(datetime_column, basestring):
54 |         raise ValueError('datetime_column param must be string. Got: ' + str(datetime_column))
55 |     if groupby_columns:
56 |         if not isinstance(groupby_columns, list):
57 |             raise ValueError('groupby_columns param must be an array of strings. Got: ' + str(groupby_columns))
58 |         for col in groupby_columns:
59 |             if not isinstance(col, basestring):
60 |                 raise ValueError('groupby_columns param must be an array of strings. Got: ' + str(col))

--------------------------------------------------------------------------------
/python-lib/dku_data_drift/dataset_helpers.py:
--------------------------------------------------------------------------------
1 | def get_partitioning_columns(dataset):
2 |     """
3 |     Retrieve the partitioning columns of a DSS dataset.
4 |     :param dataset: DSS dataset object
5 |     :return: The partitioning columns as a list of strings
6 |     """
7 |     partitioning_settings = dataset.get_config().get('partitioning', {})
8 |     partitioning_dimensions = partitioning_settings.get('dimensions', [])
9 |     is_filesystem_partition = 'filePathPattern' in dataset.get_config().get('partitioning', {})
10 |     if len(partitioning_dimensions) > 0 and not is_filesystem_partition:
11 |         partitioning_columns = [col.get('name') for col in partitioning_dimensions]
12 |         return partitioning_columns
13 |     return []
14 | 

--------------------------------------------------------------------------------
/python-lib/dku_data_drift/drift_analyzer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import logging
3 | import numpy as np
4 | import pandas as pd
5 | import scipy.stats
6 | import statsmodels.stats.proportion
7 | from sklearn.metrics import accuracy_score
8 | from sklearn.ensemble import RandomForestClassifier
9 | from sklearn.preprocessing import KBinsDiscretizer
10 | from dku_data_drift.preprocessing import Preprocessor
11 | from dku_data_drift.model_tools import format_proba_density
12 | from dku_data_drift.model_drift_constants import ModelDriftConstants
13 | 
14 | logger = logging.getLogger(__name__)
15 | 
16 | 
17 | class DriftAnalyzer(object):
18 | 
19 |     def __init__(self, prediction_type=None):
20 |         self.prediction_type = prediction_type
21 |         self.drift_clf = RandomForestClassifier(n_estimators=100, random_state=1337, max_depth=13, min_samples_leaf=1)
22 |         self._original_df = None
23 |         self._new_df = None
24 |         self._drift_test_X = None
25 |         self._drift_test_Y = None
26 |         self._model_accessor = None
27 |         self.has_predictions = False
28 |         self.target = None
29 |         self.features_in_drift_model = None
30 |         self.sample_size = None
31 | 
32 |     def get_prediction_type(self):
33 |         return self.prediction_type
34 | 
35 |     def fit(self, new_df, model_accessor=None, original_df=None, target=None):
36 |         """
37 |         Trains a classifier that attempts to discriminate between rows from the provided dataframe and
38 |         rows from the dataset originally used to evaluate the model.
39 | 
40 |         The fitted drift classifier and the held-out drift test set are stored on the instance.
41 |         """
42 |         logger.info("Preparing the drift model...")
43 | 
44 |         if model_accessor is not None and original_df is not None:
45 |             raise ValueError('model_accessor and original_df cannot be defined at the same time. 
Please choose one of them.')
46 | 
47 |         if model_accessor is not None and original_df is None and target is None:
48 |             self._model_accessor = model_accessor
49 |             self.has_predictions = True
50 |             self.target = self._model_accessor.get_target_variable()
51 |             self.prediction_type = self._model_accessor.get_prediction_type()
52 |             original_df = self._model_accessor.get_original_test_df()
53 |             df = self.prepare_data_when_having_model(new_df, original_df)
54 |         elif model_accessor is None and original_df is not None and target is not None:
55 |             self.has_predictions = True
56 |             self.target = target
57 |             df = self.prepare_data_when_having_target(new_df, original_df)
58 |         elif model_accessor is None and original_df is not None and target is None:
59 |             df = self.prepare_data_when_without_target(new_df, original_df)
60 |         else:
61 |             raise NotImplementedError('You need to provide either a model accessor or an original df.')
62 | 
63 |         preprocessor = Preprocessor(df, target=ModelDriftConstants.ORIGIN_COLUMN)
64 |         train, test = preprocessor.get_processed_train_test()
65 |         drift_train_X = train.drop(ModelDriftConstants.ORIGIN_COLUMN, axis=1)
66 |         drift_train_Y = np.array(train[ModelDriftConstants.ORIGIN_COLUMN])
67 |         self._drift_test_X = test.drop(ModelDriftConstants.ORIGIN_COLUMN, axis=1)  # we will use them later when computing metrics
68 |         self._drift_test_Y = np.array(test[ModelDriftConstants.ORIGIN_COLUMN])
69 |         self.features_in_drift_model = drift_train_X.columns
70 | 
71 |         logger.info("Fitting the drift model...")
72 |         self.drift_clf.fit(drift_train_X, drift_train_Y)
73 | 
74 |     def prepare_data_when_having_model(self, new_df, original_df):
75 |         logger.info('Prepare data with model')
76 | 
77 |         if self.target not in original_df:
78 |             raise ValueError('The original dataset does not contain target "{}".'.format(self.target))
79 | 
80 |         self._new_df = new_df
81 |         self._original_df = original_df
82 |         original_df_without_target = original_df.drop(self.target, axis=1)
83 |         return self._prepare_data_for_drift_model(new_df, original_df_without_target)
84 | 
85 |     def prepare_data_when_having_target(self, new_df, original_df):
86 |         logger.info('Prepare data with target for drift model')
87 | 
88 |         if self.target not in new_df:
89 |             raise ValueError('The new dataset does not contain target "{}".'.format(self.target))
90 | 
91 |         if self.target not in original_df:
92 |             raise ValueError('The original dataset does not contain target "{}".'.format(self.target))
93 | 
94 |         self._new_df = new_df
95 |         self._original_df = original_df
96 |         new_df_without_target = new_df.drop(self.target, axis=1)
97 |         original_df_without_target = original_df.drop(self.target, axis=1)
98 |         return self._prepare_data_for_drift_model(new_df_without_target, original_df_without_target)
99 | 
100 |     def prepare_data_when_without_target(self, new_df, original_df):
101 |         logger.info('Prepare data without target for drift model')
102 |         return self._prepare_data_for_drift_model(new_df, original_df)
103 | 
104 |     def get_drift_metrics_for_webapp(self):
105 |         """
106 |         Return a dict of metrics in a format that is easy to use in the frontend
107 |         """
108 | 
109 |         if self.features_in_drift_model is None or self.drift_clf is None:
110 |             logger.warning('drift_features and drift_clf must be defined')
111 |             return {}
112 | 
113 |         logger.info("Computing drift metrics ...")
114 |         drift_accuracy, drift_accuracy_lower, drift_accuracy_upper, drift_test_pvalue = self.get_drift_score(output_raw_score=True)
115 |         feature_importance_metrics, riskiest_features = 
self._get_feature_importance_metrics() 116 | 117 | if self.prediction_type == ModelDriftConstants.REGRRSSION_TYPE: 118 | kde_dict = self._get_regression_prediction_kde() 119 | fugacity_metrics = {} 120 | label_list = [] 121 | elif self.prediction_type == ModelDriftConstants.CLASSIFICATION_TYPE: 122 | logger.info("Compute classification drift metrics for classification") 123 | kde_dict, fugacity_metrics, label_list = self._get_classification_prediction_metrics() 124 | else: 125 | raise ValueError('Prediction type not defined.') 126 | 127 | return {'type': self.prediction_type, 128 | 'sample_size': self.sample_size, 129 | 'feature_importance': feature_importance_metrics, 130 | 'drift_accuracy': round(drift_accuracy, 3), 131 | 'drift_accuracy_lower': round(drift_accuracy_lower, 3), 132 | 'drift_accuracy_upper': round(drift_accuracy_upper, 3), 133 | 'drift_test_pvalue': round(drift_test_pvalue, 5), 134 | 'kde': kde_dict, 135 | 'fugacity': fugacity_metrics, 136 | 'label_list': label_list, 137 | 'riskiest_features': riskiest_features} 138 | 139 | def _get_classification_prediction_metrics(self): 140 | 141 | if not self.has_predictions: 142 | raise ValueError('DriftAnalyzer needs a target.') 143 | 144 | if self.prediction_type != ModelDriftConstants.CLASSIFICATION_TYPE: 145 | raise ValueError('Can not use this function with a {} model.'.format(self.prediction_type)) 146 | 147 | if self._model_accessor is not None: 148 | prediction_dict = self.get_predictions_from_original_model(limit=ModelDriftConstants.PREDICTION_TEST_SIZE) 149 | predictions_by_class = {} 150 | for label in prediction_dict.get(ModelDriftConstants.FROM_ORIGINAL).columns: 151 | if 'proba_' in label: 152 | original_proba = np.around(prediction_dict.get(ModelDriftConstants.FROM_ORIGINAL)[label].values, 2).tolist() 153 | new_proba = np.around(prediction_dict.get(ModelDriftConstants.FROM_NEW)[label].values, 2).tolist() 154 | predictions_by_class[label] = {ModelDriftConstants.FROM_ORIGINAL: original_proba, ModelDriftConstants.FROM_NEW: new_proba} 155 | kde_dict = {} 156 | for label in predictions_by_class.keys(): 157 | kde_original = format_proba_density(predictions_by_class.get(label).get(ModelDriftConstants.FROM_ORIGINAL)) 158 | kde_new = format_proba_density(predictions_by_class.get(label).get(ModelDriftConstants.FROM_NEW)) 159 | cleaned_label = label.replace('proba_', ModelDriftConstants.CLASS) 160 | kde_dict[cleaned_label] = {ModelDriftConstants.FROM_ORIGINAL: kde_original, ModelDriftConstants.FROM_NEW: kde_new} 161 | fugacity = self.get_classification_fugacity(reformat=True) 162 | label_list = [label for label in fugacity[0].keys() if label != 'source'] 163 | 164 | return kde_dict, fugacity, label_list 165 | else: 166 | fugacity = self.get_classification_fugacity() 167 | label_list = fugacity[ModelDriftConstants.CLASS].unique() 168 | return None, fugacity, label_list 169 | 170 | def _get_regression_prediction_kde(self): 171 | 172 | if not self.has_predictions: 173 | raise ValueError('No target was defined at fit phase.') 174 | 175 | if self.prediction_type != ModelDriftConstants.REGRRSSION_TYPE: 176 | raise ValueError('Can not use this function with a {} model.'.format(self.prediction_type)) 177 | 178 | prediction_dict = self.get_predictions_from_original_model(limit=ModelDriftConstants.PREDICTION_TEST_SIZE) 179 | original_serie = prediction_dict.get(ModelDriftConstants.FROM_ORIGINAL).values 180 | new_serie = prediction_dict.get(ModelDriftConstants.FROM_NEW).values 181 | min_support = float(min(min(original_serie), 
min(new_serie))) 182 | max_support = float(max(max(original_serie), max(new_serie))) 183 | logger.info("Computed histogram support: [{},{}]".format(min_support, max_support)) 184 | kde_original = format_proba_density(original_serie, min_support=min_support, max_support=max_support) 185 | kde_new = format_proba_density(new_serie, min_support=min_support, max_support=max_support) 186 | kde_dict= { 187 | 'Prediction': { 188 | ModelDriftConstants.FROM_ORIGINAL: kde_original, 189 | ModelDriftConstants.FROM_NEW: kde_new, 190 | "min_support": min_support, 191 | "max_support": max_support 192 | } 193 | } 194 | return kde_dict 195 | 196 | def get_regression_fugacity(self): 197 | """ 198 | TODO refactor 199 | 200 | """ 201 | kde_dict = self._get_regression_prediction_kde() 202 | new = kde_dict.get('Prediction').get('new') 203 | old = kde_dict.get('Prediction').get('original') 204 | old_arr = np.array(old).T 205 | df = pd.DataFrame(new, columns=['val_new', 'new_density']) 206 | df['val_old'] = old_arr[0] 207 | df['old_density'] = old_arr[1] 208 | kb = KBinsDiscretizer(n_bins=10, encode='ordinal') 209 | df['old_bin'] = kb.fit_transform(df['val_old'].values.reshape(-1, 1)).reshape(-1, ).astype(int) 210 | df['new_bin'] = kb.transform(df['val_new'].values.reshape(-1, 1)).reshape(-1, ).astype(int) 211 | full_density_old = df.old_density.sum() 212 | full_density_new = df.new_density.sum() 213 | fuga_old = 100 * df.groupby('old_bin').old_density.sum() / full_density_old 214 | fuga_new = 100 * df.groupby('new_bin').new_density.sum() / full_density_new 215 | 216 | fuga_old_df = pd.DataFrame(fuga_old).reset_index() 217 | fuga_old_df['old_bin'] = fuga_old_df['old_bin'].map(lambda x: 'fugacity_decile_{}'.format(x)) 218 | old_fugacity_values = fuga_old_df.set_index('old_bin').to_dict().get('old_density') 219 | 220 | fuga_new_df = pd.DataFrame(fuga_new).reset_index() 221 | fuga_new_df['new_bin'] = fuga_new_df['new_bin'].map(lambda x: 'fugacity_decile_{}'.format(x)) 222 | new_fugacity_values = fuga_new_df.set_index('new_bin').to_dict().get('new_density') 223 | fugacity = {} 224 | for k, v in old_fugacity_values.items(): 225 | fugacity[k] = {ModelDriftConstants.ORIGINAL_DATASET: v, ModelDriftConstants.NEW_DATASET: new_fugacity_values.get(k)} 226 | 227 | fugacity_relative_change_values = np.around(100*(fuga_new - fuga_old)/fuga_old, decimals=3) 228 | fuga_relative_change_df = pd.DataFrame(fugacity_relative_change_values.to_dict(), index=[0]) 229 | fuga_diff_columns = [ModelDriftConstants.FUGACITY_RELATIVE_CHANGE_REGRESSION_LABEL.format(col) for col in fuga_relative_change_df.columns] 230 | fuga_relative_change_df.columns = fuga_diff_columns 231 | 232 | fugacity_relative_change = fuga_relative_change_df.iloc[0].to_dict() 233 | 234 | e = '-inf' 235 | decile_interval_description = [] 236 | for edge in kb.bin_edges_[0][1:-1]: 237 | decile_interval_description.append('from {0} to {1}'.format(e, round(edge, 2))) 238 | e = round(edge, 3) 239 | 240 | decile_interval_description.append('from {0} to +inf'.format(round(kb.bin_edges_[0][-2], 2))) 241 | return fugacity, fugacity_relative_change, decile_interval_description 242 | 243 | 244 | def _prepare_data_for_drift_model(self, new_df, original_df, min_num_row=ModelDriftConstants.MIN_NUM_ROWS, max_num_row=ModelDriftConstants.MAX_NUM_ROW): 245 | """ 246 | Sampling function so that original test set and new test set has the same ratio in the drift training set 247 | For now only do top n sampling, with max n = MAX_NUM_ROW 248 | 249 | :return: a dataframe with data source target 
(original vs new)
250 |         """
251 | 
252 |         original_df[ModelDriftConstants.ORIGIN_COLUMN] = ModelDriftConstants.FROM_ORIGINAL
253 |         new_df[ModelDriftConstants.ORIGIN_COLUMN] = ModelDriftConstants.FROM_NEW
254 | 
255 |         logger.info("Rebalancing data:")
256 |         number_of_rows = min(original_df.shape[0], new_df.shape[0], max_num_row)
257 |         self.sample_size = number_of_rows
258 |         logger.info(" - original dataset had %s rows, new dataset has %s. Selecting the first %s for each." % (original_df.shape[0], new_df.shape[0], number_of_rows))
259 | 
260 |         df = pd.concat([original_df.head(number_of_rows), new_df.head(number_of_rows)], sort=False)
261 | 
262 |         if self._model_accessor is not None:
263 |             selected_features = [ModelDriftConstants.ORIGIN_COLUMN] + self._model_accessor.get_selected_features()
264 |         else:
265 |             selected_features = original_df.columns
266 | 
267 | 
268 |         logger.info('Features used for drift models: {}'.format(selected_features))
269 |         missing_features = set(selected_features) - set(new_df.columns)
270 |         if len(missing_features) > 0:
271 |             raise ValueError('Missing column(s) in the new dataframe: {}'.format(', '.join(list(missing_features))))
272 | 
273 |         return df.loc[:, selected_features]
274 | 
275 |     def get_drift_feature_importance(self, cumulative_percentage_threshold=ModelDriftConstants.FEAT_IMP_CUMULATIVE_PERCENTAGE_THRESHOLD):
276 |         feature_importance = []
277 |         for feature_name, feat_importance in zip(self.features_in_drift_model, self.drift_clf.feature_importances_):
278 |             feature_importance.append({
279 |                 ModelDriftConstants.FEATURE: feature_name,
280 |                 ModelDriftConstants.IMPORTANCE: 100 * feat_importance / sum(self.drift_clf.feature_importances_)
281 |             })
282 | 
283 |         dfx = pd.DataFrame(feature_importance).sort_values(by=ModelDriftConstants.IMPORTANCE, ascending=False).reset_index(drop=True)
284 |         dfx[ModelDriftConstants.CUMULATIVE_IMPORTANCE] = dfx[ModelDriftConstants.IMPORTANCE].cumsum()
285 |         dfx_top = dfx.loc[dfx[ModelDriftConstants.CUMULATIVE_IMPORTANCE] <= cumulative_percentage_threshold]
286 |         return dfx_top.rename_axis(ModelDriftConstants.RANK).reset_index().set_index(ModelDriftConstants.FEATURE)
287 | 
288 |     def get_original_feature_importance(self, cumulative_percentage_threshold=ModelDriftConstants.FEAT_IMP_CUMULATIVE_PERCENTAGE_THRESHOLD):
289 |         if self._model_accessor is not None:
290 |             return self._model_accessor.get_feature_importance(cumulative_percentage_threshold)
291 |         else:
292 |             raise ValueError('DriftAnalyzer needs a ModelAccessor as input.')
293 | 
294 |     def get_riskiest_features(self, drift_feature_importance=None, original_feature_importance=None, ratio_threshold=ModelDriftConstants.RISKIEST_FEATURES_RATIO_THRESHOLD):
295 |         """
296 |         Return a list of features that users should check (i.e. 
those in the top-right quadrant of the feature importance plot)
297 |         If either feature importance is not computed, compute them here
298 | 
299 |         :param drift_feature_importance:
300 |         :param original_feature_importance:
301 |         :return:
302 |         """
303 |         if drift_feature_importance is None:
304 |             drift_feature_importance = self.get_drift_feature_importance()
305 |         if original_feature_importance is None:
306 |             original_feature_importance = self.get_original_feature_importance()
307 | 
308 |         original_feat_imp_threshold = ratio_threshold * max(original_feature_importance[ModelDriftConstants.IMPORTANCE])
309 |         drift_feat_imp_threshold = ratio_threshold * max(drift_feature_importance[ModelDriftConstants.IMPORTANCE])
310 |         top_original_features = original_feature_importance[original_feature_importance[ModelDriftConstants.IMPORTANCE] > original_feat_imp_threshold].index
311 |         top_drift_features = drift_feature_importance[drift_feature_importance[ModelDriftConstants.IMPORTANCE] > drift_feat_imp_threshold].index
312 | 
313 |         return list(set(top_original_features).intersection(top_drift_features))
314 | 
315 |     def _get_feature_importance_metrics(self):
316 |         """
317 |         For visualisation purposes
318 | 
319 |         :return:
320 |         """
321 |         original_feature_importance_df = self.get_original_feature_importance()
322 |         drift_feature_importance_df = self.get_drift_feature_importance()
323 |         topn_drift_feature = drift_feature_importance_df.to_dict()[ModelDriftConstants.IMPORTANCE]
324 |         topn_original_feature = original_feature_importance_df.to_dict()[ModelDriftConstants.IMPORTANCE]
325 |         feature_importance_metrics = []
326 |         for feature in set(topn_original_feature.keys()).union(set(topn_drift_feature.keys())):
327 |             drift_feat_rank = topn_drift_feature.get(feature)
328 |             original_feat_rank = topn_original_feature.get(feature)
329 |             if drift_feat_rank is None:
330 |                 logger.warning('Feature {} does not exist in the most important features of the drift model.'.format(feature))
331 |             if original_feat_rank is None:
332 |                 logger.warning('Feature {} does not exist in the most important features of the original model.'.format(feature))
333 |             feature_importance_metrics.append({
334 |                 'original_model': original_feat_rank if original_feat_rank else 0.01,
335 |                 'drift_model': drift_feat_rank if drift_feat_rank else 0.01,
336 |                 'feature': feature
337 |             })
338 | 
339 |         riskiest_feature = self.get_riskiest_features(drift_feature_importance=drift_feature_importance_df, original_feature_importance=original_feature_importance_df)
340 |         return feature_importance_metrics, riskiest_feature
341 | 
342 |     def get_drift_score(self, output_raw_score=False, confidence_level=0.95):
343 | 
344 |         """
345 |         The drift score is the accuracy of the drift model (with an exponential transform by default).
346 | 
347 |         :param output_raw_score: if True, return the raw accuracy with its confidence interval and p-value
348 |         :param confidence_level: confidence level of the interval around the drift accuracy
349 | 
350 |         :return:
351 |         """
352 |         predicted_Y = self.drift_clf.predict(self._drift_test_X)
353 |         test_Y = pd.Series(self._drift_test_Y)
354 |         drift_accuracy = accuracy_score(test_Y, predicted_Y)
355 | 
356 |         # 95% confidence interval around accuracy
357 |         nb_correct = sum(test_Y == predicted_Y)
358 |         nb_total = len(test_Y)
359 |         drift_accuracy_lower, drift_accuracy_upper = statsmodels.stats.proportion.proportion_confint(
360 |             nb_correct, nb_total, method="wilson", alpha=(1 - confidence_level)
361 |         )
362 | 
363 |         # H0: there is no drift (== domain classifier is correct 50% of the time)
364 |         drift_test_pvalue = scipy.stats.binom_test(nb_correct, nb_total, p=.5, alternative='greater')
365 | 
366 |         if 
output_raw_score:
367 |             return drift_accuracy, drift_accuracy_lower, drift_accuracy_upper, drift_test_pvalue
368 |         else:
369 |             exponential_function = lambda x: round(np.exp(1 - 1 / (np.power(x, 2.5))), 2)
370 |             return exponential_function(drift_accuracy)  # make the score look more intuitive from the user's point of view
371 | 
372 |     def get_predictions_from_original_model(self, limit=ModelDriftConstants.MAX_NUM_ROW):
373 |         """
374 |         Predictions on the test set of original and new data
375 | 
376 |         The result of model_accessor.predict() is a dataframe prediction|proba_0|proba_1|...
377 |         """
378 |         if not self.has_predictions:
379 |             raise ValueError('No target was defined at fit phase.')
380 | 
381 |         if self._model_accessor is not None:
382 |             original_prediction_df = self._model_accessor.predict(self._original_df[:limit])
383 |             original_prediction_df = original_prediction_df.rename(columns={'prediction': self.target})
384 |             new_predicton_df = self._model_accessor.predict(self._new_df[:limit])
385 |             new_predicton_df = new_predicton_df.rename(columns={'prediction': self.target})
386 | 
387 |             if self._model_accessor.get_prediction_type() == ModelDriftConstants.CLASSIFICATION_TYPE:
388 |                 proba_columns = [col for col in original_prediction_df.columns if 'proba_' in col]
389 |                 # move to % scale, it plays nicer with d3 ...
390 |                 original_prediction_df.loc[:, proba_columns] = np.around(original_prediction_df.loc[:, proba_columns] * 100)
391 |                 new_predicton_df.loc[:, proba_columns] = np.around(new_predicton_df.loc[:, proba_columns] * 100)
392 | 
393 |             return {ModelDriftConstants.FROM_ORIGINAL: original_prediction_df, ModelDriftConstants.FROM_NEW: new_predicton_df}
394 | 
395 |         else:  # no proba columns
396 |             original_prediction_df = self._original_df.loc[:, [self.target]]
397 |             new_prediciton_df = self._new_df.loc[:, [self.target]]
398 |             return {ModelDriftConstants.FROM_ORIGINAL: original_prediction_df, ModelDriftConstants.FROM_NEW: new_prediciton_df}
399 | 
400 | 
401 |     def get_classification_fugacity(self, reformat=False):
402 |         """
403 |         For classification only, this computes the ratio of each predicted label.
404 | 
405 |         :param reformat: if True, return the format expected by the model view
406 |         :return:
407 |         """
408 |         if self.prediction_type != ModelDriftConstants.CLASSIFICATION_TYPE:
409 |             raise ValueError('This function is only for CLASSIFICATION. Got prediction type: {0}.'.format(self.prediction_type))
410 | 
411 |         if not self.has_predictions:
412 |             raise ValueError('No target was defined in the fit phase.')
413 | 
414 |         prediction_dict = self.get_predictions_from_original_model(limit=ModelDriftConstants.PREDICTION_TEST_SIZE)
415 |         original_prediction_df = prediction_dict.get(ModelDriftConstants.FROM_ORIGINAL)
416 |         new_prediciton_df = prediction_dict.get(ModelDriftConstants.FROM_NEW)
417 | 
418 |         if reformat:  # for the model view
419 |             original_fugacity = (100 * original_prediction_df[self.target].value_counts(normalize=True)).round(decimals=2).to_dict()
420 |             new_fugacity = (100 * new_prediciton_df[self.target].value_counts(normalize=True)).round(decimals=2).to_dict()
421 |             fugacity = []
422 |             for key in original_fugacity.keys():
423 |                 temp_fugacity = {}
424 |                 new_key = "Predicted {} (%)".format(key)
425 |                 temp_fugacity[' Score'] = new_key
426 |                 temp_fugacity['Test dataset'] = original_fugacity.get(key, 0.)
427 |                 temp_fugacity['Input dataset'] = new_fugacity.get(key, 0.)
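                # each entry: {' Score': 'Predicted <label> (%)', 'Test dataset': <original %>, 'Input dataset': <new %>}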
428 | fugacity.append(temp_fugacity) 429 | return fugacity 430 | else: 431 | original_fugacity = (100 * original_prediction_df[self.target].value_counts(normalize=True)).round(decimals=2).rename_axis(ModelDriftConstants.CLASS).reset_index(name=ModelDriftConstants.PERCENTAGE) 432 | new_fugacity = (100 * new_prediciton_df[self.target].value_counts(normalize=True)).round(decimals=2).rename_axis(ModelDriftConstants.CLASS).reset_index(name=ModelDriftConstants.PERCENTAGE) 433 | fugacity_relative_change = {} 434 | fugacity = {} 435 | 436 | for label in original_fugacity[ModelDriftConstants.CLASS].unique(): 437 | new_value = new_fugacity[new_fugacity[ModelDriftConstants.CLASS] == label][ModelDriftConstants.PERCENTAGE].values[0] 438 | original_value = original_fugacity[original_fugacity[ModelDriftConstants.CLASS] == label][ModelDriftConstants.PERCENTAGE].values[0] 439 | fugacity_diff = 100 * float(new_value - original_value)/float(original_value) 440 | new_label_relative = ModelDriftConstants.FUGACITY_RELATIVE_CHANGE_CLASSIF_LABEL.format(label) 441 | fugacity_relative_change[new_label_relative] = round(fugacity_diff, 3) 442 | new_label = ModelDriftConstants.FUGACITY_CLASSIF_LABEL.format(label) 443 | fugacity[new_label] = {ModelDriftConstants.ORIGINAL_DATASET: original_value, ModelDriftConstants.NEW_DATASET: new_value} 444 | return fugacity, fugacity_relative_change -------------------------------------------------------------------------------- /python-lib/dku_data_drift/model_accessor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import pandas as pd 4 | from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor 5 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 6 | from sklearn.calibration import CalibratedClassifierCV 7 | from dku_data_drift.model_tools import SurrogateModel 8 | from dku_data_drift.model_drift_constants import ModelDriftConstants 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | ALGORITHMS_WITH_VARIABLE_IMPORTANCE = [RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, DecisionTreeClassifier, 13 | RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, DecisionTreeRegressor] 14 | 15 | 16 | class ModelAccessor(object): 17 | def __init__(self, model_handler=None): 18 | self.model_handler = model_handler 19 | 20 | def get_prediction_type(self): 21 | """ 22 | Wrap the prediction type accessor of the model 23 | """ 24 | if self.model_handler.get_prediction_type() in ModelDriftConstants.DKU_CLASSIFICATION_TYPE: 25 | return ModelDriftConstants.CLASSIFICATION_TYPE 26 | elif ModelDriftConstants.REGRRSSION_TYPE in self.model_handler.get_prediction_type(): 27 | return ModelDriftConstants.REGRRSSION_TYPE 28 | else: 29 | return ModelDriftConstants.CLUSTERING_TYPE 30 | 31 | def get_target_variable(self): 32 | """ 33 | Return the name of the target variable 34 | """ 35 | return self.model_handler.get_target_variable() 36 | 37 | def get_original_test_df(self, limit=ModelDriftConstants.MAX_NUM_ROW): 38 | try: 39 | return self.model_handler.get_test_df()[0][:limit] 40 | except Exception as e: 41 | logger.warning('Can not retrieve original test set: {}. 
The plugin will take the whole original dataset.'.format(e))
42 |             return self.model_handler.get_full_df()[0][:limit]
43 | 
44 |     def get_per_feature(self):
45 |         return self.model_handler.get_per_feature()
46 | 
47 |     def get_predictor(self):
48 |         return self.model_handler.get_predictor()
49 | 
50 |     def get_feature_importance(self, cumulative_percentage_threshold=ModelDriftConstants.FEAT_IMP_CUMULATIVE_PERCENTAGE_THRESHOLD):
51 |         """
52 |         :param cumulative_percentage_threshold: only return the top n features whose sum of importance reaches this threshold
53 |         :return:
54 |         """
55 |         if self._algorithm_is_tree_based():
56 |             predictor = self.get_predictor()
57 |             clf = predictor._clf
58 |             feature_names = predictor.get_features()
59 |             feature_importances = clf.feature_importances_
60 | 
61 |         else:  # use surrogate model
62 |             logger.info('Fitting surrogate model ...')
63 |             surrogate_model = SurrogateModel(self.get_prediction_type())
64 |             original_test_df = self.get_original_test_df()
65 |             predictions_on_original_test_df = self.get_predictor().predict(original_test_df)
66 |             surrogate_df = original_test_df[self.get_selected_features()]
67 |             surrogate_df[ModelDriftConstants.SURROGATE_TARGET] = predictions_on_original_test_df['prediction']
68 |             surrogate_model.fit(surrogate_df, ModelDriftConstants.SURROGATE_TARGET)
69 |             feature_names = surrogate_model.get_features()
70 |             feature_importances = surrogate_model.clf.feature_importances_
71 | 
72 |         feature_importance = []
73 |         for feature_name, feat_importance in zip(feature_names, feature_importances):
74 |             feature_importance.append({
75 |                 ModelDriftConstants.FEATURE: feature_name,
76 |                 ModelDriftConstants.IMPORTANCE: 100 * feat_importance / sum(feature_importances)
77 |             })
78 | 
79 |         dfx = pd.DataFrame(feature_importance).sort_values(by=ModelDriftConstants.IMPORTANCE, ascending=False).reset_index(drop=True)
80 |         dfx[ModelDriftConstants.CUMULATIVE_IMPORTANCE] = dfx[ModelDriftConstants.IMPORTANCE].cumsum()
81 |         dfx_top = dfx.loc[dfx[ModelDriftConstants.CUMULATIVE_IMPORTANCE] <= cumulative_percentage_threshold]
82 |         return dfx_top.rename_axis(ModelDriftConstants.RANK).reset_index().set_index(ModelDriftConstants.FEATURE)
83 | 
84 | 
85 |     def get_selected_features(self):
86 |         selected_features = []
87 |         for feat, feat_info in self.get_per_feature().items():
88 |             if feat_info.get('role') == 'INPUT':
89 |                 selected_features.append(feat)
90 |         return selected_features
91 | 
92 |     def predict(self, df):
93 |         return self.get_predictor().predict(df)
94 | 
95 |     def _algorithm_is_tree_based(self):
96 |         predictor = self.get_predictor()
97 |         algo = predictor._clf
98 |         if isinstance(algo, CalibratedClassifierCV):
99 |             logger.info('Algorithm is CalibratedClassifierCV.')
100 |             return False
101 |         for algorithm in ALGORITHMS_WITH_VARIABLE_IMPORTANCE:
102 |             if isinstance(algo, algorithm):
103 |                 logger.info('Algorithm is tree-based: {}'.format(algo))
104 |                 return True
105 |             elif predictor.params.modeling_params.get('algorithm') == 'XGBOOST_CLASSIFICATION':
106 |                 logger.info('Algorithm is tree-based: XGBOOST_CLASSIFICATION')
107 |                 return True
108 |             elif predictor.params.modeling_params.get('algorithm') == 'XGBOOST_REGRESSION':
109 |                 logger.info('Algorithm is tree-based: XGBOOST_REGRESSION')
110 |                 return True
111 |         return False

--------------------------------------------------------------------------------
/python-lib/dku_data_drift/model_drift_constants.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | class 
ModelDriftConstants(object):
4 | 
5 |     TIMESTAMP = 'timestamp'
6 |     MODEL_ID = 'model_id'
7 |     VERSION_ID = 'version_id'
8 |     TRAIN_DATE = 'train_date'
9 |     DRIFT_SCORE = 'drift_model_accuracy'
10 |     DRIFT_SCORE_DEFINITION = 'In order to detect data drift, we train a random forest classifier (the drift model) to discriminate the new data set from the test set. If this classifier has accuracy > 0.5, it implies that test data and new data can be distinguished and that you are observing data drift. You may consider retraining your model in that situation.'
11 |     BINOMIAL_TEST = 'binomial_test'
12 |     BINOMIAL_TEST_DEFINITION = 'The hypothesis tested is that there is no drift, in which case the expected drift model accuracy is 0.5 (datasets indistinguishable). The observed accuracy might deviate from this expectation and the binomial test evaluates whether this deviation is statistically significant, modelling the number of correct predictions as a random variable drawn from a binomial distribution. The p-value is the probability of observing this particular accuracy (or larger) under the hypothesis of absent drift. If this probability is lower than the significance level (e.g. 5%), the situation of absent drift is unlikely: the hypothesis of no drift is rejected, triggering a drift detection. The significance level indicates the rate of falsely-detected drifts we are ready to accept from the test.'
13 |     BINOMIAL_P_VALUE = 'binomial_test_p_value'
14 |     BINOMIAL_LOWER_BOUND = 'accuracy_lower_bound'
15 |     BINOMIAL_LOWER_BOUND_DEFINITION = 'Confidence interval lower bound for the accuracy of the domain classifier'
16 |     BINOMIAL_UPPER_BOUND = 'accuracy_upper_bound'
17 |     BINOMIAL_UPPER_BOUND_DEFINITION = 'Confidence interval upper bound for the accuracy of the domain classifier'
18 | 
19 |     FUGACITY = 'fugacity'
20 |     FUGACITY_CLASSIF_DEFINITION = 'Proportion of samples predicted (in %) in each class when scoring on both the original test and the new input dataset.'
21 |     FUGACITY_REGRESSION_DEFINITION = 'Proportion of samples predicted (in %) in each decile when scoring on both the original test and the new input dataset.\n\n'
22 |     FUGACITY_RELATIVE_CHANGE = 'fugacity_relative_change'
23 |     FUGACITY_RELATIVE_CHANGE_CLASSIF_DEFINITION = 'Relative change (in %) in each class with respect to the original fugacity value.\n\nFormula: 100*(new_fugacity - original_fugacity)/original_fugacity'
24 |     FUGACITY_RELATIVE_CHANGE_REGRESSION_DEFINITION = 'Relative change (in %) in each decile with respect to the original fugacity value.\n\nFormula: 100*(new_fugacity - original_fugacity)/original_fugacity\n\n'
25 |     RISKIEST_FEATURES = 'riskiest_features'
26 |     RISKIEST_FEATURES_DEFINITION = 'If the drift score is medium/high (above 0.1), we recommend checking these features.\nA feature is considered risky if it is both in the top 40% of the most drifted features and in the top 40% of the most important features in the original model.'
27 |     MOST_DRIFTED_FEATURES = 'most_drifted_features'
28 | 
29 |     NUMBER_OF_DRIFTED_FEATURES = 20
30 |     MOST_DRIFTED_FEATURES_DEFINITION = 'When the drift score is medium/high (above 0.1), this is the list of features that have drifted the most, with their % of importance (max {0} features).'.format(NUMBER_OF_DRIFTED_FEATURES)
31 |     MOST_IMPORTANT_FEATURES = 'most_important_features_in_deployed_model'
32 |     MOST_IMPORTANT_FEATURES_DEFINTIION = 'Most important features in the deployed model, with their % of importance (max 20 features).'
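    # FEATURE_IMPORTANCE below, like DRIFT_SCORE, FUGACITY and RISKIEST_FEATURES above, matches a "value" entry of the Metrics multiselect in the recipes' recipe.json files.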
33 |     FEATURE_IMPORTANCE = 'feature_importance'
34 | 
35 |     ORIGIN_COLUMN = '__dku_row_origin__'  # name for the column that will contain the information of where the row is from (original test dataset or new dataframe)
36 |     FROM_ORIGINAL = 'original'
37 |     FROM_NEW = 'new'
38 |     MIN_NUM_ROWS = 500
39 |     MAX_NUM_ROW = 100000
40 |     CUMULATIVE_PERCENTAGE_THRESHOLD = 90
41 |     PREDICTION_TEST_SIZE = 100000
42 |     SURROGATE_TARGET = "_dku_predicted_label_"
43 | 
44 |     REGRRSSION_TYPE = 'REGRESSION'
45 |     CLASSIFICATION_TYPE = 'CLASSIFICATION'
46 |     CLUSTERING_TYPE = 'CLUSTERING'
47 |     DKU_CLASSIFICATION_TYPE = ['BINARY_CLASSIFICATION', 'MULTICLASS']
48 | 
49 | 
50 |     FEAT_IMP_CUMULATIVE_PERCENTAGE_THRESHOLD = 95
51 |     RISKIEST_FEATURES_RATIO_THRESHOLD = 0.65
52 | 
53 |     FEATURE = 'feature'
54 |     IMPORTANCE = 'importance'
55 |     CUMULATIVE_IMPORTANCE = 'cumulative_importance'
56 |     RANK = 'rank'
57 |     CLASS = 'class'
58 |     PERCENTAGE = 'percentage'
59 |     ORIGINAL_DATASET = 'original_dataset'
60 |     NEW_DATASET = 'new_dataset'
61 |     FUGACITY_RELATIVE_CHANGE_CLASSIF_LABEL = 'fugacity_relative_change_of_class_{0}'
62 |     FUGACITY_RELATIVE_CHANGE_REGRESSION_LABEL = 'fugacity_relative_change_decile_{0}'
63 |     FUGACITY_CLASSIF_LABEL = 'fugacity_class_{0}'
64 | 
65 | 
66 |     @staticmethod
67 |     def get_supported_metrics():
68 |         return ModelDriftConstants.DRIFT_SCORE, ModelDriftConstants.FUGACITY, ModelDriftConstants.FEATURE_IMPORTANCE, ModelDriftConstants.RISKIEST_FEATURES

--------------------------------------------------------------------------------
/python-lib/dku_data_drift/model_tools.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import logging
3 | import numpy as np
4 | import math
5 | from sklearn.neighbors import KernelDensity
6 | from sklearn.metrics import roc_auc_score
7 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
8 | from dku_data_drift.preprocessing import Preprocessor
9 | from dku_data_drift.model_drift_constants import ModelDriftConstants
10 | 
11 | logger = logging.getLogger(__name__)
12 | 
13 | 
14 | def mroc_auc_score(y_true, y_predictions, sample_weight=None):
15 |     """ Returns an AUC score. Handles multi-class.
16 |     For multi-class, the AUC score is in fact the MAUC
17 |     score described in
18 |     David J. Hand and Robert J. Till. 2001.
19 |     A Simple Generalisation of the Area Under the ROC Curve
20 |     for Multiple Class Classification Problems.
21 |     Mach. Learn. 45, 2 (October 2001), 171-186.
22 |     DOI=10.1023/A:1010920819831
23 |     http://dx.doi.org/10.1023/A:1010920819831
24 |     """
25 |     (nb_rows, max_nb_classes) = y_predictions.shape
26 |     # Today, it may happen that if a class appears only once in a dataset
27 |     # it can appear in the train and not in the validation set.
28 |     # In this case it will not be in y_true and
29 |     # y_predictions.nb_cols is not exactly the number of classes
30 |     # to consider when computing the mroc_auc_score.
31 |     classes = np.unique(y_true)
32 |     nb_classes = len(classes)
33 |     if nb_classes > max_nb_classes:
34 |         raise ValueError("Your test set contained more classes than the training set. 
Check your dataset or try a different split.") 35 | 36 | if nb_classes < 2: 37 | raise ValueError("Ended up with fewer than two classes in the validation set.") 38 | 39 | if nb_classes == 2: 40 | classes = classes.tolist() 41 | y_true = y_true.map(lambda c: classes.index(c)) # ensure classes are [0 1] 42 | return roc_auc_score(y_true, y_predictions[:, 1], sample_weight=sample_weight) 43 | 44 | def A(i, j): 45 | """ 46 | Returns an asymmetric proximity metric, written A(i | j) 47 | in the paper. 48 | Summing over all (i, j) with i != j 49 | restores the symmetry. 50 | """ 51 | mask = np.in1d(y_true, np.array([i, j])) 52 | y_true_i = y_true[mask] == i 53 | y_pred_i = y_predictions[mask][:, i] 54 | if sample_weight is not None: 55 | sample_weight_i = sample_weight[mask] 56 | else: 57 | sample_weight_i = None 58 | return roc_auc_score(y_true_i, y_pred_i, sample_weight=sample_weight_i) 59 | 60 | C = 1.0 / (nb_classes * (nb_classes - 1)) 61 | # TODO: double check 62 | return C * sum( 63 | A(i, j) 64 | for i in classes 65 | for j in classes 66 | if i != j) 67 | 68 | def format_proba_density(data, sample_weight=None, min_support=0, max_support=100): 69 | """ 70 | Estimate the density distribution of the target 1-dimensional data array. 71 | The support arguments (inf and sup) should be: 72 | - 0 and 1 for classification 73 | - min(data) and max(data) for regression 74 | 75 | Output format of the density 76 | >>> list(zip([1, 2, 3], [0.3, 0.3, 0.4])) 77 | 78 | :param data: Target data of the model 79 | :param sample_weight: Optional sample weights for the density estimation 80 | :param min_support: Lower boundary of the support for density estimation 81 | :param max_support: Upper boundary of the support for density estimation 82 | :return: List of (x, density) pairs over the support 83 | """ 84 | data = np.array(data) 85 | if len(data) == 0: 86 | return [] 87 | # Heuristic for the bandwidth determination (Silverman's rule of thumb) 88 | h = 1.06 * np.std(data) * math.pow(len(data), -.2) 89 | if h <= 0: 90 | h = 0.06 91 | if len(np.unique(data)) == 1: 92 | sample_weight = None 93 | # Definition of the support of the estimate 94 | X_plot = np.linspace(min_support, max_support, 500, dtype=float)[:, np.newaxis] 95 | kde = KernelDensity(kernel='gaussian', bandwidth=h).fit(data.reshape(-1, 1), sample_weight=sample_weight) 96 | Y_plot = [v if not np.isnan(v) else 0 for v in np.exp(kde.score_samples(X_plot))] 97 | return list(zip(X_plot.ravel(), Y_plot)) 98 | 99 | class SurrogateModel(object): 100 | """ 101 | When the chosen saved model uses a non-tree-based algorithm (and thus does not expose feature importance), we fit this surrogate model 102 | on the predictions of the original model to be able to retrieve the feature importance information. 103 | 104 | """ 105 | 106 | def __init__(self, prediction_type): 107 | self.check(prediction_type) 108 | self.feature_names = None 109 | self.target = None 110 | self.prediction_type = prediction_type 111 | #TODO should we define some params of RF to avoid long computation ?
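            # --- Editorial sketch answering the TODO above; the values are illustrative
            # assumptions, not the plugin's defaults: capping the forest's size would keep
            # the surrogate fit fast on large datasets, e.g.
            #     self.clf = RandomForestClassifier(n_estimators=100, max_depth=10,
            #                                       min_samples_leaf=20, n_jobs=-1,
            #                                       random_state=1407)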
112 | if prediction_type == ModelDriftConstants.CLASSIFICATION_TYPE: 113 | self.clf = RandomForestClassifier(random_state=1407) 114 | else: 115 | self.clf = RandomForestRegressor(random_state=1407) 116 | 117 | def check(self, prediction_type): 118 | if prediction_type not in [ModelDriftConstants.CLASSIFICATION_TYPE, ModelDriftConstants.REGRRSSION_TYPE]: 119 | raise ValueError('Prediction type must be either CLASSIFICATION or REGRESSION.') 120 | 121 | def get_features(self): 122 | return self.feature_names 123 | 124 | def fit(self, df, target): 125 | preprocessor = Preprocessor(df, target) 126 | train, test = preprocessor.get_processed_train_test() 127 | train_X = train.drop(target, axis=1) 128 | train_Y = train[target] 129 | self.clf.fit(train_X, train_Y) 130 | self.feature_names = train_X.columns -------------------------------------------------------------------------------- /python-lib/dku_data_drift/preprocessing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import random 5 | from collections import Counter 6 | from datetime import datetime 7 | import logging 8 | import numpy as np 9 | 10 | logger = logging.getLogger(__name__) 11 | EPOCH = datetime(1900, 1, 1) # reference date used to convert datetimes to a number of seconds 12 | 13 | 14 | class Preprocessor(object): 15 | 16 | def __init__(self, df=None, target=None): 17 | self.df = df.reset_index(drop=True) 18 | self.target = target 19 | self._categorical_features = [] 20 | self._numerical_features = [] 21 | self._text_features = [] 22 | 23 | def check(self): 24 | if self.df is None: 25 | raise ValueError('df is not specified.') 26 | if self.target is None: 27 | raise ValueError('target is not specified.') 28 | 29 | def get_processed_train_test(self): 30 | self._categorical_features = [x for x in self._get_categorical_features() if x != self.target] 31 | self._numerical_features = self._get_numerical_features() 32 | self._text_features = self._get_text_features() 33 | self._parse_data() 34 | raw_train, raw_test = self._get_train_test_set() 35 | imputed_train, imputed_test = self._impute(raw_train, raw_test) 36 | dummy_values_dict = self._select_dummy_values(imputed_train, self._categorical_features) 37 | final_train = self._dummy_encode(imputed_train, dummy_values_dict) 38 | final_test = self._dummy_encode(imputed_test, dummy_values_dict) 39 | return final_train, final_test 40 | 41 | def _parse_data(self): 42 | def _datetime_to_epoch(series): 43 | return (series - EPOCH) / np.timedelta64(1, 's') 44 | 45 | for feature in self._categorical_features: 46 | self.df[feature] = self.df[feature].apply(self._coerce_to_unicode) 47 | for feature in self._text_features: 48 | self.df[feature] = self.df[feature].apply(self._coerce_to_unicode) 49 | for feature in self._numerical_features: 50 | if self.df[feature].dtype == np.dtype('M8[ns]'): 51 | self.df[feature] = _datetime_to_epoch(self.df[feature]) 52 | else: 53 | self.df[feature] = self.df[feature].astype('double') 54 | 55 | def _get_numerical_features(self): 56 | return self.df.select_dtypes(include=['number', 'M8[ns]']).columns.tolist() 57 | 58 | def _get_categorical_features(self): 59 | return self.df.select_dtypes(include=['object', 'category']).columns.tolist() 60 | 61 | def _get_text_features(self): 62 | return [] 63 | 64 | def _coerce_to_unicode(self, x): 65 | if sys.version_info < (3, 0): 66 | if isinstance(x, str): 67 | return unicode(x, 'utf-8') 68 | else: 69 | return unicode(x) 70 | else: 71 | return str(x) 72 | 73 | def _select_dummy_values(self, dfx, 
features, LIMIT_DUMMIES=100): 74 | dummy_values = {} 75 | for feature in features: 76 | values = [ 77 | value 78 | for (value, _) in Counter(dfx[feature]).most_common(LIMIT_DUMMIES) 79 | ] 80 | dummy_values[feature] = values 81 | return dummy_values 82 | 83 | def _get_train_test_set(self, prop=0.8, seed=1234): 84 | k = int(self.df.shape[0] * prop) 85 | random.seed(seed) 86 | sampler = random.sample(self.df.index.tolist(), k) 87 | train = self.df.loc[sampler] 88 | test = self.df[~self.df.index.isin(sampler)] 89 | return train, test 90 | 91 | def _impute(self, df_train, df_test): 92 | for feature in self._numerical_features: 93 | v = df_train[feature].mean() 94 | df_train[feature] = df_train[feature].fillna(v) 95 | df_test[feature] = df_test[feature].fillna(v) 96 | logger.info('Imputed missing values in feature %s with value %s' % (feature, self._coerce_to_unicode(v))) 97 | 98 | for feature in self._categorical_features: 99 | v = 'NULL_CATEGORY' 100 | df_train[feature] = df_train[feature].fillna(v) 101 | df_test[feature] = df_test[feature].fillna(v) 102 | logger.info('Imputed missing values in feature %s with value %s' % (feature, self._coerce_to_unicode(v))) 103 | 104 | return df_train, df_test 105 | 106 | def _dummy_encode(self, dfx, dummy_values_dict): 107 | dfx_copy = dfx.copy() 108 | for (feature, dummy_values) in dummy_values_dict.items(): 109 | for dummy_value in dummy_values: 110 | #TODO add dummy:N/A and dummy:_Others_ 111 | dummy_name = u'dummy:%s:%s' % (feature, self._coerce_to_unicode(dummy_value)) 112 | dfx_copy[dummy_name] = (dfx_copy[feature] == dummy_value).astype(float) 113 | del dfx_copy[feature] 114 | logger.info('Dummy-encoded feature %s' % feature) 115 | 116 | return dfx_copy 117 | -------------------------------------------------------------------------------- /python-lib/dku_tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import datetime 3 | import json 4 | import dataiku 5 | from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role 6 | from dku_data_drift.model_drift_constants import ModelDriftConstants 7 | 8 | 9 | def process_timestamp(timestamp): 10 | """ 11 | Convert a millisecond epoch timestamp to a date string 12 | :param timestamp: epoch timestamp in milliseconds 13 | :return: the corresponding date, formatted as a string 14 | """ 15 | return str(datetime.datetime.fromtimestamp(timestamp / 1000)) 16 | 17 | 18 | def set_column_description(dataset, column_description_dict): 19 | dataset_schema = dataset.read_schema() 20 | for col_info in dataset_schema: 21 | col_name = col_info.get('name') 22 | col_info['comment'] = column_description_dict.get(col_name) 23 | dataset.write_schema(dataset_schema) 24 | 25 | 26 | def get_input_output(has_model_as_second_input=False): 27 | 28 | if len(get_input_names_for_role('new')) == 0: 29 | raise ValueError('No new dataset.') 30 | if len(get_output_names_for_role('output_dataset')) == 0: 31 | raise ValueError('No output dataset.') 32 | 33 | new_dataset_name = get_input_names_for_role('new')[0] 34 | new_dataset = dataiku.Dataset(new_dataset_name) 35 | 36 | output_dataset_name = get_output_names_for_role('output_dataset')[0] 37 | output_dataset = dataiku.Dataset(output_dataset_name) 38 | 39 | if has_model_as_second_input: 40 | if len(get_input_names_for_role('model')) == 0: 41 | raise ValueError('No input model.') 42 | model_name = get_input_names_for_role('model')[0] 43 | model = dataiku.Model(model_name) 44 | return (new_dataset, model, output_dataset) 45 | else: 46 | if len(get_input_names_for_role('original')) == 0: 47 | 
raise ValueError('No original dataset.') 48 | 49 | original_dataset_name = get_input_names_for_role('original')[0] 50 | original_dataset = dataiku.Dataset(original_dataset_name) 51 | return (new_dataset, original_dataset, output_dataset) 52 | 53 | 54 | def get_params_with_model(recipe_config, model): 55 | use_active_version = recipe_config.get('use_active_version') 56 | version_id = None  # guards against the case where no active version is found below 57 | if use_active_version: 58 | for version in model.list_versions(): 59 | active_version = version.get('active') is True 60 | if active_version: 61 | version_id = version.get('versionId') 62 | break 63 | else: 64 | version_id = recipe_config.get('version_id') 65 | if version_id is None: 66 | raise ValueError('Please choose a model version.') 67 | 68 | metric_list = recipe_config.get('metric_list') 69 | if metric_list is None or len(metric_list) == 0: 70 | raise ValueError('Please choose at least one metric.') 71 | return version_id, metric_list 72 | 73 | 74 | def get_params_without_model(recipe_config): 75 | metric_list = recipe_config.get('metric_list_without_prediction') 76 | if metric_list is None or len(metric_list) == 0: 77 | raise ValueError('Please choose at least one metric.') 78 | 79 | # Handle columns to remove 80 | columns_to_remove = recipe_config.get('columns_to_remove') 81 | return columns_to_remove, metric_list 82 | 83 | 84 | def build_drift_metric_dataframe(drifter, metric_list, based_df, has_model_as_input): 85 | 86 | new_df = based_df.copy() 87 | column_description_dict = {} 88 | 89 | if ModelDriftConstants.DRIFT_SCORE in metric_list: 90 | # new_df_with_drift_score, column_description_dict = extract_drift_score(drifter, new_df, column_description_dict) 91 | drift_score, drift_accuracy_lower, drift_accuracy_upper, drift_test_pvalue = drifter.get_drift_score(output_raw_score=True) 92 | new_df[ModelDriftConstants.DRIFT_SCORE] = [drift_score] 93 | column_description_dict[ModelDriftConstants.DRIFT_SCORE] = ModelDriftConstants.DRIFT_SCORE_DEFINITION 94 | 95 | new_df[ModelDriftConstants.BINOMIAL_P_VALUE] = [drift_test_pvalue] 96 | column_description_dict[ModelDriftConstants.BINOMIAL_P_VALUE] = ModelDriftConstants.BINOMIAL_TEST_DEFINITION 97 | 98 | new_df[ModelDriftConstants.BINOMIAL_LOWER_BOUND] = [drift_accuracy_lower] 99 | column_description_dict[ModelDriftConstants.BINOMIAL_LOWER_BOUND] = ModelDriftConstants.BINOMIAL_LOWER_BOUND_DEFINITION 100 | 101 | new_df[ModelDriftConstants.BINOMIAL_UPPER_BOUND] = [drift_accuracy_upper] 102 | column_description_dict[ModelDriftConstants.BINOMIAL_UPPER_BOUND] = ModelDriftConstants.BINOMIAL_UPPER_BOUND_DEFINITION 103 | 104 | 105 | if ModelDriftConstants.FUGACITY in metric_list: 106 | if drifter.get_prediction_type() == ModelDriftConstants.CLASSIFICATION_TYPE: 107 | fugacity, fugacity_relative_change = drifter.get_classification_fugacity() 108 | new_df[ModelDriftConstants.FUGACITY] = json.dumps(fugacity) 109 | new_df[ModelDriftConstants.FUGACITY_RELATIVE_CHANGE] = json.dumps(fugacity_relative_change) 110 | column_description_dict[ModelDriftConstants.FUGACITY] = ModelDriftConstants.FUGACITY_CLASSIF_DEFINITION 111 | column_description_dict[ModelDriftConstants.FUGACITY_RELATIVE_CHANGE] = ModelDriftConstants.FUGACITY_RELATIVE_CHANGE_CLASSIF_DEFINITION 112 | elif drifter.get_prediction_type() == ModelDriftConstants.REGRRSSION_TYPE: 113 | fugacity, fugacity_relative_change, bin_description = drifter.get_regression_fugacity() 114 | new_df[ModelDriftConstants.FUGACITY] = json.dumps(fugacity) 115 | new_df[ModelDriftConstants.FUGACITY_RELATIVE_CHANGE] = 
json.dumps(fugacity_relative_change) 116 | proper_bin_description = '\n'.join(['Decile {0}: {1}'.format(bin_index, bin_desc) for bin_index, bin_desc in enumerate(bin_description)]) 117 | column_description_dict[ModelDriftConstants.FUGACITY] = ModelDriftConstants.FUGACITY_REGRESSION_DEFINITION + proper_bin_description 118 | column_description_dict[ModelDriftConstants.FUGACITY_RELATIVE_CHANGE] = ModelDriftConstants.FUGACITY_RELATIVE_CHANGE_REGRESSION_DEFINITION + proper_bin_description 119 | else: 120 | raise ValueError('Unsupported prediction type: {0}'.format(drifter.get_prediction_type())) 121 | 122 | if ModelDriftConstants.FEATURE_IMPORTANCE in metric_list: 123 | 124 | drift_feature_importance = drifter.get_drift_feature_importance() 125 | feat_dict = {} 126 | for feat, feat_info in drift_feature_importance[:ModelDriftConstants.NUMBER_OF_DRIFTED_FEATURES].iterrows(): 127 | feat_dict[feat] = round(feat_info.get(ModelDriftConstants.IMPORTANCE), 2) 128 | new_df[ModelDriftConstants.MOST_DRIFTED_FEATURES] = [json.dumps(feat_dict)] 129 | column_description_dict[ModelDriftConstants.MOST_DRIFTED_FEATURES] = ModelDriftConstants.MOST_DRIFTED_FEATURES_DEFINITION 130 | 131 | if has_model_as_input: 132 | original_feature_importance = drifter.get_original_feature_importance() 133 | feat_dict = {} 134 | for feat, feat_info in original_feature_importance[:ModelDriftConstants.NUMBER_OF_DRIFTED_FEATURES].iterrows(): 135 | feat_dict[feat] = round(feat_info.get(ModelDriftConstants.IMPORTANCE), 2) 136 | new_df[ModelDriftConstants.MOST_IMPORTANT_FEATURES] = [json.dumps(feat_dict)] 137 | column_description_dict[ModelDriftConstants.MOST_IMPORTANT_FEATURES] = ModelDriftConstants.MOST_IMPORTANT_FEATURES_DEFINTIION 138 | 139 | if ModelDriftConstants.RISKIEST_FEATURES in metric_list: 140 | riskiest_feature = drifter.get_riskiest_features() 141 | new_df[ModelDriftConstants.RISKIEST_FEATURES] = json.dumps(riskiest_feature) 142 | column_description_dict[ModelDriftConstants.RISKIEST_FEATURES] = ModelDriftConstants.RISKIEST_FEATURES_DEFINITION 143 | 144 | return new_df, column_description_dict -------------------------------------------------------------------------------- /python-lib/model_metadata.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import sys 4 | import json 5 | import dataiku 6 | from dataiku.doctor.posttraining.model_information_handler import PredictionModelInformationHandler 7 | from dku_tools import process_timestamp 8 | 9 | 10 | def get_train_date(model_version, version_id): 11 | m = dataiku.Model(model_version, ignore_flow=True) 12 | for v in m.list_versions(): 13 | if v.get('versionId') == version_id: 14 | return process_timestamp(v.get('snippet').get('trainDate')) 15 | return None 16 | 17 | 18 | def get_model_handler(model, version_id=None): 19 | saved_model_version_id = _get_saved_model_version_id(model, version_id) 20 | return _get_model_info_handler(saved_model_version_id) 21 | 22 | 23 | def _get_model_info_handler(saved_model_version_id): 24 | infos = saved_model_version_id.split("-") 25 | if len(infos) != 4 or infos[0] != "S": 26 | raise Exception("Invalid saved model id") 27 | pkey = infos[1] 28 | model_id = infos[2] 29 | version_id = infos[3] 30 | 31 | datadir_path = os.environ['DIP_HOME'] 32 | version_folder = os.path.join(datadir_path, "saved_models", pkey, model_id, "versions", version_id) 33 | 34 | # Loading and resolving paths in split_desc 35 | split_folder = os.path.join(version_folder, "split")
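    # --- Editorial note, inferred from the code in this function and to be read as an
    # assumption: the saved-model folder is expected to look like
    #     <DIP_HOME>/saved_models/<projectKey>/<modelId>/versions/<versionId>/
    # holding core_params.json plus a split/ folder whose split.json describes the train/test split.
36 | 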
with open(os.path.join(split_folder, "split.json")) as split_file: 37 | split_desc = json.load(split_file) 38 | 39 | path_field_names = ["trainPath", "testPath", "fullPath"] 40 | for field_name in path_field_names: 41 | if split_desc.get(field_name, None) is not None: 42 | split_desc[field_name] = os.path.join(split_folder, split_desc[field_name]) 43 | 44 | with open(os.path.join(version_folder, "core_params.json")) as core_params_file: 45 | core_params = json.load(core_params_file) 46 | 47 | try: 48 | return PredictionModelInformationHandler(split_desc, core_params, version_folder, version_folder) 49 | except Exception as e: 50 | from future.utils import raise_ 51 | if "ordinal not in range(128)" in str(e): 52 | raise_(Exception, "The plugin is using a python3 code-env, cannot load a python2 model.", sys.exc_info()[2]) 53 | elif str(e) == "non-string names in Numpy dtype unpickling": 54 | raise_(Exception, "The plugin is using a python2 code-env, cannot load a python3 model.", sys.exc_info()[2]) 55 | else: 56 | raise_(Exception, "Failed to load saved model: {}".format(e), sys.exc_info()[2]) 57 | 58 | 59 | def _get_saved_model_version_id(model, version_id=None): 60 | model_def = model.get_definition() 61 | if version_id is None: 62 | version_id = model_def.get('activeVersion') 63 | saved_model_version_id = 'S-{0}-{1}-{2}'.format(model_def.get('projectKey'), model_def.get('id'), version_id) 64 | return saved_model_version_id 65 | -------------------------------------------------------------------------------- /python-probes/drift-score/probe.json: -------------------------------------------------------------------------------- 1 | { 2 | "meta" : { 3 | "label": "Most recent drift score", 4 | "description": "", 5 | "icon": "icon-anchor" 6 | }, 7 | "handlesDataset": true, 8 | "handlesManagedFolder": false, 9 | "params": [] 10 | } 11 | -------------------------------------------------------------------------------- /python-probes/drift-score/probe.py: -------------------------------------------------------------------------------- 1 | from dku_data_drift.model_drift_constants import ModelDriftConstants 2 | import numpy as np 3 | 4 | def process(dataset, partition_id): 5 | df = dataset.get_dataframe() 6 | if len(df) == 0: 7 | return 'No data' 8 | if ModelDriftConstants.DRIFT_SCORE in df and ModelDriftConstants.TIMESTAMP in df: 9 | most_recent_drift_score = df[df[ModelDriftConstants.TIMESTAMP] == np.max(df[ModelDriftConstants.TIMESTAMP])][ModelDriftConstants.DRIFT_SCORE].values[0] 10 | metric_values = {ModelDriftConstants.DRIFT_SCORE: most_recent_drift_score} 11 | return metric_values 12 | else: 13 | return 'No drift score' 14 | -------------------------------------------------------------------------------- /resource/compute_model_id_choice.py: -------------------------------------------------------------------------------- 1 | """ 2 | Allow dynamic selection of the model version in the model recipe. 3 | """ 4 | import dataiku 5 | from dku_tools import process_timestamp 6 | 7 | 8 | def do(payload, config, plugin_config, inputs): 9 | """ 10 | DSS built-in interface for param loading in the form. 11 | Retrieve the available versions of a pretrained model in DSS.
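    Illustrative shape of the returned payload (editorial example, all values made up):
        {"choices": [{"value": "v2", "label": "active version, trained on 2021-05-02 10:13:00, random forest"}]}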
12 | :param payload: request payload sent by the DSS form 13 | :param config: current recipe configuration 14 | :param plugin_config: plugin-level configuration 15 | :param inputs: list of the recipe's input roles 16 | :return: dict with a "choices" list for the form dropdown 17 | """ 18 | model = None 19 | for input_ in inputs: 20 | if input_['role'] == 'model': 21 | model = str(input_['fullName']) 22 | if model is None: 23 | raise Exception("Could not find an input with the 'model' role.") 24 | 25 | 26 | model_id = model.split('.')[-1] 27 | model = dataiku.Model(model_id) 28 | 29 | if model.get_info().get('type') != 'PREDICTION': 30 | raise ValueError('Model type {} is not supported. Please choose a regression or classification model.'.format(model.get_info().get('type'))) 31 | 32 | 33 | choice_list = [] 34 | for version in model.list_versions(): 35 | version_detail = version.get('snippet', {}) 36 | algorithm = version_detail.get('algorithm', '').lower().replace('_', ' ') 37 | active_version = version.get('active') is True 38 | train_date = process_timestamp(version_detail.get('trainDate')) 39 | version_id = version.get('versionId') 40 | 41 | if active_version: 42 | version_info = { 43 | 'value': version_id, 44 | 'label': 'active version, trained on {1}, {0}'.format(algorithm, train_date) 45 | } 46 | else: 47 | version_info = { 48 | 'value': version_id, 49 | 'label': 'trained on {1}, {0}'.format(algorithm, train_date) 50 | } 51 | choice_list.append((version_info, train_date)) 52 | 53 | sorted_choice_list = sorted(choice_list, key=lambda k: k[1]) 54 | final_choice_list = [choice[0] for choice in sorted_choice_list] 55 | 56 | return {"choices": final_choice_list} -------------------------------------------------------------------------------- /resource/dku-helpers.js: -------------------------------------------------------------------------------- 1 | /* 2 | Helper function to query webapp backend with a default implementation for error handling 3 | Assumes a dataiku object is defined 4 | v 1.5.0 5 | */ 6 | 7 | dataiku.webappBackend = (function() { 8 | function getUrl(path) { 9 | return dataiku.getWebAppBackendUrl(path); 10 | } 11 | 12 | // function dkuDisplayError(error) { 13 | // alert('Backend error, check the logs.'); 14 | // } 15 | 16 | function get(path, args={}, displayErrors=true) { 17 | return fetch(getUrl(path) + '?' + $.param(args), { 18 | method: 'GET', 19 | headers: { 20 | 'Accept': 'application/json', 21 | 'Content-Type': 'application/json' 22 | } 23 | }) 24 | .then(response => { 25 | if (response.status == 502) { 26 | throw Error("Webapp backend not started"); 27 | } else if (!response.ok) { 28 | response.text().then(text => dataiku.webappMessages.displayFatalError(`${response.statusText} (HTTP ${response.status}):\n${text}`)) 29 | throw Error(`${response.statusText} (HTTP ${response.status})`); 30 | } 31 | try { 32 | return response.json(); 33 | } catch { 34 | throw Error('The backend response is not JSON: '+ response.text()); 35 | } 36 | }) 37 | .catch(function(error) { 38 | if (displayErrors && error.message && !error.message.includes('not started')) { // little hack, backend not started should be handled elsewhere 39 | dataiku.webappMessages.displayFatalError(error) 40 | } 41 | throw error; 42 | }); 43 | } 44 | 45 | return Object.freeze({getUrl, get}); 46 | })(); 47 | 48 | 49 | dataiku.webappMessages = (function() { 50 | function displayFatalError(err) { 51 | const errElt = $('<div class="error-message"></div>
') 52 | errElt.text(err); 53 | $('#error_message').html(errElt); 54 | } 55 | function clear() { 56 | $('#error_message').html(''); 57 | } 58 | return Object.freeze({displayFatalError, clear}); 59 | })(); 60 | -------------------------------------------------------------------------------- /resource/style.css: -------------------------------------------------------------------------------- 1 | /* 2 | DSS webapp base stylesheet v2.0.0 3 | Apache Software License 4 | Dataiku (Joachim Zentici) 5 | 6 | This stylesheet should allow you to simply style a webapp while keeping a good consistency with DSS 7 | For questions and requests, https://github.com/dataiku/dataiku-contrib/issues 8 | 9 | */ 10 | :root { 11 | /* DSS-like colors, prefer using them for better visual integration with core product */ 12 | --blue-lighten-5: #E7F3FF; 13 | --blue-lighten-4: #C4E0FE; 14 | --blue-lighten-3: #9DCCFE; 15 | --blue-lighten-2: #76B8FD; 16 | --blue-lighten-1: #58A8FC; 17 | --blue: #3B99FC; 18 | --blue-darken-1: #3591FC; 19 | --blue-darken-2: #2D86FB; 20 | --blue-darken-3: #267CFB; 21 | --blue-darken-4: #196BFA; 22 | 23 | --success-green: #4caf50; 24 | --warning-orange: #F28C37; 25 | --error-color: #CE1228; 26 | --error-background: #f9e3e5; 27 | 28 | --grey-lighten-7: #F2F2F2; 29 | --grey-lighten-6: #DDDDDD; 30 | --grey-lighten-5: #CCCCCC; 31 | --grey-lighten-4: #BBBBBB; 32 | --grey-lighten-3: #666666; 33 | --grey-lighten-2: #444444; 34 | --grey-lighten-1: #333333; 35 | --grey: #222222; 36 | 37 | --grey-text: var(--grey-lighten-1); 38 | --border-color: var(--grey-lighten-7); 39 | 40 | /* Backgrounds */ 41 | --grey-background: var(--grey-lighten-7); 42 | } 43 | 44 | @font-face { 45 | font-family: SourceSansPro; 46 | src: url(/static/dataiku/fonts/SourceSansPro-Bold.woff); 47 | font-weight: 600; 48 | } 49 | @font-face { 50 | font-family: SourceSansPro; 51 | src: url(/static/dataiku/fonts/SourceSansPro-Semibold.woff); 52 | font-weight: 500; 53 | } 54 | @font-face { 55 | font-family: SourceSansPro; 56 | src: url(/static/dataiku/fonts/SourceSansPro-Regular.woff); 57 | font-weight: 400; 58 | } 59 | 60 | body { 61 | font-family: 'SourceSansPro'; 62 | font-size: 13px; 63 | color: #333333; 64 | } 65 | a { 66 | color: #0088cc; 67 | text-decoration: none; 68 | } 69 | 70 | h1 { 71 | font-size: 32px; 72 | font-weight: 500; 73 | margin-top: 0; 74 | margin-bottom: 0; 75 | } 76 | 77 | h2 { 78 | font-size: 24px; 79 | font-weight: 400; 80 | margin-bottom: 0; 81 | } 82 | 83 | h3 { 84 | font-size: 18px; 85 | font-weight: 400; 86 | margin-bottom: 0; 87 | } 88 | 89 | h4 { 90 | font-size: 16px; 91 | font-weight: 400; 92 | margin-bottom: 0; 93 | } 94 | 95 | /* Buttons */ 96 | 97 | .btn { 98 | font-family: 'SourceSansPro'; 99 | text-transform: uppercase; 100 | font-size: 13px; 101 | font-weight: 500; 102 | padding: 3px 8px; 103 | margin: 0; 104 | line-height: 1.4; 105 | background-image: inherit; 106 | box-shadow: none; 107 | text-shadow: none; 108 | box-sizing: border-box; 109 | outline: 0; 110 | cursor: pointer; 111 | background-color: #ffffff; 112 | color: var(--grey-text); 113 | border: 1px solid #cccccc; 114 | } 115 | .btn:hover { 116 | background-color: #dddddd; 117 | } 118 | .btn:active { 119 | background-color: #cccccc; 120 | } 121 | 122 | .btn-primary { 123 | background: #28a9dd; 124 | color: #ffffff; 125 | border: 1px solid transparent; 126 | } 127 | .btn-primary:hover { 128 | background: #31adde; 129 | } 130 | .btn-primary:active { 131 | background: #22a4d9; 132 | } 133 | 134 | /* Layout */ 135 | 136 | .white-box { 137 | 
background: #ffffff; 138 | box-shadow: 0px 0px 2px 1px rgba(34, 34, 34, 0.15); 139 | box-sizing: border-box; 140 | padding: 24px; 141 | margin-bottom: 24px; 142 | width: 1000px; 143 | } 144 | 145 | /* tables */ 146 | 147 | table.ml-table { 148 | border-collapse: collapse; 149 | border-spacing: 0; 150 | } 151 | table.ml-table th { 152 | font-family: 'SourceSansPro'; 153 | font-weight: 400; 154 | font-size: 16px; 155 | } 156 | table.ml-table th, table.ml-table td { 157 | border: 1px solid #dddddd; 158 | padding: 8px; 159 | } 160 | 161 | 162 | /* Standard components */ 163 | 164 | .explanation { /* try to keep in sync with DSS .doctor-explanation */ 165 | background: #e6eef2; 166 | padding: 20px; 167 | border-radius: 5px; 168 | color: #31708f; 169 | } 170 | .error-message, #error_message { /*remove id*/ 171 | color: var(--error-color); 172 | background-color: var(--error-background); 173 | padding: 8px 16px 8px 16px; 174 | font-size: 15px; 175 | } -------------------------------------------------------------------------------- /tests/python/requirements.txt: -------------------------------------------------------------------------------- 1 | flask==1.1.2 2 | scikit-learn==0.20.2 3 | scipy==1.1.0 4 | xgboost==0.81 5 | pandas==0.23.4 6 | numpy==1.16.6 7 | future==0.18.2 8 | joblib==0.14.1 9 | enum34==1.1.10 10 | statsmodels==0.9.0 -------------------------------------------------------------------------------- /tests/python/unit/test_drift_analyzer.py: -------------------------------------------------------------------------------- 1 | # This is a test file intended to be used with pytest 2 | # pytest automatically runs all the functions starting with "test_" 3 | # see https://docs.pytest.org for more information 4 | 5 | import sys 6 | import os 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.datasets import load_iris 11 | from sklearn.model_selection import train_test_split 12 | import pytest 13 | from dku_data_drift import DriftAnalyzer, ModelAccessor 14 | 15 | RANDOM_SEED = 65537 # Fermat prime number <3 16 | TEST_RATIO = 0.3 # if this ratio changes, the reference prediction results below need to be updated accordingly 17 | 18 | def load_data(): 19 | iris = load_iris() 20 | feature_names = iris['feature_names'] 21 | target = 'target' 22 | df = pd.DataFrame(data=np.c_[iris['data'], iris['target']], 23 | columns=feature_names + [target]) 24 | return df, feature_names, target 25 | 26 | 27 | class ScikitPredictor: 28 | 29 | def __init__(self, df, feature_names, target): 30 | self.feature_names = feature_names 31 | self._clf = RandomForestClassifier(n_estimators=10, random_state=RANDOM_SEED).fit(df[feature_names], df[target]) 32 | 33 | def get_features(self): 34 | return self.feature_names 35 | 36 | def predict(self, X): 37 | predictions = self._clf.predict(X[self.feature_names]) 38 | probas = self._clf.predict_proba(X[self.feature_names]) 39 | df = pd.DataFrame(probas, columns=['proba_{}'.format(x) for x in range(probas.shape[1])]) 40 | df['prediction'] = predictions 41 | return df 42 | 43 | 44 | class ScikitModelHandler: 45 | 46 | def __init__(self): 47 | self.df, self.feature_names, self.target = load_data() 48 | self.train_df, self.test_df = train_test_split(self.df, test_size=TEST_RATIO, random_state=RANDOM_SEED) 49 | self.predictor = ScikitPredictor(self.train_df, self.feature_names, self.target) 50 | 51 | def get_prediction_type(self): 52 | return 'MULTICLASS' 53 | 54 | def get_predictor(self): 55 | return self.predictor 56 | 
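    # --- Editorial sketch: this class is a minimal stand-in for the DSS model-handler
    # interface consumed by DriftAnalyzer; under that assumption, typical usage is
    #     handler = ScikitModelHandler()
    #     preds = handler.get_predictor().predict(handler.get_test_df()[0])
    #     # preds columns: proba_0, proba_1, proba_2, prediction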
57 | def get_target_variable(self): 58 | return self.target 59 | 60 | def get_test_df(self): 61 | return [self.test_df, True] 62 | 63 | def get_per_feature(self): 64 | per_feature_dict = { 65 | self.target: {'role': 'TARGET'} 66 | } 67 | for feature in self.feature_names: 68 | dct = { 69 | 'role': 'INPUT', 70 | 'type': 'NUMERIC', 71 | 'missing_handling': 'IMPUTE', 72 | 'missing_impute_with': 'MEAN', 73 | 'numerical_handling': 'REGULAR', 74 | 'rescaling': 'AVGSTD', 75 | } 76 | per_feature_dict[feature] = dct 77 | 78 | return per_feature_dict 79 | 80 | def get_selected_features(self): 81 | selected_features = [] 82 | for feat, feat_info in self.get_per_feature().items(): 83 | if feat_info.get('role') == 'INPUT': 84 | selected_features.append(feat) 85 | return selected_features 86 | 87 | 88 | class TestDriftAnalyzer: 89 | 90 | def setup(self): 91 | # use the scikit-based mock handler defined above 92 | self.model_handler = ScikitModelHandler() 93 | self.model_accessor = ModelAccessor(self.model_handler) 94 | self.drifter = DriftAnalyzer() 95 | 96 | def test_empty_set(self): 97 | _, feature_names, _ = load_data() 98 | new_test_df = pd.DataFrame(columns=feature_names) 99 | with pytest.raises(Exception) as e_info: 100 | self.drifter.fit(new_test_df, model_accessor=self.model_accessor) 101 | 102 | def test_missing_feature_set(self): 103 | df, feature_names, _ = load_data() 104 | _, new_test_df = train_test_split(df, test_size=TEST_RATIO, random_state=RANDOM_SEED) 105 | new_test_df = new_test_df.drop(feature_names[0], axis=1) 106 | 107 | with pytest.raises(Exception) as e_info: 108 | self.drifter.fit(new_test_df, model_accessor=self.model_accessor) 109 | 110 | def test_identical_set(self): 111 | df, _, _ = load_data() 112 | _, new_test_df = train_test_split(df, test_size=TEST_RATIO, random_state=RANDOM_SEED) 113 | self.drifter.fit(new_test_df, model_accessor=self.model_accessor) 114 | result_dict = self.drifter.get_drift_metrics_for_webapp() 115 | 116 | drift_accuracy = result_dict.get('drift_accuracy') 117 | fugacity = result_dict.get('fugacity') 118 | feature_importance = result_dict.get('feature_importance') 119 | 120 | original_model_feature_importance = sorted([feat_imp['original_model'] for feat_imp in feature_importance]) 121 | drift_model_feature_importance = sorted([feat_imp['drift_model'] for feat_imp in feature_importance]) 122 | 123 | assert drift_accuracy == 0.5 # no drift, the drift model cannot distinguish the datasets, accuracy is 0.5 124 | for fugacity_one_class in fugacity: 125 | assert fugacity_one_class.get('Selected dataset') == fugacity_one_class.get('Original dataset') 126 | 127 | assert np.array_equal(original_model_feature_importance, [0.01, 0.01, 43.17215785326303, 46.77454270154651]) 128 | assert np.array_equal(drift_model_feature_importance, 129 | [0.01, 25.14448373884474, 26.616157925410526, 27.984711759761264]) 130 | 131 | def test_drifted_set(self): 132 | df, feature_names, _ = load_data() 133 | _, original_test_df = train_test_split(df, test_size=TEST_RATIO, random_state=RANDOM_SEED) 134 | new_test_df = original_test_df.copy() 135 | new_test_df[feature_names] = new_test_df[feature_names] * 2 # shift the feature distribution 136 | 137 | self.drifter.fit(new_test_df, model_accessor=self.model_accessor) 138 | result_dict = self.drifter.get_drift_metrics_for_webapp() 139 | 140 | drift_accuracy = result_dict.get('drift_accuracy') 141 | fugacity = result_dict.get('fugacity') 142 | 143 | prediction_distribution_original_test_set = [fuga['Input dataset'] for fuga in fugacity] 144 | 
prediction_distribution_new_test_set = [fuga['Test dataset'] for fuga in fugacity] 145 | 146 | assert drift_accuracy == 1 147 | assert np.array_equal(prediction_distribution_original_test_set, [2.22, 75.56, 22.22]) 148 | assert np.array_equal(prediction_distribution_new_test_set, [40.0, 35.56, 24.44]) 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /webapps/model-drift-view/app.js: -------------------------------------------------------------------------------- 1 | let webAppConfig = dataiku.getWebAppConfig(); 2 | let modelId = webAppConfig['modelId']; 3 | let versionId = webAppConfig['versionId']; 4 | 5 | dataiku.webappBackend.get('list-datasets') 6 | .then(data => { 7 | $.each(data.dataset_list, function(i, option) { 8 | $('#dataset-selector').append($('<option></option>').attr("value", option.name).text(option.name)); 9 | }); 10 | } 11 | ); 12 | 13 | $('#run-button').click(function() { 14 | dataiku.webappMessages.clear(); 15 | $('.landing-page').hide(); 16 | runAnalysis($('#run-button')); 17 | }); 18 | 19 | function changeInputColor(input, value){ 20 | $(input).removeClass(); 21 | if (value < 0.1){ 22 | $(input).addClass('low-risk'); 23 | $('#inline-drift-score-explain').html('low data drift.'); 24 | } 25 | else if(value >= 0.1 && value <= 0.5){ 26 | $(input).addClass('medium-risk'); 27 | $('#inline-drift-score-explain').html('medium data drift.'); 28 | } 29 | else{ 30 | $(input).addClass('high-risk'); 31 | $('#inline-drift-score-explain').html('high data drift.'); 32 | } 33 | } 34 | 35 | function runAnalysis($this) { 36 | markRunning(true); 37 | dataiku.webappBackend.get('get-drift-metrics', {'model_id': modelId, 'version_id': versionId, 'test_set': $("#dataset-selector").val()}) 38 | .then( 39 | function(data) { 40 | // first box 41 | $('#accuracy').text(data['drift_accuracy']); 42 | $('#lower-bound').text(data['drift_accuracy_lower']); 43 | $('#upper-bound').text(data['drift_accuracy_upper']); 44 | $('#inline-drift-score').text(data['drift_accuracy']); 45 | $('#inline-drift-score-2').text(data['drift_accuracy']); 46 | $('#binomial-p-value').text(data['drift_test_pvalue']); 47 | if (data['drift_test_pvalue'] <= 0.05){ 48 | $('#binomial-conclusion').text('≤ 0.05 so drift detected'); 49 | } else { 50 | $('#binomial-conclusion').text('> 0.05 so no drift detected'); 51 | } 52 | $('#sample-size').text(data['sample_size']); 53 | 54 | changeInputColor('#drift-score', data['drift_accuracy']); 55 | $('#error_message').html(''); 56 | 57 | //other boxes 58 | draw(data); 59 | $('.result-state').show(); 60 | markRunning(false); 61 | } 62 | ) 63 | .catch(error => { 64 | markRunning(false); 65 | dataiku.webappMessages.displayFatalError(error); 66 | }); 67 | } 68 | 69 | function markRunning(running) { 70 | if (running) { 71 | $('.running-state').show(); 72 | $('.notrunning-state').hide(); 73 | $('.result-state').hide(); 74 | } else { 75 | $('.running-state').hide(); 76 | $('.notrunning-state').show(); 77 | } 78 | } 79 | 80 | function draw(data) { 81 | document.getElementById("riskiest_features_explanation").innerHTML = ''; 82 | switch(data.type){ 83 | case "CLASSIFICATION": 84 | drawFugacity(data['fugacity']); 85 | draw_KDE_classification(data['kde']); 86 | break; 87 | case "REGRESSION": 88 | d3.select("#fugacity_div").selectAll("div").remove(); 89 | d3.select("#fugacity_label").remove(); 90 | d3.select("#kde_class_option").select("#label-list").remove(); 91 | draw_KDE_regression(data['kde']); 92 | break; 
93 | default: 94 | console.log("Unexpected learning task type:"); 95 | console.log(data.type); 96 | } 97 | drawFeatureImportance(data['feature_importance']); 98 | let recommendation_text = ""; 99 | if (data.riskiest_features.length > 0){ 100 | var i; 101 | recommendation_text = "We recommend checking the following feature(s): "; 102 | for (i = 0; i < data.riskiest_features.length; i++) { 103 | recommendation_text += data.riskiest_features[i]; 104 | if (i < (data.riskiest_features.length - 1)){ 105 | recommendation_text += ", "; 106 | } 107 | } 108 | } 109 | document.getElementById("riskiest_features_explanation").innerHTML = recommendation_text; 110 | 111 | if (data.drift_accuracy >= 0.1){ 112 | d3.select("#feature_importance_div").style('display', 'block'); 113 | } else { 114 | d3.select("#feature_importance_div").style('display', 'none'); 115 | } 116 | } 117 | 118 | function drawFugacity(data) { 119 | $('#fugacity-score').html(json2table(data, 'table text-sb table-bordered table-hover')); // ml-table 120 | } 121 | 122 | function json2table(json, classes) { 123 | let cols = Object.keys(json[0]); 124 | let header = ''; 125 | let body = ''; 126 | classes = classes || ''; 127 | 128 | function capitalizeFirstLetter(string) { 129 | return string.charAt(0).toUpperCase() + string.slice(1); 130 | } 131 | 132 | body += '