├── .gitignore
├── Makefile
├── README.md
├── code-env
    └── python
    │   ├── desc.json
    │   └── spec
    │       └── requirements.txt
├── custom-recipes
    ├── compare-dataset-with-model
    │   ├── recipe.json
    │   └── recipe.py
    └── compare-datasets
    │   ├── recipe.json
    │   └── recipe.py
├── plugin.json
├── python-lib
    ├── dku_data_drift
    │   ├── __init__.py
    │   ├── dataframe_helpers.py
    │   ├── dataset_helpers.py
    │   ├── drift_analyzer.py
    │   ├── model_accessor.py
    │   ├── model_drift_constants.py
    │   ├── model_tools.py
    │   └── preprocessing.py
    ├── dku_tools.py
    └── model_metadata.py
├── python-probes
    └── drift-score
    │   ├── probe.json
    │   └── probe.py
├── resource
    ├── bootstrap.min.css
    ├── compute_model_id_choice.py
    ├── d3.v4.min.js
    ├── dku-helpers.js
    └── style.css
├── tests
    └── python
    │   ├── requirements.txt
    │   └── unit
    │       └── test_drift_analyzer.py
└── webapps
    └── model-drift-view
        ├── app.js
        ├── backend.py
        ├── body.html
        ├── style.css
        └── webapp.json


/.gitignore:
--------------------------------------------------------------------------------
  1 | unit.xml
  2 | state.json
  3 | *.pyc
  4 | .DS_Store
  5 | .idea/
  6 | 
  7 | # C extensions
  8 | *.so
  9 | 
 10 | # Distribution / packaging
 11 | .Python
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .coverage
 43 | .coverage.*
 44 | .cache
 45 | nosetests.xml
 46 | coverage.xml
 47 | *.cover
 48 | .hypothesis/
 49 | .pytest_cache/
 50 | 
 51 | # Translations
 52 | *.mo
 53 | *.pot
 54 | 
 55 | # Django stuff:
 56 | *.log
 57 | local_settings.py
 58 | db.sqlite3
 59 | 
 60 | # Flask stuff:
 61 | instance/
 62 | .webassets-cache
 63 | 
 64 | # Scrapy stuff:
 65 | .scrapy
 66 | 
 67 | # Sphinx documentation
 68 | docs/_build/
 69 | 
 70 | # PyBuilder
 71 | target/
 72 | 
 73 | # Jupyter Notebook
 74 | .ipynb_checkpoints
 75 | 
 76 | # pyenv
 77 | .python-version
 78 | 
 79 | # celery beat schedule file
 80 | celerybeat-schedule
 81 | 
 82 | # SageMath parsed files
 83 | *.sage.py
 84 | 
 85 | # Environments
 86 | .env
 87 | .venv
 88 | env/
 89 | venv/
 90 | ENV/
 91 | env.bak/
 92 | venv.bak/
 93 | 
 94 | # Spyder project settings
 95 | .spyderproject
 96 | .spyproject
 97 | 
 98 | # Rope project settings
 99 | .ropeproject
100 | 
101 | # mkdocs documentation
102 | /site
103 | 
104 | # mypy
105 | .mypy_cache/
106 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile variables set automatically
 2 | plugin_id=`cat plugin.json | python -c "import sys, json; print(str(json.load(sys.stdin)['id']).replace('/',''))"`
 3 | plugin_version=`cat plugin.json | python -c "import sys, json; print(str(json.load(sys.stdin)['version']).replace('/',''))"`
 4 | archive_file_name="dss-plugin-${plugin_id}-${plugin_version}.zip"
 5 | remote_url=`git config --get remote.origin.url`
 6 | last_commit_id=`git rev-parse HEAD`
 7 | 
 8 | 
 9 | plugin:
10 | 	@echo "[START] Archiving plugin to dist/ folder..."
11 | 	@cat plugin.json | json_pp > /dev/null
12 | 	@rm -rf dist
13 | 	@mkdir dist
14 | 	@echo "{\"remote_url\":\"${remote_url}\",\"last_commit_id\":\"${last_commit_id}\"}" > release_info.json
15 | 	@git archive -v -9 --format zip -o dist/${archive_file_name} HEAD
16 | 	@zip -u dist/${archive_file_name} release_info.json
17 | 	@rm release_info.json
18 | 	@echo "[SUCCESS] Archiving plugin to dist/ folder: Done!"
19 | 
20 | unit-tests:
21 | 	@echo "[START] Running unit tests..."
22 | 	@( \
23 | 		PYTHON_VERSION=`python3 -V 2>&1 | sed 's/[^0-9]*//g' | cut -c 1,2`; \
24 | 		PYTHON_VERSION_IS_CORRECT=`cat code-env/python/desc.json | python3 -c "import sys, json; print(str($$PYTHON_VERSION) in [x[-2:] for x in json.load(sys.stdin)['acceptedPythonInterpreters']]);"`; \
25 | 		if [ ! $$PYTHON_VERSION_IS_CORRECT ]; then echo "Python version $$PYTHON_VERSION is not in acceptedPythonInterpreters"; exit 1; fi; \
26 | 	)
27 | 	@( \
28 | 		python3 -m venv env/; \
29 | 		source env/bin/activate; \
30 | 		pip3 install --upgrade pip; \
31 | 		pip install --no-cache-dir -r tests/python/requirements.txt; \
32 | 		pip install --no-cache-dir -r code-env/python/spec/requirements.txt; \
33 | 		export PYTHONPATH="$(PYTHONPATH):$(PWD)/python-lib"; \
34 |                 pytest -o junit_family=xunit2 --junitxml=unit.xml tests/python/unit || true; \
35 | 		deactivate; \
36 | 	)
37 | 	@echo "[SUCCESS] Running unit tests: Done!"
38 | 
39 | integration-tests:
40 | 	@echo "[START] Running integration tests..."
41 | 	# TODO add integration tests
42 | 	@echo "[SUCCESS] Running integration tests: Done!"
43 | 
44 | tests: unit-tests integration-tests
45 | 
46 | dist-clean:
47 | 	rm -rf dist


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ⚠️ Starting with DSS version 10.0.0, this plugin is considered as "Deprecated" and will be maintained only to fix critical issues. We recommend using the native feature [Model Evaluation Store](https://doc.dataiku.com/dss/latest/python-api/model-evaluation-stores.html).
 2 | 
 3 | 
 4 | # Model drift monitoring
 5 | 
 6 | Monitoring ML models in production is often a tedious task. You can apply a simply retraining strategy based on monitoring the model’s performance: if your AUC drops by a given percentage, retrain. Although accurate, this approach requires to obtain the ground truth for your preditctions, which is not always fast, and certainly not “real time”.
 7 | 
 8 | Instead of waiting for the ground truth, we propose to look at the recent data the model has had to score, and statistically compare it with the data on which the model was evaluated. If these datasets are too different, the model may need to be retrained.
 9 | 
10 | 
11 | ## Scope of the plugin
12 | This plugin offers a set of different DSS components to monitor input data drift (of a model):
13 | * Model view: visualise the drift metrics and graph
14 | * Recipe: compute feature drift of a deployed model
15 | * Recipe: compute drift between two datasets
16 | * Custom metric: retrieve the most recent drift metric
17 | 
18 | 
19 | ## Installation and requirements
20 | 
21 | Please see our [official plugin page](https://www.dataiku.com/product/plugins/model-drift-monitoring/) for installation.
22 | 
23 | ## Changelog
24 | 
25 | ****Version 3.1.5 (2022-07)**
26 | * Misc:
27 |   * Load js package locally to support offline DSS instances.
28 | 
29 | ****Version 3.1.4 (2022-03)**
30 | * Misc:
31 |   * Add cloudpickle to code-env requirements.
32 |   * Update existing packages version.
33 | 
34 | 
35 | ****Version 3.1.3 (2021-12)**
36 | * Enhancement: 
37 |   * Use feature importance from Tree-based regression models.
38 |   * Use surrogate model for CalibratedClassifierCV. 
39 | 
40 | **Version 3.0.0 (2020-12)**
41 | * Enhancement:
42 |   * Add binomial test to check the reliability of drift score.
43 |   * Improve model view's UI.
44 | 
45 | **Version 2.0.0 (2020-06)**
46 | * New components: 
47 |    * Recipe: Compute feature drift of a deployed model
48 |    * Recipe: Compute drift between two datasets
49 |    * Custom metric: Retrieve last drift metric
50 | * Enhancement: 
51 |    * Add support for regression algorithms and non tree-based algorithm
52 |    * Add riskiest features information allowing users to immediately have the list of features that they need to be careful about (ie. features that are drifted the most and are important in the deloyed model)
53 |    * Add support for partitioning
54 |    * Add support for all types of train-test split (with/without cross-validation)
55 | * Bug fixes:
56 |    * Fix bug with boolean dtype handling that leads to mismatch prediction probability and weird categorical variable encoding.
57 |    * Fix bug with Date dtype and python 3.
58 | 
59 | **Version 1.0.0 (2019-12)**
60 | 
61 | * Initial release
62 | * Model view component: support for tree-based classification algorithms
63 | 
64 | You can log feature requests or issues on our [dedicated Github repository](https://github.com/dataiku/dss-plugin-model-drift/issues).
65 | 
66 | # License
67 | 
68 | The Model drift monitoring plugin is:
69 | 
70 |    Copyright (c) 2020 Dataiku SAS
71 |    Licensed under the [MIT License](LICENSE.md).
72 | 


--------------------------------------------------------------------------------
/code-env/python/desc.json:
--------------------------------------------------------------------------------
1 | {
2 |   "acceptedPythonInterpreters": ["PYTHON27", "PYTHON36"],
3 |   "forceConda": false,
4 |   "installCorePackages": true,
5 |   "installJupyterSupport": true
6 | }


--------------------------------------------------------------------------------
/code-env/python/spec/requirements.txt:
--------------------------------------------------------------------------------
 1 | flask==1.1.2
 2 | scikit-learn>=0.20,<0.21
 3 | scipy>=1.2,<1.3
 4 | xgboost==0.82
 5 | future==0.18.2
 6 | joblib==0.14.1
 7 | enum34==1.1.10
 8 | statsmodels>=0.10,<0.11
 9 | cloudpickle>=1.3,<1.6
10 | jinja2>=2.10,<2.11
11 | 


--------------------------------------------------------------------------------
/custom-recipes/compare-dataset-with-model/recipe.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "meta": {
 3 |         "label": "Compute feature drift of a deployed model",
 4 |         "description": "Measure data drift between the original training data of a model and a new dataset.",
 5 |         "icon": "icon-anchor",
 6 |         "displayOrderRank": 1
 7 |     },
 8 | 
 9 |     "selectableFromDataset": "new",
10 |     "kind": "PYTHON",
11 |     "paramsPythonSetup": "compute_model_id_choice.py",
12 |     "inputRoles" : [
13 |         {
14 |             "name": "model",
15 |             "label": "Deployed model",
16 |             "arity": "UNARY",
17 |             "required": true,
18 |             "acceptsDataset": false,
19 |             "acceptsSavedModel": true
20 |         },
21 |         {
22 |             "name": "new",
23 |             "label": "New dataset",
24 |             "description": "New dataset",
25 |             "arity": "UNARY",
26 |             "required": true,
27 |             "acceptsDataset": true
28 |         }
29 |     ],
30 | 
31 |     "outputRoles" : [
32 |         {
33 |             "name": "output_dataset",
34 |             "label": "Metrics dataset",
35 |             "description": "Drift metrics will be stored in this dataset",
36 |             "arity": "UNARY",
37 |             "required": true,
38 |             "acceptsDataset": true
39 |         }
40 |     ],
41 | 
42 |     "params": [
43 |         {
44 |             "name": "use_active_version",
45 |             "label": "Use the model's active version",
46 |             "type": "BOOLEAN",
47 |             "defaultValue": true
48 |         },
49 |         {
50 |             "name": "version_id",
51 |             "label": "Model version",
52 |             "type": "SELECT",
53 |             "getChoicesFromPython": true,
54 |             "visibilityCondition": "!model.use_active_version"
55 | 
56 |         },
57 |         {
58 |             "name": "metric_list",
59 |             "label": "Metrics",
60 |             "type": "MULTISELECT",
61 |             "selectChoices": [
62 |                 { "value": "drift_model_accuracy", "label": "Drift score"},
63 |                 { "value": "fugacity", "label": "Fugacity"},
64 |                 { "value": "riskiest_features", "label": "Riskiest features"},
65 |                 { "value": "feature_importance", "label": "Feature importance"}
66 |             ],
67 |             "defaultValue": ["drift_score"]
68 |         }
69 |     ],
70 | 
71 |     "resourceKeys": []
72 | 
73 | }
74 | 


--------------------------------------------------------------------------------
/custom-recipes/compare-dataset-with-model/recipe.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import datetime
 3 | import logging
 4 | from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
 5 | from dku_tools import set_column_description, get_input_output, get_params_with_model, build_drift_metric_dataframe
 6 | from model_metadata import get_train_date
 7 | from dku_data_drift.drift_analyzer import DriftAnalyzer
 8 | from dku_data_drift.model_accessor import ModelAccessor
 9 | from dku_data_drift.dataset_helpers import get_partitioning_columns
10 | from dku_data_drift.model_drift_constants import ModelDriftConstants
11 | from model_metadata import get_model_handler
12 | 
13 | 
14 | # init logger
15 | logger = logging.getLogger(__name__)
16 | logging.basicConfig(level=logging.INFO, format='Model Drift Recipe | %(levelname)s - %(message)s')
17 | 
18 | new_dataset, model, output_dataset = get_input_output(has_model_as_second_input=True)
19 | new_df = new_dataset.get_dataframe(bool_as_str=True, limit=ModelDriftConstants.MAX_NUM_ROW)
20 | 
21 | partition_cols_new_df = get_partitioning_columns(new_dataset)
22 | if partition_cols_new_df:
23 |     new_df = new_df.drop(partition_cols_new_df, axis=1)
24 | if len(new_df.columns) == 0:
25 |     raise ValueError('Without the partition column, dataset is empty.')
26 | 
27 | version_id, metric_list = get_params_with_model(get_recipe_config(), model)
28 | 
29 | # Access the model
30 | model_handler = get_model_handler(model=model, version_id=version_id)
31 | model_accessor = ModelAccessor(model_handler)
32 | 
33 | # Analyze the drift
34 | drifter = DriftAnalyzer(prediction_type=None)
35 | drifter.fit(new_df, model_accessor=model_accessor)
36 | 
37 | # Write the drift score and metrics
38 | timestamp = datetime.datetime.now()
39 | model_train_date = get_train_date(model.get_id(), version_id)
40 | new_df = pd.DataFrame({ModelDriftConstants.TIMESTAMP: [timestamp],
41 |                        ModelDriftConstants.MODEL_ID: [model.get_id()],
42 |                        ModelDriftConstants.VERSION_ID: [version_id],
43 |                        ModelDriftConstants.TRAIN_DATE: [model_train_date]})
44 | #specify the column order
45 | new_df = new_df[[ModelDriftConstants.TIMESTAMP, ModelDriftConstants.MODEL_ID, ModelDriftConstants.VERSION_ID, ModelDriftConstants.TRAIN_DATE]]
46 | metrics_df, column_description_dict = build_drift_metric_dataframe(drifter, metric_list, new_df, has_model_as_input=True)
47 | 
48 | output_dataset.write_with_schema(metrics_df)
49 | set_column_description(output_dataset, column_description_dict)


--------------------------------------------------------------------------------
/custom-recipes/compare-datasets/recipe.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "meta": {
 3 |         "label": "Compute drift between two datasets",
 4 |         "description": "Measure data drift between two datasets having the same schema.",
 5 |         "icon": "icon-anchor",
 6 |         "displayOrderRank": 2
 7 |     },
 8 | 
 9 |     "selectableFromDataset": "original",
10 |     "kind": "PYTHON",
11 |     "inputRoles" : [
12 |         {
13 |             "name": "original",
14 |             "label": "Original dataset",
15 |             "description": "",
16 |             "arity": "UNARY",
17 |             "required": true,
18 |             "acceptsDataset": true
19 |         },
20 |         {
21 |             "name": "new",
22 |             "label": "New dataset",
23 |             "description": "",
24 |             "arity": "UNARY",
25 |             "required": true,
26 |             "acceptsDataset": true
27 |         }
28 |     ],
29 | 
30 |     "outputRoles" : [
31 |         {
32 |             "name": "output_dataset",
33 |             "label": "Metrics dataset",
34 |             "description": "Dataset storing drift metrics",
35 |             "arity": "UNARY",
36 |             "required": true,
37 |             "acceptsDataset": true
38 |         }
39 |     ],
40 | 
41 |     "params": [
42 |         {
43 |             "name": "metric_list_without_prediction",
44 |             "label": "Metrics",
45 |             "type": "MULTISELECT",
46 |             "selectChoices": [
47 |                 { "value": "drift_model_accuracy", "label": "Drift score"},
48 |                 { "value": "feature_importance", "label": "Most drifted features"}
49 |             ],
50 |             "defaultValue": ["drift_model_accuracy"]
51 |         },
52 |         {
53 |             "name": "columns_to_remove",
54 |             "label": "Columns to ignore",
55 |             "type": "COLUMNS",
56 |             "columnRole": "original"
57 |         }
58 |     ],
59 |     "resourceKeys": []
60 | 
61 | }
62 | 


--------------------------------------------------------------------------------
/custom-recipes/compare-datasets/recipe.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import datetime
 3 | import logging
 4 | from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role, get_recipe_config
 5 | from dku_data_drift.drift_analyzer import DriftAnalyzer
 6 | from dku_data_drift.dataset_helpers import get_partitioning_columns
 7 | from dku_data_drift.model_drift_constants import ModelDriftConstants
 8 | from dku_tools import set_column_description, get_input_output, get_params_without_model, build_drift_metric_dataframe
 9 | 
10 | logger = logging.getLogger(__name__)
11 | logging.basicConfig(level=logging.INFO, format='Model Drift Recipe | %(levelname)s - %(message)s')
12 | 
13 | new_dataset, original_dataset, output_dataset = get_input_output()
14 | original_df = original_dataset.get_dataframe(bool_as_str=True, limit=ModelDriftConstants.MAX_NUM_ROW)
15 | new_df = new_dataset.get_dataframe(bool_as_str=True, limit=ModelDriftConstants.MAX_NUM_ROW)
16 | 
17 | columns_to_remove, metric_list = get_params_without_model(get_recipe_config())
18 | 
19 | if len(columns_to_remove) != 0:
20 |     to_remove_in_original = set(original_df.columns).intersection(set(columns_to_remove))
21 |     if to_remove_in_original:
22 |         original_df = original_df.drop(list(to_remove_in_original), axis=1)
23 |     to_remove_in_new = set(new_df.columns).intersection(set(columns_to_remove))
24 |     if to_remove_in_new:
25 |         new_df = new_df.drop(list(to_remove_in_new), axis=1)
26 | 
27 | # Handle partitioning
28 | partition_cols_new_df = get_partitioning_columns(new_dataset)
29 | partition_cols_original_df = get_partitioning_columns(original_dataset)
30 | if partition_cols_original_df:
31 |     original_df = original_df.drop(partition_cols_original_df, axis=1)
32 | if partition_cols_new_df:
33 |     new_df = new_df.drop(partition_cols_new_df, axis=1)
34 | if len(new_df.columns) == 0 or len(original_df.columns) == 0:
35 |     raise ValueError('Without the partition column, at least one of the datasets is empty.')
36 | 
37 | # Analyse the drift
38 | drifter = DriftAnalyzer()
39 | drifter.fit(new_df=new_df, original_df=original_df)
40 | 
41 | # Write the drift score and metrics
42 | timestamp = datetime.datetime.now()
43 | new_df = pd.DataFrame({ModelDriftConstants.TIMESTAMP: [timestamp]})
44 | metrics_df, column_description_dict = build_drift_metric_dataframe(drifter, metric_list, new_df, has_model_as_input=False)
45 | 
46 | output_dataset.write_with_schema(metrics_df)
47 | set_column_description(output_dataset, column_description_dict)
48 | 
49 | 


--------------------------------------------------------------------------------
/plugin.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "id": "model-drift",
 3 |     "version": "3.1.6",
 4 |     "meta": {
 5 |         "label": "Model drift monitoring (deprecated)",
 6 |         "description": "Get insights on data drift between two (training) datasets. ⚠️ We recommend using the native feature Model Evaluation Store to compute feature drift of a deployed model.",
 7 |         "author": "Dataiku (Léo Dreyfus-Schmidt, Du Phan & Thibault Desfontaines)",
 8 |         "icon": "icon-anchor",
 9 |     	"supportLevel": "TIER2_SUPPORT",
10 |         "licenseInfo": "Apache Software License",
11 |         "url": "https://www.dataiku.com/product/plugins/model-drift/",
12 |         "tags": ["Machine Learning"]
13 |     }
14 | }
15 | 


--------------------------------------------------------------------------------
/python-lib/dku_data_drift/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from dku_data_drift.drift_analyzer import DriftAnalyzer
3 | from dku_data_drift.model_accessor import ModelAccessor
4 | from dku_data_drift.model_drift_constants import ModelDriftConstants


--------------------------------------------------------------------------------
/python-lib/dku_data_drift/dataframe_helpers.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | """
 4 |     Simple functions helpers
 5 | """
 6 | 
 7 | import logging
 8 | import sys
 9 | 
10 | logger = logging.getLogger(__name__)
11 | logging.basicConfig(level=logging.INFO, format='Model Drift Plugin | %(levelname)s - %(message)s')
12 | 
13 | logger.info("Python version: {}".format(sys.version))
14 | # python3 does not have basetring
15 | try:
16 |     basestring
17 | except NameError:
18 |     basestring = str
19 | 
20 | 
21 | def schema_are_compatible(df1, df2):
22 |     """
23 |     Return True if df1 and df2 have the same columns
24 |     :param df1: Pandas dataframe
25 |     :param df2: Pandas dataframe
26 |     :return:
27 |     """
28 |     return set(df1.columns) == set(df2.columns)
29 | 
30 | 
31 | def not_enough_data(df, min_len=1):
32 |     """
33 |         Compare length of dataframe to minimum lenght of the test data.
34 |         Used in the relevance of the measure.
35 |     :param df: Input dataframe
36 |     :param min_len:
37 |     :return:
38 |     """
39 |     return len(df) < min_len
40 | 
41 | 
42 | def nothing_to_do(stuff):
43 |     return stuff is None
44 | 
45 | 
46 | def generic_check_compute_arguments(datetime_column, groupby_columns):
47 |     """
48 |         Check columns argument in the dataframe. Date is always tricky to handle.
49 |     :param datetime_column:
50 |     :param groupby_columns:
51 |     :return:
52 |     """
53 |     if not isinstance(datetime_column, basestring):
54 |         raise ValueError('datetime_column param must be string. Got: ' + str(datetime_column))
55 |     if groupby_columns:
56 |         if not isinstance(groupby_columns, list):
57 |             raise ValueError('groupby_columns param must be an array of strings. Got: ' + str(groupby_columns))
58 |         for col in groupby_columns:
59 |             if not isinstance(col, basestring):
60 |                 raise ValueError('groupby_columns param must be an array of strings. Got: ' + str(col))


--------------------------------------------------------------------------------
/python-lib/dku_data_drift/dataset_helpers.py:
--------------------------------------------------------------------------------
 1 | def get_partitioning_columns(dataset):
 2 |     """
 3 |     Dataset DSS object
 4 |     :param dataset: 
 5 |     :return: The partitioning columns as a list of strings
 6 |     """
 7 |     partitioning_settings = dataset.get_config().get('partitioning', {})
 8 |     partitioning_dimensions = partitioning_settings.get('dimensions', [])
 9 |     is_filesystem_partition = 'filePathPattern' in dataset.get_config().get('partitioning', {})
10 |     if len(partitioning_dimensions) > 0 and not is_filesystem_partition:
11 |         partitioning_columns = [col.get('name') for col in partitioning_dimensions]
12 |         return partitioning_columns
13 |     return []
14 | 
15 | 


--------------------------------------------------------------------------------
/python-lib/dku_data_drift/drift_analyzer.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import logging
  3 | import numpy as np
  4 | import pandas as pd
  5 | import scipy.stats
  6 | import statsmodels.stats.proportion
  7 | from sklearn.metrics import accuracy_score
  8 | from sklearn.ensemble import RandomForestClassifier
  9 | from sklearn.preprocessing import KBinsDiscretizer
 10 | from dku_data_drift.preprocessing import Preprocessor
 11 | from dku_data_drift.model_tools import format_proba_density
 12 | from dku_data_drift.model_drift_constants import ModelDriftConstants
 13 | 
 14 | logger = logging.getLogger(__name__)
 15 | 
 16 | 
 17 | class DriftAnalyzer(object):
 18 | 
 19 |     def __init__(self, prediction_type=None):
 20 |         self.prediction_type = prediction_type
 21 |         self.drift_clf = RandomForestClassifier(n_estimators=100, random_state=1337, max_depth=13, min_samples_leaf=1)
 22 |         self._original_df = None
 23 |         self._new_df = None
 24 |         self._drift_test_X = None
 25 |         self._drift_test_Y = None
 26 |         self._model_accessor = None
 27 |         self.has_predictions = False
 28 |         self.target = None
 29 |         self.features_in_drift_model = None
 30 |         self.sample_size = None
 31 | 
 32 |     def get_prediction_type(self):
 33 |         return self.prediction_type
 34 | 
 35 |     def fit(self, new_df, model_accessor=None, original_df=None, target=None):
 36 |         """
 37 |         Trains a classifier that attempts to discriminate between rows from the provided dataframe and
 38 |         rows from the dataset originally used to evaluate the model
 39 | 
 40 |         Returns (columns, classifier)
 41 |         """
 42 |         logger.info("Preparing the drift model...")
 43 | 
 44 |         if model_accessor is not None and original_df is not None:
 45 |             raise ValueError('model_accessor and original_df can not be defined at the same time. Please choose one of them.')
 46 | 
 47 |         if model_accessor is not None and original_df is None and target is None:
 48 |             self._model_accessor = model_accessor
 49 |             self.has_predictions = True
 50 |             self.target = self._model_accessor.get_target_variable()
 51 |             self.prediction_type = self._model_accessor.get_prediction_type()
 52 |             original_df = self._model_accessor.get_original_test_df()
 53 |             df = self.prepare_data_when_having_model(new_df, original_df)
 54 |         elif model_accessor is None and original_df is not None and target is not None:
 55 |             self.has_predictions = True
 56 |             self.target = target
 57 |             df = self.prepare_data_when_having_target(new_df, original_df)
 58 |         elif model_accessor is None and original_df is not None and target is None:
 59 |             df = self.prepare_data_when_without_target(new_df, original_df)
 60 |         else:
 61 |             raise NotImplementedError('You need to precise either a model accessor or an original df.')
 62 | 
 63 |         preprocessor = Preprocessor(df, target=ModelDriftConstants.ORIGIN_COLUMN)
 64 |         train, test = preprocessor.get_processed_train_test()
 65 |         drift_train_X = train.drop(ModelDriftConstants.ORIGIN_COLUMN, axis=1)
 66 |         drift_train_Y = np.array(train[ModelDriftConstants.ORIGIN_COLUMN])
 67 |         self._drift_test_X = test.drop(ModelDriftConstants.ORIGIN_COLUMN, axis=1)  # we will use them later when compute metrics
 68 |         self._drift_test_Y = np.array(test[ModelDriftConstants.ORIGIN_COLUMN])
 69 |         self.features_in_drift_model = drift_train_X.columns
 70 | 
 71 |         logger.info("Fitting the drift model...")
 72 |         self.drift_clf.fit(drift_train_X, drift_train_Y)
 73 | 
 74 |     def prepare_data_when_having_model(self, new_df, original_df):
 75 |         logger.info('Prepare data with model')
 76 | 
 77 |         if self.target not in original_df:
 78 |             raise ValueError('The original dataset does not contain target "{}".'.format(self.target))
 79 | 
 80 |         self._new_df = new_df
 81 |         self._original_df = original_df
 82 |         original_df_without_target = original_df.drop(self.target, axis=1)
 83 |         return self._prepare_data_for_drift_model(new_df, original_df_without_target)
 84 | 
 85 |     def prepare_data_when_having_target(self, new_df, original_df):
 86 |         logger.info('Prepare data with target for drift model')
 87 | 
 88 |         if self.target not in new_df:
 89 |             raise ValueError('The new dataset does not contain target "{}".'.format(self.target))
 90 | 
 91 |         if self.target not in original_df:
 92 |             raise ValueError('The original dataset does not contain target "{}".'.format(self.target))
 93 | 
 94 |         self._new_df = new_df
 95 |         self._original_df = original_df
 96 |         new_df_without_target = new_df.drop(self.target, axis=1)
 97 |         original_df_without_target = original_df.drop(self.target, axis=1)
 98 |         return self._prepare_data_for_drift_model(new_df_without_target, original_df_without_target)
 99 | 
100 |     def prepare_data_when_without_target(self, new_df, original_df):
101 |         logger.info('Prepare data without target for drift model')
102 |         return self._prepare_data_for_drift_model(new_df, original_df)
103 | 
104 |     def get_drift_metrics_for_webapp(self):
105 |         """
106 |         Return a dict of metrics with a format to be easily used in frontend
107 |         """
108 | 
109 |         if self.features_in_drift_model is None or self.drift_clf is None:
110 |             logger.warning('drift_features and drift_clf must be defined')
111 |             return {}
112 | 
113 |         logger.info("Computing drift metrics ...")
114 |         drift_accuracy, drift_accuracy_lower, drift_accuracy_upper, drift_test_pvalue = self.get_drift_score(output_raw_score=True)
115 |         feature_importance_metrics, riskiest_features = self._get_feature_importance_metrics()
116 | 
117 |         if self.prediction_type == ModelDriftConstants.REGRRSSION_TYPE:
118 |             kde_dict = self._get_regression_prediction_kde()
119 |             fugacity_metrics = {}
120 |             label_list = []
121 |         elif self.prediction_type == ModelDriftConstants.CLASSIFICATION_TYPE:
122 |             logger.info("Compute classification drift metrics for classification")
123 |             kde_dict, fugacity_metrics, label_list = self._get_classification_prediction_metrics()
124 |         else:
125 |             raise ValueError('Prediction type not defined.')
126 | 
127 |         return {'type': self.prediction_type,
128 |                 'sample_size': self.sample_size,
129 |                 'feature_importance': feature_importance_metrics,
130 |                 'drift_accuracy': round(drift_accuracy, 3),
131 |                 'drift_accuracy_lower': round(drift_accuracy_lower, 3),
132 |                 'drift_accuracy_upper': round(drift_accuracy_upper, 3),
133 |                 'drift_test_pvalue': round(drift_test_pvalue, 5),
134 |                 'kde': kde_dict,
135 |                 'fugacity': fugacity_metrics,
136 |                 'label_list': label_list,
137 |                 'riskiest_features': riskiest_features}
138 | 
139 |     def _get_classification_prediction_metrics(self):
140 | 
141 |         if not self.has_predictions:
142 |             raise ValueError('DriftAnalyzer needs a target.')
143 | 
144 |         if self.prediction_type != ModelDriftConstants.CLASSIFICATION_TYPE:
145 |             raise ValueError('Can not use this function with a {} model.'.format(self.prediction_type))
146 | 
147 |         if self._model_accessor is not None:
148 |             prediction_dict = self.get_predictions_from_original_model(limit=ModelDriftConstants.PREDICTION_TEST_SIZE)
149 |             predictions_by_class = {}
150 |             for label in prediction_dict.get(ModelDriftConstants.FROM_ORIGINAL).columns:
151 |                 if 'proba_' in label:
152 |                     original_proba = np.around(prediction_dict.get(ModelDriftConstants.FROM_ORIGINAL)[label].values, 2).tolist()
153 |                     new_proba = np.around(prediction_dict.get(ModelDriftConstants.FROM_NEW)[label].values, 2).tolist()
154 |                     predictions_by_class[label] = {ModelDriftConstants.FROM_ORIGINAL: original_proba, ModelDriftConstants.FROM_NEW: new_proba}
155 |             kde_dict = {}
156 |             for label in predictions_by_class.keys():
157 |                 kde_original = format_proba_density(predictions_by_class.get(label).get(ModelDriftConstants.FROM_ORIGINAL))
158 |                 kde_new = format_proba_density(predictions_by_class.get(label).get(ModelDriftConstants.FROM_NEW))
159 |                 cleaned_label = label.replace('proba_', ModelDriftConstants.CLASS)
160 |                 kde_dict[cleaned_label] = {ModelDriftConstants.FROM_ORIGINAL: kde_original, ModelDriftConstants.FROM_NEW: kde_new}
161 |             fugacity = self.get_classification_fugacity(reformat=True)
162 |             label_list = [label for label in fugacity[0].keys() if label != 'source']
163 | 
164 |             return kde_dict, fugacity, label_list
165 |         else:
166 |             fugacity = self.get_classification_fugacity()
167 |             label_list = fugacity[ModelDriftConstants.CLASS].unique()
168 |             return None, fugacity, label_list
169 | 
170 |     def _get_regression_prediction_kde(self):
171 | 
172 |         if not self.has_predictions:
173 |             raise ValueError('No target was defined at fit phase.')
174 | 
175 |         if self.prediction_type != ModelDriftConstants.REGRRSSION_TYPE:
176 |             raise ValueError('Can not use this function with a {} model.'.format(self.prediction_type))
177 | 
178 |         prediction_dict = self.get_predictions_from_original_model(limit=ModelDriftConstants.PREDICTION_TEST_SIZE)
179 |         original_serie = prediction_dict.get(ModelDriftConstants.FROM_ORIGINAL).values
180 |         new_serie = prediction_dict.get(ModelDriftConstants.FROM_NEW).values
181 |         min_support = float(min(min(original_serie), min(new_serie)))
182 |         max_support = float(max(max(original_serie), max(new_serie)))
183 |         logger.info("Computed histogram support: [{},{}]".format(min_support, max_support))
184 |         kde_original = format_proba_density(original_serie, min_support=min_support, max_support=max_support)
185 |         kde_new = format_proba_density(new_serie, min_support=min_support, max_support=max_support)
186 |         kde_dict= {
187 |             'Prediction': {
188 |                 ModelDriftConstants.FROM_ORIGINAL: kde_original,
189 |                 ModelDriftConstants.FROM_NEW: kde_new,
190 |                 "min_support": min_support,
191 |                 "max_support": max_support
192 |             }
193 |         }
194 |         return kde_dict
195 | 
196 |     def get_regression_fugacity(self):
197 |         """
198 |         TODO refactor
199 | 
200 |         """
201 |         kde_dict = self._get_regression_prediction_kde()
202 |         new = kde_dict.get('Prediction').get('new')
203 |         old = kde_dict.get('Prediction').get('original')
204 |         old_arr = np.array(old).T
205 |         df = pd.DataFrame(new, columns=['val_new', 'new_density'])
206 |         df['val_old'] = old_arr[0]
207 |         df['old_density'] = old_arr[1]
208 |         kb = KBinsDiscretizer(n_bins=10, encode='ordinal')
209 |         df['old_bin'] = kb.fit_transform(df['val_old'].values.reshape(-1, 1)).reshape(-1, ).astype(int)
210 |         df['new_bin'] = kb.transform(df['val_new'].values.reshape(-1, 1)).reshape(-1, ).astype(int)
211 |         full_density_old = df.old_density.sum()
212 |         full_density_new = df.new_density.sum()
213 |         fuga_old = 100 * df.groupby('old_bin').old_density.sum() / full_density_old
214 |         fuga_new = 100 * df.groupby('new_bin').new_density.sum() / full_density_new
215 | 
216 |         fuga_old_df = pd.DataFrame(fuga_old).reset_index()
217 |         fuga_old_df['old_bin'] = fuga_old_df['old_bin'].map(lambda x: 'fugacity_decile_{}'.format(x))
218 |         old_fugacity_values = fuga_old_df.set_index('old_bin').to_dict().get('old_density')
219 | 
220 |         fuga_new_df = pd.DataFrame(fuga_new).reset_index()
221 |         fuga_new_df['new_bin'] = fuga_new_df['new_bin'].map(lambda x: 'fugacity_decile_{}'.format(x))
222 |         new_fugacity_values = fuga_new_df.set_index('new_bin').to_dict().get('new_density')
223 |         fugacity = {}
224 |         for k, v in old_fugacity_values.items():
225 |             fugacity[k] = {ModelDriftConstants.ORIGINAL_DATASET: v, ModelDriftConstants.NEW_DATASET: new_fugacity_values.get(k)}
226 | 
227 |         fugacity_relative_change_values = np.around(100*(fuga_new - fuga_old)/fuga_old, decimals=3)
228 |         fuga_relative_change_df = pd.DataFrame(fugacity_relative_change_values.to_dict(), index=[0])
229 |         fuga_diff_columns = [ModelDriftConstants.FUGACITY_RELATIVE_CHANGE_REGRESSION_LABEL.format(col) for col in fuga_relative_change_df.columns]
230 |         fuga_relative_change_df.columns = fuga_diff_columns
231 | 
232 |         fugacity_relative_change = fuga_relative_change_df.iloc[0].to_dict()
233 | 
234 |         e = '-inf'
235 |         decile_interval_description = []
236 |         for edge in kb.bin_edges_[0][1:-1]:
237 |             decile_interval_description.append('from {0} to {1}'.format(e, round(edge, 2)))
238 |             e = round(edge, 3)
239 | 
240 |         decile_interval_description.append('from {0} to +inf'.format(round(kb.bin_edges_[0][-2], 2)))
241 |         return fugacity, fugacity_relative_change, decile_interval_description
242 | 
243 | 
244 |     def _prepare_data_for_drift_model(self, new_df, original_df, min_num_row=ModelDriftConstants.MIN_NUM_ROWS, max_num_row=ModelDriftConstants.MAX_NUM_ROW):
245 |         """
246 |         Sampling function so that original test set and new test set has the same ratio in the drift training set
247 |         For now only do top n sampling, with max n = MAX_NUM_ROW
248 | 
249 |         :return: a dataframe with data source target (orignal vs new)
250 |         """
251 | 
252 |         original_df[ModelDriftConstants.ORIGIN_COLUMN] = ModelDriftConstants.FROM_ORIGINAL
253 |         new_df[ModelDriftConstants.ORIGIN_COLUMN] = ModelDriftConstants.FROM_NEW
254 | 
255 |         logger.info("Rebalancing data:")
256 |         number_of_rows = min(original_df.shape[0], new_df.shape[0], max_num_row)
257 |         self.sample_size = number_of_rows
258 |         logger.info(" - original dataset had %s rows, new dataset has %s. Selecting the first %s for each." % (original_df.shape[0], new_df.shape[0], number_of_rows))
259 | 
260 |         df = pd.concat([original_df.head(number_of_rows), new_df.head(number_of_rows)], sort=False)
261 | 
262 |         if self._model_accessor is not None:
263 |             selected_features = [ModelDriftConstants.ORIGIN_COLUMN] + self._model_accessor.get_selected_features()
264 |         else:
265 |             selected_features = original_df.columns
266 | 
267 | 
268 |         logger.info('Features used for drift models: {}'.format(selected_features))
269 |         missing_features = set(selected_features) - set(new_df.columns)
270 |         if len(missing_features) > 0:
271 |             raise ValueError('Missing column(s) in the new dataframe: {}'.format(', '.join(list(missing_features))))
272 | 
273 |         return df.loc[:, selected_features]
274 | 
275 |     def get_drift_feature_importance(self, cumulative_percentage_threshold=ModelDriftConstants.FEAT_IMP_CUMULATIVE_PERCENTAGE_THRESHOLD):
276 |         feature_importance = []
277 |         for feature_name, feat_importance in zip(self.features_in_drift_model, self.drift_clf.feature_importances_):
278 |             feature_importance.append({
279 |                 ModelDriftConstants.FEATURE: feature_name,
280 |                 ModelDriftConstants.IMPORTANCE: 100 * feat_importance / sum(self.drift_clf.feature_importances_)
281 |             })
282 | 
283 |         dfx = pd.DataFrame(feature_importance).sort_values(by=ModelDriftConstants.IMPORTANCE, ascending=False).reset_index(drop=True)
284 |         dfx[ModelDriftConstants.CUMULATIVE_IMPORTANCE] = dfx[ModelDriftConstants.IMPORTANCE].cumsum()
285 |         dfx_top = dfx.loc[dfx[ModelDriftConstants.CUMULATIVE_IMPORTANCE] <= cumulative_percentage_threshold]
286 |         return dfx_top.rename_axis(ModelDriftConstants.RANK).reset_index().set_index(ModelDriftConstants.FEATURE)
287 | 
288 |     def get_original_feature_importance(self, cumulative_percentage_threshold=ModelDriftConstants.FEAT_IMP_CUMULATIVE_PERCENTAGE_THRESHOLD):
289 |         if self._model_accessor is not None:
290 |             return self._model_accessor.get_feature_importance(cumulative_percentage_threshold)
291 |         else:
292 |             raise ValueError('DriftAnalyzer needs a ModelAccessor as input.')
293 | 
294 |     def get_riskiest_features(self, drift_feature_importance=None, original_feature_importance=None, ratio_threshold=ModelDriftConstants.RISKIEST_FEATURES_RATIO_THRESHOLD):
295 |         """
296 |         Return a list of features that users should check (ie. those that are on the top right quadrant of the feat imp plot)
297 |         If either feature importance is not computed, compute them here
298 | 
299 |         :param drift_feature_importance:
300 |         :param original_feature_importance:
301 |         :return:
302 |         """
303 |         if drift_feature_importance is None:
304 |             drift_feature_importance = self.get_drift_feature_importance()
305 |         if original_feature_importance is None:
306 |             original_feature_importance = self.get_original_feature_importance()
307 | 
308 |         original_feat_imp_threshold = ratio_threshold * max(original_feature_importance[ModelDriftConstants.IMPORTANCE])
309 |         drift_feat_imp_threshold = ratio_threshold * max(drift_feature_importance[ModelDriftConstants.IMPORTANCE])
310 |         top_original_features = original_feature_importance[original_feature_importance[ModelDriftConstants.IMPORTANCE] > original_feat_imp_threshold].index
311 |         top_drift_features = drift_feature_importance[drift_feature_importance[ModelDriftConstants.IMPORTANCE] > drift_feat_imp_threshold].index
312 | 
313 |         return list(set(top_original_features).intersection(top_drift_features))
314 | 
315 |     def _get_feature_importance_metrics(self):
316 |         """
317 |         For visualisation purpose
318 | 
319 |         :return:
320 |         """
321 |         original_feature_importance_df = self.get_original_feature_importance()
322 |         drift_feature_importance_df = self.get_drift_feature_importance()
323 |         topn_drift_feature = drift_feature_importance_df.to_dict()[ModelDriftConstants.IMPORTANCE]
324 |         topn_original_feature = original_feature_importance_df.to_dict()[ModelDriftConstants.IMPORTANCE]
325 |         feature_importance_metrics = []
326 |         for feature in set(topn_original_feature.keys()).union(set(topn_drift_feature.keys())):
327 |             drift_feat_rank = topn_drift_feature.get(feature)
328 |             original_feat_rank = topn_original_feature.get(feature)
329 |             if drift_feat_rank is None:
330 |                 logger.warn('Feature {} does not exist in the most important features of the drift model.'.format(feature))
331 |             if original_feat_rank is None:
332 |                 logger.warn('Feature {} does not exist in the most important features of the orignal model.'.format(feature))
333 |             feature_importance_metrics.append({
334 |                 'original_model': original_feat_rank if original_feat_rank else 0.01,
335 |                  'drift_model': drift_feat_rank if drift_feat_rank else 0.01,
336 |                  'feature': feature
337 |             })
338 | 
339 |         riskiest_feature = self.get_riskiest_features(drift_feature_importance=drift_feature_importance_df, original_feature_importance=original_feature_importance_df)
340 |         return feature_importance_metrics, riskiest_feature
341 |     
342 |     def get_drift_score(self, output_raw_score=False, confidence_level=0.95):
343 | 
344 |         """
345 |         Drift score is the accuracy of drift model (with an exponential transform by default)
346 | 
347 |         :param
348 |             output_raw_score
349 |             confidence_level
350 |         :return:
351 |         """
352 |         predicted_Y = self.drift_clf.predict(self._drift_test_X)
353 |         test_Y = pd.Series(self._drift_test_Y)
354 |         drift_accuracy = accuracy_score(test_Y, predicted_Y)
355 | 
356 |         # 95% confidence interval around accuracy
357 |         nb_correct = sum(test_Y == predicted_Y)
358 |         nb_total = len(test_Y)
359 |         drift_accuracy_lower, drift_accuracy_upper = statsmodels.stats.proportion.proportion_confint(
360 |             nb_correct, nb_total, method="wilson", alpha=(1 - confidence_level)
361 |         )
362 | 
363 |         # H0: there is no drift (== domain classifier is correct 50% of the time)
364 |         drift_test_pvalue = scipy.stats.binom_test(nb_correct, nb_total, p=.5, alternative='greater')
365 | 
366 |         if output_raw_score:
367 |             return drift_accuracy, drift_accuracy_lower, drift_accuracy_upper, drift_test_pvalue
368 |         else:
369 |             exponential_function = lambda x: round(np.exp(1 - 1 / (np.power(x, 2.5))), 2)
370 |             return exponential_function(drift_accuracy) # make the score looks more "logic" from the user point of view
371 | 
372 |     def get_predictions_from_original_model(self, limit=ModelDriftConstants.MAX_NUM_ROW):
373 |         """
374 |         Predictions on the test set of original and new data
375 | 
376 |         The result of model_accessor.predict() is a dataframe prediction|proba_0|proba_1|...
377 |         """
378 |         if not self.has_predictions:
379 |             raise ValueError('No target was defined at fit phase.')
380 | 
381 |         if self._model_accessor is not None:
382 |             original_prediction_df = self._model_accessor.predict(self._original_df[:limit])
383 |             original_prediction_df = original_prediction_df.rename(columns={'prediction':self.target})
384 |             new_predicton_df = self._model_accessor.predict(self._new_df[:limit])
385 |             new_predicton_df = new_predicton_df.rename(columns={'prediction':self.target})
386 | 
387 |             if self._model_accessor.get_prediction_type() == ModelDriftConstants.CLASSIFICATION_TYPE:
388 |                 proba_columns = [col for col in original_prediction_df.columns if 'proba_' in col]
389 |                 # move to % scale, it plays nicer with d3 ...
390 |                 original_prediction_df.loc[:, proba_columns] = np.around(original_prediction_df.loc[:, proba_columns] * 100)
391 |                 new_predicton_df.loc[:, proba_columns] = np.around(new_predicton_df.loc[:, proba_columns] * 100)
392 | 
393 |             return {ModelDriftConstants.FROM_ORIGINAL: original_prediction_df, ModelDriftConstants.FROM_NEW: new_predicton_df}
394 | 
395 |         else: # no proba columns
396 |             original_prediction_df = self._original_df.loc[:, [self.target]]
397 |             new_prediciton_df = self._new_df.loc[:, [self.target]]
398 |             return {ModelDriftConstants.FROM_ORIGINAL: original_prediction_df, ModelDriftConstants.FROM_NEW: new_prediciton_df}
399 | 
400 | 
401 |     def get_classification_fugacity(self, reformat=False):
402 |         """
403 |         For classification only, this compute the ratio of each predicted label
404 | 
405 |         :param prediction_dict:
406 |         :return:
407 |         """
408 |         if self.prediction_type != ModelDriftConstants.CLASSIFICATION_TYPE:
409 |             raise ValueError('This function is for prediction of type {0}.'.format(self.prediction_type))
410 | 
411 |         if not self.has_predictions:
412 |             raise ValueError('No target was defined in the fit phase.')
413 | 
414 |         prediction_dict = self.get_predictions_from_original_model(limit=ModelDriftConstants.PREDICTION_TEST_SIZE)
415 |         original_prediction_df = prediction_dict.get(ModelDriftConstants.FROM_ORIGINAL)
416 |         new_prediciton_df = prediction_dict.get(ModelDriftConstants.FROM_NEW)
417 | 
418 |         if reformat: # for the model view
419 |             original_fugacity = (100 * original_prediction_df[self.target].value_counts(normalize=True)).round(decimals=2).to_dict()
420 |             new_fugacity = (100 * new_prediciton_df[self.target].value_counts(normalize=True)).round(decimals=2).to_dict()
421 |             fugacity = []
422 |             for key in original_fugacity.keys():
423 |                 temp_fugacity = {}
424 |                 new_key = "Predicted {} (%)".format(key)
425 |                 temp_fugacity[' Score'] = new_key
426 |                 temp_fugacity['Test dataset'] = original_fugacity.get(key, 0.)
427 |                 temp_fugacity['Input dataset'] = new_fugacity.get(key, 0.)
428 |                 fugacity.append(temp_fugacity)
429 |             return fugacity
430 |         else:
431 |             original_fugacity = (100 * original_prediction_df[self.target].value_counts(normalize=True)).round(decimals=2).rename_axis(ModelDriftConstants.CLASS).reset_index(name=ModelDriftConstants.PERCENTAGE)
432 |             new_fugacity = (100 * new_prediciton_df[self.target].value_counts(normalize=True)).round(decimals=2).rename_axis(ModelDriftConstants.CLASS).reset_index(name=ModelDriftConstants.PERCENTAGE)
433 |             fugacity_relative_change = {}
434 |             fugacity = {}
435 | 
436 |             for label in original_fugacity[ModelDriftConstants.CLASS].unique():
437 |                 new_value = new_fugacity[new_fugacity[ModelDriftConstants.CLASS] == label][ModelDriftConstants.PERCENTAGE].values[0]
438 |                 original_value = original_fugacity[original_fugacity[ModelDriftConstants.CLASS] == label][ModelDriftConstants.PERCENTAGE].values[0]
439 |                 fugacity_diff = 100 * float(new_value - original_value)/float(original_value)
440 |                 new_label_relative = ModelDriftConstants.FUGACITY_RELATIVE_CHANGE_CLASSIF_LABEL.format(label)
441 |                 fugacity_relative_change[new_label_relative] = round(fugacity_diff, 3)
442 |                 new_label = ModelDriftConstants.FUGACITY_CLASSIF_LABEL.format(label)
443 |                 fugacity[new_label] = {ModelDriftConstants.ORIGINAL_DATASET: original_value, ModelDriftConstants.NEW_DATASET: new_value}
444 |             return fugacity, fugacity_relative_change


--------------------------------------------------------------------------------
/python-lib/dku_data_drift/model_accessor.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import logging
  3 | import pandas as pd
  4 | from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
  5 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
  6 | from sklearn.calibration import CalibratedClassifierCV
  7 | from dku_data_drift.model_tools import SurrogateModel
  8 | from dku_data_drift.model_drift_constants import ModelDriftConstants
  9 | 
 10 | logger = logging.getLogger(__name__)
 11 | 
 12 | ALGORITHMS_WITH_VARIABLE_IMPORTANCE = [RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, DecisionTreeClassifier,
 13 |                                        RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, DecisionTreeRegressor]
 14 | 
 15 | 
 16 | class ModelAccessor(object):
 17 |     def __init__(self, model_handler=None):
 18 |         self.model_handler = model_handler
 19 | 
 20 |     def get_prediction_type(self):
 21 |         """
 22 |         Wrap the prediction type accessor of the model
 23 |         """
 24 |         if self.model_handler.get_prediction_type() in ModelDriftConstants.DKU_CLASSIFICATION_TYPE:
 25 |             return ModelDriftConstants.CLASSIFICATION_TYPE
 26 |         elif ModelDriftConstants.REGRRSSION_TYPE in self.model_handler.get_prediction_type():
 27 |             return ModelDriftConstants.REGRRSSION_TYPE
 28 |         else:
 29 |             return ModelDriftConstants.CLUSTERING_TYPE
 30 |             
 31 |     def get_target_variable(self):
 32 |         """
 33 |         Return the name of the target variable
 34 |         """
 35 |         return self.model_handler.get_target_variable()
 36 | 
 37 |     def get_original_test_df(self, limit=ModelDriftConstants.MAX_NUM_ROW):
 38 |         try:
 39 |             return self.model_handler.get_test_df()[0][:limit]
 40 |         except Exception as e:
 41 |             logger.warning('Can not retrieve original test set: {}. The plugin will take the whole original dataset.'.format(e))
 42 |             return self.model_handler.get_full_df()[0][:limit]
 43 | 
 44 |     def get_per_feature(self):
 45 |         return self.model_handler.get_per_feature()
 46 | 
 47 |     def get_predictor(self):
 48 |         return self.model_handler.get_predictor()
 49 | 
 50 |     def get_feature_importance(self, cumulative_percentage_threshold=ModelDriftConstants.FEAT_IMP_CUMULATIVE_PERCENTAGE_THRESHOLD):
 51 |         """
 52 |         :param cumulative_percentage_threshold: only return the top n features whose sum of importance reaches this threshold
 53 |         :return:
 54 |         """
 55 |         if self._algorithm_is_tree_based():
 56 |             predictor = self.get_predictor()
 57 |             clf = predictor._clf
 58 |             feature_names = predictor.get_features()
 59 |             feature_importances = clf.feature_importances_
 60 | 
 61 |         else: # use surrogate model
 62 |             logger.info('Fitting surrogate model ...')
 63 |             surrogate_model = SurrogateModel(self.get_prediction_type())
 64 |             original_test_df = self.get_original_test_df()
 65 |             predictions_on_original_test_df = self.get_predictor().predict(original_test_df)
 66 |             surrogate_df = original_test_df[self.get_selected_features()]
 67 |             surrogate_df[ModelDriftConstants.SURROGATE_TARGET] = predictions_on_original_test_df['prediction']
 68 |             surrogate_model.fit(surrogate_df, ModelDriftConstants.SURROGATE_TARGET)
 69 |             feature_names = surrogate_model.get_features()
 70 |             feature_importances = surrogate_model.clf.feature_importances_
 71 | 
 72 |         feature_importance = []
 73 |         for feature_name, feat_importance in zip(feature_names, feature_importances):
 74 |             feature_importance.append({
 75 |                 ModelDriftConstants.FEATURE: feature_name,
 76 |                 ModelDriftConstants.IMPORTANCE: 100 * feat_importance / sum(feature_importances)
 77 |             })
 78 | 
 79 |         dfx = pd.DataFrame(feature_importance).sort_values(by=ModelDriftConstants.IMPORTANCE, ascending=False).reset_index(drop=True)
 80 |         dfx[ModelDriftConstants.CUMULATIVE_IMPORTANCE] = dfx[ModelDriftConstants.IMPORTANCE].cumsum()
 81 |         dfx_top = dfx.loc[dfx[ModelDriftConstants.CUMULATIVE_IMPORTANCE] <= cumulative_percentage_threshold]
 82 |         return dfx_top.rename_axis(ModelDriftConstants.RANK).reset_index().set_index(ModelDriftConstants.FEATURE)
 83 | 
 84 | 
 85 |     def get_selected_features(self):
 86 |         selected_features = []
 87 |         for feat, feat_info in self.get_per_feature().items():
 88 |             if feat_info.get('role') == 'INPUT':
 89 |                 selected_features.append(feat)
 90 |         return selected_features
 91 | 
 92 |     def predict(self, df):
 93 |         return self.get_predictor().predict(df)
 94 | 
 95 |     def _algorithm_is_tree_based(self):
 96 |         predictor = self.get_predictor()
 97 |         algo = predictor._clf
 98 |         if isinstance(algo, CalibratedClassifierCV):
 99 |             logger.info('Algorithm is CalibratedClassifierCV.')
100 |             return False
101 |         for algorithm in ALGORITHMS_WITH_VARIABLE_IMPORTANCE:
102 |             if isinstance(algo, algorithm):
103 |                 logger.info('Algorithm is tree-based: ', algo)
104 |                 return True
105 |             elif predictor.params.modeling_params.get('algorithm') == 'XGBOOST_CLASSIFICATION':
106 |                 logger.info('Algorithm is tree-based: XGBOOST_CLASSIFICATION')
107 |                 return True
108 |             elif predictor.params.modeling_params.get('algorithm') == 'XGBOOST_REGRESSION':
109 |                 logger.info('Algorithm is tree-based: XGBOOST_REGRESSION')
110 |                 return True
111 |         return False


--------------------------------------------------------------------------------
/python-lib/dku_data_drift/model_drift_constants.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | class ModelDriftConstants(object):
 4 | 
 5 |     TIMESTAMP = 'timestamp'
 6 |     MODEL_ID = 'model_id'
 7 |     VERSION_ID = 'version_id'
 8 |     TRAIN_DATE = 'train_date'
 9 |     DRIFT_SCORE = 'drift_model_accuracy'
10 |     DRIFT_SCORE_DEFINITION = 'In order to detect data drift, we train a random forest classifier (the drift model) to discriminate the new data set from the test set. If this classifier has accuracy > 0.5, it implies that test data and new data can be distinguished and that you are observing data drift. You may consider retraining your model in that situation.'
11 |     BINOMIAL_TEST = 'binomial_test'
12 |     BINOMIAL_TEST_DEFINITION = 'The hypothesis tested is that there is no drift, in which case the expected drift model accuracy is 0.5 (datasets undistinguishable). The observed accuracy might deviate from this expectation and the Binomial test evaluates whether this deviation is statistically significant, modelling the number of correct predictions as a random variable drawn from a Binomial distribution. The p-value is the probability to observe this particular accuracy (or larger) under the hypothesis of absent drift. If this probability is lower than the significance level (i.e. 5%), it’s then unlikely to be in the situation of absent drift: the hypothesis of no drift is rejected, triggering a drift detection. The significance level indicates the rate of falsely-detected drifts we are ready to accept from the test.'
13 |     BINOMIAL_P_VALUE = 'binomial_test_p_value'
14 |     BINOMIAL_LOWER_BOUND = 'accuracy_lower_bound'
15 |     BINOMIAL_LOWER_BOUND_DEFINITION = 'Confidence interval lower bound for the accuracy of the domain classifier'
16 |     BINOMIAL_UPPER_BOUND = 'accuracy_upper_bound'
17 |     BINOMIAL_UPPER_BOUND_DEFINITION = 'Confidence interval upper bound for the accuracy of the domain classifier'
18 | 
19 |     FUGACITY = 'fugacity'
20 |     FUGACITY_CLASSIF_DEFINITION = 'Proportion of samples predicted (in %) in each class when scoring on both the original test and the new input dataset.'
21 |     FUGACITY_REGRESSION_DEFINITION = 'Proportion of samples predicted (in %) in each decile when scoring on both the original test and the new input dataset.\n\n'
22 |     FUGACITY_RELATIVE_CHANGE = 'fugacity_relative_change'
23 |     FUGACITY_RELATIVE_CHANGE_CLASSIF_DEFINITION = 'Relative change (in %) in each class with respect to the original fugacity value.\n\nFormula: 100*(new_fugacity - original_fugacity)/original_fugacity'
24 |     FUGACITY_RELATIVE_CHANGE_REGRESSION_DEFINITION = 'Relative change (in %) in each decile with respect to the original fugacity value.\n\nFormula: 100*(new_fugacity - original_fugacity)/original_fugacity\n\n'
25 |     RISKIEST_FEATURES = 'riskiest_features'
26 |     RISKIEST_FEATURES_DEFINITION = 'If the drift score is medium/high (above 0.1), we recommend you to check those features.\nA feature is considered risky if it is both in the top 40% of the most drifted features as well as the top 40% most important features in the original model.'
27 |     MOST_DRIFTED_FEATURES = 'most_drifted_features'
28 | 
29 |     NUMBER_OF_DRIFTED_FEATURES = 20
30 |     MOST_DRIFTED_FEATURES_DEFINITION = 'When the drift score is medium/high (above 0.1), this is the list of features that have been drifted the most, with their % of importance (max {0} features).'.format(NUMBER_OF_DRIFTED_FEATURES)
31 |     MOST_IMPORTANT_FEATURES = 'most_important_features_in_deployed_model'
32 |     MOST_IMPORTANT_FEATURES_DEFINTIION = 'Most important features in the deployed model, with their % of importance (max 20 features).'
33 |     FEATURE_IMPORTANCE = 'feature_importance'
34 | 
35 |     ORIGIN_COLUMN = '__dku_row_origin__'  # name for the column that will contain the information from where the row is from (original test dataset or new dataframe)
36 |     FROM_ORIGINAL = 'original'
37 |     FROM_NEW = 'new'
38 |     MIN_NUM_ROWS = 500
39 |     MAX_NUM_ROW = 100000
40 |     CUMULATIVE_PERCENTAGE_THRESHOLD = 90
41 |     PREDICTION_TEST_SIZE = 100000
42 |     SURROGATE_TARGET = "_dku_predicted_label_"
43 | 
44 |     REGRRSSION_TYPE = 'REGRESSION'
45 |     CLASSIFICATION_TYPE = 'CLASSIFICATION'
46 |     CLUSTERING_TYPE = 'CLUSTERING'
47 |     DKU_CLASSIFICATION_TYPE = ['BINARY_CLASSIFICATION', 'MULTICLASS']
48 | 
49 | 
50 |     FEAT_IMP_CUMULATIVE_PERCENTAGE_THRESHOLD = 95
51 |     RISKIEST_FEATURES_RATIO_THRESHOLD = 0.65
52 | 
53 |     FEATURE = 'feature'
54 |     IMPORTANCE = 'importance'
55 |     CUMULATIVE_IMPORTANCE = 'cumulative_importance'
56 |     RANK = 'rank'
57 |     CLASS = 'class'
58 |     PERCENTAGE = 'percentage'
59 |     ORIGINAL_DATASET = 'original_dataset'
60 |     NEW_DATASET = 'new_dataset'
61 |     FUGACITY_RELATIVE_CHANGE_CLASSIF_LABEL = 'fugacity_relative_change_of_class_{0}'
62 |     FUGACITY_RELATIVE_CHANGE_REGRESSION_LABEL = 'fugacity_relative_change_decile_{0}'
63 |     FUGACITY_CLASSIF_LABEL = 'fugacity_class_{0}'
64 | 
65 | 
66 |     @staticmethod
67 |     def get_supported_metrics():
68 |         return ModelDriftConstants.DRIFT_SCORE, ModelDriftConstants.FUGACITY, ModelDriftConstants.FEATURE_IMPORTANCE, ModelDriftConstants.RISKIEST_FEATURES


--------------------------------------------------------------------------------
/python-lib/dku_data_drift/model_tools.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import logging
  3 | import numpy as np
  4 | import math
  5 | from sklearn.neighbors import KernelDensity
  6 | from sklearn.metrics import roc_auc_score
  7 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
  8 | from dku_data_drift.preprocessing import Preprocessor
  9 | from dku_data_drift.model_drift_constants import ModelDriftConstants
 10 | 
 11 | logger = logging.getLogger(__name__)
 12 | 
 13 | 
 14 | def mroc_auc_score(y_true, y_predictions, sample_weight=None):
 15 |     """ Returns a auc score. Handles multi-class
 16 |     For multi-class, the AUC score is in fact the MAUC
 17 |     score described in
 18 |     David J. Hand and Robert J. Till. 2001.
 19 |     A Simple Generalisation of the Area Under the ROC Curve
 20 |     for Multiple Class Classification Problems.
 21 |     Mach. Learn. 45, 2 (October 2001), 171-186.
 22 |     DOI=10.1023/A:1010920819831
 23 |     http://dx.doi.org/10.1023/A:1010920819831
 24 |     """
 25 |     (nb_rows, max_nb_classes) = y_predictions.shape
 26 |     # Today, it may happen that if a class appears only once in a dataset
 27 |     # it can appear in the train and not in the validation set.
 28 |     # In this case it will not be in y_true and
 29 |     # y_predictions.nb_cols is not exactly the number of class
 30 |     # to consider when computing the mroc_auc_score.
 31 |     classes = np.unique(y_true)
 32 |     nb_classes = len(classes)
 33 |     if nb_classes > max_nb_classes:
 34 |         raise ValueError("Your test set contained more classes than the test set. Check your dataset or try a different split.")
 35 | 
 36 |     if nb_classes < 2:
 37 |         raise ValueError("Ended up with less than two-classes in the validation set.")
 38 | 
 39 |     if nb_classes == 2:
 40 |         classes = classes.tolist()
 41 |         y_true = y_true.map(lambda c: classes.index(c)) # ensure classes are [0 1]
 42 |         return roc_auc_score(y_true, y_predictions[:, 1], sample_weight=sample_weight)
 43 | 
 44 |     def A(i, j):
 45 |         """
 46 |         Returns a asymmetric proximity metric, written A(i | j)
 47 |         in the paper.
 48 |         The sum of all (i, j) with  i != j
 49 |         will give us the symmetry.
 50 |         """
 51 |         mask = np.in1d(y_true, np.array([i, j]))
 52 |         y_true_i = y_true[mask] == i
 53 |         y_pred_i = y_predictions[mask][:, i]
 54 |         if sample_weight is not None:
 55 |             sample_weight_i = sample_weight[mask]
 56 |         else:
 57 |             sample_weight_i = None
 58 |         return roc_auc_score(y_true_i, y_pred_i, sample_weight=sample_weight_i)
 59 | 
 60 |     C = 1.0 / (nb_classes * (nb_classes - 1))
 61 |     # TODO: double check
 62 |     return C * sum(
 63 |         A(i, j)
 64 |         for i in classes
 65 |         for j in classes
 66 |         if i != j)
 67 | 
 68 | def format_proba_density(data, sample_weight=None, min_support=0, max_support=100):
 69 |     """
 70 |     Estimate the density distribution of the target 1-dimensional data array.
 71 |     The support arguments (inf and sup) should be:
 72 |      - 0 and 1 for classification
 73 |      - min(data) and max(data) for regression
 74 | 
 75 |     Output format of the density
 76 |     >>> list(zip([1, 2, 3], [0.3, 0.3, 0.4]))
 77 | 
 78 |     :param data: Target data of the model
 79 |     :param sample_weight:
 80 |     :param min_support: Inferior boundary of the support for density estimation
 81 |     :param max_support: Superior boundary of the support for density estimation
 82 |     :return:
 83 |     """
 84 |     data = np.array(data)
 85 |     if len(data) == 0:
 86 |         return []
 87 |     # Heuristic for the bandwidth determination
 88 |     h = 1.06 * np.std(data) * math.pow(len(data), -.2)
 89 |     if h <= 0:
 90 |         h = 0.06
 91 |     if len(np.unique(data)) == 1:
 92 |         sample_weight = None
 93 |     # Definition of the support of the estimate
 94 |     X_plot = np.linspace(min_support, max_support, 500, dtype=float)[:, np.newaxis]
 95 |     kde = KernelDensity(kernel='gaussian', bandwidth=h).fit(data.reshape(-1, 1), sample_weight=sample_weight)
 96 |     Y_plot = [v if not np.isnan(v) else 0 for v in np.exp(kde.score_samples(X_plot))]
 97 |     return list(zip(X_plot.ravel(), Y_plot))
 98 | 
 99 | class SurrogateModel(object):
100 |     """
101 |     In case the chosen saved model uses a non-tree based algorithm (and thus does not have feature importance), we fit this surrogate model
102 |     on top of the prediction of the former one to be able to retrieve the feature importance information.
103 | 
104 |     """
105 | 
106 |     def __init__(self, prediction_type):
107 |         self.check(prediction_type)
108 |         self.feature_names = None
109 |         self.target = None
110 |         self.prediction_type = prediction_type
111 |         #TODO should we define some params of RF to avoid long computation ?
112 |         if prediction_type == ModelDriftConstants.CLASSIFICATION_TYPE:
113 |             self.clf = RandomForestClassifier(random_state=1407)
114 |         else:
115 |             self.clf = RandomForestRegressor(random_state=1407)
116 | 
117 |     def check(self, prediction_type):
118 |         if prediction_type not in [ModelDriftConstants.CLASSIFICATION_TYPE, ModelDriftConstants.REGRRSSION_TYPE]:
119 |             raise ValueError('Prediction type must either be CLASSIFICATION or REGRESSION.')
120 | 
121 |     def get_features(self):
122 |         return self.feature_names
123 | 
124 |     def fit(self, df, target):
125 |         preprocessor = Preprocessor(df, target)
126 |         train, test = preprocessor.get_processed_train_test()
127 |         train_X = train.drop(target, axis=1)
128 |         train_Y = train[target]
129 |         self.clf.fit(train_X, train_Y)
130 |         self.feature_names = train_X.columns


--------------------------------------------------------------------------------
/python-lib/dku_data_drift/preprocessing.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import sys
  4 | import random
  5 | from collections import Counter
  6 | from datetime import datetime
  7 | import logging
  8 | import numpy as np
  9 | 
 10 | logger = logging.getLogger(__name__)
 11 | EPOCH = datetime(1900, 1, 1)
 12 | 
 13 | 
 14 | class Preprocessor(object):
 15 | 
 16 |     def __init__ (self, df=None, target=None):
 17 |         self.df = df.reset_index(drop=True)
 18 |         self.target = target
 19 |         self._categorical_features = []
 20 |         self._numerical_features = []
 21 |         self._text_features = []
 22 | 
 23 |     def check(self):
 24 |         if self.df is None:
 25 |             raise ValueError('df is not specified.')
 26 |         if self.target is None:
 27 |             raise ValueError('target is not specified.')
 28 | 
 29 |     def get_processed_train_test(self):
 30 |         self._categorical_features = [x for x in self._get_categorical_features() if x != self.target]
 31 |         self._numerical_features = self._get_numerical_features()
 32 |         self._text_features = self._get_text_features()
 33 |         self._parse_data()
 34 |         raw_train, raw_test = self._get_train_test_set()
 35 |         imputed_train, imputed_test = self._impute(raw_train, raw_test)
 36 |         dummy_values_dict = self._select_dummy_values(imputed_train, self._categorical_features)
 37 |         final_train = self._dummy_encode(imputed_train, dummy_values_dict)
 38 |         final_test = self._dummy_encode(imputed_test, dummy_values_dict)
 39 |         return final_train, final_test
 40 | 
 41 |     def _parse_data(self):
 42 |         def _datetime_to_epoch(series):
 43 |             return (series - EPOCH) / np.timedelta64(1, 's')
 44 | 
 45 |         for feature in self._categorical_features:
 46 |             self.df[feature] = self.df[feature].apply(self._coerce_to_unicode)
 47 |         for feature in self._text_features:
 48 |             self.df[feature] = self.df[feature].apply(self._coerce_to_unicode)
 49 |         for feature in self._numerical_features:
 50 |             if self.df[feature].dtype == np.dtype('M8[ns]'):
 51 |                 self.df[feature] = _datetime_to_epoch(self.df[feature])
 52 |             else:
 53 |                 self.df[feature] = self.df[feature].astype('double')
 54 | 
 55 |     def _get_numerical_features(self):
 56 |         return self.df.select_dtypes(include=['number', 'M8[ns]']).columns.tolist()
 57 | 
 58 |     def _get_categorical_features(self):
 59 |         return self.df.select_dtypes(include=['object', 'category']).columns.tolist()
 60 | 
 61 |     def _get_text_features(self):
 62 |         return []
 63 | 
 64 |     def _coerce_to_unicode(self, x):
 65 |         if sys.version_info < (3, 0):
 66 |             if isinstance(x, str):
 67 |                 return unicode(x, 'utf-8')
 68 |             else:
 69 |                 return unicode(x)
 70 |         else:
 71 |             return str(x)
 72 | 
 73 |     def _select_dummy_values(self, dfx, features, LIMIT_DUMMIES = 100):
 74 |         dummy_values = {}
 75 |         for feature in features:
 76 |             values = [
 77 |                 value
 78 |                 for (value, _) in Counter(dfx[feature]).most_common(LIMIT_DUMMIES)
 79 |             ]
 80 |             dummy_values[feature] = values
 81 |         return dummy_values
 82 | 
 83 |     def _get_train_test_set(self, prop=0.8, seed=1234):
 84 |         k = int(self.df.shape[0] * prop)
 85 |         random.seed(seed)
 86 |         sampler = random.sample(self.df.index.tolist(), k)
 87 |         train = self.df.loc[sampler]
 88 |         test = self.df[~self.df.index.isin(sampler)]
 89 |         return train, test
 90 | 
 91 |     def _impute(self, df_train, df_test):
 92 |         for feature in self._numerical_features:
 93 |             v = df_train[feature].mean()
 94 |             df_train[feature] = df_train[feature].fillna(v)
 95 |             df_test[feature] = df_test[feature].fillna(v)
 96 |             logger.info('Imputed missing values in feature %s with value %s' % (feature, self._coerce_to_unicode(v)))
 97 | 
 98 |         for feature in self._categorical_features:
 99 |             v = 'NULL_CATEGORY'
100 |             df_train[feature] = df_train[feature].fillna(v)
101 |             df_test[feature] = df_test[feature].fillna(v)
102 |             logger.info('Imputed missing values in feature %s with value %s' % (feature, self._coerce_to_unicode(v)))
103 | 
104 |         return df_train, df_test
105 | 
106 |     def _dummy_encode(self, dfx, dummy_values_dict):
107 |         dfx_copy = dfx.copy()
108 |         for (feature, dummy_values) in dummy_values_dict.items():
109 |             for dummy_value in dummy_values:
110 |                 #TODO add dummy:N/A and dummy:_Others_
111 |                 dummy_name = u'dummy:%s:%s' % (feature, self._coerce_to_unicode(dummy_value))
112 |                 dfx_copy[dummy_name] = (dfx_copy[feature] == dummy_value).astype(float)
113 |             del dfx_copy[feature]
114 |             logger.info('Dummy-encoded feature %s' % feature)
115 | 
116 |         return dfx_copy
117 | 


--------------------------------------------------------------------------------
/python-lib/dku_tools.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import datetime
  3 | import json
  4 | import dataiku
  5 | from dataiku.customrecipe import get_input_names_for_role, get_output_names_for_role
  6 | from dku_data_drift.model_drift_constants import ModelDriftConstants
  7 | 
  8 | 
  9 | def process_timestamp(timestamp):
 10 |     """
 11 |     Convert the timestamp to str date
 12 |     :param timestamp:
 13 |     :return:
 14 |     """
 15 |     return str(datetime.datetime.fromtimestamp(timestamp / 1000))
 16 | 
 17 | 
 18 | def set_column_description(dataset, column_description_dict):
 19 |     dataset_schema = dataset.read_schema()
 20 |     for col_info in dataset_schema:
 21 |         col_name = col_info.get('name')
 22 |         col_info['comment'] = column_description_dict.get(col_name)
 23 |     dataset.write_schema(dataset_schema)
 24 | 
 25 | 
 26 | def get_input_output(has_model_as_second_input=False):
 27 | 
 28 |     if len(get_input_names_for_role('new')) == 0:
 29 |         raise ValueError('No new dataset.')
 30 |     if len(get_output_names_for_role('output_dataset')) == 0:
 31 |         raise ValueError('No output dataset.')
 32 | 
 33 |     new_dataset_name = get_input_names_for_role('new')[0]
 34 |     new_dataset = dataiku.Dataset(new_dataset_name)
 35 | 
 36 |     output_dataset_name = get_output_names_for_role('output_dataset')[0]
 37 |     output_dataset = dataiku.Dataset(output_dataset_name)
 38 | 
 39 |     if has_model_as_second_input:
 40 |         if len(get_input_names_for_role('model')) == 0:
 41 |             raise ValueError('No input model.')
 42 |         model_name = get_input_names_for_role('model')[0]
 43 |         model = dataiku.Model(model_name)
 44 |         return (new_dataset, model, output_dataset)
 45 |     else:
 46 |         if len(get_input_names_for_role('original')) == 0:
 47 |             raise ValueError('No original dataset.')
 48 | 
 49 |         original_dataset_name = get_input_names_for_role('original')[0]
 50 |         original_dataset = dataiku.Dataset(original_dataset_name)
 51 |         return (new_dataset, original_dataset, output_dataset)
 52 | 
 53 | 
 54 | def get_params_with_model(recipe_config, model):
 55 |     use_active_version = recipe_config.get('use_active_version')
 56 |     active_version = None
 57 |     if use_active_version:
 58 |         for version in model.list_versions():
 59 |             active_version = version.get('active') is True
 60 |             if active_version:
 61 |                 version_id = version.get('versionId')
 62 |                 break
 63 |     else:
 64 |         version_id = recipe_config.get('version_id')
 65 |         if version_id is None:
 66 |             raise ValueError('Please choose a model version.')
 67 | 
 68 |     metric_list = recipe_config.get('metric_list')
 69 |     if len(metric_list) == 0 or metric_list is None:
 70 |         raise ValueError('Please choose at least one metric.')
 71 |     return version_id, metric_list
 72 | 
 73 | 
 74 | def get_params_without_model(recipe_config):
 75 |     metric_list = recipe_config.get('metric_list_without_prediction')
 76 |     if len(metric_list) == 0 or metric_list is None:
 77 |         raise ValueError('Please choose at least one metric.')
 78 | 
 79 |     # Handle columns to remove
 80 |     columns_to_remove = recipe_config.get('columns_to_remove')
 81 |     return columns_to_remove, metric_list
 82 | 
 83 | 
 84 | def build_drift_metric_dataframe(drifter, metric_list, based_df, has_model_as_input):
 85 | 
 86 |     new_df = based_df.copy()
 87 |     column_description_dict = {}
 88 | 
 89 |     if ModelDriftConstants.DRIFT_SCORE in metric_list:
 90 |         # new_df_with_drift_score, column_description_dict = extract_drift_score(drifter, new_df, column_description_dict)
 91 |         drift_score, drift_accuracy_lower, drift_accuracy_upper, drift_test_pvalue = drifter.get_drift_score(output_raw_score=True)
 92 |         new_df[ModelDriftConstants.DRIFT_SCORE] = [drift_score]
 93 |         column_description_dict[ModelDriftConstants.DRIFT_SCORE] = ModelDriftConstants.DRIFT_SCORE_DEFINITION
 94 | 
 95 |         new_df[ModelDriftConstants.BINOMIAL_P_VALUE] = [drift_test_pvalue]
 96 |         column_description_dict[ModelDriftConstants.BINOMIAL_P_VALUE] = ModelDriftConstants.BINOMIAL_TEST_DEFINITION
 97 | 
 98 |         new_df[ModelDriftConstants.BINOMIAL_LOWER_BOUND] = [drift_accuracy_lower]
 99 |         column_description_dict[ModelDriftConstants.BINOMIAL_LOWER_BOUND] = ModelDriftConstants.BINOMIAL_LOWER_BOUND_DEFINITION
100 | 
101 |         new_df[ModelDriftConstants.BINOMIAL_UPPER_BOUND] = [drift_accuracy_upper]
102 |         column_description_dict[ModelDriftConstants.BINOMIAL_UPPER_BOUND] = ModelDriftConstants.BINOMIAL_UPPER_BOUND_DEFINITION
103 | 
104 | 
105 |     if ModelDriftConstants.FUGACITY in metric_list:
106 |         if drifter.get_prediction_type() == ModelDriftConstants.CLASSIFICATION_TYPE:
107 |             fugacity, fugacity_relative_change = drifter.get_classification_fugacity()
108 |             new_df[ModelDriftConstants.FUGACITY] = json.dumps(fugacity)
109 |             new_df[ModelDriftConstants.FUGACITY_RELATIVE_CHANGE] = json.dumps(fugacity_relative_change)
110 |             column_description_dict[ModelDriftConstants.FUGACITY] = ModelDriftConstants.FUGACITY_CLASSIF_DEFINITION
111 |             column_description_dict[ModelDriftConstants.FUGACITY_RELATIVE_CHANGE] = ModelDriftConstants.FUGACITY_RELATIVE_CHANGE_CLASSIF_DEFINITION
112 |         elif drifter.get_prediction_type() == ModelDriftConstants.REGRRSSION_TYPE:
113 |             fugacity, fugacity_relative_change, bin_description = drifter.get_regression_fugacity()
114 |             new_df[ModelDriftConstants.FUGACITY] = json.dumps(fugacity)
115 |             new_df[ModelDriftConstants.FUGACITY_RELATIVE_CHANGE] = json.dumps(fugacity_relative_change)
116 |             proper_bin_description = '\n'.join(['Decile {0}: {1}'.format(bin_index, bin_desc) for bin_index, bin_desc in enumerate(bin_description)])
117 |             column_description_dict[ModelDriftConstants.FUGACITY] = ModelDriftConstants.FUGACITY_REGRESSION_DEFINITION + proper_bin_description
118 |             column_description_dict[ModelDriftConstants.FUGACITY_RELATIVE_CHANGE] = ModelDriftConstants.FUGACITY_RELATIVE_CHANGE_REGRESSION_DEFINITION + proper_bin_description
119 |         else:
120 |             raise ValueError('Unsupported prediction type: {0}'.format(drifter.get_prediction_type()))
121 | 
122 |     if ModelDriftConstants.FEATURE_IMPORTANCE in metric_list:
123 | 
124 |         drift_feature_importance = drifter.get_drift_feature_importance()
125 |         feat_dict = {}
126 |         for feat, feat_info in drift_feature_importance[:ModelDriftConstants.NUMBER_OF_DRIFTED_FEATURES].iterrows():
127 |             feat_dict[feat] = round(feat_info.get(ModelDriftConstants.IMPORTANCE), 2)
128 |         new_df[ModelDriftConstants.MOST_DRIFTED_FEATURES] = [json.dumps(feat_dict)]
129 |         column_description_dict[ModelDriftConstants.MOST_DRIFTED_FEATURES] = ModelDriftConstants.MOST_DRIFTED_FEATURES_DEFINITION
130 | 
131 |         if has_model_as_input:
132 |             original_feature_importance = drifter.get_original_feature_importance()
133 |             feat_dict = {}
134 |             for feat, feat_info in original_feature_importance[:ModelDriftConstants.NUMBER_OF_DRIFTED_FEATURES].iterrows():
135 |                 feat_dict[feat] = round(feat_info.get(ModelDriftConstants.IMPORTANCE), 2)
136 |             new_df[ModelDriftConstants.MOST_IMPORTANT_FEATURES] = [json.dumps(feat_dict)]
137 |             column_description_dict[ModelDriftConstants.MOST_IMPORTANT_FEATURES] = ModelDriftConstants.MOST_DRIFTED_FEATURES_DEFINITION
138 | 
139 |     if ModelDriftConstants.RISKIEST_FEATURES in metric_list:
140 |         riskiest_feature = drifter.get_riskiest_features()
141 |         new_df[ModelDriftConstants.RISKIEST_FEATURES] = json.dumps(riskiest_feature)
142 |         column_description_dict[ModelDriftConstants.RISKIEST_FEATURES] = ModelDriftConstants.RISKIEST_FEATURES_DEFINITION
143 | 
144 |     return new_df, column_description_dict


--------------------------------------------------------------------------------
/python-lib/model_metadata.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import os
 3 | import sys
 4 | import json
 5 | import dataiku
 6 | from dataiku.doctor.posttraining.model_information_handler import PredictionModelInformationHandler
 7 | from dku_tools import process_timestamp
 8 | 
 9 | 
10 | def get_train_date(model_version, version_id):
11 |     m = dataiku.Model(model_version, ignore_flow=True)
12 |     for v in m.list_versions():
13 |         if v.get('versionId') == version_id:
14 |             return process_timestamp((v.get('snippet').get('trainDate')))
15 |     return None
16 | 
17 | 
18 | def get_model_handler(model, version_id=None):
19 |     saved_model_version_id = _get_saved_model_version_id(model, version_id)
20 |     return _get_model_info_handler(saved_model_version_id)
21 | 
22 | 
23 | def _get_model_info_handler(saved_model_version_id):
24 |     infos = saved_model_version_id.split("-")
25 |     if len(infos) != 4 or infos[0] != "S":
26 |         raise Exception("Invalid saved model id")
27 |     pkey = infos[1]
28 |     model_id = infos[2]
29 |     version_id = infos[3]
30 | 
31 |     datadir_path = os.environ['DIP_HOME']
32 |     version_folder = os.path.join(datadir_path, "saved_models", pkey, model_id, "versions", version_id)
33 | 
34 |     # Loading and resolving paths in split_desc
35 |     split_folder = os.path.join(version_folder, "split")
36 |     with open(os.path.join(split_folder, "split.json")) as split_file:
37 |         split_desc = json.load(split_file)
38 | 
39 |     path_field_names = ["trainPath", "testPath", "fullPath"]
40 |     for field_name in path_field_names:
41 |         if split_desc.get(field_name, None) is not None:
42 |             split_desc[field_name] = os.path.join(split_folder, split_desc[field_name])
43 | 
44 |     with open(os.path.join(version_folder, "core_params.json")) as core_params_file:
45 |         core_params = json.load(core_params_file)
46 |         
47 |     try:
48 |         return PredictionModelInformationHandler(split_desc, core_params, version_folder, version_folder)
49 |     except Exception as e:
50 |         from future.utils import raise_
51 |         if "ordinal not in range(128)" in str(e):
52 |             raise_(Exception, "The plugin is using a python3 code-env, cannot load a python2 model.", sys.exc_info()[2])
53 |         elif str(e) == "non-string names in Numpy dtype unpickling":
54 |             raise_(Exception, "The plugin is using a python2 code-env, cannot load a python3 model.", sys.exc_info()[2])
55 |         else:
56 |             raise_(Exception, "Fail to load saved model: {}".format(e), sys.exc_info()[2])
57 | 
58 | 
59 | def _get_saved_model_version_id(model, version_id=None):
60 |     model_def = model.get_definition()
61 |     if version_id is None:
62 |         version_id = model_def.get('activeVersion')
63 |     saved_model_version_id = 'S-{0}-{1}-{2}'.format(model_def.get('projectKey'), model_def.get('id'), version_id)
64 |     return saved_model_version_id
65 | 


--------------------------------------------------------------------------------
/python-probes/drift-score/probe.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "meta" : {
 3 |         "label": "Most recent drift score",
 4 |         "description": "",
 5 |         "icon": "icon-anchor"
 6 |     },
 7 |     "handlesDataset": true,
 8 |     "handlesManagedFolder": false,
 9 |     "params": []
10 | }
11 | 


--------------------------------------------------------------------------------
/python-probes/drift-score/probe.py:
--------------------------------------------------------------------------------
 1 | from dku_data_drift.model_drift_constants import ModelDriftConstants
 2 | import numpy as np
 3 | 
 4 | def process(dataset, partition_id):
 5 |     df = dataset.get_dataframe()
 6 |     if len(df) == 0:
 7 |         return 'No data'
 8 |     if ModelDriftConstants.DRIFT_SCORE in df and ModelDriftConstants.TIMESTAMP in df:
 9 |         most_recent_drift_score = df[df[ModelDriftConstants.TIMESTAMP] == np.max(df[ModelDriftConstants.TIMESTAMP])][ModelDriftConstants.DRIFT_SCORE].values[0]
10 |         metric_values = {ModelDriftConstants.DRIFT_SCORE: most_recent_drift_score}
11 |         return metric_values
12 |     else:
13 |         return 'No drift score'
14 | 


--------------------------------------------------------------------------------
/resource/compute_model_id_choice.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Allow dynamic select of the model id in the model recipe.
 3 | """
 4 | import dataiku
 5 | from dku_tools import process_timestamp
 6 | 
 7 | 
 8 | def do(payload, config, plugin_config, inputs):
 9 |     """
10 |     DSS built-in interface for param loading in the form.
11 |     Retrieve the available versions of a pretrained model in DSS.
12 |     :param payload:
13 |     :param config:
14 |     :param plugin_config:
15 |     :param inputs:
16 |     :return:
17 |     """
18 |     model = None
19 |     for input_ in inputs:
20 |         if input_['role'] == 'model':
21 |             model = str(input_['fullName'])
22 |     if model is None:
23 |         raise Exception("Did not catch the right input model")
24 | 
25 | 
26 |     model_id = model.split('.')[-1]
27 |     model = dataiku.Model(model_id)
28 | 
29 |     if model.get_info().get('type') != 'PREDICTION':
30 |         raise ValueError('Model type {} is not supported. Please choose a regression or classifcation model.'.format(model.get_info().get('type')))
31 | 
32 | 
33 |     choice_list = []
34 |     for version in model.list_versions():
35 |         version_detail = version.get('snippet', {})
36 |         algorithm = version_detail.get('algorithm', '').lower().replace('_', ' ')
37 |         active_version = version.get('active') is True
38 |         train_date = process_timestamp(version_detail.get('trainDate'))
39 |         version_id = version.get('versionId')
40 | 
41 |         if active_version:
42 |             version_info = {
43 |                 'value': version_id,
44 |                 'label': 'active version, trained on {1}, {0}'.format(algorithm, train_date)
45 |             }
46 |         else:
47 |             version_info = {
48 |                 'value': version_id,
49 |                 'label': 'trained on {1}, {0}'.format(algorithm, train_date)
50 |             }
51 |         choice_list.append((version_info, train_date))
52 | 
53 |     sorted_choice_list = sorted(choice_list, key=lambda k: k[1])
54 |     final_choice_list = [choice[0] for choice in sorted_choice_list]
55 | 
56 |     return {"choices": final_choice_list}


--------------------------------------------------------------------------------
/resource/dku-helpers.js:
--------------------------------------------------------------------------------
 1 | /*
 2 | Helper function to query webapp backend with a default implementation for error handling
 3 | Assumes a dataiku object is defined
 4 | v 1.5.0
 5 | */
 6 | 
 7 | dataiku.webappBackend = (function() {
 8 |     function getUrl(path) {
 9 |         return dataiku.getWebAppBackendUrl(path);
10 |     }
11 | 
12 |     // function dkuDisplayError(error) {
13 |     //     alert('Backend error, check the logs.');
14 |     // }
15 | 
16 |     function get(path, args={}, displayErrors=true) {
17 |         return fetch(getUrl(path) + '?' + $.param(args), {
18 |             method: 'GET',
19 |             headers: {
20 |                 'Accept': 'application/json',
21 |                 'Content-Type': 'application/json'
22 |             }
23 |         })
24 |         .then(response => {
25 |             if (response.status == 502) {
26 |                 throw Error("Webapp backend not started");
27 |             } else if (!response.ok) {
28 |                 response.text().then(text => dataiku.webappMessages.displayFatalError(`${response.statusText} (HTTP ${response.status}):\n${text}`))
29 |                 throw Error(`${response.statusText} (HTTP ${response.status})`);
30 |             }
31 |             try {
32 |                 return response.json();
33 |             } catch {
34 |                 throw Error('The backend response is not JSON: '+ response.text());
35 |             }
36 |         })
37 |         .catch(function(error) {
38 |             if (displayErrors && error.message && !error.message.includes('not started')) { // little hack, backend not started should be handled elsewhere
39 |                 dataiku.webappMessages.displayFatalError(error)
40 |             }
41 |             throw error;
42 |         });
43 |     }
44 | 
45 |     return Object.freeze({getUrl, get});
46 | })();
47 | 
48 | 
49 | dataiku.webappMessages = (function() {
50 |     function displayFatalError(err) {
51 |         const errElt = $('<div class="fatal-error" style="margin: 30px auto; text-align: center; color: var(--error-red)"></div>')
52 |         errElt.text(err);
53 |         $('#error_message').html(errElt);
54 |     }
55 |     function clear() {
56 |         $('#error_message').html('');
57 |     }
58 |     return Object.freeze({displayFatalError, clear});
59 | })();
60 | 


--------------------------------------------------------------------------------
/resource/style.css:
--------------------------------------------------------------------------------
  1 | /*
  2 | DSS webapp base stylesheet v2.0.0
  3 | Apache Software License
  4 | Dataiku (Joachim Zentici)
  5 | 
  6 | This stylesheet should allow you to simply style a webapp while keeping a good consistency with DSS
  7 | For questions and requests, https://github.com/dataiku/dataiku-contrib/issues
  8 | 
  9 | */
 10 | :root {
 11 |     /* DSS-like colors, prefer using them for better visual integration with core product */
 12 |     --blue-lighten-5:           #E7F3FF;
 13 |     --blue-lighten-4:           #C4E0FE;
 14 |     --blue-lighten-3:           #9DCCFE;
 15 |     --blue-lighten-2:           #76B8FD;
 16 |     --blue-lighten-1:           #58A8FC;
 17 |     --blue:                     #3B99FC;
 18 |     --blue-darken-1:            #3591FC;
 19 |     --blue-darken-2:            #2D86FB;
 20 |     --blue-darken-3:            #267CFB;
 21 |     --blue-darken-4:            #196BFA;
 22 | 
 23 |     --success-green:            #4caf50;
 24 |     --warning-orange:           #F28C37;
 25 |     --error-color:              #CE1228;
 26 |     --error-background:         #f9e3e5;
 27 | 
 28 |     --grey-lighten-7:           #F2F2F2;
 29 |     --grey-lighten-6:           #DDDDDD;
 30 |     --grey-lighten-5:           #CCCCCC;
 31 |     --grey-lighten-4:           #BBBBBB;
 32 |     --grey-lighten-3:           #666666;
 33 |     --grey-lighten-2:           #444444;
 34 |     --grey-lighten-1:           #333333;
 35 |     --grey:                     #222222;
 36 | 
 37 |     --grey-text:                var(--grey-lighten-1);
 38 |     --border-color:             var(--grey-lighten-7);
 39 | 
 40 |     /* Backgrounds */
 41 |     --grey-background:          var(--grey-lighten-7);
 42 | }
 43 | 
 44 | @font-face {
 45 |     font-family: SourceSansPro;
 46 |     src: url(/static/dataiku/fonts/SourceSansPro-Bold.woff);
 47 |     font-weight: 600;
 48 | }
 49 | @font-face {
 50 |     font-family: SourceSansPro;
 51 |     src: url(/static/dataiku/fonts/SourceSansPro-Semibold.woff);
 52 |     font-weight: 500;
 53 | }
 54 | @font-face {
 55 |     font-family: SourceSansPro;
 56 |     src: url(/static/dataiku/fonts/SourceSansPro-Regular.woff);
 57 |     font-weight: 400;
 58 | }
 59 | 
 60 | body {
 61 |     font-family: 'SourceSansPro';
 62 |     font-size: 13px;
 63 |     color: #333333;
 64 | }
 65 | a {
 66 |     color: #0088cc;
 67 |     text-decoration: none;
 68 | }
 69 | 
 70 | h1 {
 71 |     font-size: 32px;
 72 |     font-weight: 500;
 73 |     margin-top: 0;
 74 |     margin-bottom: 0;
 75 | }
 76 | 
 77 | h2 {
 78 |     font-size: 24px;
 79 |     font-weight: 400;
 80 |     margin-bottom: 0;
 81 | }
 82 | 
 83 | h3 {
 84 |     font-size: 18px;
 85 |     font-weight: 400;
 86 |     margin-bottom: 0;
 87 | }
 88 | 
 89 | h4 {
 90 |     font-size: 16px;
 91 |     font-weight: 400;
 92 |     margin-bottom: 0;
 93 | }
 94 | 
 95 | /* Buttons */
 96 | 
 97 | .btn {
 98 |     font-family: 'SourceSansPro';
 99 |     text-transform: uppercase;
100 |     font-size: 13px;
101 |     font-weight: 500;
102 |     padding: 3px 8px;
103 |     margin: 0;
104 |     line-height: 1.4;
105 |     background-image: inherit;
106 |     box-shadow: none;
107 |     text-shadow: none;
108 |     box-sizing: border-box;
109 |     outline: 0;
110 |     cursor: pointer;
111 |     background-color: #ffffff;
112 |     color: var(--grey-text);
113 |     border: 1px solid #cccccc;
114 | }
115 | .btn:hover {
116 |     background-color: #dddddd;
117 | }
118 | .btn:active {
119 |     background-color: #cccccc;
120 | }
121 | 
122 | .btn-primary {
123 |     background: #28a9dd;
124 |     color: #ffffff;
125 |     border: 1px solid transparent;
126 | }
127 | .btn-primary:hover {
128 |     background: #31adde;
129 | }
130 | .btn-primary:active {
131 |     background: #22a4d9;
132 | }
133 | 
134 | /* Layout */
135 | 
136 | .white-box {
137 |     background: #ffffff;
138 |     box-shadow: 0px 0px 2px 1px rgba(34, 34, 34, 0.15);
139 |     box-sizing: border-box;
140 |     padding: 24px;
141 |     margin-bottom: 24px;
142 |     width: 1000px;
143 | }
144 | 
145 | /* tables */
146 | 
147 | table.ml-table {
148 |     border-collapse: collapse;
149 |     border-spacing: 0;
150 | }
151 | table.ml-table th {
152 |     font-family: 'SourceSansPro';
153 |     font-weight: 400;
154 |     font-size: 16px;
155 | }
156 | table.ml-table th, table.ml-table td {
157 |     border: 1px solid #dddddd;
158 |     padding: 8px;
159 | }
160 | 
161 | 
162 | /* Standard components */
163 | 
164 | .explanation { /* try to keep in sync with DSS .doctor-explanation */
165 |     background: #e6eef2;
166 |     padding: 20px;
167 |     border-radius: 5px;
168 |     color: #31708f;
169 | }
170 | .error-message, #error_message { /*remove id*/
171 |     color: var(--error-color);
172 |     background-color: var(--error-background);
173 |     padding: 8px 16px 8px 16px;
174 |     font-size: 15px;
175 | }


--------------------------------------------------------------------------------
/tests/python/requirements.txt:
--------------------------------------------------------------------------------
 1 | flask==1.1.2
 2 | scikit-learn==0.20.2
 3 | scipy==1.1.0
 4 | xgboost==0.81
 5 | pandas==0.23.4
 6 | numpy==1.16.6
 7 | future==0.18.2
 8 | joblib==0.14.1
 9 | enum34==1.1.10
10 | statsmodels==0.9.0


--------------------------------------------------------------------------------
/tests/python/unit/test_drift_analyzer.py:
--------------------------------------------------------------------------------
  1 | # This is a test file intended to be used with pytest
  2 | # pytest automatically runs all the function starting with "test_"
  3 | # see https://docs.pytest.org for more information
  4 | 
  5 | import sys
  6 | import os
  7 | import numpy as np
  8 | import pandas as pd
  9 | from sklearn.ensemble import RandomForestClassifier
 10 | from sklearn.datasets import load_iris
 11 | from sklearn.model_selection import train_test_split
 12 | import pytest
 13 | from dku_data_drift import DriftAnalyzer, ModelAccessor
 14 | 
 15 | RANDOM_SEED = 65537 # Fermat prime number <3
 16 | TEST_RATIO = 0.3 # if this ratio change the reference prediction results below need to be updated accordingly
 17 | 
 18 | def load_data():
 19 |     iris = load_iris()
 20 |     feature_names = iris['feature_names']
 21 |     target = 'target'
 22 |     df = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
 23 |                       columns=feature_names + [target])
 24 |     return df, feature_names, target
 25 | 
 26 | 
 27 | class ScikitPredictor:
 28 | 
 29 |     def __init__(self, df, feature_names, target):
 30 |         self.feature_names = feature_names
 31 |         self._clf = RandomForestClassifier(n_estimators=10, random_state=RANDOM_SEED).fit(df[feature_names], df[target])
 32 | 
 33 |     def get_features(self):
 34 |         return self.feature_names
 35 | 
 36 |     def predict(self, X):
 37 |         predictions = self._clf.predict(X[self.feature_names])
 38 |         probas =  self._clf.predict_proba(X[self.feature_names])
 39 |         df = pd.DataFrame(probas, columns = ['proba_{}'.format(x) for x in range(probas.shape[1])])
 40 |         df['prediction'] = predictions
 41 |         return df
 42 | 
 43 | 
 44 | class ScikitModelHandler:
 45 | 
 46 |     def __init__(self):
 47 |         self.df, self.feature_names, self.target = load_data()
 48 |         self.train_df, self.test_df = train_test_split(self.df, test_size=0.3, random_state=RANDOM_SEED)
 49 |         self.predictor = ScikitPredictor(self.train_df, self.feature_names, self.target)
 50 | 
 51 |     def get_prediction_type(self):
 52 |         return 'MULTICLASS'
 53 | 
 54 |     def get_predictor(self):
 55 |         return self.predictor
 56 | 
 57 |     def get_target_variable(self):
 58 |         return self.target
 59 | 
 60 |     def get_test_df(self):
 61 |         return [self.test_df, True]
 62 | 
 63 |     def get_per_feature(self):
 64 |         per_feature_dict ={
 65 |             self.target: {'role': 'TARGET'}
 66 |         }
 67 |         for feature in self.feature_names:
 68 |             dct = {
 69 |                 'role': 'INPUT',
 70 |                 'type': 'NUMERIC',
 71 |                 'missing_handling': 'IMPUTE',
 72 |                 'missing_impute_with': 'MEAN',
 73 |                 'numerical_handling': 'REGULAR',
 74 |                 'rescaling': 'AVGSTD',
 75 |             }
 76 |             per_feature_dict[feature] = dct
 77 | 
 78 |         return per_feature_dict
 79 | 
 80 |     def get_selected_features(self):
 81 |         selected_features = []
 82 |         for feat, feat_info in self.get_per_feature().items():
 83 |             if feat_info.get('role') == 'INPUT':
 84 |                 selected_features.append(feat)
 85 |         return selected_features
 86 | 
 87 | 
 88 | class TestDriftAnalyzer:
 89 | 
 90 |     def setup(self):
 91 |         self.model_handler = 'model_handler'
 92 |         self.model_handler = ScikitModelHandler()
 93 |         self.model_accessor = ModelAccessor(self.model_handler)
 94 |         self.drifter = DriftAnalyzer()
 95 | 
 96 |     def test_empty_set(self):
 97 |         _, feature_names, _ = load_data()
 98 |         new_test_df = pd.DataFrame(columns=feature_names)
 99 |         with pytest.raises(Exception) as e_info:
100 |             self.drifter.fit(new_test_df, model_accessor=self.model_accessor)
101 | 
102 |     def test_missing_feature_set(self):
103 |         df, feature_names, _ = load_data()
104 |         _, new_test_df = train_test_split(df, test_size=TEST_RATIO, random_state=RANDOM_SEED)
105 |         new_test_df = new_test_df.drop(feature_names[0], 1)
106 | 
107 |         with pytest.raises(Exception) as e_info:
108 |             self.drifter.fit(new_test_df, model_accessor=self.model_accessor)
109 | 
110 |     def test_identical_set(self):
111 |         df, _, _ = load_data()
112 |         _, new_test_df = train_test_split(df, test_size=TEST_RATIO, random_state=RANDOM_SEED)
113 |         self.drifter.fit(new_test_df, model_accessor=self.model_accessor)
114 |         result_dict = self.drifter.get_drift_metrics_for_webapp()
115 | 
116 |         drift_accuracy = result_dict.get('drift_accuracy')
117 |         fugacity = result_dict.get('fugacity')
118 |         feature_importance = result_dict.get('feature_importance')
119 | 
120 |         original_model_feature_importance = sorted([feat_imp['original_model'] for feat_imp in feature_importance])
121 |         drift_model_feature_importance = sorted([feat_imp['drift_model'] for feat_imp in feature_importance])
122 | 
123 |         assert drift_accuracy == 0.5  # no drift, model can not distinguish, accuracy is 0.5
124 |         for fugacity_one_class in fugacity:
125 |             assert fugacity_one_class.get('Selected dataset') == fugacity_one_class.get('Original dataset')
126 | 
127 |         assert np.array_equal(original_model_feature_importance, [0.01, 0.01, 43.17215785326303, 46.77454270154651])
128 |         assert np.array_equal(drift_model_feature_importance,
129 |                               [0.01, 25.14448373884474, 26.616157925410526, 27.984711759761264])
130 | 
131 |     def test_drifted_set(self):
132 |         df, feature_names, _ = load_data()
133 |         _, original_test_df = train_test_split(df, test_size=TEST_RATIO, random_state=RANDOM_SEED)
134 |         new_test_df = original_test_df.copy()
135 |         new_test_df[feature_names] = new_test_df[feature_names] * 2  # shift the feature distribution
136 | 
137 |         self.drifter.fit(new_test_df, model_accessor=self.model_accessor)
138 |         result_dict = self.drifter.get_drift_metrics_for_webapp()
139 | 
140 |         drift_accuracy = result_dict.get('drift_accuracy')
141 |         fugacity = result_dict.get('fugacity')
142 | 
143 |         prediction_distribution_original_test_set = [fuga['Input dataset'] for fuga in fugacity]
144 |         prediction_distribution_new_test_set = [fuga['Test dataset'] for fuga in fugacity]
145 | 
146 |         assert drift_accuracy == 1
147 |         assert np.array_equal(prediction_distribution_original_test_set, [2.22, 75.56, 22.22])
148 |         assert np.array_equal(prediction_distribution_new_test_set, [40.0, 35.56, 24.44])
149 | 
150 | 
151 | 
152 | 
153 | 
154 | 
155 | 
156 | 
157 | 
158 | 


--------------------------------------------------------------------------------
/webapps/model-drift-view/app.js:
--------------------------------------------------------------------------------
  1 | let webAppConfig = dataiku.getWebAppConfig();
  2 | let modelId = webAppConfig['modelId'];
  3 | let versionId = webAppConfig['versionId'];
  4 | 
  5 | dataiku.webappBackend.get('list-datasets')
  6 |     .then(data => {
  7 |             $.each(data.dataset_list, function(i, option) {
  8 |                 $('#dataset-selector').append($('<option/>').attr("value", option.name).text(option.name));
  9 |             });
 10 |         }
 11 |     );
 12 | 
 13 | $('#run-button').click(function() {
 14 |     dataiku.webappMessages.clear();
 15 |     $('.landing-page').hide();
 16 |     runAnalysis($('#run-button'));
 17 | });
 18 | 
 19 | function changeInputColor(input, value){
 20 |         $(input).removeClass();
 21 |         if (value < 0.1){
 22 |             $(input).addClass('low-risk');
 23 |             $('#inline-drift-score-explain').html('<b>low data drift</b>.');
 24 |         }
 25 |         else if(value >= 0.1 && value <= 0.5){
 26 |             $(input).addClass('medium-risk');
 27 |             $('#inline-drift-score-explain').html('<b>medium data drift</b>.');
 28 |         }
 29 |         else{
 30 |             $(input).addClass('high-risk');
 31 |             $('#inline-drift-score-explain').html('<b>high data drift</b>.');
 32 |         }
 33 |     }
 34 | 
 35 | function runAnalysis($this) {
 36 |     markRunning(true);
 37 |     dataiku.webappBackend.get('get-drift-metrics', {'model_id': modelId, 'version_id': versionId, 'test_set': $("#dataset-selector").val()})
 38 |         .then(
 39 |             function(data) {
 40 |                 // first box
 41 |                 $('#accuracy').text(data['drift_accuracy']);
 42 |                 $('#lower-bound').text(data['drift_accuracy_lower']);
 43 |                 $('#upper-bound').text(data['drift_accuracy_upper']);
 44 |                 $('#inline-drift-score').text(data['drift_accuracy']);
 45 |                 $('#inline-drift-score-2').text(data['drift_accuracy']);
 46 |                 $('#binomial-p-value').text(data['drift_test_pvalue']);
 47 |                 if (data['drift_test_pvalue'] <= 0.05){
 48 |                     $('#binomial-conclusion').innerHTML = '<span>&#8804;</span>' + '0.05 so drift detected';
 49 |                 } else {
 50 |                     $('#binomial-conclusion').text('> 0.05 so no drift detected')
 51 |                 }
 52 |                 $('#sample-size').text(data['sample_size']);
 53 | 
 54 |                 changeInputColor('#drift-score', data['drift_accuracy']);
 55 |                 $('#error_message').html('');
 56 | 
 57 |                 //other boxes
 58 |                 draw(data);
 59 |                 $('.result-state').show();
 60 |                 markRunning(false);
 61 |             }
 62 |         )
 63 |         .catch(error => {
 64 |             markRunning(false);
 65 |             dataiku.webappMessages.displayFatalError(error);
 66 |         });
 67 | }
 68 | 
 69 | function markRunning(running) {
 70 |     if (running) {
 71 |         $('.running-state').show();
 72 |         $('.notrunning-state').hide();
 73 |         $('.result-state').hide();
 74 |     } else {
 75 |         $('.running-state').hide();
 76 |         $('.notrunning-state').show();
 77 |     }
 78 | }
 79 | 
 80 | function draw(data) {
 81 |     document.getElementById("riskiest_features_explanation").innerHTML = '';
 82 |     switch(data.type){
 83 |         case "CLASSIFICATION":
 84 |             drawFugacity(data['fugacity']);
 85 |             draw_KDE_classification(data['kde']);
 86 |             break;
 87 |         case "REGRESSION":
 88 |             d3.select("#fugacity_div").selectAll("div").remove();
 89 |             d3.select("#fugacity_label").remove();
 90 |             d3.select("#kde_class_option").select("#label-list").remove();
 91 |             draw_KDE_regression(data['kde']);
 92 |             break;
 93 |         default:
 94 |             console.log("Value error for the type of learning task:")
 95 |             console.log(data.type)
 96 |     }
 97 |     drawFeatureImportance(data['feature_importance']);
 98 |     recommendation_text = "";
 99 |     if (data.riskiest_features.length>0){
100 |         var i;
101 |         var recommendation_text = "We recommend you to check the following feature(s): "
102 |         for (i = 0; i < data.riskiest_features.length; i++) {
103 |             recommendation_text += data.riskiest_features[i];
104 |             if (i < (data.riskiest_features.length - 1)){
105 |                 recommendation_text += ", "
106 |             }
107 |         }
108 |     }
109 |     document.getElementById("riskiest_features_explanation").innerHTML = recommendation_text;
110 | 
111 |     if (data.drift_accuracy >= 0.1){
112 |         d3.select("#feature_importance_div").style('display', 'block')
113 |     } else {
114 |         d3.select("#feature_importance_div").style('display', 'none');
115 |     }
116 | }
117 | 
118 | function drawFugacity(data) {
119 |     $('#fugacity-score').html(json2table(data, 'table text-sb table-bordered table-hover')); // ml-table
120 | }
121 | 
122 | function json2table(json, classes) {
123 |     let cols = Object.keys(json[0]);
124 |     let header = '';
125 |     let body = '';
126 |     classes = classes || '';
127 | 
128 |     function capitalizeFirstLetter(string) {
129 |         return string.charAt(0).toUpperCase() + string.slice(1);
130 |     }
131 | 
132 |     body += '<tr>';
133 |     cols.map(function(col){
134 |         body += '<td>' + capitalizeFirstLetter(col) + '</td>';
135 |     })
136 |     body += '</tr>';
137 | 
138 | 
139 |     json.map(function(row) {
140 |         cols.map(function(colName) {
141 |             body += '<td>' + row[colName] + '</td>';
142 |         });
143 |         body += '</tr>';
144 |     });
145 | 
146 |     return `<div><table class="${classes}"><tbody>${body}</tbody></table></div>`;
147 | 
148 | }
149 | 
150 | function draw_KDE_classification(data) {
151 | 
152 |     d3.select("#kde-chart").select("svg").remove();
153 |     d3.select("#label-list").selectAll("option").remove();
154 | 
155 |     document.getElementById('kde_explanation').innerHTML =
156 |         'This chart represents the probability density estimation for a given prediction class when scoring '+
157 |         'both the test dataset and your input dataset. <br><br>Visually different probability density estimations '+
158 |         'indicate high data drift.'
159 | 
160 |     let margin = {top: 30, right: 30, bottom: 30, left: 50};
161 |     let width = 550 - margin.left - margin.right;
162 |     let height = 450 - margin.top - margin.bottom;
163 | 
164 |     // append the svg object to the body of the page
165 |     let svg = d3.select("#kde-chart")
166 |         .append("svg")
167 |         .attr("width", width + margin.left + margin.right)
168 |         .attr("height", height + margin.top + margin.bottom)
169 |         .append("g")
170 |         .attr("transform", "translate(" + margin.left + "," + margin.top + ")");
171 | 
172 |     // List of groups (here I have one group per column)
173 |     let labels = Object.keys(data);
174 |     // add the options to the button
175 |     d3.select("#label-list")
176 |         .selectAll('myOptions')
177 |         .data(labels)
178 |         .enter()
179 |         .append("option")
180 |         .text(d => d) // text showed in the menu
181 |         .attr("value", d => d) // corresponding value returned by the button
182 |         .property("selected", d => d === labels[0]);
183 | 
184 |     // add the x Axis
185 |     let x = d3.scaleLinear()
186 |         .domain([0,100])
187 |         .range([0, width]);
188 |     svg.append("g")
189 |         .attr("transform", "translate(0," + height + ")")
190 |         .call(d3.axisBottom(x));
191 | 
192 |     let density1 = data[labels[0]]['original'];
193 |     let density2 = data[labels[0]]['new'];
194 | 
195 |     density1_array = density1.map(x=>x[1])
196 |     density2_array = density2.map(x=>x[1])
197 | 
198 |     // first and last value of array must be zero otherwise the color fill will mess up
199 |     density1[0] = [0,0];
200 |     density2[0] = [0,0];
201 |     density1[density1.length - 1] = [100, 0];
202 |     density2[density2.length - 1] = [100, 0];
203 | 
204 |     // add the y Axis
205 |     let maxY = Math.max.apply(Math, density1_array.concat(density2_array));
206 |     let y = d3.scaleLinear()
207 |         .range([height, 0])
208 |         .domain([0, maxY*1.1]);
209 |     svg.append("g")
210 |         .call(d3.axisLeft(y));
211 | 
212 |     // Plot the area
213 |     let curve1 = svg.append("path")
214 |         .attr("class", "mypath")
215 |         .datum(density1)
216 |         .attr("fill", "#2b67ff")
217 |         .attr("fill-opacity", ".4")
218 |         .attr("stroke", "#2b67ff")
219 |         .attr("stroke-width", 2)
220 |         .attr("stroke-linejoin", "round")
221 |         .attr("d", d3.line()
222 |             .curve(d3.curveBasis)
223 |             .x(d => x(d[0]))
224 |             .y(d => y(d[1]))
225 |         );
226 | 
227 |     // Plot the area
228 |     let curve2 = svg.append("path")
229 |         .attr("class", "mypath")
230 |         .datum(density2)
231 |         .attr("fill", "#ff832b")
232 |         .attr("fill-opacity", ".4")
233 |         .attr("stroke", "#ff832b")
234 |         .attr("stroke-width", 2)
235 |         .attr("stroke-linejoin", "round")
236 |         .attr("d", d3.line()
237 |             .curve(d3.curveBasis)
238 |             .x(d => x(d[0]))
239 |             .y(d => y(d[1]))
240 |         );
241 | 
242 |     // Handmade legend
243 |     svg.append("circle").attr("cx",280).attr("cy",10).attr("r", 6).style("fill", "#2b67ff")
244 |     svg.append("circle").attr("cx",280).attr("cy",40).attr("r", 6).style("fill", "#ff832b")
245 |     svg.append("text").attr("x", 300).attr("y", 10).text("Test dataset").style("font-size", "15px").attr("alignment-baseline","middle")
246 |     svg.append("text").attr("x", 300).attr("y", 40).text("Input dataset").style("font-size", "15px").attr("alignment-baseline","middle")
247 |     // Add X axis label:
248 |     svg.append("text")
249 |         .attr("text-anchor", "end")
250 |         .attr("x", width/2 + 90)
251 |         .attr("y", height + 29)
252 |         .attr("font-size", 12)
253 |         .text(" Predicted probability (in %)");
254 | 
255 |     // A function that update the chart when slider is moved?
256 |     function updateChart(selectedGroup) {
257 | 
258 |         // recompute density estimation
259 |         density1 = data[selectedGroup]['original'];
260 |         density2 = data[selectedGroup]['new'];
261 |         // first and last value of array must be zero otherwise the color fill will mess up
262 |         density1[0] = [0,0];
263 |         density2[0] = [0,0];
264 |         density1[density1.length - 1] = [100, 0];
265 |         density2[density2.length - 1] = [100, 0];
266 |         density1_array = density1.map(x=>x[1])
267 |         density2_array = density2.map(x=>x[1])
268 |         // add the y Axis
269 |         maxY = Math.max.apply(Math, density1_array.concat(density2_array));
270 |         y.domain([0, maxY*1.1]);
271 | 
272 |         // update the chart
273 |         curve1
274 |             .datum(density1)
275 |             .transition()
276 |             .duration(1000)
277 |             .attr("d",    d3.line()
278 |             .curve(d3.curveBasis)
279 |                 .x(d => x(d[0]))
280 |                 .y(d => y(d[1]))
281 |             );
282 |         curve2
283 |             .datum(density2)
284 |             .transition()
285 |             .duration(1000)
286 |             .attr("d",    d3.line()
287 |             .curve(d3.curveBasis)
288 |                 .x(d => x(d[0]))
289 |                 .y(d => y(d[1]))
290 |             );
291 |     }
292 | 
293 |     // Listen to the slider?
294 |     d3.select("#label-list").on("change", function(d) {
295 |         selectedGroup = this.value
296 |         updateChart(selectedGroup)
297 |     });
298 | }
299 | 
300 | function draw_KDE_regression(data) {
301 | 
302 |     d3.select("#kde-chart").select("svg").remove();
303 |     d3.select("#label-list").selectAll("option").remove();
304 | 
305 |     document.getElementById('kde_explanation').innerHTML =
306 |         'This chart represents the density estimation of the prediction values when scoring both the test dataset and '+
307 |         'your input dataset. <br><br>Visually different density estimations indicate high data drift.';
308 | 
309 |     let margin = {top: 30, right: 30, bottom: 30, left: 50};
310 |     let width = 550 - margin.left - margin.right;
311 |     let height = 450 - margin.top - margin.bottom;
312 | 
313 |     // append the svg object to the body of the page
314 |     let svg = d3.select("#kde-chart")
315 |         .append("svg")
316 |         .attr("width", width + margin.left + margin.right)
317 |         .attr("height", height + margin.top + margin.bottom)
318 |         .append("g")
319 |         .attr("transform", "translate(" + margin.left + "," + margin.top + ")");
320 | 
321 |     // List of groups (here I have one group per column)
322 |     let labels = Object.keys(data);
323 | 
324 |     var minSupport = data.Prediction.min_support;
325 |     var maxSupport = data.Prediction.max_support;
326 | 
327 |     // add the x Axis
328 |     let x = d3.scaleLinear()
329 |         .domain([minSupport, maxSupport])
330 |         .range([0, width]);
331 |     svg.append("g")
332 |         .attr("transform", "translate(0," + height + ")")
333 |         .call(d3.axisBottom(x));
334 | 
335 |     let density1 = data[labels[0]]['original'];
336 |     let density2 = data[labels[0]]['new'];
337 | 
338 |     density1_array = density1.map(x=>x[1])
339 |     density2_array = density2.map(x=>x[1])
340 | 
341 |     // first and last value of array must be zero otherwise the color fill will mess up
342 |     density1[0] = [minSupport,0];
343 |     density2[0] = [minSupport,0];
344 |     density1[density1.length - 1] = [maxSupport, 0];
345 |     density2[density2.length - 1] = [maxSupport, 0];
346 | 
347 |     // add the y Axis
348 |     let maxY = Math.max.apply(Math, density1_array.concat(density2_array));
349 |     let y = d3.scaleLinear()
350 |         .range([height, 0])
351 |         .domain([0, maxY*1.1]);
352 |     svg.append("g")
353 |         .call(d3.axisLeft(y));
354 | 
355 |     // Plot the area
356 |     let curve1 = svg.append("path")
357 |         .attr("class", "mypath")
358 |         .datum(density1)
359 |         .attr("fill", "#2b67ff")
360 |         .attr("fill-opacity", ".4")
361 |         .attr("stroke", "#2b67ff")
362 |         .attr("stroke-width", 2)
363 |         .attr("stroke-linejoin", "round")
364 |         .attr("d", d3.line()
365 |             .curve(d3.curveBasis)
366 |             .x(d => x(d[0]))
367 |             .y(d => y(d[1]))
368 |         );
369 | 
370 |     // Plot the area
371 |     let curve2 = svg.append("path")
372 |         .attr("class", "mypath")
373 |         .datum(density2)
374 |         .attr("fill", "#ff832b")
375 |         .attr("fill-opacity", ".4")
376 |         .attr("stroke", "#ff832b")
377 |         .attr("stroke-width", 2)
378 |         .attr("stroke-linejoin", "round")
379 |         .attr("d", d3.line()
380 |             .curve(d3.curveBasis)
381 |             .x(d => x(d[0]))
382 |             .y(d => y(d[1]))
383 |         );
384 | 
385 |     // Handmade legend
386 |     svg.append("circle").attr("cx",280).attr("cy",10).attr("r", 6).style("fill", "#2b67ff")
387 |     svg.append("circle").attr("cx",280).attr("cy",40).attr("r", 6).style("fill", "#ff832b")
388 |     svg.append("text").attr("x", 300).attr("y", 10).text("Test dataset").style("font-size", "15px").attr("alignment-baseline","middle")
389 |     svg.append("text").attr("x", 300).attr("y", 40).text("Input dataset").style("font-size", "15px").attr("alignment-baseline","middle")
390 |     // Add X axis label:
391 |     svg.append("text")
392 |         .attr("text-anchor", "end")
393 |         .attr("x", width/2 + 90)
394 |         .attr("y", height + 29)
395 |         .attr("font-size", 12)
396 |         .text("Probability density function");
397 | }
398 | 
399 | function getMaxX(data) {
400 |   return data.reduce((max, p) => p['drift_model'] > max ? p['drift_model'] : max, data[0]['drift_model']);
401 | }
402 | 
403 | function getMaxY(data) {
404 |   return data.reduce((max, p) => p['original_model'] > max ? p['original_model'] : max, data[0]['original_model']);
405 | }
406 | 
407 | function drawFeatureImportance(data) {
408 | 
409 |     d3.select("#feat-imp-plot").select("svg").remove();
410 | 
411 |     var values = Object.keys(data).map(function(key){
412 |         return data[key];
413 |     })
414 | 
415 |     let maxX = getMaxX(values);
416 |     let maxY = getMaxY(values);
417 |     let margin = {top: 10, right: 30, bottom: 30, left: 50};
418 |     let width = 550 - margin.left - margin.right;
419 |     let height = 450 - margin.top - margin.bottom;
420 | 
421 |     let svg = d3.select("#feat-imp-plot").append("svg")
422 |         .attr("width", width + margin.left + margin.right)
423 |         .attr("height", height + margin.top + margin.bottom)
424 |         .append("g")
425 |         .attr("transform","translate(" + margin.left + "," + margin.top + ")");
426 | 
427 |     let x = d3.scaleLinear()
428 |         .domain([0, maxX])
429 |         .range([ 0, width]);
430 | 
431 |     svg.append("g")
432 |         .attr("transform", "translate(0," + height + ")")
433 |         .call(d3.axisBottom(x));
434 | 
435 |     let y = d3.scaleLinear()
436 |         .domain([0, maxY])
437 |         .range([height, 0]);
438 | 
439 |     svg.append("g").call(d3.axisLeft(y));
440 | 
441 |     let tooltip = d3.select("#feat-imp-plot")
442 |         .append("div")
443 |         .style("opacity", 0)
444 |         .attr("class", "tooltip")
445 |         .style("background-color", "white")
446 |         .style("border", "solid")
447 |         .style("border-width", "1px")
448 |         .style("border-radius", "5px")
449 |         .style("padding", "10px")
450 |         .style("font-size", "20px")
451 |         .style("text-align", "center")
452 | 
453 |     let tipMouseover = function(d) {
454 |         var html  = d["feature"];
455 |         tooltip.html(html)
456 |             .style("left",d3.select(this).attr("cx") + "px")
457 |             .style("top", d3.select(this).attr("cy") + "px")
458 |             .transition()
459 |             .duration(200) // ms
460 |             .style("opacity", .9)
461 |     };
462 | 
463 |     // Add X axis label:
464 |       svg.append("text")
465 |           .attr("text-anchor", "end")
466 |           .attr("x", width/2 + margin.left + 20)
467 |           .attr("y", height + margin.top + 17)
468 |           .attr("font-size", 12)
469 |           .text("Drift model feature importance (%)");
470 | 
471 |       // Y axis label:
472 |       svg.append("text")
473 |           .attr("text-anchor", "end")
474 |           .attr("transform", "rotate(-90)")
475 |           .attr("y", - margin.left + 20)
476 |           .attr("x", - margin.top - height/2 + 120)
477 |           .attr("font-size", 12)
478 |           .text("Original model feature importance (%)");
479 | 
480 |       // tooltip mouseout event handler
481 |       let tipMouseout = function(d) {
482 |           tooltip.transition()
483 |               .duration(300) // ms
484 |               .style("opacity", 0); // don't care about position!
485 |       };
486 | 
487 |     // Add dots
488 |       svg.append('g')
489 |         .selectAll("dot")
490 |         .data(values)
491 |         .enter()
492 |         .append("circle")
493 |         .attr("cx", function (d) { return x(d['drift_model']); } )
494 |         .attr("cy", function (d) { return y(d['original_model']); } )
495 |         .attr("r", 6)
496 |         .style("fill", "#2b67ff")
497 |         .style("opacity", 1)
498 |         .on("mouseover", tipMouseover)
499 |         .on("mouseout", tipMouseout);
500 | }
501 | 


--------------------------------------------------------------------------------
/webapps/model-drift-view/backend.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import traceback
 3 | import logging
 4 | from flask import request
 5 | import json
 6 | import dataiku
 7 | from dataiku.customwebapp import get_webapp_config
 8 | from dataiku.doctor.posttraining.model_information_handler import PredictionModelInformationHandler
 9 | from dku_data_drift import DriftAnalyzer, ModelAccessor, ModelDriftConstants
10 | from model_metadata import get_model_handler
11 | logger = logging.getLogger(__name__)
12 | 
13 | 
14 | def convert_numpy_int64_to_int(o):
15 |     if isinstance(o, np.int64):
16 |         return int(o)
17 |     raise TypeError
18 | 
19 | @app.route('/list-datasets')
20 | def list_datasets():
21 |     project_key = dataiku.default_project_key()
22 |     client = dataiku.api_client()
23 |     project = client.get_project(project_key)
24 |     dataset_list = [{"name": dataset_dict['name']} for dataset_dict in project.list_datasets()]
25 |     return json.dumps({'dataset_list': dataset_list})
26 | 
27 | @app.route('/get-drift-metrics')
28 | def get_drift_metrics():
29 |     try:
30 |         model_id = request.args.get('model_id')
31 |         version_id = request.args.get('version_id')
32 |         test_set = request.args.get('test_set')
33 |         new_test_df = dataiku.Dataset(test_set).get_dataframe(bool_as_str=True, limit=ModelDriftConstants.MAX_NUM_ROW)
34 | 
35 |         fmi = get_webapp_config().get("trainedModelFullModelId")
36 |         if fmi is None:
37 |             model = dataiku.Model(model_id)
38 |             model_handler = get_model_handler(model, version_id=version_id)
39 |             model_accessor = ModelAccessor(model_handler)
40 |         else:
41 |             original_model_handler = PredictionModelInformationHandler.from_full_model_id(fmi)
42 |             model_accessor = ModelAccessor(original_model_handler)
43 | 
44 |         drifter = DriftAnalyzer()
45 |         drifter.fit(new_test_df, model_accessor=model_accessor)
46 |         return json.dumps(drifter.get_drift_metrics_for_webapp(), allow_nan=False, default=convert_numpy_int64_to_int)
47 |     except:
48 |         logger.error(traceback.format_exc())
49 |         return traceback.format_exc(), 500
50 | 


--------------------------------------------------------------------------------
/webapps/model-drift-view/body.html:
--------------------------------------------------------------------------------
  1 | <link rel="stylesheet" href="/plugins/model-drift/resource/style.css" />
  2 | <script src="/plugins/model-drift/resource/dku-helpers.js"></script>
  3 | <script type="text/javascript" src="https://www.gstatic.com/charts/loader.js"></script>
  4 | <script type='text/javascript' src='/plugins/model-drift/resource/d3.v4.min.js'></script>
  5 | <link rel="stylesheet" href="/plugins/model-drift/resource/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
  6 | 
  7 | 
  8 | <body class="report-box">
  9 | 
 10 | <div class="container-fluid">
 11 |     <div class="view-content text-r">
 12 |         <div style="display: inline;">
 13 |             Select dataset containing new test data
 14 |             <select id="dataset-selector" class="box2 text-r"></select>
 15 |             <button class="dku-btn dku-btn-primary notrunning-state" id="run-button" type="button" style="color: #FFFFFF;">COMPUTE DRIFT</button>
 16 |             <span class="running-state" style="display: none;">Computing ...</span>
 17 |         </div>
 18 |     </div>
 19 | </div>
 20 | 
 21 | <div class="landing-page" style="padding-top:40px; padding-bottom:40px; padding-left:100px; padding-right:100px">
 22 |     </br> </br> </br></br></br>
 23 | <div class="ultralarge-title-sb" style="padding-bottom:16px">
 24 |     Input Data Drift
 25 | </div>
 26 | <div class="grand-title-landing">
 27 |     Get insights on the applicability of a model by exploring how a dataset of new data differs from the original evaluation (test) dataset
 28 | </div>
 29 | </div>
 30 | 
 31 | <div class="container-fluid" id="error_message" style="box-sizing: border-box; padding: 0;"></div>
 32 | <div class="result-state" style="display: none; padding-top:16px; padding-bottom:16px">
 33 |     <div class="container-fluid">
 34 |         <div class="medium-title-sb" style="padding-bottom:4px">Global Drift Score</div>
 35 |         <div class="tiny-text-r" style="padding-bottom:16px">Sampling first <b><span id="sample-size"></span></b> rows per dataset</div>
 36 |         <div class="small-title-r" style="padding-bottom:16px">Drift model</div>
 37 |         <div class="row">
 38 |                 <div class="col drift_score_container">
 39 |                     <div>
 40 |                         <table class="table table-condensed text-sb">
 41 |                           <tbody>
 42 |                             <tr>
 43 |                                 <td colspan="2">Lower bound</td>
 44 |                                 <td></td>
 45 |                                 <td colspan="2">Accuracy</td>
 46 |                                 <td></td>
 47 |                                 <td colspan="2">Upper bound</td>
 48 |                             </tr>
 49 |                             <tr>
 50 |                                 <td id="lower-bound" colspan="2" class="grand-title-r" style="border-top: none; vertical-align: middle; padding-top:0px">0.45</td>
 51 |                                 <td class="grand-title-r" style="border-top: none; vertical-align: middle; padding-top:0px"><span>&#8804;</span></td>
 52 |                                 <td id="accuracy" colspan="2" class="huge-title-r" style="border-top: none; vertical-align: middle; padding-top:0px">0.47</td>
 53 |                                 <td class="grand-title-r" style="border-top: none; vertical-align: middle; padding-top:0px"><span>&#8804;</span></td>
 54 |                                 <td id="upper-bound" colspan="2" class="grand-title-r" style="border-top: none; vertical-align: middle; padding-top:0px">0.6</td>
 55 |                             </tr>
 56 |                           </tbody>
 57 |                         </table>
 58 |                     </div>
 59 |                 </div>
 60 |                 <div class="col explanation text-r"  id="drift-explanation">
 61 |                     <b>Lower is better.</b>
 62 |                     <br>
 63 |                     In order to detect data drift, we train a random forest classifier (the drift model) to discriminate the new data set from the test set. If this classifier has accuracy > 0.5, it implies that test data and new data can be distinguished and that you are observing data drift. You may consider retraining your model in that situation.
 64 |                 </div>
 65 |         </div>
 66 | 
 67 |         <div class="small-title-r" style="padding-bottom:16px">Binomial test</div>
 68 |         <div class="row">
 69 |             <div class="col">
 70 |                 <table class="table table-condensed text-sb table-hover">
 71 |                   <tbody>
 72 |                     <tr>
 73 |                       <th>Hypothesis tested</th>
 74 |                       <td>There is no drift (accuracy <span>&#8804;</span> 0.5)</td>
 75 |                     </tr>
 76 |                     <tr>
 77 |                       <th>Significance level</th>
 78 |                       <td>0.05</td>
 79 |                     </tr>
 80 |                     <tr>
 81 |                       <th>p-value</th>
 82 |                       <td id="binomial-p-value">0.00020</td>
 83 |                     </tr>
 84 |                     <tr>
 85 |                       <th>Conclusion</th>
 86 |                       <td id="binomial-conclusion"><span>&#8804;</span> 0.05 so drift detected</td>
 87 |                     </tr>
 88 |                   </tbody>
 89 |                 </table>
 90 |             </div>
 91 |             <div class="col explanation text-r">
 92 |                 <b>The hypothesis tested</b> is that there is no drift, in which case the expected drift model accuracy is 0.5 (datasets undistinguishable). The observed accuracy might deviate from this expectation and the Binomial test evaluates whether this deviation is statistically significant, modelling the number of correct predictions as a random variable drawn from a Binomial distribution.
 93 |                 <br>
 94 |                 The p-value is the probability to observe this particular accuracy (or larger) under the hypothesis of absent drift. If this probability is lower than the significance level (i.e. 5%), it’s then unlikely to be in the situation of absent drift: the hypothesis of no drift is rejected, triggering a drift detection. The significance level indicates the rate of falsely-detected drifts we are ready to accept from the test.
 95 |             </div>
 96 |         </div>
 97 |     </div>
 98 |     <hr/>
 99 |     <div class="container-fluid">
100 |         <div class="medium-title-sb" style="padding-bottom:16px">Model Information</div>
101 |         <div class="small-title-r" id="fugacity_label" style="padding-bottom:16px">Fugacity</div>
102 |         <div class="row" id="fugacity_div">
103 |             <div class="col fugacity_score_container">
104 |                 <div id="fugacity-score"></div>
105 |             </div>
106 |             <div class="col explanation text-r"> <b>Fugacity</b> expresses the difference between the expected "ideal" data your model was trained on and the observed "real" data you are analyzing. We compare the proportion of samples predicted in each class when scoring on both the test and your input datasets.
107 |             </div>
108 |         </div>
109 | 
110 |         <div class="small-title-r" id="kde_class_option" style="padding-bottom:16px padding-top:16px">Predicted probability density chart <select class="box text-r" id="label-list"></select> </div>
111 |         <div class="row">
112 |             <div class="col kde_chart_container" id="kde_container_div">
113 |                 <div id="kde-chart"></div>
114 |             </div>
115 |             <div class="col explanation text-r" id="kde_explanation">
116 |                 This chart represents the probability density estimation for a given prediction class when scoring both the test dataset and your input dataset.
117 |                 <br><br>Visually different probability density estimations indicate high data drift.
118 |             </div>
119 |         </div>
120 |     </div>
121 |     <hr/>
122 |     <div class="container-fluid" id="feature_importance_div">
123 |         <div class="medium-title-sb" style="padding-bottom:16px">Feature Drift Overview</div>
124 |         <div class="row">
125 |             <div class="col impl_plot_container">
126 |                 <div id="feat-imp-plot"></div>
127 |             </div>
128 |             <div class="col explanation text-r" >
129 |                 The scatter plot shows feature importance for the original model versus feature importance for the
130 |                 (data classifying) drift model.
131 |                 <br><br>
132 |                 <b>This graph should be examined alongside with the drift score (<span id="inline-drift-score-2"></span>)</b>.
133 |                 <br><br>
134 |                 For a highly drifted dataset (drift score ~1), if the features most responsible for data drift are of low importance
135 |                 in the original model (bottom right quadrant), you can expect the behavior of the model to remain the same.
136 |                 <br><br>
137 |                 Features in the top right quadrant of this scatter plot are highly drifted (i.e. they are powerful in
138 |                 distinguishing test data from new observations), but also of high importance for the original model.
139 |                 In this situation, you can expect the performance of the model to degrade as your model does not apply
140 |                 to your new observations.
141 |                 <br><br>
142 |                 <b><span id="riskiest_features_explanation"></span></b>
143 |             </div>
144 |         </div>
145 |     </div>
146 | </div>
147 | </body>


--------------------------------------------------------------------------------
/webapps/model-drift-view/style.css:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | 
  4 | html {
  5 |     background: #f2f2f2;
  6 | }
  7 | h3 {
  8 |     margin-bottom: 13px !important;
  9 | }
 10 | .explanation {
 11 |     margin-bottom: 24px;
 12 | }
 13 | #label-list{
 14 |     padding-left: 17px;
 15 | }
 16 | #drift-score{
 17 |     font-size: 50px;
 18 | }
 19 | .ml-table {
 20 |     width: max-content;
 21 | }
 22 | .tooltip {
 23 |     position: relative;
 24 |     font-size: 8px;
 25 |     width:  auto;
 26 |     height: auto;
 27 |     pointer-events: none;
 28 |     background-color: white;
 29 | }
 30 | .fatal-error {
 31 |     white-space: pre-wrap;
 32 |     text-align: left !important;
 33 | }
 34 | 
 35 | .low-risk{
 36 |     color: green;
 37 | }
 38 | 
 39 | .medium-risk{
 40 |     color: orange;
 41 | }
 42 | 
 43 | .high-risk{
 44 |     color: red;
 45 | }
 46 | 
 47 | table{
 48 |   width: 95% !important;
 49 |   table-layout: fixed;
 50 | }
 51 | 
 52 | .container-fluid {
 53 |   width: auto !important;
 54 |   margin-right: 8px;
 55 |   margin-left: 8px;
 56 | }
 57 | 
 58 | .landing-page {
 59 |     display: flex;
 60 |     flex-direction: column;  /* make main axis vertical */
 61 |     justify-content: center; /* center items vertically, in this case */
 62 |     align-items: center;     /* center items horizontally, in this case */
 63 |     margin: auto;
 64 | }
 65 | 
 66 | .error-page {
 67 |     display: flex;
 68 |     flex-direction: column;  /* make main axis vertical */
 69 |     justify-content: center; /* center items vertically, in this case */
 70 |     align-items: center;     /* center items horizontally, in this case */
 71 |     margin: auto;
 72 | }
 73 | /* Buttons */
 74 | 
 75 | .dku-btn {
 76 |     font-family: 'SourceSansPro';
 77 |     text-transform: uppercase;
 78 |     font-size: 13px;
 79 |     font-weight: 500;
 80 |     padding: 3px 8px;
 81 |     margin: 0;
 82 |     line-height: 1.4;
 83 |     background-image: inherit;
 84 |     box-shadow: none;
 85 |     text-shadow: none;
 86 |     box-sizing: border-box;
 87 |     outline: 0;
 88 |     cursor: pointer;
 89 |     background-color: #ffffff;
 90 |     color: #ffffff;
 91 |     border: 1px solid #cccccc;
 92 |     height: 26px;
 93 | }
 94 | 
 95 | .dku-btn:hover {
 96 |     background-color: #dddddd;
 97 | }
 98 | 
 99 | .dku-btn:disabled {
100 |     background-color: #dddddd;
101 |       cursor: not-allowed;
102 |           pointer-events: none;
103 | }
104 | 
105 | .dku-btn:active {
106 |     background-color: #cccccc;
107 | }
108 | 
109 | .dku-btn-primary {
110 |     background: var(--blue);
111 |     color: #ffffff;
112 |     border: 1px solid transparent;
113 | }
114 | .dku-btn-primary:hover {
115 |     background: #62a3f5;
116 | }
117 | 
118 | .dku-btn-primary:disabled {
119 |     background: #62a3f5;
120 |     cursor: not-allowed;
121 |     pointer-events: none;
122 | }
123 | .dku-btn-primary:active {
124 |     background: #22a4d9;
125 | }
126 | 
127 | .center {
128 |   position: absolute;
129 |   left: 50%;
130 |   top: 40%;
131 |   transform: translate(-50%, -50%);
132 |   padding: 10px;
133 | 
134 | }
135 | 
136 | .report-box {
137 |     width: 1150px;
138 |     margin-top:24px;
139 |     margin-right:24px;
140 |     margin-left:8px;
141 |     padding-top:24px;
142 |     padding-right:24px;
143 |     padding-left:24px;
144 |     box-shadow: 0px 0px 2px 1px rgba(34, 34, 34, 0.15);
145 | }
146 | 
147 | .card-deck {
148 |   margin-right: 8px;
149 |   margin-left: 8px;
150 |   padding-right: 8px;
151 |   padding-left: 8px;
152 | }
153 | 
154 | .container-fluid {
155 |   width: auto !important;
156 |   margin-right: 8px;
157 |   margin-left: 8px;
158 | }
159 | 
160 | 
161 | 
162 | body{
163 |     font-family: var(--font-family);
164 |     background-color : var(--grey-lighten-7);
165 |     font-size: 13px;
166 | }
167 | 
168 | html {
169 |     background: #f2f2f2;}
170 | 
171 | h3 {
172 |     margin-bottom: 13px !important;
173 | }
174 | 
175 | .box {
176 |     background: rgba(255, 255, 255, 0.5);
177 |     border: 1px solid #DDDDDD;
178 |     box-sizing: border-box;
179 |     height: 26px;
180 |     margin: 3px;
181 | }
182 | 
183 | .box2 {
184 |     background: rgba(255, 255, 255, 0.5);
185 |     border: 1px solid #DDDDDD;
186 |     box-sizing: border-box;
187 |     height: 26px;
188 |     margin-left: 3px;
189 |     margin-right: 16px;
190 | }
191 | 
192 | .ultralarge-title-sb{
193 |     font-family: var(--font-family);
194 |     font-style: normal;
195 |     font-weight: 400;
196 |     font-size: 32px;
197 |     line-height: 40px;
198 |     text-align: center;
199 | 
200 |     color: #666666;
201 |     flex: none;
202 |     order: 0;
203 |     align-self: center;
204 |     flex-grow: 0;
205 |     margin: 0px 24px;
206 | 
207 | }
208 | 
209 | .huge-title-r {
210 |    font-family: var(--font-family);
211 |     font-style: normal;
212 |     font-size: 40px;
213 |     color: #222222;
214 |     font-weight: 230;
215 |     line-height: 69px;
216 | }
217 | .grand-title-r{
218 |     font-family: var(--font-family);
219 |     font-style: normal;
220 |     font-weight: 300;
221 |     font-size: 22px;
222 |     line-height: 28px;
223 |     text-align: center;
224 | 
225 |     color: #444444;
226 | 
227 |     flex: none;
228 |     order: 1;
229 |     align-self: center;
230 |     flex-grow: 0;
231 |     margin: 0px 24px;
232 | }
233 | 
234 | .small-title-r {
235 |     font-family: var(--font-family);
236 |     font-style: normal;
237 |     font-weight: normal;
238 |     font-size: 15px;
239 |     line-height: 20px;
240 |     color: #222222;
241 | }
242 | 
243 | .small-title-sb {
244 |     font-family: var(--font-family);
245 |     font-style: normal;
246 |     font-weight: 500;
247 |     font-size: 15px;
248 |     line-height: 20px;
249 |     color: #222222;
250 | }
251 | 
252 | .medium-title-sb {
253 |     font-family: var(--font-family);
254 |     font-style: normal;
255 |     font-weight: 500;
256 |     font-size: 18px;
257 |     line-height: 23px;
258 |     color: #222222;
259 | }
260 | 
261 | .title-sb {
262 |     font-family: var(--font-family);
263 |     font-style: normal;
264 |     font-weight: 500;
265 |     font-size: 20px;
266 |     line-height: 20px;
267 |     color: #222222;
268 | }
269 | 
270 | .huge-title-sb {
271 |    font-family: var(--font-family);
272 |     font-style: normal;
273 |     font-weight: 500;
274 |     font-size: 38px;
275 |     line-height: 48px;
276 |     text-align: left;
277 |     color: #222222;
278 | }
279 | 
280 | 
281 | 
282 | .text {
283 |     font-family: var(--font-family);
284 |     font-style: normal;
285 |     font-weight: normal;
286 |     font-size: 13px;
287 |     line-height: 18px;
288 |     color: #666666;
289 | }
290 | 
291 | .tiny-text-r {
292 |     font-family: var(--font-family);
293 |     font-style: normal;
294 |     font-weight: normal;
295 |     font-size: 10px;
296 |     line-height: 15px;
297 |     color: #BBBBBB;
298 | }
299 | 
300 | .text-highlight {
301 |     font-weight: 500;
302 |     font-size: 13px;
303 |     color: #222222;
304 |     }
305 | 
306 | .text-r {
307 |     font-family: var(--font-family);
308 |     font-style: normal;
309 |     font-weight: normal;
310 |     font-size: 13px;
311 |     line-height: 18px;
312 |     color: #222222;
313 | }
314 | 
315 | .text-sb {
316 |     font-family: var(--font-family);
317 |     font-style: normal;
318 |     font-weight: 500;
319 |     font-size: 13px;
320 |     line-height: 16px;
321 |     color: #222222;
322 | }
323 | 
324 | .text-button {
325 |     font-family: var(--font-family);
326 |     font-style: normal;
327 |     font-weight: 600;
328 |     font-size: 13px;
329 |     line-height: 18px;
330 |     display: flex;
331 |     align-items: center;
332 |     text-align: center;
333 |     text-transform: uppercase;
334 | 
335 |     /* @white-base */
336 | 
337 |     color: #FFFFFF;
338 | 
339 |     /* Inside Auto Layout */
340 | 
341 |     flex: none;
342 |     order: 0;
343 |     align-self: flex-start;
344 |     flex-grow: 0;
345 |     margin: 10px 0px;
346 | }
347 | 
348 | .grand-title-sb{
349 |     font-family: var(--font-family);
350 |     font-style: normal;
351 |     font-weight: 500;
352 |     font-size: 22px;
353 |     line-height: 22px;
354 |     color: #000000;
355 | }
356 | 
357 | td
358 | {
359 |     text-align: center;
360 |     vertical-align: middle;
361 | }
362 | 
363 | table.table.table-condensed {
364 |     border-bottom: 1px solid #DDDDDD !important;
365 |     border-left: 1px solid #DDDDDD;
366 |     border-right: 1px solid #DDDDDD;
367 | }
368 | 
369 | table.table.table-bordered_out {
370 |     border-bottom: 1px solid #DDDDDD !important;
371 |     border-left: 1px solid #DDDDDD;
372 |     border-right: 1px solid #DDDDDD;
373 | }
374 | 
375 | 
376 | 
377 | .grand-title-landing{
378 |     font-family: var(--font-family);
379 |     font-style: normal;
380 |     font-weight: 300;
381 |     font-size: 22px;
382 |     line-height: 28px;
383 |     text-align: center;
384 | 
385 |     color: #666666;
386 | 
387 |     flex: none;
388 |     order: 1;
389 |     align-self: center;
390 |     flex-grow: 0;
391 |     margin: 0px 24px;
392 | }


--------------------------------------------------------------------------------
/webapps/model-drift-view/webapp.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "meta": {
 3 |         "label": "Data drift analysis",
 4 |         "description": "Get insights on the applicability of a model by exploring how a dataset of new data differs from the  evaluation (test) dataset.",
 5 |         "icon": "icon-anchor"
 6 |     },
 7 |     "baseType": "STANDARD",
 8 |     "hasBackend": "true",
 9 |     "standardWebAppLibraries": ["jquery", "dataiku", "font_awesome"],
10 |     "virtualWebAppAffinity": "PREFER_VIRTUAL",
11 |     "roles": [
12 |         {
13 |             "type": "SAVED_MODEL",
14 |             "targetParamsKey": "modelId",
15 |             "pathParamsKey": "versionId"
16 |         },
17 |         {
18 |           "type": "ANALYSIS",
19 |           "contentType": "prediction",
20 |           "targetParamsKey": "trainedModelFullModelId"
21 |         }
22 |     ],
23 |     "params":[]
24 | }
25 | 


--------------------------------------------------------------------------------