├── .DS_Store ├── .gitignore ├── 00-git-pull.ipynb ├── 01-connect_and_validate_data.ipynb ├── 02-data_preparation.ipynb ├── 03-train_models.ipynb ├── 04-deploy_model.ipynb ├── 05_monitor_deployment.ipynb ├── 2023_04_MLOps_AOT_Initiative.drawio ├── LICENSE ├── README.md ├── TODO.md ├── assets └── .METADATA │ ├── .gitkeep │ └── .version.json ├── assettypes └── .gitkeep ├── custom_env.yaml ├── images ├── 2023-08-31 09_10_14-2023_04_MLOps_AOT_Initiative.drawio - diagrams.net.png ├── 2023-08-31-09_10_14.png ├── 2023-09-05-11_00_27.png ├── 2023_04_MLOps_AOT_Initiative.drawio ├── banner.png ├── detailed_overview.png ├── image-1.png ├── image-10.png ├── image-100.png ├── image-101.png ├── image-102.png ├── image-103.png ├── image-11.png ├── image-12.png ├── image-13.png ├── image-14.png ├── image-15.png ├── image-16.png ├── image-17.png ├── image-18.png ├── image-19.png ├── image-2.png ├── image-20.png ├── image-200.png ├── image-201.png ├── image-202.png ├── image-203.png ├── image-204.png ├── image-205.png ├── image-206.png ├── image-207.png ├── image-21.png ├── image-22.png ├── image-23.png ├── image-24.png ├── image-25.png ├── image-26.png ├── image-27.png ├── image-28.png ├── image-29.png ├── image-3.png ├── image-4.png ├── image-5.png ├── image-6.png ├── image-7.png ├── image-8.png ├── image-9.png ├── image.png ├── overview-image-1.png └── overview-image.png ├── utils ├── catalog_utils.py └── fs_utils.py └── vars_and_utils.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | venv/ 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | **/.ipynb_checkpoints/* 133 | **/.virtual_documents/* 134 | assets/.METADATA/job_run.* 135 | assets/job_run 136 | assets/.METADATA/auto_ml.* 137 | assets/auto_ml 138 | assets/.METADATA/experiment.* 139 | assets/.METADATA/wml_experiment.* 140 | assets/experiment 141 | assets/.METADATA/wml_training_definition.* 142 | assets/wml_training_definition 143 | assets/.METADATA/wml_remote_training_system.* 144 | assets/wml_remote_training_system 145 | assets/federated_learning 146 | cover/ 147 | .pybuilder/ 148 | .pytype/ 149 | cython_debug/ -------------------------------------------------------------------------------- /00-git-pull.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d720957f", 6 | "metadata": {}, 7 | "source": [ 8 | "![Alt text](images/banner.png)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "4c4febc4-bc42-48cb-a226-e8a352003011", 14 | "metadata": { 15 | "id": "1c9ea256-70b7-4d27-9628-ef9f55978309", 16 | "tags": [] 17 | }, 18 | "source": [ 19 | "## Pulling changes from git\n", 20 | "\n", 21 | "This notebook pulls the current state from a given git repository.\n", 22 | "This is necessary so Watson Pipelines will always execute the newest changes!\n", 23 | "\n", 24 | "You might have to open a terminal to create the .env file, since JupyterLab does not show .env files by default. " 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "76dc108f-1be7-4618-b9c8-425ff448c732", 31 | "metadata": { 32 | "id": "76dc108f-1be7-4618-b9c8-425ff448c732", 33 | "tags": [] 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "from ibm_watson_studio_pipelines import WSPipelines\n", 38 | "import subprocess\n", 39 | "import os\n", 40 | "from dotenv import load_dotenv\n", 41 | "\n", 42 | "TOKEN = os.getenv(\"USER_ACCESS_TOKEN\")\n", 43 | "\n", 44 | "load_dotenv()\n", 45 | "\n", 46 | "repo_adresse = os.getenv('repo_adresse') # without https://\n", 47 | "personal_access_token=os.getenv('personal_access_token') # generate this in github \n", 48 | "branch_name = os.getenv('branch_name') # should match the branch that is currently checked out within this project\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "id": "2d9e7962-d85a-4f20-8593-9e0ca93bdb2c", 54 | "metadata": { 55 | "id": "e9e0d7ad-e7a0-4ef5-b008-b6474d0449f1" 56 | }, 57 | "source": [ 58 | "#### setting personal access token --> TODO: get this from a secret vault!"
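The `.env` file itself is intentionally not committed (see the `# Environments` section of the `.gitignore` above). A hypothetical bootstrap for it, matching the `os.getenv()` calls in the cell above, might look like the sketch below; every value is a placeholder, and per the TODO the token should really come from a secret vault:

```python
# Hypothetical bootstrap for the .env file this notebook expects.
# Variable names mirror the os.getenv() calls above; all values are
# placeholders. In production, fetch the token from a secret vault.
from pathlib import Path

env_contents = "\n".join([
    "repo_adresse=github.com/<org>/<repo>.git",  # repo address without https://
    "personal_access_token=<github-pat>",        # generated in GitHub settings
    "branch_name=main",                          # branch checked out in this project
])
Path(".env").write_text(env_contents + "\n")
```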
59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "id": "d52fe0de-10de-45e6-a69a-1528cee38c40", 64 | "metadata": { 65 | "id": "e37c810c-8b5d-448c-be88-bfac1dc21619" 66 | }, 67 | "source": [ 68 | "ditching all possibly existing local changes" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "ba1edbf0-504c-475d-9bf4-a5e7b8ee9383", 75 | "metadata": { 76 | "id": "ba1edbf0-504c-475d-9bf4-a5e7b8ee9383", 77 | "tags": [] 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "!git reset --hard HEAD" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "eeaf8fb6-23b8-48c3-9fd8-a92203eb80e3", 87 | "metadata": { 88 | "id": "5ac99cf8-ce89-49c0-b397-0ab66c60a2ce" 89 | }, 90 | "source": [ 91 | "removing current origin to replace it with an origin with explicit credentials" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "id": "23055252-3170-41fe-918a-9025af772ab8", 98 | "metadata": { 99 | "id": "23055252-3170-41fe-918a-9025af772ab8", 100 | "tags": [] 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "import os\n", 105 | "\n", 106 | "exit_code = os.system(\"git remote remove origin\")\n", 107 | "if exit_code != 0:\n", 108 | " print(\"An error occurred.\")" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "dd53ddaf-3621-4ac1-8026-e35e713fceda", 115 | "metadata": { 116 | "id": "dd53ddaf-3621-4ac1-8026-e35e713fceda", 117 | "tags": [] 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "command=\"git remote add origin https://\"+personal_access_token+\"@\"+repo_adresse\n", 122 | "!{command}" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "id": "162fa803-c479-4c06-81fa-76f5af9d3b28", 129 | "metadata": { 130 | "id": "11465116-165a-4af5-bfbc-5fef6cda7d73", 131 | "tags": [] 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "!git fetch " 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "id": "62cb29ec-2115-4cc1-b6bd-58bbbac00934", 141 | "metadata": { 142 | "id": "04850200-69cf-4031-8c09-44e75f6e5371" 143 | }, 144 | "source": [ 145 | "for some reason git forgets which branch it is looking at right now --> have to set it explicity" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "id": "934d5b9f-a8e5-484e-a770-cfa8ff4f17f2", 152 | "metadata": { 153 | "id": "934d5b9f-a8e5-484e-a770-cfa8ff4f17f2", 154 | "tags": [] 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "command=\"git branch --set-upstream-to=origin/\"+branch_name\n", 159 | "!{command}" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "id": "db3e6c1a-7465-4cf5-aab5-a5cb7ddb7340", 166 | "metadata": { 167 | "id": "db3e6c1a-7465-4cf5-aab5-a5cb7ddb7340", 168 | "tags": [] 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "!git pull " 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "id": "283e1d1c", 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "import os\n", 183 | "#TODO: remove this --> potentially leaking secrets\n", 184 | "for key, value in os.environ.items():\n", 185 | " print(f\"{key}={value}\")" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "id": "0838de24", 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "!git status" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "id": "5dbfac9b", 202 | "metadata": {}, 203 | 
"outputs": [], 204 | "source": [ 205 | "import subprocess\n", 206 | "\n", 207 | "def is_git_up_to_date():\n", 208 | " try:\n", 209 | " # Execute the git status command and get its output as a string\n", 210 | " result = subprocess.run(['git', 'status'], stdout=subprocess.PIPE, check=True)\n", 211 | " output = result.stdout.decode('utf-8')\n", 212 | " \n", 213 | " # Check if the desired string is in the output\n", 214 | " return \"Your branch is up to date\" in output\n", 215 | " except subprocess.CalledProcessError:\n", 216 | " # Handle errors related to the git command\n", 217 | " print(\"Error executing git status. Ensure you're in a git repository.\")\n", 218 | " return False\n", 219 | "\n", 220 | "# Test the function\n", 221 | "print(is_git_up_to_date())" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "id": "8cc5aebc", 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "validation_params = {}\n", 232 | "validation_params['was_succesfull'] = is_git_up_to_date()\n", 233 | "\n", 234 | "\n", 235 | "pipelines_client = WSPipelines.from_token(TOKEN)\n", 236 | "pipelines_client.store_results(validation_params)" 237 | ] 238 | } 239 | ], 240 | "metadata": { 241 | "kernelspec": { 242 | "display_name": "Python 3.10", 243 | "language": "python", 244 | "name": "python3" 245 | }, 246 | "language_info": { 247 | "codemirror_mode": { 248 | "name": "ipython", 249 | "version": 3 250 | }, 251 | "file_extension": ".py", 252 | "mimetype": "text/x-python", 253 | "name": "python", 254 | "nbconvert_exporter": "python", 255 | "pygments_lexer": "ipython3", 256 | "version": "3.10.10" 257 | } 258 | }, 259 | "nbformat": 4, 260 | "nbformat_minor": 5 261 | } 262 | -------------------------------------------------------------------------------- /01-connect_and_validate_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "![Alt text](images/banner.png)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "id": "9501e8a8-d435-4451-a8ae-513e984aafe9" 14 | }, 15 | "source": [ 16 | "## Connection and Data Validation Notebook" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "062a2e71-ddbf-4c77-aaeb-d5aa7ac8d265" 23 | }, 24 | "source": [ 25 | "### Load the Credentials\n", 26 | "\n", 27 | "These environment variables are automatically set in WS Pipelines and are needed to access various services. 
" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "id": "e875ff36-3235-43fb-8008-4bfb334c1325", 35 | "tags": [] 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "import os\n", 40 | "TOKEN = os.getenv(\"USER_ACCESS_TOKEN\")" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "id": "5dc1650f-d3fd-49f3-820d-dbce4ab98d04" 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "## Imports" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": { 58 | "id": "fe00539c-a0ab-4769-ba7b-805adea59cf8", 59 | "tags": [] 60 | }, 61 | "outputs": [ 62 | { 63 | "name": "stderr", 64 | "output_type": "stream", 65 | "text": [ 66 | "2023-10-23 07:24:33.933101: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", 67 | "2023-10-23 07:24:33.933161: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", 68 | "2023-10-23 07:24:33.933204: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", 69 | "2023-10-23 07:24:35.469552: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "from botocore.client import Config\n", 75 | "from sklearn.model_selection import train_test_split\n", 76 | "from dataclasses import dataclass\n", 77 | "import tensorflow_data_validation as tfdv\n", 78 | "import numpy as np\n", 79 | "import pandas as pd\n", 80 | "from ibm_watson_studio_pipelines import WSPipelines\n", 81 | "import warnings\n", 82 | "\n", 83 | "\n", 84 | "warnings.filterwarnings(\"ignore\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "### Loading Variables and Utils from common python file\n", 92 | "\n", 93 | "In this section we load the variables and functions from the common python file. This file contains the variables and functions that are common to all the notebooks in this project." 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "import vars_and_utils as vars_and_utils" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": { 108 | "id": "d44e72ca-25cf-44c2-a8f4-4ad6eff50e4a" 109 | }, 110 | "source": [ 111 | "## Load the Training Data \n", 112 | "\n", 113 | "this will check if the training data exists within a defined db2 table. If it does not exist, it will load the data from the web and store it in the project space as a .csv file." 
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 14, 119 | "metadata": { 120 | "id": "0487c397-3d75-4292-ae6f-8f2cd4bb8f19", 121 | "tags": [] 122 | }, 123 | "outputs": [ 124 | { 125 | "ename": "NameError", 126 | "evalue": "name 'training_file_path' is not defined", 127 | "output_type": "error", 128 | "traceback": [ 129 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 130 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 131 | "Cell \u001b[0;32mIn[14], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m gcr_df \u001b[38;5;241m=\u001b[39m load_data_from_project(\u001b[43mtraining_file_path\u001b[49m)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m## Encode for ease of use with OpenScale\u001b[39;00m\n\u001b[1;32m 4\u001b[0m gcr_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mRisk\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m gcr_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mRisk\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mmap({\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mRisk\u001b[39m\u001b[38;5;124m'\u001b[39m:\u001b[38;5;241m1\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNo Risk\u001b[39m\u001b[38;5;124m'\u001b[39m:\u001b[38;5;241m0\u001b[39m})\n", 132 | "\u001b[0;31mNameError\u001b[0m: name 'training_file_path' is not defined" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "gcr_df = vars_and_utils.load_german_credit_risk_data()\n", 138 | "\n", 139 | "## Encode for ease of use with OpenScale\n", 140 | "gcr_df['Risk'] = gcr_df['Risk'].map({'Risk':1,'No Risk':0})\n", 141 | "gcr_df.head()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": { 147 | "id": "e38d5859-3df7-4174-8f8c-a047c0dcdb3c" 148 | }, 149 | "source": [ 150 | "## Data Validation " 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "id": "37d30747-1f1d-42ad-ab50-6001740df627", 158 | "tags": [] 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "@dataclass\n", 163 | "class Datavalidation:\n", 164 | " \"\"\"\n", 165 | " \n", 166 | " Data Validation Class\n", 167 | " \n", 168 | " \"\"\"\n", 169 | " dataframe : pd.DataFrame\n", 170 | " mask_per :int\n", 171 | " \n", 172 | " \n", 173 | " def split_data(self,seed=32):\n", 174 | " \"\"\"\n", 175 | " Split Data into Train and Test Splits\n", 176 | " \n", 177 | " \"\"\"\n", 178 | " np.random.seed(seed)\n", 179 | " mask = np.random.rand(len(self.dataframe)) <= self.mask_per\n", 180 | " training_data = gcr_df[mask]\n", 181 | " testing_data = gcr_df[~mask]\n", 182 | "\n", 183 | " print(f\"No. of training examples: {training_data.shape[0]}\")\n", 184 | " print(f\"No. 
of testing examples: {testing_data.shape[0]}\")\n", 185 | " \n", 186 | " return training_data, testing_data\n", 187 | " \n", 188 | " # TODO: Replace with Db2/filesystem\n", 189 | " def save_data_in_filesystem(self,df,filename):\n", 190 | " \"\"\"\n", 191 | " Save Data in Filesystem\n", 192 | "\n", 193 | " Passed filename should include the path\n", 194 | "\n", 195 | " \"\"\"\n", 196 | " try:\n", 197 | " df.to_csv(filename,index=False)\n", 198 | " print(f\"File {filename} persisted successfully\")\n", 199 | " except Exception as e:\n", 200 | " print(e)\n", 201 | " print(f\"File serialization for {filename} failed\")\n", 202 | " \n", 203 | " def generate_statistics(self,df):\n", 204 | " \"\"\"\n", 205 | " \n", 206 | " Generate Statistics on a given Dataframe\n", 207 | " \n", 208 | " \"\"\"\n", 209 | " train_stats = tfdv.generate_statistics_from_dataframe(df)\n", 210 | " tfdv.visualize_statistics(train_stats)\n", 211 | " return train_stats\n", 212 | " \n", 213 | " def inferSchema(self,stats):\n", 214 | " \n", 215 | " \"\"\"\n", 216 | " Infer a Schema from the given statistics\n", 217 | " \n", 218 | " \"\"\"\n", 219 | " schema = tfdv.infer_schema(statistics=stats)\n", 220 | " tfdv.display_schema(schema=schema)\n", 221 | " return schema\n", 222 | " \n", 223 | " def compare_statistics(self,lhs,rhs):\n", 224 | " \"\"\"\n", 225 | " \n", 226 | " Compare statistics between a test dataset and a reference (training) dataset\n", 227 | " \n", 228 | " \"\"\"\n", 229 | " # Compare evaluation data with training data\n", 230 | " tfdv.visualize_statistics(lhs_statistics=lhs, rhs_statistics=rhs,\n", 231 | " lhs_name='TEST_DATASET', rhs_name='TRAIN_DATASET')\n", 232 | " \n", 233 | " \n", 234 | " def check_for_anomalies(self,testable_stats,ref_schema):\n", 235 | " \"\"\"\n", 236 | " \n", 237 | " Check for anomalies based on the given statistics and schema\n", 238 | " \n", 239 | " \"\"\"\n", 240 | " anomalies = tfdv.validate_statistics(statistics=testable_stats, schema=ref_schema)\n", 241 | " tfdv.display_anomalies(anomalies)\n", 242 | " if len(anomalies.anomaly_info.items()) > 0:\n", 243 | " print(\"Anomalies found in dataset...\")\n", 244 | " print(str(anomalies.anomaly_info.items()))\n", 245 | " return True\n", 246 | " else:\n", 247 | " return False" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": { 253 | "id": "fff2bb05-abd0-4e0a-9727-d1127a617f57" 254 | }, 255 | "source": [ 256 | "### Split Data into Train and Eval Splits to Check for Consistency" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "id": "65163ff9-3193-4837-ab55-d66de3a5076f", 264 | "tags": [] 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "classvalidate = Datavalidation(dataframe=gcr_df,mask_per=0.8) \n", 269 | "\n", 270 | "training_data, testing_data = classvalidate.split_data()" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": { 276 | "id": "75029b2c-6341-4a59-957c-cbb2d33e3e39" 277 | }, 278 | "source": [ 279 | "## Generate Training Stats on both Splits" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": { 286 | "id": "36a97f3f-3cd7-493d-8a4b-c3c87ae0710f", 287 | "tags": [] 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "train_stats = classvalidate.generate_statistics(training_data)\n", 292 | "test_stats = classvalidate.generate_statistics(testing_data)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": { 298 | "id":
"ed4d3f56-e852-4269-a3dd-8426d71bed8e" 299 | }, 300 | "source": [ 301 | "## Infer Data Schemas" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": { 308 | "id": "13f7be93-3fb1-49ca-9238-ebb1fcc1af28", 309 | "tags": [] 310 | }, 311 | "outputs": [], 312 | "source": [ 313 | "train_schema = classvalidate.inferSchema(train_stats)\n", 314 | "test_schema = classvalidate.inferSchema(test_stats)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": { 320 | "id": "ea9f07ac-ac4b-4ecd-8840-a5ab07bfb7f8" 321 | }, 322 | "source": [ 323 | "## Compare Eval and Train Data " 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": { 330 | "id": "f5ccfb5d-05bd-4b3b-9f4d-8082252457c3", 331 | "tags": [] 332 | }, 333 | "outputs": [], 334 | "source": [ 335 | "classvalidate.compare_statistics(lhs=test_stats,rhs=train_stats)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": { 341 | "id": "915f6a20-aab3-47b5-86e9-93bde2f548ff" 342 | }, 343 | "source": [ 344 | "## Check For Data Anomalies " 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": { 350 | "id": "eae32b5f-46b3-4e5d-919e-68c206a47b8e" 351 | }, 352 | "source": [ 353 | "### Check eval data for errors by validating the eval data stats using the previously inferred schema." 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "id": "f2927d8b-fa0f-420f-9181-a080cdfcb748", 361 | "tags": [] 362 | }, 363 | "outputs": [], 364 | "source": [ 365 | "anomaly_status = classvalidate.check_for_anomalies(test_stats,train_schema)\n", 366 | "anomaly_status" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "id": "c914eade-25d9-497a-960e-ec5a57282def" 373 | }, 374 | "source": [ 375 | "## Save Train and Test Data for Data Preparation Stage" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": { 382 | "id": "0f1d08cc-e4f7-4010-8b4e-7414529e874a", 383 | "tags": [] 384 | }, 385 | "outputs": [], 386 | "source": [ 387 | "# TODO: Replace with Db2/fileystem\n", 388 | "if not anomaly_status:\n", 389 | " classvalidate.save_data_in_filesystem(df=training_data,filename=vars_and_utils.train_data_path)\n", 390 | " classvalidate.save_data_in_filesystem(df=testing_data,filename=vars_and_utils.test_data_path)" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": { 396 | "id": "23d79a03-46e7-49de-a549-65797ba3d46c" 397 | }, 398 | "source": [ 399 | "## Check if the validation steps were successful\n", 400 | "This checks if anomalies were found and if the data was successfully split into train and eval splits and stored as files." 
401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "def validation_successful(train_data_path, test_data_path):\n", 410 | " if anomaly_status: # anomalies were found in the data\n", 411 | " return False\n", 412 | " elif not os.path.exists(train_data_path): # train data file is missing\n", 413 | " return False\n", 414 | " elif not os.path.exists(test_data_path): # test data file is missing\n", 415 | " return False\n", 416 | " else:\n", 417 | " print(\"validation of the data successful\")\n", 418 | " return True\n", 419 | " \n", 420 | "validation_successful(vars_and_utils.train_data_path, vars_and_utils.test_data_path)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": { 426 | "id": "b42568a9-36f8-407e-be7a-f8bbbf93444a" 427 | }, 428 | "source": [ 429 | "## Register the output variables for the next pipeline stage\n", 430 | "Every notebook outputs a \"was_successful\" boolean variable. The logic behind this is different for every notebook and can be altered to fit the needs of the project.\n", 431 | "If needed, additional variables can be created here, but they also need to be registered as output variables in the Watson Pipelines UI." 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": { 438 | "id": "9f210efd-7358-41a4-9ab5-37a856f3ab47", 439 | "tags": [] 440 | }, 441 | "outputs": [], 442 | "source": [ 443 | "validation_params = {}\n", 444 | "validation_params['was_succesfull'] = validation_successful(vars_and_utils.train_data_path, vars_and_utils.test_data_path)\n", 445 | "\n", 446 | "pipelines_client = WSPipelines.from_token(TOKEN)\n", 447 | "pipelines_client.store_results(validation_params)" 448 | ] 449 | } 450 | ], 451 | "metadata": { 452 | "kernelspec": { 453 | "display_name": "Python 3.10", 454 | "language": "python", 455 | "name": "python3" 456 | }, 457 | "language_info": { 458 | "codemirror_mode": { 459 | "name": "ipython", 460 | "version": 3 461 | }, 462 | "file_extension": ".py", 463 | "mimetype": "text/x-python", 464 | "name": "python", 465 | "nbconvert_exporter": "python", 466 | "pygments_lexer": "ipython3", 467 | "version": "3.10.10" 468 | }, 469 | "vscode": { 470 | "interpreter": { 471 | "hash": "bd385fe162c5ca0c84973b7dd5c518456272446b2b64e67c2a69f949ca7a1754" 472 | } 473 | } 474 | }, 475 | "nbformat": 4, 476 | "nbformat_minor": 4 477 | } 478 | -------------------------------------------------------------------------------- /02-data_preparation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "![Alt text](images/banner.png)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "id": "4292d1bf-4e75-4a75-b8ca-914d4f58d925" 14 | }, 15 | "source": [ 16 | "## Data Preparation Notebook" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Initial Setup\n", 24 | "\n", 25 | "Some initial setup specific to running this notebook as part of the pipeline. 
" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "id": "db343866-6051-45ab-a62d-a1afef8b9428" 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import os\n", 37 | "#This environment variable is automatically set in WS Pipelines and are needed to access various services.\n", 38 | "TOKEN = os.getenv(\"USER_ACCESS_TOKEN\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "id": "dc2767f8-217a-4dca-a18e-1939eea8cd1f" 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "if os.getenv(\"running_in_production_pipeline\"):\n", 50 | " running_in_production_pipeline = True\n", 51 | " # If you want to run additional steps when deploying to production like reporting to external services, you can use this variable to trigger that\n", 52 | " # It can also be used to skip steps that are only needed in development like plotting\n", 53 | " print(\"notebook is running in a production pipeline!\")\n", 54 | "else:\n", 55 | " running_in_production_pipeline = False" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": { 61 | "id": "1e30a381-886e-4710-8874-3ff365e537cd" 62 | }, 63 | "source": [ 64 | "## Imports" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "id": "f161da10-75e1-4e66-8859-345ff5fcb899", 72 | "tags": [] 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "from sklearn.feature_selection import SelectKBest\n", 77 | "from sklearn.feature_selection import chi2\n", 78 | "from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,LabelEncoder,MinMaxScaler\n", 79 | "from sklearn.feature_selection import mutual_info_classif\n", 80 | "from sklearn.compose import ColumnTransformer\n", 81 | "from sklearn.pipeline import Pipeline\n", 82 | "from botocore.client import Config\n", 83 | "from ibm_watson_studio_pipelines import WSPipelines\n", 84 | "import matplotlib.pyplot as plt\n", 85 | "import heapq\n", 86 | "import os\n", 87 | "import pandas as pd\n", 88 | "\n", 89 | "# Loading Variables and Utils from common python file\n", 90 | "import vars_and_utils as vars_and_utils\n" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": { 96 | "id": "538bf340-c35f-439a-af79-ad9f5a0c6389" 97 | }, 98 | "source": [ 99 | "## Preparing the Train Data " 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "id": "aa1cd88c-f7df-4b8c-a908-49e84c4de6e5", 107 | "tags": [] 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "train_data = vars_and_utils.load_data_from_filesystem(vars_and_utils.train_data_path)\n", 112 | "train_data.head()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "id": "08be57ad-b834-42f8-bb29-4f66474a0b3b", 120 | "tags": [] 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "object_df = train_data.select_dtypes('O')\n", 125 | "object_df.head()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "id": "c3bf8612-3433-4810-b5c5-3b6d9d23d57d", 133 | "tags": [] 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "object_cols = list(set(object_df.columns.tolist()) - set(['Risk']))\n", 138 | "object_cols" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "id": "27f48dcb-7dee-44a8-b8ed-2ed58385912f", 146 | "tags": [] 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "numerical_columns 
= [col for col in train_data.columns.tolist() if col not in object_cols and col!='Risk']" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": { 156 | "id": "10c03047-c200-4f7e-91e6-0bdb31554fea" 157 | }, 158 | "source": [ 159 | "## Preparing the Test Data " 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "id": "ca4e5bb6-b11e-48cd-8457-a127a60b06c8", 167 | "tags": [] 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "test_data = vars_and_utils.load_data_from_filesystem(vars_and_utils.test_data_path)\n", 172 | "test_data.head()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "id": "f31f399e-904c-49f0-9e2b-12b042d8e8b1" 179 | }, 180 | "source": [ 181 | "## Split the data sets " 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "id": "2dfd43ce-7cb2-4db1-b76d-d2ab6c796b2c", 189 | "tags": [] 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "y_train = train_data['Risk']\n", 194 | "X_train = train_data.drop(\"Risk\",axis=1)\n", 195 | "\n", 196 | "\n", 197 | "y_test = test_data['Risk']\n", 198 | "X_test = test_data.drop(\"Risk\",axis=1)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": { 204 | "id": "0af40507-eb8d-4e9c-a263-ab624f657375" 205 | }, 206 | "source": [ 207 | "## Categorical Feature Analysis " 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "id": "f97b9595-c499-4dbd-b143-85310dfa73bd", 215 | "tags": [] 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "def prepare_input_data(X_train, X_test):\n", 220 | " oe = OrdinalEncoder()\n", 221 | " oe.fit(X_train)\n", 222 | " X_train_enc = oe.transform(X_train)\n", 223 | " X_test_enc = oe.transform(X_test)\n", 224 | " return X_train_enc, X_test_enc\n", 225 | "\n", 226 | "\n", 227 | "def prepare_output_data(y_train, y_test):\n", 228 | " le = LabelEncoder()\n", 229 | " le.fit(y_train)\n", 230 | " y_train_enc = le.transform(y_train)\n", 231 | " y_test_enc = le.transform(y_test)\n", 232 | " return y_train_enc, y_test_enc\n", 233 | "\n", 234 | "\n", 235 | "def select_best_chi2_features(X_train, y_train, X_test,score_func=chi2):\n", 236 | " featureselector = SelectKBest(score_func=score_func, k='all')\n", 237 | " featureselector.fit(X_train, y_train)\n", 238 | " X_train_best_feat = featureselector.transform(X_train)\n", 239 | " X_test_best_feat= featureselector.transform(X_test)\n", 240 | " return X_train_best_feat, X_test_best_feat, featureselector\n", 241 | "\n", 242 | "\n", 243 | "def select_best_mutualinf_features(X_train, y_train, X_test,k=5):\n", 244 | " featureselector = SelectKBest(score_func=mutual_info_classif, k=k)\n", 245 | " featureselector.fit(X_train, y_train)\n", 246 | " X_train_best_feat = featureselector.transform(X_train)\n", 247 | " X_test_best_feat= featureselector.transform(X_test)\n", 248 | " return X_train_best_feat, X_test_best_feat, featureselector\n", 249 | " \n", 250 | " \n", 251 | "def get_top_k_catgeorical(fs,train_cat,k=10):\n", 252 | " fs_score_map = {}\n", 253 | " for i in range(len(fs.scores_)):\n", 254 | " #print(f\"Feature {train_cat.columns.tolist()[i]} {fs.scores_[i]}\")\n", 255 | " fs_score_map[train_cat.columns.tolist()[i]] = fs.scores_[i]\n", 256 | " \n", 257 | " k_keys_sorted_by_values = heapq.nlargest(k, fs_score_map, key=fs_score_map.get)\n", 258 | " \n", 259 | " return k_keys_sorted_by_values" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {
265 | "id": "660e9d2f-34db-4ecc-9fd5-970462fc2009" 266 | }, 267 | "source": [ 268 | "## Encode and shape the Variables " 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "id": "01ed54fe-7afa-4bc4-a37b-b85e6bf6cdc4", 276 | "tags": [] 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "X_train_enc, X_test_enc = prepare_input_data(X_train[object_cols], X_test[object_cols])\n", 281 | "\n", 282 | "y_train_enc, y_test_enc = prepare_output_data(y_train, y_test)\n", 283 | "\n", 284 | "X_train_fs, X_test_fs, fs = select_best_chi2_features(X_train_enc, y_train_enc, X_test_enc)\n", 285 | "\n" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": { 291 | "id": "8b210746-a131-4ac1-84db-105848cdbf8e" 292 | }, 293 | "source": [ 294 | "## Top K Categorical Features based on Chi2" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "id": "889c067d-bbb9-4655-8c52-03769449b649", 302 | "tags": [] 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "top_k_cat = get_top_k_catgeorical(fs,X_train[object_cols])\n", 307 | "top_k_cat" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": { 313 | "id": "0a017731-d213-43da-8fbb-67b98b28f8a8" 314 | }, 315 | "source": [ 316 | "## Top K Categorical Features based on Mutual Information Feature Selection" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "id": "ccb4d703-e822-4e9d-b66b-bc19c0ea94e3", 324 | "tags": [] 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "X_train_enc_mf, X_test_enc_mf = prepare_input_data(X_train[object_cols], X_test[object_cols])\n", 329 | "\n", 330 | "y_train_enc_mf, y_test_enc_mf = prepare_output_data(y_train, y_test)\n", 331 | "\n", 332 | "X_train_fs_mf, X_test_fs_mf, fs_mf = select_best_mutualinf_features(X_train_enc_mf, y_train_enc_mf, X_test_enc_mf)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": { 339 | "id": "577ace08-4e4f-4a97-a56c-7d26b610bec0", 340 | "tags": [] 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "top_k_cat_mf = get_top_k_catgeorical(fs_mf,X_train[object_cols])\n", 345 | "top_k_cat_mf" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": { 352 | "id": "faefd6b2-000e-4545-8607-e8ed0efbe772", 353 | "tags": [] 354 | }, 355 | "outputs": [], 356 | "source": [ 357 | "union_features = list(set(top_k_cat+top_k_cat_mf))\n", 358 | "if \"Sex\" not in union_features:\n", 359 | " union_features.append(\"Sex\")\n", 360 | "union_features" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": { 366 | "id": "74b9d9c5-636d-4504-841b-53ef8e072462" 367 | }, 368 | "source": [ 369 | "## Filter the Top K Categorical features and Merge to Original Train and Test Dataframes" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "id": "54680c6a-59b4-447e-bfda-cf0ac7076aef", 377 | "tags": [] 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "X_train_object_filtered = X_train[union_features]\n", 382 | "X_test_object_filtered = X_test[union_features]\n", 383 | "\n", 384 | "X_train_final = pd.concat([X_train[numerical_columns],X_train_object_filtered],axis=1)\n", 385 | "\n", 386 | "X_test_final = pd.concat([X_test[numerical_columns],X_test_object_filtered],axis=1)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": { 392 | "id":
"c997be4b-7089-49f6-b51a-048d8003e0bc" 393 | }, 394 | "source": [ 395 | "## Use Column Transformer and Pipelines to encode the Input and Output Variables . Scale the Numerical columns using MinMaxScaler." 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": { 402 | "id": "c2b11376-2c27-47e6-ba1e-db772d63ca12", 403 | "tags": [] 404 | }, 405 | "outputs": [], 406 | "source": [ 407 | "numerical_ix = X_train_final.select_dtypes(include=['int64', 'float64']).columns\n", 408 | "categorical_ix = X_train_final.select_dtypes(include=['object', 'bool']).columns" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": { 415 | "id": "b2b5304f-43f4-4509-bf06-9d901fe6b666", 416 | "tags": [] 417 | }, 418 | "outputs": [], 419 | "source": [ 420 | "encoding_steps = [('cat', OrdinalEncoder(), categorical_ix), ('num', MinMaxScaler(), numerical_ix)]\n", 421 | "col_transform = ColumnTransformer(transformers=encoding_steps)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": { 428 | "id": "40b974d3-afc0-46a0-a490-01984513cdae", 429 | "tags": [] 430 | }, 431 | "outputs": [], 432 | "source": [ 433 | "pipeline = Pipeline(steps=[('prep',col_transform)])" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": { 440 | "id": "221e73d5-e43a-4a1e-a4b0-1203a67ce96e", 441 | "tags": [] 442 | }, 443 | "outputs": [], 444 | "source": [ 445 | "train_final = pd.concat([X_train_final,y_train],axis=1)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": { 452 | "id": "2b074997-65a5-4e8a-8ab5-c8594938c4ae", 453 | "tags": [] 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "test_final = pd.concat([X_test_final,y_test],axis=1)" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": { 463 | "id": "ad254eee-5c2d-4f2f-802b-9bea72d23d16" 464 | }, 465 | "source": [ 466 | "## Save the Prepared Data to the project filesystem" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": { 473 | "id": "4ac74b08-e76a-435c-8ded-95a4e6907e4b", 474 | "tags": [] 475 | }, 476 | "outputs": [], 477 | "source": [ 478 | "vars_and_utils.save_data_in_filesystem(df=train_final, filename=vars_and_utils.train_data_path)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": { 485 | "id": "5de8e8a2-0401-4fcf-bf1d-e64f9d15c325", 486 | "tags": [] 487 | }, 488 | "outputs": [], 489 | "source": [ 490 | "vars_and_utils.save_data_in_filesystem(df=test_final, filename=vars_and_utils.test_data_path)" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": { 497 | "id": "17576e9f-73ee-422c-8c41-df8debc9a743", 498 | "tags": [] 499 | }, 500 | "outputs": [], 501 | "source": [ 502 | "vars_and_utils.save_data_in_filesystem(df=pipeline, filename=vars_and_utils.pipeline_path)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": { 508 | "id": "b3b2d02f-82bf-4021-8c4d-1a80e8f9d393" 509 | }, 510 | "source": [ 511 | "## Custom succes check: Check if files have been succesfully created " 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": { 518 | "id": "c50b68a4-00b7-474a-9b8b-e750eb28a93d", 519 | "tags": [] 520 | }, 521 | "outputs": [], 522 | "source": [ 523 | "data_prep_done = os.path.exists(vars_and_utils.train_data_path) 
and os.path.exists(vars_and_utils.test_data_path) and os.path.exists(vars_and_utils.pipeline_path)\n", 524 | "data_prep_done" 525 | ] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": { 530 | "id": "da0ba538-74ac-45ff-a262-0a6420ff8759" 531 | }, 532 | "source": [ 533 | "## Register the output variables for the next pipeine stage\n", 534 | "every notebook outputs a \"was_successful\" boolean variable. The logic behind this is different for every notebook and can be altered to fit the needs of the project.\n", 535 | "If needed additional variables can be created here but they also need to registered as output variables in the Watson Pipelines UI." 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": { 542 | "id": "e977c379-c462-4e9e-91c5-3ef319062156", 543 | "tags": [] 544 | }, 545 | "outputs": [], 546 | "source": [ 547 | "preparation_params = {}\n", 548 | "preparation_params['was_succesfull'] = data_prep_done\n", 549 | "\n", 550 | "pipelines_client = WSPipelines.from_token(TOKEN)\n", 551 | "pipelines_client.store_results(preparation_params)" 552 | ] 553 | } 554 | ], 555 | "metadata": { 556 | "kernelspec": { 557 | "display_name": "Python 3.10", 558 | "language": "python", 559 | "name": "python3" 560 | }, 561 | "language_info": { 562 | "codemirror_mode": { 563 | "name": "ipython", 564 | "version": 3 565 | }, 566 | "file_extension": ".py", 567 | "mimetype": "text/x-python", 568 | "name": "python", 569 | "nbconvert_exporter": "python", 570 | "pygments_lexer": "ipython3", 571 | "version": "3.10.10" 572 | }, 573 | "vscode": { 574 | "interpreter": { 575 | "hash": "bd385fe162c5ca0c84973b7dd5c518456272446b2b64e67c2a69f949ca7a1754" 576 | } 577 | } 578 | }, 579 | "nbformat": 4, 580 | "nbformat_minor": 4 581 | } 582 | -------------------------------------------------------------------------------- /04-deploy_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "![Alt text](images/banner.png)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "id": "9909ba97-1cfe-495b-bd18-d663ea13c7fa" 14 | }, 15 | "source": [ 16 | "## Deploy the Saved Model in the project to Deployment Space" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Initial Setup\n", 24 | "\n", 25 | "Some initial setup specific to running this notebook as part of the pipeline." 
26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import os\n", 35 | "#This environment variable is automatically set in WS Pipelines and are needed to access various services.\n", 36 | "TOKEN = os.getenv(\"USER_ACCESS_TOKEN\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "if os.getenv(\"running_in_production_pipeline\"):\n", 46 | " running_in_production_pipeline = True\n", 47 | " # If you want to run additional steps when deploying to production like reporting to external services, you can use this variable to trigger that\n", 48 | " # It can also be used to skip steps that are only needed in development like plotting\n", 49 | " print(\"notebook is running in a production pipeline!\")\n", 50 | "else:\n", 51 | " running_in_production_pipeline = False\n", 52 | " print(\"notebook is running in a development enviroment!\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 26, 58 | "metadata": { 59 | "id": "d600fa56-ad81-4587-95c4-3f6c67e211c4", 60 | "tags": [] 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "from ibm_cloud_sdk_core.authenticators import IAMAuthenticator\n", 65 | "from ibm_watson_machine_learning import APIClient\n", 66 | "# from ibm_aigov_facts_client import AIGovFactsClient #removing due to current issues --> put back ASAP\n", 67 | "from ibm_watson_studio_pipelines import WSPipelines\n", 68 | "from botocore.client import Config\n", 69 | "import ibm_boto3\n", 70 | "import pandas as pd\n", 71 | "import json\n", 72 | "import os\n", 73 | "import requests\n", 74 | "import pickle\n", 75 | "import vars_and_utils as vars_and_utils\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": { 81 | "id": "c892c8cd-46b7-4a16-85cf-0ac039fd8a61" 82 | }, 83 | "source": [ 84 | "## Instantiate WML Client" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "id": "6a324dbf-3194-4a1b-897e-3a74d7ec1346", 92 | "tags": [] 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "WML_CREDENTIALS = {\n", 97 | " \"token\": TOKEN,\n", 98 | " \"instance_id\" : \"openshift\",\n", 99 | " \"url\": os.environ['RUNTIME_ENV_APSX_URL'],\n", 100 | " \"version\": \"4.6\"\n", 101 | "}\n", 102 | "\n", 103 | "wml_client = APIClient(WML_CREDENTIALS)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "if running_in_production_pipeline:\n", 113 | " deployment_space_id=vars_and_utils.deployment_space_id_PROD\n", 114 | "else:\n", 115 | " deployment_space_id=vars_and_utils.deployment_space_id_DEV\n", 116 | " \n", 117 | "deployment_space_id\n", 118 | " " 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "id": "08df0fad-f279-483f-87dc-e0b8d96f14bf", 126 | "tags": [] 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "wml_client.set.default_space(deployment_space_id)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": { 136 | "id": "288ce55e-9b82-4185-8297-a399f1648cea", 137 | "tags": [] 138 | }, 139 | "source": [ 140 | "# Deserialize model\n", 141 | "\n", 142 | "TODO: Later get model from model inventory instead" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "id": "e5bd323f-cf79-4ab1-ae8c-c81a14d81f3b", 150 | 
"tags": [] 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "with open(vars_and_utils.model_path, 'rb') as f:\n", 155 | " model = pickle.load(f)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": { 161 | "id": "40fc914b-9675-46a3-ad8f-e6f53dc72437" 162 | }, 163 | "source": [ 164 | "### Load Sample Data " 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "id": "1f16e30e-c9cb-4581-bead-352c127018d4", 172 | "tags": [] 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "payload_data = vars_and_utils.load_data_from_filesystem(vars_and_utils.test_data_path)\n", 177 | "payload_data = payload_data.drop('Risk',axis=1)\n", 178 | "fields = payload_data.columns.tolist()\n", 179 | "values = [payload_data.values.tolist()[0]]\n", 180 | "\n", 181 | "payload_scoring = {\"input_data\": [{\"fields\": fields, \"values\": values}]}\n", 182 | "json.dumps(payload_scoring)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "id": "eb8a1006-f0a0-4223-8c67-2f4aafe610dd", 190 | "tags": [] 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "# TODO: Move to notebook 3 and skip (de)serialization process\n", 195 | "software_spec_uid = wml_client.software_specifications.get_id_by_name(\"runtime-22.2-py3.10\")\n", 196 | "model_props_gbt = {\n", 197 | " wml_client.repository.ModelMetaNames.NAME: vars_and_utils.model_name,\n", 198 | " wml_client.repository.ModelMetaNames.DESCRIPTION: vars_and_utils.model_name,\n", 199 | " wml_client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: software_spec_uid,\n", 200 | " wml_client.repository.ModelMetaNames.TYPE: \"scikit-learn_1.1\"\n", 201 | "}\n", 202 | "\n", 203 | "published_model_details = wml_client.repository.store_model(model=model, meta_props=model_props_gbt, training_data=fields,training_target=values)\n", 204 | "print(published_model_details)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "id": "920713be-ff4d-4fa3-8dd8-81623801b641", 212 | "tags": [] 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "model_id = wml_client.repository.get_model_id(published_model_details)\n", 217 | "model_id" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "# There is an issue with WML Deployments right now. TODO: fix this\n", 227 | "# This is the ID of the model that was uploaded manually --> that works\n", 228 | "model_id = \"65c060dc-ea5b-4c48-86fe-acb97853b5df\"\n", 229 | "# This is fake and we need to fix this!" 
230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": { 235 | "id": "ad6fe0e3-d01e-4363-83c0-4ac57bd9e3dc" 236 | }, 237 | "source": [ 238 | "## Promote the Model to deployment space and Deploy the Model" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "id": "d25ed89c-62dd-4bc8-8644-3439c9586f8b", 246 | "tags": [] 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "meta_data = {\n", 251 | " wml_client.deployments.ConfigurationMetaNames.NAME: vars_and_utils.deployment_name,\n", 252 | " wml_client.deployments.ConfigurationMetaNames.ONLINE: {},\n", 253 | " wml_client.deployments.ConfigurationMetaNames.HARDWARE_SPEC: {\n", 254 | " \"name\": \"S\",\n", 255 | " \"num_nodes\": 1,\n", 256 | " }\n", 257 | "}\n", 258 | "\n", 259 | "deployment_details = wml_client.deployments.create(model_id, meta_props=meta_data)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": { 266 | "id": "4802233a-fbc2-458e-91b0-a859a2f57d66", 267 | "tags": [] 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "deployment_uid = wml_client.deployments.get_id(deployment_details)\n", 272 | "deployment_uid" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": { 278 | "id": "728d19a0-395f-4b11-a5cd-df099ea4abdb", 279 | "tags": [] 280 | }, 281 | "source": [ 282 | "## Score the Endpoint" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": { 288 | "id": "5f93a3a4-aaa5-4304-a513-e887619e6079" 289 | }, 290 | "source": [ 291 | "### Model Testing on the Serving Endpoint\n", 292 | "\n" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": { 299 | "id": "cb6e54f2-aed6-4f1e-8ec9-8297454f6a07", 300 | "tags": [] 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "predictions = wml_client.deployments.score(deployment_uid, payload_scoring)\n", 305 | "predictions" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": { 311 | "id": "cff76dc2-be3f-48a6-a583-a75d6583e1da" 312 | }, 313 | "source": [ 314 | "### Test for Downstream Apps without using WML SDK." 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": { 321 | "id": "ff65ef37-ba78-4149-9f06-6d2744b1a668", 322 | "tags": [] 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "# deploy_done is true if deployment_uid and model_id are not null\n", 327 | "deploy_done = bool(deployment_uid) and bool(model_id)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": { 333 | "id": "a4032cce-515e-4509-810d-51d216ed9ac3" 334 | }, 335 | "source": [ 336 | "## Register the output variables for the next pipeine stage\n", 337 | "every notebook outputs a \"was_successful\" boolean variable. The logic behind this is different for every notebook and can be altered to fit the needs of the project.\n", 338 | "If needed additional variables can be created here but they also need to registered as output variables in the Watson Pipelines UI." 
339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": { 345 | "id": "96ccb5fc-9d30-42c8-ac75-4141e7c663df", 346 | "tags": [] 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "deployment_done = {}\n", 351 | "deployment_done['was_succesfull'] = deploy_done\n", 352 | "deployment_done['deployment_id'] = deployment_uid\n", 353 | "deployment_done['model_id'] = model_id\n", 354 | "\n", 355 | "pipelines_client = WSPipelines.from_token(TOKEN)\n", 356 | "pipelines_client.store_results(deployment_done)" 357 | ] 358 | } 359 | ], 360 | "metadata": { 361 | "kernelspec": { 362 | "display_name": "Python 3.10", 363 | "language": "python", 364 | "name": "python3" 365 | }, 366 | "language_info": { 367 | "codemirror_mode": { 368 | "name": "ipython", 369 | "version": 3 370 | }, 371 | "file_extension": ".py", 372 | "mimetype": "text/x-python", 373 | "name": "python", 374 | "nbconvert_exporter": "python", 375 | "pygments_lexer": "ipython3", 376 | "version": "3.10.10" 377 | }, 378 | "vscode": { 379 | "interpreter": { 380 | "hash": "bd385fe162c5ca0c84973b7dd5c518456272446b2b64e67c2a69f949ca7a1754" 381 | } 382 | } 383 | }, 384 | "nbformat": 4, 385 | "nbformat_minor": 4 386 | } 387 | -------------------------------------------------------------------------------- /05_monitor_deployment.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","metadata":{"collapsed":true,"id":"75feab07-55ef-4436-ab6c-48d5c2d133e4","jupyter":{"outputs_hidden":true}},"source":["\"banner\""]},{"cell_type":"markdown","metadata":{"id":"b18c0eec7d1c46f6b8fe4c779c478b7b"},"source":["# Working with Watson OpenScale - Headless Subscription"]},{"cell_type":"markdown","metadata":{"id":"fc3ebd630e524b3e812e2d17d603e5c9"},"source":["# Pipeline variables"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"27e86130ede246668dc25badce38f298"},"outputs":[],"source":["# Deploymentspace ID\n","space_uid = 'c4238e9c-1cbd-4776-aa6e-4f6b1f865ed1'\n","\n","# DeploymentID of the model that will be monitored\n","# TODO: make 04_deploy_models emit this and then use that URL\n","deployment_uid = '66271495-2e3e-4ab2-ae2f-521330555bdf'"]},{"cell_type":"markdown","metadata":{"id":"1b7933c7711e412582e7dbf0b4c1ade8"},"source":["# Credentials"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"f3505ba0c4fe4866a93ff0e810e90fe4"},"outputs":[],"source":["from ibm_watson_machine_learning import APIClient\n","import os\n","from dotenv import load_dotenv\n","# Loading Variables and Utils from common python file\n","import vars_and_utils as vars_and_utils\n","\n","load_dotenv()\n","\n","token = os.environ['USER_ACCESS_TOKEN']\n","cpd_technical_user = os.environ['cpd_technical_user']\n","cpd_technical_user_password = os.environ['cpd_technical_user_password']\n","cpd_url = os.environ['cpd_url']\n","\n","\n","\n","WML_CREDENTIALS = {\n"," \"token\": token,\n"," \"instance_id\" : \"openshift\",\n"," \"url\": os.environ['RUNTIME_ENV_APSX_URL'],\n"," \"version\": \"4.7\"\n","}\n","\n","\n","\n","WOS_CREDENTIALS = {\n"," \"url\": cpd_url,\n"," \"username\": cpd_technical_user,\n"," \"password\": cpd_technical_user_password,\n"," \"version\": \"4.7\"\n","}"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["print(WOS_CREDENTIALS)"]},{"cell_type":"markdown","metadata":{"id":"262ffb75c64b4523bfda24ea95d4b8f5"},"source":["# WOS name 
definitions"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"1f4872bb1f6944678d36c12ff05f9b5c"},"outputs":[],"source":["SERVICE_PROVIDER_NAME = \"OpenScale Headless Service Provider\"\n","SERVICE_PROVIDER_DESCRIPTION = \"Added by automated WOS configuration notebook.\"\n","SUBSCRIPTION_NAME = \"AOT Initiative - Headless Subscription\""]},{"cell_type":"markdown","metadata":{"id":"3b7c0a42ac1e43139057e5d78abfe075"},"source":["# Setup - Package installation "]},{"cell_type":"code","execution_count":null,"metadata":{"id":"dc441429a8ad4aada3fa29d81e98e5cf"},"outputs":[],"source":["!pip install --upgrade ibm-watson-machine-learning --user | tail -n 1\n","!pip install --upgrade ibm-watson-openscale --no-cache | tail -n 1\n","!pip install --upgrade \"ibm-metrics-plugin>=4.6.4.0\""]},{"cell_type":"markdown","metadata":{"id":"eb4fc86b293a4dd188bd341121b45aae"},"source":["# Imports"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"e672b2b0765448338e4c5df902054027"},"outputs":[],"source":["import pandas as pd\n","import tarfile\n","from io import BytesIO\n","\n","from ibm_watson_openscale import APIClient\n","from ibm_watson_openscale.utils import *\n","from ibm_watson_openscale.supporting_classes import *\n","from ibm_watson_openscale.supporting_classes.enums import *\n","from ibm_watson_openscale.base_classes.watson_open_scale_v2 import *\n","from ibm_cloud_sdk_core.authenticators import IAMAuthenticator,BearerTokenAuthenticator, CloudPakForDataAuthenticator\n","\n","import json\n","import requests\n","import base64\n","from requests.auth import HTTPBasicAuth\n","import time\n","\n","# disable warnings\n","import warnings\n","warnings.filterwarnings('ignore')"]},{"cell_type":"markdown","metadata":{"id":"2963bb1dc0064426839aa24a0ea45150"},"source":["# Get training data statistics"]},{"cell_type":"markdown","metadata":{"id":"636c77c1db4a48da85ed3dd8c6846a45"},"source":["### Get the training data"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["data_df = vars_and_utils.load_data_from_filesystem(vars_and_utils.raw_data_path)\n","data_df.head()"]},{"cell_type":"markdown","metadata":{"id":"b5afc1d931da4c6495a10f497b5c4d52"},"source":["### Generate the training data stats"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"49a11fd0e1734d6789c54ebb58977f43"},"outputs":[],"source":["from ibm_watson_openscale.utils.training_stats import TrainingStats"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"e99bf24d2ea34f848ce12ea738cc4210"},"outputs":[],"source":["feature_columns = data_df.drop(\"Risk\", axis=1).columns.tolist()\n","cat_features = [x for x in feature_columns if data_df[x].dtype == 'object']\n","class_label = \"Risk\"\n","prediction_column = \"prediction\"\n","probability_column = \"probability\""]},{"cell_type":"code","execution_count":null,"metadata":{"id":"216ed095143047dab3c4332c97b5d31b"},"outputs":[],"source":["service_configuration_support = {\n"," \"enable_fairness\": True,\n"," \"enable_explainability\": True,\n"," \"enable_drift\": True\n","}\n","\n","fairness_attributes = [{\n"," \"feature\": \"Sex\", \n"," \"majority\": [\n"," \"male\"\n"," ],\n"," \"minority\": [\n"," \"female\"\n"," ],\n"," \"threshold\": 0.8\n","}]\n","\n","model_type = \"binary\"\n","parameters = {\n"," \"favourable_class\" : [ \"No Risk\" ],\n"," \"unfavourable_class\": [ \"Risk\" ]\n","}\n","min_records = 100\n","\n","# Generate Training stats\n","enable_explainability = 
service_configuration_support.get('enable_explainability')\n","enable_fairness = service_configuration_support.get('enable_fairness')\n","training_data_stats = None\n","fairness_inputs = None\n","if enable_explainability or enable_fairness:\n"," if enable_fairness:\n"," fairness_inputs = {\n"," \"fairness_attributes\": fairness_attributes,\n"," \"min_records\" : min_records,\n"," \"favourable_class\" : parameters[\"favourable_class\"],\n"," \"unfavourable_class\": parameters[\"unfavourable_class\"]\n"," }\n"," \n","input_parameters = {\n"," \"label_column\": class_label,\n"," \"feature_columns\": feature_columns,\n"," \"categorical_columns\": cat_features,\n"," \"fairness_inputs\": fairness_inputs,\n"," \"problem_type\" : \"binary\",\n"," \"prediction\": prediction_column,\n"," \"probability\": probability_column\n","}"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"b0ddac7d43784eff8f51faac32315fb7"},"outputs":[],"source":["# Note: pass the fairness flag (not the explainability flag) to the fairness argument\n","training_stats = TrainingStats(data_df, input_parameters, explain=enable_explainability, fairness=enable_fairness, drop_na=True)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"69f30aa6275a45c1aabee80b8bc91ba2"},"outputs":[],"source":["training_data_stats = training_stats.get_training_statistics()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ba14fdd0d5fc42148ffa0c70318e0bc0"},"outputs":[],"source":["training_data_stats[\"notebook_version\"] = 5.0\n","print(training_data_stats)"]},{"cell_type":"markdown","metadata":{"id":"cdc330c2-ce03-4665-a5d3-bbb00ee9e2c5"},"source":["### This JSON contains the training statistics"]},{"cell_type":"markdown","metadata":{"id":"9b0707ac1440466492c933e7069aa015"},"source":["# Configure OpenScale \n","\n","The notebook will now import the necessary libraries and set up a Python OpenScale client."]},{"cell_type":"markdown","metadata":{"id":"3043544c667e49d38cd70855d8ccb5ad"},"source":["## Get an instance of the OpenScale SDK client and connect to the WOS datamart\n","\n","Watson OpenScale uses a database to store payload and feedback logs and calculated metrics.
Here we are using an already configured data mart in the Cloud Pak for Data instance."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"1662b2fa228e432c8b6a4b46fd5d5b3f"},"outputs":[],"source":["authenticator = CloudPakForDataAuthenticator(\n"," url=WOS_CREDENTIALS[\"url\"],\n"," username=WOS_CREDENTIALS[\"username\"],\n"," password=WOS_CREDENTIALS[\"password\"],\n"," disable_ssl_verification=True\n"," )\n","\n","# Fallback datamart ID; it is replaced interactively below if authorization fails\n","DATAMART_ID = \"00000000-0000-0000-0000-000000000000\"\n","\n","try:\n"," wos_client = APIClient(authenticator=authenticator, service_url=WOS_CREDENTIALS[\"url\"])\n"," print(\"Authentication Successful\")\n"," data_marts = wos_client.data_marts.list().result.data_marts\n"," data_mart_id=data_marts[0].metadata.id\n"," print('Using existing datamart {}'.format(data_mart_id))\n","except:\n"," print(\"ERROR: Authorization request has been rejected with message: AIQCS0002E : Not authorized to access datamart id `00000000-0000-0000-0000-000000000000`.\")\n"," if DATAMART_ID==\"00000000-0000-0000-0000-000000000000\":\n"," DATAMART_ID=input(\"Please enter your datamart id to authenticate\")\n"," print(\"\\nTrying to authenticate with the DATAMART_ID provided..\")\n"," wos_client = APIClient(authenticator=authenticator, service_url=WOS_CREDENTIALS[\"url\"], service_instance_id=DATAMART_ID)\n"," print(\"Authentication Successful.\")"]},{"cell_type":"markdown","metadata":{"id":"b1ecf806072240cd89af4324cb844744"},"source":["## Remove existing service provider\n","\n","Multiple service providers for the same engine instance are available in Watson OpenScale. To avoid duplicate service providers for the WML instance used in this notebook, the following code deletes any existing service provider(s) and then adds a new one."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ebd41ed59d0a44b3837152bdd67e10fc"},"outputs":[],"source":["service_providers = wos_client.service_providers.list().result.service_providers\n","for service_provider in service_providers:\n"," service_instance_name = service_provider.entity.name\n"," if service_instance_name == SERVICE_PROVIDER_NAME:\n"," service_provider_id = service_provider.metadata.id\n"," wos_client.service_providers.delete(service_provider_id)\n"," print(\"Deleted existing service_provider for WML instance: {}\".format(service_provider_id))"]},{"cell_type":"markdown","metadata":{"id":"48cf5770290744c28274669a5af0103b"},"source":["## Add service provider\n","\n","Watson OpenScale needs to be bound to the Watson Machine Learning instance to capture payload data into and out of the model.\n","\n","Note: Here the service provider is created with empty credentials, meaning no endpoint.
This demonstrates the use case where we don't need an actual endpoint serving requests."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"2d260d321cad475682aea0857c9b1054"},"outputs":[],"source":["MLCredentials = {}\n","added_service_provider_result = wos_client.service_providers.add(\n"," name=SERVICE_PROVIDER_NAME,\n"," description=SERVICE_PROVIDER_DESCRIPTION,\n"," service_type=ServiceTypes.CUSTOM_MACHINE_LEARNING,\n"," operational_space_id = \"production\",\n"," credentials=MLCredentials,\n"," background_mode=False\n"," ).result\n","service_provider_id = added_service_provider_result.metadata.id"]},{"cell_type":"markdown","metadata":{"id":"09f4993d2caa48269b9832c7e9b145d6"},"source":["## Subscriptions"]},{"cell_type":"markdown","metadata":{"id":"2e59906593c44bda97f79a9c35e728f6"},"source":["This code removes previous subscriptions to the model to refresh the monitors with the new model and new data."]},{"cell_type":"markdown","metadata":{"id":"b0d085c8a875458b834992abd3577219"},"source":["## Remove the existing subscription"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"f5ef621a07cb4e468fca6cc62d22595e"},"outputs":[],"source":["subscriptions = wos_client.subscriptions.list().result.subscriptions\n","for subscription in subscriptions:\n"," if subscription.entity.asset.name == '[asset] ' + SUBSCRIPTION_NAME:\n"," sub_model_id = subscription.metadata.id\n"," wos_client.subscriptions.delete(subscription.metadata.id)\n"," print('Deleted existing subscription for model', sub_model_id)"]},{"cell_type":"markdown","metadata":{"id":"d117477d751b4d6380d2feceac14450e"},"source":["This code creates the model subscription in OpenScale using the Python client API. Note that we need to provide the model's unique identifier and some information about the model itself."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"e2df727978ef4fbe8062c30a63944b6d"},"outputs":[],"source":["print(\"Data Mart ID: \" + data_mart_id)\n","print(\"Service Provider ID: \" + service_provider_id)\n","import uuid\n","asset_id = str(uuid.uuid4())\n","asset_name = '[asset] ' + SUBSCRIPTION_NAME\n","url = None\n","\n","asset_deployment_id = str(uuid.uuid4())\n","asset_deployment_name = asset_name"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"1de20abba6d84c6c8ba84cd3ed3bc96e"},"outputs":[],"source":["probability_columns = ['probability']\n","predicted_target_column = prediction_column\n","\n","subscription_details = wos_client.subscriptions.add(data_mart_id,\n"," service_provider_id,\n"," asset=Asset(\n"," asset_id=asset_id,\n"," name=asset_name,\n"," url=url,\n"," asset_type=AssetTypes.MODEL,\n"," input_data_type=InputDataType.STRUCTURED,\n"," problem_type=ProblemType.BINARY_CLASSIFICATION\n"," ),\n"," deployment=None, \n"," training_data_stats=training_data_stats, \n"," prediction_field = prediction_column,\n"," predicted_target_field = predicted_target_column,\n"," probability_fields = probability_columns,\n"," background_mode = False,\n"," deployment_name = asset_name\n"," ).result\n","\n","subscription_id = subscription_details.metadata.id\n","print(\"Subscription id {}\".format(subscription_id))"]},{"cell_type":"markdown","metadata":{"id":"e9f01d4d-7c2c-4fb4-ab8a-95c3034dfb9c"},"source":["### The following code fetches the data set ID against which we will perform the payload logging"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"9dee41fc8ae2407483f7d08f7ecd9663"},"outputs":[],"source":["import
time\n","\n","time.sleep(5)\n","payload_data_set_id = None\n","payload_data_set_id = wos_client.data_sets.list(type=DataSetTypes.PAYLOAD_LOGGING, \n"," target_target_id=subscription_id, \n"," target_target_type=TargetTypes.SUBSCRIPTION).result.data_sets[0].metadata.id\n","if payload_data_set_id is None:\n"," print(\"Payload data set not found. Please check subscription status.\")\n","else:\n"," print(\"Payload data set id:\", payload_data_set_id)"]},{"cell_type":"markdown","metadata":{"id":"2d75c57ab8914d70acdc3ed523675b1b"},"source":["## Push a payload record to setup the required schemas in the subscription\n","\n","This is the location where one needs to fetch the output of the batch scoring model and construct the payload as per the OpenScale Payload Logging format.\n","\n","Note : No scoring is done against the model. The PayloadRecord is constructed with the request and response from the model/deployment."]},{"cell_type":"markdown","metadata":{"id":"7ea1d04fe73a452685847cfc4f780256"},"source":["## Scoring Request Payload"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"cb53d8ad950146bda2fd9ad017112835"},"outputs":[],"source":["scoring_request = {\n"," \"fields\": [\n"," \"CheckingStatus\",\n"," \"LoanDuration\",\n"," \"CreditHistory\",\n"," \"LoanPurpose\",\n"," \"LoanAmount\",\n"," \"ExistingSavings\",\n"," \"EmploymentDuration\",\n"," \"InstallmentPercent\",\n"," \"Sex\",\n"," \"OthersOnLoan\",\n"," \"CurrentResidenceDuration\",\n"," \"OwnsProperty\",\n"," \"Age\",\n"," \"InstallmentPlans\",\n"," \"Housing\",\n"," \"ExistingCreditsCount\",\n"," \"Job\",\n"," \"Dependents\",\n"," \"Telephone\",\n"," \"ForeignWorker\",\n"," \"Risk\"\n"," ],\n"," \"values\": [\n"," [\n"," \"no_checking\",\n"," 28,\n"," \"outstanding_credit\",\n"," \"appliances\",\n"," 5990,\n"," \"500_to_1000\",\n"," \"greater_7\",\n"," 5,\n"," \"male\",\n"," \"co-applicant\",\n"," 3,\n"," \"car_other\",\n"," 55,\n"," \"none\",\n"," \"free\",\n"," 2,\n"," \"skilled\",\n"," 2,\n"," \"yes\",\n"," \"yes\",\n"," \"Risk\"\n"," ],\n"," [\n"," \"greater_200\",\n"," 22,\n"," \"all_credits_paid_back\",\n"," \"car_used\",\n"," 3376,\n"," \"less_100\",\n"," \"less_1\",\n"," 3,\n"," \"female\",\n"," \"none\",\n"," 2,\n"," \"car_other\",\n"," 32,\n"," \"none\",\n"," \"own\",\n"," 1,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\"\n"," ],\n"," [\n"," \"no_checking\",\n"," 39,\n"," \"credits_paid_to_date\",\n"," \"vacation\",\n"," 6434,\n"," \"unknown\",\n"," \"greater_7\",\n"," 5,\n"," \"male\",\n"," \"none\",\n"," 4,\n"," \"car_other\",\n"," 39,\n"," \"none\",\n"," \"own\",\n"," 2,\n"," \"skilled\",\n"," 2,\n"," \"yes\",\n"," \"yes\",\n"," \"Risk\"\n"," ],\n"," [\n"," \"0_to_200\",\n"," 20,\n"," \"credits_paid_to_date\",\n"," \"furniture\",\n"," 2442,\n"," \"less_100\",\n"," \"unemployed\",\n"," 3,\n"," \"female\",\n"," \"none\",\n"," 1,\n"," \"real_estate\",\n"," 42,\n"," \"none\",\n"," \"own\",\n"," 1,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\"\n"," ],\n"," [\n"," \"greater_200\",\n"," 4,\n"," \"all_credits_paid_back\",\n"," \"education\",\n"," 4206,\n"," \"less_100\",\n"," \"unemployed\",\n"," 1,\n"," \"female\",\n"," \"none\",\n"," 3,\n"," \"savings_insurance\",\n"," 27,\n"," \"none\",\n"," \"own\",\n"," 1,\n"," \"management_self-employed\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\"\n"," ],\n"," [\n"," \"greater_200\",\n"," 23,\n"," \"credits_paid_to_date\",\n"," \"car_used\",\n"," 2963,\n"," \"greater_1000\",\n"," \"greater_7\",\n"," 4,\n"," 
\"male\",\n"," \"none\",\n"," 4,\n"," \"car_other\",\n"," 46,\n"," \"none\",\n"," \"own\",\n"," 2,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"Risk\"\n"," ],\n"," [\n"," \"no_checking\",\n"," 31,\n"," \"prior_payments_delayed\",\n"," \"vacation\",\n"," 2673,\n"," \"500_to_1000\",\n"," \"1_to_4\",\n"," 3,\n"," \"male\",\n"," \"none\",\n"," 2,\n"," \"real_estate\",\n"," 35,\n"," \"stores\",\n"," \"rent\",\n"," 1,\n"," \"skilled\",\n"," 2,\n"," \"none\",\n"," \"yes\",\n"," \"Risk\"\n"," ],\n"," [\n"," \"no_checking\",\n"," 37,\n"," \"prior_payments_delayed\",\n"," \"other\",\n"," 6971,\n"," \"500_to_1000\",\n"," \"1_to_4\",\n"," 3,\n"," \"male\",\n"," \"none\",\n"," 3,\n"," \"savings_insurance\",\n"," 54,\n"," \"none\",\n"," \"own\",\n"," 2,\n"," \"skilled\",\n"," 1,\n"," \"yes\",\n"," \"yes\",\n"," \"Risk\"\n"," ],\n"," [\n"," \"no_checking\",\n"," 14,\n"," \"all_credits_paid_back\",\n"," \"car_new\",\n"," 1525,\n"," \"500_to_1000\",\n"," \"4_to_7\",\n"," 3,\n"," \"male\",\n"," \"none\",\n"," 4,\n"," \"real_estate\",\n"," 33,\n"," \"none\",\n"," \"own\",\n"," 1,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\"\n"," ],\n"," [\n"," \"less_0\",\n"," 10,\n"," \"prior_payments_delayed\",\n"," \"furniture\",\n"," 4037,\n"," \"less_100\",\n"," \"4_to_7\",\n"," 3,\n"," \"male\",\n"," \"none\",\n"," 3,\n"," \"savings_insurance\",\n"," 31,\n"," \"none\",\n"," \"rent\",\n"," 1,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"Risk\"\n"," ]\n"," ]\n"," }"]},{"cell_type":"markdown","metadata":{"id":"3e469b9bf6cc4baa855d4954b28b5ff7"},"source":["## Scoring Response Payload"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"12303ea22a7146b2ab77b898a3fdffe2"},"outputs":[],"source":["scoring_response = {\n"," \"predictions\": [\n"," {\n"," \"fields\": [\n"," \"prediction\",\n"," \"probability\"\n"," ],\n"," \"values\": [\n"," [\n"," \"Risk\",\n"," [\n"," 0.104642951112211,\n"," 0.895357048887789\n"," ]\n"," ],\n"," [\n"," \"No Risk\",\n"," [\n"," 0.892112895920181,\n"," 0.10788710407981907\n"," ]\n"," ],\n"," [\n"," \"Risk\",\n"," [\n"," 0.4863177905287259,\n"," 0.5136822094712741\n"," ]\n"," ],\n"," [\n"," \"No Risk\",\n"," [\n"," 0.980811537315731,\n"," 0.01918846268426898\n"," ]\n"," ],\n"," [\n"," \"No Risk\",\n"," [\n"," 0.9053052561083984,\n"," 0.09469474389160164\n"," ]\n"," ],\n"," [\n"," \"No Risk\",\n"," [\n"," 0.5315146773053994,\n"," 0.4684853226946007\n"," ]\n"," ],\n"," [\n"," \"No Risk\",\n"," [\n"," 0.7689466209701616,\n"," 0.23105337902983833\n"," ]\n"," ],\n"," [\n"," \"Risk\",\n"," [\n"," 0.41317664143643873,\n"," 0.5868233585635613\n"," ]\n"," ],\n"," [\n"," \"No Risk\",\n"," [\n"," 0.9190247585206522,\n"," 0.08097524147934775\n"," ]\n"," ],\n"," [\n"," \"No Risk\",\n"," [\n"," 0.781841942776921,\n"," 0.21815805722307902\n"," ]\n"," ]\n"," ]\n"," }\n"," ]\n","}"]},{"cell_type":"markdown","metadata":{"id":"8030376f-faad-418a-85d1-b1c8f930da2d"},"source":["### Construct the payload using the scoring_request and scoring_response and then log the records"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"edb0cf12b0164869879f0ea101f520e0"},"outputs":[],"source":["from ibm_watson_openscale.supporting_classes.payload_record import PayloadRecord\n","\n","records_list=[]\n","for x in range(10):\n"," pl_record = PayloadRecord(request=scoring_request, response=scoring_response)\n"," records_list.append(pl_record)\n","\n","wos_client.data_sets.store_records(data_set_id=payload_data_set_id, 
request_body=records_list)"]},{"cell_type":"markdown","metadata":{"id":"d0c60596-0d19-4997-836c-d0aeade843bc"},"source":["### Make sure the records reached the payload logging table inside the OpenScale DataMart."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"933b5220cd9d4c5681f54308ce266ade"},"outputs":[],"source":["import time\n","time.sleep(30)\n","pl_records_count = wos_client.data_sets.get_records_count(payload_data_set_id)\n","print(\"Number of records in the payload logging table: {}\".format(pl_records_count))\n","if pl_records_count == 0:\n"," raise Exception(\"Payload logging did not happen!\")"]},{"cell_type":"markdown","metadata":{"id":"c9154f4c69ad4c4db3c14d8f0bc6236c"},"source":["# Explainability Monitor Configuration\n","From the notebook, perform offline scoring against the customer model, create an explain archive, and save this archive to the data mart.\n","\n","* Only local explanations and LIME global explanations are supported.\n","* Because contrastive explanations require live scoring and this is a headless subscription without any deployment URL, contrastive explanations are not supported."]},{"cell_type":"markdown","metadata":{"id":"aac1dc85b142421290d5096d9646982b"},"source":["## Score the perturbations\n","\n","Here, this notebook uses a credit risk model deployment in WML. This can be replaced with the scoring engine of your choice, as long as the scoring response is in a format that OpenScale understands for monitor processing."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"a8cfbd042be041cd8f807ada9a55c477"},"outputs":[],"source":["import json\n","from ibm_watson_machine_learning import APIClient\n","\n","wml_client = APIClient(WML_CREDENTIALS)\n","wml_client.set.default_space(space_uid) # connect to deployment space"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"7f491c0a56df457c819324e6419cf876"},"outputs":[],"source":["perturbs_df = data_df.copy()\n","cols_to_remove = [\"Risk\"]"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"3da3e7a8d7544b84822132a418b683ea"},"outputs":[],"source":["def get_scoring_payload(no_of_records_to_score = 1):\n"," # note: no_of_records_to_score is currently unused; all rows are scored\n"," for col in cols_to_remove:\n"," if col in perturbs_df.columns:\n"," del perturbs_df[col] \n","\n"," fields = perturbs_df.columns.tolist()\n"," training_data_rows = perturbs_df[fields].values.tolist()\n","\n"," payload_scoring = {\"input_data\": [{\n"," \"fields\": fields, \n"," \"values\": [x for x in training_data_rows]\n"," }]} \n"," return payload_scoring\n","\n","def sample_scoring(no_of_records_to_score = 1):\n"," job_payload_ref = get_scoring_payload(no_of_records_to_score)\n"," score = wml_client.deployments.score(deployment_uid, meta_props=job_payload_ref)\n"," # return the live scoring response, not the static example payload\n"," return job_payload_ref, score\n","\n","payload_scoring, scoring_response = sample_scoring(no_of_records_to_score = 5000)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"483e8d7fd6fe46739ba5e280f43d43e2"},"outputs":[],"source":["fields = scoring_response['predictions'][0]['fields']\n","values = scoring_response['predictions'][0]['values']\n","scored_data = pd.DataFrame(values, columns = fields)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"7574a842cdc34331b30fd74475b7c072"},"outputs":[],"source":["probabilities = [pro for pro in scored_data['probability']]\n","predictions = [pre for pre in scored_data['prediction']]\n","\n","explain_perturb_payload = {'probabilities' : probabilities,\n"," 'predictions' :
predictions}"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"f1ef24fb6ced43549e83dcacb890eb0c"},"outputs":[],"source":["with open('explain_scoring_response.json', 'w') as outfile:\n"," json.dump(explain_perturb_payload, outfile)\n"," \n","file_name = 'explain_scoring_response.tar.gz'\n","\n","with tarfile.open(file_name, 'w:gz') as archive:\n"," archive.add('explain_scoring_response.json')\n","\n","with open(file_name, 'rb') as fh:\n"," buf = BytesIO(fh.read())\n","buf = open(file_name, mode=\"rb\").read() "]},{"cell_type":"code","execution_count":null,"metadata":{"id":"7b9f223d5ad849f380729e8b058d7f6c"},"outputs":[],"source":["with open(\"explain_scoring_response.tar.gz\", mode=\"rb\") as perturbations_tar:\n"," wos_client.monitor_instances.upload_explainability_archive(subscription_id=subscription_id, archive=perturbations_tar)\n","\n","print(\"Uploaded perturbations scoring response archive successfully.\")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"6d3660b4a9a54fb8875c36009694d50a"},"outputs":[],"source":["target = Target(\n"," target_type=TargetTypes.SUBSCRIPTION,\n"," target_id=subscription_id\n",")\n","\n","parameters = {\n"," \"enabled\": True\n","}\n","\n","print(\"Creating monitor instances...\")\n","response = wos_client.monitor_instances.create(monitor_definition_id = None, \n"," target = None, data_mart_id = data_mart_id, training_data_stats=training_data_stats, \n"," subscription_id=subscription_id,background_mode=False, parameters = parameters)\n","print(response)"]},{"cell_type":"markdown","metadata":{"id":"864da785a62b4bb782651e7a89b04837"},"source":["# Quality monitoring and feedback logging"]},{"cell_type":"markdown","metadata":{"id":"2cfa62a024bd4951827899957267c0ae"},"source":["## Enable quality monitoring\n","\n","The code below waits ten seconds to allow the payload logging table to be set up before it begins enabling monitors. First, it turns on the quality (accuracy) monitor and sets an alert threshold of 70%. OpenScale will show an alert on the dashboard if the model accuracy measurement (area under the curve, in the case of a binary classifier) falls below this threshold.\n","\n","The second paramater supplied, min_records, specifies the minimum number of feedback records OpenScale needs before it calculates a new measurement. 
The quality monitor runs hourly, but the accuracy reading in the dashboard will not change until an additional 100 feedback records have been added, via the user interface, the Python client, or the supplied feedback endpoint."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"315eff5637f3499789af5bff0e952c27"},"outputs":[],"source":["import time\n","\n","time.sleep(10)\n","target = Target(\n"," target_type=TargetTypes.SUBSCRIPTION,\n"," target_id=subscription_id\n",")\n","parameters = {\n"," \"min_feedback_data_size\": 100\n","}\n","thresholds = [\n"," {\n"," \"metric_id\": \"area_under_roc\",\n"," \"type\": \"lower_limit\",\n"," \"value\": .80\n"," }\n"," ]\n","quality_monitor_details = wos_client.monitor_instances.create(\n"," data_mart_id=data_mart_id,\n"," background_mode=False,\n"," monitor_definition_id=wos_client.monitor_definitions.MONITORS.QUALITY.ID,\n"," target=target,\n"," parameters=parameters,\n"," thresholds=thresholds\n",").result"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"4854f81d452a4e41804bf69fffd23793"},"outputs":[],"source":["quality_monitor_instance_id = quality_monitor_details.metadata.id\n","quality_monitor_instance_id"]},{"cell_type":"markdown","metadata":{"id":"37088b2299714a2d8a7ff123dca6f67b"},"source":["## Get feedback logging dataset ID"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"eb0240054545408e886dc81e11073107"},"outputs":[],"source":["feedback_dataset_id = None\n","feedback_dataset = wos_client.data_sets.list(type=DataSetTypes.FEEDBACK, \n"," target_target_id=subscription_id, \n"," target_target_type=TargetTypes.SUBSCRIPTION).result\n","feedback_dataset_id = feedback_dataset.data_sets[0].metadata.id\n","if feedback_dataset_id is None:\n"," print(\"Feedback data set not found.
Please check quality monitor status.\")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"d3440261ffba406589148380ed0296a9"},"outputs":[],"source":["feedback_dataset_id"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"37812e16eee44208819af7dec9d14986"},"outputs":[],"source":["feedback_payload = {\n"," \"fields\": [\n"," \"CheckingStatus\",\n"," \"LoanDuration\",\n"," \"CreditHistory\",\n"," \"LoanPurpose\",\n"," \"LoanAmount\",\n"," \"ExistingSavings\",\n"," \"EmploymentDuration\",\n"," \"InstallmentPercent\",\n"," \"Sex\",\n"," \"OthersOnLoan\",\n"," \"CurrentResidenceDuration\",\n"," \"OwnsProperty\",\n"," \"Age\",\n"," \"InstallmentPlans\",\n"," \"Housing\",\n"," \"ExistingCreditsCount\",\n"," \"Job\",\n"," \"Dependents\",\n"," \"Telephone\",\n"," \"ForeignWorker\",\n"," \"Risk\",\n"," \"_original_probability\",\n"," \"_original_prediction\",\n"," \"_debiased_probability\",\n"," \"_debiased_prediction\" \n"," ],\n"," \"values\": [\n"," [\n"," \"less_0\",\n"," 18,\n"," \"credits_paid_to_date\",\n"," \"car_new\",\n"," 462,\n"," \"less_100\",\n"," \"1_to_4\",\n"," 2,\n"," \"female\",\n"," \"none\",\n"," 2,\n"," \"savings_insurance\",\n"," 37,\n"," \"stores\",\n"," \"own\",\n"," 2,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\",\n"," [\n"," 0.767955712021837,\n"," 0.23204428797816307\n"," ],\n"," \"Risk\",\n"," [\n"," 0.767955712021837,\n"," 0.23204428797816307\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"less_0\",\n"," 15,\n"," \"prior_payments_delayed\",\n"," \"furniture\",\n"," 250,\n"," \"less_100\",\n"," \"1_to_4\",\n"," 2,\n"," \"male\",\n"," \"none\",\n"," 3,\n"," \"real_estate\",\n"," 28,\n"," \"none\",\n"," \"own\",\n"," 2,\n"," \"skilled\",\n"," 1,\n"," \"yes\",\n"," \"no\",\n"," \"No Risk\",\n"," [\n"," 0.7419002139563244,\n"," 0.25809978604367556\n"," ],\n"," \"Risk\",\n"," [\n"," 0.767955712021837,\n"," 0.23204428797816307\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"0_to_200\",\n"," 28,\n"," \"credits_paid_to_date\",\n"," \"retraining\",\n"," 3693,\n"," \"less_100\",\n"," \"greater_7\",\n"," 3,\n"," \"male\",\n"," \"none\",\n"," 2,\n"," \"savings_insurance\",\n"," 32,\n"," \"none\",\n"," \"own\",\n"," 1,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\",\n"," [\n"," 0.6935080115729353,\n"," 0.3064919884270647\n"," ],\n"," \"Risk\",\n"," [\n"," 0.8,\n"," 0.2\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"no_checking\",\n"," 28,\n"," \"prior_payments_delayed\",\n"," \"education\",\n"," 6235,\n"," \"500_to_1000\",\n"," \"greater_7\",\n"," 3,\n"," \"male\",\n"," \"none\",\n"," 3,\n"," \"unknown\",\n"," 57,\n"," \"none\",\n"," \"own\",\n"," 2,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"Risk\",\n"," [\n"," 0.331110352092386,\n"," 0.668889647907614\n"," ],\n"," \"Risk\",\n"," [\n"," 0.9,\n"," 0.1\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"no_checking\",\n"," 32,\n"," \"outstanding_credit\",\n"," \"vacation\",\n"," 9604,\n"," \"500_to_1000\",\n"," \"greater_7\",\n"," 6,\n"," \"male\",\n"," \"co-applicant\",\n"," 5,\n"," \"unknown\",\n"," 57,\n"," \"none\",\n"," \"free\",\n"," 2,\n"," \"skilled\",\n"," 2,\n"," \"yes\",\n"," \"yes\",\n"," \"Risk\",\n"," [\n"," 0.11270206970758759,\n"," 0.8872979302924124\n"," ],\n"," \"Risk\",\n"," [\n"," 0.1,\n"," 0.9\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"no_checking\",\n"," 9,\n"," \"prior_payments_delayed\",\n"," \"car_new\",\n"," 1032,\n"," \"100_to_500\",\n"," \"4_to_7\",\n"," 3,\n"," \"male\",\n"," \"none\",\n"," 4,\n"," \"savings_insurance\",\n"," 
41,\n"," \"none\",\n"," \"own\",\n"," 1,\n"," \"management_self-employed\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\",\n"," [\n"," 0.6704819620865308,\n"," 0.32951803791346923\n"," ],\n"," \"Risk\",\n"," [\n"," 0.767955712021837,\n"," 0.23204428797816307\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"less_0\",\n"," 16,\n"," \"credits_paid_to_date\",\n"," \"vacation\",\n"," 3109,\n"," \"less_100\",\n"," \"4_to_7\",\n"," 3,\n"," \"female\",\n"," \"none\",\n"," 1,\n"," \"car_other\",\n"," 36,\n"," \"none\",\n"," \"own\",\n"," 2,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\",\n"," [\n"," 0.6735810290914039,\n"," 0.3264189709085961\n"," ],\n"," \"Risk\",\n"," [\n"," 0.6,\n"," 0.4\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"0_to_200\",\n"," 11,\n"," \"credits_paid_to_date\",\n"," \"car_new\",\n"," 4553,\n"," \"less_100\",\n"," \"less_1\",\n"," 3,\n"," \"female\",\n"," \"none\",\n"," 3,\n"," \"savings_insurance\",\n"," 22,\n"," \"none\",\n"," \"own\",\n"," 1,\n"," \"management_self-employed\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\",\n"," [\n"," 0.637964656269084,\n"," 0.362035343730916\n"," ],\n"," \"Risk\",\n"," [\n"," 0.767955712021837,\n"," 0.23204428797816307\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"no_checking\",\n"," 35,\n"," \"outstanding_credit\",\n"," \"appliances\",\n"," 7138,\n"," \"500_to_1000\",\n"," \"greater_7\",\n"," 5,\n"," \"male\",\n"," \"co-applicant\",\n"," 4,\n"," \"unknown\",\n"," 49,\n"," \"none\",\n"," \"free\",\n"," 2,\n"," \"skilled\",\n"," 2,\n"," \"yes\",\n"," \"yes\",\n"," \"Risk\",\n"," [\n"," 0.11270206970758759,\n"," 0.8872979302924124\n"," ],\n"," \"Risk\",\n"," [\n"," 0.767955712021837,\n"," 0.23204428797816307\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"less_0\",\n"," 5,\n"," \"all_credits_paid_back\",\n"," \"car_new\",\n"," 1523,\n"," \"less_100\",\n"," \"unemployed\",\n"," 2,\n"," \"female\",\n"," \"none\",\n"," 2,\n"," \"real_estate\",\n"," 19,\n"," \"none\",\n"," \"rent\",\n"," 1,\n"," \"management_self-employed\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\",\n"," [\n"," 0.7304597628653227,\n"," 0.26954023713467745\n"," ],\n"," \"Risk\",\n"," [\n"," 0.767955712021837,\n"," 0.23204428797816307\n"," ],\n"," \"Risk\"\n"," ]\n"," ]\n","}"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"64005c8a27d64081891606d683a3ed41"},"outputs":[],"source":["import urllib3, requests, json\n","from requests.auth import HTTPBasicAuth\n","def generate_access_token():\n"," headers={}\n"," headers[\"Accept\"] = \"application/json\"\n"," auth = HTTPBasicAuth(WOS_CREDENTIALS[\"username\"], WOS_CREDENTIALS[\"password\"])\n"," \n"," ICP_TOKEN_URL= WOS_CREDENTIALS[\"url\"] + \"/v1/preauth/validateAuth\"\n"," \n"," response = requests.get(ICP_TOKEN_URL, headers=headers, auth=auth, verify=False)\n"," json_data = response.json()\n"," icp_access_token = json_data['accessToken']\n"," return icp_access_token\n","icp_access_token = generate_access_token()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"f12e73b3c73345fba2ba5c486f7c4bb4"},"outputs":[],"source":["header = {\n"," 'Content-Type': 'application/json', \n"," 'Authorization': 'Bearer ' + icp_access_token\n","}"]},{"cell_type":"markdown","metadata":{"id":"8e2cf6e368ee414d8ffe8f178bbf4294"},"source":["### Store the feedback payload using the data sets API\n","\n","There are two ways OpenScale APIs can be used - a) using OpenScale Python SDK b) using OpenScale REST APIs.\n","\n","For any reason if in the customer environment one cannot use the 
SDK, the alternative is to use the REST APIs. The cell below demonstrates how to invoke one such OpenScale REST API to log the feedback records to the OpenScale DataMart."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"f2a376a7df1642c189f1920445012f6a"},"outputs":[],"source":["DATASETS_STORE_RECORDS_URL = WOS_CREDENTIALS[\"url\"] + \"/openscale/{0}/v2/data_sets/{1}/records\".format(data_mart_id, feedback_dataset_id)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"19d83529ce1d44a59d6ed709ccdd4e42"},"outputs":[],"source":["for x in range(10):\n"," response = requests.post(DATASETS_STORE_RECORDS_URL, json=feedback_payload, headers=header, verify=False)\n"," json_data = response.json()\n"," print(json_data)"]},{"cell_type":"markdown","metadata":{"id":"373ef04bff5b4772833b6b1120fd2314"},"source":["### Wait for some time, and make sure the records have reached the corresponding data set table."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"88c162434aa24e018a131ca10c188d5c"},"outputs":[],"source":["time.sleep(30)\n","DATASETS_STORE_RECORDS_URL = WOS_CREDENTIALS[\"url\"] + \"/openscale/{0}/v2/data_sets/{1}/records?limit={2}&include_total_count={3}\".format(data_mart_id, feedback_dataset_id, 1, \"true\")\n","response = requests.get(DATASETS_STORE_RECORDS_URL, headers=header, verify=False)\n","json_data = response.json()\n","print(json_data['total_count'])"]},{"cell_type":"markdown","metadata":{"id":"96bdfc3a376d44a585432889b033f060"},"source":["## Run Quality Monitor"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"adfd17e52f3e4c4f88c4fe8016c4d6b0"},"outputs":[],"source":["run_details = wos_client.monitor_instances.run(monitor_instance_id=quality_monitor_instance_id, background_mode=False).result"]},{"cell_type":"markdown","metadata":{"id":"37c0967b10a1460c817d7eebe273d549"},"source":["# Drift Configuration"]},{"cell_type":"markdown","metadata":{"id":"bde7dd5dd58c46049d38d0943bdad90d"},"source":["# Scoring function for drift configuration"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"2767b93cd00b4af58e9bd5b8492f0605"},"outputs":[],"source":["def score(training_data_frame):\n"," \n"," # The data type of the label column and the prediction column should be the same.\n"," # The label column and the prediction column must have the same unique class labels.\n"," \n"," feature_columns = list(training_data_frame.columns)\n"," if class_label in feature_columns:\n"," feature_columns.remove(class_label)\n"," training_data_rows = training_data_frame[feature_columns].values.tolist()\n"," \n"," payload_scoring = {\n"," wml_client.deployments.ScoringMetaNames.INPUT_DATA: [{\n"," \"fields\": feature_columns,\n"," \"values\": [x for x in training_data_rows]\n"," }]\n"," }\n","\n"," score = wml_client.deployments.score(deployment_uid, payload_scoring)\n"," score_predictions = score.get('predictions')[0]\n","\n"," prob_col_index = list(score_predictions.get('fields')).index(probability_column)\n"," predict_col_index = list(score_predictions.get('fields')).index(prediction_column)\n","\n"," if prob_col_index < 0 or predict_col_index < 0:\n"," raise Exception(\"Missing prediction/probability column in the scoring response\")\n","\n"," import numpy as np\n"," probability_array = np.array([value[prob_col_index] for value in score_predictions.get('values')])\n"," prediction_vector = np.array([value[predict_col_index] for value in score_predictions.get('values')])\n","\n"," return probability_array,
prediction_vector"]},{"cell_type":"markdown","metadata":{"id":"033d1ee83da3498ab2b1981ea9d3f144"},"source":["## Create the drift detection model archive"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"226adfb1fd1c4e628582a894746bf9be"},"outputs":[],"source":["probability_array, prediction_vector = score(data_df)"]},{"cell_type":"markdown","metadata":{"id":"cb07a0651c994d7f9c51032f7eaa4b11"},"source":["# Payload and Feedback dataset id for pushing model information into the datamart"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"89e9a902168843109e9f84f68a3fb216"},"outputs":[],"source":["print(\"Payload data set id:\", payload_data_set_id)\n","print(\"Feedback data set id:\", feedback_dataset_id)"]},{"cell_type":"markdown","metadata":{"id":"48c3a513d1a44de782f747574f431d66"},"source":["Authors: Moritz Scheele (moritz.scheele@ibm.com) and Ravi Chamarthy (ravi.chamarthy@in.ibm.com)"]}],"metadata":{"kernelspec":{"display_name":"Python 3.10","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.10"},"vscode":{"interpreter":{"hash":"dbf53fce1b4f805dfa7248c873cd5919f94c1d61eaf85ef20024e88fdf9444a6"}}},"nbformat":4,"nbformat_minor":4} 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Alt text](images/banner.png) 2 | # README 3 | --- 4 | 5 | This repo can be used as a starter kit to set up a fully git-integrated Machine Learning Operations (MLOps) environment using Cloud Pak for Data and (in the future) watsonx. It uses a simple "credit score prediction" use case that is split up into four Jupyter notebooks as an example, which can easily be adapted to your business problem.
6 | 7 | It tries to be as simple as possible while showing the basic concepts of MLOps using IBM tools. The intended use is that, after you have set everything up and familiarized yourself with the concepts, you throw out all the "credit score prediction" code and replace it with whatever problem you are trying to solve. 8 | 9 | ![high level overview using three stages](/images/2023-09-05-11_00_27.png) 10 | 11 | *high level overview using three stages* 12 | 13 | 14 | 15 | # Setup Instructions 16 | These instructions will guide you through the setup of a simple MLOps environment that uses just two stages ("dev" and "prod"). The setup can easily be extended to more stages if needed. 17 | 18 | It is assumed that you have a "Cloud Pak for Data" instance available and that you have admin rights to it (this will not work with the cloud-based "as a Service" offering). 19 | 20 | ![Alt text](/images/detailed_overview.png) 21 | 22 | *detailed view using two stages* 23 | ## 1. Fork this repo 24 | 25 |
26 | need a detailed description? 27 | 28 | ![Alt text](/images/image-1.png) 29 | 30 | click the "Fork" button in the upper right corner of this repo. **IMPORTANT: uncheck the "only fork the master branch" checkbox.** 31 | This will create a copy of this repo in your own github account. We will be using this copy in the following steps. 32 |
33 | 34 | 35 | ## 2. Create one git-enabled project called "00-datascience-playground" 36 | 37 |
38 | need a detailed description? 39 | 40 | ### Overview 41 | ![Alt text](/images/2023-08-31-09_10_14.png) 42 | *this is the project that we are creating in this step* 43 | 44 | ### Step by step 45 | ![Alt text](/images/image-2.png) 46 | navigate to all projects 47 | ![Alt text](/images/image-4.png) 48 | create a project that is "integrated with git". In the next window we will need to provide the github repo address and a private access token. So let's create that token first. 49 | ![Alt text](/images/image-5.png) 50 | navigate to https://github.com/settings/tokens and choose "Generate new token". Give it a name and select the "repo" scope as shown in the next image. 51 | ![Alt text](/images/image-6.png) 52 | **Copy the generated token to your clipboard.** You will not be able to see it again after you close the window. 53 | ![Alt text](/images/image-7.png) 54 | Make this token available within your CP4D by creating a "New Token" and using the token you just created. Once you have created it, use the dropdown to select it. 55 | ![Alt text](/images/image-8.png) 56 | add the repo URL (don't forget the .git at the end ;-) and choose the main branch. Then hit "Create" 57 | 58 | Use the github repo address and your private access token 59 | You can alter the notebooks to your needs if you want to, but it is important that you keep the naming of the notebooks. 60 |
61 | 62 | 63 | ## 3. Create one git-enabled project called "01-staging-area" 64 | 65 |
66 | need a detailed description? 67 | 68 | ### Overview 69 | ![Alt text](/images/image-3.png) 70 | *this is the project that we are creating in this step* 71 | 72 | ### Step by step 73 | 74 | ![Alt text](/images/image-2.png) 75 | navigate to all projects 76 | ![Alt text](/images/image-4.png) 77 | In your CP4D Instance you access the project overview by clicking on the "Projects" Icon in the upper left corner. Then click on "New Project" and select "Create a project integrated with a Git repository". Give it the name "01-staging-area" and select "create" 78 | 79 | Use the same github repo address and your private access token as in 2 80 | 81 |
82 | 83 | ## 4. Configure custom environment in "01-staging-area" 84 |
85 | need a detailed description? 86 | 87 | TODO: Add description here! (use custom_env.yaml) 88 | 89 |
90 | 91 | ## 5. Configure Jobs in "01-staging-area" 92 |
93 | need a detailed description? 94 | 95 | ### Overview 96 | ![Alt text](/images/image-4.png) 97 | *these are the jobs that we are creating in this step* 98 | 99 | ### Step by step 100 | ![Alt text](/images/image-9.png) 101 | navigate to "view local branch" 102 | 103 | ![Alt text](/images/image-11.png) 104 | click "New code job" 105 | 106 | ![Alt text](/images/image-12.png) 107 | choose the first notebook "00-git-pull.ipynb" and click "configure job" 108 | 109 | ![Alt text](/images/image-13.png) 110 | give it the same name as the notebook and click "next" 111 | TODO: choose correct environment for every job 112 | accept all the defaults and click "next" until you can click "create job" 113 | TODO: add the "was_successful" output to every job 114 | repeat those steps for all six notebooks. 115 | 116 | ![Alt text](/images/image-14.png) 117 | once you are done it should look like this. 118 | 119 | 120 | We also need to create a .env file within the "01-staging-area" project. This file will contain the credentials that the pipeline will use to pull the code from github. 121 | 122 | ![Alt text](/images/image-100.png) 123 | 124 | Click "Launch IDE" and then "JupyterLab" to get access to the JupyterLab environment. 125 | 126 | ![Alt text](/images/image-103.png) 127 | 128 | You will be greeted by a tab called "Terminal 1". There, copy the following commands and hit Enter: 129 | 130 | ```bash 131 | 132 | echo "repo_adresse=PUT_YOUR_REPO_ADDRESS_HERE" > .env 133 | echo "personal_access_token=PUT_YOUR_TOKEN_HERE" >> .env 134 | echo "project_id=PUT_YOUR_PROJECT_ID_HERE" >> .env 135 | echo "branch_name=main" >> .env 136 | echo "cpd_technical_user=PUT_USERNAME_HERE" >> .env 137 | echo "cpd_technical_user_password=PUT_PASSWORD_HERE" >> .env 138 | echo "cpd_url=PUT_URL_HERE" >> .env 139 | 140 | ``` 141 | 142 | *cpd_technical_user* is a user that was created only to be used as a proxy in those scripts. If this is not available, you can also use a *personal* user (i.e. the credentials you use to log in), even though this is not best practice 143 | 144 | ![Alt text](/images/image-102.png) 145 | 146 | You can check if everything worked by typing 147 | 148 | ```bash 149 | cat .env 150 | ``` 151 | If that command displays the content of the .env file you are good to go. 152 | 153 |
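If you want to double-check the file from a notebook as well, the following minimal sketch (assuming the key names used above and that `python-dotenv` is installed, as in the notebooks) prints any keys that are still missing:

```python
import os
from dotenv import load_dotenv

# Sketch: sanity-check the .env file created above.
# The key names are the ones used in the echo commands; adjust if yours differ.
load_dotenv()

required_keys = [
    "repo_adresse", "personal_access_token", "project_id", "branch_name",
    "cpd_technical_user", "cpd_technical_user_password", "cpd_url",
]
missing = [key for key in required_keys if not os.getenv(key)]
print("missing keys:", missing if missing else "none - .env looks complete")
```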
154 | 155 | ## 6. Create a NON-git-enabled project called "02-automation-area" 156 | 157 |
158 | need a detailed description? 159 | 160 | ### Overview 161 | ![Alt text](/images/image-5.png) 162 | *this is the project that we are creating in this step* 163 | 164 | ### Step by step 165 | 166 | ![Alt text](/images/image-3.png) 167 | repeat the same steps as in 2 and 3 but choose "create an empty project" to create a NON-git-enabled project. Name it "02-automation-area" 168 | 169 | 170 |
171 | 
172 | 
173 | 
174 | ## 7. Configure pipeline in "02-automation-area"
175 | 
176 | need a detailed description?
177 | 
178 | ### Overview
179 | ![Alt text](/images/image-6.png)
180 | *those are the pieces we are creating in this step*
181 | 
182 | ### Step by step
183 | TODO: add global parameters
184 | 
185 | ![Alt text](/images/image-16.png)
186 | Click "New Asset" and choose "Pipeline". Name the pipeline "mlops_pipeline".
187 | 
188 | ![Alt text](/images/image-18.png)
189 | Go to "Run" > "Run Notebook Job" and drag it onto the canvas. Then double-click this newly created node and click "Select Job".
190 | 
191 | ![Alt text](/images/image-19.png)
192 | Choose "01-staging-area", select the first notebook "00-git-pull.ipynb", then click "Choose" and "Save".
193 | 
194 | TODO: choose environment
195 | TODO: add pipeline params
196 | 
197 | ![Alt text](/images/image-20.png)
198 | Repeat those steps for all notebooks until you end up with something that looks like this.
199 | 
200 | ![Alt text](/images/image-29.png)
201 | Click "Run Pipeline" and then "Create job". Give it a name like "mlops_pipeline_job". **IMPORTANT: The GitHub Action assumes that you only have ONE job in this project. If you have more than one job you will need to change the GitHub Action accordingly.**
202 | 
203 | 
204 | 
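For the "was_successful" outputs mentioned in step 5, each notebook can report a result back to Watson Pipelines so the pipeline can branch or fail on it. A minimal sketch, assuming the ibm-watson-studio-pipelines client pinned in custom_env.yaml (the output key must match the name configured on the job):

```python
import os
from ibm_watson_studio_pipelines import WSPipelines

# Authenticate with the runtime token that Watson Pipelines injects
# and publish the "was_successful" output for downstream nodes.
client = WSPipelines.from_token(os.environ["USER_ACCESS_TOKEN"])
client.store_results({"was_successful": True})
```
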
205 | 
206 | ## 8. Set up GitHub Actions
207 | 
208 | need a detailed description?
209 | 
210 | ### Overview
211 | ![Alt text](/images/image-7.png)
212 | *this is the piece that we are creating in this step*
213 | 
214 | ### Step by step
215 | 
216 | We need a set of secrets to be able to run the GitHub Actions. Those secrets are:
217 | 
218 | - **API_KEY**
219 | - **USER_NAME**
220 | - **CLUSTER_URL**
221 | - **PROJECT_ID**
222 | - **PERSONAL_ACCESS_TOKEN_GITHUB**
223 | 
224 | 
225 | We will now go through them step by step:
226 | 
227 | ![Alt text](/images/image-21.png)
228 | Navigate to your fork of the GitHub repo, then "Settings" > "Secrets and variables" > "Actions" > "New repository secret".
229 | 
230 | ### 8.1. Retrieving your CP4D **API_KEY** and **USER_NAME**
231 | 
232 | ![Alt text](/images/image-22.png)
233 | Go to the "Profile and settings" tab in your CP4D instance.
234 | 
235 | ![Alt text](/images/image-23.png)
236 | Copy the API key to your clipboard (and write it down somewhere; you will not be able to see it again after you close the window).
237 | 
238 | ![Alt text](/images/image-24.png)
239 | Go back to GitHub and create a new repository secret called "API_KEY".
240 | 
241 | ![Alt text](/images/image-27.png)
242 | 
243 | Also create the repository secret USER_NAME using the username that you use to log in to your CP4D instance.
244 | 
245 | 
246 | ### 8.2. Retrieving your CP4D **CLUSTER_URL**
247 | 
248 | This one is simple :-)
249 | ![Alt text](/images/image-25.png)
250 | 
251 | Just take the URL of the cluster that you have been working on
252 | 
253 | ![Alt text](/images/image-26.png)
254 | 
255 | and use it to create a secret called "CLUSTER_URL".
256 | 
257 | ### 8.3. Retrieving your CP4D **PROJECT_ID**
258 | 
259 | ![Alt text](/images/image-28.png)
260 | Open the "02-automation-area" project and copy its project ID (it appears on the project's "Manage" tab and in the project URL; the exact location may vary by CP4D version), then create a repository secret called "PROJECT_ID" with this value.
261 | ### 8.4. Retrieving your GitHub **PERSONAL_ACCESS_TOKEN_GITHUB**
262 | 
263 | You can use the same token you used in step 2. If you don't have it anymore, create a new one by following the steps in step 2.
264 | 
265 | The workflow consumes these secrets as sketched below.
266 | 
267 | 
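For orientation, a hypothetical excerpt of how a workflow consumes these secrets - the actual workflow in this repo may differ, and the step name and script are made up:

```yaml
jobs:
  trigger-cp4d-pipeline:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      # Expose the repository secrets to the step that talks to CP4D.
      - name: Trigger the mlops_pipeline job
        env:
          API_KEY: ${{ secrets.API_KEY }}
          USER_NAME: ${{ secrets.USER_NAME }}
          CLUSTER_URL: ${{ secrets.CLUSTER_URL }}
          PROJECT_ID: ${{ secrets.PROJECT_ID }}
        run: python trigger_pipeline.py  # hypothetical helper script
```
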
268 | 
269 | 
270 | ## 9. Create deployment space
271 | 
272 | need a detailed description?
273 | 
274 | TODO: describe how to create the deployment space in detail; a sketch follows below
275 | 
276 | 
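Until this is documented: in the UI, go to "Deployments" > "New deployment space", create the space, and copy its ID into `deployment_space_id_DEV` / `deployment_space_id_PROD` in vars_and_utils.py. Alternatively, a minimal programmatic sketch using the token-based WML credentials shown in TODO.md (the space name "mlops-space" is a placeholder):

```python
import os
from ibm_watson_studio_lib import access_project_or_space
from ibm_watson_machine_learning import APIClient

# Token-based credentials, as in the "Work w/ WML on CP4DS" snippet in TODO.md.
wslib = access_project_or_space()
wml_credentials = {
    "url": os.environ["RUNTIME_ENV_APSX_URL"],  # cluster URL inside CP4D runtimes
    "token": wslib.auth.get_current_token(),
    "instance_id": "wml_local",
    "version": "4.6",
}
client = APIClient(wml_credentials)

space = client.spaces.store({client.spaces.ConfigurationMetaNames.NAME: "mlops-space"})
print(client.spaces.get_id(space))  # paste this ID into vars_and_utils.py
```
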
277 | 
278 | ## 10. Set up monitoring using OpenScale
279 | 
280 | need a detailed description?
281 | 
282 | TODO: describe how to set up OpenScale; a connectivity sketch follows below
283 | 
284 | 
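Until this is documented, a minimal connectivity sketch, assuming the ibm-watson-openscale Python SDK and an OpenScale instance on the same cluster (both are assumptions; neither is pinned in custom_env.yaml):

```python
import os
from ibm_cloud_sdk_core.authenticators import BearerTokenAuthenticator
from ibm_watson_openscale import APIClient as WOSClient

# Authenticate against the cluster with the runtime bearer token.
authenticator = BearerTokenAuthenticator(bearer_token=os.environ["USER_ACCESS_TOKEN"])
wos_client = WOSClient(authenticator=authenticator,
                       service_url=os.environ["RUNTIME_ENV_APSX_URL"])

wos_client.data_marts.show()  # verify connectivity before configuring monitors
```
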
285 | 
286 | ## 11. Try it out :-)
287 | 
288 | 
289 | 
290 | ## 12. Future Work and known issues
291 | 
292 | need a detailed description?
293 | 
294 | - Future Work:
295 |     - [ ] Put AI Factsheets back into the "03-train_models" notebook
296 |     - [ ] Figure out what is wrong with the deployments and fix it
297 |     - [ ] Figure out what is wrong with monitoring (probably an issue with the cluster we use)
298 |     - [ ] Finish documentation of 9. Create deployment space and 10. Set up monitoring using OpenScale
299 |     - [ ] Delete all projects and set everything up again according to the documentation to find what is missing (~ one day of work)
300 |     - [ ] Describe how good user management can work (e.g. normal users can only see the "01_data_science_playground" project)
301 |     - [ ] Integrate Model Inventory / model versioning
302 | 
303 | 
304 | - Known Issues
305 |     - 
306 | 
308 | 309 | 310 | 311 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | **Mission**
2 | Convert APAC MLOps Accelerator from CP4DaaS to CP4DS
3 | 
4 | ### Info
5 | ---
6 | - Awaiting tensorflow-data-validation update for compatibility w/ py3.10
7 | 
8 | 
9 | ### Todo
10 | ---
11 | - [x] Cut out all use of Cloud Object Storage ✅ 2023-10-04
12 | - [x] Reduce use of /utils to a minimum ✅ 2023-10-04
13 | - [x] Remove use of *seaborn/sns* ✅ 2023-10-04
14 | - [x] **Use WML .from_token instead of api_key** ✅ 2023-10-04
15 | - [ ] 
16 | - [ ] 
17 | 
18 | 
19 | ### Save for later
20 | ---
21 | 
22 | #### Work w/ WML on CP4DS
23 | ```python
24 | CPD_URL = ''
25 | from ibm_watson_studio_lib import access_project_or_space
26 | wslib = access_project_or_space()
27 | wml_credentials = {
28 |     "url": CPD_URL,
29 |     "token": wslib.auth.get_current_token(),
30 |     "instance_id": "wml_local",
31 |     "version": "4.6"
32 | }
33 | from ibm_watson_machine_learning import APIClient
34 | client = APIClient(wml_credentials)
35 | client.spaces.list()
36 | ```
37 | 
38 | #### Use Pipelines on CP4DS
39 | 
40 | ```python
41 | import os
42 | token = os.environ['USER_ACCESS_TOKEN']
43 | ```
44 | 
45 | [WSPipelines On-Prem Tutorial](https://github.ibm.com/Lucas-Baier/ws-pipelines-guide) -------------------------------------------------------------------------------- /assets/.METADATA/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/assets/.METADATA/.gitkeep -------------------------------------------------------------------------------- /assets/.METADATA/.version.json: -------------------------------------------------------------------------------- 1 | {"version":"\"2.0.0\"","source":"local"} -------------------------------------------------------------------------------- /assettypes/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/assettypes/.gitkeep -------------------------------------------------------------------------------- /custom_env.yaml: --------------------------------------------------------------------------------
1 | channels:
2 |   - defaults
3 | 
4 | dependencies:
5 |   - pip:
6 |     - tensorflow-data-validation==1.14.0
7 |     - ibm-watson-studio-pipelines==0.2.12
8 |     - python-dotenv==1.0.0 -------------------------------------------------------------------------------- /images/2023-08-31 09_10_14-2023_04_MLOps_AOT_Initiative.drawio - diagrams.net.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/2023-08-31 09_10_14-2023_04_MLOps_AOT_Initiative.drawio - diagrams.net.png -------------------------------------------------------------------------------- /images/2023-08-31-09_10_14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/2023-08-31-09_10_14.png -------------------------------------------------------------------------------- /images/2023-09-05-11_00_27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/2023-09-05-11_00_27.png -------------------------------------------------------------------------------- /images/banner.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/banner.png -------------------------------------------------------------------------------- /images/detailed_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/detailed_overview.png -------------------------------------------------------------------------------- /images/image-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-1.png -------------------------------------------------------------------------------- /images/image-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-10.png -------------------------------------------------------------------------------- /images/image-100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-100.png -------------------------------------------------------------------------------- /images/image-101.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-101.png -------------------------------------------------------------------------------- /images/image-102.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-102.png -------------------------------------------------------------------------------- /images/image-103.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-103.png -------------------------------------------------------------------------------- /images/image-11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-11.png -------------------------------------------------------------------------------- /images/image-12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-12.png -------------------------------------------------------------------------------- /images/image-13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-13.png -------------------------------------------------------------------------------- /images/image-14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-14.png -------------------------------------------------------------------------------- /images/image-15.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-15.png -------------------------------------------------------------------------------- /images/image-16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-16.png -------------------------------------------------------------------------------- /images/image-17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-17.png -------------------------------------------------------------------------------- /images/image-18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-18.png -------------------------------------------------------------------------------- /images/image-19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-19.png -------------------------------------------------------------------------------- /images/image-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-2.png -------------------------------------------------------------------------------- /images/image-20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-20.png -------------------------------------------------------------------------------- /images/image-200.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-200.png -------------------------------------------------------------------------------- /images/image-201.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-201.png -------------------------------------------------------------------------------- /images/image-202.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-202.png -------------------------------------------------------------------------------- /images/image-203.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-203.png -------------------------------------------------------------------------------- /images/image-204.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-204.png -------------------------------------------------------------------------------- /images/image-205.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-205.png -------------------------------------------------------------------------------- /images/image-206.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-206.png -------------------------------------------------------------------------------- /images/image-207.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-207.png -------------------------------------------------------------------------------- /images/image-21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-21.png -------------------------------------------------------------------------------- /images/image-22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-22.png -------------------------------------------------------------------------------- /images/image-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-23.png -------------------------------------------------------------------------------- /images/image-24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-24.png -------------------------------------------------------------------------------- /images/image-25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-25.png -------------------------------------------------------------------------------- /images/image-26.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-26.png -------------------------------------------------------------------------------- /images/image-27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-27.png -------------------------------------------------------------------------------- /images/image-28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-28.png -------------------------------------------------------------------------------- /images/image-29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-29.png -------------------------------------------------------------------------------- /images/image-3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-3.png -------------------------------------------------------------------------------- /images/image-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-4.png -------------------------------------------------------------------------------- /images/image-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-5.png -------------------------------------------------------------------------------- /images/image-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-6.png -------------------------------------------------------------------------------- /images/image-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-7.png -------------------------------------------------------------------------------- /images/image-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-8.png -------------------------------------------------------------------------------- /images/image-9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-9.png -------------------------------------------------------------------------------- /images/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image.png -------------------------------------------------------------------------------- /images/overview-image-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/overview-image-1.png -------------------------------------------------------------------------------- /images/overview-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/overview-image.png -------------------------------------------------------------------------------- /utils/catalog_utils.py: -------------------------------------------------------------------------------- 1 | """__author__ == "Nijesh" 2 | email : knijesh@sg.ibm.com 3 | 4 | WKC and Model Inventory Utility Client 5 | 6 | 7 | """ 8 | 9 | import json 10 | import os 11 | from dataclasses import dataclass 12 | from operator import itemgetter 13 | 14 | import numpy as np 15 | import pandas as pd 16 | import requests 17 | from ibm_watson_machine_learning import APIClient 18 | from requests.structures import CaseInsensitiveDict 19 | 20 | 21 | @dataclass 22 | class CatalogUtils: 23 | """ 24 | Encapsulated Catalog Utils Class to enable the use of WKC via Watson Data API. 
25 | 
26 |     """
27 | 
28 |     access_token: str
29 |     project_id: str
30 |     # NOTE: the three fields below were referenced but never declared; the defaults are assumed IBM endpoints
31 |     api_key: str = ""
32 |     auth_url: str = "https://iam.cloud.ibm.com/identity/token"
33 |     service_url: str = "https://api.dataplatform.cloud.ibm.com"
34 | 
35 |     def get_wml_client(self):
36 |         wml_credentials = {
37 |             "token": self.access_token,
38 |             "instance_id": "openshift",
39 |             "url": os.environ['RUNTIME_ENV_APSX_URL'],
40 |             "version": "4.6"
41 |         }
42 |         wml_client = APIClient(wml_credentials)
43 |         return wml_client
44 | 
45 |     def create_access_token(self):
46 |         headers = {
47 |             "Content-Type": "application/x-www-form-urlencoded",
48 |         }
49 | 
50 |         data = (
51 |             f"grant_type=urn:ibm:params:oauth:grant-type:apikey&apikey={self.api_key}"
52 |         )
53 | 
54 |         response = requests.post(self.auth_url, headers=headers, data=data)
55 | 
56 |         return response.json()["access_token"]
57 | 
58 |     def list_catalogs(self):
59 |         access_token = self.create_access_token()
60 |         headers = CaseInsensitiveDict()
61 |         headers["Accept"] = "application/json"
62 |         headers["Authorization"] = f"Bearer {access_token}"
63 |         list_catalogs = requests.get(self.service_url + "/v2/catalogs", headers=headers)
64 |         return list_catalogs.json()
65 | 
66 |     def get_catalog_id_map(self):
67 |         result = self.list_catalogs()
68 |         asset_map = {}
69 |         for keys, values in result.items():
70 |             if isinstance(values, list):
71 |                 for each in values:
72 |                     asset_map[each["entity"]["name"]] = each["metadata"]["guid"]
73 |         return asset_map
74 | 
75 |     def get_latest_asset_id(self, name):
76 |         # Return the id of the most recently created model with this name
77 |         wml_client = self.get_wml_client()
78 |         wml_client.set.default_project(self.project_id)
79 |         result = wml_client.repository.get_model_details()
80 |         result_meta = [
81 |             each["metadata"]
82 |             for each in result["resources"]
83 |             if each["metadata"]["name"] == name
84 |         ]
85 | 
86 |         my_asset_list = sorted(result_meta, key=itemgetter("created_at"), reverse=True)
87 |         return my_asset_list[0]["id"]
88 | 
89 |     def get_revisions_asset(self, catalog_id, asset_id):
90 |         access_token = self.create_access_token()
91 |         headers = CaseInsensitiveDict()
92 |         headers["Accept"] = "application/json"
93 |         headers["Authorization"] = f"Bearer {access_token}"
94 |         search_asset = requests.get(
95 |             self.service_url
96 |             + f"/v2/assets/{asset_id}/revisions?catalog_id={catalog_id}",
97 |             headers=headers,
98 |         )
99 |         return search_asset.json()
100 | 
101 |     def publish_asset(self, catalog_id, asset_id, name, desc, tags):
102 |         """Publish Assets to Catalog
103 | 
104 |         Args:
105 |             catalog_id (str): catalog id
106 |             asset_id (str): Id of the asset to be published
107 |             name (str): name of the asset
108 |             desc (str): description
109 |             tags (str): asset tag
110 |         """
111 |         access_token = self.create_access_token()
112 |         url = f"{self.service_url}/v2/assets/{asset_id}/publish?project_id={self.project_id}"
113 | 
114 |         payload = json.dumps(
115 |             {
116 |                 "catalog_id": catalog_id,
117 |                 "mode": 0,
118 |                 "metadata": {
119 |                     "name": name,
120 |                     "description": desc,
121 |                     "tags": tags,
122 |                 },
123 |             }
124 |         )
125 |         headers = {
126 |             "Authorization": f"Bearer {access_token}",
127 |             "Content-Type": "application/json",
128 |         }
129 | 
130 |         response = requests.request("POST", url, headers=headers, data=payload)
131 | 
132 |         print(response.text)
133 | 
134 |     def get_model_from_registry(self, name):
135 |         """Get latest model from registry
136 | 
137 |         Args:
138 |             name (str): name of the Model
139 | 
140 |         Returns:
141 |             str: model id
142 |         """
143 |         access_token = self.create_access_token()
144 | 
145 |         url = "https://api.dataplatform.cloud.ibm.com/v1/aigov/model_inventory/model_entries?bss_account_id=27ff418fedd6aedffb8dc6ae4164a1d2"
146 | 
147 |         payload = {}
148 |         headers = {
149 | 
"Authorization": f"Bearer {access_token}", 150 | "Content-Type": "application/json", 151 | } 152 | response = requests.request("GET", url, headers=headers, data=payload) 153 | 154 | result = response.json() 155 | 156 | for each in result["results"]: 157 | for item in each["entity"]["modelfacts_global"]["physical_models"]: 158 | if ( 159 | item["name"] == name 160 | and item["container_id"] == self.project_id 161 | and item["is_deleted"] == False 162 | ): 163 | return item["id"] 164 | -------------------------------------------------------------------------------- /utils/fs_utils.py: -------------------------------------------------------------------------------- 1 | """__author__ == "Nijesh" 2 | email : knijesh@sg.ibm.com 3 | 4 | Factsheets and Model Metadata Utility Client 5 | 6 | 7 | """ 8 | 9 | 10 | import contextlib 11 | import os 12 | import time 13 | from collections import defaultdict 14 | from dataclasses import dataclass 15 | 16 | import requests 17 | from ibm_aigov_facts_client import AIGovFactsClient 18 | from ibm_watson_machine_learning import APIClient 19 | 20 | 21 | @dataclass 22 | class FSUtils: 23 | wml_client: APIClient 24 | catalog_id: str 25 | project_id: str 26 | bss_account_id: str 27 | space_id: str 28 | facts_client: AIGovFactsClient 29 | service_url: str = "https://api.dataplatform.cloud.ibm.com" 30 | 31 | def register_new_model_entry( 32 | self, model_uid, model_entry_name, model_entry_description 33 | ): 34 | self.wml_client.set.default_project(self.project_id) 35 | meta_props = { 36 | self.wml_client.factsheets.ConfigurationMetaNames.NAME: model_entry_name, 37 | self.wml_client.factsheets.ConfigurationMetaNames.DESCRIPTION: model_entry_description, 38 | self.wml_client.factsheets.ConfigurationMetaNames.MODEL_ENTRY_CATALOG_ID: self.catalog_id, 39 | } 40 | model_registration = self.wml_client.factsheets.register_model_entry( 41 | model_id=model_uid, meta_props=meta_props 42 | ) 43 | return model_registration 44 | 45 | def register_existing_model_entry(self, model_uid, model_entry_asset_id): 46 | meta_props = { 47 | self.wml_client.factsheets.ConfigurationMetaNames.ASSET_ID: model_entry_asset_id, 48 | self.wml_client.factsheets.ConfigurationMetaNames.MODEL_ENTRY_CATALOG_ID: self.catalog_id, 49 | } 50 | model_registration = self.wml_client.factsheets.register_model_entry( 51 | model_id=model_uid, meta_props=meta_props 52 | ) 53 | return model_registration 54 | 55 | def get_model_entries(self): 56 | headers = { 57 | "Content-Type": "application/json", 58 | "Accept": "application/json", 59 | "Authorization": self.wml_client._get_headers()["Authorization"], 60 | } 61 | params = {"bss_account_id": self.bss_account_id} 62 | r = requests.get( 63 | f"{self.service_url}/v1/aigov/model_inventory/{self.catalog_id}/model_entries", 64 | headers=headers, 65 | params=params, 66 | ) 67 | return r.json() 68 | 69 | def get_model_entry(self, model_entry_asset_id): 70 | headers = { 71 | "Content-Type": "application/json", 72 | "Accept": "application/json", 73 | "Authorization": self.wml_client._get_headers()["Authorization"], 74 | } 75 | params = {"catalog_id": self.catalog_id} 76 | r = requests.get( 77 | f"{self.service_url}/v1/aigov/model_inventory/model_entries/{model_entry_asset_id}", 78 | headers=headers, 79 | params=params, 80 | ) 81 | return r.json() 82 | 83 | def get_model_entry_asset_id_by_name(self, model_entry_name): 84 | response = self.get_model_entries() 85 | return next( 86 | ( 87 | x["metadata"]["asset_id"] 88 | for x in response["results"] 89 | if x["metadata"]["name"] 
== model_entry_name
90 |             ),
91 |             None,
92 |         )
93 | 
94 |     def prepare_training_reference(
95 |         self, bucket_name, apikey, crn, endpoint, training_file_name
96 |     ):
97 |         """Create a COS connection asset and build a training data reference.
98 | 
99 |         Args:
100 |             bucket_name (str): COS bucket name
101 |             apikey (str): COS API key
102 |             crn (str): CRN of the COS instance
103 |             endpoint (str): COS endpoint URL
104 |             training_file_name (str): Training Data Filename
105 | 
106 |         Returns:
107 |             list[dict]: Training Data Reference
108 |         """
109 | 
110 |         self.wml_client.set.default_project(self.project_id)
111 |         datasource_type = self.wml_client.connections.get_datasource_type_uid_by_name(
112 |             "bluemixcloudobjectstorage"
113 |         )
114 |         conn_meta_props = {
115 |             self.wml_client.connections.ConfigurationMetaNames.NAME: "MLOps COS",
116 |             self.wml_client.connections.ConfigurationMetaNames.DATASOURCE_TYPE: datasource_type,
117 |             self.wml_client.connections.ConfigurationMetaNames.DESCRIPTION: "MLOps COS connection",
118 |             self.wml_client.connections.ConfigurationMetaNames.PROPERTIES: {
119 |                 "bucket": bucket_name,
120 |                 "api_key": apikey,
121 |                 "resource_instance_id": crn,
122 |                 "iam_url": "https://iam.ng.bluemix.net/oidc/token",
123 |                 "url": endpoint,
124 |             },
125 |         }
126 | 
127 |         conn_details = self.wml_client.connections.create(meta_props=conn_meta_props)
128 |         connection_id = self.wml_client.connections.get_uid(conn_details)
129 | 
130 |         training_data_references = [
131 |             {
132 |                 "id": "German Credit Risk",
133 |                 "type": "connection_asset",
134 |                 "connection": {
135 |                     "id": connection_id,
136 |                     "href": "/v2/connections/"
137 |                     + connection_id
138 |                     + "?space_id="
139 |                     + self.space_id,
140 |                 },
141 |                 "location": {"bucket": bucket_name, "file_name": training_file_name},
142 |             }
143 |         ]
144 |         return training_data_references
145 | 
146 |     def save_model(
147 |         self,
148 |         model,
149 |         model_name,
150 |         model_entry_description,
151 |         model_entry_name,
152 |         target,
153 |         X,
154 |         y,
155 |         train_data_ref,
156 |         model_type="scikit-learn_1.0"
157 |     ):
158 |         # sourcery skip: use-named-expression
159 |         self.wml_client.set.default_project(self.project_id)
160 |         for x in self.wml_client.repository.get_model_details()["resources"]:
161 |             if x["metadata"]["name"] == model_name:
162 |                 self.wml_client.repository.delete(x["metadata"]["id"])
163 | 
164 |         run_id = self.facts_client.runs.get_current_run_id()
165 | 
166 |         self.facts_client.export_facts.export_payload(run_id)
167 | 
168 |         software_spec_uid = self.wml_client.software_specifications.get_id_by_name(
169 |             "runtime-22.2-py3.10"
170 |         )
171 | 
172 |         meta_props = {
173 |             self.wml_client.repository.ModelMetaNames.NAME: model_name,
174 |             self.wml_client.repository.ModelMetaNames.TYPE: model_type,
175 |             self.wml_client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: software_spec_uid,
176 |             self.wml_client.repository.ModelMetaNames.LABEL_FIELD: target,
177 |             self.wml_client._models.ConfigurationMetaNames.TRAINING_DATA_REFERENCES: train_data_ref,
178 |             self.wml_client.repository.ModelMetaNames.INPUT_DATA_SCHEMA: [
179 |                 {
180 |                     "id": "input_data_schema",
181 |                     "type": "list",
182 |                     "fields": [
183 |                         {"name": index, "type": value}
184 |                         for index, value in X.dtypes.astype(str).items()
185 |                     ],
186 |                 },
187 |             ],
188 |         }
189 | 
190 |         self.facts_client.export_facts.prepare_model_meta(
191 |             wml_client=self.wml_client, meta_props=meta_props
192 |         )
193 | 
194 |         model_details = self.wml_client.repository.store_model(
195 |             model=model, meta_props=meta_props, training_data=X, training_target=y
196 |         )
197 |         model_uid = self.wml_client.repository.get_model_id(model_details)
198 | model_entry_asset_id = self.get_model_entry_asset_id_by_name(model_entry_name) 199 | if model_entry_asset_id: 200 | self.register_existing_model_entry(model_uid, model_entry_asset_id) 201 | else: 202 | self.register_new_model_entry( 203 | model_uid, model_entry_name, model_entry_description 204 | ) 205 | return model_uid 206 | 207 | def save_custom_model( 208 | self, 209 | model, 210 | model_name, 211 | model_entry_description, 212 | model_entry_name, 213 | X, 214 | y, 215 | model_type="scikit-learn_1.0" 216 | ): 217 | # sourcery skip: use-named-expression 218 | self.wml_client.set.default_project(self.project_id) 219 | for x in self.wml_client.repository.get_model_details()["resources"]: 220 | if x["metadata"]["name"] == model_name: 221 | self.wml_client.repository.delete(x["metadata"]["id"]) 222 | 223 | run_id = self.facts_client.runs.get_current_run_id() 224 | 225 | self.facts_client.export_facts.export_payload(run_id) 226 | 227 | software_spec_uid = self.wml_client.software_specifications.get_id_by_name( 228 | "runtime-22.2-py3.10" 229 | ) 230 | 231 | meta_props = { 232 | self.wml_client.repository.ModelMetaNames.NAME: model_name, 233 | self.wml_client.repository.ModelMetaNames.TYPE: model_type, 234 | self.wml_client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: software_spec_uid, 235 | } 236 | 237 | self.facts_client.export_facts.prepare_model_meta( 238 | wml_client=self.wml_client, meta_props=meta_props 239 | ) 240 | 241 | model_details = self.wml_client.repository.store_model( 242 | model=model, meta_props=meta_props#, training_data=X, training_target=y 243 | ) 244 | model_uid = self.wml_client.repository.get_model_id(model_details) 245 | model_entry_asset_id = self.get_model_entry_asset_id_by_name(model_entry_name) 246 | if model_entry_asset_id: 247 | self.register_existing_model_entry(model_uid, model_entry_asset_id) 248 | else: 249 | self.register_new_model_entry( 250 | model_uid, model_entry_name, model_entry_description 251 | ) 252 | return model_uid 253 | 254 | 255 | 256 | def promote_model(self, model_uid, model_name): 257 | """ 258 | 259 | Promote the model to deployment Space by checking duplicate deployments and 260 | model assets in the space repository 261 | 262 | Args: 263 | model_uid (str): Model_ID 264 | model_name (str): Name of the Model 265 | 266 | Returns: 267 | json: model saved result in json 268 | """ 269 | self.wml_client.set.default_space(self.space_id) 270 | 271 | to_delete = defaultdict(list) 272 | 273 | model_ids = [ 274 | model["metadata"]["id"] 275 | for model in self.wml_client.repository.get_model_details()["resources"] 276 | if model["metadata"]["name"] == model_name 277 | ] 278 | 279 | for model in self.wml_client.deployments.get_details()["resources"]: 280 | if ( 281 | model["entity"]["asset"]["id"] in model_ids 282 | and "WOS-INTERNAL" not in model["metadata"]["name"] 283 | ): 284 | to_delete[model["entity"]["asset"]["id"]].append( 285 | model["metadata"]["id"] 286 | ) 287 | 288 | # to_delete = { 289 | # model["entity"]["asset"]["id"]: model["metadata"]["id"] 290 | # for model in self.wml_client.deployments.get_details()["resources"] 291 | # if model["entity"]["asset"]["id"] in model_ids 292 | # and "WOS-INTERNAL" not in model["metadata"]["name"] 293 | # } 294 | 295 | print(to_delete) 296 | 297 | ## Delete Deployment IDs and Duplicate Assets 298 | 299 | with contextlib.suppress(Exception): 300 | for key, value in to_delete.items(): 301 | for each in value: 302 | print(f"Deleting {each}") 303 | self.wml_client.deployments.delete(each) 304 | 
time.sleep(3) 305 | print(f"Deleting {key}") 306 | self.wml_client.repository.delete(key) 307 | 308 | headers = { 309 | "Content-Type": "application/json", 310 | "Accept": "application/json", 311 | "Authorization": self.wml_client._get_headers()["Authorization"], 312 | } 313 | params = {"project_id": self.project_id} 314 | data = {"mode": 0, "space_id": self.space_id} 315 | r = requests.post( 316 | f"{self.service_url}/v2/assets/{model_uid}/promote", 317 | headers=headers, 318 | params=params, 319 | json=data, 320 | ) 321 | return r.json() 322 | 323 | def deploy_model(self, space_id, deployment_name, model_uid): 324 | self.wml_client.set.default_space(space_id) 325 | # with contextlib.suppress(Exception): 326 | # for x in self.wml_client.deployments.get_details()["resources"]: 327 | # if x["metadata"]["name"] == deployment_name: 328 | # self.wml_client.deployments.delete(x["metadata"]["id"]) 329 | meta_props = { 330 | self.wml_client.deployments.ConfigurationMetaNames.NAME: deployment_name, 331 | self.wml_client.deployments.ConfigurationMetaNames.ONLINE: {}, 332 | } 333 | deployment_details = self.wml_client.deployments.create( 334 | model_uid, meta_props=meta_props 335 | ) 336 | deployment_uid = self.wml_client.deployments.get_uid(deployment_details) 337 | return deployment_uid 338 | -------------------------------------------------------------------------------- /vars_and_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import pandas as pd 4 | import pickle 5 | 6 | 7 | #### VARS #### 8 | 9 | data_path="data" 10 | raw_data_filename = "german_credit_data_biased_training.csv" 11 | raw_data_path=os.path.join(data_path,raw_data_filename) 12 | 13 | train_data_filename = "train_gcr.csv" 14 | test_data_filename = "test_gcr.csv" 15 | train_data_path=os.path.join(data_path,train_data_filename) 16 | test_data_path=os.path.join(data_path,test_data_filename) 17 | 18 | pipeline_filename = "feature_encode.pkl" 19 | pipeline_path = os.path.join(data_path, pipeline_filename) 20 | 21 | model_name="credit_risk_prediction" 22 | model_path = os.path.join(data_path, model_name+".pkl") 23 | 24 | deployment_name="credit_risk_prediction" 25 | 26 | deployment_space_id_DEV="c4238e9c-1cbd-4776-aa6e-4f6b1f865ed1" 27 | deployment_space_id_PROD="c4238e9c-1cbd-4776-aa6e-4f6b1f865ed1" 28 | 29 | #### UTILS #### 30 | 31 | def download_data_to_filesystem(raw_data_path): 32 | """ 33 | Download the german_credit_data_biased_training.csv data from a given URL and save it to the specified file path. 34 | 35 | Parameters: 36 | - raw_data_path (str): Destination path where the CSV will be saved. 37 | """ 38 | url = "https://raw.githubusercontent.com/IBM/monitor-wml-model-with-watson-openscale/master/data/german_credit_data_biased_training.csv" 39 | response = requests.get(url) 40 | 41 | # Check if the request was successful 42 | if response.status_code == 200: 43 | 44 | # Ensure the directory exists 45 | directory = os.path.dirname(raw_data_path) 46 | if not os.path.exists(directory): 47 | os.makedirs(directory) 48 | 49 | # Write the content to the file 50 | with open(raw_data_path, "wb") as file: 51 | file.write(response.content) 52 | print("Downloaded and saved as "+raw_data_path) 53 | else: 54 | print("Failed to download the CSV file. 
Status code:", response.status_code)
55 | 
56 | 
57 | def check_for_file_in_filesystem(path):
58 |     """
59 |     Check existence of path in filesystem
60 |     """
61 |     if os.path.exists(path):
62 |         return True
63 |     else:
64 |         print("File not found in specified path.")
65 |         return False
66 | 
67 | 
68 | def load_model(filename):
69 |     """
70 |     Load model from the specified file path.
71 | 
72 |     Parameters:
73 |     - filename (str): Path to the model file.
74 | 
75 |     Returns:
76 |     - object: The deserialized model/pipeline object.
77 |     """
78 |     check_for_file_in_filesystem(filename)
79 |     with open(filename, "rb") as f:
80 |         pipeline = pickle.load(f)
81 |     return pipeline
82 | 
83 | 
84 | def load_data_from_db2():
85 |     '''
86 |     currently not implemented due to issues with the flight service
87 |     '''
88 |     # data_request = {
89 |     #     'connection_name': """DB2_DATA""",
90 |     #     'interaction_properties': {
91 |     #         'select_statement': 'SELECT * FROM "CUSTOMER_DATA"."GERMAN_CREDIT_RISK_TRAINING" FETCH FIRST 5000 ROWS ONLY'
92 |     #     }
93 |     # }
94 | 
95 |     # read_client = itcfs.get_flight_client()
96 | 
97 | 
98 |     # flightInfo = itcfs.get_flight_info(read_client, nb_data_request=data_request)
99 | 
100 |     # df = itcfs.read_pandas_and_concat(read_client, flightInfo, timeout=240)
101 |     # create empty dataframe to have a valid return type
102 | 
103 |     # throw an exception to signal that this functionality is not available
104 |     print("not implemented")
105 |     raise Exception("not implemented")
106 | 
107 | 
108 | def load_german_credit_risk_data():
109 |     """
110 |     checks if it can find the data in db2 or on the local filesystem.
111 |     If necessary downloads it from the internet.
112 |     Returns it as a dataframe
113 | 
114 |     Returns:
115 |         pandas df: german credit risk data
116 |     """
117 |     try:
118 |         return load_data_from_db2()
119 |     except Exception:
120 |         print("Error while loading data from Db2; downloading csv file to filesystem instead")
121 | 
122 |     if os.path.isfile(raw_data_path):
123 |         print("File already exists in filesystem.")
124 |     else:
125 |         download_data_to_filesystem(raw_data_path)
126 |     print("loading data to pandas dataframe")
127 |     return pd.read_csv(raw_data_path)
128 | 
129 | 
130 | def save_data_in_filesystem(df, filename):
131 |     """
132 |     Save Data in Filesystem
133 | 
134 |     Passed filename should include the path
135 | 
136 |     """
137 |     try:
138 |         if filename[-3:] == "csv":
139 |             df.to_csv(filename, index=False)
140 |             print(f"File {filename} persisted successfully as csv")
141 |         else:
142 |             with open(filename, 'wb') as f:
143 |                 pickle.dump(df, f)
144 |             print(f"File {filename} pickled successfully")
145 |     except Exception as e:
146 |         print(e)
147 |         print(f"File serialization for {filename} failed")
148 | 
149 | 
150 | def load_data_from_filesystem(path):
151 |     """
152 |     Check existence of path in filesystem.
153 |     If it does exist, load the csv (or pickle) from path.
154 |     If it does NOT exist, return None.
155 |     """
156 |     if not check_for_file_in_filesystem(path):
157 |         return None
158 |     suffix = path[-3:]
159 |     # Check whether path ends on csv
160 |     if suffix == "csv":
161 |         gcf_df = pd.read_csv(path)
162 |     else:
163 |         with open(path, "rb") as f:  # pickle requires binary mode
164 |             gcf_df = pickle.load(f)
165 |     return gcf_df --------------------------------------------------------------------------------