├── .DS_Store ├── .gitignore ├── 00-git-pull.ipynb ├── 01-connect_and_validate_data.ipynb ├── 02-data_preparation.ipynb ├── 03-train_models.ipynb ├── 04-deploy_model.ipynb ├── 05_monitor_deployment.ipynb ├── 2023_04_MLOps_AOT_Initiative.drawio ├── LICENSE ├── README.md ├── TODO.md ├── assets └── .METADATA │ ├── .gitkeep │ └── .version.json ├── assettypes └── .gitkeep ├── custom_env.yaml ├── images ├── 2023-08-31 09_10_14-2023_04_MLOps_AOT_Initiative.drawio - diagrams.net.png ├── 2023-08-31-09_10_14.png ├── 2023-09-05-11_00_27.png ├── 2023_04_MLOps_AOT_Initiative.drawio ├── banner.png ├── detailed_overview.png ├── image-1.png ├── image-10.png ├── image-100.png ├── image-101.png ├── image-102.png ├── image-103.png ├── image-11.png ├── image-12.png ├── image-13.png ├── image-14.png ├── image-15.png ├── image-16.png ├── image-17.png ├── image-18.png ├── image-19.png ├── image-2.png ├── image-20.png ├── image-200.png ├── image-201.png ├── image-202.png ├── image-203.png ├── image-204.png ├── image-205.png ├── image-206.png ├── image-207.png ├── image-21.png ├── image-22.png ├── image-23.png ├── image-24.png ├── image-25.png ├── image-26.png ├── image-27.png ├── image-28.png ├── image-29.png ├── image-3.png ├── image-4.png ├── image-5.png ├── image-6.png ├── image-7.png ├── image-8.png ├── image-9.png ├── image.png ├── overview-image-1.png └── overview-image.png ├── utils ├── catalog_utils.py └── fs_utils.py └── vars_and_utils.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | venv/ 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | **/.ipynb_checkpoints/* 133 | **/.virtual_documents/* 134 | assets/.METADATA/job_run.* 135 | assets/job_run 136 | assets/.METADATA/auto_ml.* 137 | assets/auto_ml 138 | assets/.METADATA/experiment.* 139 | assets/.METADATA/wml_experiment.* 140 | assets/experiment 141 | assets/.METADATA/wml_training_definition.* 142 | assets/wml_training_definition 143 | assets/.METADATA/wml_remote_training_system.* 144 | assets/wml_remote_training_system 145 | assets/federated_learning 146 | cover/ 147 | .pybuilder/ 148 | .pytype/ 149 | cython_debug/ -------------------------------------------------------------------------------- /00-git-pull.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d720957f", 6 | "metadata": {}, 7 | "source": [ 8 | "![Alt text](images/banner.png)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "4c4febc4-bc42-48cb-a226-e8a352003011", 14 | "metadata": { 15 | "id": "1c9ea256-70b7-4d27-9628-ef9f55978309", 16 | "tags": [] 17 | }, 18 | "source": [ 19 | "## Pulling changes from git\n", 20 | "\n", 21 | "This notebook pulls the current state from a given git repository.\n", 22 | "This is necessary so Watson Pipelines will always execute the newest changes!\n", 23 | "\n", 24 | "You might have to open a terminal to create the .env file, since JupyterLab does not show .env files by default. " 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "76dc108f-1be7-4618-b9c8-425ff448c732", 31 | "metadata": { 32 | "id": "76dc108f-1be7-4618-b9c8-425ff448c732", 33 | "tags": [] 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "from ibm_watson_studio_pipelines import WSPipelines\n", 38 | "import subprocess\n", 39 | "import os\n", 40 | "from dotenv import load_dotenv\n", 41 | "\n", 42 | "TOKEN = os.getenv(\"USER_ACCESS_TOKEN\")\n", 43 | "\n", 44 | "load_dotenv()\n", 45 | "\n", 46 | "repo_adresse = os.getenv('repo_adresse') # without https://\n", 47 | "personal_access_token=os.getenv('personal_access_token') # generate this in github \n", 48 | "branch_name = os.getenv('branch_name') # should match the branch that is currently checked out within this project\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "id": "2d9e7962-d85a-4f20-8593-9e0ca93bdb2c", 54 | "metadata": { 55 | "id": "e9e0d7ad-e7a0-4ef5-b008-b6474d0449f1" 56 | }, 57 | "source": [ 58 | "#### setting personal access token --> TODO: get this from a secret vault!"
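The `.env` file itself is intentionally not committed (see the `# Environments` section of the `.gitignore` above). A hypothetical bootstrap for it, matching the `os.getenv()` calls in the cell above, might look like the sketch below; every value is a placeholder, and per the TODO the token should really come from a secret vault:

```python
# Hypothetical bootstrap for the .env file this notebook expects.
# Variable names mirror the os.getenv() calls above; all values are
# placeholders. In production, fetch the token from a secret vault.
from pathlib import Path

env_contents = "\n".join([
    "repo_adresse=github.com/<org>/<repo>.git",  # repo address without https://
    "personal_access_token=<github-pat>",        # generated in GitHub settings
    "branch_name=main",                          # branch checked out in this project
])
Path(".env").write_text(env_contents + "\n")
```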
59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "id": "d52fe0de-10de-45e6-a69a-1528cee38c40", 64 | "metadata": { 65 | "id": "e37c810c-8b5d-448c-be88-bfac1dc21619" 66 | }, 67 | "source": [ 68 | "ditching all possibly existing local changes" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "ba1edbf0-504c-475d-9bf4-a5e7b8ee9383", 75 | "metadata": { 76 | "id": "ba1edbf0-504c-475d-9bf4-a5e7b8ee9383", 77 | "tags": [] 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "!git reset --hard HEAD" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "eeaf8fb6-23b8-48c3-9fd8-a92203eb80e3", 87 | "metadata": { 88 | "id": "5ac99cf8-ce89-49c0-b397-0ab66c60a2ce" 89 | }, 90 | "source": [ 91 | "removing current origin to replace it with an origin with explicit credentials" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "id": "23055252-3170-41fe-918a-9025af772ab8", 98 | "metadata": { 99 | "id": "23055252-3170-41fe-918a-9025af772ab8", 100 | "tags": [] 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "import os\n", 105 | "\n", 106 | "exit_code = os.system(\"git remote remove origin\")\n", 107 | "if exit_code != 0:\n", 108 | " print(\"An error occurred.\")" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "dd53ddaf-3621-4ac1-8026-e35e713fceda", 115 | "metadata": { 116 | "id": "dd53ddaf-3621-4ac1-8026-e35e713fceda", 117 | "tags": [] 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "command=\"git remote add origin https://\"+personal_access_token+\"@\"+repo_adresse\n", 122 | "!{command}" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "id": "162fa803-c479-4c06-81fa-76f5af9d3b28", 129 | "metadata": { 130 | "id": "11465116-165a-4af5-bfbc-5fef6cda7d73", 131 | "tags": [] 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "!git fetch " 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "id": "62cb29ec-2115-4cc1-b6bd-58bbbac00934", 141 | "metadata": { 142 | "id": "04850200-69cf-4031-8c09-44e75f6e5371" 143 | }, 144 | "source": [ 145 | "for some reason git forgets which branch it is looking at right now --> have to set it explicity" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "id": "934d5b9f-a8e5-484e-a770-cfa8ff4f17f2", 152 | "metadata": { 153 | "id": "934d5b9f-a8e5-484e-a770-cfa8ff4f17f2", 154 | "tags": [] 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "command=\"git branch --set-upstream-to=origin/\"+branch_name\n", 159 | "!{command}" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "id": "db3e6c1a-7465-4cf5-aab5-a5cb7ddb7340", 166 | "metadata": { 167 | "id": "db3e6c1a-7465-4cf5-aab5-a5cb7ddb7340", 168 | "tags": [] 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "!git pull " 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "id": "283e1d1c", 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "import os\n", 183 | "#TODO: remove this --> potentially leaking secrets\n", 184 | "for key, value in os.environ.items():\n", 185 | " print(f\"{key}={value}\")" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "id": "0838de24", 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "!git status" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "id": "5dbfac9b", 202 | "metadata": {}, 203 | 
"outputs": [], 204 | "source": [ 205 | "import subprocess\n", 206 | "\n", 207 | "def is_git_up_to_date():\n", 208 | " try:\n", 209 | " # Execute the git status command and get its output as a string\n", 210 | " result = subprocess.run(['git', 'status'], stdout=subprocess.PIPE, check=True)\n", 211 | " output = result.stdout.decode('utf-8')\n", 212 | " \n", 213 | " # Check if the desired string is in the output\n", 214 | " return \"Your branch is up to date\" in output\n", 215 | " except subprocess.CalledProcessError:\n", 216 | " # Handle errors related to the git command\n", 217 | " print(\"Error executing git status. Ensure you're in a git repository.\")\n", 218 | " return False\n", 219 | "\n", 220 | "# Test the function\n", 221 | "print(is_git_up_to_date())" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "id": "8cc5aebc", 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "validation_params = {}\n", 232 | "validation_params['was_succesfull'] = is_git_up_to_date()\n", 233 | "\n", 234 | "\n", 235 | "pipelines_client = WSPipelines.from_token(TOKEN)\n", 236 | "pipelines_client.store_results(validation_params)" 237 | ] 238 | } 239 | ], 240 | "metadata": { 241 | "kernelspec": { 242 | "display_name": "Python 3.10", 243 | "language": "python", 244 | "name": "python3" 245 | }, 246 | "language_info": { 247 | "codemirror_mode": { 248 | "name": "ipython", 249 | "version": 3 250 | }, 251 | "file_extension": ".py", 252 | "mimetype": "text/x-python", 253 | "name": "python", 254 | "nbconvert_exporter": "python", 255 | "pygments_lexer": "ipython3", 256 | "version": "3.10.10" 257 | } 258 | }, 259 | "nbformat": 4, 260 | "nbformat_minor": 5 261 | } 262 | -------------------------------------------------------------------------------- /01-connect_and_validate_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "![Alt text](images/banner.png)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "id": "9501e8a8-d435-4451-a8ae-513e984aafe9" 14 | }, 15 | "source": [ 16 | "## Connection and Data Validation Notebook" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "062a2e71-ddbf-4c77-aaeb-d5aa7ac8d265" 23 | }, 24 | "source": [ 25 | "### Load the Credentials\n", 26 | "\n", 27 | "These environment variables are automatically set in WS Pipelines and are needed to access various services. 
" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "id": "e875ff36-3235-43fb-8008-4bfb334c1325", 35 | "tags": [] 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "import os\n", 40 | "TOKEN = os.getenv(\"USER_ACCESS_TOKEN\")" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "id": "5dc1650f-d3fd-49f3-820d-dbce4ab98d04" 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "## Imports" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": { 58 | "id": "fe00539c-a0ab-4769-ba7b-805adea59cf8", 59 | "tags": [] 60 | }, 61 | "outputs": [ 62 | { 63 | "name": "stderr", 64 | "output_type": "stream", 65 | "text": [ 66 | "2023-10-23 07:24:33.933101: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", 67 | "2023-10-23 07:24:33.933161: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", 68 | "2023-10-23 07:24:33.933204: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", 69 | "2023-10-23 07:24:35.469552: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "from botocore.client import Config\n", 75 | "from sklearn.model_selection import train_test_split\n", 76 | "from dataclasses import dataclass\n", 77 | "import tensorflow_data_validation as tfdv\n", 78 | "import numpy as np\n", 79 | "import pandas as pd\n", 80 | "from ibm_watson_studio_pipelines import WSPipelines\n", 81 | "import warnings\n", 82 | "\n", 83 | "\n", 84 | "warnings.filterwarnings(\"ignore\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "### Loading Variables and Utils from common python file\n", 92 | "\n", 93 | "In this section we load the variables and functions from the common python file. This file contains the variables and functions that are common to all the notebooks in this project." 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "import vars_and_utils as vars_and_utils" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": { 108 | "id": "d44e72ca-25cf-44c2-a8f4-4ad6eff50e4a" 109 | }, 110 | "source": [ 111 | "## Load the Training Data \n", 112 | "\n", 113 | "this will check if the training data exists within a defined db2 table. If it does not exist, it will load the data from the web and store it in the project space as a .csv file." 
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 14, 119 | "metadata": { 120 | "id": "0487c397-3d75-4292-ae6f-8f2cd4bb8f19", 121 | "tags": [] 122 | }, 123 | "outputs": [ 124 | { 125 | "ename": "NameError", 126 | "evalue": "name 'training_file_path' is not defined", 127 | "output_type": "error", 128 | "traceback": [ 129 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 130 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 131 | "Cell \u001b[0;32mIn[14], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m gcr_df \u001b[38;5;241m=\u001b[39m load_data_from_project(\u001b[43mtraining_file_path\u001b[49m)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m## Encode for ease of use with OpenScale\u001b[39;00m\n\u001b[1;32m 4\u001b[0m gcr_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mRisk\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m gcr_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mRisk\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mmap({\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mRisk\u001b[39m\u001b[38;5;124m'\u001b[39m:\u001b[38;5;241m1\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mNo Risk\u001b[39m\u001b[38;5;124m'\u001b[39m:\u001b[38;5;241m0\u001b[39m})\n", 132 | "\u001b[0;31mNameError\u001b[0m: name 'training_file_path' is not defined" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "gcr_df = vars_and_utils.load_german_credit_risk_data()\n", 138 | "\n", 139 | "## Encode for ease of use with OpenScale\n", 140 | "gcr_df['Risk'] = gcr_df['Risk'].map({'Risk':1,'No Risk':0})\n", 141 | "gcr_df.head()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": { 147 | "id": "e38d5859-3df7-4174-8f8c-a047c0dcdb3c" 148 | }, 149 | "source": [ 150 | "## Data Validation " 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "id": "37d30747-1f1d-42ad-ab50-6001740df627", 158 | "tags": [] 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "@dataclass\n", 163 | "class Datavalidation:\n", 164 | " \"\"\"\n", 165 | " \n", 166 | " Data Validation Class\n", 167 | " \n", 168 | " \"\"\"\n", 169 | " dataframe : pd.DataFrame\n", 170 | " mask_per :int\n", 171 | " \n", 172 | " \n", 173 | " def split_data(self,seed=32):\n", 174 | " \"\"\"\n", 175 | " Split Data into Train and Test Splits\n", 176 | " \n", 177 | " \"\"\"\n", 178 | " np.random.seed(seed)\n", 179 | " mask = np.random.rand(len(self.dataframe)) <= self.mask_per\n", 180 | " training_data = gcr_df[mask]\n", 181 | " testing_data = gcr_df[~mask]\n", 182 | "\n", 183 | " print(f\"No. of training examples: {training_data.shape[0]}\")\n", 184 | " print(f\"No. 
of testing examples: {testing_data.shape[0]}\")\n", 185 | " \n", 186 | " return training_data, testing_data\n", 187 | " \n", 188 | " # TODO: Replace with Db2/filesystem\n", 189 | " def save_data_in_filesystem(self,df,filename):\n", 190 | " \"\"\"\n", 191 | " Save Data in Filesystem\n", 192 | "\n", 193 | " Passed filename should include the path\n", 194 | "\n", 195 | " \"\"\"\n", 196 | " try:\n", 197 | " df.to_csv(filename,index=False)\n", 198 | " print(f\"File {filename} persisted successfully\")\n", 199 | " except Exception as e:\n", 200 | " print(e)\n", 201 | " print(f\"File serialization for {filename} failed\")\n", 202 | " \n", 203 | " def generate_statistics(self,df):\n", 204 | " \"\"\"\n", 205 | " \n", 206 | " Generate Statistics on a given Dataframe\n", 207 | " \n", 208 | " \"\"\"\n", 209 | " train_stats = tfdv.generate_statistics_from_dataframe(df)\n", 210 | " tfdv.visualize_statistics(train_stats)\n", 211 | " return train_stats\n", 212 | " \n", 213 | " def inferSchema(self,stats):\n", 214 | " \n", 215 | " \"\"\"\n", 216 | " Infer a Schema from the given statistics\n", 217 | " \n", 218 | " \"\"\"\n", 219 | " schema = tfdv.infer_schema(statistics=stats)\n", 220 | " tfdv.display_schema(schema=schema)\n", 221 | " return schema\n", 222 | " \n", 223 | " def compare_statistics(self,lhs,rhs):\n", 224 | " \"\"\"\n", 225 | " \n", 226 | " Compare statistics between a test dataset and a reference (training) dataset\n", 227 | " \n", 228 | " \"\"\"\n", 229 | " # Compare evaluation data with training data\n", 230 | " tfdv.visualize_statistics(lhs_statistics=lhs, rhs_statistics=rhs,\n", 231 | " lhs_name='TEST_DATASET', rhs_name='TRAIN_DATASET')\n", 232 | " \n", 233 | " \n", 234 | " def check_for_anomalies(self,testable_stats,ref_schema):\n", 235 | " \"\"\"\n", 236 | " \n", 237 | " Check for anomalies based on the given statistics and schema\n", 238 | " \n", 239 | " \"\"\"\n", 240 | " anomalies = tfdv.validate_statistics(statistics=testable_stats, schema=ref_schema)\n", 241 | " tfdv.display_anomalies(anomalies)\n", 242 | " if len(anomalies.anomaly_info.items()) > 0:\n", 243 | " print(\"Anomalies found in dataset...\")\n", 244 | " print(str(anomalies.anomaly_info.items()))\n", 245 | " return True\n", 246 | " else:\n", 247 | " return False" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": { 253 | "id": "fff2bb05-abd0-4e0a-9727-d1127a617f57" 254 | }, 255 | "source": [ 256 | "### Split Data into Train and Eval Splits to Check for Consistency" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "id": "65163ff9-3193-4837-ab55-d66de3a5076f", 264 | "tags": [] 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "classvalidate = Datavalidation(dataframe=gcr_df,mask_per=0.8) \n", 269 | "\n", 270 | "training_data, testing_data = classvalidate.split_data()" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": { 276 | "id": "75029b2c-6341-4a59-957c-cbb2d33e3e39" 277 | }, 278 | "source": [ 279 | "## Generate Training Stats on both Splits" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": { 286 | "id": "36a97f3f-3cd7-493d-8a4b-c3c87ae0710f", 287 | "tags": [] 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "train_stats = classvalidate.generate_statistics(training_data)\n", 292 | "test_stats = classvalidate.generate_statistics(testing_data)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": { 298 | "id":
"ed4d3f56-e852-4269-a3dd-8426d71bed8e" 299 | }, 300 | "source": [ 301 | "## Infer Data Schemas" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": { 308 | "id": "13f7be93-3fb1-49ca-9238-ebb1fcc1af28", 309 | "tags": [] 310 | }, 311 | "outputs": [], 312 | "source": [ 313 | "train_schema = classvalidate.inferSchema(train_stats)\n", 314 | "test_schema = classvalidate.inferSchema(test_stats)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": { 320 | "id": "ea9f07ac-ac4b-4ecd-8840-a5ab07bfb7f8" 321 | }, 322 | "source": [ 323 | "## Compare Eval and Train Data " 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": { 330 | "id": "f5ccfb5d-05bd-4b3b-9f4d-8082252457c3", 331 | "tags": [] 332 | }, 333 | "outputs": [], 334 | "source": [ 335 | "classvalidate.compare_statistics(lhs=test_stats,rhs=train_stats)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": { 341 | "id": "915f6a20-aab3-47b5-86e9-93bde2f548ff" 342 | }, 343 | "source": [ 344 | "## Check For Data Anomalies " 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": { 350 | "id": "eae32b5f-46b3-4e5d-919e-68c206a47b8e" 351 | }, 352 | "source": [ 353 | "### Check eval data for errors by validating the eval data stats using the previously inferred schema." 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "id": "f2927d8b-fa0f-420f-9181-a080cdfcb748", 361 | "tags": [] 362 | }, 363 | "outputs": [], 364 | "source": [ 365 | "anomaly_status = classvalidate.check_for_anomalies(test_stats,train_schema)\n", 366 | "anomaly_status" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "id": "c914eade-25d9-497a-960e-ec5a57282def" 373 | }, 374 | "source": [ 375 | "## Save Train and Test Data for Data Preparation Stage" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": { 382 | "id": "0f1d08cc-e4f7-4010-8b4e-7414529e874a", 383 | "tags": [] 384 | }, 385 | "outputs": [], 386 | "source": [ 387 | "# TODO: Replace with Db2/fileystem\n", 388 | "if not anomaly_status:\n", 389 | " classvalidate.save_data_in_filesystem(df=training_data,filename=vars_and_utils.train_data_path)\n", 390 | " classvalidate.save_data_in_filesystem(df=testing_data,filename=vars_and_utils.test_data_path)" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": { 396 | "id": "23d79a03-46e7-49de-a549-65797ba3d46c" 397 | }, 398 | "source": [ 399 | "## Check if the validation steps were successful\n", 400 | "This checks if anomalies were found and if the data was successfully split into train and eval splits and stored as files." 
401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "def validation_successful(train_data_path, test_data_path):\n", 410 | " if anomaly_status: # anomalies were found in the data\n", 411 | " return False\n", 412 | " elif not os.path.exists(train_data_path): # train data file is missing\n", 413 | " return False\n", 414 | " elif not os.path.exists(test_data_path): # test data file is missing\n", 415 | " return False\n", 416 | " else:\n", 417 | " print(\"validation of the data successful\")\n", 418 | " return True\n", 419 | " \n", 420 | "validation_successful(vars_and_utils.train_data_path, vars_and_utils.test_data_path)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": { 426 | "id": "b42568a9-36f8-407e-be7a-f8bbbf93444a" 427 | }, 428 | "source": [ 429 | "## Register the output variables for the next pipeline stage\n", 430 | "Every notebook outputs a \"was_successful\" boolean variable. The logic behind this is different for every notebook and can be altered to fit the needs of the project.\n", 431 | "If needed, additional variables can be created here, but they also need to be registered as output variables in the Watson Pipelines UI." 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": { 438 | "id": "9f210efd-7358-41a4-9ab5-37a856f3ab47", 439 | "tags": [] 440 | }, 441 | "outputs": [], 442 | "source": [ 443 | "validation_params = {}\n", 444 | "validation_params['was_succesfull'] = validation_successful(vars_and_utils.train_data_path, vars_and_utils.test_data_path)\n", 445 | "\n", 446 | "pipelines_client = WSPipelines.from_token(TOKEN)\n", 447 | "pipelines_client.store_results(validation_params)" 448 | ] 449 | } 450 | ], 451 | "metadata": { 452 | "kernelspec": { 453 | "display_name": "Python 3.10", 454 | "language": "python", 455 | "name": "python3" 456 | }, 457 | "language_info": { 458 | "codemirror_mode": { 459 | "name": "ipython", 460 | "version": 3 461 | }, 462 | "file_extension": ".py", 463 | "mimetype": "text/x-python", 464 | "name": "python", 465 | "nbconvert_exporter": "python", 466 | "pygments_lexer": "ipython3", 467 | "version": "3.10.10" 468 | }, 469 | "vscode": { 470 | "interpreter": { 471 | "hash": "bd385fe162c5ca0c84973b7dd5c518456272446b2b64e67c2a69f949ca7a1754" 472 | } 473 | } 474 | }, 475 | "nbformat": 4, 476 | "nbformat_minor": 4 477 | } 478 | -------------------------------------------------------------------------------- /02-data_preparation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "![Alt text](images/banner.png)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "id": "4292d1bf-4e75-4a75-b8ca-914d4f58d925" 14 | }, 15 | "source": [ 16 | "## Data Preparation Notebook" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Initial Setup\n", 24 | "\n", 25 | "Some initial setup specific to running this notebook as part of the pipeline. 
" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "id": "db343866-6051-45ab-a62d-a1afef8b9428" 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import os\n", 37 | "#This environment variable is automatically set in WS Pipelines and are needed to access various services.\n", 38 | "TOKEN = os.getenv(\"USER_ACCESS_TOKEN\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "id": "dc2767f8-217a-4dca-a18e-1939eea8cd1f" 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "if os.getenv(\"running_in_production_pipeline\"):\n", 50 | " running_in_production_pipeline = True\n", 51 | " # If you want to run additional steps when deploying to production like reporting to external services, you can use this variable to trigger that\n", 52 | " # It can also be used to skip steps that are only needed in development like plotting\n", 53 | " print(\"notebook is running in a production pipeline!\")\n", 54 | "else:\n", 55 | " running_in_production_pipeline = False" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": { 61 | "id": "1e30a381-886e-4710-8874-3ff365e537cd" 62 | }, 63 | "source": [ 64 | "## Imports" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "id": "f161da10-75e1-4e66-8859-345ff5fcb899", 72 | "tags": [] 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "from sklearn.feature_selection import SelectKBest\n", 77 | "from sklearn.feature_selection import chi2\n", 78 | "from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,LabelEncoder,MinMaxScaler\n", 79 | "from sklearn.feature_selection import mutual_info_classif\n", 80 | "from sklearn.compose import ColumnTransformer\n", 81 | "from sklearn.pipeline import Pipeline\n", 82 | "from botocore.client import Config\n", 83 | "from ibm_watson_studio_pipelines import WSPipelines\n", 84 | "import matplotlib.pyplot as plt\n", 85 | "import heapq\n", 86 | "import os\n", 87 | "import pandas as pd\n", 88 | "\n", 89 | "# Loading Variables and Utils from common python file\n", 90 | "import vars_and_utils as vars_and_utils\n" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": { 96 | "id": "538bf340-c35f-439a-af79-ad9f5a0c6389" 97 | }, 98 | "source": [ 99 | "## Preparing the Train Data " 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "id": "aa1cd88c-f7df-4b8c-a908-49e84c4de6e5", 107 | "tags": [] 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "train_data = vars_and_utils.load_data_from_filesystem(vars_and_utils.train_data_path)\n", 112 | "train_data.head()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "id": "08be57ad-b834-42f8-bb29-4f66474a0b3b", 120 | "tags": [] 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "object_df = train_data.select_dtypes('O')\n", 125 | "object_df.head()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "id": "c3bf8612-3433-4810-b5c5-3b6d9d23d57d", 133 | "tags": [] 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "object_cols = list(set(object_df.columns.tolist()) - set(['Risk']))\n", 138 | "object_cols" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "id": "27f48dcb-7dee-44a8-b8ed-2ed58385912f", 146 | "tags": [] 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "numerical_columns 
= [col for col in train_data.columns.tolist() if col not in object_cols and col!='Risk']" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": { 156 | "id": "10c03047-c200-4f7e-91e6-0bdb31554fea" 157 | }, 158 | "source": [ 159 | "## Preparing the Test Data " 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "id": "ca4e5bb6-b11e-48cd-8457-a127a60b06c8", 167 | "tags": [] 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "test_data = vars_and_utils.load_data_from_filesystem(vars_and_utils.test_data_path)\n", 172 | "test_data.head()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "id": "f31f399e-904c-49f0-9e2b-12b042d8e8b1" 179 | }, 180 | "source": [ 181 | "## Split the data sets " 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "id": "2dfd43ce-7cb2-4db1-b76d-d2ab6c796b2c", 189 | "tags": [] 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "y_train = train_data['Risk']\n", 194 | "X_train = train_data.drop(\"Risk\",axis=1)\n", 195 | "\n", 196 | "\n", 197 | "y_test = test_data['Risk']\n", 198 | "X_test = test_data.drop(\"Risk\",axis=1)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": { 204 | "id": "0af40507-eb8d-4e9c-a263-ab624f657375" 205 | }, 206 | "source": [ 207 | "## Categorical Feature Analysis " 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "id": "f97b9595-c499-4dbd-b143-85310dfa73bd", 215 | "tags": [] 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "def prepare_input_data(X_train, X_test):\n", 220 | " oe = OrdinalEncoder()\n", 221 | " oe.fit(X_train)\n", 222 | " X_train_enc = oe.transform(X_train)\n", 223 | " X_test_enc = oe.transform(X_test)\n", 224 | " return X_train_enc, X_test_enc\n", 225 | "\n", 226 | "\n", 227 | "def prepare_output_data(y_train, y_test):\n", 228 | " le = LabelEncoder()\n", 229 | " le.fit(y_train)\n", 230 | " y_train_enc = le.transform(y_train)\n", 231 | " y_test_enc = le.transform(y_test)\n", 232 | " return y_train_enc, y_test_enc\n", 233 | "\n", 234 | "\n", 235 | "def select_best_chi2_features(X_train, y_train, X_test,score_func=chi2):\n", 236 | " featureselector = SelectKBest(score_func=score_func, k='all')\n", 237 | " featureselector.fit(X_train, y_train)\n", 238 | " X_train_best_feat = featureselector.transform(X_train)\n", 239 | " X_test_best_feat= featureselector.transform(X_test)\n", 240 | " return X_train_best_feat, X_test_best_feat, featureselector\n", 241 | "\n", 242 | "\n", 243 | "def select_best_mutualinf_features(X_train, y_train, X_test,k=5):\n", 244 | " featureselector = SelectKBest(score_func=mutual_info_classif, k=k)\n", 245 | " featureselector.fit(X_train, y_train)\n", 246 | " X_train_best_feat = featureselector.transform(X_train)\n", 247 | " X_test_best_feat= featureselector.transform(X_test)\n", 248 | " return X_train_best_feat, X_test_best_feat, featureselector\n", 249 | " \n", 250 | " \n", 251 | "def get_top_k_catgeorical(fs,train_cat,k=10):\n", 252 | " fs_score_map = {}\n", 253 | " for i in range(len(fs.scores_)):\n", 254 | " #print(f\"Feature {train_cat.columns.tolist()[i]} {fs.scores_[i]}\")\n", 255 | " fs_score_map[train_cat.columns.tolist()[i]] = fs.scores_[i]\n", 256 | " \n", 257 | " k_keys_sorted_by_values = heapq.nlargest(k, fs_score_map, key=fs_score_map.get)\n", 258 | " \n", 259 | " return k_keys_sorted_by_values" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {
265 | "id": "660e9d2f-34db-4ecc-9fd5-970462fc2009" 266 | }, 267 | "source": [ 268 | "## Encode and shape the Variables " 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "id": "01ed54fe-7afa-4bc4-a37b-b85e6bf6cdc4", 276 | "tags": [] 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "X_train_enc, X_test_enc = prepare_input_data(X_train[object_cols], X_test[object_cols])\n", 281 | "\n", 282 | "y_train_enc, y_test_enc = prepare_output_data(y_train, y_test)\n", 283 | "\n", 284 | "X_train_fs, X_test_fs, fs = select_best_chi2_features(X_train_enc, y_train_enc, X_test_enc)\n", 285 | "\n" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": { 291 | "id": "8b210746-a131-4ac1-84db-105848cdbf8e" 292 | }, 293 | "source": [ 294 | "## Top K Categorical Features based on Chi2" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "id": "889c067d-bbb9-4655-8c52-03769449b649", 302 | "tags": [] 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "top_k_cat = get_top_k_catgeorical(fs,X_train[object_cols])\n", 307 | "top_k_cat" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": { 313 | "id": "0a017731-d213-43da-8fbb-67b98b28f8a8" 314 | }, 315 | "source": [ 316 | "## Top K Categorical Features based on Mutual Information Feature Selection" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "id": "ccb4d703-e822-4e9d-b66b-bc19c0ea94e3", 324 | "tags": [] 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "X_train_enc_mf, X_test_enc_mf = prepare_input_data(X_train[object_cols], X_test[object_cols])\n", 329 | "\n", 330 | "y_train_enc_mf, y_test_enc_mf = prepare_output_data(y_train, y_test)\n", 331 | "\n", 332 | "X_train_fs_mf, X_test_fs_mf, fs_mf = select_best_mutualinf_features(X_train_enc_mf, y_train_enc_mf, X_test_enc_mf)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": { 339 | "id": "577ace08-4e4f-4a97-a56c-7d26b610bec0", 340 | "tags": [] 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "top_k_cat_mf = get_top_k_catgeorical(fs_mf,X_train[object_cols])\n", 345 | "top_k_cat_mf" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": { 352 | "id": "faefd6b2-000e-4545-8607-e8ed0efbe772", 353 | "tags": [] 354 | }, 355 | "outputs": [], 356 | "source": [ 357 | "union_features = list(set(top_k_cat+top_k_cat_mf))\n", 358 | "if \"Sex\" not in union_features:\n", 359 | " union_features.append(\"Sex\")\n", 360 | "union_features" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": { 366 | "id": "74b9d9c5-636d-4504-841b-53ef8e072462" 367 | }, 368 | "source": [ 369 | "## Filter the Top K Categorical features and Merge to Original Train and Test Dataframes" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "id": "54680c6a-59b4-447e-bfda-cf0ac7076aef", 377 | "tags": [] 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "X_train_object_filtered = X_train[union_features]\n", 382 | "X_test_object_filtered = X_test[union_features]\n", 383 | "\n", 384 | "X_train_final = pd.concat([X_train[numerical_columns],X_train_object_filtered],axis=1)\n", 385 | "\n", 386 | "X_test_final = pd.concat([X_test[numerical_columns],X_test_object_filtered],axis=1)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": { 392 | "id":
"c997be4b-7089-49f6-b51a-048d8003e0bc" 393 | }, 394 | "source": [ 395 | "## Use Column Transformer and Pipelines to encode the Input and Output Variables . Scale the Numerical columns using MinMaxScaler." 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": { 402 | "id": "c2b11376-2c27-47e6-ba1e-db772d63ca12", 403 | "tags": [] 404 | }, 405 | "outputs": [], 406 | "source": [ 407 | "numerical_ix = X_train_final.select_dtypes(include=['int64', 'float64']).columns\n", 408 | "categorical_ix = X_train_final.select_dtypes(include=['object', 'bool']).columns" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": { 415 | "id": "b2b5304f-43f4-4509-bf06-9d901fe6b666", 416 | "tags": [] 417 | }, 418 | "outputs": [], 419 | "source": [ 420 | "encoding_steps = [('cat', OrdinalEncoder(), categorical_ix), ('num', MinMaxScaler(), numerical_ix)]\n", 421 | "col_transform = ColumnTransformer(transformers=encoding_steps)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": { 428 | "id": "40b974d3-afc0-46a0-a490-01984513cdae", 429 | "tags": [] 430 | }, 431 | "outputs": [], 432 | "source": [ 433 | "pipeline = Pipeline(steps=[('prep',col_transform)])" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": { 440 | "id": "221e73d5-e43a-4a1e-a4b0-1203a67ce96e", 441 | "tags": [] 442 | }, 443 | "outputs": [], 444 | "source": [ 445 | "train_final = pd.concat([X_train_final,y_train],axis=1)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": { 452 | "id": "2b074997-65a5-4e8a-8ab5-c8594938c4ae", 453 | "tags": [] 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "test_final = pd.concat([X_test_final,y_test],axis=1)" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": { 463 | "id": "ad254eee-5c2d-4f2f-802b-9bea72d23d16" 464 | }, 465 | "source": [ 466 | "## Save the Prepared Data to the project filesystem" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": { 473 | "id": "4ac74b08-e76a-435c-8ded-95a4e6907e4b", 474 | "tags": [] 475 | }, 476 | "outputs": [], 477 | "source": [ 478 | "vars_and_utils.save_data_in_filesystem(df=train_final, filename=vars_and_utils.train_data_path)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": { 485 | "id": "5de8e8a2-0401-4fcf-bf1d-e64f9d15c325", 486 | "tags": [] 487 | }, 488 | "outputs": [], 489 | "source": [ 490 | "vars_and_utils.save_data_in_filesystem(df=test_final, filename=vars_and_utils.test_data_path)" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": { 497 | "id": "17576e9f-73ee-422c-8c41-df8debc9a743", 498 | "tags": [] 499 | }, 500 | "outputs": [], 501 | "source": [ 502 | "vars_and_utils.save_data_in_filesystem(df=pipeline, filename=vars_and_utils.pipeline_path)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": { 508 | "id": "b3b2d02f-82bf-4021-8c4d-1a80e8f9d393" 509 | }, 510 | "source": [ 511 | "## Custom succes check: Check if files have been succesfully created " 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": { 518 | "id": "c50b68a4-00b7-474a-9b8b-e750eb28a93d", 519 | "tags": [] 520 | }, 521 | "outputs": [], 522 | "source": [ 523 | "data_prep_done = os.path.exists(vars_and_utils.train_data_path) 
and os.path.exists(vars_and_utils.test_data_path) and os.path.exists(vars_and_utils.pipeline_path)\n", 524 | "data_prep_done" 525 | ] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": { 530 | "id": "da0ba538-74ac-45ff-a262-0a6420ff8759" 531 | }, 532 | "source": [ 533 | "## Register the output variables for the next pipeine stage\n", 534 | "every notebook outputs a \"was_successful\" boolean variable. The logic behind this is different for every notebook and can be altered to fit the needs of the project.\n", 535 | "If needed additional variables can be created here but they also need to registered as output variables in the Watson Pipelines UI." 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": { 542 | "id": "e977c379-c462-4e9e-91c5-3ef319062156", 543 | "tags": [] 544 | }, 545 | "outputs": [], 546 | "source": [ 547 | "preparation_params = {}\n", 548 | "preparation_params['was_succesfull'] = data_prep_done\n", 549 | "\n", 550 | "pipelines_client = WSPipelines.from_token(TOKEN)\n", 551 | "pipelines_client.store_results(preparation_params)" 552 | ] 553 | } 554 | ], 555 | "metadata": { 556 | "kernelspec": { 557 | "display_name": "Python 3.10", 558 | "language": "python", 559 | "name": "python3" 560 | }, 561 | "language_info": { 562 | "codemirror_mode": { 563 | "name": "ipython", 564 | "version": 3 565 | }, 566 | "file_extension": ".py", 567 | "mimetype": "text/x-python", 568 | "name": "python", 569 | "nbconvert_exporter": "python", 570 | "pygments_lexer": "ipython3", 571 | "version": "3.10.10" 572 | }, 573 | "vscode": { 574 | "interpreter": { 575 | "hash": "bd385fe162c5ca0c84973b7dd5c518456272446b2b64e67c2a69f949ca7a1754" 576 | } 577 | } 578 | }, 579 | "nbformat": 4, 580 | "nbformat_minor": 4 581 | } 582 | -------------------------------------------------------------------------------- /04-deploy_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "![Alt text](images/banner.png)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "id": "9909ba97-1cfe-495b-bd18-d663ea13c7fa" 14 | }, 15 | "source": [ 16 | "## Deploy the Saved Model in the project to Deployment Space" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Initial Setup\n", 24 | "\n", 25 | "Some initial setup specific to running this notebook as part of the pipeline." 
26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import os\n", 35 | "#This environment variable is automatically set in WS Pipelines and are needed to access various services.\n", 36 | "TOKEN = os.getenv(\"USER_ACCESS_TOKEN\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "if os.getenv(\"running_in_production_pipeline\"):\n", 46 | " running_in_production_pipeline = True\n", 47 | " # If you want to run additional steps when deploying to production like reporting to external services, you can use this variable to trigger that\n", 48 | " # It can also be used to skip steps that are only needed in development like plotting\n", 49 | " print(\"notebook is running in a production pipeline!\")\n", 50 | "else:\n", 51 | " running_in_production_pipeline = False\n", 52 | " print(\"notebook is running in a development enviroment!\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 26, 58 | "metadata": { 59 | "id": "d600fa56-ad81-4587-95c4-3f6c67e211c4", 60 | "tags": [] 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "from ibm_cloud_sdk_core.authenticators import IAMAuthenticator\n", 65 | "from ibm_watson_machine_learning import APIClient\n", 66 | "# from ibm_aigov_facts_client import AIGovFactsClient #removing due to current issues --> put back ASAP\n", 67 | "from ibm_watson_studio_pipelines import WSPipelines\n", 68 | "from botocore.client import Config\n", 69 | "import ibm_boto3\n", 70 | "import pandas as pd\n", 71 | "import json\n", 72 | "import os\n", 73 | "import requests\n", 74 | "import pickle\n", 75 | "import vars_and_utils as vars_and_utils\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": { 81 | "id": "c892c8cd-46b7-4a16-85cf-0ac039fd8a61" 82 | }, 83 | "source": [ 84 | "## Instantiate WML Client" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "id": "6a324dbf-3194-4a1b-897e-3a74d7ec1346", 92 | "tags": [] 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "WML_CREDENTIALS = {\n", 97 | " \"token\": TOKEN,\n", 98 | " \"instance_id\" : \"openshift\",\n", 99 | " \"url\": os.environ['RUNTIME_ENV_APSX_URL'],\n", 100 | " \"version\": \"4.6\"\n", 101 | "}\n", 102 | "\n", 103 | "wml_client = APIClient(WML_CREDENTIALS)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "if running_in_production_pipeline:\n", 113 | " deployment_space_id=vars_and_utils.deployment_space_id_PROD\n", 114 | "else:\n", 115 | " deployment_space_id=vars_and_utils.deployment_space_id_DEV\n", 116 | " \n", 117 | "deployment_space_id\n", 118 | " " 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "id": "08df0fad-f279-483f-87dc-e0b8d96f14bf", 126 | "tags": [] 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "wml_client.set.default_space(deployment_space_id)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": { 136 | "id": "288ce55e-9b82-4185-8297-a399f1648cea", 137 | "tags": [] 138 | }, 139 | "source": [ 140 | "# Deserialize model\n", 141 | "\n", 142 | "TODO: Later get model from model inventory instead" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "id": "e5bd323f-cf79-4ab1-ae8c-c81a14d81f3b", 150 | 
"tags": [] 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "with open(vars_and_utils.model_path, 'rb') as f:\n", 155 | " model = pickle.load(f)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": { 161 | "id": "40fc914b-9675-46a3-ad8f-e6f53dc72437" 162 | }, 163 | "source": [ 164 | "### Load Sample Data " 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "id": "1f16e30e-c9cb-4581-bead-352c127018d4", 172 | "tags": [] 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "payload_data = vars_and_utils.load_data_from_filesystem(vars_and_utils.test_data_path)\n", 177 | "payload_data = payload_data.drop('Risk',axis=1)\n", 178 | "fields = payload_data.columns.tolist()\n", 179 | "values = [payload_data.values.tolist()[0]]\n", 180 | "\n", 181 | "payload_scoring = {\"input_data\": [{\"fields\": fields, \"values\": values}]}\n", 182 | "json.dumps(payload_scoring)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "id": "eb8a1006-f0a0-4223-8c67-2f4aafe610dd", 190 | "tags": [] 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "# TODO: Move to notebook 3 and skip (de)serialization process\n", 195 | "software_spec_uid = wml_client.software_specifications.get_id_by_name(\"runtime-22.2-py3.10\")\n", 196 | "model_props_gbt = {\n", 197 | " wml_client.repository.ModelMetaNames.NAME: vars_and_utils.model_name,\n", 198 | " wml_client.repository.ModelMetaNames.DESCRIPTION: vars_and_utils.model_name,\n", 199 | " wml_client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: software_spec_uid,\n", 200 | " wml_client.repository.ModelMetaNames.TYPE: \"scikit-learn_1.1\"\n", 201 | "}\n", 202 | "\n", 203 | "published_model_details = wml_client.repository.store_model(model=model, meta_props=model_props_gbt, training_data=fields,training_target=values)\n", 204 | "print(published_model_details)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "id": "920713be-ff4d-4fa3-8dd8-81623801b641", 212 | "tags": [] 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "model_id = wml_client.repository.get_model_id(published_model_details)\n", 217 | "model_id" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "# There is an issue with WML Deployments right now. TODO: fix this\n", 227 | "# This is the ID of the model that was uploaded manually --> that works\n", 228 | "model_id = \"65c060dc-ea5b-4c48-86fe-acb97853b5df\"\n", 229 | "# This is fake and we need to fix this!" 
230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": { 235 | "id": "ad6fe0e3-d01e-4363-83c0-4ac57bd9e3dc" 236 | }, 237 | "source": [ 238 | "## Promote the Model to deployment space and Deploy the Model" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "id": "d25ed89c-62dd-4bc8-8644-3439c9586f8b", 246 | "tags": [] 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "meta_data = {\n", 251 | " wml_client.deployments.ConfigurationMetaNames.NAME: vars_and_utils.deployment_name,\n", 252 | " wml_client.deployments.ConfigurationMetaNames.ONLINE: {},\n", 253 | " wml_client.deployments.ConfigurationMetaNames.HARDWARE_SPEC: {\n", 254 | " \"name\": \"S\",\n", 255 | " \"num_nodes\": 1,\n", 256 | " }\n", 257 | "}\n", 258 | "\n", 259 | "deployment_details = wml_client.deployments.create(model_id, meta_props=meta_data)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": { 266 | "id": "4802233a-fbc2-458e-91b0-a859a2f57d66", 267 | "tags": [] 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "deployment_uid = wml_client.deployments.get_id(deployment_details)\n", 272 | "deployment_uid" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": { 278 | "id": "728d19a0-395f-4b11-a5cd-df099ea4abdb", 279 | "tags": [] 280 | }, 281 | "source": [ 282 | "## Score the Endpoint" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": { 288 | "id": "5f93a3a4-aaa5-4304-a513-e887619e6079" 289 | }, 290 | "source": [ 291 | "### Model Testing on the Serving Endpoint\n", 292 | "\n" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": { 299 | "id": "cb6e54f2-aed6-4f1e-8ec9-8297454f6a07", 300 | "tags": [] 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "predictions = wml_client.deployments.score(deployment_uid, payload_scoring)\n", 305 | "predictions" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": { 311 | "id": "cff76dc2-be3f-48a6-a583-a75d6583e1da" 312 | }, 313 | "source": [ 314 | "### Test for Downstream Apps without using WML SDK." 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": { 321 | "id": "ff65ef37-ba78-4149-9f06-6d2744b1a668", 322 | "tags": [] 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "# deploy_done is true if deployment_uid and model_id are not null\n", 327 | "deploy_done = bool(deployment_uid) and bool(model_id)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": { 333 | "id": "a4032cce-515e-4509-810d-51d216ed9ac3" 334 | }, 335 | "source": [ 336 | "## Register the output variables for the next pipeine stage\n", 337 | "every notebook outputs a \"was_successful\" boolean variable. The logic behind this is different for every notebook and can be altered to fit the needs of the project.\n", 338 | "If needed additional variables can be created here but they also need to registered as output variables in the Watson Pipelines UI." 
339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": { 345 | "id": "96ccb5fc-9d30-42c8-ac75-4141e7c663df", 346 | "tags": [] 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "deployment_done = {}\n", 351 | "deployment_done['was_succesfull'] = deploy_done\n", 352 | "deployment_done['deployment_id'] = deployment_uid\n", 353 | "deployment_done['model_id'] = model_id\n", 354 | "\n", 355 | "pipelines_client = WSPipelines.from_token(TOKEN)\n", 356 | "pipelines_client.store_results(deployment_done)" 357 | ] 358 | } 359 | ], 360 | "metadata": { 361 | "kernelspec": { 362 | "display_name": "Python 3.10", 363 | "language": "python", 364 | "name": "python3" 365 | }, 366 | "language_info": { 367 | "codemirror_mode": { 368 | "name": "ipython", 369 | "version": 3 370 | }, 371 | "file_extension": ".py", 372 | "mimetype": "text/x-python", 373 | "name": "python", 374 | "nbconvert_exporter": "python", 375 | "pygments_lexer": "ipython3", 376 | "version": "3.10.10" 377 | }, 378 | "vscode": { 379 | "interpreter": { 380 | "hash": "bd385fe162c5ca0c84973b7dd5c518456272446b2b64e67c2a69f949ca7a1754" 381 | } 382 | } 383 | }, 384 | "nbformat": 4, 385 | "nbformat_minor": 4 386 | } 387 | -------------------------------------------------------------------------------- /05_monitor_deployment.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","metadata":{"collapsed":true,"id":"75feab07-55ef-4436-ab6c-48d5c2d133e4","jupyter":{"outputs_hidden":true}},"source":["\"banner\""]},{"cell_type":"markdown","metadata":{"id":"b18c0eec7d1c46f6b8fe4c779c478b7b"},"source":["# Working with Watson OpenScale - Headless Subscription"]},{"cell_type":"markdown","metadata":{"id":"fc3ebd630e524b3e812e2d17d603e5c9"},"source":["# Pipeline variables"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"27e86130ede246668dc25badce38f298"},"outputs":[],"source":["# Deploymentspace ID\n","space_uid = 'c4238e9c-1cbd-4776-aa6e-4f6b1f865ed1'\n","\n","# DeploymentID of the model that will be monitored\n","# TODO: make 04_deploy_models emit this and then use that URL\n","deployment_uid = '66271495-2e3e-4ab2-ae2f-521330555bdf'"]},{"cell_type":"markdown","metadata":{"id":"1b7933c7711e412582e7dbf0b4c1ade8"},"source":["# Credentials"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"f3505ba0c4fe4866a93ff0e810e90fe4"},"outputs":[],"source":["from ibm_watson_machine_learning import APIClient\n","import os\n","from dotenv import load_dotenv\n","# Loading Variables and Utils from common python file\n","import vars_and_utils as vars_and_utils\n","\n","load_dotenv()\n","\n","token = os.environ['USER_ACCESS_TOKEN']\n","cpd_technical_user = os.environ['cpd_technical_user']\n","cpd_technical_user_password = os.environ['cpd_technical_user_password']\n","cpd_url = os.environ['cpd_url']\n","\n","\n","\n","WML_CREDENTIALS = {\n"," \"token\": token,\n"," \"instance_id\" : \"openshift\",\n"," \"url\": os.environ['RUNTIME_ENV_APSX_URL'],\n"," \"version\": \"4.7\"\n","}\n","\n","\n","\n","WOS_CREDENTIALS = {\n"," \"url\": cpd_url,\n"," \"username\": cpd_technical_user,\n"," \"password\": cpd_technical_user_password,\n"," \"version\": \"4.7\"\n","}"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["print(WOS_CREDENTIALS)"]},{"cell_type":"markdown","metadata":{"id":"262ffb75c64b4523bfda24ea95d4b8f5"},"source":["# WOS name 
definitions"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"1f4872bb1f6944678d36c12ff05f9b5c"},"outputs":[],"source":["SERVICE_PROVIDER_NAME = \"OpenScale Headless Service Provider\"\n","SERVICE_PROVIDER_DESCRIPTION = \"Added by automated WOS configuration notebook.\"\n","SUBSCRIPTION_NAME = \"AOT Initiative - Headless Subscription\""]},{"cell_type":"markdown","metadata":{"id":"3b7c0a42ac1e43139057e5d78abfe075"},"source":["# Setup - Package installation "]},{"cell_type":"code","execution_count":null,"metadata":{"id":"dc441429a8ad4aada3fa29d81e98e5cf"},"outputs":[],"source":["!pip install --upgrade ibm-watson-machine-learning --user | tail -n 1\n","!pip install --upgrade ibm-watson-openscale --no-cache | tail -n 1\n","!pip install --upgrade \"ibm-metrics-plugin>=4.6.4.0\""]},{"cell_type":"markdown","metadata":{"id":"eb4fc86b293a4dd188bd341121b45aae"},"source":["# Imports"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"e672b2b0765448338e4c5df902054027"},"outputs":[],"source":["import pandas as pd\n","import tarfile\n","from io import BytesIO\n","\n","from ibm_watson_openscale import APIClient\n","from ibm_watson_openscale.utils import *\n","from ibm_watson_openscale.supporting_classes import *\n","from ibm_watson_openscale.supporting_classes.enums import *\n","from ibm_watson_openscale.base_classes.watson_open_scale_v2 import *\n","from ibm_cloud_sdk_core.authenticators import IAMAuthenticator,BearerTokenAuthenticator, CloudPakForDataAuthenticator\n","\n","import json\n","import requests\n","import base64\n","from requests.auth import HTTPBasicAuth\n","import time\n","\n","# disable warnings\n","import warnings\n","warnings.filterwarnings('ignore')"]},{"cell_type":"markdown","metadata":{"id":"2963bb1dc0064426839aa24a0ea45150"},"source":["# Get training data statistics"]},{"cell_type":"markdown","metadata":{"id":"636c77c1db4a48da85ed3dd8c6846a45"},"source":["### Get the training data"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["data_df = vars_and_utils.load_data_from_filesystem(vars_and_utils.raw_data_path)\n","data_df.head()"]},{"cell_type":"markdown","metadata":{"id":"b5afc1d931da4c6495a10f497b5c4d52"},"source":["### Generate the training data stats"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"49a11fd0e1734d6789c54ebb58977f43"},"outputs":[],"source":["from ibm_watson_openscale.utils.training_stats import TrainingStats"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"e99bf24d2ea34f848ce12ea738cc4210"},"outputs":[],"source":["feature_columns = data_df.drop(\"Risk\", axis=1).columns.tolist()\n","cat_features = [x for x in feature_columns if data_df[x].dtype == 'object']\n","class_label = \"Risk\"\n","prediction_column = \"prediction\"\n","probability_column = \"probability\""]},{"cell_type":"code","execution_count":null,"metadata":{"id":"216ed095143047dab3c4332c97b5d31b"},"outputs":[],"source":["service_configuration_support = {\n"," \"enable_fairness\": True,\n"," \"enable_explainability\": True,\n"," \"enable_drift\": True\n","}\n","\n","fairness_attributes = [{\n"," \"feature\": \"Sex\", \n"," \"majority\": [\n"," \"male\"\n"," ],\n"," \"minority\": [\n"," \"female\"\n"," ],\n"," \"threshold\": 0.8\n","}]\n","\n","model_type = \"binary\"\n","parameters = {\n"," \"favourable_class\" : [ \"No Risk\" ],\n"," \"unfavourable_class\": [ \"Risk\" ]\n","}\n","min_records = 100\n","\n","# Generate Training stats\n","enable_explainability = 
service_configuration_support.get('enable_explainability')\n","enable_fairness = service_configuration_support.get('enable_fairness')\n","training_data_stats = None\n","fairness_inputs = None\n","if enable_explainability or enable_fairness:\n"," if enable_fairness:\n"," fairness_inputs = {\n"," \"fairness_attributes\": fairness_attributes,\n"," \"min_records\" : min_records,\n"," \"favourable_class\" : parameters[\"favourable_class\"],\n"," \"unfavourable_class\": parameters[\"unfavourable_class\"]\n"," }\n"," \n","input_parameters = {\n"," \"label_column\": class_label,\n"," \"feature_columns\": feature_columns,\n"," \"categorical_columns\": cat_features,\n"," \"fairness_inputs\": fairness_inputs,\n"," \"problem_type\" : \"binary\",\n"," \"prediction\": prediction_column,\n"," \"probability\": probability_column\n","}"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"b0ddac7d43784eff8f51faac32315fb7"},"outputs":[],"source":["# Note: pass the fairness flag (not the explainability flag) to the fairness argument\n","training_stats = TrainingStats(data_df, input_parameters, explain=enable_explainability, fairness=enable_fairness, drop_na=True)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"69f30aa6275a45c1aabee80b8bc91ba2"},"outputs":[],"source":["training_data_stats = training_stats.get_training_statistics()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ba14fdd0d5fc42148ffa0c70318e0bc0"},"outputs":[],"source":["training_data_stats[\"notebook_version\"] = 5.0\n","print(training_data_stats)"]},{"cell_type":"markdown","metadata":{"id":"cdc330c2-ce03-4665-a5d3-bbb00ee9e2c5"},"source":["### This JSON contains the training statistics"]},{"cell_type":"markdown","metadata":{"id":"9b0707ac1440466492c933e7069aa015"},"source":["# Configure OpenScale \n","\n","The notebook will now import the necessary libraries and set up a Python OpenScale client."]},{"cell_type":"markdown","metadata":{"id":"3043544c667e49d38cd70855d8ccb5ad"},"source":["## Get an instance of the OpenScale SDK client and connect to the WOS datamart\n","\n","Watson OpenScale uses a database to store payload and feedback logs and calculated metrics.
Here we are using an already configured data mart in the Cloud Pak for Data instance."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"1662b2fa228e432c8b6a4b46fd5d5b3f"},"outputs":[],"source":["authenticator = CloudPakForDataAuthenticator(\n"," url=WOS_CREDENTIALS[\"url\"],\n"," username=WOS_CREDENTIALS[\"username\"],\n"," password=WOS_CREDENTIALS[\"password\"],\n"," disable_ssl_verification=True\n"," )\n","\n","# Fallback datamart ID; it is replaced interactively below if authorization fails\n","DATAMART_ID = \"00000000-0000-0000-0000-000000000000\"\n","\n","try:\n"," wos_client = APIClient(authenticator=authenticator, service_url=WOS_CREDENTIALS[\"url\"])\n"," print(\"Authentication Successful\")\n"," data_marts = wos_client.data_marts.list().result.data_marts\n"," data_mart_id=data_marts[0].metadata.id\n"," print('Using existing datamart {}'.format(data_mart_id))\n","except:\n"," print(\"ERROR: Authorization request has been rejected with message: AIQCS0002E : Not authorized to access datamart id `00000000-0000-0000-0000-000000000000`.\")\n"," if DATAMART_ID==\"00000000-0000-0000-0000-000000000000\":\n"," DATAMART_ID=input(\"Please enter your datamart id to authenticate\")\n"," print(\"\\nTrying to authenticate with the DATAMART_ID provided..\")\n"," wos_client = APIClient(authenticator=authenticator, service_url=WOS_CREDENTIALS[\"url\"], service_instance_id=DATAMART_ID)\n"," print(\"Authentication Successful.\")"]},{"cell_type":"markdown","metadata":{"id":"b1ecf806072240cd89af4324cb844744"},"source":["## Remove existing service provider\n","\n","Multiple service providers for the same engine instance are available in Watson OpenScale. To avoid duplicate service providers for the WML instance used in this notebook, the following code deletes any existing service provider(s) and then adds a new one."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ebd41ed59d0a44b3837152bdd67e10fc"},"outputs":[],"source":["service_providers = wos_client.service_providers.list().result.service_providers\n","for service_provider in service_providers:\n"," service_instance_name = service_provider.entity.name\n"," if service_instance_name == SERVICE_PROVIDER_NAME:\n"," service_provider_id = service_provider.metadata.id\n"," wos_client.service_providers.delete(service_provider_id)\n"," print(\"Deleted existing service_provider for WML instance: {}\".format(service_provider_id))"]},{"cell_type":"markdown","metadata":{"id":"48cf5770290744c28274669a5af0103b"},"source":["## Add service provider\n","\n","Watson OpenScale needs to be bound to the Watson Machine Learning instance to capture payload data into and out of the model.\n","\n","Note: Here the service provider is created with empty credentials, meaning no endpoint.
This demonstrates the use case where we don't need an actual endpoint serving requests."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"2d260d321cad475682aea0857c9b1054"},"outputs":[],"source":["MLCredentials = {}\n","added_service_provider_result = wos_client.service_providers.add(\n"," name=SERVICE_PROVIDER_NAME,\n"," description=SERVICE_PROVIDER_DESCRIPTION,\n"," service_type=ServiceTypes.CUSTOM_MACHINE_LEARNING,\n"," operational_space_id = \"production\",\n"," credentials=MLCredentials,\n"," background_mode=False\n"," ).result\n","service_provider_id = added_service_provider_result.metadata.id"]},{"cell_type":"markdown","metadata":{"id":"09f4993d2caa48269b9832c7e9b145d6"},"source":["## Subscriptions"]},{"cell_type":"markdown","metadata":{"id":"2e59906593c44bda97f79a9c35e728f6"},"source":["This code removes previous subscriptions to the model to refresh the monitors with the new model and new data."]},{"cell_type":"markdown","metadata":{"id":"b0d085c8a875458b834992abd3577219"},"source":["## Remove the existing subscription"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"f5ef621a07cb4e468fca6cc62d22595e"},"outputs":[],"source":["subscriptions = wos_client.subscriptions.list().result.subscriptions\n","for subscription in subscriptions:\n"," if subscription.entity.asset.name == '[asset] ' + SUBSCRIPTION_NAME:\n"," sub_model_id = subscription.metadata.id\n"," wos_client.subscriptions.delete(subscription.metadata.id)\n"," print('Deleted existing subscription for model', sub_model_id)"]},{"cell_type":"markdown","metadata":{"id":"d117477d751b4d6380d2feceac14450e"},"source":["This code creates the model subscription in OpenScale using the Python client API. Note that we need to provide the model's unique identifier and some information about the model itself."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"e2df727978ef4fbe8062c30a63944b6d"},"outputs":[],"source":["print(\"Data Mart ID: \" + data_mart_id)\n","print(\"Service Provider ID: \" + service_provider_id)\n","import uuid\n","asset_id = str(uuid.uuid4())\n","asset_name = '[asset] ' + SUBSCRIPTION_NAME\n","url = None\n","\n","asset_deployment_id = str(uuid.uuid4())\n","asset_deployment_name = asset_name"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"1de20abba6d84c6c8ba84cd3ed3bc96e"},"outputs":[],"source":["probability_columns = ['probability']\n","predicted_target_column = prediction_column\n","\n","subscription_details = wos_client.subscriptions.add(data_mart_id,\n"," service_provider_id,\n"," asset=Asset(\n"," asset_id=asset_id,\n"," name=asset_name,\n"," url=url,\n"," asset_type=AssetTypes.MODEL,\n"," input_data_type=InputDataType.STRUCTURED,\n"," problem_type=ProblemType.BINARY_CLASSIFICATION\n"," ),\n"," deployment=None, \n"," training_data_stats=training_data_stats, \n"," prediction_field = prediction_column,\n"," predicted_target_field = predicted_target_column,\n"," probability_fields = probability_columns,\n"," background_mode = False,\n"," deployment_name = asset_name\n"," ).result\n","\n","subscription_id = subscription_details.metadata.id\n","print(\"Subscription id {}\".format(subscription_id))"]},{"cell_type":"markdown","metadata":{"id":"e9f01d4d-7c2c-4fb4-ab8a-95c3034dfb9c"},"source":["### The following code fetches the data set ID against which we will perform the payload logging"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"9dee41fc8ae2407483f7d08f7ecd9663"},"outputs":[],"source":["import
time\n","\n","time.sleep(5)\n","payload_data_set_id = None\n","payload_data_set_id = wos_client.data_sets.list(type=DataSetTypes.PAYLOAD_LOGGING, \n"," target_target_id=subscription_id, \n"," target_target_type=TargetTypes.SUBSCRIPTION).result.data_sets[0].metadata.id\n","if payload_data_set_id is None:\n"," print(\"Payload data set not found. Please check subscription status.\")\n","else:\n"," print(\"Payload data set id:\", payload_data_set_id)"]},{"cell_type":"markdown","metadata":{"id":"2d75c57ab8914d70acdc3ed523675b1b"},"source":["## Push a payload record to setup the required schemas in the subscription\n","\n","This is the location where one needs to fetch the output of the batch scoring model and construct the payload as per the OpenScale Payload Logging format.\n","\n","Note : No scoring is done against the model. The PayloadRecord is constructed with the request and response from the model/deployment."]},{"cell_type":"markdown","metadata":{"id":"7ea1d04fe73a452685847cfc4f780256"},"source":["## Scoring Request Payload"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"cb53d8ad950146bda2fd9ad017112835"},"outputs":[],"source":["scoring_request = {\n"," \"fields\": [\n"," \"CheckingStatus\",\n"," \"LoanDuration\",\n"," \"CreditHistory\",\n"," \"LoanPurpose\",\n"," \"LoanAmount\",\n"," \"ExistingSavings\",\n"," \"EmploymentDuration\",\n"," \"InstallmentPercent\",\n"," \"Sex\",\n"," \"OthersOnLoan\",\n"," \"CurrentResidenceDuration\",\n"," \"OwnsProperty\",\n"," \"Age\",\n"," \"InstallmentPlans\",\n"," \"Housing\",\n"," \"ExistingCreditsCount\",\n"," \"Job\",\n"," \"Dependents\",\n"," \"Telephone\",\n"," \"ForeignWorker\",\n"," \"Risk\"\n"," ],\n"," \"values\": [\n"," [\n"," \"no_checking\",\n"," 28,\n"," \"outstanding_credit\",\n"," \"appliances\",\n"," 5990,\n"," \"500_to_1000\",\n"," \"greater_7\",\n"," 5,\n"," \"male\",\n"," \"co-applicant\",\n"," 3,\n"," \"car_other\",\n"," 55,\n"," \"none\",\n"," \"free\",\n"," 2,\n"," \"skilled\",\n"," 2,\n"," \"yes\",\n"," \"yes\",\n"," \"Risk\"\n"," ],\n"," [\n"," \"greater_200\",\n"," 22,\n"," \"all_credits_paid_back\",\n"," \"car_used\",\n"," 3376,\n"," \"less_100\",\n"," \"less_1\",\n"," 3,\n"," \"female\",\n"," \"none\",\n"," 2,\n"," \"car_other\",\n"," 32,\n"," \"none\",\n"," \"own\",\n"," 1,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\"\n"," ],\n"," [\n"," \"no_checking\",\n"," 39,\n"," \"credits_paid_to_date\",\n"," \"vacation\",\n"," 6434,\n"," \"unknown\",\n"," \"greater_7\",\n"," 5,\n"," \"male\",\n"," \"none\",\n"," 4,\n"," \"car_other\",\n"," 39,\n"," \"none\",\n"," \"own\",\n"," 2,\n"," \"skilled\",\n"," 2,\n"," \"yes\",\n"," \"yes\",\n"," \"Risk\"\n"," ],\n"," [\n"," \"0_to_200\",\n"," 20,\n"," \"credits_paid_to_date\",\n"," \"furniture\",\n"," 2442,\n"," \"less_100\",\n"," \"unemployed\",\n"," 3,\n"," \"female\",\n"," \"none\",\n"," 1,\n"," \"real_estate\",\n"," 42,\n"," \"none\",\n"," \"own\",\n"," 1,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\"\n"," ],\n"," [\n"," \"greater_200\",\n"," 4,\n"," \"all_credits_paid_back\",\n"," \"education\",\n"," 4206,\n"," \"less_100\",\n"," \"unemployed\",\n"," 1,\n"," \"female\",\n"," \"none\",\n"," 3,\n"," \"savings_insurance\",\n"," 27,\n"," \"none\",\n"," \"own\",\n"," 1,\n"," \"management_self-employed\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\"\n"," ],\n"," [\n"," \"greater_200\",\n"," 23,\n"," \"credits_paid_to_date\",\n"," \"car_used\",\n"," 2963,\n"," \"greater_1000\",\n"," \"greater_7\",\n"," 4,\n"," 
\"male\",\n"," \"none\",\n"," 4,\n"," \"car_other\",\n"," 46,\n"," \"none\",\n"," \"own\",\n"," 2,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"Risk\"\n"," ],\n"," [\n"," \"no_checking\",\n"," 31,\n"," \"prior_payments_delayed\",\n"," \"vacation\",\n"," 2673,\n"," \"500_to_1000\",\n"," \"1_to_4\",\n"," 3,\n"," \"male\",\n"," \"none\",\n"," 2,\n"," \"real_estate\",\n"," 35,\n"," \"stores\",\n"," \"rent\",\n"," 1,\n"," \"skilled\",\n"," 2,\n"," \"none\",\n"," \"yes\",\n"," \"Risk\"\n"," ],\n"," [\n"," \"no_checking\",\n"," 37,\n"," \"prior_payments_delayed\",\n"," \"other\",\n"," 6971,\n"," \"500_to_1000\",\n"," \"1_to_4\",\n"," 3,\n"," \"male\",\n"," \"none\",\n"," 3,\n"," \"savings_insurance\",\n"," 54,\n"," \"none\",\n"," \"own\",\n"," 2,\n"," \"skilled\",\n"," 1,\n"," \"yes\",\n"," \"yes\",\n"," \"Risk\"\n"," ],\n"," [\n"," \"no_checking\",\n"," 14,\n"," \"all_credits_paid_back\",\n"," \"car_new\",\n"," 1525,\n"," \"500_to_1000\",\n"," \"4_to_7\",\n"," 3,\n"," \"male\",\n"," \"none\",\n"," 4,\n"," \"real_estate\",\n"," 33,\n"," \"none\",\n"," \"own\",\n"," 1,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\"\n"," ],\n"," [\n"," \"less_0\",\n"," 10,\n"," \"prior_payments_delayed\",\n"," \"furniture\",\n"," 4037,\n"," \"less_100\",\n"," \"4_to_7\",\n"," 3,\n"," \"male\",\n"," \"none\",\n"," 3,\n"," \"savings_insurance\",\n"," 31,\n"," \"none\",\n"," \"rent\",\n"," 1,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"Risk\"\n"," ]\n"," ]\n"," }"]},{"cell_type":"markdown","metadata":{"id":"3e469b9bf6cc4baa855d4954b28b5ff7"},"source":["## Scoring Response Payload"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"12303ea22a7146b2ab77b898a3fdffe2"},"outputs":[],"source":["scoring_response = {\n"," \"predictions\": [\n"," {\n"," \"fields\": [\n"," \"prediction\",\n"," \"probability\"\n"," ],\n"," \"values\": [\n"," [\n"," \"Risk\",\n"," [\n"," 0.104642951112211,\n"," 0.895357048887789\n"," ]\n"," ],\n"," [\n"," \"No Risk\",\n"," [\n"," 0.892112895920181,\n"," 0.10788710407981907\n"," ]\n"," ],\n"," [\n"," \"Risk\",\n"," [\n"," 0.4863177905287259,\n"," 0.5136822094712741\n"," ]\n"," ],\n"," [\n"," \"No Risk\",\n"," [\n"," 0.980811537315731,\n"," 0.01918846268426898\n"," ]\n"," ],\n"," [\n"," \"No Risk\",\n"," [\n"," 0.9053052561083984,\n"," 0.09469474389160164\n"," ]\n"," ],\n"," [\n"," \"No Risk\",\n"," [\n"," 0.5315146773053994,\n"," 0.4684853226946007\n"," ]\n"," ],\n"," [\n"," \"No Risk\",\n"," [\n"," 0.7689466209701616,\n"," 0.23105337902983833\n"," ]\n"," ],\n"," [\n"," \"Risk\",\n"," [\n"," 0.41317664143643873,\n"," 0.5868233585635613\n"," ]\n"," ],\n"," [\n"," \"No Risk\",\n"," [\n"," 0.9190247585206522,\n"," 0.08097524147934775\n"," ]\n"," ],\n"," [\n"," \"No Risk\",\n"," [\n"," 0.781841942776921,\n"," 0.21815805722307902\n"," ]\n"," ]\n"," ]\n"," }\n"," ]\n","}"]},{"cell_type":"markdown","metadata":{"id":"8030376f-faad-418a-85d1-b1c8f930da2d"},"source":["### Construct the payload using the scoring_request and scoring_response and then log the records"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"edb0cf12b0164869879f0ea101f520e0"},"outputs":[],"source":["from ibm_watson_openscale.supporting_classes.payload_record import PayloadRecord\n","\n","records_list=[]\n","for x in range(10):\n"," pl_record = PayloadRecord(request=scoring_request, response=scoring_response)\n"," records_list.append(pl_record)\n","\n","wos_client.data_sets.store_records(data_set_id=payload_data_set_id, 
request_body=records_list)"]},{"cell_type":"markdown","metadata":{"id":"d0c60596-0d19-4997-836c-d0aeade843bc"},"source":["### Make sure the records reached the payload logging table inside the OpenScale DataMart."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"933b5220cd9d4c5681f54308ce266ade"},"outputs":[],"source":["import time\n","time.sleep(30)\n","pl_records_count = wos_client.data_sets.get_records_count(payload_data_set_id)\n","print(\"Number of records in the payload logging table: {}\".format(pl_records_count))\n","if pl_records_count == 0:\n"," raise Exception(\"Payload logging did not happen!\")"]},{"cell_type":"markdown","metadata":{"id":"c9154f4c69ad4c4db3c14d8f0bc6236c"},"source":["# Explainability Monitor Configuration\n","From the notebook, perform offline scoring against the customer model, create an explain archive, and save this archive to the data mart.\n","\n","* Only local explanations and LIME global explanations are supported.\n","* Because contrastive explanations require live scoring and this is a headless subscription without any deployment URL, contrastive explanations are not supported."]},{"cell_type":"markdown","metadata":{"id":"aac1dc85b142421290d5096d9646982b"},"source":["## Score the perturbations\n","\n","Here, this notebook uses a credit risk model deployment in WML. This can be replaced with the scoring engine of your choice, as long as the scoring response is in a format that OpenScale understands for monitor processing."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"a8cfbd042be041cd8f807ada9a55c477"},"outputs":[],"source":["import json\n","from ibm_watson_machine_learning import APIClient\n","\n","wml_client = APIClient(WML_CREDENTIALS)\n","wml_client.set.default_space(space_uid) # connect to deployment space"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"7f491c0a56df457c819324e6419cf876"},"outputs":[],"source":["perturbs_df = data_df.copy()\n","cols_to_remove = [\"Risk\"]"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"3da3e7a8d7544b84822132a418b683ea"},"outputs":[],"source":["def get_scoring_payload(no_of_records_to_score = 1):\n"," # note: no_of_records_to_score is currently unused; all rows are scored\n"," for col in cols_to_remove:\n"," if col in perturbs_df.columns:\n"," del perturbs_df[col] \n","\n"," fields = perturbs_df.columns.tolist()\n"," training_data_rows = perturbs_df[fields].values.tolist()\n","\n"," payload_scoring = {\"input_data\": [{\n"," \"fields\": fields, \n"," \"values\": [x for x in training_data_rows]\n"," }]} \n"," return payload_scoring\n","\n","def sample_scoring(no_of_records_to_score = 1):\n"," job_payload_ref = get_scoring_payload(no_of_records_to_score)\n"," score = wml_client.deployments.score(deployment_uid, meta_props=job_payload_ref)\n"," # return the live scoring response, not the static example payload\n"," return job_payload_ref, score\n","\n","payload_scoring, scoring_response = sample_scoring(no_of_records_to_score = 5000)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"483e8d7fd6fe46739ba5e280f43d43e2"},"outputs":[],"source":["fields = scoring_response['predictions'][0]['fields']\n","values = scoring_response['predictions'][0]['values']\n","scored_data = pd.DataFrame(values, columns = fields)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"7574a842cdc34331b30fd74475b7c072"},"outputs":[],"source":["probabilities = [pro for pro in scored_data['probability']]\n","predictions = [pre for pre in scored_data['prediction']]\n","\n","explain_perturb_payload = {'probabilities' : probabilities,\n"," 'predictions' :
predictions}"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"f1ef24fb6ced43549e83dcacb890eb0c"},"outputs":[],"source":["with open('explain_scoring_response.json', 'w') as outfile:\n"," json.dump(explain_perturb_payload, outfile)\n"," \n","file_name = 'explain_scoring_response.tar.gz'\n","\n","with tarfile.open(file_name, 'w:gz') as archive:\n"," archive.add('explain_scoring_response.json')\n","\n","with open(file_name, 'rb') as fh:\n"," buf = BytesIO(fh.read())\n","buf = open(file_name, mode=\"rb\").read() "]},{"cell_type":"code","execution_count":null,"metadata":{"id":"7b9f223d5ad849f380729e8b058d7f6c"},"outputs":[],"source":["with open(\"explain_scoring_response.tar.gz\", mode=\"rb\") as perturbations_tar:\n"," wos_client.monitor_instances.upload_explainability_archive(subscription_id=subscription_id, archive=perturbations_tar)\n","\n","print(\"Uploaded perturbations scoring response archive successfully.\")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"6d3660b4a9a54fb8875c36009694d50a"},"outputs":[],"source":["target = Target(\n"," target_type=TargetTypes.SUBSCRIPTION,\n"," target_id=subscription_id\n",")\n","\n","parameters = {\n"," \"enabled\": True\n","}\n","\n","print(\"Creating monitor instances...\")\n","response = wos_client.monitor_instances.create(monitor_definition_id = None, \n"," target = None, data_mart_id = data_mart_id, training_data_stats=training_data_stats, \n"," subscription_id=subscription_id,background_mode=False, parameters = parameters)\n","print(response)"]},{"cell_type":"markdown","metadata":{"id":"864da785a62b4bb782651e7a89b04837"},"source":["# Quality monitoring and feedback logging"]},{"cell_type":"markdown","metadata":{"id":"2cfa62a024bd4951827899957267c0ae"},"source":["## Enable quality monitoring\n","\n","The code below waits ten seconds to allow the payload logging table to be set up before it begins enabling monitors. First, it turns on the quality (accuracy) monitor and sets an alert threshold of 70%. OpenScale will show an alert on the dashboard if the model accuracy measurement (area under the curve, in the case of a binary classifier) falls below this threshold.\n","\n","The second paramater supplied, min_records, specifies the minimum number of feedback records OpenScale needs before it calculates a new measurement. 
The quality monitor runs hourly, but the accuracy reading in the dashboard will not change until an additional 100 feedback records have been added, via the user interface, the Python client, or the supplied feedback endpoint."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"315eff5637f3499789af5bff0e952c27"},"outputs":[],"source":["import time\n","\n","time.sleep(10)\n","target = Target(\n"," target_type=TargetTypes.SUBSCRIPTION,\n"," target_id=subscription_id\n",")\n","parameters = {\n"," \"min_feedback_data_size\": 100\n","}\n","thresholds = [\n"," {\n"," \"metric_id\": \"area_under_roc\",\n"," \"type\": \"lower_limit\",\n"," \"value\": .80\n"," }\n"," ]\n","quality_monitor_details = wos_client.monitor_instances.create(\n"," data_mart_id=data_mart_id,\n"," background_mode=False,\n"," monitor_definition_id=wos_client.monitor_definitions.MONITORS.QUALITY.ID,\n"," target=target,\n"," parameters=parameters,\n"," thresholds=thresholds\n",").result"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"4854f81d452a4e41804bf69fffd23793"},"outputs":[],"source":["quality_monitor_instance_id = quality_monitor_details.metadata.id\n","quality_monitor_instance_id"]},{"cell_type":"markdown","metadata":{"id":"37088b2299714a2d8a7ff123dca6f67b"},"source":["## Get feedback logging dataset ID"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"eb0240054545408e886dc81e11073107"},"outputs":[],"source":["feedback_dataset_id = None\n","feedback_dataset = wos_client.data_sets.list(type=DataSetTypes.FEEDBACK, \n"," target_target_id=subscription_id, \n"," target_target_type=TargetTypes.SUBSCRIPTION).result\n","feedback_dataset_id = feedback_dataset.data_sets[0].metadata.id\n","if feedback_dataset_id is None:\n"," print(\"Feedback data set not found.
Please check quality monitor status.\")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"d3440261ffba406589148380ed0296a9"},"outputs":[],"source":["feedback_dataset_id"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"37812e16eee44208819af7dec9d14986"},"outputs":[],"source":["feedback_payload = {\n"," \"fields\": [\n"," \"CheckingStatus\",\n"," \"LoanDuration\",\n"," \"CreditHistory\",\n"," \"LoanPurpose\",\n"," \"LoanAmount\",\n"," \"ExistingSavings\",\n"," \"EmploymentDuration\",\n"," \"InstallmentPercent\",\n"," \"Sex\",\n"," \"OthersOnLoan\",\n"," \"CurrentResidenceDuration\",\n"," \"OwnsProperty\",\n"," \"Age\",\n"," \"InstallmentPlans\",\n"," \"Housing\",\n"," \"ExistingCreditsCount\",\n"," \"Job\",\n"," \"Dependents\",\n"," \"Telephone\",\n"," \"ForeignWorker\",\n"," \"Risk\",\n"," \"_original_probability\",\n"," \"_original_prediction\",\n"," \"_debiased_probability\",\n"," \"_debiased_prediction\" \n"," ],\n"," \"values\": [\n"," [\n"," \"less_0\",\n"," 18,\n"," \"credits_paid_to_date\",\n"," \"car_new\",\n"," 462,\n"," \"less_100\",\n"," \"1_to_4\",\n"," 2,\n"," \"female\",\n"," \"none\",\n"," 2,\n"," \"savings_insurance\",\n"," 37,\n"," \"stores\",\n"," \"own\",\n"," 2,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\",\n"," [\n"," 0.767955712021837,\n"," 0.23204428797816307\n"," ],\n"," \"Risk\",\n"," [\n"," 0.767955712021837,\n"," 0.23204428797816307\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"less_0\",\n"," 15,\n"," \"prior_payments_delayed\",\n"," \"furniture\",\n"," 250,\n"," \"less_100\",\n"," \"1_to_4\",\n"," 2,\n"," \"male\",\n"," \"none\",\n"," 3,\n"," \"real_estate\",\n"," 28,\n"," \"none\",\n"," \"own\",\n"," 2,\n"," \"skilled\",\n"," 1,\n"," \"yes\",\n"," \"no\",\n"," \"No Risk\",\n"," [\n"," 0.7419002139563244,\n"," 0.25809978604367556\n"," ],\n"," \"Risk\",\n"," [\n"," 0.767955712021837,\n"," 0.23204428797816307\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"0_to_200\",\n"," 28,\n"," \"credits_paid_to_date\",\n"," \"retraining\",\n"," 3693,\n"," \"less_100\",\n"," \"greater_7\",\n"," 3,\n"," \"male\",\n"," \"none\",\n"," 2,\n"," \"savings_insurance\",\n"," 32,\n"," \"none\",\n"," \"own\",\n"," 1,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\",\n"," [\n"," 0.6935080115729353,\n"," 0.3064919884270647\n"," ],\n"," \"Risk\",\n"," [\n"," 0.8,\n"," 0.2\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"no_checking\",\n"," 28,\n"," \"prior_payments_delayed\",\n"," \"education\",\n"," 6235,\n"," \"500_to_1000\",\n"," \"greater_7\",\n"," 3,\n"," \"male\",\n"," \"none\",\n"," 3,\n"," \"unknown\",\n"," 57,\n"," \"none\",\n"," \"own\",\n"," 2,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"Risk\",\n"," [\n"," 0.331110352092386,\n"," 0.668889647907614\n"," ],\n"," \"Risk\",\n"," [\n"," 0.9,\n"," 0.1\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"no_checking\",\n"," 32,\n"," \"outstanding_credit\",\n"," \"vacation\",\n"," 9604,\n"," \"500_to_1000\",\n"," \"greater_7\",\n"," 6,\n"," \"male\",\n"," \"co-applicant\",\n"," 5,\n"," \"unknown\",\n"," 57,\n"," \"none\",\n"," \"free\",\n"," 2,\n"," \"skilled\",\n"," 2,\n"," \"yes\",\n"," \"yes\",\n"," \"Risk\",\n"," [\n"," 0.11270206970758759,\n"," 0.8872979302924124\n"," ],\n"," \"Risk\",\n"," [\n"," 0.1,\n"," 0.9\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"no_checking\",\n"," 9,\n"," \"prior_payments_delayed\",\n"," \"car_new\",\n"," 1032,\n"," \"100_to_500\",\n"," \"4_to_7\",\n"," 3,\n"," \"male\",\n"," \"none\",\n"," 4,\n"," \"savings_insurance\",\n"," 
41,\n"," \"none\",\n"," \"own\",\n"," 1,\n"," \"management_self-employed\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\",\n"," [\n"," 0.6704819620865308,\n"," 0.32951803791346923\n"," ],\n"," \"Risk\",\n"," [\n"," 0.767955712021837,\n"," 0.23204428797816307\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"less_0\",\n"," 16,\n"," \"credits_paid_to_date\",\n"," \"vacation\",\n"," 3109,\n"," \"less_100\",\n"," \"4_to_7\",\n"," 3,\n"," \"female\",\n"," \"none\",\n"," 1,\n"," \"car_other\",\n"," 36,\n"," \"none\",\n"," \"own\",\n"," 2,\n"," \"skilled\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\",\n"," [\n"," 0.6735810290914039,\n"," 0.3264189709085961\n"," ],\n"," \"Risk\",\n"," [\n"," 0.6,\n"," 0.4\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"0_to_200\",\n"," 11,\n"," \"credits_paid_to_date\",\n"," \"car_new\",\n"," 4553,\n"," \"less_100\",\n"," \"less_1\",\n"," 3,\n"," \"female\",\n"," \"none\",\n"," 3,\n"," \"savings_insurance\",\n"," 22,\n"," \"none\",\n"," \"own\",\n"," 1,\n"," \"management_self-employed\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\",\n"," [\n"," 0.637964656269084,\n"," 0.362035343730916\n"," ],\n"," \"Risk\",\n"," [\n"," 0.767955712021837,\n"," 0.23204428797816307\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"no_checking\",\n"," 35,\n"," \"outstanding_credit\",\n"," \"appliances\",\n"," 7138,\n"," \"500_to_1000\",\n"," \"greater_7\",\n"," 5,\n"," \"male\",\n"," \"co-applicant\",\n"," 4,\n"," \"unknown\",\n"," 49,\n"," \"none\",\n"," \"free\",\n"," 2,\n"," \"skilled\",\n"," 2,\n"," \"yes\",\n"," \"yes\",\n"," \"Risk\",\n"," [\n"," 0.11270206970758759,\n"," 0.8872979302924124\n"," ],\n"," \"Risk\",\n"," [\n"," 0.767955712021837,\n"," 0.23204428797816307\n"," ],\n"," \"Risk\"\n"," ],\n"," [\n"," \"less_0\",\n"," 5,\n"," \"all_credits_paid_back\",\n"," \"car_new\",\n"," 1523,\n"," \"less_100\",\n"," \"unemployed\",\n"," 2,\n"," \"female\",\n"," \"none\",\n"," 2,\n"," \"real_estate\",\n"," 19,\n"," \"none\",\n"," \"rent\",\n"," 1,\n"," \"management_self-employed\",\n"," 1,\n"," \"none\",\n"," \"yes\",\n"," \"No Risk\",\n"," [\n"," 0.7304597628653227,\n"," 0.26954023713467745\n"," ],\n"," \"Risk\",\n"," [\n"," 0.767955712021837,\n"," 0.23204428797816307\n"," ],\n"," \"Risk\"\n"," ]\n"," ]\n","}"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"64005c8a27d64081891606d683a3ed41"},"outputs":[],"source":["import urllib3, requests, json\n","from requests.auth import HTTPBasicAuth\n","def generate_access_token():\n"," headers={}\n"," headers[\"Accept\"] = \"application/json\"\n"," auth = HTTPBasicAuth(WOS_CREDENTIALS[\"username\"], WOS_CREDENTIALS[\"password\"])\n"," \n"," ICP_TOKEN_URL= WOS_CREDENTIALS[\"url\"] + \"/v1/preauth/validateAuth\"\n"," \n"," response = requests.get(ICP_TOKEN_URL, headers=headers, auth=auth, verify=False)\n"," json_data = response.json()\n"," icp_access_token = json_data['accessToken']\n"," return icp_access_token\n","icp_access_token = generate_access_token()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"f12e73b3c73345fba2ba5c486f7c4bb4"},"outputs":[],"source":["header = {\n"," 'Content-Type': 'application/json', \n"," 'Authorization': 'Bearer ' + icp_access_token\n","}"]},{"cell_type":"markdown","metadata":{"id":"8e2cf6e368ee414d8ffe8f178bbf4294"},"source":["### Store the feedback payload using the data sets API\n","\n","There are two ways OpenScale APIs can be used - a) using OpenScale Python SDK b) using OpenScale REST APIs.\n","\n","For any reason if in the customer environment one cannot use the 
SDK, the alternative is to use the REST APIs. The cell below demonstrates how to invoke one such OpenScale REST API to log the feedback records to the OpenScale DataMart."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"f2a376a7df1642c189f1920445012f6a"},"outputs":[],"source":["DATASETS_STORE_RECORDS_URL = WOS_CREDENTIALS[\"url\"] + \"/openscale/{0}/v2/data_sets/{1}/records\".format(data_mart_id, feedback_dataset_id)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"19d83529ce1d44a59d6ed709ccdd4e42"},"outputs":[],"source":["for x in range(10):\n"," response = requests.post(DATASETS_STORE_RECORDS_URL, json=feedback_payload, headers=header, verify=False)\n"," json_data = response.json()\n"," print(json_data)"]},{"cell_type":"markdown","metadata":{"id":"373ef04bff5b4772833b6b1120fd2314"},"source":["### Wait for some time, and make sure the records have reached the corresponding data set table."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"88c162434aa24e018a131ca10c188d5c"},"outputs":[],"source":["time.sleep(30)\n","DATASETS_STORE_RECORDS_URL = WOS_CREDENTIALS[\"url\"] + \"/openscale/{0}/v2/data_sets/{1}/records?limit={2}&include_total_count={3}\".format(data_mart_id, feedback_dataset_id, 1, \"true\")\n","response = requests.get(DATASETS_STORE_RECORDS_URL, headers=header, verify=False)\n","json_data = response.json()\n","print(json_data['total_count'])"]},{"cell_type":"markdown","metadata":{"id":"96bdfc3a376d44a585432889b033f060"},"source":["## Run Quality Monitor"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"adfd17e52f3e4c4f88c4fe8016c4d6b0"},"outputs":[],"source":["run_details = wos_client.monitor_instances.run(monitor_instance_id=quality_monitor_instance_id, background_mode=False).result"]},{"cell_type":"markdown","metadata":{"id":"37c0967b10a1460c817d7eebe273d549"},"source":["# Drift Configuration"]},{"cell_type":"markdown","metadata":{"id":"bde7dd5dd58c46049d38d0943bdad90d"},"source":["# Scoring function for drift configuration"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"2767b93cd00b4af58e9bd5b8492f0605"},"outputs":[],"source":["def score(training_data_frame):\n"," \n"," # The data type of the label column and the prediction column should be the same.\n"," # The label column and the prediction column must have the same unique class labels.\n"," \n"," feature_columns = list(training_data_frame.columns)\n"," if class_label in feature_columns:\n"," feature_columns.remove(class_label)\n"," training_data_rows = training_data_frame[feature_columns].values.tolist()\n"," \n"," payload_scoring = {\n"," wml_client.deployments.ScoringMetaNames.INPUT_DATA: [{\n"," \"fields\": feature_columns,\n"," \"values\": [x for x in training_data_rows]\n"," }]\n"," }\n","\n"," score = wml_client.deployments.score(deployment_uid, payload_scoring)\n"," score_predictions = score.get('predictions')[0]\n","\n"," prob_col_index = list(score_predictions.get('fields')).index(probability_column)\n"," predict_col_index = list(score_predictions.get('fields')).index(prediction_column)\n","\n"," if prob_col_index < 0 or predict_col_index < 0:\n"," raise Exception(\"Missing prediction/probability column in the scoring response\")\n","\n"," import numpy as np\n"," probability_array = np.array([value[prob_col_index] for value in score_predictions.get('values')])\n"," prediction_vector = np.array([value[predict_col_index] for value in score_predictions.get('values')])\n","\n"," return probability_array,
prediction_vector"]},{"cell_type":"markdown","metadata":{"id":"033d1ee83da3498ab2b1981ea9d3f144"},"source":["## Create the drift detection model archive"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"226adfb1fd1c4e628582a894746bf9be"},"outputs":[],"source":["probability_array, prediction_vector = score(data_df)"]},{"cell_type":"markdown","metadata":{"id":"cb07a0651c994d7f9c51032f7eaa4b11"},"source":["# Payload and Feedback dataset id for pushing model information into the datamart"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"89e9a902168843109e9f84f68a3fb216"},"outputs":[],"source":["print(\"Payload data set id:\", payload_data_set_id)\n","print(\"Feedback data set id:\", feedback_dataset_id)"]},{"cell_type":"markdown","metadata":{"id":"48c3a513d1a44de782f747574f431d66"},"source":["Authors: Moritz Scheele (moritz.scheele@ibm.com) and Ravi Chamarthy (ravi.chamarthy@in.ibm.com)"]}],"metadata":{"kernelspec":{"display_name":"Python 3.10","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.10"},"vscode":{"interpreter":{"hash":"dbf53fce1b4f805dfa7248c873cd5919f94c1d61eaf85ef20024e88fdf9444a6"}}},"nbformat":4,"nbformat_minor":4} 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Alt text](images/banner.png) 2 | # README 3 | --- 4 | 5 | This repo can be used as a starter kit to set up a fully git-integrated Machine Learning Operations (MLOps) environment using Cloud Pak for Data and (in the future) watsonx. It uses a simple "credit score prediction" use case that is split up into four Jupyter notebooks as an example, which can easily be adapted to your business problem.
6 | 7 | It tries to be as simple as possible while showing the basic concepts of MLOps using IBM tools. The intended use is that, after you have set everything up and familiarized yourself with the concepts, you throw out all the "credit score prediction" code and replace it with whatever problem you are trying to solve. 8 | 9 | ![high level overview using three stages](/images/2023-09-05-11_00_27.png) 10 | 11 | *high level overview using three stages* 12 | 13 | 14 | 15 | # Setup Instructions 16 | These instructions will guide you through the setup of a simple MLOps environment that uses just two stages ("dev" and "prod"). The setup can easily be extended to more stages if needed. 17 | 18 | It is assumed that you have a "Cloud Pak for Data" instance available and that you have admin rights to it (this will not work with the cloud-based "as a Service" offering). 19 | 20 | ![Alt text](/images/detailed_overview.png) 21 | 22 | *detailed view using two stages* 23 | ## 1. Fork this repo 24 | 25 |
26 | need a detailed description? 27 | 28 | ![Alt text](/images/image-1.png) 29 | 30 | click the "Fork" button in the upper right corner of this repo. **IMPORTANT: uncheck the "only fork the master branch" checkbox.** 31 | This will create a copy of this repo in your own github account. We will be using this copy in the following steps. 32 |
33 | 34 | 35 | ## 2. Create one git-enabled project called "00-datascience-playground" 36 | 37 |
38 | need a detailed description? 39 | 40 | ### Overview 41 | ![Alt text](/images/2023-08-31-09_10_14.png) 42 | *this is the project that we are creating in this step* 43 | 44 | ### Step by step 45 | ![Alt text](/images/image-2.png) 46 | navigate to all projects 47 | ![Alt text](/images/image-4.png) 48 | create a project that is "integrated with git". In the next window we will need to provide the github repo address and a private access token. So let's create that token first. 49 | ![Alt text](/images/image-5.png) 50 | navigate to https://github.com/settings/tokens and choose "Generate new token". Give it a name and select the "repo" scope as shown in the next image. 51 | ![Alt text](/images/image-6.png) 52 | **Copy the generated token to your clipboard.** You will not be able to see it again after you close the window. 53 | ![Alt text](/images/image-7.png) 54 | Make this token available within your CP4D by creating a "New Token" and using the token you just created. Once you have created it, use the dropdown to select it. 55 | ![Alt text](/images/image-8.png) 56 | add the repo URL (don't forget the .git at the end ;-) and choose the main branch. Then hit "Create" 57 | 58 | Use the github repo address and your private access token 59 | You can alter the notebooks to your needs if you want to, but it is important that you keep the naming of the notebooks. 60 |
61 | 62 | 63 | ## 3. Create one git-enabled project called "01-staging-area" 64 | 65 |
66 | need a detailed description? 67 | 68 | ### Overview 69 | ![Alt text](/images/image-3.png) 70 | *this is the project that we are creating in this step* 71 | 72 | ### Step by step 73 | 74 | ![Alt text](/images/image-2.png) 75 | navigate to all projects 76 | ![Alt text](/images/image-4.png) 77 | In your CP4D Instance you access the project overview by clicking on the "Projects" Icon in the upper left corner. Then click on "New Project" and select "Create a project integrated with a Git repository". Give it the name "01-staging-area" and select "create" 78 | 79 | Use the same github repo address and your private access token as in 2 80 | 81 |
82 | 83 | ## 4. Configure custom environment in "01-staging-area" 84 |
85 | need a detailed description? 86 | 87 | TODO: Add description here! (use custom_env.yaml) 88 | 89 |
90 | 91 | ## 5. Configure Jobs in "01-staging-area" 92 |
93 | need a detailed description? 94 | 95 | ### Overview 96 | ![Alt text](/images/image-4.png) 97 | *these are the jobs that we are creating in this step* 98 | 99 | ### Step by step 100 | ![Alt text](/images/image-9.png) 101 | navigate to "view local branch" 102 | 103 | ![Alt text](/images/image-11.png) 104 | click "New code job" 105 | 106 | ![Alt text](/images/image-12.png) 107 | choose the first notebook "00-git-pull.ipynb" and click "configure job" 108 | 109 | ![Alt text](/images/image-13.png) 110 | give it the same name as the notebook and click "next" 111 | TODO: choose correct environment for every job 112 | accept all the defaults and click "next" until you can click "create job" 113 | TODO: add the "was_successful" output to every job 114 | repeat those steps for all six notebooks. 115 | 116 | ![Alt text](/images/image-14.png) 117 | once you are done it should look like this. 118 | 119 | 120 | We also need to create a .env file within the "01-staging-area" project. This file will contain the credentials that the pipeline will use to pull the code from github. 121 | 122 | ![Alt text](/images/image-100.png) 123 | 124 | Click "Launch IDE" and then "JupyterLab" to get access to the JupyterLab environment. 125 | 126 | ![Alt text](/images/image-103.png) 127 | 128 | You will be greeted by a tab called "Terminal 1". There, copy the following commands and hit Enter: 129 | 130 | ```bash 131 | 132 | echo "repo_adresse=PUT_YOUR_REPO_ADDRESS_HERE" > .env 133 | echo "personal_access_token=PUT_YOUR_TOKEN_HERE" >> .env 134 | echo "project_id=PUT_YOUR_PROJECT_ID_HERE" >> .env 135 | echo "branch_name=main" >> .env 136 | echo "cpd_technical_user=PUT_USERNAME_HERE" >> .env 137 | echo "cpd_technical_user_password=PUT_PASSWORD_HERE" >> .env 138 | echo "cpd_url=PUT_URL_HERE" >> .env 139 | 140 | ``` 141 | 142 | *cpd_technical_user* is a user that was created only to be used as a proxy in those scripts. If this is not available, you can also use a *personal* user (i.e. the credentials you use to log in), even though this is not best practice 143 | 144 | ![Alt text](/images/image-102.png) 145 | 146 | You can check if everything worked by typing 147 | 148 | ```bash 149 | cat .env 150 | ``` 151 | If that command displays the content of the .env file you are good to go. 152 | 153 |
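If you want to double-check the file from a notebook as well, the following minimal sketch (assuming the key names used above and that `python-dotenv` is installed, as in the notebooks) prints any keys that are still missing:

```python
import os
from dotenv import load_dotenv

# Sketch: sanity-check the .env file created above.
# The key names are the ones used in the echo commands; adjust if yours differ.
load_dotenv()

required_keys = [
    "repo_adresse", "personal_access_token", "project_id", "branch_name",
    "cpd_technical_user", "cpd_technical_user_password", "cpd_url",
]
missing = [key for key in required_keys if not os.getenv(key)]
print("missing keys:", missing if missing else "none - .env looks complete")
```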
154 | 155 | ## 6. Create a NON-git-enabled project called "02-automation-area" 156 | 157 |
158 | need a detailed description? 159 | 160 | ### Overview 161 | ![Alt text](/images/image-5.png) 162 | *this is the project that we are creating in this step* 163 | 164 | ### Step by step 165 | 166 | ![Alt text](/images/image-3.png) 167 | repeat the same steps as in 2 and 3 but choose "create an empty project" to create a NON-git-enabled project. Name it "02-automation-area" 168 | 169 | 170 |
171 | 
172 | 
173 | 
174 | ## 7. Configure pipeline in "02-automation-area"
175 | 
176 | need a detailed description?
177 | 
178 | ### Overview
179 | ![Alt text](/images/image-6.png)
180 | *those are the pieces we are creating in this step*
181 | 
182 | ### Step by step
183 | TODO: add global parameters
184 | 
185 | ![Alt text](/images/image-16.png)
186 | Click "New Asset" and choose "Pipeline". Name the pipeline "mlops_pipeline".
187 | 
188 | ![Alt text](/images/image-18.png)
189 | Go to "Run" > "Run Notebook Job" and drag it onto the canvas. Then double-click this newly created node and click "Select Job".
190 | 
191 | ![Alt text](/images/image-19.png)
192 | Choose "01-staging-area", select the first notebook "00-git-pull.ipynb", then click "Choose" and "Save".
193 | 
194 | TODO: choose environment
195 | TODO: add pipeline params
196 | 
197 | ![Alt text](/images/image-20.png)
198 | Repeat those steps for all notebooks until you end up with something that looks like this.
199 | 
200 | ![Alt text](/images/image-29.png)
201 | Click "Run Pipeline" and then "Create job". Give it a name like "mlops_pipeline_job". **IMPORTANT: The GitHub Action assumes that you only have ONE job in this project. If you have more than one job you will need to change the GitHub Action accordingly.**
202 | 
203 | 
204 | 
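For the "was_successful" outputs mentioned in step 5, each notebook can report a result back to Watson Pipelines so the pipeline can branch or fail on it. A minimal sketch, assuming the ibm-watson-studio-pipelines client pinned in custom_env.yaml (the output key must match the name configured on the job):

```python
import os
from ibm_watson_studio_pipelines import WSPipelines

# Authenticate with the runtime token that Watson Pipelines injects
# and publish the "was_successful" output for downstream nodes.
client = WSPipelines.from_token(os.environ["USER_ACCESS_TOKEN"])
client.store_results({"was_successful": True})
```
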
205 | 
206 | ## 8. Set up GitHub Actions
207 | 
208 | need a detailed description?
209 | 
210 | ### Overview
211 | ![Alt text](/images/image-7.png)
212 | *this is the piece that we are creating in this step*
213 | 
214 | ### Step by step
215 | 
216 | We need a set of secrets to be able to run the GitHub Actions. Those secrets are:
217 | 
218 | - **API_KEY**
219 | - **USER_NAME**
220 | - **CLUSTER_URL**
221 | - **PROJECT_ID**
222 | - **PERSONAL_ACCESS_TOKEN_GITHUB**
223 | 
224 | 
225 | We will now go through them step by step:
226 | 
227 | ![Alt text](/images/image-21.png)
228 | Navigate to your fork of the GitHub repo, then "Settings" > "Secrets and variables" > "Actions" > "New repository secret".
229 | 
230 | ### 8.1. Retrieving your CP4D **API_KEY** and **USER_NAME**
231 | 
232 | ![Alt text](/images/image-22.png)
233 | Go to the "Profile and settings" tab in your CP4D instance.
234 | 
235 | ![Alt text](/images/image-23.png)
236 | Copy the API key to your clipboard (and write it down somewhere; you will not be able to see it again after you close the window).
237 | 
238 | ![Alt text](/images/image-24.png)
239 | Go back to GitHub and create a new repository secret called "API_KEY".
240 | 
241 | ![Alt text](/images/image-27.png)
242 | 
243 | Also create the repository secret USER_NAME using the username that you use to log in to your CP4D instance.
244 | 
245 | 
246 | ### 8.2. Retrieving your CP4D **CLUSTER_URL**
247 | 
248 | This one is simple :-)
249 | ![Alt text](/images/image-25.png)
250 | 
251 | Just take the URL of the cluster that you have been working on
252 | 
253 | ![Alt text](/images/image-26.png)
254 | 
255 | and use it to create a secret called "CLUSTER_URL".
256 | 
257 | ### 8.3. Retrieving your CP4D **PROJECT_ID**
258 | 
259 | ![Alt text](/images/image-28.png)
260 | Open the "02-automation-area" project and copy its project ID (it appears on the project's "Manage" tab and in the project URL; the exact location may vary by CP4D version), then create a repository secret called "PROJECT_ID" with this value.
261 | ### 8.4. Retrieving your GitHub **PERSONAL_ACCESS_TOKEN_GITHUB**
262 | 
263 | You can use the same token you used in step 2. If you don't have it anymore, create a new one by following the steps in step 2.
264 | 
265 | The workflow consumes these secrets as sketched below.
266 | 
267 | 
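For orientation, a hypothetical excerpt of how a workflow consumes these secrets - the actual workflow in this repo may differ, and the step name and script are made up:

```yaml
jobs:
  trigger-cp4d-pipeline:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      # Expose the repository secrets to the step that talks to CP4D.
      - name: Trigger the mlops_pipeline job
        env:
          API_KEY: ${{ secrets.API_KEY }}
          USER_NAME: ${{ secrets.USER_NAME }}
          CLUSTER_URL: ${{ secrets.CLUSTER_URL }}
          PROJECT_ID: ${{ secrets.PROJECT_ID }}
        run: python trigger_pipeline.py  # hypothetical helper script
```
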
268 | 
269 | 
270 | ## 9. Create deployment space
271 | 
272 | need a detailed description?
273 | 
274 | TODO: describe how to create the deployment space in detail; a sketch follows below
275 | 
276 | 
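Until this is documented: in the UI, go to "Deployments" > "New deployment space", create the space, and copy its ID into `deployment_space_id_DEV` / `deployment_space_id_PROD` in vars_and_utils.py. Alternatively, a minimal programmatic sketch using the token-based WML credentials shown in TODO.md (the space name "mlops-space" is a placeholder):

```python
import os
from ibm_watson_studio_lib import access_project_or_space
from ibm_watson_machine_learning import APIClient

# Token-based credentials, as in the "Work w/ WML on CP4DS" snippet in TODO.md.
wslib = access_project_or_space()
wml_credentials = {
    "url": os.environ["RUNTIME_ENV_APSX_URL"],  # cluster URL inside CP4D runtimes
    "token": wslib.auth.get_current_token(),
    "instance_id": "wml_local",
    "version": "4.6",
}
client = APIClient(wml_credentials)

space = client.spaces.store({client.spaces.ConfigurationMetaNames.NAME: "mlops-space"})
print(client.spaces.get_id(space))  # paste this ID into vars_and_utils.py
```
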
277 | 
278 | ## 10. Set up monitoring using OpenScale
279 | 
280 | need a detailed description?
281 | 
282 | TODO: describe how to set up OpenScale; a connectivity sketch follows below
283 | 
284 | 
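Until this is documented, a minimal connectivity sketch, assuming the ibm-watson-openscale Python SDK and an OpenScale instance on the same cluster (both are assumptions; neither is pinned in custom_env.yaml):

```python
import os
from ibm_cloud_sdk_core.authenticators import BearerTokenAuthenticator
from ibm_watson_openscale import APIClient as WOSClient

# Authenticate against the cluster with the runtime bearer token.
authenticator = BearerTokenAuthenticator(bearer_token=os.environ["USER_ACCESS_TOKEN"])
wos_client = WOSClient(authenticator=authenticator,
                       service_url=os.environ["RUNTIME_ENV_APSX_URL"])

wos_client.data_marts.show()  # verify connectivity before configuring monitors
```
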
285 | 
286 | ## 11. Try it out :-)
287 | 
288 | 
289 | 
290 | ## 12. Future Work and known issues
291 | 
292 | need a detailed description?
293 | 
294 | - Future Work:
295 |     - [ ] Put AI Factsheets back into the "03-train_models" notebook
296 |     - [ ] Figure out what is wrong with the deployments and fix it
297 |     - [ ] Figure out what is wrong with monitoring (probably an issue with the cluster we use)
298 |     - [ ] Finish documentation of 9. Create deployment space and 10. Set up monitoring using OpenScale
299 |     - [ ] Delete all projects and set everything up again according to the documentation to find what is missing (~ one day of work)
300 |     - [ ] Describe how good user management can work (e.g. normal users can only see the "01_data_science_playground" project)
301 |     - [ ] Integrate Model Inventory / model versioning
302 | 
303 | 
304 | - Known Issues
305 |     - 
306 | 
308 | 309 | 310 | 311 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | **Mission**
2 | Convert APAC MLOps Accelerator from CP4DaaS to CP4DS
3 | 
4 | ### Info
5 | ---
6 | - Awaiting tensorflow-data-validation update for compatibility w/ py3.10
7 | 
8 | 
9 | ### Todo
10 | ---
11 | - [x] Cut out all use of Cloud Object Storage ✅ 2023-10-04
12 | - [x] Reduce use of /utils to a minimum ✅ 2023-10-04
13 | - [x] Remove use of *seaborn/sns* ✅ 2023-10-04
14 | - [x] **Use WML .from_token instead of api_key** ✅ 2023-10-04
15 | - [ ] 
16 | - [ ] 
17 | 
18 | 
19 | ### Save for later
20 | ---
21 | 
22 | #### Work w/ WML on CP4DS
23 | ```python
24 | CPD_URL = ''
25 | from ibm_watson_studio_lib import access_project_or_space
26 | wslib = access_project_or_space()
27 | wml_credentials = {
28 |     "url": CPD_URL,
29 |     "token": wslib.auth.get_current_token(),
30 |     "instance_id": "wml_local",
31 |     "version": "4.6"
32 | }
33 | from ibm_watson_machine_learning import APIClient
34 | client = APIClient(wml_credentials)
35 | client.spaces.list()
36 | ```
37 | 
38 | #### Use Pipelines on CP4DS
39 | 
40 | ```python
41 | import os
42 | token = os.environ['USER_ACCESS_TOKEN']
43 | ```
44 | 
45 | [WSPipelines On-Prem Tutorial](https://github.ibm.com/Lucas-Baier/ws-pipelines-guide) -------------------------------------------------------------------------------- /assets/.METADATA/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/assets/.METADATA/.gitkeep -------------------------------------------------------------------------------- /assets/.METADATA/.version.json: -------------------------------------------------------------------------------- 1 | {"version":"\"2.0.0\"","source":"local"} -------------------------------------------------------------------------------- /assettypes/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/assettypes/.gitkeep -------------------------------------------------------------------------------- /custom_env.yaml: --------------------------------------------------------------------------------
1 | channels:
2 |   - defaults
3 | 
4 | dependencies:
5 |   - pip:
6 |     - tensorflow-data-validation==1.14.0
7 |     - ibm-watson-studio-pipelines==0.2.12
8 |     - python-dotenv==1.0.0 -------------------------------------------------------------------------------- /images/2023-08-31 09_10_14-2023_04_MLOps_AOT_Initiative.drawio - diagrams.net.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/2023-08-31 09_10_14-2023_04_MLOps_AOT_Initiative.drawio - diagrams.net.png -------------------------------------------------------------------------------- /images/2023-08-31-09_10_14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/2023-08-31-09_10_14.png -------------------------------------------------------------------------------- /images/2023-09-05-11_00_27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/2023-09-05-11_00_27.png -------------------------------------------------------------------------------- /images/banner.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/banner.png -------------------------------------------------------------------------------- /images/detailed_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/detailed_overview.png -------------------------------------------------------------------------------- /images/image-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-1.png -------------------------------------------------------------------------------- /images/image-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-10.png -------------------------------------------------------------------------------- /images/image-100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-100.png -------------------------------------------------------------------------------- /images/image-101.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-101.png -------------------------------------------------------------------------------- /images/image-102.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-102.png -------------------------------------------------------------------------------- /images/image-103.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-103.png -------------------------------------------------------------------------------- /images/image-11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-11.png -------------------------------------------------------------------------------- /images/image-12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-12.png -------------------------------------------------------------------------------- /images/image-13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-13.png -------------------------------------------------------------------------------- /images/image-14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-14.png -------------------------------------------------------------------------------- /images/image-15.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-15.png -------------------------------------------------------------------------------- /images/image-16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-16.png -------------------------------------------------------------------------------- /images/image-17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-17.png -------------------------------------------------------------------------------- /images/image-18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-18.png -------------------------------------------------------------------------------- /images/image-19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-19.png -------------------------------------------------------------------------------- /images/image-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-2.png -------------------------------------------------------------------------------- /images/image-20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-20.png -------------------------------------------------------------------------------- /images/image-200.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-200.png -------------------------------------------------------------------------------- /images/image-201.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-201.png -------------------------------------------------------------------------------- /images/image-202.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-202.png -------------------------------------------------------------------------------- /images/image-203.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-203.png -------------------------------------------------------------------------------- /images/image-204.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-204.png -------------------------------------------------------------------------------- /images/image-205.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-205.png -------------------------------------------------------------------------------- /images/image-206.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-206.png -------------------------------------------------------------------------------- /images/image-207.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-207.png -------------------------------------------------------------------------------- /images/image-21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-21.png -------------------------------------------------------------------------------- /images/image-22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-22.png -------------------------------------------------------------------------------- /images/image-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-23.png -------------------------------------------------------------------------------- /images/image-24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-24.png -------------------------------------------------------------------------------- /images/image-25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-25.png -------------------------------------------------------------------------------- /images/image-26.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-26.png -------------------------------------------------------------------------------- /images/image-27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-27.png -------------------------------------------------------------------------------- /images/image-28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-28.png -------------------------------------------------------------------------------- /images/image-29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-29.png -------------------------------------------------------------------------------- /images/image-3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-3.png -------------------------------------------------------------------------------- /images/image-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-4.png -------------------------------------------------------------------------------- /images/image-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-5.png -------------------------------------------------------------------------------- /images/image-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-6.png -------------------------------------------------------------------------------- /images/image-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-7.png -------------------------------------------------------------------------------- /images/image-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-8.png -------------------------------------------------------------------------------- /images/image-9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image-9.png -------------------------------------------------------------------------------- /images/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/image.png -------------------------------------------------------------------------------- /images/overview-image-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/overview-image-1.png -------------------------------------------------------------------------------- /images/overview-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/MLOps/65d410c5d6e78fceee8516f27c8163680b712af5/images/overview-image.png -------------------------------------------------------------------------------- /utils/catalog_utils.py: -------------------------------------------------------------------------------- 1 | """__author__ == "Nijesh" 2 | email : knijesh@sg.ibm.com 3 | 4 | WKC and Model Inventory Utility Client 5 | 6 | 7 | """ 8 | 9 | import json 10 | import os 11 | from dataclasses import dataclass 12 | from operator import itemgetter 13 | 14 | import numpy as np 15 | import pandas as pd 16 | import requests 17 | from ibm_watson_machine_learning import APIClient 18 | from requests.structures import CaseInsensitiveDict 19 | 20 | 21 | @dataclass 22 | class CatalogUtils: 23 | """ 24 | Encapsulated Catalog Utils Class to enable the use of WKC via Watson Data API. 
25 | 
26 |     """
27 | 
28 |     access_token: str
29 |     project_id: str
30 |     # NOTE: the three fields below were referenced but never declared; the defaults are assumed IBM endpoints
31 |     api_key: str = ""
32 |     auth_url: str = "https://iam.cloud.ibm.com/identity/token"
33 |     service_url: str = "https://api.dataplatform.cloud.ibm.com"
34 | 
35 |     def get_wml_client(self):
36 |         wml_credentials = {
37 |             "token": self.access_token,
38 |             "instance_id": "openshift",
39 |             "url": os.environ['RUNTIME_ENV_APSX_URL'],
40 |             "version": "4.6"
41 |         }
42 |         wml_client = APIClient(wml_credentials)
43 |         return wml_client
44 | 
45 |     def create_access_token(self):
46 |         headers = {
47 |             "Content-Type": "application/x-www-form-urlencoded",
48 |         }
49 | 
50 |         data = (
51 |             f"grant_type=urn:ibm:params:oauth:grant-type:apikey&apikey={self.api_key}"
52 |         )
53 | 
54 |         response = requests.post(self.auth_url, headers=headers, data=data)
55 | 
56 |         return response.json()["access_token"]
57 | 
58 |     def list_catalogs(self):
59 |         access_token = self.create_access_token()
60 |         headers = CaseInsensitiveDict()
61 |         headers["Accept"] = "application/json"
62 |         headers["Authorization"] = f"Bearer {access_token}"
63 |         list_catalogs = requests.get(self.service_url + "/v2/catalogs", headers=headers)
64 |         return list_catalogs.json()
65 | 
66 |     def get_catalog_id_map(self):
67 |         result = self.list_catalogs()
68 |         asset_map = {}
69 |         for keys, values in result.items():
70 |             if isinstance(values, list):
71 |                 for each in values:
72 |                     asset_map[each["entity"]["name"]] = each["metadata"]["guid"]
73 |         return asset_map
74 | 
75 |     def get_latest_asset_id(self, name):
76 |         # Return the id of the most recently created model with this name
77 |         wml_client = self.get_wml_client()
78 |         wml_client.set.default_project(self.project_id)
79 |         result = wml_client.repository.get_model_details()
80 |         result_meta = [
81 |             each["metadata"]
82 |             for each in result["resources"]
83 |             if each["metadata"]["name"] == name
84 |         ]
85 | 
86 |         my_asset_list = sorted(result_meta, key=itemgetter("created_at"), reverse=True)
87 |         return my_asset_list[0]["id"]
88 | 
89 |     def get_revisions_asset(self, catalog_id, asset_id):
90 |         access_token = self.create_access_token()
91 |         headers = CaseInsensitiveDict()
92 |         headers["Accept"] = "application/json"
93 |         headers["Authorization"] = f"Bearer {access_token}"
94 |         search_asset = requests.get(
95 |             self.service_url
96 |             + f"/v2/assets/{asset_id}/revisions?catalog_id={catalog_id}",
97 |             headers=headers,
98 |         )
99 |         return search_asset.json()
100 | 
101 |     def publish_asset(self, catalog_id, asset_id, name, desc, tags):
102 |         """Publish Assets to Catalog
103 | 
104 |         Args:
105 |             catalog_id (str): catalog id
106 |             asset_id (str): Id of the asset to be published
107 |             name (str): name of the asset
108 |             desc (str): description
109 |             tags (str): asset tag
110 |         """
111 |         access_token = self.create_access_token()
112 |         url = f"{self.service_url}/v2/assets/{asset_id}/publish?project_id={self.project_id}"
113 | 
114 |         payload = json.dumps(
115 |             {
116 |                 "catalog_id": catalog_id,
117 |                 "mode": 0,
118 |                 "metadata": {
119 |                     "name": name,
120 |                     "description": desc,
121 |                     "tags": tags,
122 |                 },
123 |             }
124 |         )
125 |         headers = {
126 |             "Authorization": f"Bearer {access_token}",
127 |             "Content-Type": "application/json",
128 |         }
129 | 
130 |         response = requests.request("POST", url, headers=headers, data=payload)
131 | 
132 |         print(response.text)
133 | 
134 |     def get_model_from_registry(self, name):
135 |         """Get latest model from registry
136 | 
137 |         Args:
138 |             name (str): name of the Model
139 | 
140 |         Returns:
141 |             str: model id
142 |         """
143 |         access_token = self.create_access_token()
144 | 
145 |         url = "https://api.dataplatform.cloud.ibm.com/v1/aigov/model_inventory/model_entries?bss_account_id=27ff418fedd6aedffb8dc6ae4164a1d2"
146 | 
147 |         payload = {}
148 |         headers = {
149 | 
"Authorization": f"Bearer {access_token}", 150 | "Content-Type": "application/json", 151 | } 152 | response = requests.request("GET", url, headers=headers, data=payload) 153 | 154 | result = response.json() 155 | 156 | for each in result["results"]: 157 | for item in each["entity"]["modelfacts_global"]["physical_models"]: 158 | if ( 159 | item["name"] == name 160 | and item["container_id"] == self.project_id 161 | and item["is_deleted"] == False 162 | ): 163 | return item["id"] 164 | -------------------------------------------------------------------------------- /utils/fs_utils.py: -------------------------------------------------------------------------------- 1 | """__author__ == "Nijesh" 2 | email : knijesh@sg.ibm.com 3 | 4 | Factsheets and Model Metadata Utility Client 5 | 6 | 7 | """ 8 | 9 | 10 | import contextlib 11 | import os 12 | import time 13 | from collections import defaultdict 14 | from dataclasses import dataclass 15 | 16 | import requests 17 | from ibm_aigov_facts_client import AIGovFactsClient 18 | from ibm_watson_machine_learning import APIClient 19 | 20 | 21 | @dataclass 22 | class FSUtils: 23 | wml_client: APIClient 24 | catalog_id: str 25 | project_id: str 26 | bss_account_id: str 27 | space_id: str 28 | facts_client: AIGovFactsClient 29 | service_url: str = "https://api.dataplatform.cloud.ibm.com" 30 | 31 | def register_new_model_entry( 32 | self, model_uid, model_entry_name, model_entry_description 33 | ): 34 | self.wml_client.set.default_project(self.project_id) 35 | meta_props = { 36 | self.wml_client.factsheets.ConfigurationMetaNames.NAME: model_entry_name, 37 | self.wml_client.factsheets.ConfigurationMetaNames.DESCRIPTION: model_entry_description, 38 | self.wml_client.factsheets.ConfigurationMetaNames.MODEL_ENTRY_CATALOG_ID: self.catalog_id, 39 | } 40 | model_registration = self.wml_client.factsheets.register_model_entry( 41 | model_id=model_uid, meta_props=meta_props 42 | ) 43 | return model_registration 44 | 45 | def register_existing_model_entry(self, model_uid, model_entry_asset_id): 46 | meta_props = { 47 | self.wml_client.factsheets.ConfigurationMetaNames.ASSET_ID: model_entry_asset_id, 48 | self.wml_client.factsheets.ConfigurationMetaNames.MODEL_ENTRY_CATALOG_ID: self.catalog_id, 49 | } 50 | model_registration = self.wml_client.factsheets.register_model_entry( 51 | model_id=model_uid, meta_props=meta_props 52 | ) 53 | return model_registration 54 | 55 | def get_model_entries(self): 56 | headers = { 57 | "Content-Type": "application/json", 58 | "Accept": "application/json", 59 | "Authorization": self.wml_client._get_headers()["Authorization"], 60 | } 61 | params = {"bss_account_id": self.bss_account_id} 62 | r = requests.get( 63 | f"{self.service_url}/v1/aigov/model_inventory/{self.catalog_id}/model_entries", 64 | headers=headers, 65 | params=params, 66 | ) 67 | return r.json() 68 | 69 | def get_model_entry(self, model_entry_asset_id): 70 | headers = { 71 | "Content-Type": "application/json", 72 | "Accept": "application/json", 73 | "Authorization": self.wml_client._get_headers()["Authorization"], 74 | } 75 | params = {"catalog_id": self.catalog_id} 76 | r = requests.get( 77 | f"{self.service_url}/v1/aigov/model_inventory/model_entries/{model_entry_asset_id}", 78 | headers=headers, 79 | params=params, 80 | ) 81 | return r.json() 82 | 83 | def get_model_entry_asset_id_by_name(self, model_entry_name): 84 | response = self.get_model_entries() 85 | return next( 86 | ( 87 | x["metadata"]["asset_id"] 88 | for x in response["results"] 89 | if x["metadata"]["name"] 
== model_entry_name
90 |             ),
91 |             None,
92 |         )
93 | 
94 |     def prepare_training_reference(
95 |         self, bucket_name, apikey, crn, endpoint, training_file_name
96 |     ):
97 |         """Create a COS connection asset and build a training data reference.
98 | 
99 |         Args:
100 |             bucket_name (str): COS bucket name
101 |             apikey (str): COS API key
102 |             crn (str): CRN of the COS instance
103 |             endpoint (str): COS endpoint URL
104 |             training_file_name (str): Training Data Filename
105 | 
106 |         Returns:
107 |             list[dict]: Training Data Reference
108 |         """
109 | 
110 |         self.wml_client.set.default_project(self.project_id)
111 |         datasource_type = self.wml_client.connections.get_datasource_type_uid_by_name(
112 |             "bluemixcloudobjectstorage"
113 |         )
114 |         conn_meta_props = {
115 |             self.wml_client.connections.ConfigurationMetaNames.NAME: "MLOps COS",
116 |             self.wml_client.connections.ConfigurationMetaNames.DATASOURCE_TYPE: datasource_type,
117 |             self.wml_client.connections.ConfigurationMetaNames.DESCRIPTION: "MLOps COS connection",
118 |             self.wml_client.connections.ConfigurationMetaNames.PROPERTIES: {
119 |                 "bucket": bucket_name,
120 |                 "api_key": apikey,
121 |                 "resource_instance_id": crn,
122 |                 "iam_url": "https://iam.ng.bluemix.net/oidc/token",
123 |                 "url": endpoint,
124 |             },
125 |         }
126 | 
127 |         conn_details = self.wml_client.connections.create(meta_props=conn_meta_props)
128 |         connection_id = self.wml_client.connections.get_uid(conn_details)
129 | 
130 |         training_data_references = [
131 |             {
132 |                 "id": "German Credit Risk",
133 |                 "type": "connection_asset",
134 |                 "connection": {
135 |                     "id": connection_id,
136 |                     "href": "/v2/connections/"
137 |                     + connection_id
138 |                     + "?space_id="
139 |                     + self.space_id,
140 |                 },
141 |                 "location": {"bucket": bucket_name, "file_name": training_file_name},
142 |             }
143 |         ]
144 |         return training_data_references
145 | 
146 |     def save_model(
147 |         self,
148 |         model,
149 |         model_name,
150 |         model_entry_description,
151 |         model_entry_name,
152 |         target,
153 |         X,
154 |         y,
155 |         train_data_ref,
156 |         model_type="scikit-learn_1.0"
157 |     ):
158 |         # sourcery skip: use-named-expression
159 |         self.wml_client.set.default_project(self.project_id)
160 |         for x in self.wml_client.repository.get_model_details()["resources"]:
161 |             if x["metadata"]["name"] == model_name:
162 |                 self.wml_client.repository.delete(x["metadata"]["id"])
163 | 
164 |         run_id = self.facts_client.runs.get_current_run_id()
165 | 
166 |         self.facts_client.export_facts.export_payload(run_id)
167 | 
168 |         software_spec_uid = self.wml_client.software_specifications.get_id_by_name(
169 |             "runtime-22.2-py3.10"
170 |         )
171 | 
172 |         meta_props = {
173 |             self.wml_client.repository.ModelMetaNames.NAME: model_name,
174 |             self.wml_client.repository.ModelMetaNames.TYPE: model_type,
175 |             self.wml_client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: software_spec_uid,
176 |             self.wml_client.repository.ModelMetaNames.LABEL_FIELD: target,
177 |             self.wml_client._models.ConfigurationMetaNames.TRAINING_DATA_REFERENCES: train_data_ref,
178 |             self.wml_client.repository.ModelMetaNames.INPUT_DATA_SCHEMA: [
179 |                 {
180 |                     "id": "input_data_schema",
181 |                     "type": "list",
182 |                     "fields": [
183 |                         {"name": index, "type": value}
184 |                         for index, value in X.dtypes.astype(str).items()
185 |                     ],
186 |                 },
187 |             ],
188 |         }
189 | 
190 |         self.facts_client.export_facts.prepare_model_meta(
191 |             wml_client=self.wml_client, meta_props=meta_props
192 |         )
193 | 
194 |         model_details = self.wml_client.repository.store_model(
195 |             model=model, meta_props=meta_props, training_data=X, training_target=y
196 |         )
197 |         model_uid = self.wml_client.repository.get_model_id(model_details)
198 | model_entry_asset_id = self.get_model_entry_asset_id_by_name(model_entry_name) 199 | if model_entry_asset_id: 200 | self.register_existing_model_entry(model_uid, model_entry_asset_id) 201 | else: 202 | self.register_new_model_entry( 203 | model_uid, model_entry_name, model_entry_description 204 | ) 205 | return model_uid 206 | 207 | def save_custom_model( 208 | self, 209 | model, 210 | model_name, 211 | model_entry_description, 212 | model_entry_name, 213 | X, 214 | y, 215 | model_type="scikit-learn_1.0" 216 | ): 217 | # sourcery skip: use-named-expression 218 | self.wml_client.set.default_project(self.project_id) 219 | for x in self.wml_client.repository.get_model_details()["resources"]: 220 | if x["metadata"]["name"] == model_name: 221 | self.wml_client.repository.delete(x["metadata"]["id"]) 222 | 223 | run_id = self.facts_client.runs.get_current_run_id() 224 | 225 | self.facts_client.export_facts.export_payload(run_id) 226 | 227 | software_spec_uid = self.wml_client.software_specifications.get_id_by_name( 228 | "runtime-22.2-py3.10" 229 | ) 230 | 231 | meta_props = { 232 | self.wml_client.repository.ModelMetaNames.NAME: model_name, 233 | self.wml_client.repository.ModelMetaNames.TYPE: model_type, 234 | self.wml_client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: software_spec_uid, 235 | } 236 | 237 | self.facts_client.export_facts.prepare_model_meta( 238 | wml_client=self.wml_client, meta_props=meta_props 239 | ) 240 | 241 | model_details = self.wml_client.repository.store_model( 242 | model=model, meta_props=meta_props#, training_data=X, training_target=y 243 | ) 244 | model_uid = self.wml_client.repository.get_model_id(model_details) 245 | model_entry_asset_id = self.get_model_entry_asset_id_by_name(model_entry_name) 246 | if model_entry_asset_id: 247 | self.register_existing_model_entry(model_uid, model_entry_asset_id) 248 | else: 249 | self.register_new_model_entry( 250 | model_uid, model_entry_name, model_entry_description 251 | ) 252 | return model_uid 253 | 254 | 255 | 256 | def promote_model(self, model_uid, model_name): 257 | """ 258 | 259 | Promote the model to deployment Space by checking duplicate deployments and 260 | model assets in the space repository 261 | 262 | Args: 263 | model_uid (str): Model_ID 264 | model_name (str): Name of the Model 265 | 266 | Returns: 267 | json: model saved result in json 268 | """ 269 | self.wml_client.set.default_space(self.space_id) 270 | 271 | to_delete = defaultdict(list) 272 | 273 | model_ids = [ 274 | model["metadata"]["id"] 275 | for model in self.wml_client.repository.get_model_details()["resources"] 276 | if model["metadata"]["name"] == model_name 277 | ] 278 | 279 | for model in self.wml_client.deployments.get_details()["resources"]: 280 | if ( 281 | model["entity"]["asset"]["id"] in model_ids 282 | and "WOS-INTERNAL" not in model["metadata"]["name"] 283 | ): 284 | to_delete[model["entity"]["asset"]["id"]].append( 285 | model["metadata"]["id"] 286 | ) 287 | 288 | # to_delete = { 289 | # model["entity"]["asset"]["id"]: model["metadata"]["id"] 290 | # for model in self.wml_client.deployments.get_details()["resources"] 291 | # if model["entity"]["asset"]["id"] in model_ids 292 | # and "WOS-INTERNAL" not in model["metadata"]["name"] 293 | # } 294 | 295 | print(to_delete) 296 | 297 | ## Delete Deployment IDs and Duplicate Assets 298 | 299 | with contextlib.suppress(Exception): 300 | for key, value in to_delete.items(): 301 | for each in value: 302 | print(f"Deleting {each}") 303 | self.wml_client.deployments.delete(each) 304 | 
time.sleep(3) 305 | print(f"Deleting {key}") 306 | self.wml_client.repository.delete(key) 307 | 308 | headers = { 309 | "Content-Type": "application/json", 310 | "Accept": "application/json", 311 | "Authorization": self.wml_client._get_headers()["Authorization"], 312 | } 313 | params = {"project_id": self.project_id} 314 | data = {"mode": 0, "space_id": self.space_id} 315 | r = requests.post( 316 | f"{self.service_url}/v2/assets/{model_uid}/promote", 317 | headers=headers, 318 | params=params, 319 | json=data, 320 | ) 321 | return r.json() 322 | 323 | def deploy_model(self, space_id, deployment_name, model_uid): 324 | self.wml_client.set.default_space(space_id) 325 | # with contextlib.suppress(Exception): 326 | # for x in self.wml_client.deployments.get_details()["resources"]: 327 | # if x["metadata"]["name"] == deployment_name: 328 | # self.wml_client.deployments.delete(x["metadata"]["id"]) 329 | meta_props = { 330 | self.wml_client.deployments.ConfigurationMetaNames.NAME: deployment_name, 331 | self.wml_client.deployments.ConfigurationMetaNames.ONLINE: {}, 332 | } 333 | deployment_details = self.wml_client.deployments.create( 334 | model_uid, meta_props=meta_props 335 | ) 336 | deployment_uid = self.wml_client.deployments.get_uid(deployment_details) 337 | return deployment_uid 338 | -------------------------------------------------------------------------------- /vars_and_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import pandas as pd 4 | import pickle 5 | 6 | 7 | #### VARS #### 8 | 9 | data_path="data" 10 | raw_data_filename = "german_credit_data_biased_training.csv" 11 | raw_data_path=os.path.join(data_path,raw_data_filename) 12 | 13 | train_data_filename = "train_gcr.csv" 14 | test_data_filename = "test_gcr.csv" 15 | train_data_path=os.path.join(data_path,train_data_filename) 16 | test_data_path=os.path.join(data_path,test_data_filename) 17 | 18 | pipeline_filename = "feature_encode.pkl" 19 | pipeline_path = os.path.join(data_path, pipeline_filename) 20 | 21 | model_name="credit_risk_prediction" 22 | model_path = os.path.join(data_path, model_name+".pkl") 23 | 24 | deployment_name="credit_risk_prediction" 25 | 26 | deployment_space_id_DEV="c4238e9c-1cbd-4776-aa6e-4f6b1f865ed1" 27 | deployment_space_id_PROD="c4238e9c-1cbd-4776-aa6e-4f6b1f865ed1" 28 | 29 | #### UTILS #### 30 | 31 | def download_data_to_filesystem(raw_data_path): 32 | """ 33 | Download the german_credit_data_biased_training.csv data from a given URL and save it to the specified file path. 34 | 35 | Parameters: 36 | - raw_data_path (str): Destination path where the CSV will be saved. 37 | """ 38 | url = "https://raw.githubusercontent.com/IBM/monitor-wml-model-with-watson-openscale/master/data/german_credit_data_biased_training.csv" 39 | response = requests.get(url) 40 | 41 | # Check if the request was successful 42 | if response.status_code == 200: 43 | 44 | # Ensure the directory exists 45 | directory = os.path.dirname(raw_data_path) 46 | if not os.path.exists(directory): 47 | os.makedirs(directory) 48 | 49 | # Write the content to the file 50 | with open(raw_data_path, "wb") as file: 51 | file.write(response.content) 52 | print("Downloaded and saved as "+raw_data_path) 53 | else: 54 | print("Failed to download the CSV file. 
Status code:", response.status_code)
55 | 
56 | 
57 | def check_for_file_in_filesystem(path):
58 |     """
59 |     Check existence of path in filesystem
60 |     """
61 |     if os.path.exists(path):
62 |         return True
63 |     else:
64 |         print("File not found in specified path.")
65 |         return False
66 | 
67 | 
68 | def load_model(filename):
69 |     """
70 |     Load model from the specified file path.
71 | 
72 |     Parameters:
73 |     - filename (str): Path to the model file.
74 | 
75 |     Returns:
76 |     - object: The deserialized model/pipeline object.
77 |     """
78 |     check_for_file_in_filesystem(filename)
79 |     with open(filename, "rb") as f:
80 |         pipeline = pickle.load(f)
81 |     return pipeline
82 | 
83 | 
84 | def load_data_from_db2():
85 |     '''
86 |     currently not implemented due to issues with the flight service
87 |     '''
88 |     # data_request = {
89 |     #     'connection_name': """DB2_DATA""",
90 |     #     'interaction_properties': {
91 |     #         'select_statement': 'SELECT * FROM "CUSTOMER_DATA"."GERMAN_CREDIT_RISK_TRAINING" FETCH FIRST 5000 ROWS ONLY'
92 |     #     }
93 |     # }
94 | 
95 |     # read_client = itcfs.get_flight_client()
96 | 
97 | 
98 |     # flightInfo = itcfs.get_flight_info(read_client, nb_data_request=data_request)
99 | 
100 |     # df = itcfs.read_pandas_and_concat(read_client, flightInfo, timeout=240)
101 |     # create empty dataframe to have a valid return type
102 | 
103 |     # throw an exception to signal that this functionality is not available
104 |     print("not implemented")
105 |     raise Exception("not implemented")
106 | 
107 | 
108 | def load_german_credit_risk_data():
109 |     """
110 |     checks if it can find the data in db2 or on the local filesystem.
111 |     If necessary downloads it from the internet.
112 |     Returns it as a dataframe
113 | 
114 |     Returns:
115 |         pandas df: german credit risk data
116 |     """
117 |     try:
118 |         return load_data_from_db2()
119 |     except Exception:
120 |         print("Error while loading data from Db2; downloading csv file to filesystem instead")
121 | 
122 |     if os.path.isfile(raw_data_path):
123 |         print("File already exists in filesystem.")
124 |     else:
125 |         download_data_to_filesystem(raw_data_path)
126 |     print("loading data to pandas dataframe")
127 |     return pd.read_csv(raw_data_path)
128 | 
129 | 
130 | def save_data_in_filesystem(df, filename):
131 |     """
132 |     Save Data in Filesystem
133 | 
134 |     Passed filename should include the path
135 | 
136 |     """
137 |     try:
138 |         if filename[-3:] == "csv":
139 |             df.to_csv(filename, index=False)
140 |             print(f"File {filename} persisted successfully as csv")
141 |         else:
142 |             with open(filename, 'wb') as f:
143 |                 pickle.dump(df, f)
144 |             print(f"File {filename} pickled successfully")
145 |     except Exception as e:
146 |         print(e)
147 |         print(f"File serialization for {filename} failed")
148 | 
149 | 
150 | def load_data_from_filesystem(path):
151 |     """
152 |     Check existence of path in filesystem.
153 |     If it does exist, load the csv (or pickle) from path.
154 |     If it does NOT exist, return None.
155 |     """
156 |     if not check_for_file_in_filesystem(path):
157 |         return None
158 |     suffix = path[-3:]
159 |     # Check whether path ends on csv
160 |     if suffix == "csv":
161 |         gcf_df = pd.read_csv(path)
162 |     else:
163 |         with open(path, "rb") as f:  # pickle requires binary mode
164 |             gcf_df = pickle.load(f)
165 |     return gcf_df --------------------------------------------------------------------------------