├── .gitignore ├── README.md ├── first_experiment.ipynb ├── ml_flow_binary_classification.ipynb ├── ml_flow_dagshub.ipynb └── ml_flow_model_management.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mlflow_dagshub_demo 2 | Demo for mlflow and dagshub 3 | -------------------------------------------------------------------------------- /first_experiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "5f05cdda", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "from sklearn.datasets import make_classification\n", 12 | "from sklearn.model_selection import train_test_split\n", 13 | "from sklearn.linear_model import LogisticRegression\n", 14 | "from sklearn.ensemble import RandomForestClassifier\n", 15 | "from xgboost import XGBClassifier\n", 16 | "from sklearn.metrics import classification_report\n", 17 | "import warnings\n", 18 | "warnings.filterwarnings('ignore')" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 3, 24 | "id": "56014f75", 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/plain": [ 30 | "(array([0, 1]), array([900, 100], dtype=int64))" 31 | ] 32 | }, 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "# Step 1: Create an imbalanced binary classification dataset\n", 40 | "X, y = make_classification(n_samples=1000, n_features=10, n_informative=2, n_redundant=8, \n", 41 | " weights=[0.9, 0.1], flip_y=0, random_state=42)\n", 42 | "\n", 43 | "np.unique(y, return_counts=True)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 5, 49 | "id": "e94ae830", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# Split the dataset into training and testing sets\n", 54 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, 
random_state=42)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 13, 60 | "id": "f7d21a3c", 61 | "metadata": { 62 | "scrolled": false 63 | }, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | " precision recall f1-score support\n", 70 | "\n", 71 | " 0 0.95 0.97 0.96 270\n", 72 | " 1 0.62 0.50 0.56 30\n", 73 | "\n", 74 | " accuracy 0.92 300\n", 75 | " macro avg 0.79 0.73 0.76 300\n", 76 | "weighted avg 0.91 0.92 0.92 300\n", 77 | "\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "# Define the model hyperparameters\n", 83 | "params = {\n", 84 | " \"solver\": \"lbfgs\",\n", 85 | " \"max_iter\": 1000,\n", 86 | " \"multi_class\": \"auto\",\n", 87 | " \"random_state\": 8888,\n", 88 | "}\n", 89 | "\n", 90 | "# Train the model\n", 91 | "lr = LogisticRegression(**params)\n", 92 | "lr.fit(X_train, y_train)\n", 93 | "\n", 94 | "# Predict on the test set\n", 95 | "y_pred = lr.predict(X_test)\n", 96 | "\n", 97 | "report = classification_report(y_test, y_pred)\n", 98 | "print(report)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 15, 104 | "id": "c37eb3c8", 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "{'0': {'precision': 0.9456521739130435,\n", 111 | " 'recall': 0.9666666666666667,\n", 112 | " 'f1-score': 0.956043956043956,\n", 113 | " 'support': 270.0},\n", 114 | " '1': {'precision': 0.625,\n", 115 | " 'recall': 0.5,\n", 116 | " 'f1-score': 0.5555555555555556,\n", 117 | " 'support': 30.0},\n", 118 | " 'accuracy': 0.92,\n", 119 | " 'macro avg': {'precision': 0.7853260869565217,\n", 120 | " 'recall': 0.7333333333333334,\n", 121 | " 'f1-score': 0.7557997557997558,\n", 122 | " 'support': 300.0},\n", 123 | " 'weighted avg': {'precision': 0.9135869565217392,\n", 124 | " 'recall': 0.92,\n", 125 | " 'f1-score': 0.915995115995116,\n", 126 | " 'support': 300.0}}" 127 | ] 128 | }, 129 | "execution_count": 15, 130 | "metadata": {}, 131 | 
"output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "report_dict = classification_report(y_test, y_pred, output_dict=True)\n", 136 | "report_dict" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 7, 142 | "id": "66f89a13", 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "import mlflow" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 23, 152 | "id": "f380ca75", 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "name": "stderr", 157 | "output_type": "stream", 158 | "text": [ 159 | "2024/07/29 13:57:02 INFO mlflow.tracking.fluent: Experiment with name 'First Experiment' does not exist. Creating a new experiment.\n", 160 | "Registered model 'tracking-quickstart' already exists. Creating a new version of this model...\n", 161 | "2024/07/29 13:57:04 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-quickstart, version 2\n", 162 | "Created version '2' of model 'tracking-quickstart'.\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "mlflow.set_experiment(\"First Experiment\")\n", 168 | "mlflow.set_tracking_uri(uri=\"http://127.0.0.1:5000/\")\n", 169 | "\n", 170 | "with mlflow.start_run():\n", 171 | " mlflow.log_params(params)\n", 172 | " mlflow.log_metrics({\n", 173 | " 'accuracy': report_dict['accuracy'],\n", 174 | " 'recall_class_0': report_dict['0']['recall'],\n", 175 | " 'recall_class_1': report_dict['1']['recall'],\n", 176 | " 'f1_score_macro': report_dict['macro avg']['f1-score']\n", 177 | " })\n", 178 | " mlflow.sklearn.log_model(lr, \"Logistic Regression\") " 179 | ] 180 | } 181 | ], 182 | "metadata": { 183 | "kernelspec": { 184 | "display_name": "Python 3 (ipykernel)", 185 | "language": "python", 186 | "name": "python3" 187 | }, 188 | "language_info": { 189 | "codemirror_mode": { 190 | "name": "ipython", 191 | "version": 3 192 | }, 193 | "file_extension": ".py", 194 | 
"mimetype": "text/x-python", 195 | "name": "python", 196 | "nbconvert_exporter": "python", 197 | "pygments_lexer": "ipython3", 198 | "version": "3.10.11" 199 | } 200 | }, 201 | "nbformat": 4, 202 | "nbformat_minor": 5 203 | } 204 | -------------------------------------------------------------------------------- /ml_flow_binary_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "f4e36302", 6 | "metadata": {}, 7 | "source": [ 8 | "

Codebasics ML Course: MLflow Tutorial

" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 37, 14 | "id": "295e5486", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "from sklearn.datasets import make_classification\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "from sklearn.linear_model import LogisticRegression\n", 22 | "from sklearn.ensemble import RandomForestClassifier\n", 23 | "from xgboost import XGBClassifier\n", 24 | "from sklearn.metrics import classification_report\n", 25 | "import warnings\n", 26 | "warnings.filterwarnings('ignore')" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 38, 32 | "id": "ac73cd36", 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "(array([0, 1]), array([900, 100], dtype=int64))" 39 | ] 40 | }, 41 | "execution_count": 38, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "# Step 1: Create an imbalanced binary classification dataset\n", 48 | "X, y = make_classification(n_samples=1000, n_features=10, n_informative=2, n_redundant=8, \n", 49 | " weights=[0.9, 0.1], flip_y=0, random_state=42)\n", 50 | "\n", 51 | "np.unique(y, return_counts=True)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 39, 57 | "id": "0934ac03", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# Split the dataset into training and testing sets\n", 62 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "id": "027f7e0a", 68 | "metadata": {}, 69 | "source": [ 70 | "### Experiment 1: Train Logistic Regression Classifier" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 40, 76 | "id": "df52d46a", 77 | "metadata": { 78 | "scrolled": true 79 | }, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 
| " precision recall f1-score support\n", 86 | "\n", 87 | " 0 0.95 0.96 0.95 270\n", 88 | " 1 0.60 0.50 0.55 30\n", 89 | "\n", 90 | " accuracy 0.92 300\n", 91 | " macro avg 0.77 0.73 0.75 300\n", 92 | "weighted avg 0.91 0.92 0.91 300\n", 93 | "\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "log_reg = LogisticRegression(C=1, solver='liblinear')\n", 99 | "log_reg.fit(X_train, y_train)\n", 100 | "y_pred_log_reg = log_reg.predict(X_test)\n", 101 | "print(classification_report(y_test, y_pred_log_reg))" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "id": "1468bab4", 107 | "metadata": {}, 108 | "source": [ 109 | "### Experiment 2: Train Random Forest Classifier" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 41, 115 | "id": "2742e30d", 116 | "metadata": { 117 | "scrolled": true 118 | }, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | " precision recall f1-score support\n", 125 | "\n", 126 | " 0 0.96 1.00 0.98 270\n", 127 | " 1 0.95 0.67 0.78 30\n", 128 | "\n", 129 | " accuracy 0.96 300\n", 130 | " macro avg 0.96 0.83 0.88 300\n", 131 | "weighted avg 0.96 0.96 0.96 300\n", 132 | "\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "rf_clf = RandomForestClassifier(n_estimators=30, max_depth=3)\n", 138 | "rf_clf.fit(X_train, y_train)\n", 139 | "y_pred_rf = rf_clf.predict(X_test)\n", 140 | "print(classification_report(y_test, y_pred_rf))" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "id": "7db18915", 146 | "metadata": {}, 147 | "source": [ 148 | "### Experiment 3: Train XGBoost" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 42, 154 | "id": "fa3fe3e3", 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | " precision recall f1-score support\n", 162 | "\n", 163 | " 0 0.98 1.00 0.99 270\n", 164 | " 1 0.96 0.80 0.87 30\n", 165 | "\n", 166 | " accuracy 0.98 
300\n", 167 | " macro avg 0.97 0.90 0.93 300\n", 168 | "weighted avg 0.98 0.98 0.98 300\n", 169 | "\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')\n", 175 | "xgb_clf.fit(X_train, y_train)\n", 176 | "y_pred_xgb = xgb_clf.predict(X_test)\n", 177 | "print(classification_report(y_test, y_pred_xgb))" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "id": "b70bbef1", 183 | "metadata": {}, 184 | "source": [ 185 | "### Experiment 4: Handle class imbalance using SMOTETomek and then Train XGBoost" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 43, 191 | "id": "5ecbe6a5", 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "(array([0, 1]), array([619, 619], dtype=int64))" 198 | ] 199 | }, 200 | "execution_count": 43, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "from imblearn.combine import SMOTETomek\n", 207 | "\n", 208 | "smt = SMOTETomek(random_state=42)\n", 209 | "X_train_res, y_train_res = smt.fit_resample(X_train, y_train)\n", 210 | "\n", 211 | "np.unique(y_train_res, return_counts=True)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 44, 217 | "id": "2b931191", 218 | "metadata": { 219 | "scrolled": true 220 | }, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | " precision recall f1-score support\n", 227 | "\n", 228 | " 0 0.98 0.98 0.98 270\n", 229 | " 1 0.81 0.83 0.82 30\n", 230 | "\n", 231 | " accuracy 0.96 300\n", 232 | " macro avg 0.89 0.91 0.90 300\n", 233 | "weighted avg 0.96 0.96 0.96 300\n", 234 | "\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')\n", 240 | "xgb_clf.fit(X_train_res, y_train_res)\n", 241 | "y_pred_xgb = xgb_clf.predict(X_test)\n", 242 | "print(classification_report(y_test, 
y_pred_xgb))" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "id": "8ac546b4", 248 | "metadata": {}, 249 | "source": [ 250 | "

Track Experiments Using MLflow

" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 59, 256 | "id": "9fc788a3", 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "models = [\n", 261 | " (\n", 262 | " \"Logistic Regression\", \n", 263 | " LogisticRegression(C=1, solver='liblinear'), \n", 264 | " (X_train, y_train),\n", 265 | " (X_test, y_test)\n", 266 | " ),\n", 267 | " (\n", 268 | " \"Random Forest\", \n", 269 | " RandomForestClassifier(n_estimators=30, max_depth=3), \n", 270 | " (X_train, y_train),\n", 271 | " (X_test, y_test)\n", 272 | " ),\n", 273 | " (\n", 274 | " \"XGBClassifier\",\n", 275 | " XGBClassifier(use_label_encoder=False, eval_metric='logloss'), \n", 276 | " (X_train, y_train),\n", 277 | " (X_test, y_test)\n", 278 | " ),\n", 279 | " (\n", 280 | " \"XGBClassifier With SMOTE\",\n", 281 | " XGBClassifier(use_label_encoder=False, eval_metric='logloss'), \n", 282 | " (X_train_res, y_train_res),\n", 283 | " (X_test, y_test)\n", 284 | " )\n", 285 | "]" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 60, 291 | "id": "1a827a88", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "reports = []\n", 296 | "\n", 297 | "for model_name, model, train_set, test_set in models:\n", 298 | " X_train = train_set[0]\n", 299 | " y_train = train_set[1]\n", 300 | " X_test = test_set[0]\n", 301 | " y_test = test_set[1]\n", 302 | " \n", 303 | " model.fit(X_train, y_train)\n", 304 | " y_pred = model.predict(X_test)\n", 305 | " report = classification_report(y_test, y_pred, output_dict=True)\n", 306 | " reports.append(report)" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 61, 312 | "id": "29ca91b0", 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "import mlflow\n", 317 | "import mlflow.sklearn\n", 318 | "import mlflow.xgboost" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 63, 324 | "id": "420f2511", 325 | "metadata": { 326 | "scrolled": false 
327 | }, 328 | "outputs": [], 329 | "source": [ 330 | "# Initialize MLflow\n", 331 | "mlflow.set_experiment(\"Anomaly Detection\")\n", 332 | "mlflow.set_tracking_uri(\"http://localhost:5000\")\n", 333 | "\n", 334 | "for i, element in enumerate(models):\n", 335 | " model_name = element[0]\n", 336 | " model = element[1]\n", 337 | " report = reports[i]\n", 338 | " \n", 339 | " with mlflow.start_run(run_name=model_name): \n", 340 | " mlflow.log_param(\"model\", model_name)\n", 341 | " mlflow.log_metric('accuracy', report['accuracy'])\n", 342 | " mlflow.log_metric('recall_class_1', report['1']['recall'])\n", 343 | " mlflow.log_metric('recall_class_0', report['0']['recall'])\n", 344 | " mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score']) \n", 345 | " \n", 346 | " if \"XGB\" in model_name:\n", 347 | " mlflow.xgboost.log_model(model, \"model\")\n", 348 | " else:\n", 349 | " mlflow.sklearn.log_model(model, \"model\") " 350 | ] 351 | } 352 | ], 353 | "metadata": { 354 | "kernelspec": { 355 | "display_name": "Python 3 (ipykernel)", 356 | "language": "python", 357 | "name": "python3" 358 | }, 359 | "language_info": { 360 | "codemirror_mode": { 361 | "name": "ipython", 362 | "version": 3 363 | }, 364 | "file_extension": ".py", 365 | "mimetype": "text/x-python", 366 | "name": "python", 367 | "nbconvert_exporter": "python", 368 | "pygments_lexer": "ipython3", 369 | "version": "3.10.11" 370 | } 371 | }, 372 | "nbformat": 4, 373 | "nbformat_minor": 5 374 | } 375 | -------------------------------------------------------------------------------- /ml_flow_dagshub.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b4f9d400", 6 | "metadata": {}, 7 | "source": [ 8 | "

Codebasics ML Course: MLflow DagsHub Tutorial

" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "5eb3c2b2", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "from sklearn.datasets import make_classification\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "from sklearn.linear_model import LogisticRegression\n", 22 | "from sklearn.ensemble import RandomForestClassifier\n", 23 | "from xgboost import XGBClassifier\n", 24 | "from sklearn.metrics import classification_report\n", 25 | "import warnings\n", 26 | "warnings.filterwarnings('ignore')" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "id": "0878fc4c", 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "(array([0, 1]), array([900, 100], dtype=int64))" 39 | ] 40 | }, 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "# Step 1: Create an imbalanced binary classification dataset\n", 48 | "X, y = make_classification(n_samples=1000, n_features=10, n_informative=2, n_redundant=8, \n", 49 | " weights=[0.9, 0.1], flip_y=0, random_state=42)\n", 50 | "\n", 51 | "np.unique(y, return_counts=True)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "id": "2a6b80dd", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# Split the dataset into training and testing sets\n", 62 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "id": "f3a6191b", 68 | "metadata": {}, 69 | "source": [ 70 | "#### Handle class imbalance" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "id": "3190fd47", 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "(array([0, 1]), array([619, 619], dtype=int64))" 83 | ] 84 | }, 85 | "execution_count": 
4, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "from imblearn.combine import SMOTETomek\n", 92 | "\n", 93 | "smt = SMOTETomek(random_state=42)\n", 94 | "X_train_res, y_train_res = smt.fit_resample(X_train, y_train)\n", 95 | "np.unique(y_train_res, return_counts=True)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "id": "75b6f715", 101 | "metadata": {}, 102 | "source": [ 103 | "### Track Experiments" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "id": "1eb49554", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "models = [\n", 114 | " (\n", 115 | " \"Logistic Regression\", \n", 116 | " {\"C\": 1, \"solver\": 'liblinear'},\n", 117 | " LogisticRegression(), \n", 118 | " (X_train, y_train),\n", 119 | " (X_test, y_test)\n", 120 | " ),\n", 121 | " (\n", 122 | " \"Random Forest\", \n", 123 | " {\"n_estimators\": 30, \"max_depth\": 3},\n", 124 | " RandomForestClassifier(), \n", 125 | " (X_train, y_train),\n", 126 | " (X_test, y_test)\n", 127 | " ),\n", 128 | " (\n", 129 | " \"XGBClassifier\",\n", 130 | " {\"use_label_encoder\": False, \"eval_metric\": 'logloss'},\n", 131 | " XGBClassifier(), \n", 132 | " (X_train, y_train),\n", 133 | " (X_test, y_test)\n", 134 | " ),\n", 135 | " (\n", 136 | " \"XGBClassifier With SMOTE\",\n", 137 | " {\"use_label_encoder\": False, \"eval_metric\": 'logloss'},\n", 138 | " XGBClassifier(), \n", 139 | " (X_train_res, y_train_res),\n", 140 | " (X_test, y_test)\n", 141 | " )\n", 142 | "]" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "id": "a91ad5ae", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "reports = []\n", 153 | "\n", 154 | "for model_name, params, model, train_set, test_set in models:\n", 155 | " X_train = train_set[0]\n", 156 | " y_train = train_set[1]\n", 157 | " X_test = test_set[0]\n", 158 | " y_test = test_set[1]\n", 159 | " \n", 160 | " 
model.set_params(**params)\n", 161 | " model.fit(X_train, y_train)\n", 162 | " y_pred = model.predict(X_test)\n", 163 | " report = classification_report(y_test, y_pred, output_dict=True)\n", 164 | " reports.append(report)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 7, 170 | "id": "08741b0a", 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "import mlflow\n", 175 | "import mlflow.sklearn\n", 176 | "import mlflow.xgboost" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 8, 182 | "id": "503f6e93", 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/html": [ 188 | "
Accessing as dhavalsays\n",
189 |        "
\n" 190 | ], 191 | "text/plain": [ 192 | "Accessing as dhavalsays\n" 193 | ] 194 | }, 195 | "metadata": {}, 196 | "output_type": "display_data" 197 | }, 198 | { 199 | "data": { 200 | "text/html": [ 201 | "
Initialized MLflow to track repo \"learnpythonlanguage/mlflow_dagshub_demo\"\n",
202 |        "
\n" 203 | ], 204 | "text/plain": [ 205 | "Initialized MLflow to track repo \u001b[32m\"learnpythonlanguage/mlflow_dagshub_demo\"\u001b[0m\n" 206 | ] 207 | }, 208 | "metadata": {}, 209 | "output_type": "display_data" 210 | }, 211 | { 212 | "data": { 213 | "text/html": [ 214 | "
Repository learnpythonlanguage/mlflow_dagshub_demo initialized!\n",
215 |        "
\n" 216 | ], 217 | "text/plain": [ 218 | "Repository learnpythonlanguage/mlflow_dagshub_demo initialized!\n" 219 | ] 220 | }, 221 | "metadata": {}, 222 | "output_type": "display_data" 223 | } 224 | ], 225 | "source": [ 226 | "# dagshub setup\n", 227 | "\n", 228 | "import dagshub\n", 229 | "dagshub.init(repo_owner='learnpythonlanguage', repo_name='mlflow_dagshub_demo', mlflow=True)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 12, 235 | "id": "cfcc9a19", 236 | "metadata": { 237 | "scrolled": false 238 | }, 239 | "outputs": [ 240 | { 241 | "name": "stderr", 242 | "output_type": "stream", 243 | "text": [ 244 | "2024/08/01 11:50:40 INFO mlflow.tracking.fluent: Experiment with name 'Anomaly Detection' does not exist. Creating a new experiment.\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "# Ideally you will not require following 4 lines if you have started fresh and do not have any previous dagshub credentials on your computer\n", 250 | "import os\n", 251 | "os.environ['MLFLOW_TRACKING_USERNAME'] = 'your user name' # 'learnpythonlanguage'\n", 252 | "os.environ['MLFLOW_TRACKING_PASSWORD'] = 'your password' # \n", 253 | "os.environ['MLFLOW_TRACKING_URI'] = 'your dagshub unique uri' # https://dagshub.com/learnpythonlanguage/mlflow_dagshub_demo.mlflow\n", 254 | "\n", 255 | "# Initialize MLflow\n", 256 | "mlflow.set_experiment(\"Anomaly Detection\")\n", 257 | "# mlflow.set_tracking_uri(\"http://localhost:5000\")\n", 258 | "\n", 259 | "for i, element in enumerate(models):\n", 260 | " model_name = element[0]\n", 261 | " params = element[1]\n", 262 | " model = element[2]\n", 263 | " report = reports[i]\n", 264 | " \n", 265 | " with mlflow.start_run(run_name=model_name): \n", 266 | " mlflow.log_params(params)\n", 267 | " mlflow.log_metrics({\n", 268 | " 'accuracy': report['accuracy'],\n", 269 | " 'recall_class_1': report['1']['recall'],\n", 270 | " 'recall_class_0': report['0']['recall'],\n", 271 | " 'f1_score_macro': report['macro 
avg']['f1-score']\n", 272 | " }) \n", 273 | " \n", 274 | " if \"XGB\" in model_name:\n", 275 | " mlflow.xgboost.log_model(model, \"model\")\n", 276 | " else:\n", 277 | " mlflow.sklearn.log_model(model, \"model\") " 278 | ] 279 | } 280 | ], 281 | "metadata": { 282 | "kernelspec": { 283 | "display_name": "Python 3 (ipykernel)", 284 | "language": "python", 285 | "name": "python3" 286 | }, 287 | "language_info": { 288 | "codemirror_mode": { 289 | "name": "ipython", 290 | "version": 3 291 | }, 292 | "file_extension": ".py", 293 | "mimetype": "text/x-python", 294 | "name": "python", 295 | "nbconvert_exporter": "python", 296 | "pygments_lexer": "ipython3", 297 | "version": "3.10.11" 298 | } 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 5 302 | } 303 | -------------------------------------------------------------------------------- /ml_flow_model_management.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "20086d7c", 6 | "metadata": {}, 7 | "source": [ 8 | "

Codebasics ML Course: MLflow Tutorial

" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 29, 14 | "id": "2134f63a", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "from sklearn.datasets import make_classification\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "from sklearn.linear_model import LogisticRegression\n", 22 | "from sklearn.ensemble import RandomForestClassifier\n", 23 | "from xgboost import XGBClassifier\n", 24 | "from sklearn.metrics import classification_report\n", 25 | "import warnings\n", 26 | "warnings.filterwarnings('ignore')" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 30, 32 | "id": "8a467445", 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "(array([0, 1]), array([900, 100], dtype=int64))" 39 | ] 40 | }, 41 | "execution_count": 30, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "# Step 1: Create an imbalanced binary classification dataset\n", 48 | "X, y = make_classification(n_samples=1000, n_features=10, n_informative=2, n_redundant=8, \n", 49 | " weights=[0.9, 0.1], flip_y=0, random_state=42)\n", 50 | "\n", 51 | "np.unique(y, return_counts=True)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 31, 57 | "id": "7fc473ad", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# Split the dataset into training and testing sets\n", 62 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "id": "ce174acd", 68 | "metadata": {}, 69 | "source": [ 70 | "#### Handle class imbalance" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 32, 76 | "id": "0c6d768a", 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "(array([0, 1]), array([619, 619], dtype=int64))" 83 | ] 84 | }, 85 | 
"execution_count": 32, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "from imblearn.combine import SMOTETomek\n", 92 | "\n", 93 | "smt = SMOTETomek(random_state=42)\n", 94 | "X_train_res, y_train_res = smt.fit_resample(X_train, y_train)\n", 95 | "np.unique(y_train_res, return_counts=True)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "id": "f04a52b2", 101 | "metadata": {}, 102 | "source": [ 103 | "### Track Experiments" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 33, 109 | "id": "82fdaf1c", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "models = [\n", 114 | " (\n", 115 | " \"Logistic Regression\", \n", 116 | " {\"C\": 1, \"solver\": 'liblinear'},\n", 117 | " LogisticRegression(), \n", 118 | " (X_train, y_train),\n", 119 | " (X_test, y_test)\n", 120 | " ),\n", 121 | " (\n", 122 | " \"Random Forest\", \n", 123 | " {\"n_estimators\": 30, \"max_depth\": 3},\n", 124 | " RandomForestClassifier(), \n", 125 | " (X_train, y_train),\n", 126 | " (X_test, y_test)\n", 127 | " ),\n", 128 | " (\n", 129 | " \"XGBClassifier\",\n", 130 | " {\"use_label_encoder\": False, \"eval_metric\": 'logloss'},\n", 131 | " XGBClassifier(), \n", 132 | " (X_train, y_train),\n", 133 | " (X_test, y_test)\n", 134 | " ),\n", 135 | " (\n", 136 | " \"XGBClassifier With SMOTE\",\n", 137 | " {\"use_label_encoder\": False, \"eval_metric\": 'logloss'},\n", 138 | " XGBClassifier(), \n", 139 | " (X_train_res, y_train_res),\n", 140 | " (X_test, y_test)\n", 141 | " )\n", 142 | "]" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 34, 148 | "id": "13a992c3", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "reports = []\n", 153 | "\n", 154 | "for model_name, params, model, train_set, test_set in models:\n", 155 | " X_train = train_set[0]\n", 156 | " y_train = train_set[1]\n", 157 | " X_test = test_set[0]\n", 158 | " y_test = test_set[1]\n", 159 | " \n", 
160 | " model.set_params(**params)\n", 161 | " model.fit(X_train, y_train)\n", 162 | " y_pred = model.predict(X_test)\n", 163 | " report = classification_report(y_test, y_pred, output_dict=True)\n", 164 | " reports.append(report)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 35, 170 | "id": "d9301bc0", 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "import mlflow\n", 175 | "import mlflow.sklearn\n", 176 | "import mlflow.xgboost" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 36, 182 | "id": "9ad9cf4d", 183 | "metadata": { 184 | "scrolled": false 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "# Initialize MLflow\n", 189 | "mlflow.set_experiment(\"Anomaly Detection\")\n", 190 | "mlflow.set_tracking_uri(\"http://localhost:5000\")\n", 191 | "\n", 192 | "for i, element in enumerate(models):\n", 193 | " model_name = element[0]\n", 194 | " params = element[1]\n", 195 | " model = element[2]\n", 196 | " report = reports[i]\n", 197 | " \n", 198 | " with mlflow.start_run(run_name=model_name): \n", 199 | " mlflow.log_params(params)\n", 200 | " mlflow.log_metrics({\n", 201 | " 'accuracy': report['accuracy'],\n", 202 | " 'recall_class_1': report['1']['recall'],\n", 203 | " 'recall_class_0': report['0']['recall'],\n", 204 | " 'f1_score_macro': report['macro avg']['f1-score']\n", 205 | " }) \n", 206 | " \n", 207 | " if \"XGB\" in model_name:\n", 208 | " mlflow.xgboost.log_model(model, \"model\")\n", 209 | " else:\n", 210 | " mlflow.sklearn.log_model(model, \"model\") " 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "id": "7446ae8a", 216 | "metadata": {}, 217 | "source": [ 218 | "### Register the Model" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "id": "51c0013a", 225 | "metadata": { 226 | "scrolled": false 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "model_name = 'XGB-Smote'\n", 231 | "run_id=input('Please type RunID')\n", 
232 | "model_uri = f'runs:/{run_id}/model_name'\n", 233 | "\n", 234 | "with mlflow.start_run(run_id=run_id):\n", 235 | " mlflow.register_model(model_uri=model_uri, name=model_name)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "id": "1b074a08", 241 | "metadata": {}, 242 | "source": [ 243 | "### Load the Model" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 41, 249 | "id": "a40fef12", 250 | "metadata": { 251 | "scrolled": true 252 | }, 253 | "outputs": [ 254 | { 255 | "name": "stderr", 256 | "output_type": "stream", 257 | "text": [ 258 | "Downloading artifacts: 100%|██████████| 5/5 [00:02<00:00, 2.33it/s]\n" 259 | ] 260 | }, 261 | { 262 | "data": { 263 | "text/plain": [ 264 | "array([0, 0, 0, 0])" 265 | ] 266 | }, 267 | "execution_count": 41, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "model_version = 1\n", 274 | "model_uri = f\"models:/{model_name}/{model_version}\"\n", 275 | "\n", 276 | "loaded_model = mlflow.xgboost.load_model(model_uri)\n", 277 | "y_pred = loaded_model.predict(X_test)\n", 278 | "y_pred[:4]" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "id": "5f8d2893", 284 | "metadata": {}, 285 | "source": [ 286 | "### Transition the Model to Production" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 42, 292 | "id": "9c8ac1e3", 293 | "metadata": { 294 | "scrolled": true 295 | }, 296 | "outputs": [ 297 | { 298 | "data": { 299 | "text/plain": [ 300 | "" 301 | ] 302 | }, 303 | "execution_count": 42, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "current_model_uri = f\"models:/{model_name}@challenger\"\n", 310 | "production_model_name = \"anomaly-detection-prod\"\n", 311 | "\n", 312 | "client = mlflow.MlflowClient()\n", 313 | "client.copy_model_version(src_model_uri=current_model_uri, dst_name=production_model_name)" 314 | ] 315 | }, 316 | { 317 | "cell_type": 
"code", 318 | "execution_count": 43, 319 | "id": "4297a2fe", 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "name": "stderr", 324 | "output_type": "stream", 325 | "text": [ 326 | "Downloading artifacts: 100%|██████████| 5/5 [00:02<00:00, 2.33it/s]\n" 327 | ] 328 | }, 329 | { 330 | "data": { 331 | "text/plain": [ 332 | "array([0, 0, 0, 0])" 333 | ] 334 | }, 335 | "execution_count": 43, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "model_version = 1\n", 342 | "prod_model_uri = f\"models:/{production_model_name}@champion\"\n", 343 | "\n", 344 | "loaded_model = mlflow.xgboost.load_model(prod_model_uri)\n", 345 | "y_pred = loaded_model.predict(X_test)\n", 346 | "y_pred[:4]" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "id": "ca565a87", 352 | "metadata": {}, 353 | "source": [ 354 | "Please refer to the following page to learn more about the MLflow Model Registry:\n", 355 | "\n", 356 | "https://mlflow.org/docs/latest/model-registry.html#model-registry-workflows" 357 | ] 358 | } 359 | ], 360 | "metadata": { 361 | "kernelspec": { 362 | "display_name": "Python 3 (ipykernel)", 363 | "language": "python", 364 | "name": "python3" 365 | }, 366 | "language_info": { 367 | "codemirror_mode": { 368 | "name": "ipython", 369 | "version": 3 370 | }, 371 | "file_extension": ".py", 372 | "mimetype": "text/x-python", 373 | "name": "python", 374 | "nbconvert_exporter": "python", 375 | "pygments_lexer": "ipython3", 376 | "version": "3.10.11" 377 | } 378 | }, 379 | "nbformat": 4, 380 | "nbformat_minor": 5 381 | } 382 | --------------------------------------------------------------------------------