├── .gitignore
├── README.md
├── first_experiment.ipynb
├── ml_flow_binary_classification.ipynb
├── ml_flow_dagshub.ipynb
└── ml_flow_model_management.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # mlflow_dagshub_demo
2 | Demo notebooks for experiment tracking and model management with MLflow and DagsHub.
3 |
--------------------------------------------------------------------------------
/first_experiment.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "id": "5f05cdda",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import numpy as np\n",
11 | "from sklearn.datasets import make_classification\n",
12 | "from sklearn.model_selection import train_test_split\n",
13 | "from sklearn.linear_model import LogisticRegression\n",
14 | "from sklearn.ensemble import RandomForestClassifier\n",
15 | "from xgboost import XGBClassifier\n",
16 | "from sklearn.metrics import classification_report\n",
17 | "import warnings\n",
18 | "warnings.filterwarnings('ignore')"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 3,
24 | "id": "56014f75",
25 | "metadata": {},
26 | "outputs": [
27 | {
28 | "data": {
29 | "text/plain": [
30 | "(array([0, 1]), array([900, 100], dtype=int64))"
31 | ]
32 | },
33 | "execution_count": 3,
34 | "metadata": {},
35 | "output_type": "execute_result"
36 | }
37 | ],
38 | "source": [
39 | "# Step 1: Create an imbalanced binary classification dataset\n",
40 | "X, y = make_classification(n_samples=1000, n_features=10, n_informative=2, n_redundant=8, \n",
41 | " weights=[0.9, 0.1], flip_y=0, random_state=42)\n",
42 | "\n",
43 | "np.unique(y, return_counts=True)"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 5,
49 | "id": "e94ae830",
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "# Split the dataset into training and testing sets\n",
54 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 13,
60 | "id": "f7d21a3c",
61 | "metadata": {
62 | "scrolled": false
63 | },
64 | "outputs": [
65 | {
66 | "name": "stdout",
67 | "output_type": "stream",
68 | "text": [
69 | " precision recall f1-score support\n",
70 | "\n",
71 | " 0 0.95 0.97 0.96 270\n",
72 | " 1 0.62 0.50 0.56 30\n",
73 | "\n",
74 | " accuracy 0.92 300\n",
75 | " macro avg 0.79 0.73 0.76 300\n",
76 | "weighted avg 0.91 0.92 0.92 300\n",
77 | "\n"
78 | ]
79 | }
80 | ],
81 | "source": [
82 | "# Define the model hyperparameters\n",
83 | "params = {\n",
84 | " \"solver\": \"lbfgs\",\n",
85 | " \"max_iter\": 1000,\n",
86 | " \"multi_class\": \"auto\",\n",
87 | " \"random_state\": 8888,\n",
88 | "}\n",
89 | "\n",
90 | "# Train the model\n",
91 | "lr = LogisticRegression(**params)\n",
92 | "lr.fit(X_train, y_train)\n",
93 | "\n",
94 | "# Predict on the test set\n",
95 | "y_pred = lr.predict(X_test)\n",
96 | "\n",
97 | "report = classification_report(y_test, y_pred)\n",
98 | "print(report)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 15,
104 | "id": "c37eb3c8",
105 | "metadata": {},
106 | "outputs": [
107 | {
108 | "data": {
109 | "text/plain": [
110 | "{'0': {'precision': 0.9456521739130435,\n",
111 | " 'recall': 0.9666666666666667,\n",
112 | " 'f1-score': 0.956043956043956,\n",
113 | " 'support': 270.0},\n",
114 | " '1': {'precision': 0.625,\n",
115 | " 'recall': 0.5,\n",
116 | " 'f1-score': 0.5555555555555556,\n",
117 | " 'support': 30.0},\n",
118 | " 'accuracy': 0.92,\n",
119 | " 'macro avg': {'precision': 0.7853260869565217,\n",
120 | " 'recall': 0.7333333333333334,\n",
121 | " 'f1-score': 0.7557997557997558,\n",
122 | " 'support': 300.0},\n",
123 | " 'weighted avg': {'precision': 0.9135869565217392,\n",
124 | " 'recall': 0.92,\n",
125 | " 'f1-score': 0.915995115995116,\n",
126 | " 'support': 300.0}}"
127 | ]
128 | },
129 | "execution_count": 15,
130 | "metadata": {},
131 | "output_type": "execute_result"
132 | }
133 | ],
134 | "source": [
135 | "report_dict = classification_report(y_test, y_pred, output_dict=True)\n",
136 | "report_dict"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 7,
142 | "id": "66f89a13",
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "import mlflow"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 23,
152 | "id": "f380ca75",
153 | "metadata": {},
154 | "outputs": [
155 | {
156 | "name": "stderr",
157 | "output_type": "stream",
158 | "text": [
159 | "2024/07/29 13:57:02 INFO mlflow.tracking.fluent: Experiment with name 'First Experiment' does not exist. Creating a new experiment.\n",
160 | "Registered model 'tracking-quickstart' already exists. Creating a new version of this model...\n",
161 | "2024/07/29 13:57:04 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-quickstart, version 2\n",
162 | "Created version '2' of model 'tracking-quickstart'.\n"
163 | ]
164 | }
165 | ],
166 | "source": [
167 | "# Select the tracking server first; set_experiment resolves the experiment against the current tracking URI\n",
168 | "mlflow.set_tracking_uri(uri=\"http://127.0.0.1:5000/\")\n",
169 | "mlflow.set_experiment(\"First Experiment\")\n",
170 | "\n",
171 | "# Log the hyperparameters, selected test-set metrics and the fitted model in a single run\n",
172 | "with mlflow.start_run():\n",
173 | "    mlflow.log_params(params)\n",
174 | "    mlflow.log_metrics({\n",
175 | "        'accuracy': report_dict['accuracy'],\n",
176 | "        'recall_class_0': report_dict['0']['recall'],\n",
177 | "        'recall_class_1': report_dict['1']['recall'],\n",
178 | "        'f1_score_macro': report_dict['macro avg']['f1-score']\n",
179 | "    })\n",
180 | "    mlflow.sklearn.log_model(lr, \"Logistic Regression\")"
179 | ]
180 | }
181 | ],
182 | "metadata": {
183 | "kernelspec": {
184 | "display_name": "Python 3 (ipykernel)",
185 | "language": "python",
186 | "name": "python3"
187 | },
188 | "language_info": {
189 | "codemirror_mode": {
190 | "name": "ipython",
191 | "version": 3
192 | },
193 | "file_extension": ".py",
194 | "mimetype": "text/x-python",
195 | "name": "python",
196 | "nbconvert_exporter": "python",
197 | "pygments_lexer": "ipython3",
198 | "version": "3.10.11"
199 | }
200 | },
201 | "nbformat": 4,
202 | "nbformat_minor": 5
203 | }
204 |
--------------------------------------------------------------------------------
/ml_flow_binary_classification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "f4e36302",
6 | "metadata": {},
7 | "source": [
8 | "## Codebasics ML Course: ML Flow Tutorial"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 37,
14 | "id": "295e5486",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "from sklearn.datasets import make_classification\n",
20 | "from sklearn.model_selection import train_test_split\n",
21 | "from sklearn.linear_model import LogisticRegression\n",
22 | "from sklearn.ensemble import RandomForestClassifier\n",
23 | "from xgboost import XGBClassifier\n",
24 | "from sklearn.metrics import classification_report\n",
25 | "import warnings\n",
26 | "warnings.filterwarnings('ignore')"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 38,
32 | "id": "ac73cd36",
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/plain": [
38 | "(array([0, 1]), array([900, 100], dtype=int64))"
39 | ]
40 | },
41 | "execution_count": 38,
42 | "metadata": {},
43 | "output_type": "execute_result"
44 | }
45 | ],
46 | "source": [
47 | "# Step 1: Create an imbalanced binary classification dataset\n",
48 | "X, y = make_classification(n_samples=1000, n_features=10, n_informative=2, n_redundant=8, \n",
49 | " weights=[0.9, 0.1], flip_y=0, random_state=42)\n",
50 | "\n",
51 | "np.unique(y, return_counts=True)"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 39,
57 | "id": "0934ac03",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# Split the dataset into training and testing sets\n",
62 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "id": "027f7e0a",
68 | "metadata": {},
69 | "source": [
70 | "### Experiment 1: Train Logistic Regression Classifier"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 40,
76 | "id": "df52d46a",
77 | "metadata": {
78 | "scrolled": true
79 | },
80 | "outputs": [
81 | {
82 | "name": "stdout",
83 | "output_type": "stream",
84 | "text": [
85 | " precision recall f1-score support\n",
86 | "\n",
87 | " 0 0.95 0.96 0.95 270\n",
88 | " 1 0.60 0.50 0.55 30\n",
89 | "\n",
90 | " accuracy 0.92 300\n",
91 | " macro avg 0.77 0.73 0.75 300\n",
92 | "weighted avg 0.91 0.92 0.91 300\n",
93 | "\n"
94 | ]
95 | }
96 | ],
97 | "source": [
98 | "log_reg = LogisticRegression(C=1, solver='liblinear')\n",
99 | "log_reg.fit(X_train, y_train)\n",
100 | "y_pred_log_reg = log_reg.predict(X_test)\n",
101 | "print(classification_report(y_test, y_pred_log_reg))"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "id": "1468bab4",
107 | "metadata": {},
108 | "source": [
109 | "### Experiment 2: Train Random Forest Classifier"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 41,
115 | "id": "2742e30d",
116 | "metadata": {
117 | "scrolled": true
118 | },
119 | "outputs": [
120 | {
121 | "name": "stdout",
122 | "output_type": "stream",
123 | "text": [
124 | " precision recall f1-score support\n",
125 | "\n",
126 | " 0 0.96 1.00 0.98 270\n",
127 | " 1 0.95 0.67 0.78 30\n",
128 | "\n",
129 | " accuracy 0.96 300\n",
130 | " macro avg 0.96 0.83 0.88 300\n",
131 | "weighted avg 0.96 0.96 0.96 300\n",
132 | "\n"
133 | ]
134 | }
135 | ],
136 | "source": [
137 | "rf_clf = RandomForestClassifier(n_estimators=30, max_depth=3)\n",
138 | "rf_clf.fit(X_train, y_train)\n",
139 | "y_pred_rf = rf_clf.predict(X_test)\n",
140 | "print(classification_report(y_test, y_pred_rf))"
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "id": "7db18915",
146 | "metadata": {},
147 | "source": [
148 | "### Experiment 3: Train XGBoost"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 42,
154 | "id": "fa3fe3e3",
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "name": "stdout",
159 | "output_type": "stream",
160 | "text": [
161 | " precision recall f1-score support\n",
162 | "\n",
163 | " 0 0.98 1.00 0.99 270\n",
164 | " 1 0.96 0.80 0.87 30\n",
165 | "\n",
166 | " accuracy 0.98 300\n",
167 | " macro avg 0.97 0.90 0.93 300\n",
168 | "weighted avg 0.98 0.98 0.98 300\n",
169 | "\n"
170 | ]
171 | }
172 | ],
173 | "source": [
174 | "xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')\n",
175 | "xgb_clf.fit(X_train, y_train)\n",
176 | "y_pred_xgb = xgb_clf.predict(X_test)\n",
177 | "print(classification_report(y_test, y_pred_xgb))"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "id": "b70bbef1",
183 | "metadata": {},
184 | "source": [
185 | "### Experiment 4: Handle class imbalance using SMOTETomek and then Train XGBoost"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 43,
191 | "id": "5ecbe6a5",
192 | "metadata": {},
193 | "outputs": [
194 | {
195 | "data": {
196 | "text/plain": [
197 | "(array([0, 1]), array([619, 619], dtype=int64))"
198 | ]
199 | },
200 | "execution_count": 43,
201 | "metadata": {},
202 | "output_type": "execute_result"
203 | }
204 | ],
205 | "source": [
206 | "from imblearn.combine import SMOTETomek\n",
207 | "\n",
208 | "smt = SMOTETomek(random_state=42)\n",
209 | "X_train_res, y_train_res = smt.fit_resample(X_train, y_train)\n",
210 | "\n",
211 | "np.unique(y_train_res, return_counts=True)"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 44,
217 | "id": "2b931191",
218 | "metadata": {
219 | "scrolled": true
220 | },
221 | "outputs": [
222 | {
223 | "name": "stdout",
224 | "output_type": "stream",
225 | "text": [
226 | " precision recall f1-score support\n",
227 | "\n",
228 | " 0 0.98 0.98 0.98 270\n",
229 | " 1 0.81 0.83 0.82 30\n",
230 | "\n",
231 | " accuracy 0.96 300\n",
232 | " macro avg 0.89 0.91 0.90 300\n",
233 | "weighted avg 0.96 0.96 0.96 300\n",
234 | "\n"
235 | ]
236 | }
237 | ],
238 | "source": [
239 | "xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')\n",
240 | "xgb_clf.fit(X_train_res, y_train_res)\n",
241 | "y_pred_xgb = xgb_clf.predict(X_test)\n",
242 | "print(classification_report(y_test, y_pred_xgb))"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "id": "8ac546b4",
248 | "metadata": {},
249 | "source": [
250 | "### Track Experiments Using MLFlow"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 59,
256 | "id": "9fc788a3",
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "models = [\n",
261 | " (\n",
262 | " \"Logistic Regression\", \n",
263 | " LogisticRegression(C=1, solver='liblinear'), \n",
264 | " (X_train, y_train),\n",
265 | " (X_test, y_test)\n",
266 | " ),\n",
267 | " (\n",
268 | " \"Random Forest\", \n",
269 | " RandomForestClassifier(n_estimators=30, max_depth=3), \n",
270 | " (X_train, y_train),\n",
271 | " (X_test, y_test)\n",
272 | " ),\n",
273 | " (\n",
274 | " \"XGBClassifier\",\n",
275 | " XGBClassifier(use_label_encoder=False, eval_metric='logloss'), \n",
276 | " (X_train, y_train),\n",
277 | " (X_test, y_test)\n",
278 | " ),\n",
279 | " (\n",
280 | " \"XGBClassifier With SMOTE\",\n",
281 | " XGBClassifier(use_label_encoder=False, eval_metric='logloss'), \n",
282 | " (X_train_res, y_train_res),\n",
283 | " (X_test, y_test)\n",
284 | " )\n",
285 | "]"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 60,
291 | "id": "1a827a88",
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "# Fit every candidate and collect its classification report, in the same order as `models`\n",
296 | "reports = []\n",
297 | "\n",
298 | "# Unpack into loop-local names: rebinding X_train/y_train/X_test/y_test here would leave the\n",
299 | "# notebook-level split pointing at the resampled data once the loop finishes\n",
300 | "for model_name, model, (X_fit, y_fit), (X_eval, y_eval) in models:\n",
301 | "    model.fit(X_fit, y_fit)\n",
302 | "    y_pred = model.predict(X_eval)\n",
303 | "    report = classification_report(y_eval, y_pred, output_dict=True)\n",
304 | "    reports.append(report)"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": 61,
312 | "id": "29ca91b0",
313 | "metadata": {},
314 | "outputs": [],
315 | "source": [
316 | "import mlflow\n",
317 | "import mlflow.sklearn\n",
318 | "import mlflow.xgboost"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 63,
324 | "id": "420f2511",
325 | "metadata": {
326 | "scrolled": false
327 | },
328 | "outputs": [],
329 | "source": [
330 | "# Initialize MLflow: select the tracking server first, because set_experiment\n",
331 | "# resolves the experiment against the current tracking URI\n",
332 | "mlflow.set_tracking_uri(\"http://localhost:5000\")\n",
333 | "mlflow.set_experiment(\"Anomaly Detection\")\n",
334 | "\n",
335 | "# reports[i] belongs to models[i], so walk the two lists together; one run per model\n",
336 | "for (model_name, model, *_), report in zip(models, reports):\n",
337 | "    with mlflow.start_run(run_name=model_name):\n",
338 | "        mlflow.log_param(\"model\", model_name)\n",
339 | "        mlflow.log_metric('accuracy', report['accuracy'])\n",
340 | "        mlflow.log_metric('recall_class_1', report['1']['recall'])\n",
341 | "        mlflow.log_metric('recall_class_0', report['0']['recall'])\n",
342 | "        mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])\n",
343 | "\n",
344 | "        # XGBoost models are logged with the xgboost flavor, everything else with sklearn\n",
345 | "        if \"XGB\" in model_name:\n",
346 | "            mlflow.xgboost.log_model(model, \"model\")\n",
347 | "        else:\n",
348 | "            mlflow.sklearn.log_model(model, \"model\")"
350 | ]
351 | }
352 | ],
353 | "metadata": {
354 | "kernelspec": {
355 | "display_name": "Python 3 (ipykernel)",
356 | "language": "python",
357 | "name": "python3"
358 | },
359 | "language_info": {
360 | "codemirror_mode": {
361 | "name": "ipython",
362 | "version": 3
363 | },
364 | "file_extension": ".py",
365 | "mimetype": "text/x-python",
366 | "name": "python",
367 | "nbconvert_exporter": "python",
368 | "pygments_lexer": "ipython3",
369 | "version": "3.10.11"
370 | }
371 | },
372 | "nbformat": 4,
373 | "nbformat_minor": 5
374 | }
375 |
--------------------------------------------------------------------------------
/ml_flow_dagshub.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "b4f9d400",
6 | "metadata": {},
7 | "source": [
8 | "## Codebasics ML Course: ML Flow Dagshub Tutorial"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "id": "5eb3c2b2",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "from sklearn.datasets import make_classification\n",
20 | "from sklearn.model_selection import train_test_split\n",
21 | "from sklearn.linear_model import LogisticRegression\n",
22 | "from sklearn.ensemble import RandomForestClassifier\n",
23 | "from xgboost import XGBClassifier\n",
24 | "from sklearn.metrics import classification_report\n",
25 | "import warnings\n",
26 | "warnings.filterwarnings('ignore')"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "id": "0878fc4c",
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/plain": [
38 | "(array([0, 1]), array([900, 100], dtype=int64))"
39 | ]
40 | },
41 | "execution_count": 2,
42 | "metadata": {},
43 | "output_type": "execute_result"
44 | }
45 | ],
46 | "source": [
47 | "# Step 1: Create an imbalanced binary classification dataset\n",
48 | "X, y = make_classification(n_samples=1000, n_features=10, n_informative=2, n_redundant=8, \n",
49 | " weights=[0.9, 0.1], flip_y=0, random_state=42)\n",
50 | "\n",
51 | "np.unique(y, return_counts=True)"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 3,
57 | "id": "2a6b80dd",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# Split the dataset into training and testing sets\n",
62 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "id": "f3a6191b",
68 | "metadata": {},
69 | "source": [
70 | "#### Handle class imbalance"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 4,
76 | "id": "3190fd47",
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "data": {
81 | "text/plain": [
82 | "(array([0, 1]), array([619, 619], dtype=int64))"
83 | ]
84 | },
85 | "execution_count": 4,
86 | "metadata": {},
87 | "output_type": "execute_result"
88 | }
89 | ],
90 | "source": [
91 | "from imblearn.combine import SMOTETomek\n",
92 | "\n",
93 | "smt = SMOTETomek(random_state=42)\n",
94 | "X_train_res, y_train_res = smt.fit_resample(X_train, y_train)\n",
95 | "np.unique(y_train_res, return_counts=True)"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "id": "75b6f715",
101 | "metadata": {},
102 | "source": [
103 | "### Track Experiments"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 5,
109 | "id": "1eb49554",
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "models = [\n",
114 | " (\n",
115 | " \"Logistic Regression\", \n",
116 | " {\"C\": 1, \"solver\": 'liblinear'},\n",
117 | " LogisticRegression(), \n",
118 | " (X_train, y_train),\n",
119 | " (X_test, y_test)\n",
120 | " ),\n",
121 | " (\n",
122 | " \"Random Forest\", \n",
123 | " {\"n_estimators\": 30, \"max_depth\": 3},\n",
124 | " RandomForestClassifier(), \n",
125 | " (X_train, y_train),\n",
126 | " (X_test, y_test)\n",
127 | " ),\n",
128 | " (\n",
129 | " \"XGBClassifier\",\n",
130 | " {\"use_label_encoder\": False, \"eval_metric\": 'logloss'},\n",
131 | " XGBClassifier(), \n",
132 | " (X_train, y_train),\n",
133 | " (X_test, y_test)\n",
134 | " ),\n",
135 | " (\n",
136 | " \"XGBClassifier With SMOTE\",\n",
137 | " {\"use_label_encoder\": False, \"eval_metric\": 'logloss'},\n",
138 | " XGBClassifier(), \n",
139 | " (X_train_res, y_train_res),\n",
140 | " (X_test, y_test)\n",
141 | " )\n",
142 | "]"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 6,
148 | "id": "a91ad5ae",
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "# Configure and fit every candidate, collecting its classification report in the same order as `models`\n",
153 | "reports = []\n",
154 | "\n",
155 | "# Unpack into loop-local names: rebinding X_train/y_train/X_test/y_test here would leave the\n",
156 | "# notebook-level split pointing at the resampled data once the loop finishes\n",
157 | "for model_name, params, model, (X_fit, y_fit), (X_eval, y_eval) in models:\n",
158 | "    model.set_params(**params)\n",
159 | "    model.fit(X_fit, y_fit)\n",
160 | "    y_pred = model.predict(X_eval)\n",
161 | "    report = classification_report(y_eval, y_pred, output_dict=True)\n",
162 | "    reports.append(report)"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 7,
170 | "id": "08741b0a",
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "import mlflow\n",
175 | "import mlflow.sklearn\n",
176 | "import mlflow.xgboost"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 8,
182 | "id": "503f6e93",
183 | "metadata": {},
184 | "outputs": [
185 | {
186 | "data": {
187 | "text/html": [
188 | "Accessing as dhavalsays\n",
189 | "
\n"
190 | ],
191 | "text/plain": [
192 | "Accessing as dhavalsays\n"
193 | ]
194 | },
195 | "metadata": {},
196 | "output_type": "display_data"
197 | },
198 | {
199 | "data": {
200 | "text/html": [
201 | "Initialized MLflow to track repo \"learnpythonlanguage/mlflow_dagshub_demo\"\n",
202 | "
\n"
203 | ],
204 | "text/plain": [
205 | "Initialized MLflow to track repo \u001b[32m\"learnpythonlanguage/mlflow_dagshub_demo\"\u001b[0m\n"
206 | ]
207 | },
208 | "metadata": {},
209 | "output_type": "display_data"
210 | },
211 | {
212 | "data": {
213 | "text/html": [
214 | "Repository learnpythonlanguage/mlflow_dagshub_demo initialized!\n",
215 | "
\n"
216 | ],
217 | "text/plain": [
218 | "Repository learnpythonlanguage/mlflow_dagshub_demo initialized!\n"
219 | ]
220 | },
221 | "metadata": {},
222 | "output_type": "display_data"
223 | }
224 | ],
225 | "source": [
226 | "# dagshub setup\n",
227 | "\n",
228 | "import dagshub\n",
229 | "dagshub.init(repo_owner='learnpythonlanguage', repo_name='mlflow_dagshub_demo', mlflow=True)"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": 12,
235 | "id": "cfcc9a19",
236 | "metadata": {
237 | "scrolled": false
238 | },
239 | "outputs": [
240 | {
241 | "name": "stderr",
242 | "output_type": "stream",
243 | "text": [
244 | "2024/08/01 11:50:40 INFO mlflow.tracking.fluent: Experiment with name 'Anomaly Detection' does not exist. Creating a new experiment.\n"
245 | ]
246 | }
247 | ],
248 | "source": [
249 | "# You should not need the following 4 lines if you are starting fresh and have no previously stored dagshub credentials on your computer\n",
250 | "import os\n",
251 | "os.environ['MLFLOW_TRACKING_USERNAME'] = 'your user name' # 'learnpythonlanguage'\n",
252 | "os.environ['MLFLOW_TRACKING_PASSWORD'] = 'your password' # \n",
253 | "os.environ['MLFLOW_TRACKING_URI'] = 'your dagshub unique uri' # https://dagshub.com/learnpythonlanguage/mlflow_dagshub_demo.mlflow\n",
254 | "\n",
255 | "# Initialize MLflow\n",
256 | "mlflow.set_experiment(\"Anomaly Detection\")\n",
257 | "# mlflow.set_tracking_uri(\"http://localhost:5000\")\n",
258 | "\n",
259 | "for i, element in enumerate(models):\n",
260 | " model_name = element[0]\n",
261 | " params = element[1]\n",
262 | " model = element[2]\n",
263 | " report = reports[i]\n",
264 | " \n",
265 | " with mlflow.start_run(run_name=model_name): \n",
266 | " mlflow.log_params(params)\n",
267 | " mlflow.log_metrics({\n",
268 | " 'accuracy': report['accuracy'],\n",
269 | " 'recall_class_1': report['1']['recall'],\n",
270 | " 'recall_class_0': report['0']['recall'],\n",
271 | " 'f1_score_macro': report['macro avg']['f1-score']\n",
272 | " }) \n",
273 | " \n",
274 | " if \"XGB\" in model_name:\n",
275 | " mlflow.xgboost.log_model(model, \"model\")\n",
276 | " else:\n",
277 | " mlflow.sklearn.log_model(model, \"model\") "
278 | ]
279 | }
280 | ],
281 | "metadata": {
282 | "kernelspec": {
283 | "display_name": "Python 3 (ipykernel)",
284 | "language": "python",
285 | "name": "python3"
286 | },
287 | "language_info": {
288 | "codemirror_mode": {
289 | "name": "ipython",
290 | "version": 3
291 | },
292 | "file_extension": ".py",
293 | "mimetype": "text/x-python",
294 | "name": "python",
295 | "nbconvert_exporter": "python",
296 | "pygments_lexer": "ipython3",
297 | "version": "3.10.11"
298 | }
299 | },
300 | "nbformat": 4,
301 | "nbformat_minor": 5
302 | }
303 |
--------------------------------------------------------------------------------
/ml_flow_model_management.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "20086d7c",
6 | "metadata": {},
7 | "source": [
8 | "## Codebasics ML Course: ML Flow Tutorial"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 29,
14 | "id": "2134f63a",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "from sklearn.datasets import make_classification\n",
20 | "from sklearn.model_selection import train_test_split\n",
21 | "from sklearn.linear_model import LogisticRegression\n",
22 | "from sklearn.ensemble import RandomForestClassifier\n",
23 | "from xgboost import XGBClassifier\n",
24 | "from sklearn.metrics import classification_report\n",
25 | "import warnings\n",
26 | "warnings.filterwarnings('ignore')"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 30,
32 | "id": "8a467445",
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/plain": [
38 | "(array([0, 1]), array([900, 100], dtype=int64))"
39 | ]
40 | },
41 | "execution_count": 30,
42 | "metadata": {},
43 | "output_type": "execute_result"
44 | }
45 | ],
46 | "source": [
47 | "# Step 1: Create an imbalanced binary classification dataset\n",
48 | "X, y = make_classification(n_samples=1000, n_features=10, n_informative=2, n_redundant=8, \n",
49 | " weights=[0.9, 0.1], flip_y=0, random_state=42)\n",
50 | "\n",
51 | "np.unique(y, return_counts=True)"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 31,
57 | "id": "7fc473ad",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# Split the dataset into training and testing sets\n",
62 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "id": "ce174acd",
68 | "metadata": {},
69 | "source": [
70 | "#### Handle class imbalance"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 32,
76 | "id": "0c6d768a",
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "data": {
81 | "text/plain": [
82 | "(array([0, 1]), array([619, 619], dtype=int64))"
83 | ]
84 | },
85 | "execution_count": 32,
86 | "metadata": {},
87 | "output_type": "execute_result"
88 | }
89 | ],
90 | "source": [
91 | "from imblearn.combine import SMOTETomek\n",
92 | "\n",
93 | "# Resample only the training split; X_test / y_test are left untouched\n",
94 | "smt = SMOTETomek(random_state=42)\n",
95 | "X_train_res, y_train_res = smt.fit_resample(X_train, y_train)\n",
96 | "# Show the class counts of the resampled labels (was y_train_smt, which is never defined)\n",
97 | "np.unique(y_train_res, return_counts=True)"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "id": "f04a52b2",
101 | "metadata": {},
102 | "source": [
103 | "### Track Experiments"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 33,
109 | "id": "82fdaf1c",
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "models = [\n",
114 | " (\n",
115 | " \"Logistic Regression\", \n",
116 | " {\"C\": 1, \"solver\": 'liblinear'},\n",
117 | " LogisticRegression(), \n",
118 | " (X_train, y_train),\n",
119 | " (X_test, y_test)\n",
120 | " ),\n",
121 | " (\n",
122 | " \"Random Forest\", \n",
123 | " {\"n_estimators\": 30, \"max_depth\": 3},\n",
124 | " RandomForestClassifier(), \n",
125 | " (X_train, y_train),\n",
126 | " (X_test, y_test)\n",
127 | " ),\n",
128 | " (\n",
129 | " \"XGBClassifier\",\n",
130 | " {\"use_label_encoder\": False, \"eval_metric\": 'logloss'},\n",
131 | " XGBClassifier(), \n",
132 | " (X_train, y_train),\n",
133 | " (X_test, y_test)\n",
134 | " ),\n",
135 | " (\n",
136 | " \"XGBClassifier With SMOTE\",\n",
137 | " {\"use_label_encoder\": False, \"eval_metric\": 'logloss'},\n",
138 | " XGBClassifier(), \n",
139 | " (X_train_res, y_train_res),\n",
140 | " (X_test, y_test)\n",
141 | " )\n",
142 | "]"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 34,
148 | "id": "13a992c3",
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "# Fit every configured model and collect one classification report per model.\n",
153 | "reports = []\n",
154 | "\n",
155 | "for model_name, params, model, train_set, test_set in models:\n",
156 | "    # Loop-local names: do not rebind the notebook-level X_train/y_train/X_test/y_test.\n",
157 | "    X_fit, y_fit = train_set\n",
158 | "    X_eval, y_eval = test_set\n",
159 | "\n",
160 | "    model.set_params(**params)\n",
161 | "    model.fit(X_fit, y_fit)\n",
162 | "    y_pred = model.predict(X_eval)\n",
163 | "    # output_dict=True yields a nested dict so metrics can be looked up by key.\n",
164 | "    reports.append(classification_report(y_eval, y_pred, output_dict=True))"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 35,
170 | "id": "d9301bc0",
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "import mlflow\n",
175 | "import mlflow.sklearn\n",
176 | "import mlflow.xgboost"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 36,
182 | "id": "9ad9cf4d",
183 | "metadata": {
184 | "scrolled": false
185 | },
186 | "outputs": [],
187 | "source": [
188 | "# Initialize MLflow: set the tracking URI first so that set_experiment resolves\n",
189 | "# (or creates) the experiment on that server, not in the default local store.\n",
190 | "mlflow.set_tracking_uri(\"http://localhost:5000\")\n",
191 | "mlflow.set_experiment(\"Anomaly Detection\")\n",
192 | "\n",
193 | "for i, element in enumerate(models):\n",
194 | "    # Each element is (name, params, estimator, train_set, test_set).\n",
195 | "    model_name, params, model = element[:3]\n",
196 | "    report = reports[i]  # reports[i] was computed from models[i]\n",
197 | "\n",
198 | "    with mlflow.start_run(run_name=model_name):\n",
199 | "        mlflow.log_params(params)\n",
200 | "        mlflow.log_metrics({\n",
201 | "            'accuracy': report['accuracy'],\n",
202 | "            'recall_class_1': report['1']['recall'],\n",
203 | "            'recall_class_0': report['0']['recall'],\n",
204 | "            'f1_score_macro': report['macro avg']['f1-score']\n",
205 | "        })\n",
206 | "\n",
207 | "        if \"XGB\" in model_name:\n",
208 | "            mlflow.xgboost.log_model(model, \"model\")\n",
209 | "        else:\n",
210 | "            mlflow.sklearn.log_model(model, \"model\")"
211 | ]
212 | },
213 | {
214 | "cell_type": "markdown",
215 | "id": "7446ae8a",
216 | "metadata": {},
217 | "source": [
218 | "### Register the Model"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": null,
224 | "id": "51c0013a",
225 | "metadata": {
226 | "scrolled": false
227 | },
228 | "outputs": [],
229 | "source": [
230 | "model_name = 'XGB-Smote'\n",
231 | "run_id = input('Please type RunID').strip()  # tolerate stray whitespace from copy/paste\n",
232 | "model_uri = f'runs:/{run_id}/model'  # the models were logged under the 'model' artifact path\n",
233 | "\n",
234 | "with mlflow.start_run(run_id=run_id):\n",
235 | "    mlflow.register_model(model_uri=model_uri, name=model_name)"
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "id": "1b074a08",
241 | "metadata": {},
242 | "source": [
243 | "### Load the Model"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 41,
249 | "id": "a40fef12",
250 | "metadata": {
251 | "scrolled": true
252 | },
253 | "outputs": [
254 | {
255 | "name": "stderr",
256 | "output_type": "stream",
257 | "text": [
258 | "Downloading artifacts: 100%|██████████| 5/5 [00:02<00:00, 2.33it/s]\n"
259 | ]
260 | },
261 | {
262 | "data": {
263 | "text/plain": [
264 | "array([0, 0, 0, 0])"
265 | ]
266 | },
267 | "execution_count": 41,
268 | "metadata": {},
269 | "output_type": "execute_result"
270 | }
271 | ],
272 | "source": [
273 | "model_version = 1\n",
274 | "model_uri = f\"models:/{model_name}/{model_version}\"\n",
275 | "\n",
276 | "loaded_model = mlflow.xgboost.load_model(model_uri)\n",
277 | "y_pred = loaded_model.predict(X_test)\n",
278 | "y_pred[:4]"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "id": "5f8d2893",
284 | "metadata": {},
285 | "source": [
286 | "### Transition the Model to Production"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 42,
292 | "id": "9c8ac1e3",
293 | "metadata": {
294 | "scrolled": true
295 | },
296 | "outputs": [
297 | {
298 | "data": {
299 | "text/plain": [
300 | ""
301 | ]
302 | },
303 | "execution_count": 42,
304 | "metadata": {},
305 | "output_type": "execute_result"
306 | }
307 | ],
308 | "source": [
309 | "current_model_uri = f\"models:/{model_name}@challenger\"\n",
310 | "production_model_name = \"anomaly-detection-prod\"\n",
311 | "\n",
312 | "client = mlflow.MlflowClient()\n",
313 | "client.copy_model_version(src_model_uri=current_model_uri, dst_name=production_model_name)"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 43,
319 | "id": "4297a2fe",
320 | "metadata": {},
321 | "outputs": [
322 | {
323 | "name": "stderr",
324 | "output_type": "stream",
325 | "text": [
326 | "Downloading artifacts: 100%|██████████| 5/5 [00:02<00:00, 2.33it/s]\n"
327 | ]
328 | },
329 | {
330 | "data": {
331 | "text/plain": [
332 | "array([0, 0, 0, 0])"
333 | ]
334 | },
335 | "execution_count": 43,
336 | "metadata": {},
337 | "output_type": "execute_result"
338 | }
339 | ],
340 | "source": [
341 | "# Resolve the production model via its 'champion' alias (assumes the alias was assigned in the registry).\n",
342 | "prod_model_uri = f\"models:/{production_model_name}@champion\"\n",
343 | "\n",
344 | "loaded_model = mlflow.xgboost.load_model(prod_model_uri)\n",
345 | "y_pred = loaded_model.predict(X_test)\n",
346 | "y_pred[:4]"
347 | ]
348 | },
349 | {
350 | "cell_type": "markdown",
351 | "id": "ca565a87",
352 | "metadata": {},
353 | "source": [
354 | "Please refer to the following page to learn more about the MLflow Model Registry:\n",
355 | "\n",
356 | "https://mlflow.org/docs/latest/model-registry.html#model-registry-workflows"
357 | ]
358 | }
359 | ],
360 | "metadata": {
361 | "kernelspec": {
362 | "display_name": "Python 3 (ipykernel)",
363 | "language": "python",
364 | "name": "python3"
365 | },
366 | "language_info": {
367 | "codemirror_mode": {
368 | "name": "ipython",
369 | "version": 3
370 | },
371 | "file_extension": ".py",
372 | "mimetype": "text/x-python",
373 | "name": "python",
374 | "nbconvert_exporter": "python",
375 | "pygments_lexer": "ipython3",
376 | "version": "3.10.11"
377 | }
378 | },
379 | "nbformat": 4,
380 | "nbformat_minor": 5
381 | }
382 |
--------------------------------------------------------------------------------