├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── ds-quick-tips.iml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── 001_reverse_ohe
│   ├── .ipynb_checkpoints
│   │   └── reverse_ohe-checkpoint.ipynb
│   └── reverse_ohe.ipynb
├── 002_flask_cronjob
│   ├── README.md
│   └── app.py
├── 003_sklearn_pipelines
│   ├── .ipynb_checkpoints
│   │   └── titanic_pipeline-checkpoint.ipynb
│   ├── model
│   │   └── rfc_pipeline.pkl
│   └── titanic_pipeline.ipynb
├── 004_pipeline_custom_transformers
│   ├── .ipynb_checkpoints
│   │   └── titanic_custom_pipeline-checkpoint.ipynb
│   ├── model
│   │   └── rfc_pipeline.pkl
│   └── titanic_custom_pipeline.ipynb
├── 005_two_ways_to_ohe
│   ├── .ipynb_checkpoints
│   │   └── two_ways_to_ohe-checkpoint.ipynb
│   ├── ce_ohe.pkl
│   ├── sklearn_ohe.pkl
│   └── two_ways_to_ohe.ipynb
├── 006_shap_lime
│   ├── .ipynb_checkpoints
│   │   └── titanic_shap_lime-checkpoint.ipynb
│   └── titanic_shap_lime.ipynb
├── 007_performance_testing_locust
│   ├── __pycache__
│   │   └── locustfile.cpython-37.pyc
│   ├── api
│   │   ├── __pycache__
│   │   │   ├── api.cpython-37.pyc
│   │   │   └── helpers.cpython-37.pyc
│   │   ├── api.py
│   │   ├── helpers.py
│   │   └── run.sh
│   ├── locustfile.py
│   ├── model
│   │   └── rfc_pipeline.pkl
│   └── test_data
│       ├── test_1.json
│       ├── test_2.json
│       └── tests.sh
├── 008_mlflow_getting_started
│   ├── .ipynb_checkpoints
│   │   └── mlflow_wine_notebook-checkpoint.ipynb
│   ├── mlflow-existing-model.py
│   ├── mlflow-wine.py
│   ├── mlflow_wine_notebook.ipynb
│   ├── mlruns
│   │   └── 0
│   │       ├── 09fa7bc156ff4d59b4b00b8fdbe84728
│   │       │   ├── artifacts
│   │       │   │   └── model
│   │       │   │       ├── MLmodel
│   │       │   │       ├── conda.yaml
│   │       │   │       └── model.pkl
│   │       │   ├── meta.yaml
│   │       │   ├── metrics
│   │       │   │   ├── abs_error
│   │       │   │   ├── r2
│   │       │   │   └── rmse
│   │       │   ├── params
│   │       │   │   ├── alpha
│   │       │   │   └── l1_ratio
│   │       │   └── tags
│   │       │       ├── mlflow.log-model.history
│   │       │       ├── mlflow.source.name
│   │       │       ├── mlflow.source.type
│   │       │       └── mlflow.user
│   │       ├── 27398db7fb544a269a0c85ec637bbab9
│   │       │   ├── artifacts
│   │       │   │   └── model
│   │       │   │       ├── MLmodel
│   │       │   │       ├── conda.yaml
│   │       │   │       └── model.pkl
│   │       │   ├── meta.yaml
│   │       │   └── tags
│   │       │       ├── mlflow.log-model.history
│   │       │       ├── mlflow.source.git.commit
│   │       │       ├── mlflow.source.name
│   │       │       ├── mlflow.source.type
│   │       │       └── mlflow.user
│   │       ├── 5a2bf3f0cb504b40ac9cd9a70af32ac6
│   │       │   ├── artifacts
│   │       │   │   └── model
│   │       │   │       ├── MLmodel
│   │       │   │       ├── conda.yaml
│   │       │   │       └── model.pkl
│   │       │   ├── meta.yaml
│   │       │   ├── metrics
│   │       │   │   ├── abs_error
│   │       │   │   ├── r2
│   │       │   │   └── rmse
│   │       │   ├── params
│   │       │   │   ├── alpha
│   │       │   │   └── l1_ratio
│   │       │   └── tags
│   │       │       ├── mlflow.log-model.history
│   │       │       ├── mlflow.source.name
│   │       │       ├── mlflow.source.type
│   │       │       └── mlflow.user
│   │       ├── 93cfbd77d77f4e308297d9a47ea3abd6
│   │       │   ├── artifacts
│   │       │   │   └── model
│   │       │   │       ├── MLmodel
│   │       │   │       ├── conda.yaml
│   │       │   │       └── model.pkl
│   │       │   ├── meta.yaml
│   │       │   ├── metrics
│   │       │   │   ├── abs_error
│   │       │   │   ├── r2
│   │       │   │   └── rmse
│   │       │   ├── params
│   │       │   │   ├── alpha
│   │       │   │   └── l1_ratio
│   │       │   └── tags
│   │       │       ├── mlflow.log-model.history
│   │       │       ├── mlflow.source.git.commit
│   │       │       ├── mlflow.source.name
│   │       │       ├── mlflow.source.type
│   │       │       └── mlflow.user
│   │       ├── befa6150910e4724b1248ee939971dc2
│   │       │   ├── artifacts
│   │       │   │   └── model
│   │       │   │       ├── MLmodel
│   │       │   │       ├── conda.yaml
│   │       │   │       └── model.pkl
│   │       │   ├── meta.yaml
│   │       │   ├── metrics
│   │       │   │   ├── abs_error
│   │       │   │   ├── r2
│   │       │   │   └── rmse
│   │       │   ├── params
│   │       │   │   ├── alpha
│   │       │   │   └── l1_ratio
│   │       │   └── tags
│   │       │       ├── mlflow.log-model.history
│   │       │       ├── mlflow.source.git.commit
│   │       │       ├── mlflow.source.name
│   │       │       ├── mlflow.source.type
│   │       │       └── mlflow.user
│   │       └── meta.yaml
│   └── model
│       └── model.pkl
├── 009_mlflow_tracking_server
│   ├── Dockerfile
│   └── k8s
│       ├── mlflow_deployment.yaml
│       ├── mlflow_minio.yaml
│       └── mlflow_postgres.yaml
├── 010_mlflow_logging_to_server
│   ├── .ipynb_checkpoints
│   │   └── Untitled-checkpoint.ipynb
│   └── mlflow-wine.py
├── 011_mlflow_interacting_with_client
│   ├── .ipynb_checkpoints
│   │   └── MLflow_client_interaction-checkpoint.ipynb
│   ├── MLflow_client_interaction.ipynb
│   └── mlruns
│       └── 0
│           └── meta.yaml
├── 012_dockerizing_fastapi
│   ├── Dockerfile
│   ├── container
│   │   ├── api.py
│   │   ├── start_api.sh
│   │   └── train.py
│   ├── dependencies
│   │   └── requirements.txt
│   ├── k8s
│   │   └── deployment.yaml
│   ├── model
│   │   └── iris_model.pkl
│   ├── notebooks
│   │   ├── .ipynb_checkpoints
│   │   │   └── iris_model_creation-checkpoint.ipynb
│   │   └── iris_model_creation.ipynb
│   └── tests
│       ├── test_bad_predict.sh
│       ├── test_basic_predict.sh
│       ├── test_json
│       │   ├── bad_data.json
│       │   └── test_data.json
│       └── test_predict.sh
├── 013_fastapi_tests_scans
│   ├── Dockerfile
│   ├── container
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   ├── __init__.cpython-38.pyc
│   │   │   ├── api.cpython-37.pyc
│   │   │   └── api.cpython-38.pyc
│   │   ├── api.py
│   │   ├── start_api.sh
│   │   └── train.py
│   ├── dependencies
│   │   └── requirements.txt
│   ├── models
│   │   └── iris_model.pkl
│   └── tests
│       ├── .pytest_cache
│       │   └── v
│       │       └── cache
│       │           ├── lastfailed
│       │           ├── nodeids
│       │           └── stepwise
│       ├── __init__.py
│       ├── __pycache__
│       │   ├── __init__.cpython-37.pyc
│       │   ├── __init__.cpython-38.pyc
│       │   └── test_api.cpython-37-pytest-6.2.4.pyc
│       ├── curl_scripts
│       │   └── test_predict.sh
│       ├── performance_testing
│       │   ├── __pycache__
│       │   │   └── locustfile.cpython-38.pyc
│       │   └── locustfile.py
│       ├── reports
│       │   ├── container_scan_results.txt
│       │   ├── dependency_scan_report.txt
│       │   ├── linter_report.txt
│       │   ├── performance_test_exceptions.csv
│       │   ├── performance_test_failures.csv
│       │   ├── performance_test_stats.csv
│       │   ├── performance_test_stats_history.csv
│       │   ├── static_scan_report.txt
│       │   └── unit_test_report.txt
│       ├── run_all_tests.sh
│       ├── run_container_scan.sh
│       ├── run_dependency_scan.sh
│       ├── run_linter.sh
│       ├── run_perf_test.sh
│       ├── run_static_scan.sh
│       ├── run_unit_tests.sh
│       ├── test_json
│       │   ├── bad_data.json
│       │   └── test_data.json
│       └── unit_testing
│           ├── __init__.py
│           ├── __pycache__
│           │   ├── __init__.cpython-38.pyc
│           │   ├── test_api.cpython-38-pytest-6.2.3.pyc
│           │   └── test_api.cpython-38-pytest-6.2.4.pyc
│           └── test_api.py
├── 014_kfolds_validation
│   └── notebooks
│       ├── .ipynb_checkpoints
│       │   └── kfolds-validation-checkpoint.ipynb
│       └── kfolds-validation.ipynb
├── 015_synthesizing_test_data
│   └── notebooks
│       └── synthesizing_test_data.ipynb
├── 016_intro_to_polars
│   └── intro_to_polars.ipynb
├── README.md
└── data
    ├── titanic
    │   ├── test.csv
    │   └── train.csv
    └── wine
        └── train.csv

-------------------------------------------------------------------------------- /.idea/.gitignore: --------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
-------------------------------------------------------------------------------- /.idea/ds-quick-tips.iml: --------------------------------------------------------------------------------
(XML content stripped during extraction; nothing recoverable)
-------------------------------------------------------------------------------- /.idea/misc.xml: --------------------------------------------------------------------------------
(XML content stripped during extraction; nothing recoverable)
-------------------------------------------------------------------------------- /.idea/modules.xml: --------------------------------------------------------------------------------
(XML content stripped during extraction; nothing recoverable)
-------------------------------------------------------------------------------- /.idea/vcs.xml: --------------------------------------------------------------------------------
(XML content stripped during extraction; nothing recoverable)
-------------------------------------------------------------------------------- /002_flask_cronjob/README.md: --------------------------------------------------------------------------------
 1 | # Data Science Quick Tip #002: Running a Cronjob from Within a Flask API!
 2 | This repo contains the code supporting the blog post discussing how to run a cronjob within a Flask API. In this README, we'll quickly touch on the pieces required to run this code as well as how to invoke the script.
 3 | 
 4 | ## Required Installations
 5 | If they are not installed already, you will need to pip install the following packages:
 6 | - ```APScheduler==3.6.3```
 7 | - ```flask==1.1.1```
 8 | 
 9 | ## Script Invocation
10 | You can run this script very simply with the following command in your terminal:
11 | 
12 | ```python app.py```
13 | 
14 | This will start up the Flask API, at which point you can sit back and watch the statement ```Hello world!``` get printed at the beginning of every minute.
15 | 
16 | If you would also like to invoke the test endpoint I created, simply open another terminal window and issue the following command:
17 | 
18 | ```curl 0.0.0.0:5000/test```
-------------------------------------------------------------------------------- /002_flask_cronjob/app.py: --------------------------------------------------------------------------------
 1 | from flask import Flask, request, json, Response, jsonify, make_response
 2 | from apscheduler.schedulers.background import BackgroundScheduler
 3 | 
 4 | # Instantiating the Flask application
 5 | application = Flask(__name__)
 6 | 
 7 | # Instantiating the scheduler for the cronjob
 8 | sched = BackgroundScheduler(daemon = True)
 9 | sched.start()
10 | 
11 | # Defining a cronjob function to run alongside the Flask app
12 | @sched.scheduled_job(trigger = 'cron', minute = '*')
13 | def print_hello():
14 |     print('Hello world!')
15 | 
16 | # Defining a single API endpoint
17 | @application.route('/test')
18 | def test_func():
19 |     js = json.dumps({'Test': 'Successful!'})
20 |     # Returning the already-serialized string directly; wrapping it in a second
21 |     # json.dumps call would double-encode the response
22 |     return Response(js, status = 200, mimetype = 'application/json')
23 | 
24 | if __name__ == '__main__':
25 |     # Starting Flask application
26 |     application.run(host = '0.0.0.0')
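The `trigger = 'cron'` decorator used in app.py above accepts the full set of cron fields, not just `minute`. Here is a minimal standalone sketch of the same `scheduled_job` pattern with a richer schedule; the every-five-minutes, business-hours values are purely illustrative and not something this repo uses:

from time import sleep
from apscheduler.schedulers.background import BackgroundScheduler

sched = BackgroundScheduler(daemon = True)
sched.start()

# Fires at minutes 0, 5, 10, ... but only between 09:00 and 17:59
@sched.scheduled_job(trigger = 'cron', minute = '*/5', hour = '9-17')
def scheduled_ping():
    print('Scheduled job fired!')

if __name__ == '__main__':
    # No Flask app here, so we keep the main thread alive ourselves
    while True:
        sleep(60)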
-------------------------------------------------------------------------------- /003_sklearn_pipelines/.ipynb_checkpoints/titanic_pipeline-checkpoint.ipynb: --------------------------------------------------------------------------------
(verbatim duplicate of /003_sklearn_pipelines/titanic_pipeline.ipynb below; duplicate content omitted)
"file_extension": ".py", 375 | "mimetype": "text/x-python", 376 | "name": "python", 377 | "nbconvert_exporter": "python", 378 | "pygments_lexer": "ipython3", 379 | "version": "3.7.6" 380 | } 381 | }, 382 | "nbformat": 4, 383 | "nbformat_minor": 2 384 | } 385 | -------------------------------------------------------------------------------- /003_sklearn_pipelines/model/rfc_pipeline.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/003_sklearn_pipelines/model/rfc_pipeline.pkl -------------------------------------------------------------------------------- /003_sklearn_pipelines/titanic_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Science Quick Tip #003: Using Scikit-Learn Pipelines!\n", 8 | "In this notebook, I'll show you how to create a pipeline that produces a single binary file in the end for clean inference purposes. The goal is NOT to create a necessarily accurate model here, so don't worry if your accuracy scores are bad. This project will only focus on using Scikit-Learn's default transformers. In the next quick tip post, I'll teach you how to create custom transfomers and also make use of those within this same pipeline format." 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "## Project Setup\n", 16 | "Let's go ahead and import the libraries we'll be using as well as the datasets." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# Importing the libraries we'll be using for this project\n", 26 | "import pandas as pd\n", 27 | "import joblib\n", 28 | "\n", 29 | "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", 30 | "from sklearn.compose import ColumnTransformer\n", 31 | "from sklearn.pipeline import Pipeline\n", 32 | "from sklearn.ensemble import RandomForestClassifier\n", 33 | "from sklearn.model_selection import train_test_split\n", 34 | "from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Importing the training dataset\n", 44 | "raw_train = pd.read_csv('data/train.csv')" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# Splitting the training data into appropriate training and validation sets\n", 54 | "X = raw_train.drop(columns = ['Survived'])\n", 55 | "y = raw_train[['Survived']]\n", 56 | "\n", 57 | "X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/html": [ 68 | "
\n", 69 | "\n", 82 | "\n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | "
PassengerIdPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
2982991Saalfeld, Mr. AdolphemaleNaN001998830.5000C106S
8848853Sutehall, Mr. Henry Jrmale25.0000SOTON/OQ 3920767.0500NaNS
2472482Hamalainen, Mrs. William (Anna)female24.000225064914.5000NaNS
4784793Karlsson, Mr. Nils Augustmale22.00003500607.5208NaNS
3053061Allison, Master. Hudson Trevormale0.9212113781151.5500C22 C26S
\n", 172 | "
" 173 | ], 174 | "text/plain": [ 175 | " PassengerId Pclass Name Sex Age \\\n", 176 | "298 299 1 Saalfeld, Mr. Adolphe male NaN \n", 177 | "884 885 3 Sutehall, Mr. Henry Jr male 25.00 \n", 178 | "247 248 2 Hamalainen, Mrs. William (Anna) female 24.00 \n", 179 | "478 479 3 Karlsson, Mr. Nils August male 22.00 \n", 180 | "305 306 1 Allison, Master. Hudson Trevor male 0.92 \n", 181 | "\n", 182 | " SibSp Parch Ticket Fare Cabin Embarked \n", 183 | "298 0 0 19988 30.5000 C106 S \n", 184 | "884 0 0 SOTON/OQ 392076 7.0500 NaN S \n", 185 | "247 0 2 250649 14.5000 NaN S \n", 186 | "478 0 0 350060 7.5208 NaN S \n", 187 | "305 1 2 113781 151.5500 C22 C26 S " 188 | ] 189 | }, 190 | "execution_count": 4, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "# Viewing first few rows of X_train dataset\n", 197 | "X_train.head()" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "## Creating Our Pipeline\n", 205 | "With our data imported, we're ready to go ahead and start creating our pipeline. As mentioned above, we'll only be using the default transformers here, so we definitely won't be getting great results out of our model predictions. But that's okay! The purpose here is learning how to use a pipeline.\n", 206 | "\n", 207 | "Note: You might be wondering in the next cell why we're creating a column transformer for a single column. This is because in the next post, we'll be adding custom transformers making use of mostly the same code you'll see below. (With a few additions!)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 5, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "# Creating a preprocessor to transform the 'Sex' column\n", 217 | "data_preprocessor = ColumnTransformer(transformers = [\n", 218 | " ('sex_transformer', OneHotEncoder(), ['Sex'])\n", 219 | "])" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 6, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "# Creating our pipeline that first preprocesses the data, then scales the data, then fits the data to a RandomForestClassifier\n", 229 | "rfc_pipeline = Pipeline(steps = [\n", 230 | " ('data_preprocessing', data_preprocessor),\n", 231 | " ('data_scaling', StandardScaler()),\n", 232 | " ('model', RandomForestClassifier(max_depth = 10,\n", 233 | " min_samples_leaf = 3,\n", 234 | " min_samples_split = 4,\n", 235 | " n_estimators = 200))\n", 236 | "])" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 7, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "name": "stderr", 246 | "output_type": "stream", 247 | "text": [ 248 | "/Users/dkhundley/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py:354: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples,), for example using ravel().\n", 249 | " self._final_estimator.fit(Xt, y, **fit_params)\n" 250 | ] 251 | }, 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "Pipeline(memory=None,\n", 256 | " steps=[('data_preprocessing',\n", 257 | " ColumnTransformer(n_jobs=None, remainder='drop',\n", 258 | " sparse_threshold=0.3,\n", 259 | " transformer_weights=None,\n", 260 | " transformers=[('sex_transformer',\n", 261 | " OneHotEncoder(categories='auto',\n", 262 | " drop=None,\n", 263 | " dtype=,\n", 264 | " handle_unknown='error',\n", 265 | " sparse=True),\n", 266 | " ['Sex'])],\n", 267 | " verbose=False)),\n", 268 | " ('data_scaling',\n", 269 | " StandardScaler(copy=True,...\n", 270 | " RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,\n", 271 | " class_weight=None, criterion='gini',\n", 272 | " max_depth=10, max_features='auto',\n", 273 | " max_leaf_nodes=None, max_samples=None,\n", 274 | " min_impurity_decrease=0.0,\n", 275 | " min_impurity_split=None,\n", 276 | " min_samples_leaf=3, min_samples_split=4,\n", 277 | " min_weight_fraction_leaf=0.0,\n", 278 | " n_estimators=200, n_jobs=None,\n", 279 | " oob_score=False, random_state=None,\n", 280 | " verbose=0, warm_start=False))],\n", 281 | " verbose=False)" 282 | ] 283 | }, 284 | "execution_count": 7, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 289 | "source": [ 290 | "# Fitting the training data to our pipeline\n", 291 | "rfc_pipeline.fit(X_train, y_train)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 8, 297 | "metadata": {}, 298 | "outputs": [ 299 | { 300 | "data": { 301 | "text/plain": [ 302 | "['model/rfc_pipeline.pkl']" 303 | ] 304 | }, 305 | "execution_count": 8, 306 | "metadata": {}, 307 | "output_type": "execute_result" 308 | } 309 | ], 310 | "source": [ 311 | "# Saving our pipeline to a binary pickle file\n", 312 | "joblib.dump(rfc_pipeline, 'model/rfc_pipeline.pkl')" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 9, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "# Loading back in our serialized model\n", 322 | "loaded_model = joblib.load('model/rfc_pipeline.pkl')" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 10, 328 | "metadata": {}, 329 | "outputs": [ 330 | { 331 | "name": "stdout", 332 | "output_type": "stream", 333 | "text": [ 334 | "Accuracy Score: 0.7847533632286996\n", 335 | "ROC AUC Score: 0.7718430320308569\n", 336 | "Confusion Matrix: \n", 337 | "[[112 22]\n", 338 | " [ 26 63]]\n" 339 | ] 340 | } 341 | ], 342 | "source": [ 343 | "# Checking out our predicted results using the validation dataset\n", 344 | "pipeline_preds = loaded_model.predict(X_val)\n", 345 | "\n", 346 | "val_accuracy = accuracy_score(y_val, pipeline_preds)\n", 347 | "val_roc_auc = roc_auc_score(y_val, pipeline_preds)\n", 348 | "val_confusion_matrix = confusion_matrix(y_val, pipeline_preds)\n", 349 | "\n", 350 | "print(f'Accuracy Score: {val_accuracy}')\n", 351 | "print(f'ROC AUC Score: {val_roc_auc}')\n", 352 | "print(f'Confusion Matrix: \\n{val_confusion_matrix}')" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [] 361 | } 362 | ], 363 | "metadata": { 364 | "kernelspec": { 365 | "display_name": "Python 3", 366 | "language": "python", 367 | "name": "python3" 368 | }, 369 | "language_info": { 370 | "codemirror_mode": { 371 | "name": "ipython", 372 | "version": 3 373 | }, 374 | 
"file_extension": ".py", 375 | "mimetype": "text/x-python", 376 | "name": "python", 377 | "nbconvert_exporter": "python", 378 | "pygments_lexer": "ipython3", 379 | "version": "3.7.6" 380 | } 381 | }, 382 | "nbformat": 4, 383 | "nbformat_minor": 2 384 | } 385 | -------------------------------------------------------------------------------- /004_pipeline_custom_transformers/model/rfc_pipeline.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/004_pipeline_custom_transformers/model/rfc_pipeline.pkl -------------------------------------------------------------------------------- /004_pipeline_custom_transformers/titanic_custom_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Science Quick Tip #004: Using Custom Transformers in Scikit-Learn Pipelines!\n", 8 | "In our last post, we covered how to use Scikit-Learn pipelines to conjoin all the appropriate transformers into a single output. In this new post, we'll take things a step further by adding custom transformers to the pipeline. Because this is very much building on top of the last post, much of this code should already appear to be familiar to you." 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "## Project Setup\n", 16 | "Let's go ahead and import the libraries we'll be using as well as the datasets." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# Importing the libraries we'll be using for this project\n", 26 | "import pandas as pd\n", 27 | "import joblib\n", 28 | "\n", 29 | "from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer\n", 30 | "from sklearn.impute import SimpleImputer\n", 31 | "from sklearn.compose import ColumnTransformer\n", 32 | "from sklearn.pipeline import Pipeline\n", 33 | "from sklearn.ensemble import RandomForestClassifier\n", 34 | "from sklearn.model_selection import train_test_split\n", 35 | "from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# Importing the training dataset\n", 45 | "raw_train = pd.read_csv('../data/titanic/train.csv')" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# Splitting the training data into appropriate training and validation sets\n", 55 | "X = raw_train.drop(columns = ['Survived'])\n", 56 | "y = raw_train[['Survived']]\n", 57 | "\n", 58 | "X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/html": [ 69 | "
\n", 70 | "\n", 83 | "\n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | "
PassengerIdPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
2982991Saalfeld, Mr. AdolphemaleNaN001998830.5000C106S
8848853Sutehall, Mr. Henry Jrmale25.0000SOTON/OQ 3920767.0500NaNS
2472482Hamalainen, Mrs. William (Anna)female24.000225064914.5000NaNS
4784793Karlsson, Mr. Nils Augustmale22.00003500607.5208NaNS
3053061Allison, Master. Hudson Trevormale0.9212113781151.5500C22 C26S
\n", 173 | "
" 174 | ], 175 | "text/plain": [ 176 | " PassengerId Pclass Name Sex Age \\\n", 177 | "298 299 1 Saalfeld, Mr. Adolphe male NaN \n", 178 | "884 885 3 Sutehall, Mr. Henry Jr male 25.00 \n", 179 | "247 248 2 Hamalainen, Mrs. William (Anna) female 24.00 \n", 180 | "478 479 3 Karlsson, Mr. Nils August male 22.00 \n", 181 | "305 306 1 Allison, Master. Hudson Trevor male 0.92 \n", 182 | "\n", 183 | " SibSp Parch Ticket Fare Cabin Embarked \n", 184 | "298 0 0 19988 30.5000 C106 S \n", 185 | "884 0 0 SOTON/OQ 392076 7.0500 NaN S \n", 186 | "247 0 2 250649 14.5000 NaN S \n", 187 | "478 0 0 350060 7.5208 NaN S \n", 188 | "305 1 2 113781 151.5500 C22 C26 S " 189 | ] 190 | }, 191 | "execution_count": 4, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "# Viewing first few rows of X_train dataset\n", 198 | "X_train.head()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 5, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/html": [ 209 | "
\n", 210 | "\n", 223 | "\n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | "
Survived
2981
8840
2471
4780
3051
\n", 253 | "
" 254 | ], 255 | "text/plain": [ 256 | " Survived\n", 257 | "298 1\n", 258 | "884 0\n", 259 | "247 1\n", 260 | "478 0\n", 261 | "305 1" 262 | ] 263 | }, 264 | "execution_count": 5, 265 | "metadata": {}, 266 | "output_type": "execute_result" 267 | } 268 | ], 269 | "source": [ 270 | "# Viewing first few rows of y_train dataset\n", 271 | "y_train.head()" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "## Creating Our Pipeline (Now With Custom Transformers!)\n", 279 | "With our data imported, we're ready to go ahead and start creating our pipeline. As mentioned above, we'll only be using the default transformers here, so we definitely won't be getting great results out of our model predictions. But that's okay! The purpose here is learning how to use a pipeline.\n", 280 | "\n", 281 | "Note: You might be wondering in the next cell why we're creating a column transformer for a single column. This is because in the next post, we'll be adding custom transformers making use of mostly the same code you'll see below. (With a few additions!)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 6, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "# Creating a function to appropriately engineer the 'Age' column\n", 291 | "def create_age_bins(col):\n", 292 | " '''Engineers age bin variables for pipeline'''\n", 293 | " \n", 294 | " # Defining / instantiating the necessary variables\n", 295 | " age_bins = [-1, 12, 18, 25, 50, 100]\n", 296 | " age_labels = ['child', 'teen', 'young_adult', 'adult', 'elder']\n", 297 | " age_imputer = SimpleImputer(strategy = 'median')\n", 298 | " age_ohe = OneHotEncoder()\n", 299 | " \n", 300 | " # Performing basic imputation for nulls\n", 301 | " imputed = age_imputer.fit_transform(col)\n", 302 | " ages_filled = pd.DataFrame(data = imputed, columns = ['Age'])\n", 303 | " \n", 304 | " # Segregating ages into age bins\n", 305 | " age_cat_cols = pd.cut(ages_filled['Age'], bins = age_bins, labels = age_labels)\n", 306 | " age_cats = pd.DataFrame(data = age_cat_cols, columns = ['Age'])\n", 307 | " \n", 308 | " # One hot encoding new age bins\n", 309 | " ages_encoded = age_ohe.fit_transform(age_cats[['Age']])\n", 310 | " ages_encoded = pd.DataFrame(data = ages_encoded.toarray())\n", 311 | " \n", 312 | " return ages_encoded" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 7, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "# Creating function to appropriately engineer the 'Embarked' column\n", 322 | "def create_embarked_columns(col):\n", 323 | " '''Engineers the embarked variables for pipeline'''\n", 324 | " \n", 325 | " # Instantiating the transformer objects\n", 326 | " embarked_imputer = SimpleImputer(strategy = 'most_frequent')\n", 327 | " embarked_ohe = OneHotEncoder()\n", 328 | " \n", 329 | " # Performing basic imputation for nulls\n", 330 | " imputed = embarked_imputer.fit_transform(col)\n", 331 | " embarked_filled = pd.DataFrame(data = imputed, columns = ['Embarked'])\n", 332 | " \n", 333 | " # Performing OHE on the col data\n", 334 | " embarked_columns = embarked_ohe.fit_transform(embarked_filled[['Embarked']])\n", 335 | " embarked_columns_df = pd.DataFrame(data = embarked_columns.toarray())\n", 336 | " \n", 337 | " return embarked_columns_df" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 8, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "# Creating a preprocessor to transform 
the 'Sex' column\n", 347 | "data_preprocessor = ColumnTransformer(transformers = [\n", 348 | " ('sex_transformer', OneHotEncoder(), ['Sex']),\n", 349 | " ('age_transformer', FunctionTransformer(create_age_bins, validate = False), ['Age']),\n", 350 | " ('embarked_transformer', FunctionTransformer(create_embarked_columns, validate = False), ['Embarked'])\n", 351 | "])" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 9, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "# Creating our pipeline that first preprocesses the data, then scales the data, then fits the data to a RandomForestClassifier\n", 361 | "rfc_pipeline = Pipeline(steps = [\n", 362 | " ('data_preprocessing', data_preprocessor),\n", 363 | " ('data_scaling', StandardScaler()),\n", 364 | " ('model', RandomForestClassifier(max_depth = 10,\n", 365 | " min_samples_leaf = 3,\n", 366 | " min_samples_split = 4,\n", 367 | " n_estimators = 200))\n", 368 | "])" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 10, 374 | "metadata": {}, 375 | "outputs": [ 376 | { 377 | "name": "stderr", 378 | "output_type": "stream", 379 | "text": [ 380 | "/Users/dkhundley/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py:354: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", 381 | " self._final_estimator.fit(Xt, y, **fit_params)\n" 382 | ] 383 | }, 384 | { 385 | "data": { 386 | "text/plain": [ 387 | "Pipeline(memory=None,\n", 388 | " steps=[('data_preprocessing',\n", 389 | " ColumnTransformer(n_jobs=None, remainder='drop',\n", 390 | " sparse_threshold=0.3,\n", 391 | " transformer_weights=None,\n", 392 | " transformers=[('sex_transformer',\n", 393 | " OneHotEncoder(categories='auto',\n", 394 | " drop=None,\n", 395 | " dtype=,\n", 396 | " handle_unknown='error',\n", 397 | " sparse=True),\n", 398 | " ['Sex']),\n", 399 | " ('age_transformer',\n", 400 | " FunctionTransformer(accept_sparse=False...\n", 401 | " RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,\n", 402 | " class_weight=None, criterion='gini',\n", 403 | " max_depth=10, max_features='auto',\n", 404 | " max_leaf_nodes=None, max_samples=None,\n", 405 | " min_impurity_decrease=0.0,\n", 406 | " min_impurity_split=None,\n", 407 | " min_samples_leaf=3, min_samples_split=4,\n", 408 | " min_weight_fraction_leaf=0.0,\n", 409 | " n_estimators=200, n_jobs=None,\n", 410 | " oob_score=False, random_state=None,\n", 411 | " verbose=0, warm_start=False))],\n", 412 | " verbose=False)" 413 | ] 414 | }, 415 | "execution_count": 10, 416 | "metadata": {}, 417 | "output_type": "execute_result" 418 | } 419 | ], 420 | "source": [ 421 | "# Fitting the training data to our pipeline\n", 422 | "rfc_pipeline.fit(X_train, y_train)" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 11, 428 | "metadata": {}, 429 | "outputs": [ 430 | { 431 | "data": { 432 | "text/plain": [ 433 | "['model/rfc_pipeline.pkl']" 434 | ] 435 | }, 436 | "execution_count": 11, 437 | "metadata": {}, 438 | "output_type": "execute_result" 439 | } 440 | ], 441 | "source": [ 442 | "# Saving our pipeline to a binary pickle file\n", 443 | "joblib.dump(rfc_pipeline, 'model/rfc_pipeline.pkl')" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 12, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "# Loading back in our serialized model\n", 453 | "loaded_model = joblib.load('model/rfc_pipeline.pkl')" 
454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 13, 459 | "metadata": {}, 460 | "outputs": [ 461 | { 462 | "name": "stdout", 463 | "output_type": "stream", 464 | "text": [ 465 | "Accuracy Score: 0.7847533632286996\n", 466 | "ROC AUC Score: 0.7775029347643804\n", 467 | "Confusion Matrix: \n", 468 | "[[109 25]\n", 469 | " [ 23 66]]\n" 470 | ] 471 | } 472 | ], 473 | "source": [ 474 | "# Checking out our predicted results using the validation dataset\n", 475 | "pipeline_preds = loaded_model.predict(X_val)\n", 476 | "\n", 477 | "val_accuracy = accuracy_score(y_val, pipeline_preds)\n", 478 | "val_roc_auc = roc_auc_score(y_val, pipeline_preds)\n", 479 | "val_confusion_matrix = confusion_matrix(y_val, pipeline_preds)\n", 480 | "\n", 481 | "print(f'Accuracy Score: {val_accuracy}')\n", 482 | "print(f'ROC AUC Score: {val_roc_auc}')\n", 483 | "print(f'Confusion Matrix: \\n{val_confusion_matrix}')" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [] 492 | } 493 | ], 494 | "metadata": { 495 | "kernelspec": { 496 | "display_name": "Python 3", 497 | "language": "python", 498 | "name": "python3" 499 | }, 500 | "language_info": { 501 | "codemirror_mode": { 502 | "name": "ipython", 503 | "version": 3 504 | }, 505 | "file_extension": ".py", 506 | "mimetype": "text/x-python", 507 | "name": "python", 508 | "nbconvert_exporter": "python", 509 | "pygments_lexer": "ipython3", 510 | "version": "3.7.6" 511 | } 512 | }, 513 | "nbformat": 4, 514 | "nbformat_minor": 2 515 | } 516 | -------------------------------------------------------------------------------- /005_two_ways_to_ohe/ce_ohe.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/005_two_ways_to_ohe/ce_ohe.pkl -------------------------------------------------------------------------------- /005_two_ways_to_ohe/sklearn_ohe.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/005_two_ways_to_ohe/sklearn_ohe.pkl -------------------------------------------------------------------------------- /007_performance_testing_locust/__pycache__/locustfile.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/007_performance_testing_locust/__pycache__/locustfile.cpython-37.pyc -------------------------------------------------------------------------------- /007_performance_testing_locust/api/__pycache__/api.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/007_performance_testing_locust/api/__pycache__/api.cpython-37.pyc -------------------------------------------------------------------------------- /007_performance_testing_locust/api/__pycache__/helpers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/007_performance_testing_locust/api/__pycache__/helpers.cpython-37.pyc -------------------------------------------------------------------------------- 
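One caveat on the FunctionTransformer helpers in the 004 notebook above: they call fit_transform inside the function body, so the imputer and encoder are re-fit on whatever rows arrive at prediction time rather than reusing statistics learned during training. A minimal sketch of the stateful alternative — a custom transformer class that learns its state once in fit and reuses it in transform — is below; the class is illustrative and not part of this repo:

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

class AgeBinTransformer(BaseEstimator, TransformerMixin):
    '''Bins the Age column, learning the imputation value only during fit'''

    def __init__(self):
        self.age_bins = [-1, 12, 18, 25, 50, 100]
        self.age_labels = ['child', 'teen', 'young_adult', 'adult', 'elder']
        self.imputer = SimpleImputer(strategy = 'median')

    def fit(self, X, y = None):
        # Learning the median age from the training data only
        self.imputer.fit(X)
        return self

    def transform(self, X):
        # Reusing the median learned at fit time instead of re-fitting
        ages_filled = pd.DataFrame(self.imputer.transform(X), columns = ['Age'])
        age_cats = pd.cut(ages_filled['Age'], bins = self.age_bins, labels = self.age_labels)
        # get_dummies on a categorical series yields one stable column per label
        return pd.get_dummies(age_cats)

Dropped into the ColumnTransformer in place of the age FunctionTransformer entry, this keeps the training-time median fixed at inference time.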
-------------------------------------------------------------------------------- /007_performance_testing_locust/api/api.py: --------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import joblib
 3 | from flask import Flask, request, json, Response, jsonify, make_response
 4 | from helpers import create_embarked_columns, create_age_bins
 5 | 
 6 | 
 7 | ## PRELOADED COMPONENTS
 8 | # ------------------------------------------------------------------------------
 9 | 
10 | # Instantiating the Flask application
11 | application = Flask(__name__)
12 | 
13 | # Loading the saved, serialized model
14 | model = joblib.load('../model/rfc_pipeline.pkl')
15 | 
16 | ## API ENDPOINTS
17 | # ------------------------------------------------------------------------------
18 | 
19 | # Defining our prediction endpoint
20 | @application.route('/predict', methods = ['POST'])
21 | def predict():
22 | 
23 |     # Getting incoming data from request
24 |     predict_json = request.json
25 | 
26 |     # Transforming JSON data to DataFrame
27 |     predict_df = pd.json_normalize(predict_json)
28 | 
29 |     # Running data through model
30 |     preds = model.predict(predict_df)
31 | 
32 |     # Prepping preds to be returned to user
33 |     js = json.dumps({'preds': str(preds[0])})
34 | 
35 |     return Response(js, status = 200, mimetype = 'application/json')
36 | 
37 | 
38 | 
39 | # Defining a basic health endpoint
40 | @application.route('/health', methods = ['GET'])
41 | def health():
42 | 
43 |     # Dumping out simple health message
44 |     js = json.dumps({'Status': 'Healthy!'})
45 | 
46 |     return Response(js, status = 200, mimetype = 'application/json')
47 | 
48 | 
49 | ## SCRIPT INVOCATION
50 | # ------------------------------------------------------------------------------
51 | 
52 | if __name__ == '__main__':
53 |     # Starting the Flask API on script invocation
54 |     application.run(host = '0.0.0.0', debug = True)
-------------------------------------------------------------------------------- /007_performance_testing_locust/api/helpers.py: --------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from sklearn.impute import SimpleImputer
 3 | from sklearn.preprocessing import OneHotEncoder, StandardScaler
 4 | 
 5 | # Creating a function to appropriately engineer the 'Age' column
 6 | def create_age_bins(col):
 7 |     '''Engineers age bin variables for pipeline'''
 8 | 
 9 |     # Defining / instantiating the necessary variables
10 |     age_bins = [-1, 12, 18, 25, 50, 100]
11 |     age_labels = ['child', 'teen', 'young_adult', 'adult', 'elder']
12 |     age_imputer = SimpleImputer(strategy = 'median')
13 |     age_ohe = OneHotEncoder()
14 | 
15 |     # Performing basic imputation for nulls
16 |     imputed = age_imputer.fit_transform(col)
17 |     ages_filled = pd.DataFrame(data = imputed, columns = ['Age'])
18 | 
19 |     # Segregating ages into age bins
20 |     age_cat_cols = pd.cut(ages_filled['Age'], bins = age_bins, labels = age_labels)
21 |     age_cats = pd.DataFrame(data = age_cat_cols, columns = ['Age'])
22 | 
23 |     # One hot encoding new age bins
24 |     ages_encoded = age_ohe.fit_transform(age_cats[['Age']])
25 |     ages_encoded = pd.DataFrame(data = ages_encoded.toarray())
26 | 
27 |     return ages_encoded
28 | 
29 | 
30 | 
31 | # Creating function to appropriately engineer the 'Embarked' column
32 | def create_embarked_columns(col):
33 |     '''Engineers the embarked variables for pipeline'''
34 | 
35 |     # Instantiating the transformer objects
36 |     embarked_imputer = SimpleImputer(strategy = 'most_frequent')
37 |     embarked_ohe = OneHotEncoder()
38 | 
39 |     # Performing basic imputation for nulls
40 |     imputed = embarked_imputer.fit_transform(col)
41 |     embarked_filled = pd.DataFrame(data = imputed, columns = ['Embarked'])
42 | 
43 |     # Performing OHE on the col data
44 |     embarked_columns = embarked_ohe.fit_transform(embarked_filled[['Embarked']])
45 |     embarked_columns_df = pd.DataFrame(data = embarked_columns.toarray())
46 | 
47 |     return embarked_columns_df
-------------------------------------------------------------------------------- /007_performance_testing_locust/api/run.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | gunicorn --bind 0.0.0.0:5001 --workers 5 api:application
-------------------------------------------------------------------------------- /007_performance_testing_locust/locustfile.py: --------------------------------------------------------------------------------
 1 | from locust import HttpUser, task, between
 2 | import json
 3 | 
 4 | # Loading the test JSON data
 5 | with open('test_data/test_1.json') as f:
 6 |     test_data = json.loads(f.read())
 7 | 
 8 | # Creating an API User class inheriting from Locust's HttpUser class
 9 | class APIUser(HttpUser):
10 |     # Setting the host name and wait_time
11 |     host = 'http://localhost:5001'
12 |     wait_time = between(3, 5)
13 | 
14 |     # Defining the post task using the JSON test data
15 |     @task()
16 |     def predict_endpoint(self):
17 |         self.client.post('/predict', json = test_data)
-------------------------------------------------------------------------------- /007_performance_testing_locust/model/rfc_pipeline.pkl: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/007_performance_testing_locust/model/rfc_pipeline.pkl
-------------------------------------------------------------------------------- /007_performance_testing_locust/test_data/test_1.json: --------------------------------------------------------------------------------
1 | [{"PassengerId":892,"Pclass":3,"Name":"Kelly, Mr. James","Sex":"male","Age":34.5,"SibSp":0,"Parch":0,"Ticket":"330911","Fare":7.8292,"Cabin":null,"Embarked":"Q"}]
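For a quick smoke test of the API above without curl or Locust, here is a minimal Python equivalent of tests.sh; it assumes the API has already been started locally on port 5001 via api/run.sh and that the requests package is installed:

import json
import requests

# Loading the same test record that tests.sh and the locustfile use
with open('test_data/test_1.json') as f:
    test_data = json.load(f)

# Posting it to the locally running API (started with api/run.sh)
response = requests.post('http://localhost:5001/predict', json = test_data)
print(response.status_code, response.json())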
-------------------------------------------------------------------------------- /007_performance_testing_locust/test_data/test_2.json: --------------------------------------------------------------------------------
1 | [{"PassengerId":893,"Pclass":3,"Name":"Wilkes, Mrs. James (Ellen Needs)","Sex":"female","Age":47.0,"SibSp":1,"Parch":0,"Ticket":"363272","Fare":7.0,"Cabin":null,"Embarked":"S"}]
-------------------------------------------------------------------------------- /007_performance_testing_locust/test_data/tests.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | echo 'Test 1'
3 | curl --request POST --header 'content-type: application/json' --data @test_1.json --url localhost:5001/predict
4 | echo
5 | echo 'Test 2'
6 | curl --request POST --header 'content-type: application/json' --data @test_2.json --url localhost:5001/predict
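The 008 scripts that follow log runs to a local ./mlruns store (visible in the directory tree at the top of this dump). As a companion, here is a minimal sketch of pulling those runs back out for comparison with MLflow's Python API; the column names follow directly from the params and metrics the scripts log:

import mlflow

# By default MLflow reads from the local ./mlruns directory these scripts write to;
# experiment ID '0' is the default experiment seen in the tree above
runs = mlflow.search_runs(experiment_ids = ['0'])

# Comparing the logged parameters and metrics across runs (missing values show as NaN)
print(runs[['run_id', 'params.alpha', 'params.l1_ratio', 'metrics.rmse', 'metrics.r2']])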
| " mlflow.log_param('l1_ratio', l1_ratio)\n", 81 | " mlflow.log_metric('rmse', rmse)\n", 82 | " mlflow.log_metric('abs_error', abs_error)\n", 83 | " mlflow.log_metric('r2', r2)\n", 84 | " \n", 85 | " # Logging model to MLFlow\n", 86 | " mlflow.sklearn.log_model(model, 'model')" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [] 95 | } 96 | ], 97 | "metadata": { 98 | "kernelspec": { 99 | "display_name": "Python 3", 100 | "language": "python", 101 | "name": "python3" 102 | }, 103 | "language_info": { 104 | "codemirror_mode": { 105 | "name": "ipython", 106 | "version": 3 107 | }, 108 | "file_extension": ".py", 109 | "mimetype": "text/x-python", 110 | "name": "python", 111 | "nbconvert_exporter": "python", 112 | "pygments_lexer": "ipython3", 113 | "version": "3.7.6" 114 | } 115 | }, 116 | "nbformat": 4, 117 | "nbformat_minor": 2 118 | } 119 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlflow-existing-model.py: -------------------------------------------------------------------------------- 1 | # Importing in necessary libraries 2 | import pandas as pd 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 5 | from sklearn.linear_model import ElasticNet 6 | import mlflow 7 | import mlflow.sklearn 8 | import joblib 9 | 10 | # Loading serialized model 11 | model = joblib.load('model/model.pkl') 12 | 13 | # Logging model to MLFlow 14 | mlflow.sklearn.log_model(model, 'model') 15 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlflow-wine.py: -------------------------------------------------------------------------------- 1 | # Importing in necessary libraries 2 | import pandas as pd 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 5 | from sklearn.linear_model import ElasticNet 6 | import mlflow 7 | import mlflow.sklearn 8 | 9 | # Loading data and prepping for training 10 | df_wine = pd.read_csv('../data/wine/train.csv') 11 | 12 | X = df_wine.drop(columns = 'quality') 13 | y = df_wine[['quality']] 14 | 15 | X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42) 16 | 17 | # Defining model parameters 18 | alpha = 1 19 | l1_ratio = 1 20 | 21 | # Running MLFlow script 22 | with mlflow.start_run(): 23 | 24 | # Instantiating model with model parameters 25 | model = ElasticNet(alpha = alpha, 26 | l1_ratio = l1_ratio) 27 | 28 | # Fitting training data to the model 29 | model.fit(X_train, y_train) 30 | 31 | # Running prediction on validation dataset 32 | preds = model.predict(X_val) 33 | 34 | # Getting metrics on the validation dataset 35 | rmse = mean_squared_error(preds, y_val) 36 | abs_error = mean_absolute_error(preds, y_val) 37 | r2 = r2_score(preds, y_val) 38 | 39 | # Logging params and metrics to MLFlow 40 | mlflow.log_param('alpha', alpha) 41 | mlflow.log_param('l1_ratio', l1_ratio) 42 | mlflow.log_metric('rmse', rmse) 43 | mlflow.log_metric('abs_error', abs_error) 44 | mlflow.log_metric('r2', r2) 45 | 46 | # Logging model to MLFlow 47 | mlflow.sklearn.log_model(model, 'model') 48 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlflow_wine_notebook.ipynb: -------------------------------------------------------------------------------- 
1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Science Quick Tips #008: MLFlow Part 1 - Getting Started with MLFlow!\n", 8 | "In this first post in our sub-series on MLFlow, we're going to take things easy by getting up and running with MLFlow. To keep the modeling simple, we're going to use the Red Wine quality dataset instead of the Titanic dataset we've made use of in other posts. The notebook here contains essentially the same code you'll find in mlflow-wine.py. The only difference is that when you launch the MLFlow UI, you'll be able to see that the source is technically different." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "# Importing in necessary libraries\n", 18 | "import pandas as pd\n", 19 | "\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n", 22 | "from sklearn.linear_model import ElasticNet\n", 23 | "\n", 24 | "import mlflow\n", 25 | "import mlflow.sklearn" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# Loading data and prepping for training\n", 35 | "df_wine = pd.read_csv('../data/wine/train.csv')\n", 36 | "\n", 37 | "X = df_wine.drop(columns = 'quality')\n", 38 | "y = df_wine[['quality']]\n", 39 | "\n", 40 | "X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# Defining model parameters\n", 50 | "alpha = 1\n", 51 | "l1_ratio = 0.5" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# Running MLFlow script\n", 61 | "with mlflow.start_run():\n", 62 | " \n", 63 | " # Instantiating model with model parameters\n", 64 | " model = ElasticNet(alpha = alpha,\n", 65 | " l1_ratio = l1_ratio)\n", 66 | " \n", 67 | " # Fitting training data to the model\n", 68 | " model.fit(X_train, y_train)\n", 69 | " \n", 70 | " # Running prediction on validation dataset\n", 71 | " preds = model.predict(X_val)\n", 72 | " \n", 73 | " # Getting metrics on the validation dataset (y_true first; squared = False returns RMSE)\n", 74 | " rmse = mean_squared_error(y_val, preds, squared = False)\n", 75 | " abs_error = mean_absolute_error(y_val, preds)\n", 76 | " r2 = r2_score(y_val, preds)\n", 77 | " \n", 78 | " # Logging params and metrics to MLFlow\n", 79 | " mlflow.log_param('alpha', alpha)\n", 80 | " mlflow.log_param('l1_ratio', l1_ratio)\n", 81 | " mlflow.log_metric('rmse', rmse)\n", 82 | " mlflow.log_metric('abs_error', abs_error)\n", 83 | " mlflow.log_metric('r2', r2)\n", 84 | " \n", 85 | " # Logging model to MLFlow\n", 86 | " mlflow.sklearn.log_model(model, 'model')" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 6, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "['model/model.pkl']" 98 | ] 99 | }, 100 | "execution_count": 6, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "import joblib\n", 107 | "joblib.dump(model, 'model/model.pkl')" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [] 116 | } 117 | ], 118 | "metadata": { 119 | "kernelspec": {
"display_name": "Python 3", 121 | "language": "python", 122 | "name": "python3" 123 | }, 124 | "language_info": { 125 | "codemirror_mode": { 126 | "name": "ipython", 127 | "version": 3 128 | }, 129 | "file_extension": ".py", 130 | "mimetype": "text/x-python", 131 | "name": "python", 132 | "nbconvert_exporter": "python", 133 | "pygments_lexer": "ipython3", 134 | "version": "3.7.6" 135 | } 136 | }, 137 | "nbformat": 4, 138 | "nbformat_minor": 2 139 | } 140 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/artifacts/model/MLmodel: -------------------------------------------------------------------------------- 1 | artifact_path: model 2 | flavors: 3 | python_function: 4 | env: conda.yaml 5 | loader_module: mlflow.sklearn 6 | model_path: model.pkl 7 | python_version: 3.7.6 8 | sklearn: 9 | pickled_model: model.pkl 10 | serialization_format: cloudpickle 11 | sklearn_version: 0.22.2.post1 12 | run_id: 09fa7bc156ff4d59b4b00b8fdbe84728 13 | utc_time_created: '2020-09-26 23:40:11.580877' 14 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/artifacts/model/conda.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - defaults 3 | - conda-forge 4 | dependencies: 5 | - python=3.7.6 6 | - scikit-learn=0.22.2.post1 7 | - pip 8 | - pip: 9 | - mlflow 10 | - cloudpickle==1.2.2 11 | name: mlflow-env 12 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/artifacts/model/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/artifacts/model/model.pkl -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/meta.yaml: -------------------------------------------------------------------------------- 1 | artifact_uri: file:///Users/dkhundley/Documents/Repositories/ds-quick-tips/008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/artifacts 2 | end_time: 1601163611593 3 | entry_point_name: '' 4 | experiment_id: '0' 5 | lifecycle_stage: active 6 | name: '' 7 | run_id: 09fa7bc156ff4d59b4b00b8fdbe84728 8 | run_uuid: 09fa7bc156ff4d59b4b00b8fdbe84728 9 | source_name: '' 10 | source_type: 4 11 | source_version: '' 12 | start_time: 1601163611321 13 | status: 3 14 | tags: [] 15 | user_id: dkhundley 16 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/metrics/abs_error: -------------------------------------------------------------------------------- 1 | 1601163611578 0.6442845590438651 0 2 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/metrics/r2: -------------------------------------------------------------------------------- 1 | 1601163611579 -25.56867782599562 0 2 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/metrics/rmse: 
-------------------------------------------------------------------------------- 1 | 1601163611576 0.6163736624975248 0 2 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/params/alpha: -------------------------------------------------------------------------------- 1 | 1 -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/params/l1_ratio: -------------------------------------------------------------------------------- 1 | 0.5 -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/tags/mlflow.log-model.history: -------------------------------------------------------------------------------- 1 | [{"run_id": "09fa7bc156ff4d59b4b00b8fdbe84728", "artifact_path": "model", "utc_time_created": "2020-09-26 23:40:11.580877", "flavors": {"python_function": {"model_path": "model.pkl", "loader_module": "mlflow.sklearn", "python_version": "3.7.6", "env": "conda.yaml"}, "sklearn": {"pickled_model": "model.pkl", "sklearn_version": "0.22.2.post1", "serialization_format": "cloudpickle"}}}] -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/tags/mlflow.source.name: -------------------------------------------------------------------------------- 1 | /Users/dkhundley/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/tags/mlflow.source.type: -------------------------------------------------------------------------------- 1 | LOCAL -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/tags/mlflow.user: -------------------------------------------------------------------------------- 1 | dkhundley -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/artifacts/model/MLmodel: -------------------------------------------------------------------------------- 1 | artifact_path: model 2 | flavors: 3 | python_function: 4 | env: conda.yaml 5 | loader_module: mlflow.sklearn 6 | model_path: model.pkl 7 | python_version: 3.7.6 8 | sklearn: 9 | pickled_model: model.pkl 10 | serialization_format: cloudpickle 11 | sklearn_version: 0.22.2.post1 12 | run_id: 27398db7fb544a269a0c85ec637bbab9 13 | utc_time_created: '2020-09-29 23:05:32.886408' 14 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/artifacts/model/conda.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - defaults 3 | - conda-forge 4 | dependencies: 5 | - python=3.7.6 6 | - scikit-learn=0.22.2.post1 7 | - pip 8 | - pip: 9 | - mlflow 10 | - cloudpickle==1.2.2 11 | name: mlflow-env 12 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/artifacts/model/model.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/artifacts/model/model.pkl -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/meta.yaml: -------------------------------------------------------------------------------- 1 | artifact_uri: file:///Users/dkhundley/Documents/Repositories/ds-quick-tips/008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/artifacts 2 | end_time: 1601420732900 3 | entry_point_name: '' 4 | experiment_id: '0' 5 | lifecycle_stage: active 6 | name: '' 7 | run_id: 27398db7fb544a269a0c85ec637bbab9 8 | run_uuid: 27398db7fb544a269a0c85ec637bbab9 9 | source_name: '' 10 | source_type: 4 11 | source_version: '' 12 | start_time: 1601420732876 13 | status: 3 14 | tags: [] 15 | user_id: dkhundley 16 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/tags/mlflow.log-model.history: -------------------------------------------------------------------------------- 1 | [{"run_id": "27398db7fb544a269a0c85ec637bbab9", "artifact_path": "model", "utc_time_created": "2020-09-29 23:05:32.886408", "flavors": {"python_function": {"model_path": "model.pkl", "loader_module": "mlflow.sklearn", "python_version": "3.7.6", "env": "conda.yaml"}, "sklearn": {"pickled_model": "model.pkl", "sklearn_version": "0.22.2.post1", "serialization_format": "cloudpickle"}}}] -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/tags/mlflow.source.git.commit: -------------------------------------------------------------------------------- 1 | 061a26344091a53ae58beaf1efe36cce697843a2 -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/tags/mlflow.source.name: -------------------------------------------------------------------------------- 1 | mlflow-existing-model.py -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/tags/mlflow.source.type: -------------------------------------------------------------------------------- 1 | LOCAL -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/tags/mlflow.user: -------------------------------------------------------------------------------- 1 | dkhundley -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/artifacts/model/MLmodel: -------------------------------------------------------------------------------- 1 | artifact_path: model 2 | flavors: 3 | python_function: 4 | env: conda.yaml 5 | loader_module: mlflow.sklearn 6 | model_path: model.pkl 7 | python_version: 3.7.6 8 | sklearn: 9 | pickled_model: model.pkl 10 | serialization_format: cloudpickle 11 | sklearn_version: 0.22.2.post1 12 | run_id: 5a2bf3f0cb504b40ac9cd9a70af32ac6 13 | utc_time_created: '2020-09-29 23:02:21.478675' 14 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/artifacts/model/conda.yaml: 
-------------------------------------------------------------------------------- 1 | channels: 2 | - defaults 3 | - conda-forge 4 | dependencies: 5 | - python=3.7.6 6 | - scikit-learn=0.22.2.post1 7 | - pip 8 | - pip: 9 | - mlflow 10 | - cloudpickle==1.2.2 11 | name: mlflow-env 12 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/artifacts/model/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/artifacts/model/model.pkl -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/meta.yaml: -------------------------------------------------------------------------------- 1 | artifact_uri: file:///Users/dkhundley/Documents/Repositories/ds-quick-tips/008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/artifacts 2 | end_time: 1601420541495 3 | entry_point_name: '' 4 | experiment_id: '0' 5 | lifecycle_stage: active 6 | name: '' 7 | run_id: 5a2bf3f0cb504b40ac9cd9a70af32ac6 8 | run_uuid: 5a2bf3f0cb504b40ac9cd9a70af32ac6 9 | source_name: '' 10 | source_type: 4 11 | source_version: '' 12 | start_time: 1601420540829 13 | status: 3 14 | tags: [] 15 | user_id: dkhundley 16 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/metrics/abs_error: -------------------------------------------------------------------------------- 1 | 1601420541476 0.6442845590438651 0 2 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/metrics/r2: -------------------------------------------------------------------------------- 1 | 1601420541477 -25.56867782599562 0 2 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/metrics/rmse: -------------------------------------------------------------------------------- 1 | 1601420541475 0.6163736624975248 0 2 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/params/alpha: -------------------------------------------------------------------------------- 1 | 1 -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/params/l1_ratio: -------------------------------------------------------------------------------- 1 | 0.5 -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/tags/mlflow.log-model.history: -------------------------------------------------------------------------------- 1 | [{"run_id": "5a2bf3f0cb504b40ac9cd9a70af32ac6", "artifact_path": "model", "utc_time_created": "2020-09-29 23:02:21.478675", "flavors": {"python_function": {"model_path": "model.pkl", "loader_module": "mlflow.sklearn", "python_version": "3.7.6", "env": "conda.yaml"}, "sklearn": {"pickled_model": "model.pkl", "sklearn_version": "0.22.2.post1", "serialization_format": "cloudpickle"}}}] 
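The mlruns files above are not meant to be edited by hand; they are the on-disk backing store that the MLflow UI and client APIs read. As a minimal sketch (an assumption-laden example, not part of the repo: it presumes you run it from inside 008_mlflow_getting_started so the local ./mlruns store above is found, and it reuses one of the run IDs recorded there), a model logged with mlflow.sklearn.log_model can be reloaded directly:

# Reloading a logged model from the local ./mlruns file store (illustrative sketch)
import mlflow.sklearn
import pandas as pd

# A runs:/<run_id>/<artifact_path> URI resolves against the active tracking store
model = mlflow.sklearn.load_model('runs:/5a2bf3f0cb504b40ac9cd9a70af32ac6/model')

# Scoring features prepared the same way as in mlflow-wine.py
df_wine = pd.read_csv('../data/wine/train.csv')
preds = model.predict(df_wine.drop(columns = 'quality'))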
-------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/tags/mlflow.source.name: -------------------------------------------------------------------------------- 1 | /Users/dkhundley/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/tags/mlflow.source.type: -------------------------------------------------------------------------------- 1 | LOCAL -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/tags/mlflow.user: -------------------------------------------------------------------------------- 1 | dkhundley -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/artifacts/model/MLmodel: -------------------------------------------------------------------------------- 1 | artifact_path: model 2 | flavors: 3 | python_function: 4 | env: conda.yaml 5 | loader_module: mlflow.sklearn 6 | model_path: model.pkl 7 | python_version: 3.7.6 8 | sklearn: 9 | pickled_model: model.pkl 10 | serialization_format: cloudpickle 11 | sklearn_version: 0.22.2.post1 12 | run_id: 93cfbd77d77f4e308297d9a47ea3abd6 13 | utc_time_created: '2020-09-26 23:48:14.718142' 14 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/artifacts/model/conda.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - defaults 3 | - conda-forge 4 | dependencies: 5 | - python=3.7.6 6 | - scikit-learn=0.22.2.post1 7 | - pip 8 | - pip: 9 | - mlflow 10 | - cloudpickle==1.2.2 11 | name: mlflow-env 12 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/artifacts/model/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/artifacts/model/model.pkl -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/meta.yaml: -------------------------------------------------------------------------------- 1 | artifact_uri: file:///Users/dkhundley/Documents/Repositories/ds-quick-tips/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/artifacts 2 | end_time: 1601164094731 3 | entry_point_name: '' 4 | experiment_id: '0' 5 | lifecycle_stage: deleted 6 | name: '' 7 | run_id: 93cfbd77d77f4e308297d9a47ea3abd6 8 | run_uuid: 93cfbd77d77f4e308297d9a47ea3abd6 9 | source_name: '' 10 | source_type: 4 11 | source_version: '' 12 | start_time: 1601164094443 13 | status: 3 14 | tags: [] 15 | user_id: dkhundley 16 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/metrics/abs_error: -------------------------------------------------------------------------------- 1 | 1601164094715 0.6468353681504646 0 2 | 
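Each file under metrics/ holds one space-separated record per logged value: a Unix timestamp in milliseconds, the metric value, and the step index (0 here, since every metric was logged exactly once per run). Rather than parsing these files by hand, the tracking client can read them back; a minimal sketch, assuming the same local ./mlruns store and one of the active run IDs above:

# Reading a metric's history through the tracking API (illustrative sketch)
from mlflow.tracking import MlflowClient

client = MlflowClient()
for metric in client.get_metric_history('09fa7bc156ff4d59b4b00b8fdbe84728', 'rmse'):
    # Each entry mirrors one line of the metrics file: timestamp, value, step
    print(metric.timestamp, metric.value, metric.step)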
-------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/metrics/r2: -------------------------------------------------------------------------------- 1 | 1601164094716 -31.43732803690922 0 2 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/metrics/rmse: -------------------------------------------------------------------------------- 1 | 1601164094714 0.6150055162124933 0 2 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/params/alpha: -------------------------------------------------------------------------------- 1 | 1 -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/params/l1_ratio: -------------------------------------------------------------------------------- 1 | 1 -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/tags/mlflow.log-model.history: -------------------------------------------------------------------------------- 1 | [{"run_id": "93cfbd77d77f4e308297d9a47ea3abd6", "artifact_path": "model", "utc_time_created": "2020-09-26 23:48:14.718142", "flavors": {"python_function": {"model_path": "model.pkl", "loader_module": "mlflow.sklearn", "python_version": "3.7.6", "env": "conda.yaml"}, "sklearn": {"pickled_model": "model.pkl", "sklearn_version": "0.22.2.post1", "serialization_format": "cloudpickle"}}}] -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/tags/mlflow.source.git.commit: -------------------------------------------------------------------------------- 1 | a069e3387d68e43416be7a2b7626dc8d102a5079 -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/tags/mlflow.source.name: -------------------------------------------------------------------------------- 1 | mlflow-wine.py -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/tags/mlflow.source.type: -------------------------------------------------------------------------------- 1 | LOCAL -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/tags/mlflow.user: -------------------------------------------------------------------------------- 1 | dkhundley -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/artifacts/model/MLmodel: -------------------------------------------------------------------------------- 1 | artifact_path: model 2 | flavors: 3 | python_function: 4 | env: conda.yaml 5 | loader_module: mlflow.sklearn 6 | model_path: model.pkl 7 | python_version: 3.7.6 8 | sklearn: 9 | pickled_model: model.pkl 10 | serialization_format: cloudpickle 11 | sklearn_version: 0.22.2.post1 12 | run_id: befa6150910e4724b1248ee939971dc2 13 | utc_time_created: '2020-09-26 23:40:30.180546' 14 | 
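The MLmodel file above is what turns a logged artifact into an MLflow "model": it declares two flavors for the same pickle, the native sklearn flavor and the generic python_function flavor. The pyfunc flavor means the model can be loaded and scored without knowing it is scikit-learn underneath; a minimal sketch, again assuming the local ./mlruns store above is on disk:

# Loading the model through the generic pyfunc flavor declared in MLmodel (illustrative sketch)
import mlflow.pyfunc
import pandas as pd

model = mlflow.pyfunc.load_model('runs:/befa6150910e4724b1248ee939971dc2/model')
df_wine = pd.read_csv('../data/wine/train.csv')
print(model.predict(df_wine.drop(columns = 'quality')))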
-------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/artifacts/model/conda.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - defaults 3 | - conda-forge 4 | dependencies: 5 | - python=3.7.6 6 | - scikit-learn=0.22.2.post1 7 | - pip 8 | - pip: 9 | - mlflow 10 | - cloudpickle==1.2.2 11 | name: mlflow-env 12 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/artifacts/model/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/artifacts/model/model.pkl -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/meta.yaml: -------------------------------------------------------------------------------- 1 | artifact_uri: file:///Users/dkhundley/Documents/Repositories/ds-quick-tips/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/artifacts 2 | end_time: 1601163630196 3 | entry_point_name: '' 4 | experiment_id: '0' 5 | lifecycle_stage: active 6 | name: '' 7 | run_id: befa6150910e4724b1248ee939971dc2 8 | run_uuid: befa6150910e4724b1248ee939971dc2 9 | source_name: '' 10 | source_type: 4 11 | source_version: '' 12 | start_time: 1601163630009 13 | status: 3 14 | tags: [] 15 | user_id: dkhundley 16 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/metrics/abs_error: -------------------------------------------------------------------------------- 1 | 1601163630177 0.6468353681504646 0 2 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/metrics/r2: -------------------------------------------------------------------------------- 1 | 1601163630178 -31.43732803690922 0 2 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/metrics/rmse: -------------------------------------------------------------------------------- 1 | 1601163630176 0.6150055162124933 0 2 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/params/alpha: -------------------------------------------------------------------------------- 1 | 1 -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/params/l1_ratio: -------------------------------------------------------------------------------- 1 | 1 -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/tags/mlflow.log-model.history: -------------------------------------------------------------------------------- 1 | [{"run_id": "befa6150910e4724b1248ee939971dc2", "artifact_path": "model", "utc_time_created": "2020-09-26 23:40:30.180546", "flavors": {"python_function": {"model_path": "model.pkl", "loader_module": "mlflow.sklearn", 
"python_version": "3.7.6", "env": "conda.yaml"}, "sklearn": {"pickled_model": "model.pkl", "sklearn_version": "0.22.2.post1", "serialization_format": "cloudpickle"}}}] -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/tags/mlflow.source.git.commit: -------------------------------------------------------------------------------- 1 | a069e3387d68e43416be7a2b7626dc8d102a5079 -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/tags/mlflow.source.name: -------------------------------------------------------------------------------- 1 | mlflow-wine.py -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/tags/mlflow.source.type: -------------------------------------------------------------------------------- 1 | LOCAL -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/tags/mlflow.user: -------------------------------------------------------------------------------- 1 | dkhundley -------------------------------------------------------------------------------- /008_mlflow_getting_started/mlruns/0/meta.yaml: -------------------------------------------------------------------------------- 1 | artifact_location: file:///Users/dkhundley/Documents/Repositories/ds-quick-tips/008_mlflow_getting_started/mlruns/0 2 | experiment_id: '0' 3 | lifecycle_stage: active 4 | name: Default 5 | -------------------------------------------------------------------------------- /008_mlflow_getting_started/model/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/008_mlflow_getting_started/model/model.pkl -------------------------------------------------------------------------------- /009_mlflow_tracking_server/Dockerfile: -------------------------------------------------------------------------------- 1 | # Defining base image 2 | FROM python:3.8.2-slim 3 | 4 | # Installing packages from PyPi 5 | RUN pip install mlflow[extras]==1.9.1 && \ 6 | pip install psycopg2-binary==2.8.5 && \ 7 | pip install boto3==1.15.16 8 | 9 | # Defining start up command 10 | EXPOSE 5000 11 | ENTRYPOINT ["mlflow", "server"] 12 | -------------------------------------------------------------------------------- /009_mlflow_tracking_server/k8s/mlflow_deployment.yaml: -------------------------------------------------------------------------------- 1 | # Creating MLflow deployment 2 | apiVersion: apps/v1 3 | kind: Deployment 4 | metadata: 5 | name: mlflow-deployment 6 | spec: 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | app: mlflow-deployment 11 | template: 12 | metadata: 13 | labels: 14 | app: mlflow-deployment 15 | spec: 16 | containers: 17 | - name: mlflow-deployment 18 | image: dkhundley/mlflow-server:1.0.3 19 | imagePullPolicy: Always 20 | args: 21 | - --host=0.0.0.0 22 | - --port=5000 23 | - --backend-store-uri=postgresql://mlflow_user:mlflow_pwd@10.109.74.95:5432/mlflow_db 24 | - --default-artifact-root=s3://mlflow/ 25 | - --workers=2 26 | env: 27 | - name: MLFLOW_S3_ENDPOINT_URL 28 | value: http://10.111.110.13:9000/ 29 | - name: AWS_ACCESS_KEY_ID 30 | value: "minio" 31 | - name: AWS_SECRET_ACCESS_KEY 32 | 
value: "minio123" 33 | ports: 34 | - name: http 35 | containerPort: 5000 36 | protocol: TCP 37 | resources: 38 | requests: 39 | cpu: "500m" 40 | --- 41 | apiVersion: v1 42 | kind: Service 43 | metadata: 44 | name: mlflow-service 45 | spec: 46 | type: NodePort 47 | ports: 48 | - port: 5000 49 | targetPort: 5000 50 | protocol: TCP 51 | name: http 52 | selector: 53 | app: mlflow-deployment 54 | --- 55 | apiVersion: networking.k8s.io/v1beta1 56 | kind: Ingress 57 | metadata: 58 | name: mlflow-ingress 59 | annotations: 60 | kubernetes.io/ingress.class: nginx 61 | nginx.ingress.kubernetes.il/add-base-url: "true" 62 | spec: 63 | rules: 64 | - host: mlflow-server.local 65 | http: 66 | paths: 67 | - backend: 68 | serviceName: mlflow-service 69 | servicePort: 5000 70 | path: / 71 | -------------------------------------------------------------------------------- /009_mlflow_tracking_server/k8s/mlflow_minio.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: mlflow-minio 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: mlflow-minio 9 | template: 10 | metadata: 11 | labels: 12 | app: mlflow-minio 13 | spec: 14 | volumes: 15 | - name: mlflow-pvc 16 | persistentVolumeClaim: 17 | claimName: mlflow-pvc 18 | containers: 19 | - name: mlflow-minio 20 | image: minio/minio:latest 21 | args: 22 | - server 23 | - /data 24 | volumeMounts: 25 | - name: mlflow-pvc 26 | mountPath: '/data' 27 | env: 28 | - name: MINIO_ACCESS_KEY 29 | value: "minio" 30 | - name: MINIO_SECRET_KEY 31 | value: "minio123" 32 | ports: 33 | - containerPort: 9000 34 | --- 35 | apiVersion: v1 36 | kind: Service 37 | metadata: 38 | name: mlflow-minio-service 39 | spec: 40 | type: NodePort 41 | ports: 42 | - port: 9000 43 | targetPort: 9000 44 | protocol: TCP 45 | selector: 46 | app: mlflow-minio 47 | --- 48 | apiVersion: networking.k8s.io/v1beta1 49 | kind: Ingress 50 | metadata: 51 | name: mlflow-minio-ingress 52 | annotations: 53 | kubernetes.io/ingress.class: nginx 54 | nginx.ingress.kubernetes.il/add-base-url: "true" 55 | nginx.ingress.kubernetes.io/ssl-redirect: "false" 56 | spec: 57 | rules: 58 | - host: mlflow-minio.local 59 | http: 60 | paths: 61 | - backend: 62 | serviceName: mlflow-minio-service 63 | servicePort: 9000 64 | path: / 65 | --- 66 | apiVersion: v1 67 | kind: PersistentVolumeClaim 68 | metadata: 69 | name: mlflow-pvc 70 | spec: 71 | accessModes: 72 | - ReadWriteMany 73 | resources: 74 | requests: 75 | storage: 100Mi 76 | -------------------------------------------------------------------------------- /009_mlflow_tracking_server/k8s/mlflow_postgres.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: mlflow-postgres-config 5 | labels: 6 | app: mlflow-postgres 7 | data: 8 | POSTGRES_DB: mlflow_db 9 | POSTGRES_USER: mlflow_user 10 | POSTGRES_PASSWORD: mlflow_pwd 11 | PGDATA: /var/lib/postgresql/mlflow/data 12 | --- 13 | apiVersion: apps/v1 14 | kind: StatefulSet 15 | metadata: 16 | name: mlflow-postgres 17 | labels: 18 | app: mlflow-postgres 19 | spec: 20 | selector: 21 | matchLabels: 22 | app: mlflow-postgres 23 | serviceName: "mlflow-postgres-service" 24 | replicas: 1 25 | template: 26 | metadata: 27 | labels: 28 | app: mlflow-postgres 29 | spec: 30 | containers: 31 | - name: mlflow-postgres 32 | image: postgres:11 33 | ports: 34 | - containerPort: 5432 35 | protocol: TCP 36 | envFrom: 37 | - configMapRef: 38 | name: 
mlflow-postgres-config 39 | resources: 40 | requests: 41 | memory: "1Gi" 42 | cpu: "500m" 43 | volumeMounts: 44 | - name: mlflow-pvc 45 | mountPath: /var/lib/postgresql/mlflow 46 | volumeClaimTemplates: 47 | - metadata: 48 | name: mlflow-pvc 49 | spec: 50 | accessModes: [ "ReadWriteOnce" ] 51 | resources: 52 | requests: 53 | storage: 100Mi 54 | --- 55 | apiVersion: v1 56 | kind: Service 57 | metadata: 58 | name: mlflow-postgres-service 59 | labels: 60 | svc: mlflow-postgres-service 61 | spec: 62 | type: NodePort 63 | ports: 64 | - port: 5432 65 | targetPort: 5432 66 | protocol: TCP 67 | selector: 68 | app: mlflow-postgres 69 | -------------------------------------------------------------------------------- /010_mlflow_logging_to_server/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Importing in necessary libraries\n", 10 | "import pandas as pd\n", 11 | "from sklearn.model_selection import train_test_split\n", 12 | "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n", 13 | "from sklearn.linear_model import ElasticNet\n", 14 | "import mlflow\n", 15 | "import mlflow.sklearn" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "# Setting tracking URI\n", 25 | "mlflow.set_tracking_uri('http://mlflow-server.local')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# Loading data and prepping for training\n", 35 | "df_wine = pd.read_csv('../data/wine/train.csv')\n", 36 | "\n", 37 | "X = df_wine.drop(columns = 'quality')\n", 38 | "y = df_wine[['quality']]\n", 39 | "\n", 40 | "X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 4, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# Defining model parameters\n", 50 | "alpha = 1\n", 51 | "l1_ratio = 1" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 6, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "ename": "PermissionError", 61 | "evalue": "[Errno 13] Permission denied: '/opt/mlflow'", 62 | "output_type": "error", 63 | "traceback": [ 64 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 65 | "\u001b[0;31mPermissionError\u001b[0m Traceback (most recent call last)", 66 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;31m# Logging model to MLFlow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m \u001b[0mmlflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'model'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 67 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/site-packages/mlflow/sklearn/__init__.py\u001b[0m in \u001b[0;36mlog_model\u001b[0;34m(sk_model, artifact_path, conda_env, serialization_format, registered_model_name, signature, input_example)\u001b[0m\n\u001b[1;32m 296\u001b[0m 
\u001b[0mregistered_model_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mregistered_model_name\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 297\u001b[0m \u001b[0msignature\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msignature\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 298\u001b[0;31m \u001b[0minput_example\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minput_example\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 299\u001b[0m )\n\u001b[1;32m 300\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 68 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/site-packages/mlflow/models/model.py\u001b[0m in \u001b[0;36mlog\u001b[0;34m(cls, artifact_path, flavor, registered_model_name, **kwargs)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0mmlflow_model\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0martifact_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0martifact_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrun_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0mflavor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlocal_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmlflow_model\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmlflow_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 161\u001b[0;31m \u001b[0mmlflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtracking\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfluent\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog_artifacts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlocal_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0martifact_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 162\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0mmlflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtracking\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfluent\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_record_logged_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmlflow_model\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 69 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/site-packages/mlflow/tracking/fluent.py\u001b[0m in \u001b[0;36mlog_artifacts\u001b[0;34m(local_dir, artifact_path)\u001b[0m\n\u001b[1;32m 326\u001b[0m \"\"\"\n\u001b[1;32m 327\u001b[0m \u001b[0mrun_id\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_or_start_run\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_id\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 328\u001b[0;31m \u001b[0mMlflowClient\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog_artifacts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlocal_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0martifact_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 329\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 330\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 70 | 
"\u001b[0;32m~/opt/anaconda3/lib/python3.7/site-packages/mlflow/tracking/client.py\u001b[0m in \u001b[0;36mlog_artifacts\u001b[0;34m(self, run_id, local_dir, artifact_path)\u001b[0m\n\u001b[1;32m 288\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0mparam\u001b[0m \u001b[0martifact_path\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mIf\u001b[0m \u001b[0mprovided\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mdirectory\u001b[0m \u001b[0;32min\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0martifact_uri\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m \u001b[0mto\u001b[0m \u001b[0mwrite\u001b[0m \u001b[0mto\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 289\u001b[0m \"\"\"\n\u001b[0;32m--> 290\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_tracking_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog_artifacts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlocal_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0martifact_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 291\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 292\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_record_logged_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmlflow_model\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 71 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/site-packages/mlflow/tracking/_tracking_service/client.py\u001b[0m in \u001b[0;36mlog_artifacts\u001b[0;34m(self, run_id, local_dir, artifact_path)\u001b[0m\n\u001b[1;32m 286\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0mparam\u001b[0m \u001b[0martifact_path\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mIf\u001b[0m \u001b[0mprovided\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mdirectory\u001b[0m \u001b[0;32min\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0martifact_uri\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m \u001b[0mto\u001b[0m \u001b[0mwrite\u001b[0m \u001b[0mto\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 287\u001b[0m \"\"\"\n\u001b[0;32m--> 288\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_artifact_repo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog_artifacts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlocal_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0martifact_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 289\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 290\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mlist_artifacts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 72 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/site-packages/mlflow/store/artifact/local_artifact_repo.py\u001b[0m in \u001b[0;36mlog_artifacts\u001b[0;34m(self, local_dir, artifact_path)\u001b[0m\n\u001b[1;32m 55\u001b[0m )\n\u001b[1;32m 56\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m 
\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0martifact_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 57\u001b[0;31m \u001b[0mmkdir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0martifact_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 58\u001b[0m \u001b[0mdir_util\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy_tree\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlocal_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdst\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0martifact_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpreserve_mode\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpreserve_times\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 73 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/site-packages/mlflow/utils/file_utils.py\u001b[0m in \u001b[0;36mmkdir\u001b[0;34m(root, name)\u001b[0m\n\u001b[1;32m 109\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 110\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrno\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0merrno\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEEXIST\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misdir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 111\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 112\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 74 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/site-packages/mlflow/utils/file_utils.py\u001b[0m in \u001b[0;36mmkdir\u001b[0;34m(root, name)\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[0mtarget\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mroot\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mroot\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 108\u001b[0;31m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmakedirs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 109\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 110\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrno\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0merrno\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEEXIST\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misdir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 75 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/os.py\u001b[0m in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhead\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mtail\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0mmakedirs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexist_ok\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mexist_ok\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mFileExistsError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;31m# Defeats race condition when another thread created the path\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 76 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/os.py\u001b[0m in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhead\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mtail\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0mmakedirs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexist_ok\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mexist_ok\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mFileExistsError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;31m# Defeats race condition when another thread created the path\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 77 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/os.py\u001b[0m in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhead\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mtail\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m 
\u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0mmakedirs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexist_ok\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mexist_ok\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mFileExistsError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;31m# Defeats race condition when another thread created the path\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 78 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/os.py\u001b[0m in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhead\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mtail\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0mmakedirs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexist_ok\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mexist_ok\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mFileExistsError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;31m# Defeats race condition when another thread created the path\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 79 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/os.py\u001b[0m in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhead\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mtail\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0mmakedirs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexist_ok\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mexist_ok\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mFileExistsError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;31m# Defeats race condition when another thread created the path\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 80 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/os.py\u001b[0m in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 220\u001b[0m 
80 | "~/opt/anaconda3/lib/python3.7/os.py in makedirs(name, mode, exist_ok)\n    219             return\n    220     try:\n--> 221         mkdir(name, mode)\n    222     except OSError:\n    223         # Cannot rely on checking for EEXIST, since the operating system\n",
81 | "PermissionError: [Errno 13] Permission denied: '/opt/mlflow'"
82 | ]
83 | }
84 | ],
85 | "source": [
86 | "# Running MLFlow script\n",
87 | "with mlflow.start_run():\n",
88 | "\n",
89 | "    # Instantiating model with model parameters\n",
90 | "    model = ElasticNet(alpha = alpha,\n",
91 | "                       l1_ratio = l1_ratio)\n",
92 | "\n",
93 | "    # Fitting training data to the model\n",
94 | "    model.fit(X_train, y_train)\n",
95 | "\n",
96 | "    # Running prediction on validation dataset\n",
97 | "    preds = model.predict(X_val)\n",
98 | "\n",
99 | "    # Getting metrics on the validation dataset (y_true comes first; squared = False yields RMSE)\n",
100 | "    rmse = mean_squared_error(y_val, preds, squared = False)\n",
101 | "    abs_error = mean_absolute_error(y_val, preds)\n",
102 | "    r2 = r2_score(y_val, preds)\n",
103 | "\n",
104 | "    # Logging params and metrics to MLFlow\n",
105 | "    mlflow.log_param('alpha', alpha)\n",
106 | "    mlflow.log_param('l1_ratio', l1_ratio)\n",
107 | "    mlflow.log_metric('rmse', rmse)\n",
108 | "    mlflow.log_metric('abs_error', abs_error)\n",
109 | "    mlflow.log_metric('r2', r2)\n",
110 | "\n",
111 | "    # Logging model to MLFlow (artifact_path is required by mlflow.sklearn.log_model)\n",
112 | "    mlflow.sklearn.log_model(sk_model = model, artifact_path = 'model')"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": []
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": []
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": []
135 | }
136 | ],
137 | "metadata": {
138 | "kernelspec": {
139 | "display_name": "Python 3",
140 | "language": "python",
141 | "name": "python3"
142 | },
143 | "language_info": {
144 | "codemirror_mode": {
145 | "name": "ipython",
146 | "version": 3
147 | },
148 | "file_extension": ".py",
149 | "mimetype": "text/x-python",
150 | "name": "python",
151 | "nbconvert_exporter": "python",
152 | "pygments_lexer": "ipython3",
153 | "version": "3.7.6"
154 | }
155 | },
156 | "nbformat": 4,
157 | "nbformat_minor": 2
158 | }
159 | 
--------------------------------------------------------------------------------
/010_mlflow_logging_to_server/mlflow-wine.py:
--------------------------------------------------------------------------------
1 | # Importing in the necessary libraries
2 | import os
3 | import pandas as pd
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
6 | from sklearn.linear_model import ElasticNet
7 | import mlflow
8 | import mlflow.sklearn
9 | 
10 | 
11 | 
12 | # PROJECT SETUP
13 | # ------------------------------------------------------------------------------
14 | # Setting the MLflow tracking server
15 | mlflow.set_tracking_uri('http://mlflow-server.local')
16 | 
17 | # Setting the required environment variables
18 | os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://mlflow-minio.local/'
19 | os.environ['AWS_ACCESS_KEY_ID'] = 'minio'
20 | os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'
21 | 
22 | # Loading data from a CSV file
23 | df_wine = pd.read_csv('../data/wine/train.csv')
24 | 
25 | # Separating the target class ('quality') from remainder of the training data
26 | X = df_wine.drop(columns = 'quality')
27 | y = df_wine[['quality']]
28 | 
29 | # Splitting the data into training and validation sets
30 | X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)
31 | 
32 | 
33 | 
34 | 
35 | # MODEL TRAINING AND LOGGING
36 | # ------------------------------------------------------------------------------
37 | # Defining model parameters
38 | alpha = 1
39 | l1_ratio = 1
40 | 
41 | # Running MLFlow script
42 | with mlflow.start_run():
43 | 
44 |     # Instantiating model with model parameters
45 |     model = ElasticNet(alpha = alpha,
46 |                        l1_ratio = l1_ratio)
47 | 
48 |     # Fitting training data to the model
49 |     model.fit(X_train, y_train)
50 | 
51 |     # Running prediction on validation dataset
52 |     preds = model.predict(X_val)
53 | 
54 |     # Getting metrics on the validation dataset (y_true comes first; squared = False yields RMSE)
55 |     rmse = mean_squared_error(y_val, preds, squared = False)
56 |     abs_error = mean_absolute_error(y_val, preds)
57 |     r2 = r2_score(y_val, preds)
58 | 
59 |     # Logging params and metrics to MLFlow
60 |     mlflow.log_param('alpha', alpha)
61 |     mlflow.log_param('l1_ratio', l1_ratio)
62 |     mlflow.log_metric('rmse', rmse)
63 |     mlflow.log_metric('abs_error', abs_error)
64 |     mlflow.log_metric('r2', r2)
65 | 
66 |     # Logging training data
67 |     mlflow.log_artifact(local_path = '../data/wine/train.csv')
68 | 
69 |     # Logging training code
70 |     mlflow.log_artifact(local_path = './mlflow-wine.py')
71 | 
72 |     # Logging model to MLFlow
73 |     mlflow.sklearn.log_model(sk_model = model,
74 |                              artifact_path = 'wine-pyfile-model',
75 |                              registered_model_name = 'wine-pyfile-model')
76 | 
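Because the run above passes registered_model_name, MLflow also registers the trained model in its model registry, so it can be loaded back by registry name and version rather than by raw S3 artifact URI. A minimal sketch of that round trip, assuming the same tracking server and MinIO artifact store are reachable and that a version 1 of the registered model exists (the version number here is illustrative):

import os
import mlflow
import mlflow.sklearn

# Pointing at the same tracking server and artifact store as above
mlflow.set_tracking_uri('http://mlflow-server.local')
os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://mlflow-minio.local/'
os.environ['AWS_ACCESS_KEY_ID'] = 'minio'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'

# Loading the model straight from the registry with a models:/ URI
model = mlflow.sklearn.load_model(model_uri = 'models:/wine-pyfile-model/1')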
--------------------------------------------------------------------------------
/011_mlflow_interacting_with_client/MLflow_client_interaction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Importing our required libraries\n",
10 | "import mlflow\n",
11 | "import mlflow.sklearn\n",
12 | "import pandas as pd\n",
13 | "import os"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 2,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "# Setting the MLflow client\n",
23 | "client = mlflow.tracking.MlflowClient(tracking_uri = 'http://mlflow-server.local')"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 3,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "# Setting the required environment variables\n",
33 | "os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://mlflow-minio.local/'\n",
34 | "os.environ['AWS_ACCESS_KEY_ID'] = 'minio'\n",
35 | "os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 4,
41 | "metadata": {},
42 | "outputs": [
43 | {
44 | "data": {
45 | "text/plain": [
46 | "[]"
47 | ]
48 | },
49 | "execution_count": 4,
50 | "metadata": {},
51 | "output_type": "execute_result"
52 | }
53 | ],
54 | "source": [
55 | "# Listing the MLflow experiments\n",
56 | "client.list_experiments()"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 7,
62 | "metadata": {},
63 | "outputs": [
64 | {
65 | "name": "stdout",
66 | "output_type": "stream",
67 | "text": [
68 | "name=wine-pyfile-model; version=1\n",
69 | "name=wine-pyfile-model; version=2\n",
70 | "name=wine-pyfile-model; version=3\n"
71 | ]
72 | }
73 | ],
74 | "source": [
75 | "# Getting the model versions for the wine-pyfile-model\n",
76 | "results = client.search_model_versions(\"name='wine-pyfile-model'\")\n",
77 | "\n",
78 | "for res in results:\n",
79 | "    print(f'name={res.name}; version={res.version}')"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 5,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "s3://mlflow/0/3a496ea82c304ea38a4ebe1281f7faf2/artifacts/wine-pyfile-model/\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "# Getting the URI for version 2 of the Wine model\n",
97 | "uri = (client.get_model_version_download_uri(name = 'wine-pyfile-model', version='2')) + '/'\n",
98 | "print(uri)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 8,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "# Loading the model using the URI above\n",
108 | "model = mlflow.sklearn.load_model(model_uri = uri)"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 9,
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "data": {
118 | "text/plain": [
119 | "ElasticNet(alpha=1, copy_X=True, fit_intercept=True, l1_ratio=1, max_iter=1000,\n",
120 | "          normalize=False, positive=False, precompute=False, random_state=None,\n",
121 | "          selection='cyclic', tol=0.0001, warm_start=False)"
122 | ]
123 | },
124 | "execution_count": 9,
125 | "metadata": {},
126 | "output_type": "execute_result"
127 | }
128 | ],
129 | "source": [
130 | "# Showing the model object itself\n",
131 | "model"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 11,
"metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "# Loading in the training data\n", 141 | "df_wine = pd.read_csv('../data/wine/train.csv')" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 12, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "# Dropping the predictor column\n", 151 | "X = df_wine.drop(columns = ['quality'])" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 14, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "array([5.68130396, 5.5414187 , 5.59652501, ..., 5.65587028, 5.63891449,\n", 163 | " 5.64739238])" 164 | ] 165 | }, 166 | "execution_count": 14, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "# Getting model predictions\n", 173 | "model.predict(X)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [] 182 | } 183 | ], 184 | "metadata": { 185 | "kernelspec": { 186 | "display_name": "Python 3", 187 | "language": "python", 188 | "name": "python3" 189 | }, 190 | "language_info": { 191 | "codemirror_mode": { 192 | "name": "ipython", 193 | "version": 3 194 | }, 195 | "file_extension": ".py", 196 | "mimetype": "text/x-python", 197 | "name": "python", 198 | "nbconvert_exporter": "python", 199 | "pygments_lexer": "ipython3", 200 | "version": "3.7.6" 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 2 205 | } 206 | -------------------------------------------------------------------------------- /011_mlflow_interacting_with_client/mlruns/0/meta.yaml: -------------------------------------------------------------------------------- 1 | artifact_location: file:///Users/dkhundley/Documents/Repositories/ds-quick-tips/011_mlflow_interacting_with_client/mlruns/0 2 | experiment_id: '0' 3 | lifecycle_stage: active 4 | name: Default 5 | -------------------------------------------------------------------------------- /012_dockerizing_fastapi/Dockerfile: -------------------------------------------------------------------------------- 1 | # Starting with base image 2 | FROM python:3.8-slim-buster 3 | 4 | # Installing required packages from requirements.txt file 5 | COPY dependencies/requirements.txt / 6 | RUN pip install -r /requirements.txt 7 | 8 | # Copying the FastAPI inference script 9 | COPY container/ /container 10 | 11 | # Setting the working directory appropriately 12 | WORKDIR /container 13 | 14 | # Exposing the appropriate port on the container 15 | EXPOSE 5000 16 | 17 | # Setting the entrypoint for the container 18 | ENTRYPOINT ["uvicorn"] -------------------------------------------------------------------------------- /012_dockerizing_fastapi/container/api.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle 3 | from fastapi import FastAPI, Request 4 | from pydantic import BaseModel 5 | 6 | 7 | 8 | ## API INSTANTIATION 9 | ## ---------------------------------------------------------------- 10 | 11 | # Instantiating FastAPI 12 | api = FastAPI() 13 | 14 | # Loading in model from serialized .pkl file 15 | pkl_filename = "../model/iris_model.pkl" 16 | with open(pkl_filename, 'rb') as file: 17 | lr_model = pickle.load(file) 18 | 19 | # Creating the data model for data validation 20 | class Iris(BaseModel): 21 | sepal_length: float 22 | sepal_width: float 23 | petal_length: float 24 | petal_width: float 25 | 26 | 27 | 28 | ## API 
29 | ## ----------------------------------------------------------------
30 | 
31 | # Defining a test root path and message
32 | @api.get('/')
33 | def root():
34 |     return {'message': 'Hello friends!'}
35 | 
36 | 
37 | 
38 | # Defining the prediction endpoint without data validation
39 | @api.post('/basic_predict')
40 | async def basic_predict(request: Request):
41 | 
42 |     # Getting the JSON from the body of the request
43 |     input_data = await request.json()
44 | 
45 |     # Converting JSON to Pandas DataFrame
46 |     input_df = pd.DataFrame([input_data])
47 | 
48 |     # Getting the prediction from the Logistic Regression model
49 |     pred = lr_model.predict(input_df)[0]
50 | 
51 |     return pred
52 | 
53 | 
54 | 
55 | # Defining the prediction endpoint with data validation
56 | @api.post('/predict')
57 | async def predict(iris: Iris):
58 | 
59 |     # Converting input data into Pandas DataFrame
60 |     input_df = pd.DataFrame([iris.dict()])
61 | 
62 |     # Getting the prediction from the Logistic Regression model
63 |     pred = lr_model.predict(input_df)[0]
64 | 
65 |     return pred
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/container/start_api.sh:
--------------------------------------------------------------------------------
1 | uvicorn api:api --host 0.0.0.0 --port 5001 --reload
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/container/train.py:
--------------------------------------------------------------------------------
1 | # Importing the required Python libraries
2 | import numpy as np
3 | import pandas as pd
4 | import pickle
5 | from sklearn import datasets
6 | from sklearn.linear_model import LogisticRegression
7 | 
8 | # Loading the iris dataset from Scikit-Learn
9 | iris = datasets.load_iris()
10 | 
11 | # Converting the iris dataset into a Pandas DataFrame
12 | df_iris = pd.DataFrame(data = np.c_[iris['data'], iris['target']],
13 |                        columns = iris['feature_names'] + ['target'])
14 | 
15 | # Separating the training dataset (X) from the predictor value (y)
16 | X = df_iris.drop(columns = ['target'])
17 | y = df_iris[['target']]
18 | 
19 | # Instantiating a Logistic Regression (LR) model
20 | lr_model = LogisticRegression()
21 | 
22 | # Fitting the dataset to the LR model
23 | lr_model.fit(X, y)
24 | 
25 | # Saving the model to a serialized .pkl file
26 | pkl_filename = "../model/iris_model.pkl"
27 | with open(pkl_filename, 'wb') as file:
28 |     pickle.dump(lr_model, file)
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/dependencies/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi==0.63.0
2 | pandas==1.2.1
3 | scikit-learn==0.24.1
4 | uvicorn==0.13.4
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/k8s/deployment.yaml:
--------------------------------------------------------------------------------
1 | # Creating the deployment for the Iris FastAPI
2 | apiVersion: apps/v1
3 | kind: Deployment
4 | metadata:
5 |   name: fastapi-iris
6 |   labels:
7 |     app: fastapi-iris
8 | spec:
9 |   replicas: 1
10 |   selector:
11 |     matchLabels:
12 |       app: fastapi-iris
13 |   template:
14 |     metadata:
15 |       labels:
16 |         app: fastapi-iris
17 |     spec:
18 |       containers:
19 |       - name: fastapi-iris
20 |         image: fastapi-iris:1.0.0
21 |         ports:
22 |         - containerPort: 5000
23 |         resources:
24 |           requests:
25 |             cpu: 100m
26 |             memory: 100Mi
27 | ---
28 | # Creating the service to support the Iris FastAPI deployment
29 | apiVersion: v1
30 | kind: Service
31 | metadata:
32 |   name: fastapi-iris-service
33 |   labels:
34 |     app: fastapi-iris
35 | spec:
36 |   type: LoadBalancer
37 |   ports:
38 |   - port: 5000
39 |     protocol: TCP
40 |     targetPort: 5000
41 |   selector:
42 |     app: fastapi-iris
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/model/iris_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/012_dockerizing_fastapi/model/iris_model.pkl
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/notebooks/iris_model_creation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Importing the required
Python libraries\n", 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from sklearn import datasets\n", 13 | "from sklearn.linear_model import LogisticRegression" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# Loading the iris dataset\n", 23 | "iris = datasets.load_iris()\n", 24 | "\n", 25 | "# Converting the iris dataset into a pandas dataframe\n", 26 | "df_iris = pd.DataFrame(data = np.c_[iris['data'], iris['target']],\n", 27 | " columns = iris['feature_names'] + ['target'])" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 8, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "'[5.1,3.5,1.4,0.2]'" 39 | ] 40 | }, 41 | "execution_count": 8, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "df_iris.drop(columns = ['target']).iloc[0].to_json(orient = 'records')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "# Separating the training dataset (X) from the predictor value (y)\n", 57 | "X = df_iris.drop(columns = ['target'])\n", 58 | "y = df_iris[['target']]" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stderr", 68 | "output_type": "stream", 69 | "text": [ 70 | "/home/pi/.local/lib/python3.7/site-packages/sklearn/utils/validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", 71 | " return f(*args, **kwargs)\n", 72 | "/home/pi/.local/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):\n", 73 | "STOP: TOTAL NO. 
of ITERATIONS REACHED LIMIT.\n",
74 | "\n",
75 | "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
76 | "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
77 | "Please also refer to the documentation for alternative solver options:\n",
78 | "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
79 | "  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
80 | ]
81 | },
82 | {
83 | "data": {
84 | "text/plain": [
85 | "LogisticRegression()"
86 | ]
87 | },
88 | "execution_count": 4,
89 | "metadata": {},
90 | "output_type": "execute_result"
91 | }
92 | ],
93 | "source": [
94 | "# Instantiating a Logistic Regression (LR) model\n",
95 | "lr_model = LogisticRegression()\n",
96 | "\n",
97 | "# Fitting the dataset to the LR model\n",
98 | "lr_model.fit(X, y)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 6,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "# Saving model to .pkl file\n",
108 | "import pickle\n",
109 | "pkl_filename = \"../model/iris_model.pkl\"\n",
110 | "with open(pkl_filename, 'wb') as file:\n",
111 | "    pickle.dump(lr_model, file)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 6,
117 | "metadata": {},
118 | "outputs": [
119 | {
120 | "name": "stderr",
121 | "output_type": "stream",
122 | "text": [
123 | "/var/mobile/Containers/Data/Application/8FC05BBA-B11D-49BC-B4D9-87CB282BBBF2/Library/Application Support/com.rationalmatter.junoapp/python-home/lib/python3.6/site-packages/sklearn/base.py:334: UserWarning: Trying to unpickle estimator LogisticRegression from version 0.24.1 when using version 0.23.1. This might lead to breaking code or invalid results. Use at your own risk.\n",
124 | "  UserWarning)\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "# Loading model back in from .pkl file\n",
130 | "import pickle\n",
131 | "pkl_filename = \"model/iris_model.pkl\"\n",
132 | "with open(pkl_filename, 'rb') as file:\n",
133 | "    lr_loaded_model = pickle.load(file)"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 5,
139 | "metadata": {},
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/plain": [
144 | "['Dockerfile', 'k8s', 'dependencies', 'container', 'model', 'notebooks']"
145 | ]
146 | },
147 | "execution_count": 5,
148 | "metadata": {},
149 | "output_type": "execute_result"
150 | }
151 | ],
152 | "source": [
153 | "import os\n",
154 | "os.listdir()"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": []
163 | }
164 | ],
165 | "metadata": {
166 | "kernelspec": {
167 | "display_name": "Python 3",
168 | "language": "python",
169 | "name": "python3"
170 | },
171 | "language_info": {
172 | "codemirror_mode": {
173 | "name": "ipython",
174 | "version": 3
175 | },
176 | "file_extension": ".py",
177 | "mimetype": "text/x-python",
178 | "name": "python",
179 | "nbconvert_exporter": "python",
180 | "pygments_lexer": "ipython3",
181 | "version": "3.6.6+"
182 | }
183 | },
184 | "nbformat": 4,
185 | "nbformat_minor": 2
186 | }
187 | 
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/tests/test_bad_predict.sh:
--------------------------------------------------------------------------------
1 | curl --request POST \
2 |     --header 'Content-Type: application/json' \
3 |     --data @test_json/bad_data.json \
4 |     --url http://0.0.0.0:5001/predict
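Because sepal_length in bad_data.json is a string, the pydantic-validated /predict endpoint should reject this request with a 422 Unprocessable Entity before the model is ever invoked. The response body looks roughly like the following (exact wording varies across FastAPI/pydantic versions):

{
    "detail": [
        {
            "loc": ["body", "sepal_length"],
            "msg": "value is not a valid float",
            "type": "type_error.float"
        }
    ]
}

The unvalidated /basic_predict endpoint, by contrast, would pass the same payload straight to scikit-learn, where it surfaces as a server-side error instead of a clean validation message.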
-------------------------------------------------------------------------------- /012_dockerizing_fastapi/tests/test_basic_predict.sh: -------------------------------------------------------------------------------- 1 | curl --request POST \ 2 | --header 'Content-Type: application/json' \ 3 | --data @test_json/test_data.json \ 4 | --url http://0.0.0.0:5001/basic_predict -------------------------------------------------------------------------------- /012_dockerizing_fastapi/tests/test_json/bad_data.json: -------------------------------------------------------------------------------- 1 | {"sepal_length":"dkhundley","sepal_width":3.5,"petal_length":1.4,"petal_width":0.2} -------------------------------------------------------------------------------- /012_dockerizing_fastapi/tests/test_json/test_data.json: -------------------------------------------------------------------------------- 1 | {"sepal_length":5.1,"sepal_width":3.5,"petal_length":1.4,"petal_width":0.2} -------------------------------------------------------------------------------- /012_dockerizing_fastapi/tests/test_predict.sh: -------------------------------------------------------------------------------- 1 | curl --request POST \ 2 | --header 'Content-Type: application/json' \ 3 | --data @test_json/test_data.json \ 4 | --url http://0.0.0.0:5001/predict -------------------------------------------------------------------------------- /013_fastapi_tests_scans/Dockerfile: -------------------------------------------------------------------------------- 1 | # Starting with base image 2 | FROM python:3.9-slim-buster 3 | 4 | # Installing required packages from requirements.txt file 5 | COPY dependencies/requirements.txt / 6 | RUN pip install -r /requirements.txt 7 | 8 | # Copying the FastAPI inference script and model 9 | COPY container/ /container 10 | COPY models/ /models 11 | 12 | # Setting the working directory appropriately 13 | WORKDIR /container 14 | 15 | # Exposing the appropriate port on the container 16 | EXPOSE 5001 17 | 18 | # Setting the entrypoint for the container 19 | ENTRYPOINT ["uvicorn", "--host", "0.0.0.0", "--port", "5001", "api:api"] 20 | -------------------------------------------------------------------------------- /013_fastapi_tests_scans/container/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/container/__init__.py -------------------------------------------------------------------------------- /013_fastapi_tests_scans/container/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/container/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /013_fastapi_tests_scans/container/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/container/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /013_fastapi_tests_scans/container/__pycache__/api.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/container/__pycache__/api.cpython-37.pyc -------------------------------------------------------------------------------- /013_fastapi_tests_scans/container/__pycache__/api.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/container/__pycache__/api.cpython-38.pyc -------------------------------------------------------------------------------- /013_fastapi_tests_scans/container/api.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle 3 | from fastapi import FastAPI, Request 4 | from fastapi.responses import JSONResponse 5 | 6 | 7 | ## API INSTANTIATION 8 | ## ---------------------------------------------------------------- 9 | # Instantiating FastAPI 10 | api = FastAPI() 11 | 12 | # Loading in model from serialized .pkl file 13 | pkl_filename = "../models/iris_model.pkl" 14 | with open(pkl_filename, 'rb') as file: 15 | lr_model = pickle.load(file) 16 | 17 | 18 | 19 | ## API ENDPOINTS 20 | ## ---------------------------------------------------------------- 21 | # Defining a test root path and message 22 | @api.get('/') 23 | def root(): 24 | msg = {'message': 'Hello friends!'} 25 | return JSONResponse(content = msg, status_code = 200) 26 | 27 | 28 | 29 | # Defining the prediction endpoint without data validation 30 | @api.post('/predict') 31 | async def predict(request: Request): 32 | 33 | # Getting the JSON from the body of the request 34 | input_data = await request.json() 35 | 36 | # Converting JSON to Pandas DataFrame 37 | input_df = pd.DataFrame([input_data]) 38 | 39 | # Getting the prediction from the Logistic Regression model 40 | pred = lr_model.predict(input_df)[0] 41 | 42 | return JSONResponse(content = pred, status_code = 200) -------------------------------------------------------------------------------- /013_fastapi_tests_scans/container/start_api.sh: -------------------------------------------------------------------------------- 1 | uvicorn api:api --host 0.0.0.0 --port 5001 --reload -------------------------------------------------------------------------------- /013_fastapi_tests_scans/container/train.py: -------------------------------------------------------------------------------- 1 | # Importing the required Python libraries 2 | import numpy as np 3 | import pandas as pd 4 | import pickle 5 | from sklearn import datasets 6 | from sklearn.linear_model import LogisticRegression 7 | 8 | # Loading the iris dataset from Scikit-Learn 9 | iris = datasets.load_iris() 10 | 11 | # Converting the iris dataset into a Pandas DataFrame 12 | df_iris = pd.DataFrame(data = np.c_[iris['data'], iris['target']], 13 | columns = iris['feature_names'] + ['target']) 14 | 15 | # Separating the training dataset (X) from the predictor value (y) 16 | X = df_iris.drop(columns = ['target']) 17 | y = df_iris[['target']] 18 | 19 | # Instantiating a Logistic Regression (LR) model 20 | lr_model = LogisticRegression() 21 | 22 | # Fitting the dataset to the LR model 23 | lr_model.fit(X, y) 24 | 25 | # Saving the model to a serialized .pkl file 26 | pkl_filename = "../models/iris_model.pkl" 27 | with open(pkl_filename, 'wb') as file: 28 | pickle.dump(lr_model, file) -------------------------------------------------------------------------------- 
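The unit tests driven by run_unit_tests.sh further below are recorded in the .pytest_cache (test_api.py::test_root_message and test_api.py::test_predict), but the test module itself is not captured in this dump. A minimal sketch of what such a module could look like with FastAPI's TestClient; the assertions and payload here are illustrative, not the repository's actual test code:

from fastapi.testclient import TestClient

from container.api import api

# Wrapping the FastAPI app in a test client (no running server required)
client = TestClient(api)

def test_root_message():
    # The root path should return the greeting defined in api.py
    response = client.get('/')
    assert response.status_code == 200
    assert response.json() == {'message': 'Hello friends!'}

def test_predict():
    # Same payload as tests/test_json/test_data.json
    test_data = {'sepal_length': 5.1, 'sepal_width': 3.5,
                 'petal_length': 1.4, 'petal_width': 0.2}
    response = client.post('/predict', json = test_data)
    assert response.status_code == 200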
/013_fastapi_tests_scans/dependencies/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.68.0 2 | numpy==1.21.1 3 | pandas==1.3.1 4 | scikit-learn==0.24.1 5 | uvicorn==0.14.0 -------------------------------------------------------------------------------- /013_fastapi_tests_scans/models/iris_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/models/iris_model.pkl -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/.pytest_cache/v/cache/lastfailed: -------------------------------------------------------------------------------- 1 | { 2 | "test_api.py::TestClient": true 3 | } -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/.pytest_cache/v/cache/nodeids: -------------------------------------------------------------------------------- 1 | [ 2 | "test_api.py::test_predict", 3 | "test_api.py::test_root_message" 4 | ] -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/.pytest_cache/v/cache/stepwise: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/__init__.py -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/__pycache__/test_api.cpython-37-pytest-6.2.4.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/__pycache__/test_api.cpython-37-pytest-6.2.4.pyc -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/curl_scripts/test_predict.sh: -------------------------------------------------------------------------------- 1 | curl --request POST \ 2 | --header 'Content-Type: application/json' \ 3 | --data @../test_json/test_data.json \ 4 | --url http://0.0.0.0:5001/predict -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/performance_testing/__pycache__/locustfile.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/performance_testing/__pycache__/locustfile.cpython-38.pyc -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/performance_testing/locustfile.py: -------------------------------------------------------------------------------- 1 | from locust import HttpUser, task, between 2 | import json 3 | 4 | # Loading the test JSON data 5 | with open('test_json/test_data.json') as f: 6 | test_data = json.loads(f.read()) 7 | 8 | # Creating an API User class inheriting from Locust's HttpUser class 9 | class APIUser(HttpUser): 10 | # Setting the host name and wait_time 11 | host = 'http://localhost:5001' 12 | wait_time = between(3, 5) 13 | 14 | # Defining the post task using the JSON test data 15 | @task() 16 | def predict_endpoint(self): 17 | self.client.post('/predict', json = test_data) -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/reports/dependency_scan_report.txt: -------------------------------------------------------------------------------- 1 | +==============================================================================+ 2 | | | 3 | | /$$$$$$ /$$ | 4 | | /$$__ $$ | $$ | 5 | | /$$$$$$$ /$$$$$$ | $$ \__//$$$$$$ /$$$$$$ /$$ /$$ | 6 | | /$$_____/ |____ $$| $$$$ /$$__ $$|_ $$_/ | $$ | $$ | 7 | | | $$$$$$ /$$$$$$$| $$_/ | $$$$$$$$ | $$ | $$ | $$ | 8 | | \____ $$ /$$__ $$| $$ | $$_____/ | $$ /$$| $$ | $$ | 9 | | /$$$$$$$/| $$$$$$$| $$ | $$$$$$$ | $$$$/| $$$$$$$ | 10 | | |_______/ \_______/|__/ \_______/ \___/ \____ $$ | 11 | | /$$ | $$ | 12 | | | $$$$$$/ | 13 | | by pyup.io \______/ | 14 | | | 15 | +==============================================================================+ 16 | | REPORT | 17 | | checked 5 packages, using free DB (updated once a month) | 18 | +============================+===========+==========================+==========+ 19 | | package | installed | affected | ID | 20 | +============================+===========+==========================+==========+ 21 | | uvicorn | 0.0.4 | <0.11.7 | 38664 | 22 | +==============================================================================+ 23 | | The request logger provided by Uvicorn prior to version 0.11.7 is vulnerable | 24 | | to ASNI escape sequence injection. Whenever any HTTP request is received, | 25 | | the default behaviour of uvicorn is to log its details to either the console | 26 | | or a log file. When attackers request crafted URLs with percent-encoded | 27 | | escape sequences, the logging component will log the URL after it's been | 28 | | processed with urllib.parse.unquote, therefore converting any percent- | 29 | | encoded characters into their single-character equivalent, which can have | 30 | | special meaning in terminal emulators. By requesting URLs with crafted | 31 | | paths, attackers can: * Pollute uvicorn's access logs, therefore | 32 | | jeopardising the integrity of such files. * Use ANSI sequence codes to | 33 | | attempt to interact with the terminal emulator that's displaying the logs | 34 | | (either in real time or from a file). See: CVE-2020-7694. 
| 35 | +==============================================================================+ 36 | | uvicorn | 0.0.4 | <0.11.7 | 38665 | 37 | +==============================================================================+ 38 | | Uvicorn before 0.11.7 is vulnerable to HTTP response splitting. CRLF | 39 | | sequences are not escaped in the value of HTTP headers. Attackers can | 40 | | exploit this to add arbitrary headers to HTTP responses, or even return an | 41 | | arbitrary response body, whenever crafted input is used to construct HTTP | 42 | | headers. See: CVE-2020-7695. | 43 | +==============================================================================+ -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/reports/linter_report.txt: -------------------------------------------------------------------------------- 1 | ************* Module container.api 2 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/api.py:42:0: C0304: Final newline missing (missing-final-newline) 3 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/api.py:1:0: C0114: Missing module docstring (missing-module-docstring) 4 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/api.py:3:0: E0401: Unable to import 'fastapi' (import-error) 5 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/api.py:4:0: E0401: Unable to import 'fastapi.responses' (import-error) 6 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/api.py:13:0: C0103: Constant name "pkl_filename" doesn't conform to UPPER_CASE naming style (invalid-name) 7 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/api.py:23:0: C0116: Missing function or method docstring (missing-function-docstring) 8 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/api.py:31:0: C0116: Missing function or method docstring (missing-function-docstring) 9 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/api.py:2:0: C0411: standard import "import pickle" should be placed before "import pandas as pd" (wrong-import-order) 10 | ************* Module container.train 11 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/train.py:28:0: C0304: Final newline missing (missing-final-newline) 12 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/train.py:28:0: W0311: Bad indentation. Found 1 spaces, expected 4 (bad-indentation) 13 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/train.py:1:0: C0114: Missing module docstring (missing-module-docstring) 14 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/train.py:26:0: C0103: Constant name "pkl_filename" doesn't conform to UPPER_CASE naming style (invalid-name) 15 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/train.py:4:0: C0411: standard import "import pickle" should be placed before "import numpy as np" (wrong-import-order) 16 | 17 | 18 | Report 19 | ====== 20 | 30 statements analysed. 
21 | 22 | Statistics by type 23 | ------------------ 24 | 25 | +---------+-------+-----------+-----------+------------+---------+ 26 | |type |number |old number |difference |%documented |%badname | 27 | +=========+=======+===========+===========+============+=========+ 28 | |module |3 |3 |= |33.33 |0.00 | 29 | +---------+-------+-----------+-----------+------------+---------+ 30 | |class |0 |0 |= |0 |0 | 31 | +---------+-------+-----------+-----------+------------+---------+ 32 | |method |0 |0 |= |0 |0 | 33 | +---------+-------+-----------+-----------+------------+---------+ 34 | |function |2 |2 |= |0.00 |0.00 | 35 | +---------+-------+-----------+-----------+------------+---------+ 36 | 37 | 38 | 39 | External dependencies 40 | --------------------- 41 | :: 42 | 43 | numpy (container.train) 44 | pandas (container.api,container.train) 45 | sklearn 46 | \-datasets (container.train) 47 | \-linear_model (container.train) 48 | 49 | 50 | 51 | Raw metrics 52 | ----------- 53 | 54 | +----------+-------+------+---------+-----------+ 55 | |type |number |% |previous |difference | 56 | +==========+=======+======+=========+===========+ 57 | |code |38 |50.00 |38 |= | 58 | +----------+-------+------+---------+-----------+ 59 | |docstring |0 |0.00 |0 |= | 60 | +----------+-------+------+---------+-----------+ 61 | |comment |18 |23.68 |18 |= | 62 | +----------+-------+------+---------+-----------+ 63 | |empty |20 |26.32 |20 |= | 64 | +----------+-------+------+---------+-----------+ 65 | 66 | 67 | 68 | Duplication 69 | ----------- 70 | 71 | +-------------------------+------+---------+-----------+ 72 | | |now |previous |difference | 73 | +=========================+======+=========+===========+ 74 | |nb duplicated lines |0 |0 |= | 75 | +-------------------------+------+---------+-----------+ 76 | |percent duplicated lines |0.000 |0.000 |= | 77 | +-------------------------+------+---------+-----------+ 78 | 79 | 80 | 81 | Messages by category 82 | -------------------- 83 | 84 | +-----------+-------+---------+-----------+ 85 | |type |number |previous |difference | 86 | +===========+=======+=========+===========+ 87 | |convention |10 |10 |= | 88 | +-----------+-------+---------+-----------+ 89 | |refactor |0 |0 |= | 90 | +-----------+-------+---------+-----------+ 91 | |warning |1 |1 |= | 92 | +-----------+-------+---------+-----------+ 93 | |error |2 |2 |= | 94 | +-----------+-------+---------+-----------+ 95 | 96 | 97 | 98 | % errors / warnings by module 99 | ----------------------------- 100 | 101 | +----------------+-------+--------+---------+-----------+ 102 | |module |error |warning |refactor |convention | 103 | +================+=======+========+=========+===========+ 104 | |container.api |100.00 |0.00 |0.00 |60.00 | 105 | +----------------+-------+--------+---------+-----------+ 106 | |container.train |0.00 |100.00 |0.00 |40.00 | 107 | +----------------+-------+--------+---------+-----------+ 108 | 109 | 110 | 111 | Messages 112 | -------- 113 | 114 | +---------------------------+------------+ 115 | |message id |occurrences | 116 | +===========================+============+ 117 | |wrong-import-order |2 | 118 | +---------------------------+------------+ 119 | |missing-module-docstring |2 | 120 | +---------------------------+------------+ 121 | |missing-function-docstring |2 | 122 | +---------------------------+------------+ 123 | |missing-final-newline |2 | 124 | +---------------------------+------------+ 125 | |invalid-name |2 | 126 | +---------------------------+------------+ 127 | |import-error |2 | 
128 | +---------------------------+------------+ 129 | |bad-indentation |1 | 130 | +---------------------------+------------+ 131 | 132 | 133 | 134 | 135 | ------------------------------------------------------------------ 136 | Your code has been rated at 3.00/10 (previous run: 3.00/10, +0.00) 137 | 138 | -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/reports/performance_test_exceptions.csv: -------------------------------------------------------------------------------- 1 | Count,Message,Traceback,Nodes 2 | -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/reports/performance_test_failures.csv: -------------------------------------------------------------------------------- 1 | Method,Name,Error,Occurrences 2 | POST,/predict,"HTTPConnectionPool(host='localhost', port=5001): Max retries exceeded with url: /predict (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 61] Connection refused'))",112 3 | -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/reports/performance_test_stats.csv: -------------------------------------------------------------------------------- 1 | Type,Name,Request Count,Failure Count,Median Response Time,Average Response Time,Min Response Time,Max Response Time,Average Content Size,Requests/s,Failures/s,50%,66%,75%,80%,90%,95%,98%,99%,99.9%,99.99%,100% 2 | POST,/predict,112,112,6,6.156811776785702,2.4135830000000524,15.114749999999955,0.0,3.883597734176632,3.883597734176632,6,7,7,8,10,11,12,13,15,15,15 3 | ,Aggregated,112,112,6,6.156811776785702,2.4135830000000524,15.114749999999955,0.0,3.883597734176632,3.883597734176632,6,7,7,8,10,11,12,13,15,15,15 4 | -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/reports/performance_test_stats_history.csv: -------------------------------------------------------------------------------- 1 | Timestamp,User Count,Type,Name,Requests/s,Failures/s,50%,66%,75%,80%,90%,95%,98%,99%,99.9%,99.99%,100%,Total Request Count,Total Failure Count,Total Median Response Time,Total Average Response Time,Total Min Response Time,Total Max Response Time,Total Average Content Size 2 | 1628458765,0,,Aggregated,0.000000,0.000000,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0,0,0,0,0,0,0 3 | 1628458766,5,,Aggregated,0.000000,0.000000,11,11,11,11,11,11,11,11,11,11,11,5,5,11,10.770100000000003,10.469167000000057,11.046166000000024,0.0 4 | 1628458767,10,,Aggregated,0.000000,0.000000,11,11,11,11,12,12,12,12,12,12,12,10,10,11,10.620950000000006,8.912667000000152,11.83004199999993,0.0 5 | 1628458768,15,,Aggregated,0.000000,0.000000,11,11,12,12,13,15,15,15,15,15,15,15,15,11,11.227561066666707,8.912667000000152,15.114749999999955,0.0 6 | 1628458769,15,,Aggregated,5.000000,5.000000,11,11,12,12,13,15,15,15,15,15,15,19,19,11,10.092164473684207,4.092667000000105,15.114749999999955,0.0 7 | 1628458770,15,,Aggregated,5.000000,5.000000,10,11,11,12,12,13,15,15,15,15,15,24,24,10,8.92718575000004,2.4135830000000524,15.114749999999955,0.0 8 | 1628458771,15,,Aggregated,4.250000,4.250000,10,11,11,11,12,13,15,15,15,15,15,27,27,10,8.581003111111146,2.4135830000000524,15.114749999999955,0.0 9 | 1628458772,15,,Aggregated,4.400000,4.400000,7,11,11,11,12,13,15,15,15,15,15,31,31,7,8.185465064516132,2.4135830000000524,15.114749999999955,0.0 10 | 
1628458773,15,,Aggregated,4.166667,4.166667,7,10,11,11,12,13,15,15,15,15,15,33,33,7,8.123130060606037,2.4135830000000524,15.114749999999955,0.0 11 | 1628458774,15,,Aggregated,4.142857,4.142857,7,10,11,11,12,13,15,15,15,15,15,38,38,7,7.933628289473686,2.4135830000000524,15.114749999999955,0.0 12 | 1628458775,15,,Aggregated,4.000000,4.000000,7,9,11,11,12,12,15,15,15,15,15,42,42,7,7.679332333333274,2.4135830000000524,15.114749999999955,0.0 13 | 1628458776,15,,Aggregated,4.000000,4.000000,7,9,11,11,12,12,15,15,15,15,15,44,44,6,7.511122159090815,2.4135830000000524,15.114749999999955,0.0 14 | 1628458777,15,,Aggregated,4.000000,4.000000,7,8,10,11,12,12,15,15,15,15,15,48,48,6,7.331673583333242,2.4135830000000524,15.114749999999955,0.0 15 | 1628458778,15,,Aggregated,3.800000,3.800000,6,7,10,11,11,12,13,15,15,15,15,54,54,6,7.058474518518427,2.4135830000000524,15.114749999999955,0.0 16 | 1628458779,15,,Aggregated,3.800000,3.800000,6,7,10,10,11,12,13,15,15,15,15,56,56,6,6.985906232142781,2.4135830000000524,15.114749999999955,0.0 17 | 1628458780,15,,Aggregated,3.800000,3.800000,6,7,8,10,11,12,13,15,15,15,15,61,61,6,6.849107213114675,2.4135830000000524,15.114749999999955,0.0 18 | 1628458781,15,,Aggregated,3.900000,3.900000,6,7,8,10,11,12,13,15,15,15,15,63,63,6,6.834124984126912,2.4135830000000524,15.114749999999955,0.0 19 | 1628458782,15,,Aggregated,3.700000,3.700000,6,7,8,10,11,12,13,15,15,15,15,68,68,6,6.756037970588114,2.4135830000000524,15.114749999999955,0.0 20 | 1628458783,15,,Aggregated,3.700000,3.700000,6,7,8,10,11,12,13,15,15,15,15,70,70,6,6.763961285714137,2.4135830000000524,15.114749999999955,0.0 21 | 1628458784,15,,Aggregated,3.700000,3.700000,6,7,8,9,11,12,13,15,15,15,15,74,74,6,6.658054594594409,2.4135830000000524,15.114749999999955,0.0 22 | 1628458785,15,,Aggregated,3.700000,3.700000,6,7,8,9,11,12,13,15,15,15,15,80,80,6,6.586210399999874,2.4135830000000524,15.114749999999955,0.0 23 | -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/reports/static_scan_report.txt: -------------------------------------------------------------------------------- 1 | Run started:2021-08-08 21:40:22.468129 2 | 3 | Test results: 4 | >> Issue: [B403:blacklist] Consider possible security implications associated with pickle module. 5 | Severity: Low Confidence: High 6 | Location: ../container/api.py:2 7 | More Info: https://bandit.readthedocs.io/en/latest/blacklists/blacklist_imports.html#b403-import-pickle 8 | 1 import pandas as pd 9 | 2 import pickle 10 | 3 from fastapi import FastAPI, Request 11 | 12 | -------------------------------------------------- 13 | >> Issue: [B301:blacklist] Pickle and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue. 14 | Severity: Medium Confidence: High 15 | Location: ../container/api.py:15 16 | More Info: https://bandit.readthedocs.io/en/latest/blacklists/blacklist_calls.html#b301-pickle 17 | 14 with open(pkl_filename, 'rb') as file: 18 | 15 lr_model = pickle.load(file) 19 | 16 20 | 21 | -------------------------------------------------- 22 | >> Issue: [B403:blacklist] Consider possible security implications associated with pickle module. 
23 | Severity: Low Confidence: High 24 | Location: ../container/train.py:4 25 | More Info: https://bandit.readthedocs.io/en/latest/blacklists/blacklist_imports.html#b403-import-pickle 26 | 3 import pandas as pd 27 | 4 import pickle 28 | 5 from sklearn import datasets 29 | 30 | -------------------------------------------------- 31 | 32 | Code scanned: 33 | Total lines of code: 33 34 | Total lines skipped (#nosec): 0 35 | 36 | Run metrics: 37 | Total issues (by severity): 38 | Undefined: 0.0 39 | Low: 2.0 40 | Medium: 1.0 41 | High: 0.0 42 | Total issues (by confidence): 43 | Undefined: 0.0 44 | Low: 0.0 45 | Medium: 0.0 46 | High: 3.0 47 | Files skipped (0): 48 | -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/reports/unit_test_report.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/reports/unit_test_report.txt -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/run_all_tests.sh: -------------------------------------------------------------------------------- 1 | docker build -t iris-api:dev -f ../Dockerfile .. 2 | export CONTAINER_ID=$(docker run -d -p 5001:5001 iris-api:dev) 3 | bash run_container_scan.sh 4 | bash run_dependency_scan.sh 5 | bash run_linter.sh 6 | bash run_perf_test.sh 7 | bash run_static_scan.sh 8 | bash run_unit_tests.sh 9 | docker stop $CONTAINER_ID -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/run_container_scan.sh: -------------------------------------------------------------------------------- 1 | docker scan iris-api:dev | tee reports/container_scan_results.txt -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/run_dependency_scan.sh: -------------------------------------------------------------------------------- 1 | safety check -r ../dependencies/requirements.txt --full-report -o reports/dependency_scan_report.txt -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/run_linter.sh: -------------------------------------------------------------------------------- 1 | pylint ../container/ --reports=y --output=reports/linter_report.txt -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/run_perf_test.sh: -------------------------------------------------------------------------------- 1 | locust --locustfile performance_testing/locustfile.py --headless --users 15 --spawn-rate 5 --run-time 30s --only-summary --csv reports/performance_test -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/run_static_scan.sh: -------------------------------------------------------------------------------- 1 | bandit --format=txt --output=reports/static_scan_report.txt -r ../container/ -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/run_unit_tests.sh: -------------------------------------------------------------------------------- 1 | pytest --log-file=reports/unit_test_report.txt unit_testing/ -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/test_json/bad_data.json:
-------------------------------------------------------------------------------- 1 | {"sepal_length":"dkhundley","sepal_width":3.5,"petal_length":1.4,"petal_width":0.2} -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/test_json/test_data.json: -------------------------------------------------------------------------------- 1 | {"sepal_length":5.1,"sepal_width":3.5,"petal_length":1.4,"petal_width":0.2} -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/unit_testing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/unit_testing/__init__.py -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/unit_testing/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/unit_testing/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/unit_testing/__pycache__/test_api.cpython-38-pytest-6.2.3.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/unit_testing/__pycache__/test_api.cpython-38-pytest-6.2.3.pyc -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/unit_testing/__pycache__/test_api.cpython-38-pytest-6.2.4.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/unit_testing/__pycache__/test_api.cpython-38-pytest-6.2.4.pyc -------------------------------------------------------------------------------- /013_fastapi_tests_scans/tests/unit_testing/test_api.py: -------------------------------------------------------------------------------- 1 | import json 2 | from fastapi.testclient import TestClient 3 | from container.api import api 4 | 5 | 6 | 7 | ## PYTEST SETUP 8 | ## -------------------------------------------------------------------------------------------------------------------- 9 | # Instantiating the test client from our container's API 10 | client = TestClient(api) 11 | 12 | # Loading test JSON file 13 | with open('test_json/test_data.json', 'rb') as file: 14 | test_json = json.load(file) 15 | 16 | 17 | 18 | ## UNIT TEST CASES 19 | ## -------------------------------------------------------------------------------------------------------------------- 20 | # Creating a unit test for the basic root path 21 | def test_root_message(): 22 | response = client.get("/") 23 | assert response.status_code == 200 24 | assert response.json() == {'message': 'Hello friend!'} 25 | 26 | # Creating a unit test for the prediction endpoint 27 | def test_predict(): 28 | response = client.post('/predict', json = test_json) 29 | assert response.status_code == 200 -------------------------------------------------------------------------------- 
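One gap worth noting in the unit tests above: `tests/test_json/bad_data.json` ships with the repo (its `sepal_length` is the string `"dkhundley"` rather than a float), but neither test in `test_api.py` actually exercises it. Below is a minimal sketch of such a negative test; the test name, the `bad_json` variable, and the asserted status code are illustrative assumptions, since the exact response to malformed input depends on how `container/api.py` validates its payload.

```python
# Hypothetical negative test (not part of the repo); assumes the `client`
# TestClient instantiated in test_api.py above is in scope.
import json

# bad_data.json deliberately passes a string where a float is expected
with open('test_json/bad_data.json', 'rb') as file:
    bad_json = json.load(file)

def test_predict_bad_data():
    response = client.post('/predict', json = bad_json)
    # The exact status code depends on how container/api.py handles bad
    # input; at minimum, malformed data should not crash the server.
    assert response.status_code != 500
```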
/014_kfolds_validation/notebooks/.ipynb_checkpoints/kfolds-validation-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5f86ded3", 6 | "metadata": {}, 7 | "source": [ 8 | "# K-Folds Validation\n", 9 | "As part of this notebook, we will be exploring how to make efficient use of small datasets by utilizing **k-folds validation**. K-folds validation splits a training dataset into multiple small batches. One of these batches is reserved as the validation dataset while the model is trained on the remaining batches, and the process repeats until every batch has served once as the validation set." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "id": "863c0f2e", 15 | "metadata": {}, 16 | "source": [ 17 | "## Project Setup" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 29, 23 | "id": "834b1058", 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# Importing the necessary Python libraries\n", 28 | "import numpy as np\n", 29 | "import pandas as pd\n", 30 | "from sklearn import datasets\n", 31 | "from sklearn.model_selection import train_test_split\n", 32 | "from sklearn.ensemble import RandomForestClassifier\n", 33 | "from sklearn.metrics import accuracy_score, confusion_matrix" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "id": "1aaca1cb", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Getting the Iris dataset from Scikit-Learn\n", 44 | "iris = datasets.load_iris()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 15, 50 | "id": "86e7124e", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# Loading the predictor value (y) and remainder of the training dataset (X) as Pandas DataFrames\n", 55 | "X = pd.DataFrame(data = iris['data'], columns = iris['feature_names'])\n", 56 | "y = pd.DataFrame(data = iris['target'], columns = ['target'])" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "552f0963", 62 | "metadata": {}, 63 | "source": [ 64 | "## Performing a Typical Split\n", 65 | "Before we jump into how we perform k-folds validation, let's do a quick refresher on how we typically split our dataset using a traditional `train_test_split`. Then we'll later contrast this method with k-folds validation." 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 18, 71 | "id": "22eb9945", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# Performing a train_test_split on the dataset\n", 76 | "X_train, X_val, y_train, y_val = train_test_split(X, y)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 21, 82 | "id": "07fd229f", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "# Instantiating a RandomForestClassifier model\n", 87 | "rfc_model = RandomForestClassifier()" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 22, 93 | "id": "09bffb52", 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "name": "stderr", 98 | "output_type": "stream", 99 | "text": [ 100 | ":2: DataConversionWarning: A column-vector y was passed when a 1d array was expected.
Please change the shape of y to (n_samples,), for example using ravel().\n", 101 | " rfc_model.fit(X_train, y_train)\n" 102 | ] 103 | }, 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "RandomForestClassifier()" 108 | ] 109 | }, 110 | "execution_count": 22, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "# Fitting the X_train and y_train datasets to the RandomForestClassifier model\n", 117 | "rfc_model.fit(X_train, y_train)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 23, 123 | "id": "53fb4d46", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "# Getting inferential predictions for the validation dataset\n", 128 | "val_preds = rfc_model.predict(X_val)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 28, 134 | "id": "041535cf", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# Generating validation metrics by comparing the inferential predictions (val_preds) to the actuals (y_val)\n", 139 | "val_accuracy = accuracy_score(y_val, val_preds)\n", 140 | "val_confusion_matrix = confusion_matrix(y_val, val_preds)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 30, 146 | "id": "343f7364", 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "Accuracy Score: 0.9210526315789473\n", 154 | "Confusion Matrix: \n", 155 | "[[14 0 0]\n", 156 | " [ 0 7 0]\n", 157 | " [ 0 3 14]]\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "# Printing out the validation metrics\n", 163 | "print(f'Accuracy Score: {val_accuracy}')\n", 164 | "print(f'Confusion Matrix: \\n{val_confusion_matrix}')" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "id": "36bd650e", 170 | "metadata": {}, 171 | "source": [ 172 | "## Training with K-Folds Validation\n", 173 | "Now that we have performed a very basic model training using a traditional `train_test_split`, we are now ready to perform a training using k-folds validation." 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "id": "bdfc755e", 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [] 183 | } 184 | ], 185 | "metadata": { 186 | "kernelspec": { 187 | "display_name": "Python 3", 188 | "language": "python", 189 | "name": "python3" 190 | }, 191 | "language_info": { 192 | "codemirror_mode": { 193 | "name": "ipython", 194 | "version": 3 195 | }, 196 | "file_extension": ".py", 197 | "mimetype": "text/x-python", 198 | "name": "python", 199 | "nbconvert_exporter": "python", 200 | "pygments_lexer": "ipython3", 201 | "version": "3.8.8" 202 | } 203 | }, 204 | "nbformat": 4, 205 | "nbformat_minor": 5 206 | } 207 | -------------------------------------------------------------------------------- /014_kfolds_validation/notebooks/kfolds-validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5f86ded3", 6 | "metadata": {}, 7 | "source": [ 8 | "# K-Folds Validation\n", 9 | "As part of this notebook, we will be exploring how to make efficient use of small datasets by utilizing **k-folds validation**. K-folds validation splits a training dataset into multiple small batches. 
One of these batches is reserved as the validation dataset while the model is trained on the remaining batches, and the process repeats until every batch has served once as the validation set." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "id": "863c0f2e", 15 | "metadata": {}, 16 | "source": [ 17 | "## Project Setup" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "id": "834b1058", 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# Importing the necessary Python libraries\n", 28 | "import numpy as np\n", 29 | "import pandas as pd\n", 30 | "from sklearn import datasets\n", 31 | "from sklearn.model_selection import train_test_split, KFold\n", 32 | "from sklearn.ensemble import RandomForestClassifier\n", 33 | "from sklearn.metrics import accuracy_score, confusion_matrix\n", 34 | "\n", 35 | "import warnings\n", 36 | "warnings.filterwarnings('ignore')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "id": "1aaca1cb", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# Getting the Iris dataset from Scikit-Learn\n", 47 | "iris = datasets.load_iris()" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "id": "86e7124e", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# Loading the predictor value (y) and remainder of the training dataset (X) as Pandas DataFrames\n", 58 | "X = pd.DataFrame(data = iris['data'], columns = iris['feature_names'])\n", 59 | "y = pd.DataFrame(data = iris['target'], columns = ['target'])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "id": "552f0963", 65 | "metadata": {}, 66 | "source": [ 67 | "## Performing a Typical Split\n", 68 | "Before we jump into how we perform k-folds validation, let's do a quick refresher on how we typically split our dataset using a traditional `train_test_split`. Then we'll later contrast this method with k-folds validation."
69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "id": "22eb9945", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "# Performing a train_test_split on the dataset\n", 79 | "X_train, X_val, y_train, y_val = train_test_split(X, y)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 5, 85 | "id": "07fd229f", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# Instantiating a RandomForestClassifier model\n", 90 | "rfc_model = RandomForestClassifier()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 6, 96 | "id": "09bffb52", 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/plain": [ 102 | "RandomForestClassifier()" 103 | ] 104 | }, 105 | "execution_count": 6, 106 | "metadata": {}, 107 | "output_type": "execute_result" 108 | } 109 | ], 110 | "source": [ 111 | "# Fitting the X_train and y_train datasets to the RandomForestClassifier model\n", 112 | "rfc_model.fit(X_train, y_train)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "id": "53fb4d46", 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "# Getting inferential predictions for the validation dataset\n", 123 | "val_preds = rfc_model.predict(X_val)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 8, 129 | "id": "041535cf", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "# Generating validation metrics by comparing the inferential predictions (val_preds) to the actuals (y_val)\n", 134 | "val_accuracy = accuracy_score(y_val, val_preds)\n", 135 | "val_confusion_matrix = confusion_matrix(y_val, val_preds)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 9, 141 | "id": "343f7364", 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "Accuracy Score: 0.9210526315789473\n", 149 | "Confusion Matrix: \n", 150 | "[[12 0 0]\n", 151 | " [ 0 11 0]\n", 152 | " [ 0 3 12]]\n" 153 | ] 154 | } 155 | ], 156 | "source": [ 157 | "# Printing out the validation metrics\n", 158 | "print(f'Accuracy Score: {val_accuracy}')\n", 159 | "print(f'Confusion Matrix: \\n{val_confusion_matrix}')" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "id": "36bd650e", 165 | "metadata": {}, 166 | "source": [ 167 | "## Training with K-Folds Validation\n", 168 | "Now that we have performed a very basic model training using a traditional `train_test_split`, we are now ready to perform a training using k-folds validation." 
169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 10, 174 | "id": "bdfc755e", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "# Instantiating the K-Fold cross validation object with 5 folds\n", 179 | "k_folds = KFold(n_splits = 5, shuffle = True, random_state = 42)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 11, 185 | "id": "de62dc1a", 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "Accuracy Score: 1.0\n", 193 | "Confusion Matrix: \n", 194 | "[[10 0 0]\n", 195 | " [ 0 9 0]\n", 196 | " [ 0 0 11]]\n", 197 | "Accuracy Score: 0.9666666666666667\n", 198 | "Confusion Matrix: \n", 199 | "[[13 0 0]\n", 200 | " [ 0 10 0]\n", 201 | " [ 0 1 6]]\n", 202 | "Accuracy Score: 0.9333333333333333\n", 203 | "Confusion Matrix: \n", 204 | "[[12 0 0]\n", 205 | " [ 0 8 2]\n", 206 | " [ 0 0 8]]\n", 207 | "Accuracy Score: 0.9333333333333333\n", 208 | "Confusion Matrix: \n", 209 | "[[ 8 0 0]\n", 210 | " [ 0 9 1]\n", 211 | " [ 0 1 11]]\n", 212 | "Accuracy Score: 0.9666666666666667\n", 213 | "Confusion Matrix: \n", 214 | "[[ 7 0 0]\n", 215 | " [ 0 11 0]\n", 216 | " [ 0 1 11]]\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "# Iterating through each of the folds in K-Folds\n", 222 | "for train_index, val_index in k_folds.split(X):\n", 223 | " \n", 224 | " # Splitting the training set from the validation set for this specific fold\n", 225 | " X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]\n", 226 | " y_train, y_val = y.iloc[train_index], y.iloc[val_index]\n", 227 | " \n", 228 | " # Instantiating a RandomForestClassifier model\n", 229 | " rfc_model = RandomForestClassifier()\n", 230 | " \n", 231 | " # Fitting the X_train and y_train datasets to the RandomForestClassifier model\n", 232 | " rfc_model.fit(X_train, y_train)\n", 233 | " \n", 234 | " # Getting inferential predictions for the validation dataset\n", 235 | " val_preds = rfc_model.predict(X_val)\n", 236 | " \n", 237 | " # Generating validation metrics by comparing the inferential predictions (val_preds) to the actuals (y_val)\n", 238 | " val_accuracy = accuracy_score(y_val, val_preds)\n", 239 | " val_confusion_matrix = confusion_matrix(y_val, val_preds)\n", 240 | " \n", 241 | " # Printing out the validation metrics\n", 242 | " print(f'Accuracy Score: {val_accuracy}')\n", 243 | " print(f'Confusion Matrix: \\n{val_confusion_matrix}')" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "id": "46e23280", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [] 253 | } 254 | ], 255 | "metadata": { 256 | "kernelspec": { 257 | "display_name": "Python 3 (ipykernel)", 258 | "language": "python", 259 | "name": "python3" 260 | }, 261 | "language_info": { 262 | "codemirror_mode": { 263 | "name": "ipython", 264 | "version": 3 265 | }, 266 | "file_extension": ".py", 267 | "mimetype": "text/x-python", 268 | "name": "python", 269 | "nbconvert_exporter": "python", 270 | "pygments_lexer": "ipython3", 271 | "version": "3.10.1" 272 | } 273 | }, 274 | "nbformat": 4, 275 | "nbformat_minor": 5 276 | } 277 | -------------------------------------------------------------------------------- /015_synthesizing_test_data/notebooks/synthesizing_test_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "314c6067", 6 | "metadata": {}, 7 | "source": [ 8 | "# Data Science 
Quick Tip #015: Synthesizing Your Own Test Data\n", 9 | "In this notebook, we'll be sharing how to synthesize your own test data. We will cover how to synthesize data for three use cases: binary classification, multiclass classification, and regression." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "id": "c7f25f02", 15 | "metadata": {}, 16 | "source": [ 17 | "## Project Setup" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "id": "70806b94", 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# Importing the required Python libraries\n", 28 | "import pandas as pd\n", 29 | "from sklearn.datasets import make_blobs, make_classification, make_regression\n", 30 | "from sklearn.model_selection import train_test_split\n", 31 | "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, mean_absolute_error, mean_squared_error, r2_score\n", 32 | "from sklearn.ensemble import RandomForestClassifier\n", 33 | "from sklearn.linear_model import LogisticRegression" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "id": "4915e523", 39 | "metadata": {}, 40 | "source": [ 41 | "## Use Case #1: Binary Classification" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 19, 47 | "id": "3c1034be", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "# Generating synthetic binary classification data in the form of arrays\n", 52 | "X, y = make_classification(n_samples = 10000,\n", 53 | " n_features = 25,\n", 54 | " n_informative = 10,\n", 55 | " n_redundant = 10,\n", 56 | " n_repeated = 5,\n", 57 | " n_classes = 2,\n", 58 | " weights = [0.6, 0.4])" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 20, 64 | "id": "7160aa2d", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# Transforming the arrays into Pandas DataFrames\n", 69 | "df_X = pd.DataFrame(data = X)\n", 70 | "df_y = pd.DataFrame(data = y, columns = ['target'])" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 21, 76 | "id": "2a422c2b", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# Performing a split on the data to set aside a holdout validation set\n", 81 | "X_train, X_val, y_train, y_val = train_test_split(df_X, df_y)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 22, 87 | "id": "32ea8505", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# Instantiating the binary classification model with the RandomForestClassifier algorithm\n", 92 | "binary_classification_model = RandomForestClassifier()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 23, 98 | "id": "20f39ca5", 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stderr", 103 | "output_type": "stream", 104 | "text": [ 105 | "C:\Users\david\AppData\Local\Temp/ipykernel_7920/4251291611.py:2: DataConversionWarning: A column-vector y was passed when a 1d array was expected.
Please change the shape of y to (n_samples,), for example using ravel().\n", 106 | " binary_classification_model.fit(X_train, y_train)\n" 107 | ] 108 | }, 109 | { 110 | "data": { 111 | "text/plain": [ 112 | "RandomForestClassifier()" 113 | ] 114 | }, 115 | "execution_count": 23, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "# Training the binary classification model against the training data\n", 122 | "binary_classification_model.fit(X_train, y_train)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 24, 128 | "id": "08a870a4", 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "# Getting inferential predictions from the validation dataset\n", 133 | "val_preds = binary_classification_model.predict(X_val)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 25, 139 | "id": "7214f419", 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# Generating validation metrics by comparing the inferential predictions (val_preds) to the actuals (y_val)\n", 144 | "val_accuracy = accuracy_score(y_val, val_preds)\n", 145 | "val_roc_auc_score = roc_auc_score(y_val, val_preds)\n", 146 | "val_f1_score = f1_score(y_val, val_preds)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 26, 152 | "id": "f1a61d57", 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "Accuracy score: 0.9484\n", 160 | "ROC AUC score: 0.9330565646081993\n", 161 | "F1 score: 0.9330565646081993\n" 162 | ] 163 | } 164 | ], 165 | "source": [ 166 | "# Printing out the validation metrics\n", 167 | "print(f'Accuracy score: {val_accuracy}')\n", 168 | "print(f'ROC AUC score: {val_roc_auc_score}')\n", 169 | "print(f'F1 score: {val_f1_score}')" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "id": "e0645cd9", 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [] 179 | } 180 | ], 181 | "metadata": { 182 | "kernelspec": { 183 | "display_name": "Python 3 (ipykernel)", 184 | "language": "python", 185 | "name": "python3" 186 | }, 187 | "language_info": { 188 | "codemirror_mode": { 189 | "name": "ipython", 190 | "version": 3 191 | }, 192 | "file_extension": ".py", 193 | "mimetype": "text/x-python", 194 | "name": "python", 195 | "nbconvert_exporter": "python", 196 | "pygments_lexer": "ipython3", 197 | "version": "3.10.1" 198 | } 199 | }, 200 | "nbformat": 4, 201 | "nbformat_minor": 5 202 | } 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Science Quick Tips Repository! 2 | This repository contains all the code associated with each of my "Data Science Quick Tips" posts. Feel free to use as you please! --------------------------------------------------------------------------------
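The intro cell of `synthesizing_test_data.ipynb` promises three use cases, but the notebook as captured above stops after binary classification. For completeness, here is a minimal sketch of how the remaining two use cases could look with the same scikit-learn helpers the notebook already imports; the model choices, parameter values, and variable names below are illustrative assumptions, not code from the repo.

```python
# Hypothetical continuation of synthesizing_test_data.ipynb (not part of the repo)
from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, r2_score

# Use case #2: multiclass classification (3 classes instead of 2)
X, y = make_classification(n_samples = 10000, n_features = 25, n_informative = 10,
                           n_redundant = 10, n_repeated = 5, n_classes = 3)
X_train, X_val, y_train, y_val = train_test_split(X, y)
multiclass_model = RandomForestClassifier().fit(X_train, y_train)
val_preds = multiclass_model.predict(X_val)
print(f'Accuracy score: {accuracy_score(y_val, val_preds)}')
# f1_score needs an explicit averaging strategy once there are more than 2 classes
print(f'Macro F1 score: {f1_score(y_val, val_preds, average = "macro")}')

# Use case #3: regression
X, y = make_regression(n_samples = 10000, n_features = 25, n_informative = 10, noise = 10.0)
X_train, X_val, y_train, y_val = train_test_split(X, y)
regression_model = LinearRegression().fit(X_train, y_train)
val_preds = regression_model.predict(X_val)
print(f'Mean absolute error: {mean_absolute_error(y_val, val_preds)}')
print(f'R2 score: {r2_score(y_val, val_preds)}')
```

Note that `roc_auc_score` is omitted from the multiclass sketch on purpose: for more than two classes it requires predicted probabilities and a `multi_class` strategy rather than the hard labels used here.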