├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── ds-quick-tips.iml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── 001_reverse_ohe
│   ├── .ipynb_checkpoints
│   │   └── reverse_ohe-checkpoint.ipynb
│   └── reverse_ohe.ipynb
├── 002_flask_cronjob
│   ├── README.md
│   └── app.py
├── 003_sklearn_pipelines
│   ├── .ipynb_checkpoints
│   │   └── titanic_pipeline-checkpoint.ipynb
│   ├── model
│   │   └── rfc_pipeline.pkl
│   └── titanic_pipeline.ipynb
├── 004_pipeline_custom_transformers
│   ├── .ipynb_checkpoints
│   │   └── titanic_custom_pipeline-checkpoint.ipynb
│   ├── model
│   │   └── rfc_pipeline.pkl
│   └── titanic_custom_pipeline.ipynb
├── 005_two_ways_to_ohe
│   ├── .ipynb_checkpoints
│   │   └── two_ways_to_ohe-checkpoint.ipynb
│   ├── ce_ohe.pkl
│   ├── sklearn_ohe.pkl
│   └── two_ways_to_ohe.ipynb
├── 006_shap_lime
│   ├── .ipynb_checkpoints
│   │   └── titanic_shap_lime-checkpoint.ipynb
│   └── titanic_shap_lime.ipynb
├── 007_performance_testing_locust
│   ├── __pycache__
│   │   └── locustfile.cpython-37.pyc
│   ├── api
│   │   ├── __pycache__
│   │   │   ├── api.cpython-37.pyc
│   │   │   └── helpers.cpython-37.pyc
│   │   ├── api.py
│   │   ├── helpers.py
│   │   └── run.sh
│   ├── locustfile.py
│   ├── model
│   │   └── rfc_pipeline.pkl
│   └── test_data
│       ├── test_1.json
│       ├── test_2.json
│       └── tests.sh
├── 008_mlflow_getting_started
│   ├── .ipynb_checkpoints
│   │   └── mlflow_wine_notebook-checkpoint.ipynb
│   ├── mlflow-existing-model.py
│   ├── mlflow-wine.py
│   ├── mlflow_wine_notebook.ipynb
│   ├── mlruns
│   │   └── 0
│   │       ├── 09fa7bc156ff4d59b4b00b8fdbe84728
│   │       │   ├── artifacts
│   │       │   │   └── model
│   │       │   │       ├── MLmodel
│   │       │   │       ├── conda.yaml
│   │       │   │       └── model.pkl
│   │       │   ├── meta.yaml
│   │       │   ├── metrics
│   │       │   │   ├── abs_error
│   │       │   │   ├── r2
│   │       │   │   └── rmse
│   │       │   ├── params
│   │       │   │   ├── alpha
│   │       │   │   └── l1_ratio
│   │       │   └── tags
│   │       │       ├── mlflow.log-model.history
│   │       │       ├── mlflow.source.name
│   │       │       ├── mlflow.source.type
│   │       │       └── mlflow.user
│   │       ├── 27398db7fb544a269a0c85ec637bbab9
│   │       │   ├── artifacts
│   │       │   │   └── model
│   │       │   │       ├── MLmodel
│   │       │   │       ├── conda.yaml
│   │       │   │       └── model.pkl
│   │       │   ├── meta.yaml
│   │       │   └── tags
│   │       │       ├── mlflow.log-model.history
│   │       │       ├── mlflow.source.git.commit
│   │       │       ├── mlflow.source.name
│   │       │       ├── mlflow.source.type
│   │       │       └── mlflow.user
│   │       ├── 5a2bf3f0cb504b40ac9cd9a70af32ac6
│   │       │   ├── artifacts
│   │       │   │   └── model
│   │       │   │       ├── MLmodel
│   │       │   │       ├── conda.yaml
│   │       │   │       └── model.pkl
│   │       │   ├── meta.yaml
│   │       │   ├── metrics
│   │       │   │   ├── abs_error
│   │       │   │   ├── r2
│   │       │   │   └── rmse
│   │       │   ├── params
│   │       │   │   ├── alpha
│   │       │   │   └── l1_ratio
│   │       │   └── tags
│   │       │       ├── mlflow.log-model.history
│   │       │       ├── mlflow.source.name
│   │       │       ├── mlflow.source.type
│   │       │       └── mlflow.user
│   │       ├── 93cfbd77d77f4e308297d9a47ea3abd6
│   │       │   ├── artifacts
│   │       │   │   └── model
│   │       │   │       ├── MLmodel
│   │       │   │       ├── conda.yaml
│   │       │   │       └── model.pkl
│   │       │   ├── meta.yaml
│   │       │   ├── metrics
│   │       │   │   ├── abs_error
│   │       │   │   ├── r2
│   │       │   │   └── rmse
│   │       │   ├── params
│   │       │   │   ├── alpha
│   │       │   │   └── l1_ratio
│   │       │   └── tags
│   │       │       ├── mlflow.log-model.history
│   │       │       ├── mlflow.source.git.commit
│   │       │       ├── mlflow.source.name
│   │       │       ├── mlflow.source.type
│   │       │       └── mlflow.user
│   │       ├── befa6150910e4724b1248ee939971dc2
│   │       │   ├── artifacts
│   │       │   │   └── model
│   │       │   │       ├── MLmodel
│   │       │   │       ├── conda.yaml
│   │       │   │       └── model.pkl
│   │       │   ├── meta.yaml
│   │       │   ├── metrics
│   │       │   │   ├── abs_error
│   │       │   │   ├── r2
│   │       │   │   └── rmse
│   │       │   ├── params
│   │       │   │   ├── alpha
│   │       │   │   └── l1_ratio
│   │       │   └── tags
│   │       │       ├── mlflow.log-model.history
│   │       │       ├── mlflow.source.git.commit
│   │       │       ├── mlflow.source.name
│   │       │       ├── mlflow.source.type
│   │       │       └── mlflow.user
│   │       └── meta.yaml
│   └── model
│       └── model.pkl
├── 009_mlflow_tracking_server
│   ├── Dockerfile
│   └── k8s
│       ├── mlflow_deployment.yaml
│       ├── mlflow_minio.yaml
│       └── mlflow_postgres.yaml
├── 010_mlflow_logging_to_server
│   ├── .ipynb_checkpoints
│   │   └── Untitled-checkpoint.ipynb
│   └── mlflow-wine.py
├── 011_mlflow_interacting_with_client
│   ├── .ipynb_checkpoints
│   │   └── MLflow_client_interaction-checkpoint.ipynb
│   ├── MLflow_client_interaction.ipynb
│   └── mlruns
│       └── 0
│           └── meta.yaml
├── 012_dockerizing_fastapi
│   ├── Dockerfile
│   ├── container
│   │   ├── api.py
│   │   ├── start_api.sh
│   │   └── train.py
│   ├── dependencies
│   │   └── requirements.txt
│   ├── k8s
│   │   └── deployment.yaml
│   ├── model
│   │   └── iris_model.pkl
│   ├── notebooks
│   │   ├── .ipynb_checkpoints
│   │   │   └── iris_model_creation-checkpoint.ipynb
│   │   └── iris_model_creation.ipynb
│   └── tests
│       ├── test_bad_predict.sh
│       ├── test_basic_predict.sh
│       ├── test_json
│       │   ├── bad_data.json
│       │   └── test_data.json
│       └── test_predict.sh
├── 013_fastapi_tests_scans
│   ├── Dockerfile
│   ├── container
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   ├── __init__.cpython-38.pyc
│   │   │   ├── api.cpython-37.pyc
│   │   │   └── api.cpython-38.pyc
│   │   ├── api.py
│   │   ├── start_api.sh
│   │   └── train.py
│   ├── dependencies
│   │   └── requirements.txt
│   ├── models
│   │   └── iris_model.pkl
│   └── tests
│       ├── .pytest_cache
│       │   └── v
│       │       └── cache
│       │           ├── lastfailed
│       │           ├── nodeids
│       │           └── stepwise
│       ├── __init__.py
│       ├── __pycache__
│       │   ├── __init__.cpython-37.pyc
│       │   ├── __init__.cpython-38.pyc
│       │   └── test_api.cpython-37-pytest-6.2.4.pyc
│       ├── curl_scripts
│       │   └── test_predict.sh
│       ├── performance_testing
│       │   ├── __pycache__
│       │   │   └── locustfile.cpython-38.pyc
│       │   └── locustfile.py
│       ├── reports
│       │   ├── container_scan_results.txt
│       │   ├── dependency_scan_report.txt
│       │   ├── linter_report.txt
│       │   ├── performance_test_exceptions.csv
│       │   ├── performance_test_failures.csv
│       │   ├── performance_test_stats.csv
│       │   ├── performance_test_stats_history.csv
│       │   ├── static_scan_report.txt
│       │   └── unit_test_report.txt
│       ├── run_all_tests.sh
│       ├── run_container_scan.sh
│       ├── run_dependency_scan.sh
│       ├── run_linter.sh
│       ├── run_perf_test.sh
│       ├── run_static_scan.sh
│       ├── run_unit_tests.sh
│       ├── test_json
│       │   ├── bad_data.json
│       │   └── test_data.json
│       └── unit_testing
│           ├── __init__.py
│           ├── __pycache__
│           │   ├── __init__.cpython-38.pyc
│           │   ├── test_api.cpython-38-pytest-6.2.3.pyc
│           │   └── test_api.cpython-38-pytest-6.2.4.pyc
│           └── test_api.py
├── 014_kfolds_validation
│   └── notebooks
│       ├── .ipynb_checkpoints
│       │   └── kfolds-validation-checkpoint.ipynb
│       └── kfolds-validation.ipynb
├── 015_synthesizing_test_data
│   └── notebooks
│       └── synthesizing_test_data.ipynb
├── 016_intro_to_polars
│   └── intro_to_polars.ipynb
├── README.md
└── data
    ├── titanic
    │   ├── test.csv
    │   └── train.csv
    └── wine
        └── train.csv
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/.idea/ds-quick-tips.iml:
--------------------------------------------------------------------------------
[file contents not preserved]
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
[file contents not preserved]
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
[file contents not preserved]
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
[file contents not preserved]
--------------------------------------------------------------------------------
/002_flask_cronjob/README.md:
--------------------------------------------------------------------------------
1 | # Data Science Quick Tip #002: Running a Cronjob from Within a Flask API!
2 | This repo contains the code supporting the blog post discussing how to run a cronjob within a Flask API. In this README, we'll quickly touch on the required pieces to run this code as well as how to invoke this script.
3 |
4 | ## Required Installations
5 | If not installed already, you will need to pip install the following packages.
6 | - ```APScheduler==3.6.3```
7 | - ```flask==1.1.1```
8 |
9 | ## Script Invocation
10 | You can run this script very simply using the following command in your terminal:
11 |
12 | ```python app.py```
13 |
14 | This will start up the Flask API, at which point you can sit back and watch the statement ```Hello world!``` print at the beginning of every minute.
15 |
16 | If you would also like to invoke the test endpoint I created, simply open another window in your terminal and issue the following command:
17 |
18 | ```curl 0.0.0.0:5000/test```
19 |
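20 | Assuming the API came up cleanly, this should return a response along the lines of ```{"Test": "Successful!"}```, per the ```/test``` endpoint in app.py.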
--------------------------------------------------------------------------------
/002_flask_cronjob/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, request, json, Response, jsonify, make_response
2 | from apscheduler.schedulers.background import BackgroundScheduler
3 |
4 | # Instantiating the Flask application
5 | application = Flask(__name__)
6 |
7 | # Instantiating the scheduler for the cronjob
8 | sched = BackgroundScheduler(daemon = True)
9 | sched.start()
10 |
11 | # Defining a cronjob function to run alongside the Flask app
12 | @sched.scheduled_job(trigger = 'cron', minute = '*')
13 | def print_hello():
14 | print('Hello world!')
15 |
16 | # Defining a single API endpoint
17 | @application.route('/test')
18 | def test_func():
19 | js = json.dumps({'Test': 'Successful!'})
20 | return Response(js, status = 200, mimetype = 'application/json')
21 |
22 | if __name__ == '__main__':
23 | # Starting Flask application
24 | application.run(host = '0.0.0.0')
25 |
--------------------------------------------------------------------------------
/003_sklearn_pipelines/.ipynb_checkpoints/titanic_pipeline-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Science Quick Tip #003: Using Scikit-Learn Pipelines!\n",
8 | "In this notebook, I'll show you how to create a pipeline that produces a single binary file in the end for clean inference purposes. The goal is NOT to create a necessarily accurate model here, so don't worry if your accuracy scores are bad. This project will only focus on using Scikit-Learn's default transformers. In the next quick tip post, I'll teach you how to create custom transfomers and also make use of those within this same pipeline format."
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "## Project Setup\n",
16 | "Let's go ahead and import the libraries we'll be using as well as the datasets."
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 1,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "# Importing the libraries we'll be using for this project\n",
26 | "import pandas as pd\n",
27 | "import joblib\n",
28 | "\n",
29 | "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
30 | "from sklearn.compose import ColumnTransformer\n",
31 | "from sklearn.pipeline import Pipeline\n",
32 | "from sklearn.ensemble import RandomForestClassifier\n",
33 | "from sklearn.model_selection import train_test_split\n",
34 | "from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 2,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "# Importing the training dataset\n",
44 | "raw_train = pd.read_csv('data/train.csv')"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "# Splitting the training data into appropriate training and validation sets\n",
54 | "X = raw_train.drop(columns = ['Survived'])\n",
55 | "y = raw_train[['Survived']]\n",
56 | "\n",
57 | "X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 4,
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "data": {
67 | "text/html": [
68 | "
\n",
69 | "\n",
82 | "
\n",
83 | " \n",
84 | " \n",
85 | " | \n",
86 | " PassengerId | \n",
87 | " Pclass | \n",
88 | " Name | \n",
89 | " Sex | \n",
90 | " Age | \n",
91 | " SibSp | \n",
92 | " Parch | \n",
93 | " Ticket | \n",
94 | " Fare | \n",
95 | " Cabin | \n",
96 | " Embarked | \n",
97 | "
\n",
98 | " \n",
99 | " \n",
100 | " \n",
101 | " 298 | \n",
102 | " 299 | \n",
103 | " 1 | \n",
104 | " Saalfeld, Mr. Adolphe | \n",
105 | " male | \n",
106 | " NaN | \n",
107 | " 0 | \n",
108 | " 0 | \n",
109 | " 19988 | \n",
110 | " 30.5000 | \n",
111 | " C106 | \n",
112 | " S | \n",
113 | "
\n",
114 | " \n",
115 | " 884 | \n",
116 | " 885 | \n",
117 | " 3 | \n",
118 | " Sutehall, Mr. Henry Jr | \n",
119 | " male | \n",
120 | " 25.00 | \n",
121 | " 0 | \n",
122 | " 0 | \n",
123 | " SOTON/OQ 392076 | \n",
124 | " 7.0500 | \n",
125 | " NaN | \n",
126 | " S | \n",
127 | "
\n",
128 | " \n",
129 | " 247 | \n",
130 | " 248 | \n",
131 | " 2 | \n",
132 | " Hamalainen, Mrs. William (Anna) | \n",
133 | " female | \n",
134 | " 24.00 | \n",
135 | " 0 | \n",
136 | " 2 | \n",
137 | " 250649 | \n",
138 | " 14.5000 | \n",
139 | " NaN | \n",
140 | " S | \n",
141 | "
\n",
142 | " \n",
143 | " 478 | \n",
144 | " 479 | \n",
145 | " 3 | \n",
146 | " Karlsson, Mr. Nils August | \n",
147 | " male | \n",
148 | " 22.00 | \n",
149 | " 0 | \n",
150 | " 0 | \n",
151 | " 350060 | \n",
152 | " 7.5208 | \n",
153 | " NaN | \n",
154 | " S | \n",
155 | "
\n",
156 | " \n",
157 | " 305 | \n",
158 | " 306 | \n",
159 | " 1 | \n",
160 | " Allison, Master. Hudson Trevor | \n",
161 | " male | \n",
162 | " 0.92 | \n",
163 | " 1 | \n",
164 | " 2 | \n",
165 | " 113781 | \n",
166 | " 151.5500 | \n",
167 | " C22 C26 | \n",
168 | " S | \n",
169 | "
\n",
170 | " \n",
171 | "
\n",
172 | "
"
173 | ],
174 | "text/plain": [
175 | " PassengerId Pclass Name Sex Age \\\n",
176 | "298 299 1 Saalfeld, Mr. Adolphe male NaN \n",
177 | "884 885 3 Sutehall, Mr. Henry Jr male 25.00 \n",
178 | "247 248 2 Hamalainen, Mrs. William (Anna) female 24.00 \n",
179 | "478 479 3 Karlsson, Mr. Nils August male 22.00 \n",
180 | "305 306 1 Allison, Master. Hudson Trevor male 0.92 \n",
181 | "\n",
182 | " SibSp Parch Ticket Fare Cabin Embarked \n",
183 | "298 0 0 19988 30.5000 C106 S \n",
184 | "884 0 0 SOTON/OQ 392076 7.0500 NaN S \n",
185 | "247 0 2 250649 14.5000 NaN S \n",
186 | "478 0 0 350060 7.5208 NaN S \n",
187 | "305 1 2 113781 151.5500 C22 C26 S "
188 | ]
189 | },
190 | "execution_count": 4,
191 | "metadata": {},
192 | "output_type": "execute_result"
193 | }
194 | ],
195 | "source": [
196 | "# Viewing first few rows of X_train dataset\n",
197 | "X_train.head()"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "## Creating Our Pipeline\n",
205 | "With our data imported, we're ready to go ahead and start creating our pipeline. As mentioned above, we'll only be using the default transformers here, so we definitely won't be getting great results out of our model predictions. But that's okay! The purpose here is learning how to use a pipeline.\n",
206 | "\n",
207 | "Note: You might be wondering in the next cell why we're creating a column transformer for a single column. This is because in the next post, we'll be adding custom transformers making use of mostly the same code you'll see below. (With a few additions!)"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": 5,
213 | "metadata": {},
214 | "outputs": [],
215 | "source": [
216 | "# Creating a preprocessor to transform the 'Sex' column\n",
217 | "data_preprocessor = ColumnTransformer(transformers = [\n",
218 | " ('sex_transformer', OneHotEncoder(), ['Sex'])\n",
219 | "])"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": 6,
225 | "metadata": {},
226 | "outputs": [],
227 | "source": [
228 | "# Creating our pipeline that first preprocesses the data, then scales the data, then fits the data to a RandomForestClassifier\n",
229 | "rfc_pipeline = Pipeline(steps = [\n",
230 | " ('data_preprocessing', data_preprocessor),\n",
231 | " ('data_scaling', StandardScaler()),\n",
232 | " ('model', RandomForestClassifier(max_depth = 10,\n",
233 | " min_samples_leaf = 3,\n",
234 | " min_samples_split = 4,\n",
235 | " n_estimators = 200))\n",
236 | "])"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 7,
242 | "metadata": {},
243 | "outputs": [
244 | {
245 | "name": "stderr",
246 | "output_type": "stream",
247 | "text": [
248 | "/Users/dkhundley/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py:354: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
249 | " self._final_estimator.fit(Xt, y, **fit_params)\n"
250 | ]
251 | },
252 | {
253 | "data": {
254 | "text/plain": [
255 | "Pipeline(memory=None,\n",
256 | " steps=[('data_preprocessing',\n",
257 | " ColumnTransformer(n_jobs=None, remainder='drop',\n",
258 | " sparse_threshold=0.3,\n",
259 | " transformer_weights=None,\n",
260 | " transformers=[('sex_transformer',\n",
261 | " OneHotEncoder(categories='auto',\n",
262 | " drop=None,\n",
263 | " dtype=,\n",
264 | " handle_unknown='error',\n",
265 | " sparse=True),\n",
266 | " ['Sex'])],\n",
267 | " verbose=False)),\n",
268 | " ('data_scaling',\n",
269 | " StandardScaler(copy=True,...\n",
270 | " RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,\n",
271 | " class_weight=None, criterion='gini',\n",
272 | " max_depth=10, max_features='auto',\n",
273 | " max_leaf_nodes=None, max_samples=None,\n",
274 | " min_impurity_decrease=0.0,\n",
275 | " min_impurity_split=None,\n",
276 | " min_samples_leaf=3, min_samples_split=4,\n",
277 | " min_weight_fraction_leaf=0.0,\n",
278 | " n_estimators=200, n_jobs=None,\n",
279 | " oob_score=False, random_state=None,\n",
280 | " verbose=0, warm_start=False))],\n",
281 | " verbose=False)"
282 | ]
283 | },
284 | "execution_count": 7,
285 | "metadata": {},
286 | "output_type": "execute_result"
287 | }
288 | ],
289 | "source": [
290 | "# Fitting the training data to our pipeline\n",
291 | "rfc_pipeline.fit(X_train, y_train)"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 8,
297 | "metadata": {},
298 | "outputs": [
299 | {
300 | "data": {
301 | "text/plain": [
302 | "['model/rfc_pipeline.pkl']"
303 | ]
304 | },
305 | "execution_count": 8,
306 | "metadata": {},
307 | "output_type": "execute_result"
308 | }
309 | ],
310 | "source": [
311 | "# Saving our pipeline to a binary pickle file\n",
312 | "joblib.dump(rfc_pipeline, 'model/rfc_pipeline.pkl')"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 9,
318 | "metadata": {},
319 | "outputs": [],
320 | "source": [
321 | "# Loading back in our serialized model\n",
322 | "loaded_model = joblib.load('model/rfc_pipeline.pkl')"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 10,
328 | "metadata": {},
329 | "outputs": [
330 | {
331 | "name": "stdout",
332 | "output_type": "stream",
333 | "text": [
334 | "Accuracy Score: 0.7847533632286996\n",
335 | "ROC AUC Score: 0.7718430320308569\n",
336 | "Confusion Matrix: \n",
337 | "[[112 22]\n",
338 | " [ 26 63]]\n"
339 | ]
340 | }
341 | ],
342 | "source": [
343 | "# Checking out our predicted results using the validation dataset\n",
344 | "pipeline_preds = loaded_model.predict(X_val)\n",
345 | "\n",
346 | "val_accuracy = accuracy_score(y_val, pipeline_preds)\n",
347 | "val_roc_auc = roc_auc_score(y_val, pipeline_preds)\n",
348 | "val_confusion_matrix = confusion_matrix(y_val, pipeline_preds)\n",
349 | "\n",
350 | "print(f'Accuracy Score: {val_accuracy}')\n",
351 | "print(f'ROC AUC Score: {val_roc_auc}')\n",
352 | "print(f'Confusion Matrix: \\n{val_confusion_matrix}')"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": null,
358 | "metadata": {},
359 | "outputs": [],
360 | "source": []
361 | }
362 | ],
363 | "metadata": {
364 | "kernelspec": {
365 | "display_name": "Python 3",
366 | "language": "python",
367 | "name": "python3"
368 | },
369 | "language_info": {
370 | "codemirror_mode": {
371 | "name": "ipython",
372 | "version": 3
373 | },
374 | "file_extension": ".py",
375 | "mimetype": "text/x-python",
376 | "name": "python",
377 | "nbconvert_exporter": "python",
378 | "pygments_lexer": "ipython3",
379 | "version": "3.7.6"
380 | }
381 | },
382 | "nbformat": 4,
383 | "nbformat_minor": 2
384 | }
385 |
--------------------------------------------------------------------------------
/003_sklearn_pipelines/model/rfc_pipeline.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/003_sklearn_pipelines/model/rfc_pipeline.pkl
--------------------------------------------------------------------------------
/003_sklearn_pipelines/titanic_pipeline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Science Quick Tip #003: Using Scikit-Learn Pipelines!\n",
8 | "In this notebook, I'll show you how to create a pipeline that produces a single binary file in the end for clean inference purposes. The goal is NOT to create a necessarily accurate model here, so don't worry if your accuracy scores are bad. This project will only focus on using Scikit-Learn's default transformers. In the next quick tip post, I'll teach you how to create custom transfomers and also make use of those within this same pipeline format."
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "## Project Setup\n",
16 | "Let's go ahead and import the libraries we'll be using as well as the datasets."
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 1,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "# Importing the libraries we'll be using for this project\n",
26 | "import pandas as pd\n",
27 | "import joblib\n",
28 | "\n",
29 | "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
30 | "from sklearn.compose import ColumnTransformer\n",
31 | "from sklearn.pipeline import Pipeline\n",
32 | "from sklearn.ensemble import RandomForestClassifier\n",
33 | "from sklearn.model_selection import train_test_split\n",
34 | "from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 2,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "# Importing the training dataset\n",
44 | "raw_train = pd.read_csv('data/train.csv')"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "# Splitting the training data into appropriate training and validation sets\n",
54 | "X = raw_train.drop(columns = ['Survived'])\n",
55 | "y = raw_train[['Survived']]\n",
56 | "\n",
57 | "X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 4,
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "data": {
67 | "text/html": [
68 | "\n",
69 | "\n",
82 | "
\n",
83 | " \n",
84 | " \n",
85 | " | \n",
86 | " PassengerId | \n",
87 | " Pclass | \n",
88 | " Name | \n",
89 | " Sex | \n",
90 | " Age | \n",
91 | " SibSp | \n",
92 | " Parch | \n",
93 | " Ticket | \n",
94 | " Fare | \n",
95 | " Cabin | \n",
96 | " Embarked | \n",
97 | "
\n",
98 | " \n",
99 | " \n",
100 | " \n",
101 | " 298 | \n",
102 | " 299 | \n",
103 | " 1 | \n",
104 | " Saalfeld, Mr. Adolphe | \n",
105 | " male | \n",
106 | " NaN | \n",
107 | " 0 | \n",
108 | " 0 | \n",
109 | " 19988 | \n",
110 | " 30.5000 | \n",
111 | " C106 | \n",
112 | " S | \n",
113 | "
\n",
114 | " \n",
115 | " 884 | \n",
116 | " 885 | \n",
117 | " 3 | \n",
118 | " Sutehall, Mr. Henry Jr | \n",
119 | " male | \n",
120 | " 25.00 | \n",
121 | " 0 | \n",
122 | " 0 | \n",
123 | " SOTON/OQ 392076 | \n",
124 | " 7.0500 | \n",
125 | " NaN | \n",
126 | " S | \n",
127 | "
\n",
128 | " \n",
129 | " 247 | \n",
130 | " 248 | \n",
131 | " 2 | \n",
132 | " Hamalainen, Mrs. William (Anna) | \n",
133 | " female | \n",
134 | " 24.00 | \n",
135 | " 0 | \n",
136 | " 2 | \n",
137 | " 250649 | \n",
138 | " 14.5000 | \n",
139 | " NaN | \n",
140 | " S | \n",
141 | "
\n",
142 | " \n",
143 | " 478 | \n",
144 | " 479 | \n",
145 | " 3 | \n",
146 | " Karlsson, Mr. Nils August | \n",
147 | " male | \n",
148 | " 22.00 | \n",
149 | " 0 | \n",
150 | " 0 | \n",
151 | " 350060 | \n",
152 | " 7.5208 | \n",
153 | " NaN | \n",
154 | " S | \n",
155 | "
\n",
156 | " \n",
157 | " 305 | \n",
158 | " 306 | \n",
159 | " 1 | \n",
160 | " Allison, Master. Hudson Trevor | \n",
161 | " male | \n",
162 | " 0.92 | \n",
163 | " 1 | \n",
164 | " 2 | \n",
165 | " 113781 | \n",
166 | " 151.5500 | \n",
167 | " C22 C26 | \n",
168 | " S | \n",
169 | "
\n",
170 | " \n",
171 | "
\n",
172 | "
"
173 | ],
174 | "text/plain": [
175 | " PassengerId Pclass Name Sex Age \\\n",
176 | "298 299 1 Saalfeld, Mr. Adolphe male NaN \n",
177 | "884 885 3 Sutehall, Mr. Henry Jr male 25.00 \n",
178 | "247 248 2 Hamalainen, Mrs. William (Anna) female 24.00 \n",
179 | "478 479 3 Karlsson, Mr. Nils August male 22.00 \n",
180 | "305 306 1 Allison, Master. Hudson Trevor male 0.92 \n",
181 | "\n",
182 | " SibSp Parch Ticket Fare Cabin Embarked \n",
183 | "298 0 0 19988 30.5000 C106 S \n",
184 | "884 0 0 SOTON/OQ 392076 7.0500 NaN S \n",
185 | "247 0 2 250649 14.5000 NaN S \n",
186 | "478 0 0 350060 7.5208 NaN S \n",
187 | "305 1 2 113781 151.5500 C22 C26 S "
188 | ]
189 | },
190 | "execution_count": 4,
191 | "metadata": {},
192 | "output_type": "execute_result"
193 | }
194 | ],
195 | "source": [
196 | "# Viewing first few rows of X_train dataset\n",
197 | "X_train.head()"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "## Creating Our Pipeline\n",
205 | "With our data imported, we're ready to go ahead and start creating our pipeline. As mentioned above, we'll only be using the default transformers here, so we definitely won't be getting great results out of our model predictions. But that's okay! The purpose here is learning how to use a pipeline.\n",
206 | "\n",
207 | "Note: You might be wondering in the next cell why we're creating a column transformer for a single column. This is because in the next post, we'll be adding custom transformers making use of mostly the same code you'll see below. (With a few additions!)"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": 5,
213 | "metadata": {},
214 | "outputs": [],
215 | "source": [
216 | "# Creating a preprocessor to transform the 'Sex' column\n",
217 | "data_preprocessor = ColumnTransformer(transformers = [\n",
218 | " ('sex_transformer', OneHotEncoder(), ['Sex'])\n",
219 | "])"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": 6,
225 | "metadata": {},
226 | "outputs": [],
227 | "source": [
228 | "# Creating our pipeline that first preprocesses the data, then scales the data, then fits the data to a RandomForestClassifier\n",
229 | "rfc_pipeline = Pipeline(steps = [\n",
230 | " ('data_preprocessing', data_preprocessor),\n",
231 | " ('data_scaling', StandardScaler()),\n",
232 | " ('model', RandomForestClassifier(max_depth = 10,\n",
233 | " min_samples_leaf = 3,\n",
234 | " min_samples_split = 4,\n",
235 | " n_estimators = 200))\n",
236 | "])"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 7,
242 | "metadata": {},
243 | "outputs": [
244 | {
245 | "name": "stderr",
246 | "output_type": "stream",
247 | "text": [
248 | "/Users/dkhundley/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py:354: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
249 | " self._final_estimator.fit(Xt, y, **fit_params)\n"
250 | ]
251 | },
252 | {
253 | "data": {
254 | "text/plain": [
255 | "Pipeline(memory=None,\n",
256 | " steps=[('data_preprocessing',\n",
257 | " ColumnTransformer(n_jobs=None, remainder='drop',\n",
258 | " sparse_threshold=0.3,\n",
259 | " transformer_weights=None,\n",
260 | " transformers=[('sex_transformer',\n",
261 | " OneHotEncoder(categories='auto',\n",
262 | " drop=None,\n",
263 | " dtype=,\n",
264 | " handle_unknown='error',\n",
265 | " sparse=True),\n",
266 | " ['Sex'])],\n",
267 | " verbose=False)),\n",
268 | " ('data_scaling',\n",
269 | " StandardScaler(copy=True,...\n",
270 | " RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,\n",
271 | " class_weight=None, criterion='gini',\n",
272 | " max_depth=10, max_features='auto',\n",
273 | " max_leaf_nodes=None, max_samples=None,\n",
274 | " min_impurity_decrease=0.0,\n",
275 | " min_impurity_split=None,\n",
276 | " min_samples_leaf=3, min_samples_split=4,\n",
277 | " min_weight_fraction_leaf=0.0,\n",
278 | " n_estimators=200, n_jobs=None,\n",
279 | " oob_score=False, random_state=None,\n",
280 | " verbose=0, warm_start=False))],\n",
281 | " verbose=False)"
282 | ]
283 | },
284 | "execution_count": 7,
285 | "metadata": {},
286 | "output_type": "execute_result"
287 | }
288 | ],
289 | "source": [
290 | "# Fitting the training data to our pipeline\n",
291 | "rfc_pipeline.fit(X_train, y_train)"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 8,
297 | "metadata": {},
298 | "outputs": [
299 | {
300 | "data": {
301 | "text/plain": [
302 | "['model/rfc_pipeline.pkl']"
303 | ]
304 | },
305 | "execution_count": 8,
306 | "metadata": {},
307 | "output_type": "execute_result"
308 | }
309 | ],
310 | "source": [
311 | "# Saving our pipeline to a binary pickle file\n",
312 | "joblib.dump(rfc_pipeline, 'model/rfc_pipeline.pkl')"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 9,
318 | "metadata": {},
319 | "outputs": [],
320 | "source": [
321 | "# Loading back in our serialized model\n",
322 | "loaded_model = joblib.load('model/rfc_pipeline.pkl')"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 10,
328 | "metadata": {},
329 | "outputs": [
330 | {
331 | "name": "stdout",
332 | "output_type": "stream",
333 | "text": [
334 | "Accuracy Score: 0.7847533632286996\n",
335 | "ROC AUC Score: 0.7718430320308569\n",
336 | "Confusion Matrix: \n",
337 | "[[112 22]\n",
338 | " [ 26 63]]\n"
339 | ]
340 | }
341 | ],
342 | "source": [
343 | "# Checking out our predicted results using the validation dataset\n",
344 | "pipeline_preds = loaded_model.predict(X_val)\n",
345 | "\n",
346 | "val_accuracy = accuracy_score(y_val, pipeline_preds)\n",
347 | "val_roc_auc = roc_auc_score(y_val, pipeline_preds)\n",
348 | "val_confusion_matrix = confusion_matrix(y_val, pipeline_preds)\n",
349 | "\n",
350 | "print(f'Accuracy Score: {val_accuracy}')\n",
351 | "print(f'ROC AUC Score: {val_roc_auc}')\n",
352 | "print(f'Confusion Matrix: \\n{val_confusion_matrix}')"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": null,
358 | "metadata": {},
359 | "outputs": [],
360 | "source": []
361 | }
362 | ],
363 | "metadata": {
364 | "kernelspec": {
365 | "display_name": "Python 3",
366 | "language": "python",
367 | "name": "python3"
368 | },
369 | "language_info": {
370 | "codemirror_mode": {
371 | "name": "ipython",
372 | "version": 3
373 | },
374 | "file_extension": ".py",
375 | "mimetype": "text/x-python",
376 | "name": "python",
377 | "nbconvert_exporter": "python",
378 | "pygments_lexer": "ipython3",
379 | "version": "3.7.6"
380 | }
381 | },
382 | "nbformat": 4,
383 | "nbformat_minor": 2
384 | }
385 |
--------------------------------------------------------------------------------
/004_pipeline_custom_transformers/model/rfc_pipeline.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/004_pipeline_custom_transformers/model/rfc_pipeline.pkl
--------------------------------------------------------------------------------
/004_pipeline_custom_transformers/titanic_custom_pipeline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Science Quick Tip #004: Using Custom Transformers in Scikit-Learn Pipelines!\n",
8 | "In our last post, we covered how to use Scikit-Learn pipelines to conjoin all the appropriate transformers into a single output. In this new post, we'll take things a step further by adding custom transformers to the pipeline. Because this is very much building on top of the last post, much of this code should already appear to be familiar to you."
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "## Project Setup\n",
16 | "Let's go ahead and import the libraries we'll be using as well as the datasets."
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 1,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "# Importing the libraries we'll be using for this project\n",
26 | "import pandas as pd\n",
27 | "import joblib\n",
28 | "\n",
29 | "from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer\n",
30 | "from sklearn.impute import SimpleImputer\n",
31 | "from sklearn.compose import ColumnTransformer\n",
32 | "from sklearn.pipeline import Pipeline\n",
33 | "from sklearn.ensemble import RandomForestClassifier\n",
34 | "from sklearn.model_selection import train_test_split\n",
35 | "from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 2,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "# Importing the training dataset\n",
45 | "raw_train = pd.read_csv('../data/titanic/train.csv')"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 3,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "# Splitting the training data into appropriate training and validation sets\n",
55 | "X = raw_train.drop(columns = ['Survived'])\n",
56 | "y = raw_train[['Survived']]\n",
57 | "\n",
58 | "X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 4,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "data": {
68 | "text/html": [
69 | "\n",
70 | "\n",
83 | "
\n",
84 | " \n",
85 | " \n",
86 | " | \n",
87 | " PassengerId | \n",
88 | " Pclass | \n",
89 | " Name | \n",
90 | " Sex | \n",
91 | " Age | \n",
92 | " SibSp | \n",
93 | " Parch | \n",
94 | " Ticket | \n",
95 | " Fare | \n",
96 | " Cabin | \n",
97 | " Embarked | \n",
98 | "
\n",
99 | " \n",
100 | " \n",
101 | " \n",
102 | " 298 | \n",
103 | " 299 | \n",
104 | " 1 | \n",
105 | " Saalfeld, Mr. Adolphe | \n",
106 | " male | \n",
107 | " NaN | \n",
108 | " 0 | \n",
109 | " 0 | \n",
110 | " 19988 | \n",
111 | " 30.5000 | \n",
112 | " C106 | \n",
113 | " S | \n",
114 | "
\n",
115 | " \n",
116 | " 884 | \n",
117 | " 885 | \n",
118 | " 3 | \n",
119 | " Sutehall, Mr. Henry Jr | \n",
120 | " male | \n",
121 | " 25.00 | \n",
122 | " 0 | \n",
123 | " 0 | \n",
124 | " SOTON/OQ 392076 | \n",
125 | " 7.0500 | \n",
126 | " NaN | \n",
127 | " S | \n",
128 | "
\n",
129 | " \n",
130 | " 247 | \n",
131 | " 248 | \n",
132 | " 2 | \n",
133 | " Hamalainen, Mrs. William (Anna) | \n",
134 | " female | \n",
135 | " 24.00 | \n",
136 | " 0 | \n",
137 | " 2 | \n",
138 | " 250649 | \n",
139 | " 14.5000 | \n",
140 | " NaN | \n",
141 | " S | \n",
142 | "
\n",
143 | " \n",
144 | " 478 | \n",
145 | " 479 | \n",
146 | " 3 | \n",
147 | " Karlsson, Mr. Nils August | \n",
148 | " male | \n",
149 | " 22.00 | \n",
150 | " 0 | \n",
151 | " 0 | \n",
152 | " 350060 | \n",
153 | " 7.5208 | \n",
154 | " NaN | \n",
155 | " S | \n",
156 | "
\n",
157 | " \n",
158 | " 305 | \n",
159 | " 306 | \n",
160 | " 1 | \n",
161 | " Allison, Master. Hudson Trevor | \n",
162 | " male | \n",
163 | " 0.92 | \n",
164 | " 1 | \n",
165 | " 2 | \n",
166 | " 113781 | \n",
167 | " 151.5500 | \n",
168 | " C22 C26 | \n",
169 | " S | \n",
170 | "
\n",
171 | " \n",
172 | "
\n",
173 | "
"
174 | ],
175 | "text/plain": [
176 | " PassengerId Pclass Name Sex Age \\\n",
177 | "298 299 1 Saalfeld, Mr. Adolphe male NaN \n",
178 | "884 885 3 Sutehall, Mr. Henry Jr male 25.00 \n",
179 | "247 248 2 Hamalainen, Mrs. William (Anna) female 24.00 \n",
180 | "478 479 3 Karlsson, Mr. Nils August male 22.00 \n",
181 | "305 306 1 Allison, Master. Hudson Trevor male 0.92 \n",
182 | "\n",
183 | " SibSp Parch Ticket Fare Cabin Embarked \n",
184 | "298 0 0 19988 30.5000 C106 S \n",
185 | "884 0 0 SOTON/OQ 392076 7.0500 NaN S \n",
186 | "247 0 2 250649 14.5000 NaN S \n",
187 | "478 0 0 350060 7.5208 NaN S \n",
188 | "305 1 2 113781 151.5500 C22 C26 S "
189 | ]
190 | },
191 | "execution_count": 4,
192 | "metadata": {},
193 | "output_type": "execute_result"
194 | }
195 | ],
196 | "source": [
197 | "# Viewing first few rows of X_train dataset\n",
198 | "X_train.head()"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 5,
204 | "metadata": {},
205 | "outputs": [
206 | {
207 | "data": {
208 | "text/html": [
209 | "\n",
210 | "\n",
223 | "
\n",
224 | " \n",
225 | " \n",
226 | " | \n",
227 | " Survived | \n",
228 | "
\n",
229 | " \n",
230 | " \n",
231 | " \n",
232 | " 298 | \n",
233 | " 1 | \n",
234 | "
\n",
235 | " \n",
236 | " 884 | \n",
237 | " 0 | \n",
238 | "
\n",
239 | " \n",
240 | " 247 | \n",
241 | " 1 | \n",
242 | "
\n",
243 | " \n",
244 | " 478 | \n",
245 | " 0 | \n",
246 | "
\n",
247 | " \n",
248 | " 305 | \n",
249 | " 1 | \n",
250 | "
\n",
251 | " \n",
252 | "
\n",
253 | "
"
254 | ],
255 | "text/plain": [
256 | " Survived\n",
257 | "298 1\n",
258 | "884 0\n",
259 | "247 1\n",
260 | "478 0\n",
261 | "305 1"
262 | ]
263 | },
264 | "execution_count": 5,
265 | "metadata": {},
266 | "output_type": "execute_result"
267 | }
268 | ],
269 | "source": [
270 | "# Viewing first few rows of y_train dataset\n",
271 | "y_train.head()"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "## Creating Our Pipeline (Now With Custom Transformers!)\n",
279 | "With our data imported, we're ready to go ahead and start creating our pipeline. As mentioned above, we'll only be using the default transformers here, so we definitely won't be getting great results out of our model predictions. But that's okay! The purpose here is learning how to use a pipeline.\n",
280 | "\n",
281 | "Note: You might be wondering in the next cell why we're creating a column transformer for a single column. This is because in the next post, we'll be adding custom transformers making use of mostly the same code you'll see below. (With a few additions!)"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 6,
287 | "metadata": {},
288 | "outputs": [],
289 | "source": [
290 | "# Creating a function to appropriately engineer the 'Age' column\n",
291 | "def create_age_bins(col):\n",
292 | " '''Engineers age bin variables for pipeline'''\n",
293 | " \n",
294 | " # Defining / instantiating the necessary variables\n",
295 | " age_bins = [-1, 12, 18, 25, 50, 100]\n",
296 | " age_labels = ['child', 'teen', 'young_adult', 'adult', 'elder']\n",
297 | " age_imputer = SimpleImputer(strategy = 'median')\n",
298 | " age_ohe = OneHotEncoder()\n",
299 | " \n",
300 | " # Performing basic imputation for nulls\n",
301 | " imputed = age_imputer.fit_transform(col)\n",
302 | " ages_filled = pd.DataFrame(data = imputed, columns = ['Age'])\n",
303 | " \n",
304 | " # Segregating ages into age bins\n",
305 | " age_cat_cols = pd.cut(ages_filled['Age'], bins = age_bins, labels = age_labels)\n",
306 | " age_cats = pd.DataFrame(data = age_cat_cols, columns = ['Age'])\n",
307 | " \n",
308 | " # One hot encoding new age bins\n",
309 | " ages_encoded = age_ohe.fit_transform(age_cats[['Age']])\n",
310 | " ages_encoded = pd.DataFrame(data = ages_encoded.toarray())\n",
311 | " \n",
312 | " return ages_encoded"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 7,
318 | "metadata": {},
319 | "outputs": [],
320 | "source": [
321 | "# Creating function to appropriately engineer the 'Embarked' column\n",
322 | "def create_embarked_columns(col):\n",
323 | " '''Engineers the embarked variables for pipeline'''\n",
324 | " \n",
325 | " # Instantiating the transformer objects\n",
326 | " embarked_imputer = SimpleImputer(strategy = 'most_frequent')\n",
327 | " embarked_ohe = OneHotEncoder()\n",
328 | " \n",
329 | " # Performing basic imputation for nulls\n",
330 | " imputed = embarked_imputer.fit_transform(col)\n",
331 | " embarked_filled = pd.DataFrame(data = imputed, columns = ['Embarked'])\n",
332 | " \n",
333 | " # Performing OHE on the col data\n",
334 | " embarked_columns = embarked_ohe.fit_transform(embarked_filled[['Embarked']])\n",
335 | " embarked_columns_df = pd.DataFrame(data = embarked_columns.toarray())\n",
336 | " \n",
337 | " return embarked_columns_df"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": 8,
343 | "metadata": {},
344 | "outputs": [],
345 | "source": [
346 | "# Creating a preprocessor to transform the 'Sex' column\n",
347 | "data_preprocessor = ColumnTransformer(transformers = [\n",
348 | " ('sex_transformer', OneHotEncoder(), ['Sex']),\n",
349 | " ('age_transformer', FunctionTransformer(create_age_bins, validate = False), ['Age']),\n",
350 | " ('embarked_transformer', FunctionTransformer(create_embarked_columns, validate = False), ['Embarked'])\n",
351 | "])"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": 9,
357 | "metadata": {},
358 | "outputs": [],
359 | "source": [
360 | "# Creating our pipeline that first preprocesses the data, then scales the data, then fits the data to a RandomForestClassifier\n",
361 | "rfc_pipeline = Pipeline(steps = [\n",
362 | " ('data_preprocessing', data_preprocessor),\n",
363 | " ('data_scaling', StandardScaler()),\n",
364 | " ('model', RandomForestClassifier(max_depth = 10,\n",
365 | " min_samples_leaf = 3,\n",
366 | " min_samples_split = 4,\n",
367 | " n_estimators = 200))\n",
368 | "])"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": 10,
374 | "metadata": {},
375 | "outputs": [
376 | {
377 | "name": "stderr",
378 | "output_type": "stream",
379 | "text": [
380 | "/Users/dkhundley/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py:354: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
381 | " self._final_estimator.fit(Xt, y, **fit_params)\n"
382 | ]
383 | },
384 | {
385 | "data": {
386 | "text/plain": [
387 | "Pipeline(memory=None,\n",
388 | " steps=[('data_preprocessing',\n",
389 | " ColumnTransformer(n_jobs=None, remainder='drop',\n",
390 | " sparse_threshold=0.3,\n",
391 | " transformer_weights=None,\n",
392 | " transformers=[('sex_transformer',\n",
393 | " OneHotEncoder(categories='auto',\n",
394 | " drop=None,\n",
395 | " dtype=,\n",
396 | " handle_unknown='error',\n",
397 | " sparse=True),\n",
398 | " ['Sex']),\n",
399 | " ('age_transformer',\n",
400 | " FunctionTransformer(accept_sparse=False...\n",
401 | " RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,\n",
402 | " class_weight=None, criterion='gini',\n",
403 | " max_depth=10, max_features='auto',\n",
404 | " max_leaf_nodes=None, max_samples=None,\n",
405 | " min_impurity_decrease=0.0,\n",
406 | " min_impurity_split=None,\n",
407 | " min_samples_leaf=3, min_samples_split=4,\n",
408 | " min_weight_fraction_leaf=0.0,\n",
409 | " n_estimators=200, n_jobs=None,\n",
410 | " oob_score=False, random_state=None,\n",
411 | " verbose=0, warm_start=False))],\n",
412 | " verbose=False)"
413 | ]
414 | },
415 | "execution_count": 10,
416 | "metadata": {},
417 | "output_type": "execute_result"
418 | }
419 | ],
420 | "source": [
421 | "# Fitting the training data to our pipeline\n",
422 | "rfc_pipeline.fit(X_train, y_train)"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": 11,
428 | "metadata": {},
429 | "outputs": [
430 | {
431 | "data": {
432 | "text/plain": [
433 | "['model/rfc_pipeline.pkl']"
434 | ]
435 | },
436 | "execution_count": 11,
437 | "metadata": {},
438 | "output_type": "execute_result"
439 | }
440 | ],
441 | "source": [
442 | "# Saving our pipeline to a binary pickle file\n",
443 | "joblib.dump(rfc_pipeline, 'model/rfc_pipeline.pkl')"
444 | ]
445 | },
446 | {
447 | "cell_type": "code",
448 | "execution_count": 12,
449 | "metadata": {},
450 | "outputs": [],
451 | "source": [
452 | "# Loading back in our serialized model\n",
453 | "loaded_model = joblib.load('model/rfc_pipeline.pkl')"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": 13,
459 | "metadata": {},
460 | "outputs": [
461 | {
462 | "name": "stdout",
463 | "output_type": "stream",
464 | "text": [
465 | "Accuracy Score: 0.7847533632286996\n",
466 | "ROC AUC Score: 0.7775029347643804\n",
467 | "Confusion Matrix: \n",
468 | "[[109 25]\n",
469 | " [ 23 66]]\n"
470 | ]
471 | }
472 | ],
473 | "source": [
474 | "# Checking out our predicted results using the validation dataset\n",
475 | "pipeline_preds = loaded_model.predict(X_val)\n",
476 | "\n",
477 | "val_accuracy = accuracy_score(y_val, pipeline_preds)\n",
478 | "val_roc_auc = roc_auc_score(y_val, pipeline_preds)\n",
479 | "val_confusion_matrix = confusion_matrix(y_val, pipeline_preds)\n",
480 | "\n",
481 | "print(f'Accuracy Score: {val_accuracy}')\n",
482 | "print(f'ROC AUC Score: {val_roc_auc}')\n",
483 | "print(f'Confusion Matrix: \\n{val_confusion_matrix}')"
484 | ]
485 | },
486 | {
487 | "cell_type": "code",
488 | "execution_count": null,
489 | "metadata": {},
490 | "outputs": [],
491 | "source": []
492 | }
493 | ],
494 | "metadata": {
495 | "kernelspec": {
496 | "display_name": "Python 3",
497 | "language": "python",
498 | "name": "python3"
499 | },
500 | "language_info": {
501 | "codemirror_mode": {
502 | "name": "ipython",
503 | "version": 3
504 | },
505 | "file_extension": ".py",
506 | "mimetype": "text/x-python",
507 | "name": "python",
508 | "nbconvert_exporter": "python",
509 | "pygments_lexer": "ipython3",
510 | "version": "3.7.6"
511 | }
512 | },
513 | "nbformat": 4,
514 | "nbformat_minor": 2
515 | }
516 |
--------------------------------------------------------------------------------
/005_two_ways_to_ohe/ce_ohe.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/005_two_ways_to_ohe/ce_ohe.pkl
--------------------------------------------------------------------------------
/005_two_ways_to_ohe/sklearn_ohe.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/005_two_ways_to_ohe/sklearn_ohe.pkl
--------------------------------------------------------------------------------
/007_performance_testing_locust/__pycache__/locustfile.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/007_performance_testing_locust/__pycache__/locustfile.cpython-37.pyc
--------------------------------------------------------------------------------
/007_performance_testing_locust/api/__pycache__/api.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/007_performance_testing_locust/api/__pycache__/api.cpython-37.pyc
--------------------------------------------------------------------------------
/007_performance_testing_locust/api/__pycache__/helpers.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/007_performance_testing_locust/api/__pycache__/helpers.cpython-37.pyc
--------------------------------------------------------------------------------
/007_performance_testing_locust/api/api.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import joblib
3 | from flask import Flask, request, json, Response, jsonify, make_response
4 | from helpers import create_embarked_columns, create_age_bins
5 |
6 |
7 | ## PRELOADED COMPONENTS
8 | # ------------------------------------------------------------------------------
9 |
10 | # Instantiating the Flask application
11 | application = Flask(__name__)
12 |
13 | # Loading the saved, serialized model
14 | model = joblib.load('../model/rfc_pipeline.pkl')
15 |
16 | ## API ENDPOINTS
17 | # ------------------------------------------------------------------------------
18 |
19 | # Defining our prediction endpoint
20 | @application.route('/predict', methods = ['POST'])
21 | def predict():
22 |
23 | # Getting incoming data from request
24 | predict_json = request.json
25 |
26 | # Transforming JSON data to DataFrame
27 | predict_df = pd.json_normalize(predict_json)
28 |
29 | # Running data through model
30 | preds = model.predict(predict_df)
31 |
32 | # Prepping preds to be returned to user
33 | js = json.dumps({'preds': str(preds[0])})
34 |
35 | return Response(js, status = 200, mimetype = 'application/json')
36 |
37 |
38 |
39 | # Defining a basic health endpoint
40 | @application.route('/health', methods = ['GET'])
41 | def health():
42 |
43 | # Dumping out simple health message
44 | js = json.dumps({'Status': 'Healthy!'})
45 |
46 | return Response(js, status = 200, mimetype = 'application/json')
47 |
48 |
49 | ## SCRIPT INVOCATION
50 | # ------------------------------------------------------------------------------
51 |
52 | if __name__ == '__main__':
53 | # Starting the Flask API on script invocation
54 | application.run(host = '0.0.0.0', debug = True)
55 |
--------------------------------------------------------------------------------
/007_performance_testing_locust/api/helpers.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.impute import SimpleImputer
3 | from sklearn.preprocessing import OneHotEncoder, StandardScaler
4 |
5 | # Creating a function to appropriately engineer the 'Age' column
6 | def create_age_bins(col):
7 | '''Engineers age bin variables for pipeline'''
8 |
9 | # Defining / instantiating the necessary variables
10 | age_bins = [-1, 12, 18, 25, 50, 100]
11 | age_labels = ['child', 'teen', 'young_adult', 'adult', 'elder']
12 | age_imputer = SimpleImputer(strategy = 'median')
13 | age_ohe = OneHotEncoder()
14 |
15 | # Performing basic imputation for nulls
16 | imputed = age_imputer.fit_transform(col)
17 | ages_filled = pd.DataFrame(data = imputed, columns = ['Age'])
18 |
19 | # Segregating ages into age bins
20 | age_cat_cols = pd.cut(ages_filled['Age'], bins = age_bins, labels = age_labels)
21 | age_cats = pd.DataFrame(data = age_cat_cols, columns = ['Age'])
22 |
23 | # One hot encoding new age bins
24 | ages_encoded = age_ohe.fit_transform(age_cats[['Age']])
25 | ages_encoded = pd.DataFrame(data = ages_encoded.toarray())
26 |
27 | return ages_encoded
28 |
29 |
30 |
31 | # Creating function to appropriately engineer the 'Embarked' column
32 | def create_embarked_columns(col):
33 | '''Engineers the embarked variables for pipeline'''
34 |
35 | # Instantiating the transformer objects
36 | embarked_imputer = SimpleImputer(strategy = 'most_frequent')
37 | embarked_ohe = OneHotEncoder()
38 |
39 | # Performing basic imputation for nulls
40 | imputed = embarked_imputer.fit_transform(col)
41 | embarked_filled = pd.DataFrame(data = imputed, columns = ['Embarked'])
42 |
43 | # Performing OHE on the col data
44 | embarked_columns = embarked_ohe.fit_transform(embarked_filled[['Embarked']])
45 | embarked_columns_df = pd.DataFrame(data = embarked_columns.toarray())
46 |
47 | return embarked_columns_df
48 |
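49 | # Note: these helpers call fit_transform at prediction time, meaning the imputers and
50 | # encoders are refit on every incoming batch. This mirrors the custom transformers from
51 | # post 004 and assumes each batch contains the category values seen during training.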
--------------------------------------------------------------------------------
/007_performance_testing_locust/api/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | gunicorn --bind 0.0.0.0:5001 --workers 5 api:application
3 |
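4 | # Serves the API on port 5001 with 5 gunicorn workers, matching the
5 | # host (http://localhost:5001) configured in locustfile.py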
--------------------------------------------------------------------------------
/007_performance_testing_locust/locustfile.py:
--------------------------------------------------------------------------------
1 | from locust import HttpUser, task, between
2 | import json
3 |
4 | # Loading the test JSON data
5 | with open('test_data/test_1.json') as f:
6 | test_data = json.loads(f.read())
7 |
8 | # Creating an API User class inheriting from Locust's HttpUser class
9 | class APIUser(HttpUser):
10 | # Setting the host name and wait_time
11 | host = 'http://localhost:5001'
12 | wait_time = between(3, 5)
13 |
14 | # Defining the post task using the JSON test data
15 | @task()
16 | def predict_endpoint(self):
17 | self.client.post('/predict', json = test_data)
18 |
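19 | # A minimal way to run this test, assuming the API from api/run.sh is already up:
20 | #   locust -f locustfile.py
21 | # then open the Locust web UI (http://localhost:8089 by default) to start the swarm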
--------------------------------------------------------------------------------
/007_performance_testing_locust/model/rfc_pipeline.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/007_performance_testing_locust/model/rfc_pipeline.pkl
--------------------------------------------------------------------------------
/007_performance_testing_locust/test_data/test_1.json:
--------------------------------------------------------------------------------
1 | [{"PassengerId":892,"Pclass":3,"Name":"Kelly, Mr. James","Sex":"male","Age":34.5,"SibSp":0,"Parch":0,"Ticket":"330911","Fare":7.8292,"Cabin":null,"Embarked":"Q"}]
2 |
--------------------------------------------------------------------------------
/007_performance_testing_locust/test_data/test_2.json:
--------------------------------------------------------------------------------
1 | [{"PassengerId":893,"Pclass":3,"Name":"Wilkes, Mrs. James (Ellen Needs)","Sex":"female","Age":47.0,"SibSp":1,"Parch":0,"Ticket":"363272","Fare":7.0,"Cabin":null,"Embarked":"S"}]
2 |
--------------------------------------------------------------------------------
/007_performance_testing_locust/test_data/tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | echo 'Test 1'
3 | curl --request POST --header 'content-type: application/json' --data @test_1.json --url localhost:5001/predict
4 | echo
5 | echo 'Test 2'
6 | curl --request POST --header 'content-type: application/json' --data @test_2.json --url localhost:5001/predict
7 |
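8 | # Given the /predict endpoint in api/api.py, each call should print a JSON body
9 | # like {"preds": "0"} or {"preds": "1"}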
--------------------------------------------------------------------------------
/008_mlflow_getting_started/.ipynb_checkpoints/mlflow_wine_notebook-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Science Quick Tips #008: MLFlow Part 1 - Getting Started with MLFlow!\n",
8 | "In this first post in our sub-series on MLFlow, we're going to take things easy by getting up and running with MLFlow. To make things easy on ourselves for modeling, we're simply going to use the Red Wine quality dataset instead of the Titanic dataset we've made use of in other posts. The script here is the exact same code you'll find in mlflow-wine.py. The only difference is that when you launch the MLFlow UI, you'll be able to see that the source is technically different."
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "# Importing in necessary libraries\n",
18 | "import pandas as pd\n",
19 | "\n",
20 | "from sklearn.model_selection import train_test_split\n",
21 | "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n",
22 | "from sklearn.linear_model import ElasticNet\n",
23 | "\n",
24 | "import mlflow\n",
25 | "import mlflow.sklearn"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "# Loading data and prepping for training\n",
35 | "df_wine = pd.read_csv('../data/wine/train.csv')\n",
36 | "\n",
37 | "X = df_wine.drop(columns = 'quality')\n",
38 | "y = df_wine[['quality']]\n",
39 | "\n",
40 | "X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 3,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "# Defining model parameters\n",
50 | "alpha = 1\n",
51 | "l1_ratio = 0.5"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 13,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 |     "# Starting an MLFlow run\n",
61 | "with mlflow.start_run():\n",
62 | " \n",
63 | " # Instantiating model with model parameters\n",
64 | " model = ElasticNet(alpha = alpha,\n",
65 | " l1_ratio = l1_ratio)\n",
66 | " \n",
67 | " # Fitting training data to the model\n",
68 | " model.fit(X_train, y_train)\n",
69 | " \n",
70 | " # Running prediction on validation dataset\n",
71 | " preds = model.predict(X_val)\n",
72 | " \n",
73 |     "    # Getting metrics on the validation dataset (sklearn metrics take y_true first)\n",
74 |     "    rmse = mean_squared_error(y_val, preds, squared = False)\n",
75 |     "    abs_error = mean_absolute_error(y_val, preds)\n",
76 |     "    r2 = r2_score(y_val, preds)\n",
77 | " \n",
78 | " # Logging params and metrics to MLFlow\n",
79 | " mlflow.log_param('alpha', alpha)\n",
80 | " mlflow.log_param('l1_ratio', l1_ratio)\n",
81 | " mlflow.log_metric('rmse', rmse)\n",
82 | " mlflow.log_metric('abs_error', abs_error)\n",
83 | " mlflow.log_metric('r2', r2)\n",
84 | " \n",
85 | " # Logging model to MLFlow\n",
86 | " mlflow.sklearn.log_model(model, 'model')"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": []
95 | }
96 | ],
97 | "metadata": {
98 | "kernelspec": {
99 | "display_name": "Python 3",
100 | "language": "python",
101 | "name": "python3"
102 | },
103 | "language_info": {
104 | "codemirror_mode": {
105 | "name": "ipython",
106 | "version": 3
107 | },
108 | "file_extension": ".py",
109 | "mimetype": "text/x-python",
110 | "name": "python",
111 | "nbconvert_exporter": "python",
112 | "pygments_lexer": "ipython3",
113 | "version": "3.7.6"
114 | }
115 | },
116 | "nbformat": 4,
117 | "nbformat_minor": 2
118 | }
119 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlflow-existing-model.py:
--------------------------------------------------------------------------------
1 | # Importing in necessary libraries
2 | import mlflow
3 | import mlflow.sklearn
4 | import joblib
5 |
6 | # Loading serialized model
7 | model = joblib.load('model/model.pkl')
8 |
9 | # Logging model to MLFlow
10 | mlflow.sklearn.log_model(model, 'model')
11 |
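Note that mlflow.sklearn.log_model implicitly starts a run when none is active. To make the run context explicit, the call can be wrapped in mlflow.start_run(); a minimal sketch of the same logging:

    # Sketch: the same logging as above, but with an explicit run context
    import joblib
    import mlflow
    import mlflow.sklearn

    model = joblib.load('model/model.pkl')

    with mlflow.start_run():
        mlflow.sklearn.log_model(model, 'model')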
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlflow-wine.py:
--------------------------------------------------------------------------------
1 | # Importing in necessary libraries
2 | import pandas as pd
3 | from sklearn.model_selection import train_test_split
4 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
5 | from sklearn.linear_model import ElasticNet
6 | import mlflow
7 | import mlflow.sklearn
8 |
9 | # Loading data and prepping for training
10 | df_wine = pd.read_csv('../data/wine/train.csv')
11 |
12 | X = df_wine.drop(columns = 'quality')
13 | y = df_wine[['quality']]
14 |
15 | X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)
16 |
17 | # Defining model parameters
18 | alpha = 1
19 | l1_ratio = 1
20 |
21 | # Starting an MLFlow run
22 | with mlflow.start_run():
23 |
24 | # Instantiating model with model parameters
25 | model = ElasticNet(alpha = alpha,
26 | l1_ratio = l1_ratio)
27 |
28 | # Fitting training data to the model
29 | model.fit(X_train, y_train)
30 |
31 | # Running prediction on validation dataset
32 | preds = model.predict(X_val)
33 |
34 |     # Getting metrics on the validation dataset (sklearn metrics take y_true first)
35 |     rmse = mean_squared_error(y_val, preds, squared = False)
36 |     abs_error = mean_absolute_error(y_val, preds)
37 |     r2 = r2_score(y_val, preds)
38 |
39 | # Logging params and metrics to MLFlow
40 | mlflow.log_param('alpha', alpha)
41 | mlflow.log_param('l1_ratio', l1_ratio)
42 | mlflow.log_metric('rmse', rmse)
43 | mlflow.log_metric('abs_error', abs_error)
44 | mlflow.log_metric('r2', r2)
45 |
46 | # Logging model to MLFlow
47 | mlflow.sklearn.log_model(model, 'model')
48 |
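Running this script writes a new folder under mlruns/0 keyed by the run ID, holding the meta.yaml, params/, metrics/, and tags/ files and the model artifact dumped below. Each file under metrics/ stores one whitespace-separated (timestamp_ms, value, step) record per logged value; a minimal parser sketch:

    # Sketch: parse an MLflow file-store metric file, e.g.
    # mlruns/0/<run_id>/metrics/rmse, where each line is "<timestamp_ms> <value> <step>"
    def read_metric(path):
        points = []
        with open(path) as f:
            for line in f:
                timestamp_ms, value, step = line.split()
                points.append((int(timestamp_ms), float(value), int(step)))
        return points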
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlflow_wine_notebook.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Science Quick Tips #008: MLFlow Part 1 - Getting Started with MLFlow!\n",
8 |     "In this first post in our sub-series on MLFlow, we're going to ease in by simply getting up and running with MLFlow. To keep the modeling itself simple, we'll use the Red Wine Quality dataset instead of the Titanic dataset we've used in other posts. The code here is the same as what you'll find in mlflow-wine.py; the only difference is that when you launch the MLFlow UI, you'll see that the run's recorded source is different."
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "# Importing in necessary libraries\n",
18 | "import pandas as pd\n",
19 | "\n",
20 | "from sklearn.model_selection import train_test_split\n",
21 | "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n",
22 | "from sklearn.linear_model import ElasticNet\n",
23 | "\n",
24 | "import mlflow\n",
25 | "import mlflow.sklearn"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "# Loading data and prepping for training\n",
35 | "df_wine = pd.read_csv('../data/wine/train.csv')\n",
36 | "\n",
37 | "X = df_wine.drop(columns = 'quality')\n",
38 | "y = df_wine[['quality']]\n",
39 | "\n",
40 | "X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 3,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "# Defining model parameters\n",
50 | "alpha = 1\n",
51 | "l1_ratio = 0.5"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 4,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 |     "# Starting an MLFlow run\n",
61 | "with mlflow.start_run():\n",
62 | " \n",
63 | " # Instantiating model with model parameters\n",
64 | " model = ElasticNet(alpha = alpha,\n",
65 | " l1_ratio = l1_ratio)\n",
66 | " \n",
67 | " # Fitting training data to the model\n",
68 | " model.fit(X_train, y_train)\n",
69 | " \n",
70 | " # Running prediction on validation dataset\n",
71 | " preds = model.predict(X_val)\n",
72 | " \n",
73 |     "    # Getting metrics on the validation dataset (sklearn metrics take y_true first)\n",
74 |     "    rmse = mean_squared_error(y_val, preds, squared = False)\n",
75 |     "    abs_error = mean_absolute_error(y_val, preds)\n",
76 |     "    r2 = r2_score(y_val, preds)\n",
77 | " \n",
78 | " # Logging params and metrics to MLFlow\n",
79 | " mlflow.log_param('alpha', alpha)\n",
80 | " mlflow.log_param('l1_ratio', l1_ratio)\n",
81 | " mlflow.log_metric('rmse', rmse)\n",
82 | " mlflow.log_metric('abs_error', abs_error)\n",
83 | " mlflow.log_metric('r2', r2)\n",
84 | " \n",
85 | " # Logging model to MLFlow\n",
86 | " mlflow.sklearn.log_model(model, 'model')"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 6,
92 | "metadata": {},
93 | "outputs": [
94 | {
95 | "data": {
96 | "text/plain": [
97 | "['model/model.pkl']"
98 | ]
99 | },
100 | "execution_count": 6,
101 | "metadata": {},
102 | "output_type": "execute_result"
103 | }
104 | ],
105 | "source": [
106 | "import joblib\n",
107 | "joblib.dump(model, 'model/model.pkl')"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {},
114 | "outputs": [],
115 | "source": []
116 | }
117 | ],
118 | "metadata": {
119 | "kernelspec": {
120 | "display_name": "Python 3",
121 | "language": "python",
122 | "name": "python3"
123 | },
124 | "language_info": {
125 | "codemirror_mode": {
126 | "name": "ipython",
127 | "version": 3
128 | },
129 | "file_extension": ".py",
130 | "mimetype": "text/x-python",
131 | "name": "python",
132 | "nbconvert_exporter": "python",
133 | "pygments_lexer": "ipython3",
134 | "version": "3.7.6"
135 | }
136 | },
137 | "nbformat": 4,
138 | "nbformat_minor": 2
139 | }
140 |
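As the markdown cell notes, the easiest way to compare these runs is the MLFlow UI launched from this directory. The same information can also be pulled programmatically; a minimal sketch, assuming a reasonably recent MLFlow and that it's run from 008_mlflow_getting_started so the mlruns folder below is picked up:

    # Sketch: query the runs recorded under ./mlruns
    # (MLFlow's default local tracking location is the mlruns folder
    # in the current working directory)
    import mlflow

    # One row per run, with logged params and metrics flattened into columns
    runs = mlflow.search_runs(experiment_ids = ['0'])
    print(runs[['run_id', 'params.alpha', 'params.l1_ratio', 'metrics.rmse', 'metrics.r2']])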
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/artifacts/model/MLmodel:
--------------------------------------------------------------------------------
1 | artifact_path: model
2 | flavors:
3 | python_function:
4 | env: conda.yaml
5 | loader_module: mlflow.sklearn
6 | model_path: model.pkl
7 | python_version: 3.7.6
8 | sklearn:
9 | pickled_model: model.pkl
10 | serialization_format: cloudpickle
11 | sklearn_version: 0.22.2.post1
12 | run_id: 09fa7bc156ff4d59b4b00b8fdbe84728
13 | utc_time_created: '2020-09-26 23:40:11.580877'
14 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/artifacts/model/conda.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - defaults
3 | - conda-forge
4 | dependencies:
5 | - python=3.7.6
6 | - scikit-learn=0.22.2.post1
7 | - pip
8 | - pip:
9 | - mlflow
10 | - cloudpickle==1.2.2
11 | name: mlflow-env
12 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/artifacts/model/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/artifacts/model/model.pkl
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/meta.yaml:
--------------------------------------------------------------------------------
1 | artifact_uri: file:///Users/dkhundley/Documents/Repositories/ds-quick-tips/008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/artifacts
2 | end_time: 1601163611593
3 | entry_point_name: ''
4 | experiment_id: '0'
5 | lifecycle_stage: active
6 | name: ''
7 | run_id: 09fa7bc156ff4d59b4b00b8fdbe84728
8 | run_uuid: 09fa7bc156ff4d59b4b00b8fdbe84728
9 | source_name: ''
10 | source_type: 4
11 | source_version: ''
12 | start_time: 1601163611321
13 | status: 3
14 | tags: []
15 | user_id: dkhundley
16 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/metrics/abs_error:
--------------------------------------------------------------------------------
1 | 1601163611578 0.6442845590438651 0
2 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/metrics/r2:
--------------------------------------------------------------------------------
1 | 1601163611579 -25.56867782599562 0
2 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/metrics/rmse:
--------------------------------------------------------------------------------
1 | 1601163611576 0.6163736624975248 0
2 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/params/alpha:
--------------------------------------------------------------------------------
1 | 1
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/params/l1_ratio:
--------------------------------------------------------------------------------
1 | 0.5
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/tags/mlflow.log-model.history:
--------------------------------------------------------------------------------
1 | [{"run_id": "09fa7bc156ff4d59b4b00b8fdbe84728", "artifact_path": "model", "utc_time_created": "2020-09-26 23:40:11.580877", "flavors": {"python_function": {"model_path": "model.pkl", "loader_module": "mlflow.sklearn", "python_version": "3.7.6", "env": "conda.yaml"}, "sklearn": {"pickled_model": "model.pkl", "sklearn_version": "0.22.2.post1", "serialization_format": "cloudpickle"}}}]
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/tags/mlflow.source.name:
--------------------------------------------------------------------------------
1 | /Users/dkhundley/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/tags/mlflow.source.type:
--------------------------------------------------------------------------------
1 | LOCAL
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/09fa7bc156ff4d59b4b00b8fdbe84728/tags/mlflow.user:
--------------------------------------------------------------------------------
1 | dkhundley
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/artifacts/model/MLmodel:
--------------------------------------------------------------------------------
1 | artifact_path: model
2 | flavors:
3 | python_function:
4 | env: conda.yaml
5 | loader_module: mlflow.sklearn
6 | model_path: model.pkl
7 | python_version: 3.7.6
8 | sklearn:
9 | pickled_model: model.pkl
10 | serialization_format: cloudpickle
11 | sklearn_version: 0.22.2.post1
12 | run_id: 27398db7fb544a269a0c85ec637bbab9
13 | utc_time_created: '2020-09-29 23:05:32.886408'
14 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/artifacts/model/conda.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - defaults
3 | - conda-forge
4 | dependencies:
5 | - python=3.7.6
6 | - scikit-learn=0.22.2.post1
7 | - pip
8 | - pip:
9 | - mlflow
10 | - cloudpickle==1.2.2
11 | name: mlflow-env
12 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/artifacts/model/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/artifacts/model/model.pkl
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/meta.yaml:
--------------------------------------------------------------------------------
1 | artifact_uri: file:///Users/dkhundley/Documents/Repositories/ds-quick-tips/008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/artifacts
2 | end_time: 1601420732900
3 | entry_point_name: ''
4 | experiment_id: '0'
5 | lifecycle_stage: active
6 | name: ''
7 | run_id: 27398db7fb544a269a0c85ec637bbab9
8 | run_uuid: 27398db7fb544a269a0c85ec637bbab9
9 | source_name: ''
10 | source_type: 4
11 | source_version: ''
12 | start_time: 1601420732876
13 | status: 3
14 | tags: []
15 | user_id: dkhundley
16 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/tags/mlflow.log-model.history:
--------------------------------------------------------------------------------
1 | [{"run_id": "27398db7fb544a269a0c85ec637bbab9", "artifact_path": "model", "utc_time_created": "2020-09-29 23:05:32.886408", "flavors": {"python_function": {"model_path": "model.pkl", "loader_module": "mlflow.sklearn", "python_version": "3.7.6", "env": "conda.yaml"}, "sklearn": {"pickled_model": "model.pkl", "sklearn_version": "0.22.2.post1", "serialization_format": "cloudpickle"}}}]
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/tags/mlflow.source.git.commit:
--------------------------------------------------------------------------------
1 | 061a26344091a53ae58beaf1efe36cce697843a2
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/tags/mlflow.source.name:
--------------------------------------------------------------------------------
1 | mlflow-existing-model.py
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/tags/mlflow.source.type:
--------------------------------------------------------------------------------
1 | LOCAL
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/27398db7fb544a269a0c85ec637bbab9/tags/mlflow.user:
--------------------------------------------------------------------------------
1 | dkhundley
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/artifacts/model/MLmodel:
--------------------------------------------------------------------------------
1 | artifact_path: model
2 | flavors:
3 | python_function:
4 | env: conda.yaml
5 | loader_module: mlflow.sklearn
6 | model_path: model.pkl
7 | python_version: 3.7.6
8 | sklearn:
9 | pickled_model: model.pkl
10 | serialization_format: cloudpickle
11 | sklearn_version: 0.22.2.post1
12 | run_id: 5a2bf3f0cb504b40ac9cd9a70af32ac6
13 | utc_time_created: '2020-09-29 23:02:21.478675'
14 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/artifacts/model/conda.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - defaults
3 | - conda-forge
4 | dependencies:
5 | - python=3.7.6
6 | - scikit-learn=0.22.2.post1
7 | - pip
8 | - pip:
9 | - mlflow
10 | - cloudpickle==1.2.2
11 | name: mlflow-env
12 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/artifacts/model/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/artifacts/model/model.pkl
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/meta.yaml:
--------------------------------------------------------------------------------
1 | artifact_uri: file:///Users/dkhundley/Documents/Repositories/ds-quick-tips/008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/artifacts
2 | end_time: 1601420541495
3 | entry_point_name: ''
4 | experiment_id: '0'
5 | lifecycle_stage: active
6 | name: ''
7 | run_id: 5a2bf3f0cb504b40ac9cd9a70af32ac6
8 | run_uuid: 5a2bf3f0cb504b40ac9cd9a70af32ac6
9 | source_name: ''
10 | source_type: 4
11 | source_version: ''
12 | start_time: 1601420540829
13 | status: 3
14 | tags: []
15 | user_id: dkhundley
16 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/metrics/abs_error:
--------------------------------------------------------------------------------
1 | 1601420541476 0.6442845590438651 0
2 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/metrics/r2:
--------------------------------------------------------------------------------
1 | 1601420541477 -25.56867782599562 0
2 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/metrics/rmse:
--------------------------------------------------------------------------------
1 | 1601420541475 0.6163736624975248 0
2 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/params/alpha:
--------------------------------------------------------------------------------
1 | 1
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/params/l1_ratio:
--------------------------------------------------------------------------------
1 | 0.5
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/tags/mlflow.log-model.history:
--------------------------------------------------------------------------------
1 | [{"run_id": "5a2bf3f0cb504b40ac9cd9a70af32ac6", "artifact_path": "model", "utc_time_created": "2020-09-29 23:02:21.478675", "flavors": {"python_function": {"model_path": "model.pkl", "loader_module": "mlflow.sklearn", "python_version": "3.7.6", "env": "conda.yaml"}, "sklearn": {"pickled_model": "model.pkl", "sklearn_version": "0.22.2.post1", "serialization_format": "cloudpickle"}}}]
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/tags/mlflow.source.name:
--------------------------------------------------------------------------------
1 | /Users/dkhundley/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/tags/mlflow.source.type:
--------------------------------------------------------------------------------
1 | LOCAL
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/5a2bf3f0cb504b40ac9cd9a70af32ac6/tags/mlflow.user:
--------------------------------------------------------------------------------
1 | dkhundley
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/artifacts/model/MLmodel:
--------------------------------------------------------------------------------
1 | artifact_path: model
2 | flavors:
3 | python_function:
4 | env: conda.yaml
5 | loader_module: mlflow.sklearn
6 | model_path: model.pkl
7 | python_version: 3.7.6
8 | sklearn:
9 | pickled_model: model.pkl
10 | serialization_format: cloudpickle
11 | sklearn_version: 0.22.2.post1
12 | run_id: 93cfbd77d77f4e308297d9a47ea3abd6
13 | utc_time_created: '2020-09-26 23:48:14.718142'
14 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/artifacts/model/conda.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - defaults
3 | - conda-forge
4 | dependencies:
5 | - python=3.7.6
6 | - scikit-learn=0.22.2.post1
7 | - pip
8 | - pip:
9 | - mlflow
10 | - cloudpickle==1.2.2
11 | name: mlflow-env
12 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/artifacts/model/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/artifacts/model/model.pkl
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/meta.yaml:
--------------------------------------------------------------------------------
1 | artifact_uri: file:///Users/dkhundley/Documents/Repositories/ds-quick-tips/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/artifacts
2 | end_time: 1601164094731
3 | entry_point_name: ''
4 | experiment_id: '0'
5 | lifecycle_stage: deleted
6 | name: ''
7 | run_id: 93cfbd77d77f4e308297d9a47ea3abd6
8 | run_uuid: 93cfbd77d77f4e308297d9a47ea3abd6
9 | source_name: ''
10 | source_type: 4
11 | source_version: ''
12 | start_time: 1601164094443
13 | status: 3
14 | tags: []
15 | user_id: dkhundley
16 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/metrics/abs_error:
--------------------------------------------------------------------------------
1 | 1601164094715 0.6468353681504646 0
2 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/metrics/r2:
--------------------------------------------------------------------------------
1 | 1601164094716 -31.43732803690922 0
2 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/metrics/rmse:
--------------------------------------------------------------------------------
1 | 1601164094714 0.6150055162124933 0
2 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/params/alpha:
--------------------------------------------------------------------------------
1 | 1
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/params/l1_ratio:
--------------------------------------------------------------------------------
1 | 1
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/tags/mlflow.log-model.history:
--------------------------------------------------------------------------------
1 | [{"run_id": "93cfbd77d77f4e308297d9a47ea3abd6", "artifact_path": "model", "utc_time_created": "2020-09-26 23:48:14.718142", "flavors": {"python_function": {"model_path": "model.pkl", "loader_module": "mlflow.sklearn", "python_version": "3.7.6", "env": "conda.yaml"}, "sklearn": {"pickled_model": "model.pkl", "sklearn_version": "0.22.2.post1", "serialization_format": "cloudpickle"}}}]
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/tags/mlflow.source.git.commit:
--------------------------------------------------------------------------------
1 | a069e3387d68e43416be7a2b7626dc8d102a5079
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/tags/mlflow.source.name:
--------------------------------------------------------------------------------
1 | mlflow-wine.py
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/tags/mlflow.source.type:
--------------------------------------------------------------------------------
1 | LOCAL
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/93cfbd77d77f4e308297d9a47ea3abd6/tags/mlflow.user:
--------------------------------------------------------------------------------
1 | dkhundley
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/artifacts/model/MLmodel:
--------------------------------------------------------------------------------
1 | artifact_path: model
2 | flavors:
3 | python_function:
4 | env: conda.yaml
5 | loader_module: mlflow.sklearn
6 | model_path: model.pkl
7 | python_version: 3.7.6
8 | sklearn:
9 | pickled_model: model.pkl
10 | serialization_format: cloudpickle
11 | sklearn_version: 0.22.2.post1
12 | run_id: befa6150910e4724b1248ee939971dc2
13 | utc_time_created: '2020-09-26 23:40:30.180546'
14 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/artifacts/model/conda.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - defaults
3 | - conda-forge
4 | dependencies:
5 | - python=3.7.6
6 | - scikit-learn=0.22.2.post1
7 | - pip
8 | - pip:
9 | - mlflow
10 | - cloudpickle==1.2.2
11 | name: mlflow-env
12 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/artifacts/model/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/artifacts/model/model.pkl
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/meta.yaml:
--------------------------------------------------------------------------------
1 | artifact_uri: file:///Users/dkhundley/Documents/Repositories/ds-quick-tips/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/artifacts
2 | end_time: 1601163630196
3 | entry_point_name: ''
4 | experiment_id: '0'
5 | lifecycle_stage: active
6 | name: ''
7 | run_id: befa6150910e4724b1248ee939971dc2
8 | run_uuid: befa6150910e4724b1248ee939971dc2
9 | source_name: ''
10 | source_type: 4
11 | source_version: ''
12 | start_time: 1601163630009
13 | status: 3
14 | tags: []
15 | user_id: dkhundley
16 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/metrics/abs_error:
--------------------------------------------------------------------------------
1 | 1601163630177 0.6468353681504646 0
2 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/metrics/r2:
--------------------------------------------------------------------------------
1 | 1601163630178 -31.43732803690922 0
2 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/metrics/rmse:
--------------------------------------------------------------------------------
1 | 1601163630176 0.6150055162124933 0
2 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/params/alpha:
--------------------------------------------------------------------------------
1 | 1
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/params/l1_ratio:
--------------------------------------------------------------------------------
1 | 1
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/tags/mlflow.log-model.history:
--------------------------------------------------------------------------------
1 | [{"run_id": "befa6150910e4724b1248ee939971dc2", "artifact_path": "model", "utc_time_created": "2020-09-26 23:40:30.180546", "flavors": {"python_function": {"model_path": "model.pkl", "loader_module": "mlflow.sklearn", "python_version": "3.7.6", "env": "conda.yaml"}, "sklearn": {"pickled_model": "model.pkl", "sklearn_version": "0.22.2.post1", "serialization_format": "cloudpickle"}}}]
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/tags/mlflow.source.git.commit:
--------------------------------------------------------------------------------
1 | a069e3387d68e43416be7a2b7626dc8d102a5079
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/tags/mlflow.source.name:
--------------------------------------------------------------------------------
1 | mlflow-wine.py
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/tags/mlflow.source.type:
--------------------------------------------------------------------------------
1 | LOCAL
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/befa6150910e4724b1248ee939971dc2/tags/mlflow.user:
--------------------------------------------------------------------------------
1 | dkhundley
--------------------------------------------------------------------------------
/008_mlflow_getting_started/mlruns/0/meta.yaml:
--------------------------------------------------------------------------------
1 | artifact_location: file:///Users/dkhundley/Documents/Repositories/ds-quick-tips/008_mlflow_getting_started/mlruns/0
2 | experiment_id: '0'
3 | lifecycle_stage: active
4 | name: Default
5 |
--------------------------------------------------------------------------------
/008_mlflow_getting_started/model/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/008_mlflow_getting_started/model/model.pkl
--------------------------------------------------------------------------------
/009_mlflow_tracking_server/Dockerfile:
--------------------------------------------------------------------------------
1 | # Defining base image
2 | FROM python:3.8.2-slim
3 |
4 | # Installing packages from PyPI
5 | RUN pip install mlflow[extras]==1.9.1 && \
6 | pip install psycopg2-binary==2.8.5 && \
7 | pip install boto3==1.15.16
8 |
9 | # Exposing the server port and defining the start-up command
10 | EXPOSE 5000
11 | ENTRYPOINT ["mlflow", "server"]
12 |
--------------------------------------------------------------------------------
/009_mlflow_tracking_server/k8s/mlflow_deployment.yaml:
--------------------------------------------------------------------------------
1 | # Creating MLflow deployment
2 | apiVersion: apps/v1
3 | kind: Deployment
4 | metadata:
5 | name: mlflow-deployment
6 | spec:
7 | replicas: 1
8 | selector:
9 | matchLabels:
10 | app: mlflow-deployment
11 | template:
12 | metadata:
13 | labels:
14 | app: mlflow-deployment
15 | spec:
16 | containers:
17 | - name: mlflow-deployment
18 | image: dkhundley/mlflow-server:1.0.3
19 | imagePullPolicy: Always
20 | args:
21 | - --host=0.0.0.0
22 | - --port=5000
23 | - --backend-store-uri=postgresql://mlflow_user:mlflow_pwd@10.109.74.95:5432/mlflow_db
24 | - --default-artifact-root=s3://mlflow/
25 | - --workers=2
26 | env:
27 | - name: MLFLOW_S3_ENDPOINT_URL
28 | value: http://10.111.110.13:9000/
29 | - name: AWS_ACCESS_KEY_ID
30 | value: "minio"
31 | - name: AWS_SECRET_ACCESS_KEY
32 | value: "minio123"
33 | ports:
34 | - name: http
35 | containerPort: 5000
36 | protocol: TCP
37 | resources:
38 | requests:
39 | cpu: "500m"
40 | ---
41 | apiVersion: v1
42 | kind: Service
43 | metadata:
44 | name: mlflow-service
45 | spec:
46 | type: NodePort
47 | ports:
48 | - port: 5000
49 | targetPort: 5000
50 | protocol: TCP
51 | name: http
52 | selector:
53 | app: mlflow-deployment
54 | ---
55 | apiVersion: networking.k8s.io/v1beta1
56 | kind: Ingress
57 | metadata:
58 | name: mlflow-ingress
59 | annotations:
60 | kubernetes.io/ingress.class: nginx
61 |     nginx.ingress.kubernetes.io/add-base-url: "true"
62 | spec:
63 | rules:
64 | - host: mlflow-server.local
65 | http:
66 | paths:
67 | - backend:
68 | serviceName: mlflow-service
69 | servicePort: 5000
70 | path: /
71 |
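Once these manifests are applied, a training client outside the cluster can log to the server through the ingress host. A minimal sketch, assuming mlflow-server.local and mlflow-minio.local (from the MinIO manifest below) resolve to the cluster's ingress controller; note that artifact uploads go from the client straight to MinIO, so the client needs the same credentials the server was given:

    # Sketch: point an MLflow client at the tracking server deployed above
    import os
    import mlflow

    # Client-side artifact store settings (the MinIO credentials match the
    # deployment's env; the endpoint here assumes the MinIO ingress host)
    os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://mlflow-minio.local/'
    os.environ['AWS_ACCESS_KEY_ID'] = 'minio'
    os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'

    mlflow.set_tracking_uri('http://mlflow-server.local')

    with mlflow.start_run():
        mlflow.log_param('alpha', 1)      # example values only
        mlflow.log_metric('rmse', 0.62)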
--------------------------------------------------------------------------------
/009_mlflow_tracking_server/k8s/mlflow_minio.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: mlflow-minio
5 | spec:
6 | selector:
7 | matchLabels:
8 | app: mlflow-minio
9 | template:
10 | metadata:
11 | labels:
12 | app: mlflow-minio
13 | spec:
14 | volumes:
15 | - name: mlflow-pvc
16 | persistentVolumeClaim:
17 | claimName: mlflow-pvc
18 | containers:
19 | - name: mlflow-minio
20 | image: minio/minio:latest
21 | args:
22 | - server
23 | - /data
24 | volumeMounts:
25 | - name: mlflow-pvc
26 | mountPath: '/data'
27 | env:
28 | - name: MINIO_ACCESS_KEY
29 | value: "minio"
30 | - name: MINIO_SECRET_KEY
31 | value: "minio123"
32 | ports:
33 | - containerPort: 9000
34 | ---
35 | apiVersion: v1
36 | kind: Service
37 | metadata:
38 | name: mlflow-minio-service
39 | spec:
40 | type: NodePort
41 | ports:
42 | - port: 9000
43 | targetPort: 9000
44 | protocol: TCP
45 | selector:
46 | app: mlflow-minio
47 | ---
48 | apiVersion: networking.k8s.io/v1beta1
49 | kind: Ingress
50 | metadata:
51 | name: mlflow-minio-ingress
52 | annotations:
53 | kubernetes.io/ingress.class: nginx
54 |     nginx.ingress.kubernetes.io/add-base-url: "true"
55 | nginx.ingress.kubernetes.io/ssl-redirect: "false"
56 | spec:
57 | rules:
58 | - host: mlflow-minio.local
59 | http:
60 | paths:
61 | - backend:
62 | serviceName: mlflow-minio-service
63 | servicePort: 9000
64 | path: /
65 | ---
66 | apiVersion: v1
67 | kind: PersistentVolumeClaim
68 | metadata:
69 | name: mlflow-pvc
70 | spec:
71 | accessModes:
72 | - ReadWriteMany
73 | resources:
74 | requests:
75 | storage: 100Mi
76 |
--------------------------------------------------------------------------------
/009_mlflow_tracking_server/k8s/mlflow_postgres.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap
3 | metadata:
4 | name: mlflow-postgres-config
5 | labels:
6 | app: mlflow-postgres
7 | data:
8 | POSTGRES_DB: mlflow_db
9 | POSTGRES_USER: mlflow_user
10 | POSTGRES_PASSWORD: mlflow_pwd
11 | PGDATA: /var/lib/postgresql/mlflow/data
12 | ---
13 | apiVersion: apps/v1
14 | kind: StatefulSet
15 | metadata:
16 | name: mlflow-postgres
17 | labels:
18 | app: mlflow-postgres
19 | spec:
20 | selector:
21 | matchLabels:
22 | app: mlflow-postgres
23 | serviceName: "mlflow-postgres-service"
24 | replicas: 1
25 | template:
26 | metadata:
27 | labels:
28 | app: mlflow-postgres
29 | spec:
30 | containers:
31 | - name: mlflow-postgres
32 | image: postgres:11
33 | ports:
34 | - containerPort: 5432
35 | protocol: TCP
36 | envFrom:
37 | - configMapRef:
38 | name: mlflow-postgres-config
39 | resources:
40 | requests:
41 | memory: "1Gi"
42 | cpu: "500m"
43 | volumeMounts:
44 | - name: mlflow-pvc
45 | mountPath: /var/lib/postgresql/mlflow
46 | volumeClaimTemplates:
47 | - metadata:
48 | name: mlflow-pvc
49 | spec:
50 | accessModes: [ "ReadWriteOnce" ]
51 | resources:
52 | requests:
53 | storage: 100Mi
54 | ---
55 | apiVersion: v1
56 | kind: Service
57 | metadata:
58 | name: mlflow-postgres-service
59 | labels:
60 | svc: mlflow-postgres-service
61 | spec:
62 | type: NodePort
63 | ports:
64 | - port: 5432
65 | targetPort: 5432
66 | protocol: TCP
67 | selector:
68 | app: mlflow-postgres
69 |
--------------------------------------------------------------------------------
/010_mlflow_logging_to_server/.ipynb_checkpoints/Untitled-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Importing in necessary libraries\n",
10 | "import pandas as pd\n",
11 | "from sklearn.model_selection import train_test_split\n",
12 | "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n",
13 | "from sklearn.linear_model import ElasticNet\n",
14 | "import mlflow\n",
15 | "import mlflow.sklearn"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 2,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "# Setting tracking URI\n",
25 | "mlflow.set_tracking_uri('http://mlflow-server.local')"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "# Loading data and prepping for training\n",
35 | "df_wine = pd.read_csv('../data/wine/train.csv')\n",
36 | "\n",
37 | "X = df_wine.drop(columns = 'quality')\n",
38 | "y = df_wine[['quality']]\n",
39 | "\n",
40 | "X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 4,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "# Defining model parameters\n",
50 | "alpha = 1\n",
51 | "l1_ratio = 1"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 6,
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "ename": "PermissionError",
61 | "evalue": "[Errno 13] Permission denied: '/opt/mlflow'",
62 | "output_type": "error",
63 | "traceback": [
64 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
65 | "\u001b[0;31mPermissionError\u001b[0m Traceback (most recent call last)",
66 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;31m# Logging model to MLFlow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m \u001b[0mmlflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'model'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
67 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/site-packages/mlflow/sklearn/__init__.py\u001b[0m in \u001b[0;36mlog_model\u001b[0;34m(sk_model, artifact_path, conda_env, serialization_format, registered_model_name, signature, input_example)\u001b[0m\n\u001b[1;32m 296\u001b[0m \u001b[0mregistered_model_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mregistered_model_name\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 297\u001b[0m \u001b[0msignature\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msignature\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 298\u001b[0;31m \u001b[0minput_example\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minput_example\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 299\u001b[0m )\n\u001b[1;32m 300\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
68 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/site-packages/mlflow/models/model.py\u001b[0m in \u001b[0;36mlog\u001b[0;34m(cls, artifact_path, flavor, registered_model_name, **kwargs)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0mmlflow_model\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0martifact_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0martifact_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrun_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0mflavor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlocal_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmlflow_model\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmlflow_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 161\u001b[0;31m \u001b[0mmlflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtracking\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfluent\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog_artifacts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlocal_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0martifact_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 162\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0mmlflow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtracking\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfluent\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_record_logged_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmlflow_model\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
69 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/site-packages/mlflow/tracking/fluent.py\u001b[0m in \u001b[0;36mlog_artifacts\u001b[0;34m(local_dir, artifact_path)\u001b[0m\n\u001b[1;32m 326\u001b[0m \"\"\"\n\u001b[1;32m 327\u001b[0m \u001b[0mrun_id\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_or_start_run\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_id\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 328\u001b[0;31m \u001b[0mMlflowClient\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog_artifacts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlocal_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0martifact_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 329\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 330\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
70 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/site-packages/mlflow/tracking/client.py\u001b[0m in \u001b[0;36mlog_artifacts\u001b[0;34m(self, run_id, local_dir, artifact_path)\u001b[0m\n\u001b[1;32m 288\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0mparam\u001b[0m \u001b[0martifact_path\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mIf\u001b[0m \u001b[0mprovided\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mdirectory\u001b[0m \u001b[0;32min\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0martifact_uri\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m \u001b[0mto\u001b[0m \u001b[0mwrite\u001b[0m \u001b[0mto\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 289\u001b[0m \"\"\"\n\u001b[0;32m--> 290\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_tracking_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog_artifacts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlocal_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0martifact_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 291\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 292\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_record_logged_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmlflow_model\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
71 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/site-packages/mlflow/tracking/_tracking_service/client.py\u001b[0m in \u001b[0;36mlog_artifacts\u001b[0;34m(self, run_id, local_dir, artifact_path)\u001b[0m\n\u001b[1;32m 286\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0mparam\u001b[0m \u001b[0martifact_path\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mIf\u001b[0m \u001b[0mprovided\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mdirectory\u001b[0m \u001b[0;32min\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0martifact_uri\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m \u001b[0mto\u001b[0m \u001b[0mwrite\u001b[0m \u001b[0mto\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 287\u001b[0m \"\"\"\n\u001b[0;32m--> 288\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_artifact_repo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlog_artifacts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlocal_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0martifact_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 289\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 290\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mlist_artifacts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
72 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/site-packages/mlflow/store/artifact/local_artifact_repo.py\u001b[0m in \u001b[0;36mlog_artifacts\u001b[0;34m(self, local_dir, artifact_path)\u001b[0m\n\u001b[1;32m 55\u001b[0m )\n\u001b[1;32m 56\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0martifact_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 57\u001b[0;31m \u001b[0mmkdir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0martifact_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 58\u001b[0m \u001b[0mdir_util\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy_tree\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlocal_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdst\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0martifact_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpreserve_mode\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpreserve_times\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
73 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/site-packages/mlflow/utils/file_utils.py\u001b[0m in \u001b[0;36mmkdir\u001b[0;34m(root, name)\u001b[0m\n\u001b[1;32m 109\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 110\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrno\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0merrno\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEEXIST\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misdir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 111\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 112\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
74 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/site-packages/mlflow/utils/file_utils.py\u001b[0m in \u001b[0;36mmkdir\u001b[0;34m(root, name)\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[0mtarget\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mroot\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mroot\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 108\u001b[0;31m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmakedirs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 109\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 110\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrno\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0merrno\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEEXIST\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misdir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
75 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/os.py\u001b[0m in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhead\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mtail\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0mmakedirs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexist_ok\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mexist_ok\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mFileExistsError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;31m# Defeats race condition when another thread created the path\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
76 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/os.py\u001b[0m in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhead\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mtail\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0mmakedirs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexist_ok\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mexist_ok\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mFileExistsError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;31m# Defeats race condition when another thread created the path\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
77 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/os.py\u001b[0m in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhead\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mtail\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0mmakedirs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexist_ok\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mexist_ok\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mFileExistsError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;31m# Defeats race condition when another thread created the path\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
78 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/os.py\u001b[0m in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhead\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mtail\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0mmakedirs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexist_ok\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mexist_ok\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mFileExistsError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;31m# Defeats race condition when another thread created the path\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
79 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/os.py\u001b[0m in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhead\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mtail\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0mmakedirs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexist_ok\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mexist_ok\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mFileExistsError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;31m# Defeats race condition when another thread created the path\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
80 | "\u001b[0;32m~/opt/anaconda3/lib/python3.7/os.py\u001b[0m in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 220\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 221\u001b[0;31m \u001b[0mmkdir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 222\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 223\u001b[0m \u001b[0;31m# Cannot rely on checking for EEXIST, since the operating system\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
81 | "\u001b[0;31mPermissionError\u001b[0m: [Errno 13] Permission denied: '/opt/mlflow'"
82 | ]
83 | }
84 | ],
85 | "source": [
86 | "# Running MLFlow script\n",
87 | "with mlflow.start_run():\n",
88 | "\n",
89 | " # Instantiating model with model parameters\n",
90 | " model = ElasticNet(alpha = alpha,\n",
91 | " l1_ratio = l1_ratio)\n",
92 | "\n",
93 | " # Fitting training data to the model\n",
94 | " model.fit(X_train, y_train)\n",
95 | "\n",
96 | " # Running prediction on validation dataset\n",
97 | " preds = model.predict(X_val)\n",
98 | "\n",
99 | " # Getting metrics on the validation dataset\n",
100 | "    rmse = mean_squared_error(y_val, preds, squared = False)\n",
101 | "    abs_error = mean_absolute_error(y_val, preds)\n",
102 | "    r2 = r2_score(y_val, preds)\n",
103 | "\n",
104 | " # Logging params and metrics to MLFlow\n",
105 | " mlflow.log_param('alpha', alpha)\n",
106 | " mlflow.log_param('l1_ratio', l1_ratio)\n",
107 | " mlflow.log_metric('rmse', rmse)\n",
108 | " mlflow.log_metric('abs_error', abs_error)\n",
109 | " mlflow.log_metric('r2', r2)\n",
110 | "\n",
111 | " # Logging model to MLFlow\n",
112 | "    mlflow.sklearn.log_model(sk_model = model, artifact_path = 'model')"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": []
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": []
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": []
135 | }
136 | ],
137 | "metadata": {
138 | "kernelspec": {
139 | "display_name": "Python 3",
140 | "language": "python",
141 | "name": "python3"
142 | },
143 | "language_info": {
144 | "codemirror_mode": {
145 | "name": "ipython",
146 | "version": 3
147 | },
148 | "file_extension": ".py",
149 | "mimetype": "text/x-python",
150 | "name": "python",
151 | "nbconvert_exporter": "python",
152 | "pygments_lexer": "ipython3",
153 | "version": "3.7.6"
154 | }
155 | },
156 | "nbformat": 4,
157 | "nbformat_minor": 2
158 | }
159 |
--------------------------------------------------------------------------------
/010_mlflow_logging_to_server/mlflow-wine.py:
--------------------------------------------------------------------------------
1 | # Importing in necessary libraries
2 | import os
3 | import pandas as pd
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
6 | from sklearn.linear_model import ElasticNet
7 | import mlflow
8 | import mlflow.sklearn
9 |
10 |
11 |
12 | # PROJECT SETUP
13 | # ------------------------------------------------------------------------------
14 | # Setting the MLflow tracking server
15 | mlflow.set_tracking_uri('http://mlflow-server.local')
16 |
17 | # Setting the required environment variables
18 | os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://mlflow-minio.local/'
19 | os.environ['AWS_ACCESS_KEY_ID'] = 'minio'
20 | os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'
21 |
22 | # Loading data from a CSV file
23 | df_wine = pd.read_csv('../data/wine/train.csv')
24 |
25 | # Separating the target class ('quality') from remainder of the training data
26 | X = df_wine.drop(columns = 'quality')
27 | y = df_wine[['quality']]
28 |
29 | # Splitting the data into training and validation sets
30 | X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)
31 |
32 |
33 |
34 |
35 | # MODEL TRAINING AND LOGGING
36 | # ------------------------------------------------------------------------------
37 | # Defining model parameters
38 | alpha = 1
39 | l1_ratio = 1
40 |
41 | # Running MLFlow script
42 | with mlflow.start_run():
43 |
44 | # Instantiating model with model parameters
45 | model = ElasticNet(alpha = alpha,
46 | l1_ratio = l1_ratio)
47 |
48 | # Fitting training data to the model
49 | model.fit(X_train, y_train)
50 |
51 | # Running prediction on validation dataset
52 | preds = model.predict(X_val)
53 |
54 | # Getting metrics on the validation dataset
55 | rmse = mean_squared_error(y_val, preds, squared = False)
56 | abs_error = mean_absolute_error(y_val, preds)
57 | r2 = r2_score(y_val, preds)
58 |
59 | # Logging params and metrics to MLFlow
60 | mlflow.log_param('alpha', alpha)
61 | mlflow.log_param('l1_ratio', l1_ratio)
62 | mlflow.log_metric('rmse', rmse)
63 | mlflow.log_metric('abs_error', abs_error)
64 | mlflow.log_metric('r2', r2)
65 |
66 | # Logging training data
67 | mlflow.log_artifact(local_path = '../data/wine/train.csv')
68 |
69 | # Logging training code
70 | mlflow.log_artifact(local_path = './mlflow-wine.py')
71 |
72 | # Logging model to MLFlow
73 | mlflow.sklearn.log_model(sk_model = model,
74 | artifact_path = 'wine-pyfile-model',
75 | registered_model_name = 'wine-pyfile-model')
76 |
--------------------------------------------------------------------------------
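As an illustrative aside (not a file in this repository), the model that
mlflow-wine.py registers under 'wine-pyfile-model' could later be pulled
straight from the model registry with MLflow's standard models:/<name>/<version>
URI scheme, assuming the same tracking server and MinIO credentials as above:

import os
import mlflow
import mlflow.sklearn

# Pointing at the same tracking server and artifact store as mlflow-wine.py
mlflow.set_tracking_uri('http://mlflow-server.local')
os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://mlflow-minio.local/'
os.environ['AWS_ACCESS_KEY_ID'] = 'minio'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'

# Loading version 1 of the registered model through the registry URI
model = mlflow.sklearn.load_model(model_uri = 'models:/wine-pyfile-model/1')
--------------------------------------------------------------------------------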
/011_mlflow_interacting_with_client/MLflow_client_interaction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Importing our required libraries\n",
10 | "import mlflow\n",
11 | "import mlflow.sklearn\n",
12 | "import pandas as pd\n",
13 | "import os"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 2,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "# Setting the MLflow client\n",
23 | "client = mlflow.tracking.MlflowClient(tracking_uri = 'http://mlflow-server.local')"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 3,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "# Setting the required environment variables\n",
33 | "os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://mlflow-minio.local/'\n",
34 | "os.environ['AWS_ACCESS_KEY_ID'] = 'minio'\n",
35 | "os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 4,
41 | "metadata": {},
42 | "outputs": [
43 | {
44 | "data": {
45 | "text/plain": [
46 | "[]"
47 | ]
48 | },
49 | "execution_count": 4,
50 | "metadata": {},
51 | "output_type": "execute_result"
52 | }
53 | ],
54 | "source": [
55 | "# Listing the MLflow experiments\n",
56 | "client.list_experiments()"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 7,
62 | "metadata": {},
63 | "outputs": [
64 | {
65 | "name": "stdout",
66 | "output_type": "stream",
67 | "text": [
68 | "name=wine-pyfile-model; version=1\n",
69 | "name=wine-pyfile-model; version=2\n",
70 | "name=wine-pyfile-model; version=3\n"
71 | ]
72 | }
73 | ],
74 | "source": [
75 | "# Getting the model versions for the wine-pyfile-model\n",
76 | "results = client.search_model_versions(\"name='wine-pyfile-model'\")\n",
77 | "\n",
78 | "for res in results:\n",
79 | " print(f'name={res.name}; version={res.version}')"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 5,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "s3://mlflow/0/3a496ea82c304ea38a4ebe1281f7faf2/artifacts/wine-pyfile-model/\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "# Getting the URI for version 2 of the Wine model\n",
97 | "uri = (client.get_model_version_download_uri(name = 'wine-pyfile-model', version='2')) + '/'\n",
98 | "print(uri)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 8,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "# Loading the model using the URI above\n",
108 | "model = mlflow.sklearn.load_model(model_uri = uri)"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 9,
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "data": {
118 | "text/plain": [
119 | "ElasticNet(alpha=1, copy_X=True, fit_intercept=True, l1_ratio=1, max_iter=1000,\n",
120 | " normalize=False, positive=False, precompute=False, random_state=None,\n",
121 | " selection='cyclic', tol=0.0001, warm_start=False)"
122 | ]
123 | },
124 | "execution_count": 9,
125 | "metadata": {},
126 | "output_type": "execute_result"
127 | }
128 | ],
129 | "source": [
130 | "# Showing the model object itself\n",
131 | "model"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 11,
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "# Loading in the training data\n",
141 | "df_wine = pd.read_csv('../data/wine/train.csv')"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 12,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "# Dropping the predictor column\n",
151 | "X = df_wine.drop(columns = ['quality'])"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": 14,
157 | "metadata": {},
158 | "outputs": [
159 | {
160 | "data": {
161 | "text/plain": [
162 | "array([5.68130396, 5.5414187 , 5.59652501, ..., 5.65587028, 5.63891449,\n",
163 | " 5.64739238])"
164 | ]
165 | },
166 | "execution_count": 14,
167 | "metadata": {},
168 | "output_type": "execute_result"
169 | }
170 | ],
171 | "source": [
172 | "# Getting model predictions\n",
173 | "model.predict(X)"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {},
180 | "outputs": [],
181 | "source": []
182 | }
183 | ],
184 | "metadata": {
185 | "kernelspec": {
186 | "display_name": "Python 3",
187 | "language": "python",
188 | "name": "python3"
189 | },
190 | "language_info": {
191 | "codemirror_mode": {
192 | "name": "ipython",
193 | "version": 3
194 | },
195 | "file_extension": ".py",
196 | "mimetype": "text/x-python",
197 | "name": "python",
198 | "nbconvert_exporter": "python",
199 | "pygments_lexer": "ipython3",
200 | "version": "3.7.6"
201 | }
202 | },
203 | "nbformat": 4,
204 | "nbformat_minor": 2
205 | }
206 |
--------------------------------------------------------------------------------
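As a hedged aside (not part of the repository), the same MlflowClient used in
the notebook above can also promote one of the listed 'wine-pyfile-model'
versions to a registry stage via its transition_model_version_stage method;
the 'Staging' value below is one of MLflow's built-in stages:

import mlflow

# Reusing the client configuration from the notebook above
client = mlflow.tracking.MlflowClient(tracking_uri = 'http://mlflow-server.local')

# Promoting version 2 of the registered model to the 'Staging' stage
client.transition_model_version_stage(name = 'wine-pyfile-model',
                                      version = 2,
                                      stage = 'Staging')
--------------------------------------------------------------------------------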
/011_mlflow_interacting_with_client/mlruns/0/meta.yaml:
--------------------------------------------------------------------------------
1 | artifact_location: file:///Users/dkhundley/Documents/Repositories/ds-quick-tips/011_mlflow_interacting_with_client/mlruns/0
2 | experiment_id: '0'
3 | lifecycle_stage: active
4 | name: Default
5 |
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/Dockerfile:
--------------------------------------------------------------------------------
1 | # Starting with base image
2 | FROM python:3.8-slim-buster
3 |
4 | # Installing required packages from requirements.txt file
5 | COPY dependencies/requirements.txt /
6 | RUN pip install -r /requirements.txt
7 |
8 | # Copying the FastAPI inference script and the serialized model
9 | COPY container/ /container
10 | COPY model/ /model
11 |
12 | # Setting the working directory appropriately
13 | WORKDIR /container
14 |
15 | # Exposing the appropriate port on the container
16 | EXPOSE 5000
17 |
18 | # Setting the entrypoint for the container
19 | ENTRYPOINT ["uvicorn", "--host", "0.0.0.0", "--port", "5000", "api:api"]
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/container/api.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pickle
3 | from fastapi import FastAPI, Request
4 | from pydantic import BaseModel
5 |
6 |
7 |
8 | ## API INSTANTIATION
9 | ## ----------------------------------------------------------------
10 |
11 | # Instantiating FastAPI
12 | api = FastAPI()
13 |
14 | # Loading in model from serialized .pkl file
15 | pkl_filename = "../model/iris_model.pkl"
16 | with open(pkl_filename, 'rb') as file:
17 | lr_model = pickle.load(file)
18 |
19 | # Creating the data model for data validation
20 | class Iris(BaseModel):
21 | sepal_length: float
22 | sepal_width: float
23 | petal_length: float
24 | petal_width: float
25 |
26 |
27 |
28 | ## API ENDPOINTS
29 | ## ----------------------------------------------------------------
30 |
31 | # Defining a test root path and message
32 | @api.get('/')
33 | def root():
34 | return {'message': 'Hello friends!'}
35 |
36 |
37 |
38 | # Defining the prediction endpoint without data validation
39 | @api.post('/basic_predict')
40 | async def basic_predict(request: Request):
41 |
42 | # Getting the JSON from the body of the request
43 | input_data = await request.json()
44 |
45 | # Converting JSON to Pandas DataFrame
46 | input_df = pd.DataFrame([input_data])
47 |
48 | # Getting the prediction from the Logistic Regression model
49 | pred = lr_model.predict(input_df)[0]
50 |
51 | return pred
52 |
53 |
54 |
55 | # Defining the prediction endpoint with data validation
56 | @api.post('/predict')
57 | async def predict(iris: Iris):
58 |
59 | # Converting input data into Pandas DataFrame
60 | input_df = pd.DataFrame([iris.dict()])
61 |
62 | # Getting the prediction from the Logistic Regression model
63 | pred = lr_model.predict(input_df)[0]
64 |
65 | return pred
--------------------------------------------------------------------------------
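As an illustrative aside (not a file in the repository), the curl scripts under
tests/ could equally be written in Python with the requests library; the sketch
below posts the same record as tests/test_json/test_data.json to the validated
/predict endpoint, assuming the API is running locally via start_api.sh:

import requests

# Same record as tests/test_json/test_data.json
payload = {'sepal_length': 5.1,
           'sepal_width': 3.5,
           'petal_length': 1.4,
           'petal_width': 0.2}

# Posting to the /predict endpoint served on port 5001 by start_api.sh
response = requests.post('http://0.0.0.0:5001/predict', json = payload)
print(response.json())
--------------------------------------------------------------------------------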
/012_dockerizing_fastapi/container/start_api.sh:
--------------------------------------------------------------------------------
1 | uvicorn api:api --host 0.0.0.0 --port 5001 --reload
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/container/train.py:
--------------------------------------------------------------------------------
1 | # Importing the required Python libraries
2 | import numpy as np
3 | import pandas as pd
4 | import pickle
5 | from sklearn import datasets
6 | from sklearn.linear_model import LogisticRegression
7 |
8 | # Loading the iris dataset from Scikit-Learn
9 | iris = datasets.load_iris()
10 |
11 | # Converting the iris dataset into a Pandas DataFrame
12 | df_iris = pd.DataFrame(data = np.c_[iris['data'], iris['target']],
13 | columns = iris['feature_names'] + ['target'])
14 |
15 | # Separating the training dataset (X) from the predictor value (y)
16 | X = df_iris.drop(columns = ['target'])
17 | y = df_iris[['target']]
18 |
19 | # Instantiating a Logistic Regression (LR) model
20 | lr_model = LogisticRegression()
21 |
22 | # Fitting the dataset to the LR model
23 | lr_model.fit(X, y)
24 |
25 | # Saving the model to a serialized .pkl file
26 | pkl_filename = "../model/iris_model.pkl"
27 | with open(pkl_filename, 'wb') as file:
28 | pickle.dump(lr_model, file)
--------------------------------------------------------------------------------
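An editorial note on this training script: the accompanying notebook shows
scikit-learn emitting a DataConversionWarning (a column-vector y where a 1-D
array is expected) and a ConvergenceWarning from lbfgs. A minimal variant that
avoids both (a suggestion, not the repository's code) ravels the target and
raises max_iter:

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LogisticRegression

# Rebuilding the same iris DataFrame as train.py
iris = datasets.load_iris()
df_iris = pd.DataFrame(data = np.c_[iris['data'], iris['target']],
                       columns = iris['feature_names'] + ['target'])
X = df_iris.drop(columns = ['target'])
y = df_iris[['target']]

# ravel() hands fit() a 1-D target, avoiding the DataConversionWarning;
# max_iter = 1000 gives lbfgs room to converge, avoiding the ConvergenceWarning
lr_model = LogisticRegression(max_iter = 1000)
lr_model.fit(X, y.values.ravel())
--------------------------------------------------------------------------------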
/012_dockerizing_fastapi/dependencies/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi==0.63.0
2 | pandas==1.2.1
3 | scikit-learn==0.24.1
4 | uvicorn==0.13.4
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/k8s/deployment.yaml:
--------------------------------------------------------------------------------
1 | # Creating the deployment for the Iris FastAPI
2 | apiVersion: apps/v1
3 | kind: Deployment
4 | metadata:
5 | name: fastapi-iris
6 | labels:
7 | app: fastapi-iris
8 | spec:
9 | replicas: 1
10 | selector:
11 | matchLabels:
12 | app: fastapi-iris
13 | template:
14 | metadata:
15 | labels:
16 | app: fastapi-iris
17 | spec:
18 | containers:
19 | - name: fastapi-iris
20 | image: fastapi-iris:1.0.0
21 | ports:
22 | - containerPort: 5000
23 | resources:
24 | requests:
25 | cpu: 100m
26 | memory: 100Mi
27 | ---
28 | # Creating the service to support the Iris FastAPI deployment
29 | apiVersion: v1
30 | kind: Service
31 | metadata:
32 | name: fastapi-iris-service
33 | labels:
34 | app: fastapi-iris
35 | spec:
36 | type: LoadBalancer
37 | ports:
38 | - port: 5000
39 | protocol: TCP
40 | targetPort: 5000
41 | selector:
42 | app: fastapi-iris
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/model/iris_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/012_dockerizing_fastapi/model/iris_model.pkl
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/notebooks/iris_model_creation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Importing the required Python libraries\n",
10 | "import numpy as np\n",
11 | "import pandas as pd\n",
12 | "from sklearn import datasets\n",
13 | "from sklearn.linear_model import LogisticRegression"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 2,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "# Loading the iris dataset\n",
23 | "iris = datasets.load_iris()\n",
24 | "\n",
25 | "# Converting the iris dataset into a pandas dataframe\n",
26 | "df_iris = pd.DataFrame(data = np.c_[iris['data'], iris['target']],\n",
27 | " columns = iris['feature_names'] + ['target'])"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 8,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/plain": [
38 | "'[5.1,3.5,1.4,0.2]'"
39 | ]
40 | },
41 | "execution_count": 8,
42 | "metadata": {},
43 | "output_type": "execute_result"
44 | }
45 | ],
46 | "source": [
47 | "df_iris.drop(columns = ['target']).iloc[0].to_json(orient = 'records')"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 3,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "# Separating the training dataset (X) from the predictor value (y)\n",
57 | "X = df_iris.drop(columns = ['target'])\n",
58 | "y = df_iris[['target']]"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 4,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "name": "stderr",
68 | "output_type": "stream",
69 | "text": [
70 | "/home/pi/.local/lib/python3.7/site-packages/sklearn/utils/validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
71 | " return f(*args, **kwargs)\n",
72 | "/home/pi/.local/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
73 | "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
74 | "\n",
75 | "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
76 | " https://scikit-learn.org/stable/modules/preprocessing.html\n",
77 | "Please also refer to the documentation for alternative solver options:\n",
78 | " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
79 | " extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
80 | ]
81 | },
82 | {
83 | "data": {
84 | "text/plain": [
85 | "LogisticRegression()"
86 | ]
87 | },
88 | "execution_count": 4,
89 | "metadata": {},
90 | "output_type": "execute_result"
91 | }
92 | ],
93 | "source": [
94 | "# Instantiating a Logistic Regression (LR) model\n",
95 | "lr_model = LogisticRegression()\n",
96 | "\n",
97 | "# Fitting the dataset to the LR model\n",
98 | "lr_model.fit(X, y)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 6,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "# Saving model to .pkl file\n",
108 | "import pickle\n",
109 | "pkl_filename = \"../model/iris_model.pkl\"\n",
110 | "with open(pkl_filename, 'wb') as file:\n",
111 | " pickle.dump(lr_model, file)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 6,
117 | "metadata": {},
118 | "outputs": [
119 | {
120 | "name": "stderr",
121 | "output_type": "stream",
122 | "text": [
123 | "/var/mobile/Containers/Data/Application/8FC05BBA-B11D-49BC-B4D9-87CB282BBBF2/Library/Application Support/com.rationalmatter.junoapp/python-home/lib/python3.6/site-packages/sklearn/base.py:334: UserWarning: Trying to unpickle estimator LogisticRegression from version 0.24.1 when using version 0.23.1. This might lead to breaking code or invalid results. Use at your own risk.\n",
124 | " UserWarning)\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "# Loading model back in from .pkl file\n",
130 | "import pickle\n",
131 | "pkl_filename = \"model/iris_model.pkl\"\n",
132 | "with open(pkl_filename, 'rb') as file:\n",
133 | "    lr_loaded_model = pickle.load(file)"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 5,
139 | "metadata": {},
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/plain": [
144 | "['Dockerfile', 'k8s', 'dependencies', 'container', 'model', 'notebooks']"
145 | ]
146 | },
147 | "execution_count": 5,
148 | "metadata": {},
149 | "output_type": "execute_result"
150 | }
151 | ],
152 | "source": [
153 | "import os\n",
154 | "os.listdir()"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": []
163 | }
164 | ],
165 | "metadata": {
166 | "kernelspec": {
167 | "display_name": "Python 3",
168 | "language": "python",
169 | "name": "python3"
170 | },
171 | "language_info": {
172 | "codemirror_mode": {
173 | "name": "ipython",
174 | "version": 3
175 | },
176 | "file_extension": ".py",
177 | "mimetype": "text/x-python",
178 | "name": "python",
179 | "nbconvert_exporter": "python",
180 | "pygments_lexer": "ipython3",
181 | "version": "3.6.6+"
182 | }
183 | },
184 | "nbformat": 4,
185 | "nbformat_minor": 2
186 | }
187 |
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/tests/test_bad_predict.sh:
--------------------------------------------------------------------------------
1 | curl --request POST \
2 | --header 'Content-Type: application/json' \
3 | --data @test_json/bad_data.json \
4 | --url http://0.0.0.0:5001/predict
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/tests/test_basic_predict.sh:
--------------------------------------------------------------------------------
1 | curl --request POST \
2 | --header 'Content-Type: application/json' \
3 | --data @test_json/test_data.json \
4 | --url http://0.0.0.0:5001/basic_predict
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/tests/test_json/bad_data.json:
--------------------------------------------------------------------------------
1 | {"sepal_length":"dkhundley","sepal_width":3.5,"petal_length":1.4,"petal_width":0.2}
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/tests/test_json/test_data.json:
--------------------------------------------------------------------------------
1 | {"sepal_length":5.1,"sepal_width":3.5,"petal_length":1.4,"petal_width":0.2}
--------------------------------------------------------------------------------
/012_dockerizing_fastapi/tests/test_predict.sh:
--------------------------------------------------------------------------------
1 | curl --request POST \
2 | --header 'Content-Type: application/json' \
3 | --data @test_json/test_data.json \
4 | --url http://0.0.0.0:5001/predict
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/Dockerfile:
--------------------------------------------------------------------------------
1 | # Starting with base image
2 | FROM python:3.9-slim-buster
3 |
4 | # Installing required packages from requirements.txt file
5 | COPY dependencies/requirements.txt /
6 | RUN pip install -r /requirements.txt
7 |
8 | # Copying the FastAPI inference script and model
9 | COPY container/ /container
10 | COPY models/ /models
11 |
12 | # Setting the working directory appropriately
13 | WORKDIR /container
14 |
15 | # Exposing the appropriate port on the container
16 | EXPOSE 5001
17 |
18 | # Setting the entrypoint for the container
19 | ENTRYPOINT ["uvicorn", "--host", "0.0.0.0", "--port", "5001", "api:api"]
20 |
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/container/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/container/__init__.py
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/container/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/container/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/container/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/container/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/container/__pycache__/api.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/container/__pycache__/api.cpython-37.pyc
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/container/__pycache__/api.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/container/__pycache__/api.cpython-38.pyc
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/container/api.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pickle
3 | from fastapi import FastAPI, Request
4 | from fastapi.responses import JSONResponse
5 |
6 |
7 | ## API INSTANTIATION
8 | ## ----------------------------------------------------------------
9 | # Instantiating FastAPI
10 | api = FastAPI()
11 |
12 | # Loading in model from serialized .pkl file
13 | pkl_filename = "../models/iris_model.pkl"
14 | with open(pkl_filename, 'rb') as file:
15 | lr_model = pickle.load(file)
16 |
17 |
18 |
19 | ## API ENDPOINTS
20 | ## ----------------------------------------------------------------
21 | # Defining a test root path and message
22 | @api.get('/')
23 | def root():
24 | msg = {'message': 'Hello friends!'}
25 | return JSONResponse(content = msg, status_code = 200)
26 |
27 |
28 |
29 | # Defining the prediction endpoint without data validation
30 | @api.post('/predict')
31 | async def predict(request: Request):
32 |
33 | # Getting the JSON from the body of the request
34 | input_data = await request.json()
35 |
36 | # Converting JSON to Pandas DataFrame
37 | input_df = pd.DataFrame([input_data])
38 |
39 | # Getting the prediction from the Logistic Regression model
40 | pred = lr_model.predict(input_df)[0]
41 |
42 | return JSONResponse(content = pred, status_code = 200)
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/container/start_api.sh:
--------------------------------------------------------------------------------
1 | uvicorn api:api --host 0.0.0.0 --port 5001 --reload
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/container/train.py:
--------------------------------------------------------------------------------
1 | # Importing the required Python libraries
2 | import numpy as np
3 | import pandas as pd
4 | import pickle
5 | from sklearn import datasets
6 | from sklearn.linear_model import LogisticRegression
7 |
8 | # Loading the iris dataset from Scikit-Learn
9 | iris = datasets.load_iris()
10 |
11 | # Converting the iris dataset into a Pandas DataFrame
12 | df_iris = pd.DataFrame(data = np.c_[iris['data'], iris['target']],
13 | columns = iris['feature_names'] + ['target'])
14 |
15 | # Separating the training dataset (X) from the predictor value (y)
16 | X = df_iris.drop(columns = ['target'])
17 | y = df_iris[['target']]
18 |
19 | # Instantiating a Logistic Regression (LR) model
20 | lr_model = LogisticRegression()
21 |
22 | # Fitting the dataset to the LR model
23 | lr_model.fit(X, y)
24 |
25 | # Saving the model to a serialized .pkl file
26 | pkl_filename = "../models/iris_model.pkl"
27 | with open(pkl_filename, 'wb') as file:
28 | pickle.dump(lr_model, file)
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/dependencies/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi==0.68.0
2 | numpy==1.21.1
3 | pandas==1.3.1
4 | scikit-learn==0.24.1
5 | uvicorn==0.14.0
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/models/iris_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/models/iris_model.pkl
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/.pytest_cache/v/cache/lastfailed:
--------------------------------------------------------------------------------
1 | {
2 | "test_api.py::TestClient": true
3 | }
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/.pytest_cache/v/cache/nodeids:
--------------------------------------------------------------------------------
1 | [
2 | "test_api.py::test_predict",
3 | "test_api.py::test_root_message"
4 | ]
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/.pytest_cache/v/cache/stepwise:
--------------------------------------------------------------------------------
1 | []
--------------------------------------------------------------------------------
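The cache files above reference two tests, test_api.py::test_root_message and
test_api.py::test_predict. As a hedged sketch only (the repository's actual
tests/test_api.py may differ), such a module could be written with FastAPI's
TestClient against the endpoints defined in container/api.py:

from fastapi.testclient import TestClient

from container.api import api

# Wrapping the FastAPI app in a test client
client = TestClient(api)

def test_root_message():
    response = client.get('/')
    assert response.status_code == 200
    assert response.json() == {'message': 'Hello friends!'}

def test_predict():
    payload = {'sepal_length': 5.1,
               'sepal_width': 3.5,
               'petal_length': 1.4,
               'petal_width': 0.2}
    response = client.post('/predict', json = payload)
    assert response.status_code == 200
--------------------------------------------------------------------------------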
/013_fastapi_tests_scans/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/__init__.py
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/__pycache__/test_api.cpython-37-pytest-6.2.4.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/__pycache__/test_api.cpython-37-pytest-6.2.4.pyc
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/curl_scripts/test_predict.sh:
--------------------------------------------------------------------------------
1 | curl --request POST \
2 | --header 'Content-Type: application/json' \
3 | --data @../test_json/test_data.json \
4 | --url http://0.0.0.0:5001/predict
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/performance_testing/__pycache__/locustfile.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/performance_testing/__pycache__/locustfile.cpython-38.pyc
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/performance_testing/locustfile.py:
--------------------------------------------------------------------------------
1 | from locust import HttpUser, task, between
2 | import json
3 |
4 | # Loading the test JSON data
5 | with open('test_json/test_data.json') as f:
6 | test_data = json.loads(f.read())
7 |
8 | # Creating an API User class inheriting from Locust's HttpUser class
9 | class APIUser(HttpUser):
10 | # Setting the host name and wait_time
11 | host = 'http://localhost:5001'
12 | wait_time = between(3, 5)
13 |
14 | # Defining the post task using the JSON test data
15 | @task()
16 | def predict_endpoint(self):
17 | self.client.post('/predict', json = test_data)
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/reports/dependency_scan_report.txt:
--------------------------------------------------------------------------------
1 | +==============================================================================+
2 | | |
3 | | /$$$$$$ /$$ |
4 | | /$$__ $$ | $$ |
5 | | /$$$$$$$ /$$$$$$ | $$ \__//$$$$$$ /$$$$$$ /$$ /$$ |
6 | | /$$_____/ |____ $$| $$$$ /$$__ $$|_ $$_/ | $$ | $$ |
7 | | | $$$$$$ /$$$$$$$| $$_/ | $$$$$$$$ | $$ | $$ | $$ |
8 | | \____ $$ /$$__ $$| $$ | $$_____/ | $$ /$$| $$ | $$ |
9 | | /$$$$$$$/| $$$$$$$| $$ | $$$$$$$ | $$$$/| $$$$$$$ |
10 | | |_______/ \_______/|__/ \_______/ \___/ \____ $$ |
11 | | /$$ | $$ |
12 | | | $$$$$$/ |
13 | | by pyup.io \______/ |
14 | | |
15 | +==============================================================================+
16 | | REPORT |
17 | | checked 5 packages, using free DB (updated once a month) |
18 | +============================+===========+==========================+==========+
19 | | package | installed | affected | ID |
20 | +============================+===========+==========================+==========+
21 | | uvicorn | 0.0.4 | <0.11.7 | 38664 |
22 | +==============================================================================+
23 | | The request logger provided by Uvicorn prior to version 0.11.7 is vulnerable |
24 | | to ANSI escape sequence injection. Whenever any HTTP request is received, |
25 | | the default behaviour of uvicorn is to log its details to either the console |
26 | | or a log file. When attackers request crafted URLs with percent-encoded |
27 | | escape sequences, the logging component will log the URL after it's been |
28 | | processed with urllib.parse.unquote, therefore converting any percent- |
29 | | encoded characters into their single-character equivalent, which can have |
30 | | special meaning in terminal emulators. By requesting URLs with crafted |
31 | | paths, attackers can: * Pollute uvicorn's access logs, therefore |
32 | | jeopardising the integrity of such files. * Use ANSI sequence codes to |
33 | | attempt to interact with the terminal emulator that's displaying the logs |
34 | | (either in real time or from a file). See: CVE-2020-7694. |
35 | +==============================================================================+
36 | | uvicorn | 0.0.4 | <0.11.7 | 38665 |
37 | +==============================================================================+
38 | | Uvicorn before 0.11.7 is vulnerable to HTTP response splitting. CRLF |
39 | | sequences are not escaped in the value of HTTP headers. Attackers can |
40 | | exploit this to add arbitrary headers to HTTP responses, or even return an |
41 | | arbitrary response body, whenever crafted input is used to construct HTTP |
42 | | headers. See: CVE-2020-7695. |
43 | +==============================================================================+
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/reports/linter_report.txt:
--------------------------------------------------------------------------------
1 | ************* Module container.api
2 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/api.py:42:0: C0304: Final newline missing (missing-final-newline)
3 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/api.py:1:0: C0114: Missing module docstring (missing-module-docstring)
4 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/api.py:3:0: E0401: Unable to import 'fastapi' (import-error)
5 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/api.py:4:0: E0401: Unable to import 'fastapi.responses' (import-error)
6 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/api.py:13:0: C0103: Constant name "pkl_filename" doesn't conform to UPPER_CASE naming style (invalid-name)
7 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/api.py:23:0: C0116: Missing function or method docstring (missing-function-docstring)
8 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/api.py:31:0: C0116: Missing function or method docstring (missing-function-docstring)
9 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/api.py:2:0: C0411: standard import "import pickle" should be placed before "import pandas as pd" (wrong-import-order)
10 | ************* Module container.train
11 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/train.py:28:0: C0304: Final newline missing (missing-final-newline)
12 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/train.py:28:0: W0311: Bad indentation. Found 1 spaces, expected 4 (bad-indentation)
13 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/train.py:1:0: C0114: Missing module docstring (missing-module-docstring)
14 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/train.py:26:0: C0103: Constant name "pkl_filename" doesn't conform to UPPER_CASE naming style (invalid-name)
15 | /Users/dkhundley/Documents/Repositories/ds-quick-tips/013_fastapi_tests_scans/container/train.py:4:0: C0411: standard import "import pickle" should be placed before "import numpy as np" (wrong-import-order)
16 |
17 |
18 | Report
19 | ======
20 | 30 statements analysed.
21 |
22 | Statistics by type
23 | ------------------
24 |
25 | +---------+-------+-----------+-----------+------------+---------+
26 | |type |number |old number |difference |%documented |%badname |
27 | +=========+=======+===========+===========+============+=========+
28 | |module |3 |3 |= |33.33 |0.00 |
29 | +---------+-------+-----------+-----------+------------+---------+
30 | |class |0 |0 |= |0 |0 |
31 | +---------+-------+-----------+-----------+------------+---------+
32 | |method |0 |0 |= |0 |0 |
33 | +---------+-------+-----------+-----------+------------+---------+
34 | |function |2 |2 |= |0.00 |0.00 |
35 | +---------+-------+-----------+-----------+------------+---------+
36 |
37 |
38 |
39 | External dependencies
40 | ---------------------
41 | ::
42 |
43 | numpy (container.train)
44 | pandas (container.api,container.train)
45 | sklearn
46 | \-datasets (container.train)
47 | \-linear_model (container.train)
48 |
49 |
50 |
51 | Raw metrics
52 | -----------
53 |
54 | +----------+-------+------+---------+-----------+
55 | |type |number |% |previous |difference |
56 | +==========+=======+======+=========+===========+
57 | |code |38 |50.00 |38 |= |
58 | +----------+-------+------+---------+-----------+
59 | |docstring |0 |0.00 |0 |= |
60 | +----------+-------+------+---------+-----------+
61 | |comment |18 |23.68 |18 |= |
62 | +----------+-------+------+---------+-----------+
63 | |empty |20 |26.32 |20 |= |
64 | +----------+-------+------+---------+-----------+
65 |
66 |
67 |
68 | Duplication
69 | -----------
70 |
71 | +-------------------------+------+---------+-----------+
72 | | |now |previous |difference |
73 | +=========================+======+=========+===========+
74 | |nb duplicated lines |0 |0 |= |
75 | +-------------------------+------+---------+-----------+
76 | |percent duplicated lines |0.000 |0.000 |= |
77 | +-------------------------+------+---------+-----------+
78 |
79 |
80 |
81 | Messages by category
82 | --------------------
83 |
84 | +-----------+-------+---------+-----------+
85 | |type |number |previous |difference |
86 | +===========+=======+=========+===========+
87 | |convention |10 |10 |= |
88 | +-----------+-------+---------+-----------+
89 | |refactor |0 |0 |= |
90 | +-----------+-------+---------+-----------+
91 | |warning |1 |1 |= |
92 | +-----------+-------+---------+-----------+
93 | |error |2 |2 |= |
94 | +-----------+-------+---------+-----------+
95 |
96 |
97 |
98 | % errors / warnings by module
99 | -----------------------------
100 |
101 | +----------------+-------+--------+---------+-----------+
102 | |module |error |warning |refactor |convention |
103 | +================+=======+========+=========+===========+
104 | |container.api |100.00 |0.00 |0.00 |60.00 |
105 | +----------------+-------+--------+---------+-----------+
106 | |container.train |0.00 |100.00 |0.00 |40.00 |
107 | +----------------+-------+--------+---------+-----------+
108 |
109 |
110 |
111 | Messages
112 | --------
113 |
114 | +---------------------------+------------+
115 | |message id |occurrences |
116 | +===========================+============+
117 | |wrong-import-order |2 |
118 | +---------------------------+------------+
119 | |missing-module-docstring |2 |
120 | +---------------------------+------------+
121 | |missing-function-docstring |2 |
122 | +---------------------------+------------+
123 | |missing-final-newline |2 |
124 | +---------------------------+------------+
125 | |invalid-name |2 |
126 | +---------------------------+------------+
127 | |import-error |2 |
128 | +---------------------------+------------+
129 | |bad-indentation |1 |
130 | +---------------------------+------------+
131 |
132 |
133 |
134 |
135 | ------------------------------------------------------------------
136 | Your code has been rated at 3.00/10 (previous run: 3.00/10, +0.00)
137 |
138 |
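Every finding in the report above is mechanical to resolve. A minimal, illustrative sketch of what the top of container/train.py could look like after addressing the import-order, module-docstring, and constant-naming messages (the docstring text and filename value are assumptions; the remaining fixes are simply a final newline and 4-space indentation at line 28):

    """Trains the model served by the API and pickles it to disk."""
    # Standard-library imports come before third-party imports (wrong-import-order)
    import pickle

    import numpy as np
    import pandas as pd
    from sklearn import datasets
    from sklearn import linear_model

    # Module-level constants use UPPER_CASE (invalid-name)
    PKL_FILENAME = 'model.pkl'  # placeholder value; the real path isn't shown in the report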
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/reports/performance_test_exceptions.csv:
--------------------------------------------------------------------------------
1 | Count,Message,Traceback,Nodes
2 |
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/reports/performance_test_failures.csv:
--------------------------------------------------------------------------------
1 | Method,Name,Error,Occurrences
2 | POST,/predict,"HTTPConnectionPool(host='localhost', port=5001): Max retries exceeded with url: /predict (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 61] Connection refused'))",112
3 |
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/reports/performance_test_stats.csv:
--------------------------------------------------------------------------------
1 | Type,Name,Request Count,Failure Count,Median Response Time,Average Response Time,Min Response Time,Max Response Time,Average Content Size,Requests/s,Failures/s,50%,66%,75%,80%,90%,95%,98%,99%,99.9%,99.99%,100%
2 | POST,/predict,112,112,6,6.156811776785702,2.4135830000000524,15.114749999999955,0.0,3.883597734176632,3.883597734176632,6,7,7,8,10,11,12,13,15,15,15
3 | ,Aggregated,112,112,6,6.156811776785702,2.4135830000000524,15.114749999999955,0.0,3.883597734176632,3.883597734176632,6,7,7,8,10,11,12,13,15,15,15
4 |
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/reports/performance_test_stats_history.csv:
--------------------------------------------------------------------------------
1 | Timestamp,User Count,Type,Name,Requests/s,Failures/s,50%,66%,75%,80%,90%,95%,98%,99%,99.9%,99.99%,100%,Total Request Count,Total Failure Count,Total Median Response Time,Total Average Response Time,Total Min Response Time,Total Max Response Time,Total Average Content Size
2 | 1628458765,0,,Aggregated,0.000000,0.000000,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,N/A,0,0,0,0,0,0,0
3 | 1628458766,5,,Aggregated,0.000000,0.000000,11,11,11,11,11,11,11,11,11,11,11,5,5,11,10.770100000000003,10.469167000000057,11.046166000000024,0.0
4 | 1628458767,10,,Aggregated,0.000000,0.000000,11,11,11,11,12,12,12,12,12,12,12,10,10,11,10.620950000000006,8.912667000000152,11.83004199999993,0.0
5 | 1628458768,15,,Aggregated,0.000000,0.000000,11,11,12,12,13,15,15,15,15,15,15,15,15,11,11.227561066666707,8.912667000000152,15.114749999999955,0.0
6 | 1628458769,15,,Aggregated,5.000000,5.000000,11,11,12,12,13,15,15,15,15,15,15,19,19,11,10.092164473684207,4.092667000000105,15.114749999999955,0.0
7 | 1628458770,15,,Aggregated,5.000000,5.000000,10,11,11,12,12,13,15,15,15,15,15,24,24,10,8.92718575000004,2.4135830000000524,15.114749999999955,0.0
8 | 1628458771,15,,Aggregated,4.250000,4.250000,10,11,11,11,12,13,15,15,15,15,15,27,27,10,8.581003111111146,2.4135830000000524,15.114749999999955,0.0
9 | 1628458772,15,,Aggregated,4.400000,4.400000,7,11,11,11,12,13,15,15,15,15,15,31,31,7,8.185465064516132,2.4135830000000524,15.114749999999955,0.0
10 | 1628458773,15,,Aggregated,4.166667,4.166667,7,10,11,11,12,13,15,15,15,15,15,33,33,7,8.123130060606037,2.4135830000000524,15.114749999999955,0.0
11 | 1628458774,15,,Aggregated,4.142857,4.142857,7,10,11,11,12,13,15,15,15,15,15,38,38,7,7.933628289473686,2.4135830000000524,15.114749999999955,0.0
12 | 1628458775,15,,Aggregated,4.000000,4.000000,7,9,11,11,12,12,15,15,15,15,15,42,42,7,7.679332333333274,2.4135830000000524,15.114749999999955,0.0
13 | 1628458776,15,,Aggregated,4.000000,4.000000,7,9,11,11,12,12,15,15,15,15,15,44,44,6,7.511122159090815,2.4135830000000524,15.114749999999955,0.0
14 | 1628458777,15,,Aggregated,4.000000,4.000000,7,8,10,11,12,12,15,15,15,15,15,48,48,6,7.331673583333242,2.4135830000000524,15.114749999999955,0.0
15 | 1628458778,15,,Aggregated,3.800000,3.800000,6,7,10,11,11,12,13,15,15,15,15,54,54,6,7.058474518518427,2.4135830000000524,15.114749999999955,0.0
16 | 1628458779,15,,Aggregated,3.800000,3.800000,6,7,10,10,11,12,13,15,15,15,15,56,56,6,6.985906232142781,2.4135830000000524,15.114749999999955,0.0
17 | 1628458780,15,,Aggregated,3.800000,3.800000,6,7,8,10,11,12,13,15,15,15,15,61,61,6,6.849107213114675,2.4135830000000524,15.114749999999955,0.0
18 | 1628458781,15,,Aggregated,3.900000,3.900000,6,7,8,10,11,12,13,15,15,15,15,63,63,6,6.834124984126912,2.4135830000000524,15.114749999999955,0.0
19 | 1628458782,15,,Aggregated,3.700000,3.700000,6,7,8,10,11,12,13,15,15,15,15,68,68,6,6.756037970588114,2.4135830000000524,15.114749999999955,0.0
20 | 1628458783,15,,Aggregated,3.700000,3.700000,6,7,8,10,11,12,13,15,15,15,15,70,70,6,6.763961285714137,2.4135830000000524,15.114749999999955,0.0
21 | 1628458784,15,,Aggregated,3.700000,3.700000,6,7,8,9,11,12,13,15,15,15,15,74,74,6,6.658054594594409,2.4135830000000524,15.114749999999955,0.0
22 | 1628458785,15,,Aggregated,3.700000,3.700000,6,7,8,9,11,12,13,15,15,15,15,80,80,6,6.586210399999874,2.4135830000000524,15.114749999999955,0.0
23 |
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/reports/static_scan_report.txt:
--------------------------------------------------------------------------------
1 | Run started:2021-08-08 21:40:22.468129
2 |
3 | Test results:
4 | >> Issue: [B403:blacklist] Consider possible security implications associated with pickle module.
5 | Severity: Low Confidence: High
6 | Location: ../container/api.py:2
7 | More Info: https://bandit.readthedocs.io/en/latest/blacklists/blacklist_imports.html#b403-import-pickle
8 | 1 import pandas as pd
9 | 2 import pickle
10 | 3 from fastapi import FastAPI, Request
11 |
12 | --------------------------------------------------
13 | >> Issue: [B301:blacklist] Pickle and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue.
14 | Severity: Medium Confidence: High
15 | Location: ../container/api.py:15
16 | More Info: https://bandit.readthedocs.io/en/latest/blacklists/blacklist_calls.html#b301-pickle
17 | 14 with open(pkl_filename, 'rb') as file:
18 | 15 lr_model = pickle.load(file)
19 | 16
20 |
21 | --------------------------------------------------
22 | >> Issue: [B403:blacklist] Consider possible security implications associated with pickle module.
23 | Severity: Low Confidence: High
24 | Location: ../container/train.py:4
25 | More Info: https://bandit.readthedocs.io/en/latest/blacklists/blacklist_imports.html#b403-import-pickle
26 | 3 import pandas as pd
27 | 4 import pickle
28 | 5 from sklearn import datasets
29 |
30 | --------------------------------------------------
31 |
32 | Code scanned:
33 | Total lines of code: 33
34 | Total lines skipped (#nosec): 0
35 |
36 | Run metrics:
37 | Total issues (by severity):
38 | Undefined: 0.0
39 | Low: 2.0
40 | Medium: 1.0
41 | High: 0.0
42 | Total issues (by confidence):
43 | Undefined: 0.0
44 | Low: 0.0
45 | Medium: 0.0
46 | High: 3.0
47 | Files skipped (0):
48 |
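Note that the report tracks lines skipped via #nosec, which is bandit's built-in mechanism for acknowledging a finding you have reviewed and accepted. A minimal sketch against the B301 hit in api.py, assuming the pickle file is a trusted build artifact rather than untrusted user input (recent bandit releases also let you scope the comment to a specific test ID; older ones accept a bare # nosec):

    with open(pkl_filename, 'rb') as file:
        lr_model = pickle.load(file)  # nosec B301 - model file comes from our own build, not user input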
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/reports/unit_test_report.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/reports/unit_test_report.txt
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/run_all_tests.sh:
--------------------------------------------------------------------------------
1 | docker build -t iris-api:dev -f ../Dockerfile ..
2 | export CONTAINER_ID=$(docker run -d -p 5001:5001 iris-api:dev)
3 | bash run_container_scan.sh
4 | bash run_dependency_scan.sh
5 | bash run_linter.sh
6 | bash run_perf_test.sh
7 | bash run_static_scan.sh
8 | bash run_unit_tests.sh
9 | docker stop $CONTAINER_ID
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/run_container_scan.sh:
--------------------------------------------------------------------------------
1 | docker scan iris-api:dev | tee reports/container_scan_results.txt
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/run_dependency_scan.sh:
--------------------------------------------------------------------------------
1 | safety check -r ../dependencies/requirements.txt --full-report -o reports/dependency_scan_report.txt
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/run_linter.sh:
--------------------------------------------------------------------------------
1 | pylint ../container/ --reports=y --output=reports/linter_report.txt
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/run_perf_test.sh:
--------------------------------------------------------------------------------
1 | locust --locustfile performance_testing/locustfile.py --headless --users 15 --spawn-rate 5 --run-time 30s --only-summary --csv reports/performance_test
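The referenced performance_testing/locustfile.py is not reproduced in this dump; a minimal sketch of the shape such a file takes against this API, assuming the payload mirrors tests/test_json/test_data.json and the host matches the container's published port (both assumptions):

    from locust import HttpUser, task, between

    class PredictUser(HttpUser):
        # Host is an assumption; it can also be supplied on the command line via --host
        host = 'http://localhost:5001'
        wait_time = between(1, 3)

        @task
        def predict(self):
            # Payload mirrors test_json/test_data.json
            payload = {'sepal_length': 5.1, 'sepal_width': 3.5,
                       'petal_length': 1.4, 'petal_width': 0.2}
            self.client.post('/predict', json = payload)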
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/run_static_scan.sh:
--------------------------------------------------------------------------------
1 | bandit --format=txt --output=reports/static_scan_report.txt -r ../container/
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/run_unit_tests.sh:
--------------------------------------------------------------------------------
1 | pytest --log-file=reports/unit_test_report.txt unit_testing/
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/test_json/bad_data.json:
--------------------------------------------------------------------------------
1 | {"sepal_length":"dkhundley","sepal_width":3.5,"petal_length":1.4,"petal_width":0.2}
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/test_json/test_data.json:
--------------------------------------------------------------------------------
1 | {"sepal_length":5.1,"sepal_width":3.5,"petal_length":1.4,"petal_width":0.2}
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/unit_testing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/unit_testing/__init__.py
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/unit_testing/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/unit_testing/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/unit_testing/__pycache__/test_api.cpython-38-pytest-6.2.3.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/unit_testing/__pycache__/test_api.cpython-38-pytest-6.2.3.pyc
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/unit_testing/__pycache__/test_api.cpython-38-pytest-6.2.4.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dkhundley/ds-quick-tips/a34d6023b9a532d8bd40749b850f0de77ae174c2/013_fastapi_tests_scans/tests/unit_testing/__pycache__/test_api.cpython-38-pytest-6.2.4.pyc
--------------------------------------------------------------------------------
/013_fastapi_tests_scans/tests/unit_testing/test_api.py:
--------------------------------------------------------------------------------
1 | import json
2 | from fastapi.testclient import TestClient
3 | from container.api import api
4 |
5 |
6 |
7 | ## PYTEST SETUP
8 | ## --------------------------------------------------------------------------------------------------------------------
9 | # Instantiating the test client from our container's API
10 | client = TestClient(api)
11 |
12 | # Loading test JSON file
13 | with open('test_json/test_data.json', 'rb') as file:
14 | test_json = json.load(file)
15 |
16 |
17 |
18 | ## UNIT TEST CASES
19 | ## --------------------------------------------------------------------------------------------------------------------
20 | # Creating a unit test for the basic root path
21 | def test_root_message():
22 | response = client.get("/")
23 | assert response.status_code == 200
24 | assert response.json() == {'message': 'Hello friend!'}
25 |
26 | # Creating a unit test for the prediction endpoint
27 | def test_predict():
28 | response = client.post('/predict', json = test_json)
29 | assert response.status_code == 200
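The repo also ships tests/test_json/bad_data.json with a non-numeric sepal_length, which suggests a negative-path case. A hedged sketch of what that test could look like; the expected status code is an assumption, since api.py reads the raw Request rather than a Pydantic model, so the exact failure mode depends on how the endpoint handles malformed input:

    # Loading the intentionally malformed JSON file (hypothetical addition)
    with open('test_json/bad_data.json', 'rb') as file:
        bad_json = json.load(file)

    # Creating a unit test for invalid input; asserts only that the API does not report success
    def test_predict_bad_data():
        response = client.post('/predict', json = bad_json)
        assert response.status_code != 200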
--------------------------------------------------------------------------------
/014_kfolds_validation/notebooks/.ipynb_checkpoints/kfolds-validation-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "5f86ded3",
6 | "metadata": {},
7 | "source": [
8 | "# K-Folds Validation\n",
9 | "As part of this notebook, we will be exploring how to make efficient use of small datasets by utilizing **k-folds validation**. K-folds validation splits a training dataset into multiple small batches. One of these datasets is reserved as the validation dataset "
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "id": "863c0f2e",
15 | "metadata": {},
16 | "source": [
17 | "## Project Setup"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 29,
23 | "id": "834b1058",
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "# Importing the necessary Python libraries\n",
28 | "import numpy as np\n",
29 | "import pandas as pd\n",
30 | "from sklearn import datasets\n",
31 | "from sklearn.model_selection import train_test_split\n",
32 | "from sklearn.ensemble import RandomForestClassifier\n",
33 | "from sklearn.metrics import accuracy_score, confusion_matrix"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "id": "1aaca1cb",
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "# Getting the Iris dataset from Scikit-Learn\n",
44 | "iris = datasets.load_iris()"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 15,
50 | "id": "86e7124e",
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "# Loading the predictor value (y) and remainder of the training dataset (X) as Pandas DataFrames\n",
55 | "X = pd.DataFrame(data = iris['data'], columns = iris['feature_names'])\n",
56 | "y = pd.DataFrame(data = iris['target'], columns = ['target'])"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "id": "552f0963",
62 | "metadata": {},
63 | "source": [
64 | "## Performing a Typical Split\n",
65 | "Before we jump into how we perform k-folds validation, let's do a quick refresher on how we typically split our dataset using a traditional `train_test_split`. Then we'll later contrast this method with k-folds validation."
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 18,
71 | "id": "22eb9945",
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "# Performing a train_test_split on the dataset\n",
76 | "X_train, X_val, y_train, y_val = train_test_split(X, y)"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 21,
82 | "id": "07fd229f",
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "# Instantiating a RandomForestClassifier model\n",
87 | "rfc_model = RandomForestClassifier()"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 22,
93 | "id": "09bffb52",
94 | "metadata": {},
95 | "outputs": [
96 | {
97 | "name": "stderr",
98 | "output_type": "stream",
99 | "text": [
100 | ":2: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
101 | " rfc_model.fit(X_train, y_train)\n"
102 | ]
103 | },
104 | {
105 | "data": {
106 | "text/plain": [
107 | "RandomForestClassifier()"
108 | ]
109 | },
110 | "execution_count": 22,
111 | "metadata": {},
112 | "output_type": "execute_result"
113 | }
114 | ],
115 | "source": [
116 | "# Fitting the X_train and y_train datasets to the RandomForestClassifier model\n",
117 | "rfc_model.fit(X_train, y_train)"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 23,
123 | "id": "53fb4d46",
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "# Getting inferential predictions for the validation dataset\n",
128 | "val_preds = rfc_model.predict(X_val)"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 28,
134 | "id": "041535cf",
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "# Generating validation metrics by comparing the inferential predictions (val_preds) to the actuals (y_val)\n",
139 | "val_accuracy = accuracy_score(y_val, val_preds)\n",
140 | "val_confusion_matrix = confusion_matrix(y_val, val_preds)"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 30,
146 | "id": "343f7364",
147 | "metadata": {},
148 | "outputs": [
149 | {
150 | "name": "stdout",
151 | "output_type": "stream",
152 | "text": [
153 | "Accuracy Score: 0.9210526315789473\n",
154 | "Confusion Matrix: \n",
155 | "[[14 0 0]\n",
156 | " [ 0 7 0]\n",
157 | " [ 0 3 14]]\n"
158 | ]
159 | }
160 | ],
161 | "source": [
162 | "# Printing out the validation metrics\n",
163 | "print(f'Accuracy Score: {val_accuracy}')\n",
164 | "print(f'Confusion Matrix: \\n{val_confusion_matrix}')"
165 | ]
166 | },
167 | {
168 | "cell_type": "markdown",
169 | "id": "36bd650e",
170 | "metadata": {},
171 | "source": [
172 | "## Training with K-Folds Validation\n",
173 | "Now that we have performed a very basic model training using a traditional `train_test_split`, we are now ready to perform a training using k-folds validation."
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "id": "bdfc755e",
180 | "metadata": {},
181 | "outputs": [],
182 | "source": []
183 | }
184 | ],
185 | "metadata": {
186 | "kernelspec": {
187 | "display_name": "Python 3",
188 | "language": "python",
189 | "name": "python3"
190 | },
191 | "language_info": {
192 | "codemirror_mode": {
193 | "name": "ipython",
194 | "version": 3
195 | },
196 | "file_extension": ".py",
197 | "mimetype": "text/x-python",
198 | "name": "python",
199 | "nbconvert_exporter": "python",
200 | "pygments_lexer": "ipython3",
201 | "version": "3.8.8"
202 | }
203 | },
204 | "nbformat": 4,
205 | "nbformat_minor": 5
206 | }
207 |
--------------------------------------------------------------------------------
/014_kfolds_validation/notebooks/kfolds-validation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "5f86ded3",
6 | "metadata": {},
7 | "source": [
8 | "# K-Folds Validation\n",
9 | "As part of this notebook, we will be exploring how to make efficient use of small datasets by utilizing **k-folds validation**. K-folds validation splits a training dataset into multiple small batches. One of these datasets is reserved as the validation dataset "
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "id": "863c0f2e",
15 | "metadata": {},
16 | "source": [
17 | "## Project Setup"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 1,
23 | "id": "834b1058",
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "# Importing the necessary Python libraries\n",
28 | "import numpy as np\n",
29 | "import pandas as pd\n",
30 | "from sklearn import datasets\n",
31 | "from sklearn.model_selection import train_test_split, KFold\n",
32 | "from sklearn.ensemble import RandomForestClassifier\n",
33 | "from sklearn.metrics import accuracy_score, confusion_matrix\n",
34 | "\n",
35 | "import warnings\n",
36 | "warnings.filterwarnings('ignore')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "id": "1aaca1cb",
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "# Getting the Iris dataset from Scikit-Learn\n",
47 | "iris = datasets.load_iris()"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 3,
53 | "id": "86e7124e",
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "# Loading the predictor value (y) and remainder of the training dataset (X) as Pandas DataFrames\n",
58 | "X = pd.DataFrame(data = iris['data'], columns = iris['feature_names'])\n",
59 | "y = pd.DataFrame(data = iris['target'], columns = ['target'])"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "id": "552f0963",
65 | "metadata": {},
66 | "source": [
67 | "## Performing a Typical Split\n",
68 | "Before we jump into how we perform k-folds validation, let's do a quick refresher on how we typically split our dataset using a traditional `train_test_split`. Then we'll later contrast this method with k-folds validation."
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 4,
74 | "id": "22eb9945",
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "# Performing a train_test_split on the dataset\n",
79 | "X_train, X_val, y_train, y_val = train_test_split(X, y)"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 5,
85 | "id": "07fd229f",
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "# Instantiating a RandomForestClassifier model\n",
90 | "rfc_model = RandomForestClassifier()"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 6,
96 | "id": "09bffb52",
97 | "metadata": {},
98 | "outputs": [
99 | {
100 | "data": {
101 | "text/plain": [
102 | "RandomForestClassifier()"
103 | ]
104 | },
105 | "execution_count": 6,
106 | "metadata": {},
107 | "output_type": "execute_result"
108 | }
109 | ],
110 | "source": [
111 | "# Fitting the X_train and y_train datasets to the RandomForestClassifier model\n",
112 | "rfc_model.fit(X_train, y_train)"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 7,
118 | "id": "53fb4d46",
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "# Getting inferential predictions for the validation dataset\n",
123 | "val_preds = rfc_model.predict(X_val)"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 8,
129 | "id": "041535cf",
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "# Generating validation metrics by comparing the inferential predictions (val_preds) to the actuals (y_val)\n",
134 | "val_accuracy = accuracy_score(y_val, val_preds)\n",
135 | "val_confusion_matrix = confusion_matrix(y_val, val_preds)"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 9,
141 | "id": "343f7364",
142 | "metadata": {},
143 | "outputs": [
144 | {
145 | "name": "stdout",
146 | "output_type": "stream",
147 | "text": [
148 | "Accuracy Score: 0.9210526315789473\n",
149 | "Confusion Matrix: \n",
150 | "[[12 0 0]\n",
151 | " [ 0 11 0]\n",
152 | " [ 0 3 12]]\n"
153 | ]
154 | }
155 | ],
156 | "source": [
157 | "# Printing out the validation metrics\n",
158 | "print(f'Accuracy Score: {val_accuracy}')\n",
159 | "print(f'Confusion Matrix: \\n{val_confusion_matrix}')"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "id": "36bd650e",
165 | "metadata": {},
166 | "source": [
167 | "## Training with K-Folds Validation\n",
168 | "Now that we have performed a very basic model training using a traditional `train_test_split`, we are now ready to perform a training using k-folds validation."
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 10,
174 | "id": "bdfc755e",
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "# Instantiating the K-Fold cross validation object with 5 folds\n",
179 | "k_folds = KFold(n_splits = 5, shuffle = True, random_state = 42)"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 11,
185 | "id": "de62dc1a",
186 | "metadata": {},
187 | "outputs": [
188 | {
189 | "name": "stdout",
190 | "output_type": "stream",
191 | "text": [
192 | "Accuracy Score: 1.0\n",
193 | "Confusion Matrix: \n",
194 | "[[10 0 0]\n",
195 | " [ 0 9 0]\n",
196 | " [ 0 0 11]]\n",
197 | "Accuracy Score: 0.9666666666666667\n",
198 | "Confusion Matrix: \n",
199 | "[[13 0 0]\n",
200 | " [ 0 10 0]\n",
201 | " [ 0 1 6]]\n",
202 | "Accuracy Score: 0.9333333333333333\n",
203 | "Confusion Matrix: \n",
204 | "[[12 0 0]\n",
205 | " [ 0 8 2]\n",
206 | " [ 0 0 8]]\n",
207 | "Accuracy Score: 0.9333333333333333\n",
208 | "Confusion Matrix: \n",
209 | "[[ 8 0 0]\n",
210 | " [ 0 9 1]\n",
211 | " [ 0 1 11]]\n",
212 | "Accuracy Score: 0.9666666666666667\n",
213 | "Confusion Matrix: \n",
214 | "[[ 7 0 0]\n",
215 | " [ 0 11 0]\n",
216 | " [ 0 1 11]]\n"
217 | ]
218 | }
219 | ],
220 | "source": [
221 | "# Iterating through each of the folds in K-Folds\n",
222 | "for train_index, val_index in k_folds.split(X):\n",
223 | " \n",
224 | " # Splitting the training set from the validation set for this specific fold\n",
225 | " X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]\n",
226 | " y_train, y_val = y.iloc[train_index], y.iloc[val_index]\n",
227 | " \n",
228 | " # Instantiating a RandomForestClassifier model\n",
229 | " rfc_model = RandomForestClassifier()\n",
230 | " \n",
231 | " # Fitting the X_train and y_train datasets to the RandomForestClassifier model\n",
232 | " rfc_model.fit(X_train, y_train)\n",
233 | " \n",
234 | " # Getting inferential predictions for the validation dataset\n",
235 | " val_preds = rfc_model.predict(X_val)\n",
236 | " \n",
237 | " # Generating validation metrics by comparing the inferential predictions (val_preds) to the actuals (y_val)\n",
238 | " val_accuracy = accuracy_score(y_val, val_preds)\n",
239 | " val_confusion_matrix = confusion_matrix(y_val, val_preds)\n",
240 | " \n",
241 | " # Printing out the validation metrics\n",
242 | " print(f'Accuracy Score: {val_accuracy}')\n",
243 | " print(f'Confusion Matrix: \\n{val_confusion_matrix}')"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "id": "46e23280",
250 | "metadata": {},
251 | "outputs": [],
252 | "source": []
253 | }
254 | ],
255 | "metadata": {
256 | "kernelspec": {
257 | "display_name": "Python 3 (ipykernel)",
258 | "language": "python",
259 | "name": "python3"
260 | },
261 | "language_info": {
262 | "codemirror_mode": {
263 | "name": "ipython",
264 | "version": 3
265 | },
266 | "file_extension": ".py",
267 | "mimetype": "text/x-python",
268 | "name": "python",
269 | "nbconvert_exporter": "python",
270 | "pygments_lexer": "ipython3",
271 | "version": "3.10.1"
272 | }
273 | },
274 | "nbformat": 4,
275 | "nbformat_minor": 5
276 | }
277 |
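The manual KFold loop above makes every step explicit; when only the per-fold scores are needed, scikit-learn's cross_val_score collapses the same pattern into a single call. A minimal equivalent sketch (passing the k_folds object itself reproduces the shuffled 5-fold split used above; an integer cv would instead use stratified folds for a classifier):

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score

    # One accuracy score per fold; ravel() avoids the column-vector warning seen earlier
    scores = cross_val_score(RandomForestClassifier(), X, y.values.ravel(), cv = k_folds)
    print(f'Per-fold accuracy: {scores}')
    print(f'Mean accuracy: {scores.mean()}')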
--------------------------------------------------------------------------------
/015_synthesizing_test_data/notebooks/synthesizing_test_data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "314c6067",
6 | "metadata": {},
7 | "source": [
8 | "# Data Science Quick Tip #015: Synthesizing Your Own Test Data\n",
9 | "In this notebook, we'll be sharing how to synthesize your own test data for test purposes. We will cover how to synthesize data for three use cases: binary classification, multiclass classification, and regression."
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "id": "c7f25f02",
15 | "metadata": {},
16 | "source": [
17 | "## Project Setup"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 1,
23 | "id": "70806b94",
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "# Importing the required Python libraries\n",
28 | "import pandas as pd\n",
29 | "from sklearn.datasets import make_blobs, make_classification, make_regression\n",
30 | "from sklearn.model_selection import train_test_split\n",
31 | "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, mean_absolute_error, mean_squared_error, r2_score\n",
32 | "from sklearn.ensemble import RandomForestClassifier\n",
33 | "from sklearn.linear_model import LogisticRegression"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "id": "4915e523",
39 | "metadata": {},
40 | "source": [
41 | "## Use Case #1: Binary Classification"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 19,
47 | "id": "3c1034be",
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "# Generating synthetic binary classification in the form of arrays\n",
52 | "X, y = make_classification(n_samples = 10000,\n",
53 | " n_features = 25,\n",
54 | " n_informative = 10,\n",
55 | " n_redundant = 10,\n",
56 | " n_repeated = 5,\n",
57 | " n_classes = 2,\n",
58 | " weights = [0.6, 0.4])"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 20,
64 | "id": "7160aa2d",
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "# Transforming the arrays into Pandas DataFrames\n",
69 | "df_X = pd.DataFrame(data = X)\n",
70 | "df_y = pd.DataFrame(data = y, columns = ['target'])"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 21,
76 | "id": "2a422c2b",
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "# Performing a split on the data to save data as a holdout, validation set\n",
81 | "X_train, X_val, y_train, y_val = train_test_split(df_X, df_y)"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 22,
87 | "id": "32ea8505",
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "# Instantiating the binary classification model with the RandomForestClassifier algorithm\n",
92 | "binary_classification_model = RandomForestClassifier()"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 23,
98 | "id": "20f39ca5",
99 | "metadata": {},
100 | "outputs": [
101 | {
102 | "name": "stderr",
103 | "output_type": "stream",
104 | "text": [
105 | "C:\\Users\\david\\AppData\\Local\\Temp/ipykernel_7920/4251291611.py:2: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
106 | " binary_classification_model.fit(X_train, y_train)\n"
107 | ]
108 | },
109 | {
110 | "data": {
111 | "text/plain": [
112 | "RandomForestClassifier()"
113 | ]
114 | },
115 | "execution_count": 23,
116 | "metadata": {},
117 | "output_type": "execute_result"
118 | }
119 | ],
120 | "source": [
121 | "# Training the binary classification model against the training data\n",
122 | "binary_classification_model.fit(X_train, y_train)"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 24,
128 | "id": "08a870a4",
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "# Getting inferential predictions from the validation dataset\n",
133 | "val_preds = binary_classification_model.predict(X_val)"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 25,
139 | "id": "7214f419",
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "# Generating validation metrics by comparing the inferential predictions (val_preds) to the actuals (y_val)\n",
144 | "val_accuracy = accuracy_score(y_val, val_preds)\n",
145 | "val_roc_auc_score = roc_auc_score(y_val, val_preds)\n",
146 | "val_f1_score = f1_score(y_val, val_preds)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 26,
152 | "id": "f1a61d57",
153 | "metadata": {},
154 | "outputs": [
155 | {
156 | "name": "stdout",
157 | "output_type": "stream",
158 | "text": [
159 | "Accuracy score: 0.9484\n",
160 | "ROC AUC score: 0.9330565646081993\n",
161 | "F1 score: 0.9330565646081993\n"
162 | ]
163 | }
164 | ],
165 | "source": [
166 | "# Printing out the average validation metrics\n",
167 | "print(f'Accuracy score: {val_accuracy}')\n",
168 | "print(f'ROC AUC score: {val_f1_score}')\n",
169 | "print(f'F1 score: {val_f1_score}')"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "id": "e0645cd9",
176 | "metadata": {},
177 | "outputs": [],
178 | "source": []
179 | }
180 | ],
181 | "metadata": {
182 | "kernelspec": {
183 | "display_name": "Python 3 (ipykernel)",
184 | "language": "python",
185 | "name": "python3"
186 | },
187 | "language_info": {
188 | "codemirror_mode": {
189 | "name": "ipython",
190 | "version": 3
191 | },
192 | "file_extension": ".py",
193 | "mimetype": "text/x-python",
194 | "name": "python",
195 | "nbconvert_exporter": "python",
196 | "pygments_lexer": "ipython3",
197 | "version": "3.10.1"
198 | }
199 | },
200 | "nbformat": 4,
201 | "nbformat_minor": 5
202 | }
203 |
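The notebook imports make_blobs and make_regression but, as of this snapshot, only exercises the binary case. A minimal sketch of the other two advertised use cases under the same conventions (every parameter value here is illustrative):

    from sklearn.datasets import make_blobs, make_classification, make_regression

    # Use case #2: multiclass classification (three target classes instead of two)
    X_mc, y_mc = make_classification(n_samples = 10000, n_features = 25,
                                     n_informative = 10, n_redundant = 10,
                                     n_repeated = 5, n_classes = 3)

    # make_blobs is an alternative that draws each class from its own Gaussian cluster
    X_blob, y_blob = make_blobs(n_samples = 10000, n_features = 25, centers = 3)

    # Use case #3: regression (continuous target)
    X_reg, y_reg = make_regression(n_samples = 10000, n_features = 25,
                                   n_informative = 10, noise = 10.0)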
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Science Quick Tips Repository!
2 | This repository contains all the code associated with each of my "Data Science Quick Tips" posts. Feel free to use as you please!
3 |
--------------------------------------------------------------------------------