├── .circleci
│   └── config.yml
├── .dockerignore
├── .gitignore
├── Dockerfile
├── LICENSE
├── Makefile
├── Procfile
├── README.md
├── jupyter_notebooks
│   ├── Section12_DeepLearningModel
│   │   └── CNN_Analysis_and Model.ipynb
│   ├── Section2_MLPipelineOverview
│   │   ├── 02.10_ML_Pipeline-WrapUp_for_Deployment.ipynb
│   │   ├── 02.6_ML_Pipeline_Step1-DataAnalysis.ipynb
│   │   ├── 02.7_ML_Pipeline_Step2-FeatureEngineering.ipynb
│   │   ├── 02.8_ML_Pipeline_Step3-FeatureSelection.ipynb
│   │   ├── 02.9_ML_Pipeline_Step4-MachineLearningModelBuild.ipynb
│   │   └── BONUS_Randomisation_in_ML _and_setting_the_seed.ipynb
│   └── requirements.txt
├── packages
│   ├── ml_api
│   │   ├── VERSION
│   │   ├── api
│   │   │   ├── __init__.py
│   │   │   ├── app.py
│   │   │   ├── config.py
│   │   │   ├── controller.py
│   │   │   └── validation.py
│   │   ├── diff_test_requirements.txt
│   │   ├── requirements.txt
│   │   ├── run.py
│   │   ├── run.sh
│   │   ├── test_data_predictions.csv
│   │   └── tests
│   │       ├── __init__.py
│   │       ├── capture_model_predictions.py
│   │       ├── conftest.py
│   │       ├── differential_tests
│   │       │   ├── __init__.py
│   │       │   └── test_differential.py
│   │       ├── test_controller.py
│   │       └── test_validation.py
│   ├── neural_network_model
│   │   ├── MANIFEST.in
│   │   ├── config.yml
│   │   ├── neural_network_model
│   │   │   ├── VERSION
│   │   │   ├── __init__.py
│   │   │   ├── config
│   │   │   │   ├── __init__.py
│   │   │   │   └── config.py
│   │   │   ├── datasets
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_data
│   │   │   │       ├── Black-grass
│   │   │   │       │   └── 1.png
│   │   │   │       ├── Charlock
│   │   │   │       │   └── 1.png
│   │   │   │       └── __init__.py
│   │   │   ├── model.py
│   │   │   ├── pipeline.py
│   │   │   ├── predict.py
│   │   │   ├── processing
│   │   │   │   ├── __init__.py
│   │   │   │   ├── data_management.py
│   │   │   │   ├── errors.py
│   │   │   │   └── preprocessors.py
│   │   │   ├── train_pipeline.py
│   │   │   └── trained_models
│   │   │       └── __init__.py
│   │   ├── requirements.txt
│   │   ├── setup.py
│   │   └── tests
│   │       ├── __init__.py
│   │       ├── conftest.py
│   │       └── test_predict.py
│   └── regression_model
│       ├── MANIFEST.in
│       ├── regression_model
│       │   ├── VERSION
│       │   ├── __init__.py
│       │   ├── config
│       │   │   ├── __init__.py
│       │   │   ├── config.py
│       │   │   └── logging_config.py
│       │   ├── datasets
│       │   │   └── __init__.py
│       │   ├── pipeline.py
│       │   ├── predict.py
│       │   ├── processing
│       │   │   ├── __init__.py
│       │   │   ├── data_management.py
│       │   │   ├── errors.py
│       │   │   ├── features.py
│       │   │   ├── preprocessors.py
│       │   │   └── validation.py
│       │   ├── train_pipeline.py
│       │   └── trained_models
│       │       └── __init__.py
│       ├── requirements.txt
│       ├── setup.py
│       └── tests
│           ├── __init__.py
│           └── test_predict.py
├── requirements.txt
└── scripts
    ├── fetch_kaggle_dataset.sh
    ├── fetch_kaggle_large_dataset.sh
    ├── input_test.json
    └── publish_model.sh
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | defaults: &defaults
4 | docker:
5 | - image: circleci/python:3.7.2
6 | working_directory: ~/project
7 |
8 | prepare_venv: &prepare_venv
9 | run:
10 | name: Create venv
11 | command: |
12 | python3 -m venv venv
13 | source venv/bin/activate
14 | pip install --upgrade pip
15 |
16 | fetch_data: &fetch_data
17 | run:
18 | name: Set script permissions and fetch data
19 | command: |
20 | source venv/bin/activate
21 | chmod +x ./scripts/fetch_kaggle_dataset.sh
22 | ./scripts/fetch_kaggle_dataset.sh
23 |
24 | jobs:
25 | test_regression_model:
26 | <<: *defaults
27 | steps:
28 | - checkout
29 | - *prepare_venv
30 | - run:
31 | name: Install requirements
32 | command: |
33 | . venv/bin/activate
34 | pip install -r packages/regression_model/requirements.txt
35 | - *fetch_data
36 | - run:
37 | name: Train model
38 | command: |
39 | . venv/bin/activate
40 | PYTHONPATH=./packages/regression_model python3 packages/regression_model/regression_model/train_pipeline.py
41 | - run:
42 | name: Run tests
43 | command: |
44 | . venv/bin/activate
45 | py.test -vv packages/regression_model/tests
46 |
47 | test_ml_api:
48 | <<: *defaults
49 | steps:
50 | - checkout
51 | - restore_cache:
52 | keys:
53 | - py-deps-{{ checksum "packages/ml_api/requirements.txt" }}
54 | - run:
55 | name: Running tests
56 | command: |
57 | python3 -m venv venv
58 | . venv/bin/activate
59 | pip install --upgrade pip
60 | pip install -r packages/ml_api/requirements.txt
61 | py.test -vv packages/ml_api/tests -m "not differential"
62 | - save_cache:
63 | key: py-deps-{{ checksum "packages/ml_api/requirements.txt" }}
64 | paths:
65 | - "/venv"
66 |
67 | train_and_upload_regression_model:
68 | <<: *defaults
69 | steps:
70 | - checkout
71 | - *prepare_venv
72 | - run:
73 | name: Install requirements
74 | command: |
75 | . venv/bin/activate
76 | pip install -r packages/regression_model/requirements.txt
77 | - *fetch_data
78 | - run:
79 | name: Train model
80 | command: |
81 | . venv/bin/activate
82 | PYTHONPATH=./packages/regression_model python3 packages/regression_model/regression_model/train_pipeline.py
83 | - run:
84 | name: Publish model to Gemfury
85 | command: |
86 | . venv/bin/activate
87 | chmod +x ./scripts/publish_model.sh
88 | ./scripts/publish_model.sh ./packages/regression_model/
89 |
90 | section_9_differential_tests:
91 | <<: *defaults
92 | steps:
93 | - checkout
94 | - *prepare_venv
95 | - run:
96 | name: Capturing previous model predictions
97 | command: |
98 | . venv/bin/activate
99 | pip install -r packages/ml_api/diff_test_requirements.txt
100 | PYTHONPATH=./packages/ml_api python3 packages/ml_api/tests/capture_model_predictions.py
101 | - run:
102 | name: Running differential tests
103 | command: |
104 | . venv/bin/activate
105 | pip install -r packages/ml_api/requirements.txt
106 | py.test -vv packages/ml_api/tests -m differential
107 |
108 | section_10_deploy_to_heroku:
109 | <<: *defaults
110 | steps:
111 | - checkout
112 | - run:
113 | name: Deploy to Heroku
114 | command: |
115 | git push https://heroku:$HEROKU_API_KEY@git.heroku.com/$HEROKU_APP_NAME.git master
116 |
117 | section_11_build_and_push_to_heroku_docker:
118 | <<: *defaults
119 | steps:
120 | - checkout
121 | - setup_remote_docker:
122 | docker_layer_caching: true
123 | - run: docker login --username=$HEROKU_EMAIL --password=$HEROKU_API_KEY registry.heroku.com
124 | - run:
125 | name: Setup Heroku CLI
126 | command: |
127 | wget -qO- https://cli-assets.heroku.com/install-ubuntu.sh | sh
128 | - run:
129 | name: Build and Push Image
130 | command: |
131 | make build-ml-api-heroku push-ml-api-heroku
132 | - run:
133 | name: Release to Heroku
134 | command: |
135 | heroku container:release web --app $HEROKU_APP_NAME
136 |
137 | section_12_publish_docker_image_to_aws:
138 | <<: *defaults
139 | working_directory: ~/project/packages/ml_models
140 | steps:
141 | - checkout
142 | - setup_remote_docker
143 | - run:
144 | name: Publishing docker image to AWS ECR
145 | command: |
146 | sudo pip install awscli
147 | eval $(aws ecr get-login --no-include-email --region us-east-1)
148 | make build-ml-api-aws tag-ml-api push-ml-api-aws
149 | aws ecs update-service --cluster ml-api-cluster --service custom-service --task-definition first-run-task-definition --force-new-deployment
150 |
151 | section_13_train_and_upload_neural_network_model:
152 | docker:
153 | - image: circleci/python:3.6.4-stretch
154 | working_directory: ~/project
155 | steps:
156 | - checkout
157 | - *prepare_venv
158 | - run:
159 | name: Install requirements
160 | command: |
161 | . venv/bin/activate
162 | pip install -r packages/neural_network_model/requirements.txt
163 | - run:
164 | name: Fetch Training data - 2GB
165 | command: |
166 | . venv/bin/activate
167 | chmod +x ./scripts/fetch_kaggle_large_dataset.sh
168 | ./scripts/fetch_kaggle_large_dataset.sh
169 | - run:
170 | name: Train model
171 | command: |
172 | . venv/bin/activate
173 | PYTHONPATH=./packages/neural_network_model python3 packages/neural_network_model/neural_network_model/train_pipeline.py
174 | - run:
175 | name: Publish model to Gemfury
176 | command: |
177 | . venv/bin/activate
178 | chmod +x ./scripts/publish_model.sh
179 | ./scripts/publish_model.sh ./packages/neural_network_model/
180 |
181 | workflows:
182 | version: 2
183 | test-all:
184 | jobs:
185 | - test_regression_model
186 | - test_ml_api
187 | - section_9_differential_tests
188 | - train_and_upload_regression_model:
189 | requires:
190 | - test_regression_model
191 | - test_ml_api
192 | - section_9_differential_tests
193 | filters:
194 | branches:
195 | only:
196 | - master
197 | # - section_10_deploy_to_heroku:
198 | # requires:
199 | # - train_and_upload_regression_model
200 | # filters:
201 | # branches:
202 | # only:
203 | # - master
204 | - section_11_build_and_push_to_heroku_docker:
205 | requires:
206 | - train_and_upload_regression_model
207 | filters:
208 | branches:
209 | only:
210 | - master
211 | # - section_12_publish_docker_image_to_aws:
212 | # requires:
213 | # - train_and_upload_regression_model
214 | # filters:
215 | # branches:
216 | # only:
217 | # - master
218 | - section_13_train_and_upload_neural_network_model:
219 | requires:
220 | - test_regression_model
221 | - test_ml_api
222 | - section_9_differential_tests
223 | # - train_and_upload_regression_model
224 | # filters:
225 | # branches:
226 | # only:
227 | # - master
228 |
--------------------------------------------------------------------------------
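
A note on the YAML anchors used in config.yml above: `defaults: &defaults` captures a mapping once, and `<<: *defaults` merges that mapping into each job. A minimal sketch, using PyYAML (not part of this repo), that demonstrates the merge behaviour:

import yaml  # pip install pyyaml

SNIPPET = """
defaults: &defaults
  docker:
    - image: circleci/python:3.7.2
  working_directory: ~/project

jobs:
  test_regression_model:
    <<: *defaults
    steps: [checkout]
"""

config = yaml.safe_load(SNIPPET)
# the job inherits both keys from the anchored `defaults` mapping
print(config["jobs"]["test_regression_model"]["working_directory"])  # ~/project
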
/.dockerignore:
--------------------------------------------------------------------------------
1 | jupyter_notebooks*
2 | */env*
3 | */venv*
4 | .circleci*
5 | packages/regression_model
6 | *.env
7 | *.log
8 | .git
9 | .gitignore
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | # pycharm
107 | .idea/
108 |
109 | # datafiles
110 | train.csv
111 | test.csv
112 | test_data_predictions.csv
113 | v2-plant-seedlings-dataset/
114 | v2-plant-seedlings-dataset.zip
115 |
116 | # all logs
117 | logs/
118 |
119 | # trained models (will be created in CI)
120 | packages/regression_model/regression_model/trained_models/*.pkl
121 | packages/neural_network_model/neural_network_model/trained_models/*.pkl
122 | packages/neural_network_model/neural_network_model/trained_models/*.h5
123 | *.h5
124 | packages/neural_network_model/neural_network_model/datasets/training_data_reference.txt
125 |
126 | .DS_Store
127 |
128 | kaggle.json
129 | packages/ml_api/uploads/*
130 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.6.4
2 |
3 | # Create the user that will run the app
4 | RUN adduser --disabled-password --gecos '' ml-api-user
5 |
6 | WORKDIR /opt/ml_api
7 |
8 | ARG PIP_EXTRA_INDEX_URL
9 | ENV FLASK_APP run.py
10 |
11 | # Install requirements, including from Gemfury
12 | ADD ./packages/ml_api /opt/ml_api/
13 | RUN pip install --upgrade pip
14 | RUN pip install -r /opt/ml_api/requirements.txt
15 |
16 | RUN chmod +x /opt/ml_api/run.sh
17 | RUN chown -R ml-api-user:ml-api-user ./
18 |
19 | USER ml-api-user
20 |
21 | EXPOSE 5000
22 |
23 | CMD ["bash", "./run.sh"]
--------------------------------------------------------------------------------
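
The Dockerfile above accepts PIP_EXTRA_INDEX_URL as a build argument (so pip can reach the private index that the CI pipeline publishes models to) and exposes port 5000, where run.sh starts the app. A minimal sketch of building and running the image from Python with the docker SDK (pip install docker); the `ml-api:local` tag is illustrative only, since the repo drives real builds through the Makefile targets shown below:

import os
import docker  # pip install docker; requires a running Docker daemon

client = docker.from_env()

# build the image, forwarding the private index URL as a build argument
image, _build_logs = client.images.build(
    path=".",  # repo root, where the Dockerfile lives
    tag="ml-api:local",  # hypothetical tag
    buildargs={"PIP_EXTRA_INDEX_URL": os.environ.get("PIP_EXTRA_INDEX_URL", "")},
)

# run the container, publishing the EXPOSEd Flask port
container = client.containers.run("ml-api:local", detach=True, ports={"5000/tcp": 5000})
print(container.id)
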
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2019, Soledad Galli and Christopher Samiullah. Deployment of Machine Learning Models, online course.
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | NAME=udemy-ml-api
2 | COMMIT_ID=$(shell git rev-parse HEAD)
3 |
4 |
5 | build-ml-api-heroku:
6 | docker build --build-arg PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -t registry.heroku.com/$(NAME)/web:$(COMMIT_ID) .
7 |
8 | push-ml-api-heroku:
9 | docker push registry.heroku.com/${HEROKU_APP_NAME}/web:$(COMMIT_ID)
10 |
11 | build-ml-api-aws:
12 | docker build --build-arg PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -t $(NAME):$(COMMIT_ID) .
13 |
14 | push-ml-api-aws:
15 | docker push ${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/$(NAME):$(COMMIT_ID)
16 |
17 | tag-ml-api:
18 | docker tag $(NAME):$(COMMIT_ID) ${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/$(NAME):$(COMMIT_ID)
19 |
--------------------------------------------------------------------------------
/Procfile:
--------------------------------------------------------------------------------
1 | web: gunicorn --pythonpath packages/ml_api --access-logfile - --error-logfile - run:application
--------------------------------------------------------------------------------
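
In the Procfile above, `run:application` tells gunicorn to import the module `run` (located via --pythonpath packages/ml_api) and serve the WSGI callable named `application`. The repo's run.py is not reproduced in this dump, so the following is only a minimal sketch of the shape gunicorn expects, assuming a Flask application factory:

from flask import Flask


def create_app() -> Flask:
    # the real factory would also wire up config, blueprints and logging
    app = Flask(__name__)

    @app.route("/health")
    def health() -> str:
        return "ok"

    return app


# gunicorn resolves `run:application` to this module-level WSGI callable
application = create_app()
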
/README.md:
--------------------------------------------------------------------------------
1 | # Deployment of Machine Learning Models
2 | Accompanying repo for the online course Deployment of Machine Learning Models.
3 |
4 | For the documentation, visit the [course on Udemy](https://www.udemy.com/deployment-of-machine-learning-models/?couponCode=TIDREPO).
5 |
--------------------------------------------------------------------------------
/jupyter_notebooks/Section2_MLPipelineOverview/02.10_ML_Pipeline-WrapUp_for_Deployment.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Machine Learning Model Building Pipeline: Wrapping up for Deployment\n",
8 | "\n",
9 | "\n",
10 | "In the previous lectures, we worked through the typical Machine Learning pipeline to build a regression model that allows us to predict house prices. Briefly, we transformed variables in the dataset to make them suitable for use in a Regression model, then we selected the most predictive variables and finally we built our model.\n",
11 | "\n",
12 | "Now, we want to deploy our model. We want to create an API, that we can call with new data, with new characteristics about houses, to get an estimate of the SalePrice. In order to do so, we need to write code in a very specific way. We will show you how to write production code in the coming lectures.\n",
13 | "\n",
14 | "Here, we will summarise, the key pieces of code, that we need to take forward, for this particular project, to put our model in production.\n",
15 | "\n",
16 | "Let's go ahead and get started."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "### Setting the seed\n",
24 | "\n",
25 | "It is important to note, that we are engineering variables and pre-processing data with the idea of deploying the model if we find business value in it. Therefore, from now on, for each step that includes some element of randomness, it is extremely important that we **set the seed**. This way, we can obtain reproducibility between our research and our development code.\n",
26 | "\n",
27 | "This is perhaps one of the most important lessons that you need to take away from this course: **Always set the seeds**.\n",
28 | "\n",
29 | "Let's go ahead and load the dataset."
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 1,
35 | "metadata": {
36 | "collapsed": true
37 | },
38 | "outputs": [],
39 | "source": [
40 | "# to handle datasets\n",
41 | "import pandas as pd\n",
42 | "import numpy as np\n",
43 | "\n",
44 | "# to divide train and test set\n",
45 | "from sklearn.model_selection import train_test_split\n",
46 | "\n",
47 | "# feature scaling\n",
48 | "from sklearn.preprocessing import MinMaxScaler\n",
49 | "\n",
50 | "# to build the models\n",
51 | "from sklearn.linear_model import Lasso\n",
52 | "\n",
53 | "# to evaluate the models\n",
54 | "from sklearn.metrics import mean_squared_error\n",
55 | "from math import sqrt\n",
56 | "\n",
57 | "# to persist the model and the scaler\n",
58 | "from sklearn.externals import joblib\n",
59 | "\n",
60 | "# to visualise al the columns in the dataframe\n",
61 | "pd.pandas.set_option('display.max_columns', None)"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "## Load data\n",
69 | "\n",
70 | "We need the training data to train our model in the production environment. "
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 2,
76 | "metadata": {},
77 | "outputs": [
78 | {
79 | "name": "stdout",
80 | "output_type": "stream",
81 | "text": [
82 | "(1460, 81)\n"
83 | ]
84 | },
85 | {
86 | "data": {
87 | "text/html": [
88 | "
\n",
89 | "\n",
102 | "
\n",
103 | " \n",
104 | " \n",
105 | " | \n",
106 | " Id | \n",
107 | " MSSubClass | \n",
108 | " MSZoning | \n",
109 | " LotFrontage | \n",
110 | " LotArea | \n",
111 | " Street | \n",
112 | " Alley | \n",
113 | " LotShape | \n",
114 | " LandContour | \n",
115 | " Utilities | \n",
116 | " LotConfig | \n",
117 | " LandSlope | \n",
118 | " Neighborhood | \n",
119 | " Condition1 | \n",
120 | " Condition2 | \n",
121 | " BldgType | \n",
122 | " HouseStyle | \n",
123 | " OverallQual | \n",
124 | " OverallCond | \n",
125 | " YearBuilt | \n",
126 | " YearRemodAdd | \n",
127 | " RoofStyle | \n",
128 | " RoofMatl | \n",
129 | " Exterior1st | \n",
130 | " Exterior2nd | \n",
131 | " MasVnrType | \n",
132 | " MasVnrArea | \n",
133 | " ExterQual | \n",
134 | " ExterCond | \n",
135 | " Foundation | \n",
136 | " BsmtQual | \n",
137 | " BsmtCond | \n",
138 | " BsmtExposure | \n",
139 | " BsmtFinType1 | \n",
140 | " BsmtFinSF1 | \n",
141 | " BsmtFinType2 | \n",
142 | " BsmtFinSF2 | \n",
143 | " BsmtUnfSF | \n",
144 | " TotalBsmtSF | \n",
145 | " Heating | \n",
146 | " HeatingQC | \n",
147 | " CentralAir | \n",
148 | " Electrical | \n",
149 | " 1stFlrSF | \n",
150 | " 2ndFlrSF | \n",
151 | " LowQualFinSF | \n",
152 | " GrLivArea | \n",
153 | " BsmtFullBath | \n",
154 | " BsmtHalfBath | \n",
155 | " FullBath | \n",
156 | " HalfBath | \n",
157 | " BedroomAbvGr | \n",
158 | " KitchenAbvGr | \n",
159 | " KitchenQual | \n",
160 | " TotRmsAbvGrd | \n",
161 | " Functional | \n",
162 | " Fireplaces | \n",
163 | " FireplaceQu | \n",
164 | " GarageType | \n",
165 | " GarageYrBlt | \n",
166 | " GarageFinish | \n",
167 | " GarageCars | \n",
168 | " GarageArea | \n",
169 | " GarageQual | \n",
170 | " GarageCond | \n",
171 | " PavedDrive | \n",
172 | " WoodDeckSF | \n",
173 | " OpenPorchSF | \n",
174 | " EnclosedPorch | \n",
175 | " 3SsnPorch | \n",
176 | " ScreenPorch | \n",
177 | " PoolArea | \n",
178 | " PoolQC | \n",
179 | " Fence | \n",
180 | " MiscFeature | \n",
181 | " MiscVal | \n",
182 | " MoSold | \n",
183 | " YrSold | \n",
184 | " SaleType | \n",
185 | " SaleCondition | \n",
186 | " SalePrice | \n",
187 | "
\n",
188 | " \n",
189 | " \n",
190 | " \n",
191 | " 0 | \n",
192 | " 1 | \n",
193 | " 60 | \n",
194 | " RL | \n",
195 | " 65.0 | \n",
196 | " 8450 | \n",
197 | " Pave | \n",
198 | " NaN | \n",
199 | " Reg | \n",
200 | " Lvl | \n",
201 | " AllPub | \n",
202 | " Inside | \n",
203 | " Gtl | \n",
204 | " CollgCr | \n",
205 | " Norm | \n",
206 | " Norm | \n",
207 | " 1Fam | \n",
208 | " 2Story | \n",
209 | " 7 | \n",
210 | " 5 | \n",
211 | " 2003 | \n",
212 | " 2003 | \n",
213 | " Gable | \n",
214 | " CompShg | \n",
215 | " VinylSd | \n",
216 | " VinylSd | \n",
217 | " BrkFace | \n",
218 | " 196.0 | \n",
219 | " Gd | \n",
220 | " TA | \n",
221 | " PConc | \n",
222 | " Gd | \n",
223 | " TA | \n",
224 | " No | \n",
225 | " GLQ | \n",
226 | " 706 | \n",
227 | " Unf | \n",
228 | " 0 | \n",
229 | " 150 | \n",
230 | " 856 | \n",
231 | " GasA | \n",
232 | " Ex | \n",
233 | " Y | \n",
234 | " SBrkr | \n",
235 | " 856 | \n",
236 | " 854 | \n",
237 | " 0 | \n",
238 | " 1710 | \n",
239 | " 1 | \n",
240 | " 0 | \n",
241 | " 2 | \n",
242 | " 1 | \n",
243 | " 3 | \n",
244 | " 1 | \n",
245 | " Gd | \n",
246 | " 8 | \n",
247 | " Typ | \n",
248 | " 0 | \n",
249 | " NaN | \n",
250 | " Attchd | \n",
251 | " 2003.0 | \n",
252 | " RFn | \n",
253 | " 2 | \n",
254 | " 548 | \n",
255 | " TA | \n",
256 | " TA | \n",
257 | " Y | \n",
258 | " 0 | \n",
259 | " 61 | \n",
260 | " 0 | \n",
261 | " 0 | \n",
262 | " 0 | \n",
263 | " 0 | \n",
264 | " NaN | \n",
265 | " NaN | \n",
266 | " NaN | \n",
267 | " 0 | \n",
268 | " 2 | \n",
269 | " 2008 | \n",
270 | " WD | \n",
271 | " Normal | \n",
272 | " 208500 | \n",
273 | "
\n",
274 | " \n",
275 | " 1 | \n",
276 | " 2 | \n",
277 | " 20 | \n",
278 | " RL | \n",
279 | " 80.0 | \n",
280 | " 9600 | \n",
281 | " Pave | \n",
282 | " NaN | \n",
283 | " Reg | \n",
284 | " Lvl | \n",
285 | " AllPub | \n",
286 | " FR2 | \n",
287 | " Gtl | \n",
288 | " Veenker | \n",
289 | " Feedr | \n",
290 | " Norm | \n",
291 | " 1Fam | \n",
292 | " 1Story | \n",
293 | " 6 | \n",
294 | " 8 | \n",
295 | " 1976 | \n",
296 | " 1976 | \n",
297 | " Gable | \n",
298 | " CompShg | \n",
299 | " MetalSd | \n",
300 | " MetalSd | \n",
301 | " None | \n",
302 | " 0.0 | \n",
303 | " TA | \n",
304 | " TA | \n",
305 | " CBlock | \n",
306 | " Gd | \n",
307 | " TA | \n",
308 | " Gd | \n",
309 | " ALQ | \n",
310 | " 978 | \n",
311 | " Unf | \n",
312 | " 0 | \n",
313 | " 284 | \n",
314 | " 1262 | \n",
315 | " GasA | \n",
316 | " Ex | \n",
317 | " Y | \n",
318 | " SBrkr | \n",
319 | " 1262 | \n",
320 | " 0 | \n",
321 | " 0 | \n",
322 | " 1262 | \n",
323 | " 0 | \n",
324 | " 1 | \n",
325 | " 2 | \n",
326 | " 0 | \n",
327 | " 3 | \n",
328 | " 1 | \n",
329 | " TA | \n",
330 | " 6 | \n",
331 | " Typ | \n",
332 | " 1 | \n",
333 | " TA | \n",
334 | " Attchd | \n",
335 | " 1976.0 | \n",
336 | " RFn | \n",
337 | " 2 | \n",
338 | " 460 | \n",
339 | " TA | \n",
340 | " TA | \n",
341 | " Y | \n",
342 | " 298 | \n",
343 | " 0 | \n",
344 | " 0 | \n",
345 | " 0 | \n",
346 | " 0 | \n",
347 | " 0 | \n",
348 | " NaN | \n",
349 | " NaN | \n",
350 | " NaN | \n",
351 | " 0 | \n",
352 | " 5 | \n",
353 | " 2007 | \n",
354 | " WD | \n",
355 | " Normal | \n",
356 | " 181500 | \n",
357 | "
\n",
358 | " \n",
359 | " 2 | \n",
360 | " 3 | \n",
361 | " 60 | \n",
362 | " RL | \n",
363 | " 68.0 | \n",
364 | " 11250 | \n",
365 | " Pave | \n",
366 | " NaN | \n",
367 | " IR1 | \n",
368 | " Lvl | \n",
369 | " AllPub | \n",
370 | " Inside | \n",
371 | " Gtl | \n",
372 | " CollgCr | \n",
373 | " Norm | \n",
374 | " Norm | \n",
375 | " 1Fam | \n",
376 | " 2Story | \n",
377 | " 7 | \n",
378 | " 5 | \n",
379 | " 2001 | \n",
380 | " 2002 | \n",
381 | " Gable | \n",
382 | " CompShg | \n",
383 | " VinylSd | \n",
384 | " VinylSd | \n",
385 | " BrkFace | \n",
386 | " 162.0 | \n",
387 | " Gd | \n",
388 | " TA | \n",
389 | " PConc | \n",
390 | " Gd | \n",
391 | " TA | \n",
392 | " Mn | \n",
393 | " GLQ | \n",
394 | " 486 | \n",
395 | " Unf | \n",
396 | " 0 | \n",
397 | " 434 | \n",
398 | " 920 | \n",
399 | " GasA | \n",
400 | " Ex | \n",
401 | " Y | \n",
402 | " SBrkr | \n",
403 | " 920 | \n",
404 | " 866 | \n",
405 | " 0 | \n",
406 | " 1786 | \n",
407 | " 1 | \n",
408 | " 0 | \n",
409 | " 2 | \n",
410 | " 1 | \n",
411 | " 3 | \n",
412 | " 1 | \n",
413 | " Gd | \n",
414 | " 6 | \n",
415 | " Typ | \n",
416 | " 1 | \n",
417 | " TA | \n",
418 | " Attchd | \n",
419 | " 2001.0 | \n",
420 | " RFn | \n",
421 | " 2 | \n",
422 | " 608 | \n",
423 | " TA | \n",
424 | " TA | \n",
425 | " Y | \n",
426 | " 0 | \n",
427 | " 42 | \n",
428 | " 0 | \n",
429 | " 0 | \n",
430 | " 0 | \n",
431 | " 0 | \n",
432 | " NaN | \n",
433 | " NaN | \n",
434 | " NaN | \n",
435 | " 0 | \n",
436 | " 9 | \n",
437 | " 2008 | \n",
438 | " WD | \n",
439 | " Normal | \n",
440 | " 223500 | \n",
441 | "
\n",
442 | " \n",
443 | " 3 | \n",
444 | " 4 | \n",
445 | " 70 | \n",
446 | " RL | \n",
447 | " 60.0 | \n",
448 | " 9550 | \n",
449 | " Pave | \n",
450 | " NaN | \n",
451 | " IR1 | \n",
452 | " Lvl | \n",
453 | " AllPub | \n",
454 | " Corner | \n",
455 | " Gtl | \n",
456 | " Crawfor | \n",
457 | " Norm | \n",
458 | " Norm | \n",
459 | " 1Fam | \n",
460 | " 2Story | \n",
461 | " 7 | \n",
462 | " 5 | \n",
463 | " 1915 | \n",
464 | " 1970 | \n",
465 | " Gable | \n",
466 | " CompShg | \n",
467 | " Wd Sdng | \n",
468 | " Wd Shng | \n",
469 | " None | \n",
470 | " 0.0 | \n",
471 | " TA | \n",
472 | " TA | \n",
473 | " BrkTil | \n",
474 | " TA | \n",
475 | " Gd | \n",
476 | " No | \n",
477 | " ALQ | \n",
478 | " 216 | \n",
479 | " Unf | \n",
480 | " 0 | \n",
481 | " 540 | \n",
482 | " 756 | \n",
483 | " GasA | \n",
484 | " Gd | \n",
485 | " Y | \n",
486 | " SBrkr | \n",
487 | " 961 | \n",
488 | " 756 | \n",
489 | " 0 | \n",
490 | " 1717 | \n",
491 | " 1 | \n",
492 | " 0 | \n",
493 | " 1 | \n",
494 | " 0 | \n",
495 | " 3 | \n",
496 | " 1 | \n",
497 | " Gd | \n",
498 | " 7 | \n",
499 | " Typ | \n",
500 | " 1 | \n",
501 | " Gd | \n",
502 | " Detchd | \n",
503 | " 1998.0 | \n",
504 | " Unf | \n",
505 | " 3 | \n",
506 | " 642 | \n",
507 | " TA | \n",
508 | " TA | \n",
509 | " Y | \n",
510 | " 0 | \n",
511 | " 35 | \n",
512 | " 272 | \n",
513 | " 0 | \n",
514 | " 0 | \n",
515 | " 0 | \n",
516 | " NaN | \n",
517 | " NaN | \n",
518 | " NaN | \n",
519 | " 0 | \n",
520 | " 2 | \n",
521 | " 2006 | \n",
522 | " WD | \n",
523 | " Abnorml | \n",
524 | " 140000 | \n",
525 | "
\n",
526 | " \n",
527 | " 4 | \n",
528 | " 5 | \n",
529 | " 60 | \n",
530 | " RL | \n",
531 | " 84.0 | \n",
532 | " 14260 | \n",
533 | " Pave | \n",
534 | " NaN | \n",
535 | " IR1 | \n",
536 | " Lvl | \n",
537 | " AllPub | \n",
538 | " FR2 | \n",
539 | " Gtl | \n",
540 | " NoRidge | \n",
541 | " Norm | \n",
542 | " Norm | \n",
543 | " 1Fam | \n",
544 | " 2Story | \n",
545 | " 8 | \n",
546 | " 5 | \n",
547 | " 2000 | \n",
548 | " 2000 | \n",
549 | " Gable | \n",
550 | " CompShg | \n",
551 | " VinylSd | \n",
552 | " VinylSd | \n",
553 | " BrkFace | \n",
554 | " 350.0 | \n",
555 | " Gd | \n",
556 | " TA | \n",
557 | " PConc | \n",
558 | " Gd | \n",
559 | " TA | \n",
560 | " Av | \n",
561 | " GLQ | \n",
562 | " 655 | \n",
563 | " Unf | \n",
564 | " 0 | \n",
565 | " 490 | \n",
566 | " 1145 | \n",
567 | " GasA | \n",
568 | " Ex | \n",
569 | " Y | \n",
570 | " SBrkr | \n",
571 | " 1145 | \n",
572 | " 1053 | \n",
573 | " 0 | \n",
574 | " 2198 | \n",
575 | " 1 | \n",
576 | " 0 | \n",
577 | " 2 | \n",
578 | " 1 | \n",
579 | " 4 | \n",
580 | " 1 | \n",
581 | " Gd | \n",
582 | " 9 | \n",
583 | " Typ | \n",
584 | " 1 | \n",
585 | " TA | \n",
586 | " Attchd | \n",
587 | " 2000.0 | \n",
588 | " RFn | \n",
589 | " 3 | \n",
590 | " 836 | \n",
591 | " TA | \n",
592 | " TA | \n",
593 | " Y | \n",
594 | " 192 | \n",
595 | " 84 | \n",
596 | " 0 | \n",
597 | " 0 | \n",
598 | " 0 | \n",
599 | " 0 | \n",
600 | " NaN | \n",
601 | " NaN | \n",
602 | " NaN | \n",
603 | " 0 | \n",
604 | " 12 | \n",
605 | " 2008 | \n",
606 | " WD | \n",
607 | " Normal | \n",
608 | " 250000 | \n",
609 | "
\n",
610 | " \n",
611 | "
\n",
612 | "
"
613 | ],
614 | "text/plain": [
615 | " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
616 | "0 1 60 RL 65.0 8450 Pave NaN Reg \n",
617 | "1 2 20 RL 80.0 9600 Pave NaN Reg \n",
618 | "2 3 60 RL 68.0 11250 Pave NaN IR1 \n",
619 | "3 4 70 RL 60.0 9550 Pave NaN IR1 \n",
620 | "4 5 60 RL 84.0 14260 Pave NaN IR1 \n",
621 | "\n",
622 | " LandContour Utilities LotConfig LandSlope Neighborhood Condition1 \\\n",
623 | "0 Lvl AllPub Inside Gtl CollgCr Norm \n",
624 | "1 Lvl AllPub FR2 Gtl Veenker Feedr \n",
625 | "2 Lvl AllPub Inside Gtl CollgCr Norm \n",
626 | "3 Lvl AllPub Corner Gtl Crawfor Norm \n",
627 | "4 Lvl AllPub FR2 Gtl NoRidge Norm \n",
628 | "\n",
629 | " Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt \\\n",
630 | "0 Norm 1Fam 2Story 7 5 2003 \n",
631 | "1 Norm 1Fam 1Story 6 8 1976 \n",
632 | "2 Norm 1Fam 2Story 7 5 2001 \n",
633 | "3 Norm 1Fam 2Story 7 5 1915 \n",
634 | "4 Norm 1Fam 2Story 8 5 2000 \n",
635 | "\n",
636 | " YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType \\\n",
637 | "0 2003 Gable CompShg VinylSd VinylSd BrkFace \n",
638 | "1 1976 Gable CompShg MetalSd MetalSd None \n",
639 | "2 2002 Gable CompShg VinylSd VinylSd BrkFace \n",
640 | "3 1970 Gable CompShg Wd Sdng Wd Shng None \n",
641 | "4 2000 Gable CompShg VinylSd VinylSd BrkFace \n",
642 | "\n",
643 | " MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure \\\n",
644 | "0 196.0 Gd TA PConc Gd TA No \n",
645 | "1 0.0 TA TA CBlock Gd TA Gd \n",
646 | "2 162.0 Gd TA PConc Gd TA Mn \n",
647 | "3 0.0 TA TA BrkTil TA Gd No \n",
648 | "4 350.0 Gd TA PConc Gd TA Av \n",
649 | "\n",
650 | " BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF \\\n",
651 | "0 GLQ 706 Unf 0 150 856 \n",
652 | "1 ALQ 978 Unf 0 284 1262 \n",
653 | "2 GLQ 486 Unf 0 434 920 \n",
654 | "3 ALQ 216 Unf 0 540 756 \n",
655 | "4 GLQ 655 Unf 0 490 1145 \n",
656 | "\n",
657 | " Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF \\\n",
658 | "0 GasA Ex Y SBrkr 856 854 0 \n",
659 | "1 GasA Ex Y SBrkr 1262 0 0 \n",
660 | "2 GasA Ex Y SBrkr 920 866 0 \n",
661 | "3 GasA Gd Y SBrkr 961 756 0 \n",
662 | "4 GasA Ex Y SBrkr 1145 1053 0 \n",
663 | "\n",
664 | " GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr \\\n",
665 | "0 1710 1 0 2 1 3 \n",
666 | "1 1262 0 1 2 0 3 \n",
667 | "2 1786 1 0 2 1 3 \n",
668 | "3 1717 1 0 1 0 3 \n",
669 | "4 2198 1 0 2 1 4 \n",
670 | "\n",
671 | " KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu \\\n",
672 | "0 1 Gd 8 Typ 0 NaN \n",
673 | "1 1 TA 6 Typ 1 TA \n",
674 | "2 1 Gd 6 Typ 1 TA \n",
675 | "3 1 Gd 7 Typ 1 Gd \n",
676 | "4 1 Gd 9 Typ 1 TA \n",
677 | "\n",
678 | " GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual \\\n",
679 | "0 Attchd 2003.0 RFn 2 548 TA \n",
680 | "1 Attchd 1976.0 RFn 2 460 TA \n",
681 | "2 Attchd 2001.0 RFn 2 608 TA \n",
682 | "3 Detchd 1998.0 Unf 3 642 TA \n",
683 | "4 Attchd 2000.0 RFn 3 836 TA \n",
684 | "\n",
685 | " GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch \\\n",
686 | "0 TA Y 0 61 0 0 \n",
687 | "1 TA Y 298 0 0 0 \n",
688 | "2 TA Y 0 42 0 0 \n",
689 | "3 TA Y 0 35 272 0 \n",
690 | "4 TA Y 192 84 0 0 \n",
691 | "\n",
692 | " ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold \\\n",
693 | "0 0 0 NaN NaN NaN 0 2 2008 \n",
694 | "1 0 0 NaN NaN NaN 0 5 2007 \n",
695 | "2 0 0 NaN NaN NaN 0 9 2008 \n",
696 | "3 0 0 NaN NaN NaN 0 2 2006 \n",
697 | "4 0 0 NaN NaN NaN 0 12 2008 \n",
698 | "\n",
699 | " SaleType SaleCondition SalePrice \n",
700 | "0 WD Normal 208500 \n",
701 | "1 WD Normal 181500 \n",
702 | "2 WD Normal 223500 \n",
703 | "3 WD Abnorml 140000 \n",
704 | "4 WD Normal 250000 "
705 | ]
706 | },
707 | "execution_count": 2,
708 | "metadata": {},
709 | "output_type": "execute_result"
710 | }
711 | ],
712 | "source": [
713 | "# load dataset\n",
714 | "data = pd.read_csv('houseprice.csv')\n",
715 | "print(data.shape)\n",
716 | "data.head()"
717 | ]
718 | },
719 | {
720 | "cell_type": "markdown",
721 | "metadata": {},
722 | "source": [
723 | "## Separate dataset into train and test\n",
724 | "\n",
725 | "Before beginning to engineer our features, it is important to separate our data intro training and testing set. This is to avoid over-fitting. There is an element of randomness in dividing the dataset, so remember to set the seed."
726 | ]
727 | },
728 | {
729 | "cell_type": "code",
730 | "execution_count": 3,
731 | "metadata": {},
732 | "outputs": [
733 | {
734 | "data": {
735 | "text/plain": [
736 | "((1314, 81), (146, 81))"
737 | ]
738 | },
739 | "execution_count": 3,
740 | "metadata": {},
741 | "output_type": "execute_result"
742 | }
743 | ],
744 | "source": [
745 | "# Let's separate into train and test set\n",
746 | "# Remember to seet the seed (random_state for this sklearn function)\n",
747 | "\n",
748 | "X_train, X_test, y_train, y_test = train_test_split(data, data.SalePrice,\n",
749 | " test_size=0.1,\n",
750 | " random_state=0) # we are setting the seed here\n",
751 | "X_train.shape, X_test.shape"
752 | ]
753 | },
754 | {
755 | "cell_type": "markdown",
756 | "metadata": {},
757 | "source": [
758 | "## Selected features\n",
759 | "\n",
760 | "Remember that we will deploy our model utilising only a subset of features, the most predictive ones. This is to make simpler models, so that we build simpler code for deployment. We will tell you more about this in coming lectures."
761 | ]
762 | },
763 | {
764 | "cell_type": "code",
765 | "execution_count": 4,
766 | "metadata": {},
767 | "outputs": [
768 | {
769 | "name": "stdout",
770 | "output_type": "stream",
771 | "text": [
772 | "Number of features: 23\n"
773 | ]
774 | }
775 | ],
776 | "source": [
777 | "# load selected features\n",
778 | "features = pd.read_csv('selected_features.csv', header=None)\n",
779 | "\n",
780 | "# Remember that I added the extra feature, to show you how to put\n",
781 | "# an additional feature engineering step into production\n",
782 | "features = [x for x in features[0]] + ['LotFrontage']\n",
783 | "print('Number of features: ', len(features))"
784 | ]
785 | },
786 | {
787 | "cell_type": "markdown",
788 | "metadata": {},
789 | "source": [
790 | "### Missing values\n",
791 | "\n",
792 | "For categorical variables, we will fill missing information by adding an additional category: \"missing\""
793 | ]
794 | },
795 | {
796 | "cell_type": "code",
797 | "execution_count": 5,
798 | "metadata": {
799 | "scrolled": true
800 | },
801 | "outputs": [
802 | {
803 | "name": "stdout",
804 | "output_type": "stream",
805 | "text": [
806 | "MasVnrType 0.005 % missing values\n",
807 | "BsmtQual 0.024 % missing values\n",
808 | "BsmtExposure 0.025 % missing values\n",
809 | "FireplaceQu 0.473 % missing values\n",
810 | "GarageType 0.056 % missing values\n",
811 | "GarageFinish 0.056 % missing values\n"
812 | ]
813 | }
814 | ],
815 | "source": [
816 | "# make a list of the categorical variables that contain missing values\n",
817 | "vars_with_na = [var for var in features if X_train[var].isnull().sum()>1 and X_train[var].dtypes=='O']\n",
818 | "\n",
819 | "# print the variable name and the percentage of missing values\n",
820 | "for var in vars_with_na:\n",
821 | " print(var, np.round(X_train[var].isnull().mean(), 3), ' % missing values')"
822 | ]
823 | },
824 | {
825 | "cell_type": "markdown",
826 | "metadata": {},
827 | "source": [
828 | "Note that we have much less categorical variables with missing values than in our original dataset. But we still use categorical variables with NA for the final model, so we need to include this piece of feature engineering logic in the deployment pipeline. "
829 | ]
830 | },
831 | {
832 | "cell_type": "code",
833 | "execution_count": 6,
834 | "metadata": {},
835 | "outputs": [
836 | {
837 | "data": {
838 | "text/plain": [
839 | "MasVnrType 0\n",
840 | "BsmtQual 0\n",
841 | "BsmtExposure 0\n",
842 | "FireplaceQu 0\n",
843 | "GarageType 0\n",
844 | "GarageFinish 0\n",
845 | "dtype: int64"
846 | ]
847 | },
848 | "execution_count": 6,
849 | "metadata": {},
850 | "output_type": "execute_result"
851 | }
852 | ],
853 | "source": [
854 | "# I bring forward the functions used in the feature engineering notebook:\n",
855 | "\n",
856 | "# function to replace NA in categorical variables\n",
857 | "def fill_categorical_na(df, var_list):\n",
858 | " X = df.copy()\n",
859 | " X[var_list] = df[var_list].fillna('Missing')\n",
860 | " return X\n",
861 | "\n",
862 | "# replace missing values with new label: \"Missing\"\n",
863 | "X_train = fill_categorical_na(X_train, vars_with_na)\n",
864 | "X_test = fill_categorical_na(X_test, vars_with_na)\n",
865 | "\n",
866 | "# check that we have no missing information in the engineered variables\n",
867 | "X_train[vars_with_na].isnull().sum()"
868 | ]
869 | },
870 | {
871 | "cell_type": "markdown",
872 | "metadata": {},
873 | "source": [
874 | "For numerical variables, we are going to add an additional variable capturing the missing information, and then replace the missing information in the original variable by the mode, or most frequent value:"
875 | ]
876 | },
877 | {
878 | "cell_type": "code",
879 | "execution_count": 7,
880 | "metadata": {},
881 | "outputs": [
882 | {
883 | "name": "stdout",
884 | "output_type": "stream",
885 | "text": [
886 | "LotFrontage 0.177 % missing values\n"
887 | ]
888 | }
889 | ],
890 | "source": [
891 | "# make a list of the numerical variables that contain missing values\n",
892 | "vars_with_na = [var for var in features if X_train[var].isnull().sum()>1 and X_train[var].dtypes!='O']\n",
893 | "\n",
894 | "# print the variable name and the percentage of missing values\n",
895 | "for var in vars_with_na:\n",
896 | " print(var, np.round(X_train[var].isnull().mean(), 3), ' % missing values')"
897 | ]
898 | },
899 | {
900 | "cell_type": "markdown",
901 | "metadata": {},
902 | "source": [
903 | "#### Important: persisting the mean value for NA imputation\n",
904 | "\n",
905 | "As you will see in future sections, one of the key pieces of deploying the model is \"Model Validation\". Model validation refers to corroborating that the deployed model and the model built during research, are identical. The entire pipeline needs to produce identical results.\n",
906 | "\n",
907 | "Therefore, in order to check at the end of the process that the feature engineering pipelines are identical, we will save -we will persist-, the mean value of the variable, so that we can use it at the end, to corroborate our models."
908 | ]
909 | },
910 | {
911 | "cell_type": "code",
912 | "execution_count": 8,
913 | "metadata": {},
914 | "outputs": [
915 | {
916 | "data": {
917 | "text/plain": [
918 | "LotFrontage 0\n",
919 | "dtype: int64"
920 | ]
921 | },
922 | "execution_count": 8,
923 | "metadata": {},
924 | "output_type": "execute_result"
925 | }
926 | ],
927 | "source": [
928 | "# replace the missing values\n",
929 | "\n",
930 | "mean_var_dict = {}\n",
931 | "\n",
932 | "for var in vars_with_na:\n",
933 | " \n",
934 | " # calculate the mode\n",
935 | " mode_val = X_train[var].mode()[0]\n",
936 | " \n",
937 | " # we persist the mean in the dictionary\n",
938 | " mean_var_dict[var] = mode_val\n",
939 | " \n",
940 | " # train\n",
941 | " # note that the additional binary variable was not selected, so we don't need this step any more\n",
942 | " #X_train[var+'_na'] = np.where(X_train[var].isnull(), 1, 0)\n",
943 | " X_train[var].fillna(mode_val, inplace=True)\n",
944 | " \n",
945 | " # test\n",
946 | " # note that the additional binary variable was not selected, so we don't need this step any more\n",
947 | " #X_test[var+'_na'] = np.where(X_test[var].isnull(), 1, 0)\n",
948 | " X_test[var].fillna(mode_val, inplace=True)\n",
949 | "\n",
950 | "# we save the dictionary for later\n",
951 | "np.save('mean_var_dict.npy', mean_var_dict)\n",
952 | "\n",
953 | "# check that we have no more missing values in the engineered variables\n",
954 | "X_train[vars_with_na].isnull().sum()"
955 | ]
956 | },
957 | {
958 | "cell_type": "markdown",
959 | "metadata": {},
960 | "source": [
961 | "### Temporal variables\n",
962 | "\n",
963 | "One of our temporal variables was selected to be used in the final model: 'YearRemodAdd'\n",
964 | "\n",
965 | "So we need to deploy the bit of code that creates it."
966 | ]
967 | },
968 | {
969 | "cell_type": "code",
970 | "execution_count": 9,
971 | "metadata": {
972 | "collapsed": true
973 | },
974 | "outputs": [],
975 | "source": [
976 | "# create the temporal var \"elapsed years\"\n",
977 | "def elapsed_years(df, var):\n",
978 | " # capture difference between year variable and year the house was sold\n",
979 | " df[var] = df['YrSold'] - df[var]\n",
980 | " return df"
981 | ]
982 | },
983 | {
984 | "cell_type": "code",
985 | "execution_count": 10,
986 | "metadata": {
987 | "collapsed": true
988 | },
989 | "outputs": [],
990 | "source": [
991 | "X_train = elapsed_years(X_train, 'YearRemodAdd')\n",
992 | "X_test = elapsed_years(X_test, 'YearRemodAdd')"
993 | ]
994 | },
995 | {
996 | "cell_type": "markdown",
997 | "metadata": {},
998 | "source": [
999 | "### Numerical variables\n",
1000 | "\n",
1001 | "We will log transform the numerical variables that do not contain zeros in order to get a more Gaussian-like distribution. This tends to help Linear machine learning models.\n",
1002 | "\n",
1003 | "Originally, we also transformed 'LotArea', but this variable was not selected, so we remove it from the pipeline:"
1004 | ]
1005 | },
1006 | {
1007 | "cell_type": "code",
1008 | "execution_count": 11,
1009 | "metadata": {
1010 | "collapsed": true
1011 | },
1012 | "outputs": [],
1013 | "source": [
1014 | "for var in ['LotFrontage', '1stFlrSF', 'GrLivArea', 'SalePrice']:\n",
1015 | " X_train[var] = np.log(X_train[var])\n",
1016 | " X_test[var]= np.log(X_test[var])"
1017 | ]
1018 | },
1019 | {
1020 | "cell_type": "markdown",
1021 | "metadata": {},
1022 | "source": [
1023 | "### Categorical variables\n",
1024 | "\n",
1025 | "We do have categorical variables in our final model. First, we will remove those categories within variables that are present in less than 1% of the observations:"
1026 | ]
1027 | },
1028 | {
1029 | "cell_type": "code",
1030 | "execution_count": 12,
1031 | "metadata": {},
1032 | "outputs": [
1033 | {
1034 | "data": {
1035 | "text/plain": [
1036 | "['MSZoning',\n",
1037 | " 'Neighborhood',\n",
1038 | " 'RoofStyle',\n",
1039 | " 'MasVnrType',\n",
1040 | " 'BsmtQual',\n",
1041 | " 'BsmtExposure',\n",
1042 | " 'HeatingQC',\n",
1043 | " 'CentralAir',\n",
1044 | " 'KitchenQual',\n",
1045 | " 'FireplaceQu',\n",
1046 | " 'GarageType',\n",
1047 | " 'GarageFinish',\n",
1048 | " 'PavedDrive']"
1049 | ]
1050 | },
1051 | "execution_count": 12,
1052 | "metadata": {},
1053 | "output_type": "execute_result"
1054 | }
1055 | ],
1056 | "source": [
1057 | "# let's capture the categorical variables first\n",
1058 | "cat_vars = [var for var in features if X_train[var].dtype == 'O']\n",
1059 | "cat_vars"
1060 | ]
1061 | },
1062 | {
1063 | "cell_type": "markdown",
1064 | "metadata": {},
1065 | "source": [
1066 | "#### Important: persisting the frequent labels\n",
1067 | "\n",
1068 | "As you will see in future sections, one of the key pieces of deploying the model is \"Model Validation\". Model validation refers to corroborating that the deployed model and the model built during research, are identical. The entire pipeline needs to produce identical results.\n",
1069 | "\n",
1070 | "Therefore, in order to check at the end of the process, that the feature engineering pipelines are identical, we will save -we will persist-, the list of frequent labels per variable, so that we can use it at the end, to corroborate our models."
1071 | ]
1072 | },
1073 | {
1074 | "cell_type": "code",
1075 | "execution_count": 13,
1076 | "metadata": {
1077 | "collapsed": true
1078 | },
1079 | "outputs": [],
1080 | "source": [
1081 | "def find_frequent_labels(df, var, rare_perc):\n",
1082 | " # finds the labels that are shared by more than a certain % of the houses in the dataset\n",
1083 | " df = df.copy()\n",
1084 | " tmp = df.groupby(var)['SalePrice'].count() / len(df)\n",
1085 | " return tmp[tmp>rare_perc].index\n",
1086 | "\n",
1087 | "frequent_labels_dict = {}\n",
1088 | "\n",
1089 | "for var in cat_vars:\n",
1090 | " frequent_ls = find_frequent_labels(X_train, var, 0.01)\n",
1091 | " \n",
1092 | " # we save the list in a dictionary\n",
1093 | " frequent_labels_dict[var] = frequent_ls\n",
1094 | " \n",
1095 | " X_train[var] = np.where(X_train[var].isin(frequent_ls), X_train[var], 'Rare')\n",
1096 | " X_test[var] = np.where(X_test[var].isin(frequent_ls), X_test[var], 'Rare')\n",
1097 | " \n",
1098 | "# now we save the dictionary\n",
1099 | "np.save('FrequentLabels.npy', frequent_labels_dict)"
1100 | ]
1101 | },
1102 | {
1103 | "cell_type": "code",
1104 | "execution_count": 14,
1105 | "metadata": {},
1106 | "outputs": [
1107 | {
1108 | "data": {
1109 | "text/plain": [
1110 | "{'BsmtExposure': Index(['Av', 'Gd', 'Missing', 'Mn', 'No'], dtype='object', name='BsmtExposure'),\n",
1111 | " 'BsmtQual': Index(['Ex', 'Fa', 'Gd', 'Missing', 'TA'], dtype='object', name='BsmtQual'),\n",
1112 | " 'CentralAir': Index(['N', 'Y'], dtype='object', name='CentralAir'),\n",
1113 | " 'FireplaceQu': Index(['Ex', 'Fa', 'Gd', 'Missing', 'Po', 'TA'], dtype='object', name='FireplaceQu'),\n",
1114 | " 'GarageFinish': Index(['Fin', 'Missing', 'RFn', 'Unf'], dtype='object', name='GarageFinish'),\n",
1115 | " 'GarageType': Index(['Attchd', 'Basment', 'BuiltIn', 'Detchd', 'Missing'], dtype='object', name='GarageType'),\n",
1116 | " 'HeatingQC': Index(['Ex', 'Fa', 'Gd', 'TA'], dtype='object', name='HeatingQC'),\n",
1117 | " 'KitchenQual': Index(['Ex', 'Fa', 'Gd', 'TA'], dtype='object', name='KitchenQual'),\n",
1118 | " 'MSZoning': Index(['FV', 'RH', 'RL', 'RM'], dtype='object', name='MSZoning'),\n",
1119 | " 'MasVnrType': Index(['BrkFace', 'None', 'Stone'], dtype='object', name='MasVnrType'),\n",
1120 | " 'Neighborhood': Index(['Blmngtn', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor',\n",
1121 | " 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NWAmes',\n",
1122 | " 'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW',\n",
1123 | " 'Somerst', 'StoneBr', 'Timber'],\n",
1124 | " dtype='object', name='Neighborhood'),\n",
1125 | " 'PavedDrive': Index(['N', 'P', 'Y'], dtype='object', name='PavedDrive'),\n",
1126 | " 'RoofStyle': Index(['Gable', 'Hip'], dtype='object', name='RoofStyle')}"
1127 | ]
1128 | },
1129 | "execution_count": 14,
1130 | "metadata": {},
1131 | "output_type": "execute_result"
1132 | }
1133 | ],
1134 | "source": [
1135 | "frequent_labels_dict"
1136 | ]
1137 | },
1138 | {
1139 | "cell_type": "markdown",
1140 | "metadata": {},
1141 | "source": [
1142 | "Next, we need to transform the strings of these variables into numbers. We will do it so that we capture the monotonic relationship between the label and the target:"
1143 | ]
1144 | },
1145 | {
1146 | "cell_type": "code",
1147 | "execution_count": 15,
1148 | "metadata": {
1149 | "collapsed": true
1150 | },
1151 | "outputs": [],
1152 | "source": [
1153 | "# this function will assign discrete values to the strings of the variables, \n",
1154 | "# so that the smaller value corresponds to the smaller mean of target\n",
1155 | "\n",
1156 | "def replace_categories(train, test, var, target):\n",
1157 | " train = train.copy()\n",
1158 | " test = test.copy()\n",
1159 | " \n",
1160 | " ordered_labels = train.groupby([var])[target].mean().sort_values().index\n",
1161 | " ordinal_label = {k:i for i, k in enumerate(ordered_labels, 0)} \n",
1162 | " \n",
1163 | " train[var] = train[var].map(ordinal_label)\n",
1164 | " test[var] = test[var].map(ordinal_label)\n",
1165 | " \n",
1166 | " return ordinal_label, train, test"
1167 | ]
1168 | },
1169 | {
1170 | "cell_type": "code",
1171 | "execution_count": 16,
1172 | "metadata": {
1173 | "scrolled": true
1174 | },
1175 | "outputs": [],
1176 | "source": [
1177 | "ordinal_label_dict = {}\n",
1178 | "for var in cat_vars:\n",
1179 | " ordinal_label, X_train, X_test = replace_categories(X_train, X_test, var, 'SalePrice')\n",
1180 | " ordinal_label_dict[var] = ordinal_label\n",
1181 | " \n",
1182 | "# now we save the dictionary\n",
1183 | "np.save('OrdinalLabels.npy', ordinal_label_dict)"
1184 | ]
1185 | },
1186 | {
1187 | "cell_type": "code",
1188 | "execution_count": 17,
1189 | "metadata": {
1190 | "scrolled": true
1191 | },
1192 | "outputs": [
1193 | {
1194 | "data": {
1195 | "text/plain": [
1196 | "{'BsmtExposure': {'Av': 3, 'Gd': 4, 'Missing': 0, 'Mn': 2, 'No': 1},\n",
1197 | " 'BsmtQual': {'Ex': 4, 'Fa': 1, 'Gd': 3, 'Missing': 0, 'TA': 2},\n",
1198 | " 'CentralAir': {'N': 0, 'Y': 1},\n",
1199 | " 'FireplaceQu': {'Ex': 5, 'Fa': 2, 'Gd': 4, 'Missing': 1, 'Po': 0, 'TA': 3},\n",
1200 | " 'GarageFinish': {'Fin': 3, 'Missing': 0, 'RFn': 2, 'Unf': 1},\n",
1201 | " 'GarageType': {'Attchd': 4,\n",
1202 | " 'Basment': 3,\n",
1203 | " 'BuiltIn': 5,\n",
1204 | " 'Detchd': 2,\n",
1205 | " 'Missing': 0,\n",
1206 | " 'Rare': 1},\n",
1207 | " 'HeatingQC': {'Ex': 4, 'Fa': 1, 'Gd': 3, 'Rare': 0, 'TA': 2},\n",
1208 | " 'KitchenQual': {'Ex': 3, 'Fa': 0, 'Gd': 2, 'TA': 1},\n",
1209 | " 'MSZoning': {'FV': 4, 'RH': 2, 'RL': 3, 'RM': 1, 'Rare': 0},\n",
1210 | " 'MasVnrType': {'BrkFace': 2, 'None': 0, 'Rare': 1, 'Stone': 3},\n",
1211 | " 'Neighborhood': {'Blmngtn': 14,\n",
1212 | " 'BrDale': 2,\n",
1213 | " 'BrkSide': 4,\n",
1214 | " 'ClearCr': 17,\n",
1215 | " 'CollgCr': 15,\n",
1216 | " 'Crawfor': 16,\n",
1217 | " 'Edwards': 3,\n",
1218 | " 'Gilbert': 13,\n",
1219 | " 'IDOTRR': 0,\n",
1220 | " 'MeadowV': 1,\n",
1221 | " 'Mitchel': 9,\n",
1222 | " 'NAmes': 8,\n",
1223 | " 'NWAmes': 12,\n",
1224 | " 'NoRidge': 22,\n",
1225 | " 'NridgHt': 21,\n",
1226 | " 'OldTown': 5,\n",
1227 | " 'Rare': 11,\n",
1228 | " 'SWISU': 7,\n",
1229 | " 'Sawyer': 6,\n",
1230 | " 'SawyerW': 10,\n",
1231 | " 'Somerst': 18,\n",
1232 | " 'StoneBr': 20,\n",
1233 | " 'Timber': 19},\n",
1234 | " 'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},\n",
1235 | " 'RoofStyle': {'Gable': 0, 'Hip': 2, 'Rare': 1}}"
1236 | ]
1237 | },
1238 | "execution_count": 17,
1239 | "metadata": {},
1240 | "output_type": "execute_result"
1241 | }
1242 | ],
1243 | "source": [
1244 | "ordinal_label_dict"
1245 | ]
1246 | },
1247 | {
1248 | "cell_type": "code",
1249 | "execution_count": 18,
1250 | "metadata": {},
1251 | "outputs": [
1252 | {
1253 | "data": {
1254 | "text/plain": [
1255 | "[]"
1256 | ]
1257 | },
1258 | "execution_count": 18,
1259 | "metadata": {},
1260 | "output_type": "execute_result"
1261 | }
1262 | ],
1263 | "source": [
1264 | "# check absence of na\n",
1265 | "[var for var in features if X_train[var].isnull().sum()>0]"
1266 | ]
1267 | },
1268 | {
1269 | "cell_type": "code",
1270 | "execution_count": 19,
1271 | "metadata": {},
1272 | "outputs": [
1273 | {
1274 | "data": {
1275 | "text/plain": [
1276 | "[]"
1277 | ]
1278 | },
1279 | "execution_count": 19,
1280 | "metadata": {},
1281 | "output_type": "execute_result"
1282 | }
1283 | ],
1284 | "source": [
1285 | "# check absence of na\n",
1286 | "[var for var in features if X_test[var].isnull().sum()>0]"
1287 | ]
1288 | },
1289 | {
1290 | "cell_type": "markdown",
1291 | "metadata": {},
1292 | "source": [
1293 | "### Feature Scaling\n",
1294 | "\n",
1295 | "For use in linear models, features need to be either scaled or normalised. In the next section, I will scale features between the min and max values:"
1296 | ]
1297 | },
1298 | {
1299 | "cell_type": "code",
1300 | "execution_count": 20,
1301 | "metadata": {
1302 | "collapsed": true
1303 | },
1304 | "outputs": [],
1305 | "source": [
1306 | "# capture the target\n",
1307 | "y_train = X_train['SalePrice']\n",
1308 | "y_test = X_test['SalePrice']"
1309 | ]
1310 | },
1311 | {
1312 | "cell_type": "code",
1313 | "execution_count": 21,
1314 | "metadata": {},
1315 | "outputs": [
1316 | {
1317 | "data": {
1318 | "text/plain": [
1319 | "['scaler.pkl']"
1320 | ]
1321 | },
1322 | "execution_count": 21,
1323 | "metadata": {},
1324 | "output_type": "execute_result"
1325 | }
1326 | ],
1327 | "source": [
1328 | "# fit scaler\n",
1329 | "scaler = MinMaxScaler() # create an instance\n",
1330 | "scaler.fit(X_train[features]) # fit the scaler to the train set for later use\n",
1331 | "\n",
1332 | "# we persist the model for future use\n",
1333 | "joblib.dump(scaler, 'scaler.pkl')"
1334 | ]
1335 | },
1336 | {
1337 | "cell_type": "code",
1338 | "execution_count": 22,
1339 | "metadata": {
1340 | "collapsed": true
1341 | },
1342 | "outputs": [],
1343 | "source": [
1344 | "# transform the train and test set, and add on the Id and SalePrice variables\n",
1345 | "X_train = pd.DataFrame(scaler.transform(X_train[features]), columns=features)\n",
1346 | "X_test = pd.DataFrame(scaler.transform(X_test[features]), columns=features)"
1347 | ]
1348 | },
1349 | {
1350 | "cell_type": "code",
1351 | "execution_count": 23,
1352 | "metadata": {},
1353 | "outputs": [
1354 | {
1355 | "data": {
1356 | "text/plain": [
1357 | "['lasso_regression.pkl']"
1358 | ]
1359 | },
1360 | "execution_count": 23,
1361 | "metadata": {},
1362 | "output_type": "execute_result"
1363 | }
1364 | ],
1365 | "source": [
1366 | "# train the model\n",
1367 | "lin_model = Lasso(alpha=0.005, random_state=0) # remember to set the random_state / seed\n",
1368 | "lin_model.fit(X_train, y_train)\n",
1369 | "\n",
1370 | "# we persist the model for future use\n",
1371 | "joblib.dump(lin_model, 'lasso_regression.pkl')"
1372 | ]
1373 | },
1374 | {
1375 | "cell_type": "code",
1376 | "execution_count": 24,
1377 | "metadata": {},
1378 | "outputs": [
1379 | {
1380 | "name": "stdout",
1381 | "output_type": "stream",
1382 | "text": [
1383 | "linear train mse: 1087435415.4414494\n",
1384 | "linear train rmse: 32976.285652593586\n",
1385 | "\n",
1386 | "linear test mse: 1405259552.259598\n",
1387 | "linear test rmse: 37486.791704006864\n",
1388 | "\n",
1389 | "Average house price: 163000.00000000012\n"
1390 | ]
1391 | }
1392 | ],
1393 | "source": [
1394 | "# evaluate the model:\n",
1395 | "# remember that we log transformed the output (SalePrice) in our feature engineering notebook / lecture.\n",
1396 | "\n",
1397 | "# In order to get the true performance of the Lasso\n",
1398 | "# we need to transform both the target and the predictions\n",
1399 | "# back to the original house prices values.\n",
1400 | "\n",
1401 | "# We will evaluate performance using the mean squared error and the\n",
1402 | "# root of the mean squared error\n",
1403 | "\n",
1404 | "pred = lin_model.predict(X_train)\n",
1405 | "print('linear train mse: {}'.format(mean_squared_error(np.exp(y_train), np.exp(pred))))\n",
1406 | "print('linear train rmse: {}'.format(sqrt(mean_squared_error(np.exp(y_train), np.exp(pred)))))\n",
1407 | "print()\n",
1408 | "pred = lin_model.predict(X_test)\n",
1409 | "print('linear test mse: {}'.format(mean_squared_error(np.exp(y_test), np.exp(pred))))\n",
1410 | "print('linear test rmse: {}'.format(sqrt(mean_squared_error(np.exp(y_test), np.exp(pred)))))\n",
1411 | "print()\n",
1412 | "print('Average house price: ', np.exp(y_train).median())"
1413 | ]
1414 | },
1415 | {
1416 | "cell_type": "markdown",
1417 | "metadata": {},
1418 | "source": [
1419 | "That is all for this notebook. And that is all for this section too.\n",
1420 | "\n",
1421 | "**In the next section, we will show you how to productionise this code for model deployment**."
1422 | ]
1423 | },
1424 | {
1425 | "cell_type": "code",
1426 | "execution_count": null,
1427 | "metadata": {
1428 | "collapsed": true
1429 | },
1430 | "outputs": [],
1431 | "source": []
1432 | }
1433 | ],
1434 | "metadata": {
1435 | "kernelspec": {
1436 | "display_name": "Python 3",
1437 | "language": "python",
1438 | "name": "python3"
1439 | },
1440 | "language_info": {
1441 | "codemirror_mode": {
1442 | "name": "ipython",
1443 | "version": 3
1444 | },
1445 | "file_extension": ".py",
1446 | "mimetype": "text/x-python",
1447 | "name": "python",
1448 | "nbconvert_exporter": "python",
1449 | "pygments_lexer": "ipython3",
1450 | "version": "3.6.1"
1451 | },
1452 | "toc": {
1453 | "nav_menu": {},
1454 | "number_sections": true,
1455 | "sideBar": true,
1456 | "skip_h1_title": false,
1457 | "toc_cell": false,
1458 | "toc_position": {
1459 | "height": "583px",
1460 | "left": "0px",
1461 | "right": "1324px",
1462 | "top": "107px",
1463 | "width": "212px"
1464 | },
1465 | "toc_section_display": "block",
1466 | "toc_window_display": true
1467 | }
1468 | },
1469 | "nbformat": 4,
1470 | "nbformat_minor": 2
1471 | }
1472 |
--------------------------------------------------------------------------------
/jupyter_notebooks/Section2_MLPipelineOverview/02.8_ML_Pipeline_Step3-FeatureSelection.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Machine Learning Model Building Pipeline: Feature Selection\n",
8 | "\n",
9 | "In the following videos, we will take you through a practical example of each one of the steps in the Machine Learning model building pipeline that we described in the previous lectures. There will be a notebook for each one of the Machine Learning Pipeline steps:\n",
10 | "\n",
11 | "1. Data Analysis\n",
12 | "2. Feature Engineering\n",
13 | "3. Feature Selection\n",
14 | "4. Model Building\n",
15 | "\n",
16 | "**This is the notebook for step 3: Feature Selection**\n",
17 | "\n",
18 | "\n",
19 | "We will use the house price dataset available on [Kaggle.com](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data). See below for more details.\n",
20 | "\n",
21 | "===================================================================================================\n",
22 | "\n",
23 | "## Predicting Sale Price of Houses\n",
24 | "\n",
25 | "The aim of the project is to build a machine learning model to predict the sale price of homes based on different explanatory variables describing aspects of residential houses. \n",
26 | "\n",
27 | "### Why is this important? \n",
28 | "\n",
29 | "Predicting house prices is useful to identify fruitful investments, or to determine whether the price advertised for a house is over or underestimated, before making a buying judgment.\n",
30 | "\n",
31 | "### What is the objective of the machine learning model?\n",
32 | "\n",
33 | "We aim to minimise the difference between the real price, and the estimated price by our model. We will evaluate model performance using the mean squared error (mse) and the root squared of the mean squared error (rmse).\n",
34 | "\n",
35 | "### How do I download the dataset?\n",
36 | "\n",
37 | "To download the House Price dataset go this website:\n",
38 | "https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data\n",
39 | "\n",
40 | "Scroll down to the bottom of the page, and click on the link 'train.csv', and then click the 'download' blue button towards the right of the screen, to download the dataset. Rename the file as 'houseprice.csv' and save it to a directory of your choice.\n",
41 | "\n",
42 | "**Note the following:**\n",
43 | "- You need to be logged in to Kaggle in order to download the datasets.\n",
44 | "- You need to accept the terms and conditions of the competition to download the dataset\n",
45 | "- If you save the file to the same directory where you saved this jupyter notebook, then you can run the code as it is written here.\n",
46 | "\n",
47 | "===================================================================================================="
48 | ]
49 | },
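 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": null,
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "# Optional sketch, not part of the original lecture: the same dataset can\n",
 57 |     "# also be fetched from the command line with the Kaggle CLI, assuming it\n",
 58 |     "# is installed (pip install kaggle) and an API token is configured:\n",
 59 |     "\n",
 60 |     "!kaggle competitions download -c house-prices-advanced-regression-techniques"
 61 |    ]
 62 |   },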
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "## House Prices dataset: Feature Selection\n",
55 | "\n",
56 | "In the following cells, we will select a group of variables, the most predictive ones, to build our machine learning models. \n",
57 | "\n",
58 | "### Why do we need to select variables?\n",
59 | "\n",
60 | "1. For production: Fewer variables mean smaller client input requirements (e.g. customers filling out a form on a website or mobile app), and hence less code for error handling. This reduces the chances of bugs.\n",
61 | "2. For model performance: Fewer variables mean simpler, more interpretable, less over-fitted models\n",
62 | "\n",
63 | "\n",
64 | "**We will select variables using the Lasso regression: Lasso has the property of setting the coefficient of non-informative variables to zero. This way we can identify those variables and remove them from our final models.**\n",
65 | "\n",
66 | "### Setting the seed\n",
67 | "\n",
68 | "It is important to note, that we are engineering variables and pre-processing data with the idea of deploying the model if we find business value in it. Therefore, from now on, for each step that includes some element of randomness, it is extremely important that we **set the seed**. This way, we can obtain reproducibility between our research and our development code.\n",
69 | "\n",
70 | "This is perhaps one of the most important lessons that you need to take away from this course: **Always set the seeds**.\n",
71 | "\n",
72 | "Let's go ahead and load the dataset."
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 1,
78 | "metadata": {
79 | "collapsed": true
80 | },
81 | "outputs": [],
82 | "source": [
83 | "# to handle datasets\n",
84 | "import pandas as pd\n",
85 | "import numpy as np\n",
86 | "\n",
87 | "# for plotting\n",
88 | "import matplotlib.pyplot as plt\n",
89 | "%matplotlib inline\n",
90 | "\n",
91 | "# to build the models\n",
92 | "from sklearn.linear_model import Lasso\n",
93 | "from sklearn.feature_selection import SelectFromModel\n",
94 | "\n",
95 | "# to visualise al the columns in the dataframe\n",
96 | "pd.pandas.set_option('display.max_columns', None)"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 2,
102 | "metadata": {},
103 | "outputs": [
104 | {
105 | "data": {
106 | "text/html": [
107 | "\n",
108 | "\n",
121 | "
\n",
122 | " \n",
123 | " \n",
124 | " | \n",
125 | " Id | \n",
126 | " SalePrice | \n",
127 | " MSSubClass | \n",
128 | " MSZoning | \n",
129 | " LotFrontage | \n",
130 | " LotArea | \n",
131 | " Street | \n",
132 | " Alley | \n",
133 | " LotShape | \n",
134 | " LandContour | \n",
135 | " Utilities | \n",
136 | " LotConfig | \n",
137 | " LandSlope | \n",
138 | " Neighborhood | \n",
139 | " Condition1 | \n",
140 | " Condition2 | \n",
141 | " BldgType | \n",
142 | " HouseStyle | \n",
143 | " OverallQual | \n",
144 | " OverallCond | \n",
145 | " YearBuilt | \n",
146 | " YearRemodAdd | \n",
147 | " RoofStyle | \n",
148 | " RoofMatl | \n",
149 | " Exterior1st | \n",
150 | " Exterior2nd | \n",
151 | " MasVnrType | \n",
152 | " MasVnrArea | \n",
153 | " ExterQual | \n",
154 | " ExterCond | \n",
155 | " Foundation | \n",
156 | " BsmtQual | \n",
157 | " BsmtCond | \n",
158 | " BsmtExposure | \n",
159 | " BsmtFinType1 | \n",
160 | " BsmtFinSF1 | \n",
161 | " BsmtFinType2 | \n",
162 | " BsmtFinSF2 | \n",
163 | " BsmtUnfSF | \n",
164 | " TotalBsmtSF | \n",
165 | " Heating | \n",
166 | " HeatingQC | \n",
167 | " CentralAir | \n",
168 | " Electrical | \n",
169 | " 1stFlrSF | \n",
170 | " 2ndFlrSF | \n",
171 | " LowQualFinSF | \n",
172 | " GrLivArea | \n",
173 | " BsmtFullBath | \n",
174 | " BsmtHalfBath | \n",
175 | " FullBath | \n",
176 | " HalfBath | \n",
177 | " BedroomAbvGr | \n",
178 | " KitchenAbvGr | \n",
179 | " KitchenQual | \n",
180 | " TotRmsAbvGrd | \n",
181 | " Functional | \n",
182 | " Fireplaces | \n",
183 | " FireplaceQu | \n",
184 | " GarageType | \n",
185 | " GarageYrBlt | \n",
186 | " GarageFinish | \n",
187 | " GarageCars | \n",
188 | " GarageArea | \n",
189 | " GarageQual | \n",
190 | " GarageCond | \n",
191 | " PavedDrive | \n",
192 | " WoodDeckSF | \n",
193 | " OpenPorchSF | \n",
194 | " EnclosedPorch | \n",
195 | " 3SsnPorch | \n",
196 | " ScreenPorch | \n",
197 | " PoolArea | \n",
198 | " PoolQC | \n",
199 | " Fence | \n",
200 | " MiscFeature | \n",
201 | " MiscVal | \n",
202 | " MoSold | \n",
203 | " YrSold | \n",
204 | " SaleType | \n",
205 | " SaleCondition | \n",
206 | " LotFrontage_na | \n",
207 | " MasVnrArea_na | \n",
208 | " GarageYrBlt_na | \n",
209 | "
\n",
210 | " \n",
211 | " \n",
212 | " \n",
213 | " 0 | \n",
214 | " 931 | \n",
215 | " 12.211060 | \n",
216 | " 0.000000 | \n",
217 | " 0.75 | \n",
218 | " 0.461171 | \n",
219 | " 0.377048 | \n",
220 | " 1.0 | \n",
221 | " 1.0 | \n",
222 | " 0.333333 | \n",
223 | " 1.000000 | \n",
224 | " 1.0 | \n",
225 | " 0.0 | \n",
226 | " 0.0 | \n",
227 | " 0.863636 | \n",
228 | " 0.4 | \n",
229 | " 1.0 | \n",
230 | " 0.75 | \n",
231 | " 0.6 | \n",
232 | " 0.777778 | \n",
233 | " 0.50 | \n",
234 | " 0.014706 | \n",
235 | " 0.049180 | \n",
236 | " 0.0 | \n",
237 | " 0.0 | \n",
238 | " 1.0 | \n",
239 | " 1.0 | \n",
240 | " 0.000000 | \n",
241 | " 0.00000 | \n",
242 | " 0.666667 | \n",
243 | " 1.0 | \n",
244 | " 1.0 | \n",
245 | " 0.75 | \n",
246 | " 0.75 | \n",
247 | " 0.75 | \n",
248 | " 1.000000 | \n",
249 | " 0.002835 | \n",
250 | " 0.666667 | \n",
251 | " 0.0 | \n",
252 | " 0.673479 | \n",
253 | " 0.239935 | \n",
254 | " 1.0 | \n",
255 | " 1.00 | \n",
256 | " 1.0 | \n",
257 | " 1.0 | \n",
258 | " 0.559760 | \n",
259 | " 0.0 | \n",
260 | " 0.0 | \n",
261 | " 0.523250 | \n",
262 | " 0.000000 | \n",
263 | " 0.0 | \n",
264 | " 0.666667 | \n",
265 | " 0.0 | \n",
266 | " 0.375 | \n",
267 | " 0.333333 | \n",
268 | " 0.666667 | \n",
269 | " 0.416667 | \n",
270 | " 1.0 | \n",
271 | " 0.000000 | \n",
272 | " 0.2 | \n",
273 | " 0.8 | \n",
274 | " 0.018692 | \n",
275 | " 1.000000 | \n",
276 | " 0.75 | \n",
277 | " 0.430183 | \n",
278 | " 0.666667 | \n",
279 | " 1.0 | \n",
280 | " 1.0 | \n",
281 | " 0.116686 | \n",
282 | " 0.032907 | \n",
283 | " 0.0 | \n",
284 | " 0.000000 | \n",
285 | " 0.0 | \n",
286 | " 0.0 | \n",
287 | " 0.0 | \n",
288 | " 0.75 | \n",
289 | " 1.0 | \n",
290 | " 0.0 | \n",
291 | " 0.545455 | \n",
292 | " 0.75 | \n",
293 | " 0.666667 | \n",
294 | " 0.75 | \n",
295 | " 0.0 | \n",
296 | " 0.0 | \n",
297 | " 0.0 | \n",
298 | "
\n",
299 | " \n",
300 | " 1 | \n",
301 | " 657 | \n",
302 | " 11.887931 | \n",
303 | " 0.000000 | \n",
304 | " 0.75 | \n",
305 | " 0.456066 | \n",
306 | " 0.399443 | \n",
307 | " 1.0 | \n",
308 | " 1.0 | \n",
309 | " 0.333333 | \n",
310 | " 0.333333 | \n",
311 | " 1.0 | \n",
312 | " 0.0 | \n",
313 | " 0.0 | \n",
314 | " 0.363636 | \n",
315 | " 0.4 | \n",
316 | " 1.0 | \n",
317 | " 0.75 | \n",
318 | " 0.6 | \n",
319 | " 0.444444 | \n",
320 | " 0.75 | \n",
321 | " 0.360294 | \n",
322 | " 0.049180 | \n",
323 | " 0.0 | \n",
324 | " 0.0 | \n",
325 | " 0.6 | \n",
326 | " 0.6 | \n",
327 | " 0.666667 | \n",
328 | " 0.03375 | \n",
329 | " 0.666667 | \n",
330 | " 1.0 | \n",
331 | " 0.5 | \n",
332 | " 0.50 | \n",
333 | " 0.75 | \n",
334 | " 0.25 | \n",
335 | " 0.666667 | \n",
336 | " 0.142807 | \n",
337 | " 0.666667 | \n",
338 | " 0.0 | \n",
339 | " 0.114724 | \n",
340 | " 0.172340 | \n",
341 | " 1.0 | \n",
342 | " 1.00 | \n",
343 | " 1.0 | \n",
344 | " 1.0 | \n",
345 | " 0.434539 | \n",
346 | " 0.0 | \n",
347 | " 0.0 | \n",
348 | " 0.406196 | \n",
349 | " 0.333333 | \n",
350 | " 0.0 | \n",
351 | " 0.333333 | \n",
352 | " 0.5 | \n",
353 | " 0.375 | \n",
354 | " 0.333333 | \n",
355 | " 0.666667 | \n",
356 | " 0.250000 | \n",
357 | " 1.0 | \n",
358 | " 0.000000 | \n",
359 | " 0.2 | \n",
360 | " 0.8 | \n",
361 | " 0.457944 | \n",
362 | " 0.666667 | \n",
363 | " 0.25 | \n",
364 | " 0.220028 | \n",
365 | " 0.666667 | \n",
366 | " 1.0 | \n",
367 | " 1.0 | \n",
368 | " 0.000000 | \n",
369 | " 0.000000 | \n",
370 | " 0.0 | \n",
371 | " 0.000000 | \n",
372 | " 0.0 | \n",
373 | " 0.0 | \n",
374 | " 0.0 | \n",
375 | " 0.50 | \n",
376 | " 1.0 | \n",
377 | " 0.0 | \n",
378 | " 0.636364 | \n",
379 | " 0.50 | \n",
380 | " 0.666667 | \n",
381 | " 0.75 | \n",
382 | " 0.0 | \n",
383 | " 0.0 | \n",
384 | " 0.0 | \n",
385 | "
\n",
386 | " \n",
387 | " 2 | \n",
388 | " 46 | \n",
389 | " 12.675764 | \n",
390 | " 0.588235 | \n",
391 | " 0.75 | \n",
392 | " 0.394699 | \n",
393 | " 0.347082 | \n",
394 | " 1.0 | \n",
395 | " 1.0 | \n",
396 | " 0.000000 | \n",
397 | " 0.333333 | \n",
398 | " 1.0 | \n",
399 | " 0.0 | \n",
400 | " 0.0 | \n",
401 | " 0.954545 | \n",
402 | " 0.4 | \n",
403 | " 1.0 | \n",
404 | " 1.00 | \n",
405 | " 0.6 | \n",
406 | " 0.888889 | \n",
407 | " 0.50 | \n",
408 | " 0.036765 | \n",
409 | " 0.098361 | \n",
410 | " 1.0 | \n",
411 | " 0.0 | \n",
412 | " 0.3 | \n",
413 | " 0.2 | \n",
414 | " 0.666667 | \n",
415 | " 0.25750 | \n",
416 | " 1.000000 | \n",
417 | " 1.0 | \n",
418 | " 1.0 | \n",
419 | " 1.00 | \n",
420 | " 0.75 | \n",
421 | " 0.25 | \n",
422 | " 1.000000 | \n",
423 | " 0.080794 | \n",
424 | " 0.666667 | \n",
425 | " 0.0 | \n",
426 | " 0.601951 | \n",
427 | " 0.286743 | \n",
428 | " 1.0 | \n",
429 | " 1.00 | \n",
430 | " 1.0 | \n",
431 | " 1.0 | \n",
432 | " 0.627205 | \n",
433 | " 0.0 | \n",
434 | " 0.0 | \n",
435 | " 0.586296 | \n",
436 | " 0.333333 | \n",
437 | " 0.0 | \n",
438 | " 0.666667 | \n",
439 | " 0.0 | \n",
440 | " 0.250 | \n",
441 | " 0.333333 | \n",
442 | " 1.000000 | \n",
443 | " 0.333333 | \n",
444 | " 1.0 | \n",
445 | " 0.333333 | \n",
446 | " 0.8 | \n",
447 | " 0.8 | \n",
448 | " 0.046729 | \n",
449 | " 0.666667 | \n",
450 | " 0.50 | \n",
451 | " 0.406206 | \n",
452 | " 0.666667 | \n",
453 | " 1.0 | \n",
454 | " 1.0 | \n",
455 | " 0.228705 | \n",
456 | " 0.149909 | \n",
457 | " 0.0 | \n",
458 | " 0.000000 | \n",
459 | " 0.0 | \n",
460 | " 0.0 | \n",
461 | " 0.0 | \n",
462 | " 0.75 | \n",
463 | " 1.0 | \n",
464 | " 0.0 | \n",
465 | " 0.090909 | \n",
466 | " 1.00 | \n",
467 | " 0.666667 | \n",
468 | " 0.75 | \n",
469 | " 0.0 | \n",
470 | " 0.0 | \n",
471 | " 0.0 | \n",
472 | "
\n",
473 | " \n",
474 | " 3 | \n",
475 | " 1349 | \n",
476 | " 12.278393 | \n",
477 | " 0.000000 | \n",
478 | " 0.75 | \n",
479 | " 0.388581 | \n",
480 | " 0.493677 | \n",
481 | " 1.0 | \n",
482 | " 1.0 | \n",
483 | " 0.666667 | \n",
484 | " 0.666667 | \n",
485 | " 1.0 | \n",
486 | " 0.0 | \n",
487 | " 0.0 | \n",
488 | " 0.454545 | \n",
489 | " 0.4 | \n",
490 | " 1.0 | \n",
491 | " 0.75 | \n",
492 | " 0.6 | \n",
493 | " 0.666667 | \n",
494 | " 0.50 | \n",
495 | " 0.066176 | \n",
496 | " 0.163934 | \n",
497 | " 0.0 | \n",
498 | " 0.0 | \n",
499 | " 1.0 | \n",
500 | " 1.0 | \n",
501 | " 0.000000 | \n",
502 | " 0.00000 | \n",
503 | " 0.666667 | \n",
504 | " 1.0 | \n",
505 | " 1.0 | \n",
506 | " 0.75 | \n",
507 | " 0.75 | \n",
508 | " 1.00 | \n",
509 | " 1.000000 | \n",
510 | " 0.255670 | \n",
511 | " 0.666667 | \n",
512 | " 0.0 | \n",
513 | " 0.018114 | \n",
514 | " 0.242553 | \n",
515 | " 1.0 | \n",
516 | " 1.00 | \n",
517 | " 1.0 | \n",
518 | " 1.0 | \n",
519 | " 0.566920 | \n",
520 | " 0.0 | \n",
521 | " 0.0 | \n",
522 | " 0.529943 | \n",
523 | " 0.333333 | \n",
524 | " 0.0 | \n",
525 | " 0.666667 | \n",
526 | " 0.0 | \n",
527 | " 0.375 | \n",
528 | " 0.333333 | \n",
529 | " 0.666667 | \n",
530 | " 0.250000 | \n",
531 | " 1.0 | \n",
532 | " 0.333333 | \n",
533 | " 0.4 | \n",
534 | " 0.8 | \n",
535 | " 0.084112 | \n",
536 | " 0.666667 | \n",
537 | " 0.50 | \n",
538 | " 0.362482 | \n",
539 | " 0.666667 | \n",
540 | " 1.0 | \n",
541 | " 1.0 | \n",
542 | " 0.469078 | \n",
543 | " 0.045704 | \n",
544 | " 0.0 | \n",
545 | " 0.000000 | \n",
546 | " 0.0 | \n",
547 | " 0.0 | \n",
548 | " 0.0 | \n",
549 | " 0.75 | \n",
550 | " 1.0 | \n",
551 | " 0.0 | \n",
552 | " 0.636364 | \n",
553 | " 0.25 | \n",
554 | " 0.666667 | \n",
555 | " 0.75 | \n",
556 | " 1.0 | \n",
557 | " 0.0 | \n",
558 | " 0.0 | \n",
559 | "
\n",
560 | " \n",
561 | " 4 | \n",
562 | " 56 | \n",
563 | " 12.103486 | \n",
564 | " 0.000000 | \n",
565 | " 0.75 | \n",
566 | " 0.577658 | \n",
567 | " 0.402702 | \n",
568 | " 1.0 | \n",
569 | " 1.0 | \n",
570 | " 0.333333 | \n",
571 | " 0.333333 | \n",
572 | " 1.0 | \n",
573 | " 0.0 | \n",
574 | " 0.0 | \n",
575 | " 0.363636 | \n",
576 | " 0.4 | \n",
577 | " 1.0 | \n",
578 | " 0.75 | \n",
579 | " 0.6 | \n",
580 | " 0.555556 | \n",
581 | " 0.50 | \n",
582 | " 0.323529 | \n",
583 | " 0.737705 | \n",
584 | " 0.0 | \n",
585 | " 0.0 | \n",
586 | " 0.6 | \n",
587 | " 0.7 | \n",
588 | " 0.666667 | \n",
589 | " 0.17000 | \n",
590 | " 0.333333 | \n",
591 | " 1.0 | \n",
592 | " 0.5 | \n",
593 | " 0.50 | \n",
594 | " 0.75 | \n",
595 | " 0.25 | \n",
596 | " 0.333333 | \n",
597 | " 0.086818 | \n",
598 | " 0.666667 | \n",
599 | " 0.0 | \n",
600 | " 0.434278 | \n",
601 | " 0.233224 | \n",
602 | " 1.0 | \n",
603 | " 0.75 | \n",
604 | " 1.0 | \n",
605 | " 1.0 | \n",
606 | " 0.549026 | \n",
607 | " 0.0 | \n",
608 | " 0.0 | \n",
609 | " 0.513216 | \n",
610 | " 0.000000 | \n",
611 | " 0.0 | \n",
612 | " 0.666667 | \n",
613 | " 0.0 | \n",
614 | " 0.375 | \n",
615 | " 0.333333 | \n",
616 | " 0.333333 | \n",
617 | " 0.416667 | \n",
618 | " 1.0 | \n",
619 | " 0.333333 | \n",
620 | " 0.8 | \n",
621 | " 0.8 | \n",
622 | " 0.411215 | \n",
623 | " 0.666667 | \n",
624 | " 0.50 | \n",
625 | " 0.406206 | \n",
626 | " 0.666667 | \n",
627 | " 1.0 | \n",
628 | " 1.0 | \n",
629 | " 0.000000 | \n",
630 | " 0.000000 | \n",
631 | " 0.0 | \n",
632 | " 0.801181 | \n",
633 | " 0.0 | \n",
634 | " 0.0 | \n",
635 | " 0.0 | \n",
636 | " 0.75 | \n",
637 | " 1.0 | \n",
638 | " 0.0 | \n",
639 | " 0.545455 | \n",
640 | " 0.50 | \n",
641 | " 0.666667 | \n",
642 | " 0.75 | \n",
643 | " 0.0 | \n",
644 | " 0.0 | \n",
645 | " 0.0 | \n",
646 | "
\n",
647 | " \n",
648 | "
\n",
649 | "
"
650 | ],
651 | "text/plain": [
652 | " Id SalePrice MSSubClass MSZoning LotFrontage LotArea Street \\\n",
653 | "0 931 12.211060 0.000000 0.75 0.461171 0.377048 1.0 \n",
654 | "1 657 11.887931 0.000000 0.75 0.456066 0.399443 1.0 \n",
655 | "2 46 12.675764 0.588235 0.75 0.394699 0.347082 1.0 \n",
656 | "3 1349 12.278393 0.000000 0.75 0.388581 0.493677 1.0 \n",
657 | "4 56 12.103486 0.000000 0.75 0.577658 0.402702 1.0 \n",
658 | "\n",
659 | " Alley LotShape LandContour Utilities LotConfig LandSlope \\\n",
660 | "0 1.0 0.333333 1.000000 1.0 0.0 0.0 \n",
661 | "1 1.0 0.333333 0.333333 1.0 0.0 0.0 \n",
662 | "2 1.0 0.000000 0.333333 1.0 0.0 0.0 \n",
663 | "3 1.0 0.666667 0.666667 1.0 0.0 0.0 \n",
664 | "4 1.0 0.333333 0.333333 1.0 0.0 0.0 \n",
665 | "\n",
666 | " Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual \\\n",
667 | "0 0.863636 0.4 1.0 0.75 0.6 0.777778 \n",
668 | "1 0.363636 0.4 1.0 0.75 0.6 0.444444 \n",
669 | "2 0.954545 0.4 1.0 1.00 0.6 0.888889 \n",
670 | "3 0.454545 0.4 1.0 0.75 0.6 0.666667 \n",
671 | "4 0.363636 0.4 1.0 0.75 0.6 0.555556 \n",
672 | "\n",
673 | " OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st \\\n",
674 | "0 0.50 0.014706 0.049180 0.0 0.0 1.0 \n",
675 | "1 0.75 0.360294 0.049180 0.0 0.0 0.6 \n",
676 | "2 0.50 0.036765 0.098361 1.0 0.0 0.3 \n",
677 | "3 0.50 0.066176 0.163934 0.0 0.0 1.0 \n",
678 | "4 0.50 0.323529 0.737705 0.0 0.0 0.6 \n",
679 | "\n",
680 | " Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation \\\n",
681 | "0 1.0 0.000000 0.00000 0.666667 1.0 1.0 \n",
682 | "1 0.6 0.666667 0.03375 0.666667 1.0 0.5 \n",
683 | "2 0.2 0.666667 0.25750 1.000000 1.0 1.0 \n",
684 | "3 1.0 0.000000 0.00000 0.666667 1.0 1.0 \n",
685 | "4 0.7 0.666667 0.17000 0.333333 1.0 0.5 \n",
686 | "\n",
687 | " BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 \\\n",
688 | "0 0.75 0.75 0.75 1.000000 0.002835 0.666667 \n",
689 | "1 0.50 0.75 0.25 0.666667 0.142807 0.666667 \n",
690 | "2 1.00 0.75 0.25 1.000000 0.080794 0.666667 \n",
691 | "3 0.75 0.75 1.00 1.000000 0.255670 0.666667 \n",
692 | "4 0.50 0.75 0.25 0.333333 0.086818 0.666667 \n",
693 | "\n",
694 | " BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir \\\n",
695 | "0 0.0 0.673479 0.239935 1.0 1.00 1.0 \n",
696 | "1 0.0 0.114724 0.172340 1.0 1.00 1.0 \n",
697 | "2 0.0 0.601951 0.286743 1.0 1.00 1.0 \n",
698 | "3 0.0 0.018114 0.242553 1.0 1.00 1.0 \n",
699 | "4 0.0 0.434278 0.233224 1.0 0.75 1.0 \n",
700 | "\n",
701 | " Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath \\\n",
702 | "0 1.0 0.559760 0.0 0.0 0.523250 0.000000 \n",
703 | "1 1.0 0.434539 0.0 0.0 0.406196 0.333333 \n",
704 | "2 1.0 0.627205 0.0 0.0 0.586296 0.333333 \n",
705 | "3 1.0 0.566920 0.0 0.0 0.529943 0.333333 \n",
706 | "4 1.0 0.549026 0.0 0.0 0.513216 0.000000 \n",
707 | "\n",
708 | " BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual \\\n",
709 | "0 0.0 0.666667 0.0 0.375 0.333333 0.666667 \n",
710 | "1 0.0 0.333333 0.5 0.375 0.333333 0.666667 \n",
711 | "2 0.0 0.666667 0.0 0.250 0.333333 1.000000 \n",
712 | "3 0.0 0.666667 0.0 0.375 0.333333 0.666667 \n",
713 | "4 0.0 0.666667 0.0 0.375 0.333333 0.333333 \n",
714 | "\n",
715 | " TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt \\\n",
716 | "0 0.416667 1.0 0.000000 0.2 0.8 0.018692 \n",
717 | "1 0.250000 1.0 0.000000 0.2 0.8 0.457944 \n",
718 | "2 0.333333 1.0 0.333333 0.8 0.8 0.046729 \n",
719 | "3 0.250000 1.0 0.333333 0.4 0.8 0.084112 \n",
720 | "4 0.416667 1.0 0.333333 0.8 0.8 0.411215 \n",
721 | "\n",
722 | " GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive \\\n",
723 | "0 1.000000 0.75 0.430183 0.666667 1.0 1.0 \n",
724 | "1 0.666667 0.25 0.220028 0.666667 1.0 1.0 \n",
725 | "2 0.666667 0.50 0.406206 0.666667 1.0 1.0 \n",
726 | "3 0.666667 0.50 0.362482 0.666667 1.0 1.0 \n",
727 | "4 0.666667 0.50 0.406206 0.666667 1.0 1.0 \n",
728 | "\n",
729 | " WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea \\\n",
730 | "0 0.116686 0.032907 0.0 0.000000 0.0 0.0 \n",
731 | "1 0.000000 0.000000 0.0 0.000000 0.0 0.0 \n",
732 | "2 0.228705 0.149909 0.0 0.000000 0.0 0.0 \n",
733 | "3 0.469078 0.045704 0.0 0.000000 0.0 0.0 \n",
734 | "4 0.000000 0.000000 0.0 0.801181 0.0 0.0 \n",
735 | "\n",
736 | " PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType \\\n",
737 | "0 0.0 0.75 1.0 0.0 0.545455 0.75 0.666667 \n",
738 | "1 0.0 0.50 1.0 0.0 0.636364 0.50 0.666667 \n",
739 | "2 0.0 0.75 1.0 0.0 0.090909 1.00 0.666667 \n",
740 | "3 0.0 0.75 1.0 0.0 0.636364 0.25 0.666667 \n",
741 | "4 0.0 0.75 1.0 0.0 0.545455 0.50 0.666667 \n",
742 | "\n",
743 | " SaleCondition LotFrontage_na MasVnrArea_na GarageYrBlt_na \n",
744 | "0 0.75 0.0 0.0 0.0 \n",
745 | "1 0.75 0.0 0.0 0.0 \n",
746 | "2 0.75 0.0 0.0 0.0 \n",
747 | "3 0.75 1.0 0.0 0.0 \n",
748 | "4 0.75 0.0 0.0 0.0 "
749 | ]
750 | },
751 | "execution_count": 2,
752 | "metadata": {},
753 | "output_type": "execute_result"
754 | }
755 | ],
756 | "source": [
757 | "# load dataset\n",
758 | "# We load the datasets with the engineered values: we built and saved these datasets in the previous lecture.\n",
759 | "# If you haven't done so, go ahead and check the previous lecture / notebook to find out how to create these datasets\n",
760 | "\n",
761 | "X_train = pd.read_csv('xtrain.csv')\n",
762 | "X_test = pd.read_csv('xtest.csv')\n",
763 | "\n",
764 | "X_train.head()"
765 | ]
766 | },
767 | {
768 | "cell_type": "code",
769 | "execution_count": 3,
770 | "metadata": {
771 | "collapsed": true
772 | },
773 | "outputs": [],
774 | "source": [
775 | "# capture the target\n",
776 | "y_train = X_train['SalePrice']\n",
777 | "y_test = X_test['SalePrice']\n",
778 | "\n",
779 | "# drop unnecessary variables from our training and testing sets\n",
780 | "X_train.drop(['Id', 'SalePrice'], axis=1, inplace=True)\n",
781 | "X_test.drop(['Id', 'SalePrice'], axis=1, inplace=True)"
782 | ]
783 | },
784 | {
785 | "cell_type": "markdown",
786 | "metadata": {},
787 | "source": [
788 | "### Feature Selection\n",
789 | "\n",
790 | "Let's go ahead and select a subset of the most predictive features. There is an element of randomness in the Lasso regression, so remember to set the seed."
791 | ]
792 | },
793 | {
794 | "cell_type": "code",
795 | "execution_count": 4,
796 | "metadata": {
797 | "scrolled": true
798 | },
799 | "outputs": [
800 | {
801 | "data": {
802 | "text/plain": [
803 | "SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True, max_iter=1000,\n",
804 | " normalize=False, positive=False, precompute=False, random_state=0,\n",
805 | " selection='cyclic', tol=0.0001, warm_start=False),\n",
806 | " prefit=False, threshold=None)"
807 | ]
808 | },
809 | "execution_count": 4,
810 | "metadata": {},
811 | "output_type": "execute_result"
812 | }
813 | ],
814 | "source": [
815 | "# here I will do the model fitting and feature selection\n",
816 | "# altogether in one line of code\n",
817 | "\n",
818 | "# first, I specify the Lasso Regression model, and I\n",
819 | "# select a suitable alpha (equivalent of penalty).\n",
820 | "# The bigger the alpha the less features that will be selected.\n",
821 | "\n",
822 | "# Then I use the selectFromModel object from sklearn, which\n",
823 | "# will select the features which coefficients are non-zero\n",
824 | "\n",
825 | "sel_ = SelectFromModel(Lasso(alpha=0.005, random_state=0)) # remember to set the seed, the random state in this function\n",
826 | "sel_.fit(X_train, y_train)"
827 | ]
828 | },
829 | {
830 | "cell_type": "code",
831 | "execution_count": 5,
832 | "metadata": {},
833 | "outputs": [
834 | {
835 | "data": {
836 | "text/plain": [
837 | "array([ True, True, False, False, False, False, False, False, False,\n",
838 | " False, False, True, False, False, False, False, True, True,\n",
839 | " False, True, True, False, False, False, True, False, False,\n",
840 | " False, False, True, False, True, False, False, False, False,\n",
841 | " False, False, False, True, True, False, True, False, False,\n",
842 | " True, True, False, False, False, False, False, True, False,\n",
843 | " False, True, True, True, False, True, True, False, False,\n",
844 | " False, True, False, False, False, False, False, False, False,\n",
845 | " False, False, False, False, False, False, False, False, False, False], dtype=bool)"
846 | ]
847 | },
848 | "execution_count": 5,
849 | "metadata": {},
850 | "output_type": "execute_result"
851 | }
852 | ],
853 | "source": [
854 | "# this command let's us visualise those features that were kept.\n",
855 | "# Kept features have a True indicator\n",
856 | "sel_.get_support()"
857 | ]
858 | },
859 | {
860 | "cell_type": "code",
861 | "execution_count": 6,
862 | "metadata": {},
863 | "outputs": [
864 | {
865 | "name": "stdout",
866 | "output_type": "stream",
867 | "text": [
868 | "total features: 82\n",
869 | "selected features: 22\n",
870 | "features with coefficients shrank to zero: 60\n"
871 | ]
872 | }
873 | ],
874 | "source": [
875 | "# let's print the number of total and selected features\n",
876 | "\n",
877 | "# this is how we can make a list of the selected features\n",
878 | "selected_feat = X_train.columns[(sel_.get_support())]\n",
879 | "\n",
880 | "# let's print some stats\n",
881 | "print('total features: {}'.format((X_train.shape[1])))\n",
882 | "print('selected features: {}'.format(len(selected_feat)))\n",
883 | "print('features with coefficients shrank to zero: {}'.format(\n",
884 | " np.sum(sel_.estimator_.coef_ == 0)))"
885 | ]
886 | },
887 | {
888 | "cell_type": "code",
889 | "execution_count": 7,
890 | "metadata": {},
891 | "outputs": [
892 | {
893 | "data": {
894 | "text/plain": [
895 | "Index(['MSSubClass', 'MSZoning', 'Neighborhood', 'OverallQual', 'OverallCond',\n",
896 | " 'YearRemodAdd', 'RoofStyle', 'MasVnrType', 'BsmtQual', 'BsmtExposure',\n",
897 | " 'HeatingQC', 'CentralAir', '1stFlrSF', 'GrLivArea', 'BsmtFullBath',\n",
898 | " 'KitchenQual', 'Fireplaces', 'FireplaceQu', 'GarageType',\n",
899 | " 'GarageFinish', 'GarageCars', 'PavedDrive'],\n",
900 | " dtype='object')"
901 | ]
902 | },
903 | "execution_count": 7,
904 | "metadata": {},
905 | "output_type": "execute_result"
906 | }
907 | ],
908 | "source": [
909 | "# print the selected features\n",
910 | "selected_feat"
911 | ]
912 | },
913 | {
914 | "cell_type": "markdown",
915 | "metadata": {},
916 | "source": [
917 | "### Identify the selected variables"
918 | ]
919 | },
920 | {
921 | "cell_type": "code",
922 | "execution_count": 8,
923 | "metadata": {},
924 | "outputs": [
925 | {
926 | "data": {
927 | "text/plain": [
928 | "Index(['MSSubClass', 'MSZoning', 'Neighborhood', 'OverallQual', 'OverallCond',\n",
929 | " 'YearRemodAdd', 'RoofStyle', 'MasVnrType', 'BsmtQual', 'BsmtExposure',\n",
930 | " 'HeatingQC', 'CentralAir', '1stFlrSF', 'GrLivArea', 'BsmtFullBath',\n",
931 | " 'KitchenQual', 'Fireplaces', 'FireplaceQu', 'GarageType',\n",
932 | " 'GarageFinish', 'GarageCars', 'PavedDrive'],\n",
933 | " dtype='object')"
934 | ]
935 | },
936 | "execution_count": 8,
937 | "metadata": {},
938 | "output_type": "execute_result"
939 | }
940 | ],
941 | "source": [
942 | "# this is an alternative way of identifying the selected features \n",
943 | "# based on the non-zero regularisation coefficients:\n",
944 | "selected_feats = X_train.columns[(sel_.estimator_.coef_ != 0).ravel().tolist()]\n",
945 | "selected_feats"
946 | ]
947 | },
948 | {
949 | "cell_type": "code",
950 | "execution_count": 9,
951 | "metadata": {
952 | "collapsed": true
953 | },
954 | "outputs": [],
955 | "source": [
956 | "# now we save the selected list of features\n",
957 | "pd.Series(selected_feats).to_csv('selected_features.csv', index=False)"
958 | ]
959 | },
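 960 |   {
 961 |    "cell_type": "code",
 962 |    "execution_count": null,
 963 |    "metadata": {},
 964 |    "outputs": [],
 965 |    "source": [
 966 |     "# A quick sanity check, added as a sketch (not part of the original lecture):\n",
 967 |     "# with this pandas version the Series is saved without a header, so we read\n",
 968 |     "# it back with header=None (pandas was imported above as pd)\n",
 969 |     "features = pd.read_csv('selected_features.csv', header=None)\n",
 970 |     "features = [x for x in features[0]]\n",
 971 |     "print(len(features))  # expect 22, the number of selected features"
 972 |    ]
 973 |   },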
960 | {
961 | "cell_type": "markdown",
962 | "metadata": {
963 | "collapsed": true
964 | },
965 | "source": [
966 | "That is all for this notebook. In the next video, we will go ahead and build the final model using the selected features. See you then!"
967 | ]
968 | }
969 | ],
970 | "metadata": {
971 | "kernelspec": {
972 | "display_name": "Python 3",
973 | "language": "python",
974 | "name": "python3"
975 | },
976 | "language_info": {
977 | "codemirror_mode": {
978 | "name": "ipython",
979 | "version": 3
980 | },
981 | "file_extension": ".py",
982 | "mimetype": "text/x-python",
983 | "name": "python",
984 | "nbconvert_exporter": "python",
985 | "pygments_lexer": "ipython3",
986 | "version": "3.6.1"
987 | },
988 | "toc": {
989 | "nav_menu": {},
990 | "number_sections": true,
991 | "sideBar": true,
992 | "skip_h1_title": false,
993 | "toc_cell": false,
994 | "toc_position": {
995 | "height": "583px",
996 | "left": "0px",
997 | "right": "1324px",
998 | "top": "107px",
999 | "width": "212px"
1000 | },
1001 | "toc_section_display": "block",
1002 | "toc_window_display": true
1003 | }
1004 | },
1005 | "nbformat": 4,
1006 | "nbformat_minor": 2
1007 | }
1008 |
--------------------------------------------------------------------------------
/jupyter_notebooks/Section2_MLPipelineOverview/BONUS_Randomisation_in_ML _and_setting_the_seed.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Reproducibility in Machine Learning\n",
8 | "\n",
9 | "Reproducibility in machine learning modeling is an important problem faced by data scientists and companies seeking to put machine learning models into production. Reproducibility means that given the same inputs, we should obtain exactly the same outputs. And this is for both our research and our production environment. In other words, our research models and our deployed models should produce the same score for the same input.\n",
10 | "\n",
11 | "There are tremendous costs to irreproducible machine learning models including:\n",
12 | "\n",
13 | "- Financial costs\n",
14 | "- Time costs (lost time)\n",
15 | "- Reputational costs\n",
16 | "- Compliance costs\n",
17 | "- Regulatory costs\n",
18 | "\n",
19 | "The problems with reproducibility can arise in any and all of the machine learning building pipeline steps:\n",
20 | "\n",
21 | "- Data gathering\n",
22 | "- Feature extraction and feature engineering\n",
23 | "- Feature selection\n",
24 | "- Model building\n",
25 | "- Data scoring\n",
26 | "\n",
27 | "This is because all these steps involve elements of randomness. For example, if gathering data with SQL, there is an element of randomness when retrieving the rows from the database. During feature engineering, if we replace missing information by a random extraction of non-missing observations, we are introducing another layer of randomness. Machine learning models and feature selection algorithms involve randomness during model fitting. Think for example Random Forests; there is an element of randomness to select the features at each split, as well as to bootrstrap a sample of the dataset to fit each tree. For neural networks there is an element of randomness to initialise the network weights.\n",
28 | "\n",
29 | "In a future section, we will show you how to tackle reproducibility between research and deployment pipelines.\n",
30 | "\n",
31 | "For this section, please go ahead and get familiar with randomness in computer science and machine learning by visiting the following resources:\n",
32 | "\n",
33 | "- [Why do we need randomness?](https://www.kdnuggets.com/2017/06/surprising-complexity-randomness.html)\n",
34 | "- [Embrace Randomness in Machine Learning](https://machinelearningmastery.com/randomness-in-machine-learning/)\n",
35 | "- [Random Number Generators for ML in python](https://machinelearningmastery.com/introduction-to-random-number-generators-for-machine-learning/)"
36 | ]
37 | },
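 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "# A minimal seed-setting sketch, assuming the usual python stack;\n",
 45 |     "# this cell is illustrative and not part of the original lecture.\n",
 46 |     "import random\n",
 47 |     "\n",
 48 |     "import numpy as np\n",
 49 |     "\n",
 50 |     "SEED = 0  # any fixed integer works; what matters is that it never changes\n",
 51 |     "random.seed(SEED)     # seed python's built-in generator\n",
 52 |     "np.random.seed(SEED)  # seed numpy's global generator\n",
 53 |     "\n",
 54 |     "# scikit-learn estimators take the seed explicitly, e.g. Lasso(random_state=SEED)\n",
 55 |     "print(np.random.rand(3))  # identical output on every run"
 56 |    ]
 57 |   },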
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {
42 | "collapsed": true
43 | },
44 | "outputs": [],
45 | "source": []
46 | }
47 | ],
48 | "metadata": {
49 | "kernelspec": {
50 | "display_name": "Python 3",
51 | "language": "python",
52 | "name": "python3"
53 | },
54 | "language_info": {
55 | "codemirror_mode": {
56 | "name": "ipython",
57 | "version": 3
58 | },
59 | "file_extension": ".py",
60 | "mimetype": "text/x-python",
61 | "name": "python",
62 | "nbconvert_exporter": "python",
63 | "pygments_lexer": "ipython3",
64 | "version": "3.6.1"
65 | },
66 | "toc": {
67 | "nav_menu": {},
68 | "number_sections": true,
69 | "sideBar": true,
70 | "skip_h1_title": false,
71 | "toc_cell": false,
72 | "toc_position": {},
73 | "toc_section_display": "block",
74 | "toc_window_display": false
75 | }
76 | },
77 | "nbformat": 4,
78 | "nbformat_minor": 2
79 | }
80 |
--------------------------------------------------------------------------------
/jupyter_notebooks/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter==1.0.0
2 | matplotlib==3.0.2
3 | pandas==0.23.4
4 | numpy==1.13.3
5 | scikit-learn==0.19.0
6 | Keras==2.1.3
7 | opencv-python==4.0.0.21
8 | h5py==2.9.0
9 |
--------------------------------------------------------------------------------
/packages/ml_api/VERSION:
--------------------------------------------------------------------------------
1 | 0.2.1
--------------------------------------------------------------------------------
/packages/ml_api/api/__init__.py:
--------------------------------------------------------------------------------
1 | from api.config import PACKAGE_ROOT
2 |
3 | with open(PACKAGE_ROOT / 'VERSION') as version_file:
4 | __version__ = version_file.read().strip()
5 |
--------------------------------------------------------------------------------
/packages/ml_api/api/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask
2 |
3 | from api.config import get_logger
4 |
5 |
6 | _logger = get_logger(logger_name=__name__)
7 |
8 |
9 | def create_app(*, config_object) -> Flask:
10 | """Create a flask app instance."""
11 |
12 | flask_app = Flask('ml_api')
13 | flask_app.config.from_object(config_object)
14 |
15 | # import blueprints
16 | from api.controller import prediction_app
17 | flask_app.register_blueprint(prediction_app)
18 | _logger.debug('Application instance created')
19 |
20 | return flask_app
21 |
--------------------------------------------------------------------------------
/packages/ml_api/api/config.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from logging.handlers import TimedRotatingFileHandler
3 | import pathlib
4 | import os
5 | import sys
6 |
7 | PACKAGE_ROOT = pathlib.Path(__file__).resolve().parent.parent
8 |
9 | FORMATTER = logging.Formatter(
10 | "%(asctime)s — %(name)s — %(levelname)s —"
11 | "%(funcName)s:%(lineno)d — %(message)s")
12 | LOG_DIR = PACKAGE_ROOT / 'logs'
13 | LOG_DIR.mkdir(exist_ok=True)
14 | LOG_FILE = LOG_DIR / 'ml_api.log'
15 | UPLOAD_FOLDER = PACKAGE_ROOT / 'uploads'
16 | UPLOAD_FOLDER.mkdir(exist_ok=True)
17 |
18 | ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg'}
19 |
20 |
21 | def get_console_handler():
22 | console_handler = logging.StreamHandler(sys.stdout)
23 | console_handler.setFormatter(FORMATTER)
24 | return console_handler
25 |
26 |
27 | def get_file_handler():
28 | file_handler = TimedRotatingFileHandler(
29 | LOG_FILE, when='midnight')
30 | file_handler.setFormatter(FORMATTER)
31 | file_handler.setLevel(logging.WARNING)
32 | return file_handler
33 |
34 |
35 | def get_logger(*, logger_name):
36 | """Get logger with prepared handlers."""
37 |
38 | logger = logging.getLogger(logger_name)
39 |
40 | logger.setLevel(logging.INFO)
41 |
42 | logger.addHandler(get_console_handler())
43 | logger.addHandler(get_file_handler())
44 | logger.propagate = False
45 |
46 | return logger
47 |
48 |
49 | class Config:
50 | DEBUG = False
51 | TESTING = False
52 | CSRF_ENABLED = True
53 | SECRET_KEY = 'this-really-needs-to-be-changed'
54 | SERVER_PORT = 5000
55 | UPLOAD_FOLDER = UPLOAD_FOLDER
56 |
57 |
58 | class ProductionConfig(Config):
59 | DEBUG = False
60 |     SERVER_ADDRESS = os.environ.get('SERVER_ADDRESS', '0.0.0.0')
61 |     SERVER_PORT = os.environ.get('SERVER_PORT', '5000')
62 |
63 |
64 | class DevelopmentConfig(Config):
65 | DEVELOPMENT = True
66 | DEBUG = True
67 |
68 |
69 | class TestingConfig(Config):
70 | TESTING = True
71 |
--------------------------------------------------------------------------------
/packages/ml_api/api/controller.py:
--------------------------------------------------------------------------------
1 | from flask import Blueprint, request, jsonify
2 | from regression_model.predict import make_prediction
3 | from regression_model import __version__ as _version
4 | from neural_network_model.predict import make_single_prediction
5 | import os
6 | from werkzeug.utils import secure_filename
7 |
8 | from api.config import get_logger, UPLOAD_FOLDER
9 | from api.validation import validate_inputs, allowed_file
10 | from api import __version__ as api_version
11 |
12 | _logger = get_logger(logger_name=__name__)
13 |
14 |
15 | prediction_app = Blueprint('prediction_app', __name__)
16 |
17 |
18 | @prediction_app.route('/health', methods=['GET'])
19 | def health():
20 | if request.method == 'GET':
21 | _logger.info('health status OK')
22 | return 'ok'
23 |
24 |
25 | @prediction_app.route('/version', methods=['GET'])
26 | def version():
27 | if request.method == 'GET':
28 | return jsonify({'model_version': _version,
29 | 'api_version': api_version})
30 |
31 |
32 | @prediction_app.route('/v1/predict/regression', methods=['POST'])
33 | def predict():
34 | if request.method == 'POST':
35 | # Step 1: Extract POST data from request body as JSON
36 | json_data = request.get_json()
37 | _logger.debug(f'Inputs: {json_data}')
38 |
39 | # Step 2: Validate the input using marshmallow schema
40 | input_data, errors = validate_inputs(input_data=json_data)
41 |
42 | # Step 3: Model prediction
43 | result = make_prediction(input_data=input_data)
44 | _logger.debug(f'Outputs: {result}')
45 |
46 | # Step 4: Convert numpy ndarray to list
47 | predictions = result.get('predictions').tolist()
48 | version = result.get('version')
49 |
50 | # Step 5: Return the response as JSON
51 | return jsonify({'predictions': predictions,
52 | 'version': version,
53 | 'errors': errors})
54 |
55 |
56 | @prediction_app.route('/predict/classifier', methods=['POST'])
57 | def predict_image():
58 | if request.method == 'POST':
59 | # Step 1: check if the post request has the file part
60 | if 'file' not in request.files:
61 | return jsonify('No file found'), 400
62 |
63 | file = request.files['file']
64 |
65 | # Step 2: Basic file extension validation
66 | if file and allowed_file(file.filename):
67 | filename = secure_filename(file.filename)
68 |
69 | # Step 3: Save the file
70 | # Note, in production, this would require careful
71 | # validation, management and clean up.
72 | file.save(os.path.join(UPLOAD_FOLDER, filename))
73 |
74 | _logger.debug(f'Inputs: {filename}')
75 |
76 | # Step 4: perform prediction
77 | result = make_single_prediction(
78 | image_name=filename,
79 | image_directory=UPLOAD_FOLDER)
80 |
81 | _logger.debug(f'Outputs: {result}')
82 |
83 | readable_predictions = result.get('readable_predictions')
84 | version = result.get('version')
85 |
86 | # Step 5: Return the response as JSON
87 | return jsonify(
88 | {'readable_predictions': readable_predictions[0],
89 | 'version': version})
90 |
--------------------------------------------------------------------------------
/packages/ml_api/api/validation.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 |
3 | from marshmallow import Schema, fields
4 | from marshmallow import ValidationError
5 |
6 | from api import config
7 |
8 |
9 | class InvalidInputError(Exception):
10 | """Invalid model input."""
11 |
12 |
13 | SYNTAX_ERROR_FIELD_MAP = {
14 | '1stFlrSF': 'FirstFlrSF',
15 | '2ndFlrSF': 'SecondFlrSF',
16 | '3SsnPorch': 'ThreeSsnPortch'
17 | }
18 |
19 |
20 | class HouseDataRequestSchema(Schema):
21 | Alley = fields.Str(allow_none=True)
22 | BedroomAbvGr = fields.Integer()
23 | BldgType = fields.Str()
24 | BsmtCond = fields.Str()
25 | BsmtExposure = fields.Str(allow_none=True)
26 | BsmtFinSF1 = fields.Float()
27 | BsmtFinSF2 = fields.Float()
28 | BsmtFinType1 = fields.Str()
29 | BsmtFinType2 = fields.Str()
30 | BsmtFullBath = fields.Float()
31 | BsmtHalfBath = fields.Float()
32 | BsmtQual = fields.Str(allow_none=True)
33 | BsmtUnfSF = fields.Float()
34 | CentralAir = fields.Str()
35 | Condition1 = fields.Str()
36 | Condition2 = fields.Str()
37 | Electrical = fields.Str()
38 | EnclosedPorch = fields.Integer()
39 | ExterCond = fields.Str()
40 | ExterQual = fields.Str()
41 | Exterior1st = fields.Str()
42 | Exterior2nd = fields.Str()
43 | Fence = fields.Str(allow_none=True)
44 | FireplaceQu = fields.Str(allow_none=True)
45 | Fireplaces = fields.Integer()
46 | Foundation = fields.Str()
47 | FullBath = fields.Integer()
48 | Functional = fields.Str()
49 | GarageArea = fields.Float()
50 | GarageCars = fields.Float()
51 | GarageCond = fields.Str()
52 | GarageFinish = fields.Str(allow_none=True)
53 | GarageQual = fields.Str()
54 | GarageType = fields.Str(allow_none=True)
55 | GarageYrBlt = fields.Float()
56 | GrLivArea = fields.Integer()
57 | HalfBath = fields.Integer()
58 | Heating = fields.Str()
59 | HeatingQC = fields.Str()
60 | HouseStyle = fields.Str()
61 | Id = fields.Integer()
62 | KitchenAbvGr = fields.Integer()
63 | KitchenQual = fields.Str()
64 | LandContour = fields.Str()
65 | LandSlope = fields.Str()
66 | LotArea = fields.Integer()
67 | LotConfig = fields.Str()
68 | LotFrontage = fields.Float(allow_none=True)
69 | LotShape = fields.Str()
70 | LowQualFinSF = fields.Integer()
71 | MSSubClass = fields.Integer()
72 | MSZoning = fields.Str()
73 | MasVnrArea = fields.Float()
74 | MasVnrType = fields.Str(allow_none=True)
75 | MiscFeature = fields.Str(allow_none=True)
76 | MiscVal = fields.Integer()
77 | MoSold = fields.Integer()
78 | Neighborhood = fields.Str()
79 | OpenPorchSF = fields.Integer()
80 | OverallCond = fields.Integer()
81 | OverallQual = fields.Integer()
82 | PavedDrive = fields.Str()
83 | PoolArea = fields.Integer()
84 | PoolQC = fields.Str(allow_none=True)
85 | RoofMatl = fields.Str()
86 | RoofStyle = fields.Str()
87 | SaleCondition = fields.Str()
88 | SaleType = fields.Str()
89 | ScreenPorch = fields.Integer()
90 | Street = fields.Str()
91 | TotRmsAbvGrd = fields.Integer()
92 | TotalBsmtSF = fields.Float()
93 | Utilities = fields.Str()
94 | WoodDeckSF = fields.Integer()
95 | YearBuilt = fields.Integer()
96 | YearRemodAdd = fields.Integer()
97 | YrSold = fields.Integer()
98 | FirstFlrSF = fields.Integer()
99 | SecondFlrSF = fields.Integer()
100 | ThreeSsnPortch = fields.Integer()
101 |
102 |
103 | def _filter_error_rows(errors: dict,
104 | validated_input: t.List[dict]
105 | ) -> t.List[dict]:
106 | """Remove input data rows with errors."""
107 |
108 | indexes = errors.keys()
109 | # delete them in reverse order so that you
110 | # don't throw off the subsequent indexes.
111 | for index in sorted(indexes, reverse=True):
112 | del validated_input[index]
113 |
114 | return validated_input
115 |
116 |
117 | def validate_inputs(input_data):
118 | """Check prediction inputs against schema."""
119 |
120 | # set many=True to allow passing in a list
121 | schema = HouseDataRequestSchema(strict=True, many=True)
122 |
123 | # convert syntax error field names (beginning with numbers)
124 |     for row in input_data:
125 |         for key, value in SYNTAX_ERROR_FIELD_MAP.items():
126 |             row[value] = row[key]
127 |             del row[key]
128 |
129 | errors = None
130 | try:
131 | schema.load(input_data)
132 | except ValidationError as exc:
133 | errors = exc.messages
134 |
135 | # convert syntax error field names back
136 | # this is a hack - never name your data
137 | # fields with numbers as the first letter.
138 |     for row in input_data:
139 |         for key, value in SYNTAX_ERROR_FIELD_MAP.items():
140 |             row[key] = row[value]
141 |             del row[value]
142 |
143 | if errors:
144 | validated_input = _filter_error_rows(
145 | errors=errors,
146 | validated_input=input_data)
147 | else:
148 | validated_input = input_data
149 |
150 | return validated_input, errors
151 |
152 |
153 | def allowed_file(filename):
154 | return '.' in filename and \
155 | filename.rsplit('.', 1)[1].lower() in config.ALLOWED_EXTENSIONS
156 |
--------------------------------------------------------------------------------
/packages/ml_api/diff_test_requirements.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url=${PIP_EXTRA_INDEX_URL}
2 |
3 | # api
4 | flask==1.0.2
5 |
6 | # schema validation
7 | marshmallow==2.17.0
8 |
9 | # Set this to the previous model version
10 | regression-model==0.1.0
--------------------------------------------------------------------------------
/packages/ml_api/requirements.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url=${PIP_EXTRA_INDEX_URL}
2 |
3 | # api
4 | flask==1.0.2
5 |
6 | # schema validation
7 | marshmallow==2.17.0
8 |
9 | # Install from gemfury
10 | regression-model==1.0.0
11 | neural_network_model==0.1.1
12 |
13 | # Deployment
14 | gunicorn==19.9.0
--------------------------------------------------------------------------------
/packages/ml_api/run.py:
--------------------------------------------------------------------------------
1 | from api.app import create_app
2 | from api.config import DevelopmentConfig, ProductionConfig
3 |
4 |
5 | application = create_app(
6 | config_object=ProductionConfig)
7 |
8 |
9 | if __name__ == '__main__':
10 | application.run()
11 |
--------------------------------------------------------------------------------
/packages/ml_api/run.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export IS_DEBUG=${DEBUG:-false}
3 | exec gunicorn --bind 0.0.0.0:5000 --access-logfile - --error-logfile - run:application
--------------------------------------------------------------------------------
/packages/ml_api/test_data_predictions.csv:
--------------------------------------------------------------------------------
1 | ,predictions,version
2 | 0,143988.30704997465,0.2.0
3 | 1,116598.08159580332,0.2.0
4 | 2,130128.90560814076,0.2.0
5 | 3,113470.10675716968,0.2.0
6 | 4,159022.48121448176,0.2.0
7 | 5,139861.32732907546,0.2.0
8 | 6,227118.89767805065,0.2.0
9 | 7,91953.99400144782,0.2.0
10 | 8,225573.26579772323,0.2.0
11 | 9,125802.8602526304,0.2.0
12 | 10,137481.49149643493,0.2.0
13 | 11,124990.09839895074,0.2.0
14 | 12,133270.15609091,0.2.0
15 | 13,192143.4530280595,0.2.0
16 | 14,123206.5594461486,0.2.0
17 | 15,201801.77975634683,0.2.0
18 | 16,198027.98470170778,0.2.0
19 | 17,185664.94305866087,0.2.0
20 | 18,146728.39264190392,0.2.0
21 | 19,152443.1572738422,0.2.0
22 | 20,197054.58979409203,0.2.0
23 | 21,146781.9115319493,0.2.0
24 | 22,138838.0050135225,0.2.0
25 | 23,259997.45200360558,0.2.0
26 | 24,220904.18524276977,0.2.0
27 | 25,162760.6578114075,0.2.0
28 | 26,81622.7760115488,0.2.0
29 | 27,104671.50728326188,0.2.0
30 | 28,129551.38264993431,0.2.0
31 | 29,95446.01639989471,0.2.0
32 | 30,129507.4444341237,0.2.0
33 | 31,95477.93516568728,0.2.0
34 | 32,129422.6043698834,0.2.0
35 | 33,128062.38086640426,0.2.0
36 | 34,123419.71922835958,0.2.0
37 | 35,128318.94350485185,0.2.0
38 | 36,207431.6698047325,0.2.0
39 | 37,174685.92854135018,0.2.0
40 | 38,204544.1513220886,0.2.0
41 | 39,188046.15280301377,0.2.0
42 | 40,182971.78532877663,0.2.0
43 | 41,70097.27238622728,0.2.0
44 | 42,110733.2059471847,0.2.0
45 | 43,93994.92500037784,0.2.0
46 | 44,252924.35745892464,0.2.0
47 | 45,214641.99038515135,0.2.0
48 | 46,154979.9669243978,0.2.0
49 | 47,160810.80098181101,0.2.0
50 | 48,230690.236786167,0.2.0
51 | 49,196243.15614263792,0.2.0
52 | 50,177792.5604951465,0.2.0
53 | 51,150956.42632815256,0.2.0
54 | 52,168211.15880784288,0.2.0
55 | 53,158387.31855224012,0.2.0
56 | 54,114339.5601018531,0.2.0
57 | 55,90052.36198593948,0.2.0
58 | 56,89964.45949954129,0.2.0
59 | 57,98668.89304456668,0.2.0
60 | 58,121518.86270978909,0.2.0
61 | 59,134198.59781615838,0.2.0
62 | 60,163434.02753944616,0.2.0
63 | 61,135542.55508479764,0.2.0
64 | 62,141825.43043982252,0.2.0
65 | 63,227613.38755000453,0.2.0
66 | 64,188761.60830094197,0.2.0
67 | 65,116489.4563051063,0.2.0
68 | 66,167327.47818717395,0.2.0
69 | 67,183019.80781626955,0.2.0
70 | 68,263704.159135985,0.2.0
71 | 69,194109.36377179576,0.2.0
72 | 70,300262.7532032975,0.2.0
73 | 71,223004.09657281314,0.2.0
74 | 72,229985.38944263826,0.2.0
75 | 73,184172.20037350367,0.2.0
76 | 74,188222.84233142118,0.2.0
77 | 75,188097.29339417908,0.2.0
78 | 76,172331.10498565168,0.2.0
79 | 77,174886.6907641111,0.2.0
80 | 78,201441.14534017237,0.2.0
81 | 79,178852.47480584026,0.2.0
82 | 80,225286.87493988863,0.2.0
83 | 81,186618.03844702366,0.2.0
84 | 82,253907.81542043414,0.2.0
85 | 83,240359.90484464006,0.2.0
86 | 84,238601.0921535284,0.2.0
87 | 85,177935.77765021168,0.2.0
88 | 86,162057.79394455065,0.2.0
89 | 87,163514.64562596226,0.2.0
90 | 88,133002.50357947565,0.2.0
91 | 89,126285.82757075419,0.2.0
92 | 90,114122.89197558099,0.2.0
93 | 91,118965.43322308766,0.2.0
94 | 92,107820.17501469971,0.2.0
95 | 93,107672.41260124673,0.2.0
96 | 94,161142.56666974662,0.2.0
97 | 95,155175.112064241,0.2.0
98 | 96,159626.62056220102,0.2.0
99 | 97,159289.85166702382,0.2.0
100 | 98,164753.43823200595,0.2.0
101 | 99,130441.66184067688,0.2.0
102 | 100,150115.21843697876,0.2.0
103 | 101,363780.0225506806,0.2.0
104 | 102,330017.780544809,0.2.0
105 | 103,331883.3191102819,0.2.0
106 | 104,406837.5511403465,0.2.0
107 | 105,292997.10969063273,0.2.0
108 | 106,306609.27632288035,0.2.0
109 | 107,329626.60615839734,0.2.0
110 | 108,311532.52238578524,0.2.0
111 | 109,302589.7805774104,0.2.0
112 | 110,313113.53389941505,0.2.0
113 | 111,255492.2795391536,0.2.0
114 | 112,348040.2630000232,0.2.0
115 | 113,286215.77612206567,0.2.0
116 | 114,257811.3774942191,0.2.0
117 | 115,219056.33504400466,0.2.0
118 | 116,221072.9009001751,0.2.0
119 | 117,227272.5447635412,0.2.0
120 | 118,389000.9584031945,0.2.0
121 | 119,333081.2372066048,0.2.0
122 | 120,301748.2795090072,0.2.0
123 | 121,268886.605541231,0.2.0
124 | 122,292214.7783535345,0.2.0
125 | 123,218893.10534405566,0.2.0
126 | 124,198679.87790616706,0.2.0
127 | 125,198256.12319179106,0.2.0
128 | 126,203810.58008877232,0.2.0
129 | 127,200888.22351579432,0.2.0
130 | 128,208173.15639542375,0.2.0
131 | 129,208236.64492513813,0.2.0
132 | 130,204263.56750308358,0.2.0
133 | 131,194016.82016564548,0.2.0
134 | 132,247220.62121392722,0.2.0
135 | 133,186454.85767170336,0.2.0
136 | 134,183808.3284633914,0.2.0
137 | 135,184105.97903285234,0.2.0
138 | 136,239209.89605894414,0.2.0
139 | 137,184218.80235097196,0.2.0
140 | 138,307821.6280329202,0.2.0
141 | 139,309780.2215794851,0.2.0
142 | 140,250051.75088695402,0.2.0
143 | 141,264234.36472344183,0.2.0
144 | 142,238517.39539507058,0.2.0
145 | 143,253639.64599699862,0.2.0
146 | 144,266777.25555390265,0.2.0
147 | 145,249262.33173072065,0.2.0
148 | 146,354687.6212203011,0.2.0
149 | 147,211718.31772737036,0.2.0
150 | 148,208112.29103266165,0.2.0
151 | 149,269063.04990015837,0.2.0
152 | 150,232554.7387626751,0.2.0
153 | 151,267547.16223942576,0.2.0
154 | 152,259496.4322217068,0.2.0
155 | 153,254987.37388475015,0.2.0
156 | 154,213297.22522688,0.2.0
157 | 155,209521.4853124122,0.2.0
158 | 156,168400.4848772304,0.2.0
159 | 157,168269.52494463106,0.2.0
160 | 158,138015.7063444789,0.2.0
161 | 159,197692.7497359191,0.2.0
162 | 160,210792.23068435694,0.2.0
163 | 161,160895.21637656086,0.2.0
164 | 162,129967.65699942572,0.2.0
165 | 163,148887.7470968613,0.2.0
166 | 164,189032.60710901304,0.2.0
167 | 165,206354.3720483368,0.2.0
168 | 166,170625.45360343822,0.2.0
169 | 167,161155.2832590772,0.2.0
170 | 168,177241.4857453312,0.2.0
171 | 169,152617.9750132888,0.2.0
172 | 170,164767.3082372813,0.2.0
173 | 171,121689.0145099861,0.2.0
174 | 172,114755.20351999925,0.2.0
175 | 173,109385.54490451732,0.2.0
176 | 174,115908.28531894127,0.2.0
177 | 175,127297.15226141199,0.2.0
178 | 176,111687.7144642378,0.2.0
179 | 177,250341.40946203517,0.2.0
180 | 178,231747.51470786144,0.2.0
181 | 179,273940.75455758354,0.2.0
182 | 180,223840.72800951728,0.2.0
183 | 181,207683.72914446727,0.2.0
184 | 182,185613.50839666792,0.2.0
185 | 183,195932.25270587756,0.2.0
186 | 184,248138.38057655803,0.2.0
187 | 185,188290.29546011682,0.2.0
188 | 186,210444.7210381098,0.2.0
189 | 187,205928.18597414377,0.2.0
190 | 188,210044.0320203481,0.2.0
191 | 189,156787.38785618285,0.2.0
192 | 190,149779.3462459088,0.2.0
193 | 191,222254.2913941949,0.2.0
194 | 192,117338.5782329264,0.2.0
195 | 193,144956.37156722017,0.2.0
196 | 194,190502.7599290919,0.2.0
197 | 195,176058.9300745161,0.2.0
198 | 196,113437.17520996452,0.2.0
199 | 197,113005.87286210393,0.2.0
200 | 198,148396.4974016323,0.2.0
201 | 199,155111.51255427708,0.2.0
202 | 200,160895.4088655705,0.2.0
203 | 201,146811.64156366416,0.2.0
204 | 202,161697.96498210484,0.2.0
205 | 203,175408.29205737467,0.2.0
206 | 204,119486.7853118973,0.2.0
207 | 205,155735.2535739763,0.2.0
208 | 206,161732.25789945782,0.2.0
209 | 207,186302.28474718594,0.2.0
210 | 208,126314.40090076534,0.2.0
211 | 209,161489.29160402366,0.2.0
212 | 210,142192.79730554653,0.2.0
213 | 211,125295.79760954925,0.2.0
214 | 212,133726.54674477206,0.2.0
215 | 213,131402.58297528428,0.2.0
216 | 214,147256.8448434014,0.2.0
217 | 215,130042.3601888925,0.2.0
218 | 216,126109.99661525768,0.2.0
219 | 217,104028.06280588396,0.2.0
220 | 218,139015.86204044707,0.2.0
221 | 219,123915.67823516048,0.2.0
222 | 220,178112.6718654715,0.2.0
223 | 221,125873.4394256058,0.2.0
224 | 222,94911.69337443665,0.2.0
225 | 223,137426.63537243495,0.2.0
226 | 224,110144.45586689096,0.2.0
227 | 225,119424.4928970573,0.2.0
228 | 226,149432.93149379385,0.2.0
229 | 227,163081.24792773716,0.2.0
230 | 228,72754.84825273752,0.2.0
231 | 229,107008.00619034276,0.2.0
232 | 230,97026.69480171583,0.2.0
233 | 231,176624.72236581342,0.2.0
234 | 232,136815.75834336376,0.2.0
235 | 233,136527.98103527437,0.2.0
236 | 234,149254.9171475344,0.2.0
237 | 235,127404.15185928933,0.2.0
238 | 236,150150.4110071018,0.2.0
239 | 237,122947.21890337647,0.2.0
240 | 238,123038.56391694587,0.2.0
241 | 239,106055.04206900226,0.2.0
242 | 240,133737.62620695255,0.2.0
243 | 241,127761.33500718801,0.2.0
244 | 242,148651.3511288533,0.2.0
245 | 243,150394.04939898496,0.2.0
246 | 244,137871.15589031755,0.2.0
247 | 245,137889.2545253325,0.2.0
248 | 246,135021.79176355613,0.2.0
249 | 247,132212.93368155853,0.2.0
250 | 248,132394.6589172383,0.2.0
251 | 249,116451.46796853734,0.2.0
252 | 250,132045.77239979545,0.2.0
253 | 251,93828.92317256187,0.2.0
254 | 252,98304.79957463636,0.2.0
255 | 253,116592.62783055207,0.2.0
256 | 254,98723.66631722648,0.2.0
257 | 255,70121.22021310769,0.2.0
258 | 256,97709.23487001589,0.2.0
259 | 257,117883.99993469544,0.2.0
260 | 258,145026.28625503322,0.2.0
261 | 259,153912.57618886943,0.2.0
262 | 260,93381.08729006874,0.2.0
263 | 261,123495.69496267234,0.2.0
264 | 262,151217.31007381002,0.2.0
265 | 263,70925.4220942242,0.2.0
266 | 264,134164.7860642941,0.2.0
267 | 265,137115.50773650245,0.2.0
268 | 266,112454.46885682318,0.2.0
269 | 267,113576.35603796394,0.2.0
270 | 268,126311.04816994928,0.2.0
271 | 269,130853.87341430226,0.2.0
272 | 270,134365.47254085648,0.2.0
273 | 271,149331.816504544,0.2.0
274 | 272,113846.4490674583,0.2.0
275 | 273,127309.62370143532,0.2.0
276 | 274,138936.11004121447,0.2.0
277 | 275,126773.14110750334,0.2.0
278 | 276,118674.20763474096,0.2.0
279 | 277,94732.55765810968,0.2.0
280 | 278,115042.27875631058,0.2.0
281 | 279,97413.63757181565,0.2.0
282 | 280,125103.21858739002,0.2.0
283 | 281,127112.78156168538,0.2.0
284 | 282,100712.28345775318,0.2.0
285 | 283,123435.94852302536,0.2.0
286 | 284,146777.37991798244,0.2.0
287 | 285,141324.91303095603,0.2.0
288 | 286,147015.62617541858,0.2.0
289 | 287,182059.49685921244,0.2.0
290 | 288,66635.70748853082,0.2.0
291 | 289,113133.7345902136,0.2.0
292 | 290,115399.86396709623,0.2.0
293 | 291,142613.97712567318,0.2.0
294 | 292,122675.88261778199,0.2.0
295 | 293,128951.35723355877,0.2.0
296 | 294,159633.68071362676,0.2.0
297 | 295,163672.2859152473,0.2.0
298 | 296,200101.77128067127,0.2.0
299 | 297,166260.33914041193,0.2.0
300 | 298,150329.84339014755,0.2.0
301 | 299,140794.76572322496,0.2.0
302 | 300,166102.833620058,0.2.0
303 | 301,140183.19131161584,0.2.0
304 | 302,257819.0508760762,0.2.0
305 | 303,257819.0508760762,0.2.0
306 | 304,257819.0508760762,0.2.0
307 | 305,297489.40422482847,0.2.0
308 | 306,288713.0465842733,0.2.0
309 | 307,238840.80382128613,0.2.0
310 | 308,264054.2118258276,0.2.0
311 | 309,214038.27040784762,0.2.0
312 | 310,216541.14163119273,0.2.0
313 | 311,251482.14382697808,0.2.0
314 | 312,201302.78506297944,0.2.0
315 | 313,221418.6030263962,0.2.0
316 | 314,143245.9627266626,0.2.0
317 | 315,195099.27104358346,0.2.0
318 | 316,194957.58888827328,0.2.0
319 | 317,196553.0339968338,0.2.0
320 | 318,209163.81006532238,0.2.0
321 | 319,137593.75834543034,0.2.0
322 | 320,139886.56269297737,0.2.0
323 | 321,224462.0649769455,0.2.0
324 | 322,249722.4606197197,0.2.0
325 | 323,196221.2726508532,0.2.0
326 | 324,200883.07978660773,0.2.0
327 | 325,236876.5404898464,0.2.0
328 | 326,265449.9719556491,0.2.0
329 | 327,210031.52797804037,0.2.0
330 | 328,250335.16327422266,0.2.0
331 | 329,193702.5517580212,0.2.0
332 | 330,113345.66683243777,0.2.0
333 | 331,141908.87717126816,0.2.0
334 | 332,98061.70102934526,0.2.0
335 | 333,122961.05363435802,0.2.0
336 | 334,117995.15041902235,0.2.0
337 | 335,134068.9122846434,0.2.0
338 | 336,122607.11339521343,0.2.0
339 | 337,128632.12690453106,0.2.0
340 | 338,130665.06200115388,0.2.0
341 | 339,181867.81868509538,0.2.0
342 | 340,172320.99427457084,0.2.0
343 | 341,163115.13448378997,0.2.0
344 | 342,142692.95549842576,0.2.0
345 | 343,204336.63049215134,0.2.0
346 | 344,151865.2725254776,0.2.0
347 | 345,187999.9387459913,0.2.0
348 | 346,153898.50002741258,0.2.0
349 | 347,201370.60175011388,0.2.0
350 | 348,136260.79769104172,0.2.0
351 | 349,167661.378830941,0.2.0
352 | 350,151900.7260108396,0.2.0
353 | 351,203200.5976776774,0.2.0
354 | 352,275987.18626456213,0.2.0
355 | 353,131731.26809609786,0.2.0
356 | 354,72685.59185678526,0.2.0
357 | 355,264769.3677760745,0.2.0
358 | 356,223505.75506482823,0.2.0
359 | 357,140373.47418071458,0.2.0
360 | 358,165740.37720853413,0.2.0
361 | 359,153501.3958318297,0.2.0
362 | 360,333345.8132030645,0.2.0
363 | 361,284907.13582157245,0.2.0
364 | 362,235976.61331734635,0.2.0
365 | 363,237331.86536503406,0.2.0
366 | 364,222571.43251950064,0.2.0
367 | 365,330547.42125199316,0.2.0
368 | 366,126425.36283381855,0.2.0
369 | 367,150931.15863895716,0.2.0
370 | 368,116973.81860226691,0.2.0
371 | 369,147483.17081444428,0.2.0
372 | 370,137775.93779758728,0.2.0
373 | 371,136213.6538169831,0.2.0
374 | 372,160855.09129555486,0.2.0
375 | 373,180999.95456004038,0.2.0
376 | 374,177875.4323401108,0.2.0
377 | 375,183722.0684301858,0.2.0
378 | 376,183394.03709605164,0.2.0
379 | 377,167171.69796713692,0.2.0
380 | 378,253008.1582497637,0.2.0
381 | 379,208356.18546752,0.2.0
382 | 380,184067.27386951286,0.2.0
383 | 381,184525.57241064525,0.2.0
384 | 382,234914.10484877022,0.2.0
385 | 383,319321.39732491894,0.2.0
386 | 384,329258.81904322456,0.2.0
387 | 385,171807.44667235087,0.2.0
388 | 386,300439.8001753106,0.2.0
389 | 387,168715.42175203658,0.2.0
390 | 388,224083.29347340713,0.2.0
391 | 389,169027.4893700393,0.2.0
392 | 390,219986.76456349975,0.2.0
393 | 391,206599.36694968113,0.2.0
394 | 392,168431.21773772905,0.2.0
395 | 393,198938.11718684685,0.2.0
396 | 394,137044.70162504562,0.2.0
397 | 395,256489.3797086342,0.2.0
398 | 396,169081.6811380493,0.2.0
399 | 397,246159.3182317069,0.2.0
400 | 398,146517.01285907425,0.2.0
401 | 399,115488.93084257792,0.2.0
402 | 400,124226.28849234067,0.2.0
403 | 401,105765.49539858926,0.2.0
404 | 402,105734.63795160982,0.2.0
405 | 403,109307.7618847266,0.2.0
406 | 404,153399.47012489414,0.2.0
407 | 405,148098.79308079585,0.2.0
408 | 406,256865.85340555105,0.2.0
409 | 407,353705.2884855737,0.2.0
410 | 408,339406.68729405693,0.2.0
411 | 409,370934.7245862843,0.2.0
412 | 410,412758.66452745936,0.2.0
413 | 411,337318.9162127192,0.2.0
414 | 412,292636.5292003634,0.2.0
415 | 413,306738.89042618143,0.2.0
416 | 414,395200.33469924616,0.2.0
417 | 415,265420.90751885757,0.2.0
418 | 416,304674.1881521481,0.2.0
419 | 417,322466.11906014563,0.2.0
420 | 418,309583.69640512683,0.2.0
421 | 419,222251.71906371377,0.2.0
422 | 420,305633.12114918296,0.2.0
423 | 421,246068.43249597988,0.2.0
424 | 422,237392.40028237563,0.2.0
425 | 423,211279.01604200783,0.2.0
426 | 424,228094.0196541859,0.2.0
427 | 425,217362.23612708444,0.2.0
428 | 426,212395.21391217507,0.2.0
429 | 427,192157.327626266,0.2.0
430 | 428,210131.93667451647,0.2.0
431 | 429,218479.26431069477,0.2.0
432 | 430,227732.65975321413,0.2.0
433 | 431,207550.8611689138,0.2.0
434 | 432,196406.28233478937,0.2.0
435 | 433,215352.46117706495,0.2.0
436 | 434,195390.69073167298,0.2.0
437 | 435,268095.89486272854,0.2.0
438 | 436,317322.5783410133,0.2.0
439 | 437,292294.5209052129,0.2.0
440 | 438,256214.48067033372,0.2.0
441 | 439,289956.5518384693,0.2.0
442 | 440,285699.6865787319,0.2.0
443 | 441,238369.04431785582,0.2.0
444 | 442,266162.84585317614,0.2.0
445 | 443,276105.07384260837,0.2.0
446 | 444,241944.78930174315,0.2.0
447 | 445,212994.50831895912,0.2.0
448 | 446,266502.50110652676,0.2.0
449 | 447,203362.7111452237,0.2.0
450 | 448,180227.73055119175,0.2.0
451 | 449,188392.39553333411,0.2.0
452 | 450,142481.50831170173,0.2.0
453 | 451,174912.95802564104,0.2.0
454 | 452,168060.24103720946,0.2.0
455 | 453,170840.3065243665,0.2.0
456 | 454,185335.0674102329,0.2.0
457 | 455,175685.71835342573,0.2.0
458 | 456,182131.57134249242,0.2.0
459 | 457,127731.04705949678,0.2.0
460 | 458,130944.89863769621,0.2.0
461 | 459,105125.80701127343,0.2.0
462 | 460,113673.41846707783,0.2.0
463 | 461,171746.81645701104,0.2.0
464 | 462,147544.47667904384,0.2.0
465 | 463,266570.15210116236,0.2.0
466 | 464,340483.4209594863,0.2.0
467 | 465,193926.64894274823,0.2.0
468 | 466,177273.1783748505,0.2.0
469 | 467,188439.6899965548,0.2.0
470 | 468,179646.3820244513,0.2.0
471 | 469,277801.9107183519,0.2.0
472 | 470,244750.34380769494,0.2.0
473 | 471,264143.13027023565,0.2.0
474 | 472,264084.9900022445,0.2.0
475 | 473,190623.30283373612,0.2.0
476 | 474,218303.47626378198,0.2.0
477 | 475,209178.35576652727,0.2.0
478 | 476,210247.40015571192,0.2.0
479 | 477,305489.9014144604,0.2.0
480 | 478,206548.65094650167,0.2.0
481 | 479,260901.671279582,0.2.0
482 | 480,234130.08563281858,0.2.0
483 | 481,215084.1602052955,0.2.0
484 | 482,162068.0157257143,0.2.0
485 | 483,175403.3655499554,0.2.0
486 | 484,188329.78909449733,0.2.0
487 | 485,148772.6745077038,0.2.0
488 | 486,135234.48910921262,0.2.0
489 | 487,132981.35850945665,0.2.0
490 | 488,142443.15434220844,0.2.0
491 | 489,172322.6219487221,0.2.0
492 | 490,114015.40802504608,0.2.0
493 | 491,131679.82317114327,0.2.0
494 | 492,140830.26421534023,0.2.0
495 | 493,96630.01740632812,0.2.0
496 | 494,146497.76662391485,0.2.0
497 | 495,161384.411998765,0.2.0
498 | 496,122294.75296565886,0.2.0
499 | 497,187349.35839738324,0.2.0
500 | 498,139773.34125411394,0.2.0
501 | 499,151158.00827612064,0.2.0
502 |
--------------------------------------------------------------------------------
/packages/ml_api/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/ml_api/tests/__init__.py
--------------------------------------------------------------------------------
/packages/ml_api/tests/capture_model_predictions.py:
--------------------------------------------------------------------------------
1 | """
2 | This script should only be run in CI.
3 | Never run it locally or you will disrupt the
4 | differential test versioning logic.
5 | """
6 |
7 | import pandas as pd
8 |
9 | from regression_model.predict import make_prediction
10 | from regression_model.processing.data_management import load_dataset
11 |
12 | from api import config
13 |
14 |
15 | def capture_predictions() -> None:
16 | """Save the test data predictions to a CSV."""
17 |
18 | save_file = 'test_data_predictions.csv'
19 | test_data = load_dataset(file_name='test.csv')
20 |
21 | # we take a slice with no input validation issues
22 | multiple_test_input = test_data[99:600]
23 |
24 | predictions = make_prediction(input_data=multiple_test_input)
25 |
26 | # save predictions for the test dataset
27 | predictions_df = pd.DataFrame(predictions)
28 |
29 | # note: this saves the file into the ml_api package root in
30 | # the repo, not into the installed regression_model package
31 | predictions_df.to_csv(f'{config.PACKAGE_ROOT}/{save_file}')
32 |
33 |
34 | if __name__ == '__main__':
35 | capture_predictions()
36 |
--------------------------------------------------------------------------------
/packages/ml_api/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from api.app import create_app
4 | from api.config import TestingConfig
5 |
6 |
7 | @pytest.fixture
8 | def app():
9 | app = create_app(config_object=TestingConfig)
10 |
11 | with app.app_context():
12 | yield app
13 |
14 |
15 | @pytest.fixture
16 | def flask_test_client(app):
17 | with app.test_client() as test_client:
18 | yield test_client
19 |
--------------------------------------------------------------------------------
/packages/ml_api/tests/differential_tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/ml_api/tests/differential_tests/__init__.py
--------------------------------------------------------------------------------
/packages/ml_api/tests/differential_tests/test_differential.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | from regression_model.config import config as model_config
4 | from regression_model.predict import make_prediction
5 | from regression_model.processing.data_management import load_dataset
6 | import pandas as pd
7 | import pytest
8 |
9 |
10 | from api import config
11 |
12 |
13 | @pytest.mark.differential
14 | def test_model_prediction_differential(
15 | *,
16 | save_file: str = 'test_data_predictions.csv'):
17 | """
18 | This test compares the prediction result similarity of
19 | the current model with the previous model's results.
20 | """
21 |
22 | # Given
23 | # Load the saved previous model predictions
24 | previous_model_df = pd.read_csv(f'{config.PACKAGE_ROOT}/{save_file}')
25 | previous_model_predictions = previous_model_df.predictions.values
26 |
27 | test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE)
28 | multiple_test_input = test_data[99:600]
29 |
30 | # When
31 | current_result = make_prediction(input_data=multiple_test_input)
32 | current_model_predictions = current_result.get('predictions')
33 |
34 | # Then
35 | # diff the current model vs. the old model
36 | assert len(previous_model_predictions) == len(
37 | current_model_predictions)
38 |
39 | # Perform the differential test
40 | for previous_value, current_value in zip(
41 | previous_model_predictions, current_model_predictions):
42 |
43 | # convert numpy float64 to Python float.
44 | previous_value = previous_value.item()
45 | current_value = current_value.item()
46 |
47 | # rel_tol is the relative tolerance – it is the maximum allowed
48 | # difference between a and b, relative to the larger absolute
49 | # value of a or b. For example, to set a tolerance of 5%, pass
50 | # rel_tol=0.05.
51 | assert math.isclose(previous_value,
52 | current_value,
53 | rel_tol=model_config.ACCEPTABLE_MODEL_DIFFERENCE)
54 |
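55 | # Illustrative tolerance check (made-up values, not from the dataset):
56 | # with rel_tol=0.05, math.isclose(100_000.0, 104_000.0, rel_tol=0.05)
57 | # is True (~3.8% relative difference), while 100_000.0 vs 106_000.0
58 | # fails (~5.7% relative to the larger value).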
--------------------------------------------------------------------------------
/packages/ml_api/tests/test_controller.py:
--------------------------------------------------------------------------------
1 | import io
2 | import json
3 | import math
4 | import os
5 |
6 | from neural_network_model.config import config as cnn_config
7 | from regression_model import __version__ as _version
8 | from regression_model.config import config as model_config
9 | from regression_model.processing.data_management import load_dataset
10 |
11 | from api import __version__ as api_version
12 |
13 |
14 | def test_health_endpoint_returns_200(flask_test_client):
15 | # When
16 | response = flask_test_client.get('/health')
17 |
18 | # Then
19 | assert response.status_code == 200
20 |
21 |
22 | def test_version_endpoint_returns_version(flask_test_client):
23 | # When
24 | response = flask_test_client.get('/version')
25 |
26 | # Then
27 | assert response.status_code == 200
28 | response_json = json.loads(response.data)
29 | assert response_json['model_version'] == _version
30 | assert response_json['api_version'] == api_version
31 |
32 |
33 | def test_prediction_endpoint_returns_prediction(flask_test_client):
34 | # Given
35 |     # Load the test data from the regression_model package.
36 |     # Keeping the test data inside the model package, rather than
37 |     # spreading copies across packages, prevents the test data
38 |     # versions from drifting out of sync.
39 | test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE)
40 | post_json = test_data[0:1].to_json(orient='records')
41 |
42 | # When
43 | response = flask_test_client.post('/v1/predict/regression',
44 | json=json.loads(post_json))
45 |
46 | # Then
47 | assert response.status_code == 200
48 | response_json = json.loads(response.data)
49 | prediction = response_json['predictions']
50 | response_version = response_json['version']
51 | assert math.ceil(prediction[0]) == 112476
52 | assert response_version == _version
53 |
54 |
55 | def test_classifier_endpoint_returns_prediction(flask_test_client):
56 | # Given
57 |     # Load the test data from the neural_network_model package.
58 |     # Keeping the test data inside the model package, rather than
59 |     # spreading copies across packages, prevents the test data
60 |     # versions from drifting out of sync.
61 |     data_dir = os.path.abspath(os.path.join(cnn_config.DATA_FOLDER, os.pardir))
62 | test_dir = os.path.join(data_dir, 'test_data')
63 | black_grass_dir = os.path.join(test_dir, 'Black-grass')
64 | black_grass_image = os.path.join(black_grass_dir, '1.png')
65 | with open(black_grass_image, "rb") as image_file:
66 | file_bytes = image_file.read()
67 | data = dict(
68 | file=(io.BytesIO(bytearray(file_bytes)), "1.png"),
69 | )
70 |
71 | # When
72 | response = flask_test_client.post('/predict/classifier',
73 | content_type='multipart/form-data',
74 | data=data)
75 |
76 | # Then
77 | assert response.status_code == 200
78 | response_json = json.loads(response.data)
79 | assert response_json['readable_predictions']
80 |
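81 | # Note: the classifier endpoint expects a multipart/form-data upload
82 | # whose 'file' field carries the raw image bytes plus a filename, which
83 | # is why the payload above wraps the bytes in io.BytesIO.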
--------------------------------------------------------------------------------
/packages/ml_api/tests/test_validation.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from regression_model.config import config
4 | from regression_model.processing.data_management import load_dataset
5 |
6 |
7 | def test_prediction_endpoint_validation_200(flask_test_client):
8 | # Given
9 | # Load the test data from the regression_model package.
10 |     # Keeping the test data inside the model package, rather than
11 |     # spreading copies across packages, prevents the test data
12 |     # versions from drifting out of sync.
13 | test_data = load_dataset(file_name=config.TESTING_DATA_FILE)
14 | post_json = test_data.to_json(orient='records')
15 |
16 | # When
17 | response = flask_test_client.post('/v1/predict/regression',
18 | json=json.loads(post_json))
19 |
20 | # Then
21 | assert response.status_code == 200
22 | response_json = json.loads(response.data)
23 |
24 | # Check correct number of errors removed
25 | assert len(response_json.get('predictions')) + len(
26 | response_json.get('errors')) == len(test_data)
27 |
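28 | # i.e. every posted row either yields a prediction or an entry in
29 | # 'errors'; no rows are silently dropped by validation.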
--------------------------------------------------------------------------------
/packages/neural_network_model/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | include *.md
3 | include *.cfg
4 | include *.pkl
5 | recursive-include neural_network_model *.py
6 |
7 | include neural_network_model/trained_models/*.pkl
8 | include neural_network_model/trained_models/*.h5
9 | include neural_network_model/VERSION
10 | include neural_network_model/datasets/test_data/Black-grass/1.png
11 | include neural_network_model/datasets/test_data/Charlock/1.png
12 |
13 | include requirements.txt
14 | exclude *.log
15 |
16 | recursive-exclude * __pycache__
17 | recursive-exclude * *.py[co]
--------------------------------------------------------------------------------
/packages/neural_network_model/config.yml:
--------------------------------------------------------------------------------
1 | MODEL_NAME: ${MODEL_NAME:cnn_model}
2 | PIPELINE_NAME: ${PIPELINE_NAME:cnn_pipe}
3 | CLASSES_PATH: ${CLASSES_PATH:False}
4 | IMAGE_SIZE: ${IMAGE_SIZE:150}
5 |
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/VERSION:
--------------------------------------------------------------------------------
1 | 0.1.0
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from neural_network_model.config import config
4 |
5 |
6 | with open(os.path.join(config.PACKAGE_ROOT, 'VERSION')) as version_file:
7 | __version__ = version_file.read().strip()
8 |
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/neural_network_model/neural_network_model/config/__init__.py
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/config/config.py:
--------------------------------------------------------------------------------
1 | # The Keras model loading function does not play well with
2 | # Pathlib at the moment, so we are using the old os module
3 | # style
4 |
5 | import os
6 |
7 | PWD = os.path.dirname(os.path.abspath(__file__))
8 | PACKAGE_ROOT = os.path.abspath(os.path.join(PWD, '..'))
9 | DATASET_DIR = os.path.join(PACKAGE_ROOT, 'datasets')
10 | TRAINED_MODEL_DIR = os.path.join(PACKAGE_ROOT, 'trained_models')
11 | DATA_FOLDER = os.path.join(DATASET_DIR, 'v2-plant-seedlings-dataset')
12 |
13 | # MODEL PERSISTING
14 | MODEL_NAME = 'cnn_model'
15 | PIPELINE_NAME = 'cnn_pipe'
16 | CLASSES_NAME = 'classes'
17 | ENCODER_NAME = 'encoder'
18 |
19 | # MODEL FITTING
20 | IMAGE_SIZE = 150 # 50 for testing, 150 for final model
21 | BATCH_SIZE = 10
22 | EPOCHS = int(os.environ.get('EPOCHS', 1)) # 1 for testing, 10 for final model
23 |
24 |
25 | with open(os.path.join(PACKAGE_ROOT, 'VERSION')) as version_file:
26 | _version = version_file.read().strip()
27 |
28 | MODEL_FILE_NAME = f'{MODEL_NAME}_{_version}.h5'
29 | MODEL_PATH = os.path.join(TRAINED_MODEL_DIR, MODEL_FILE_NAME)
30 |
31 | PIPELINE_FILE_NAME = f'{PIPELINE_NAME}_{_version}.pkl'
32 | PIPELINE_PATH = os.path.join(TRAINED_MODEL_DIR, PIPELINE_FILE_NAME)
33 |
34 | CLASSES_FILE_NAME = f'{CLASSES_NAME}_{_version}.pkl'
35 | CLASSES_PATH = os.path.join(TRAINED_MODEL_DIR, CLASSES_FILE_NAME)
36 |
37 | ENCODER_FILE_NAME = f'{ENCODER_NAME}_{_version}.pkl'
38 | ENCODER_PATH = os.path.join(TRAINED_MODEL_DIR, ENCODER_FILE_NAME)
39 |
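40 | # For example, with the current VERSION file contents ('0.1.0'), the
41 | # names above resolve to 'cnn_model_0.1.0.h5', 'cnn_pipe_0.1.0.pkl',
42 | # 'classes_0.1.0.pkl' and 'encoder_0.1.0.pkl' inside trained_models/.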
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/neural_network_model/neural_network_model/datasets/__init__.py
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/datasets/test_data/Black-grass/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/neural_network_model/neural_network_model/datasets/test_data/Black-grass/1.png
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/datasets/test_data/Charlock/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/neural_network_model/neural_network_model/datasets/test_data/Charlock/1.png
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/datasets/test_data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/neural_network_model/neural_network_model/datasets/test_data/__init__.py
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/model.py:
--------------------------------------------------------------------------------
1 | # for the convolutional network
2 | from keras.models import Sequential
3 | from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
4 | from keras.optimizers import Adam
5 | from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
6 | from keras.wrappers.scikit_learn import KerasClassifier
7 |
8 | from neural_network_model.config import config
9 |
10 |
11 | def cnn_model(kernel_size=(3, 3),
12 | pool_size=(2, 2),
13 | first_filters=32,
14 | second_filters=64,
15 | third_filters=128,
16 | dropout_conv=0.3,
17 | dropout_dense=0.3,
18 | image_size=50):
19 |
20 | model = Sequential()
21 | model.add(Conv2D(
22 | first_filters,
23 | kernel_size,
24 | activation='relu',
25 | input_shape=(image_size, image_size, 3)))
26 |     model.add(Conv2D(first_filters, kernel_size, activation='relu'))
27 |     model.add(MaxPooling2D(pool_size=pool_size))
28 |     model.add(Dropout(dropout_conv))
29 |
30 |     model.add(Conv2D(second_filters, kernel_size, activation='relu'))
31 |     model.add(Conv2D(second_filters, kernel_size, activation='relu'))
32 |     model.add(MaxPooling2D(pool_size=pool_size))
33 |     model.add(Dropout(dropout_conv))
34 |
35 |     model.add(Conv2D(third_filters, kernel_size, activation='relu'))
36 |     model.add(Conv2D(third_filters, kernel_size, activation='relu'))
37 | model.add(MaxPooling2D(pool_size=pool_size))
38 | model.add(Dropout(dropout_conv))
39 |
40 | model.add(Flatten())
41 | model.add(Dense(256, activation="relu"))
42 | model.add(Dropout(dropout_dense))
43 | model.add(Dense(12, activation="softmax"))
44 |
45 |     model.compile(Adam(lr=0.0001),
46 |                   loss='categorical_crossentropy',  # 12-class softmax output
47 |                   metrics=['accuracy'])
48 |
49 | return model
50 |
51 |
52 | checkpoint = ModelCheckpoint(config.MODEL_PATH,
53 | monitor='acc',
54 | verbose=1,
55 | save_best_only=True,
56 | mode='max')
57 |
58 | reduce_lr = ReduceLROnPlateau(monitor='acc',
59 | factor=0.5,
60 | patience=2,
61 | verbose=1,
62 | mode='max',
63 | min_lr=0.00001)
64 |
65 | callbacks_list = [checkpoint, reduce_lr]
66 |
67 | cnn_clf = KerasClassifier(build_fn=cnn_model,
68 | batch_size=config.BATCH_SIZE,
69 |                           validation_split=0.1,  # hold out 10% of training data
70 | epochs=config.EPOCHS,
71 | verbose=1, # progress bar - required for CI job
72 | callbacks=callbacks_list,
73 | image_size=config.IMAGE_SIZE
74 | )
75 |
76 |
77 | if __name__ == '__main__':
78 | model = cnn_model()
79 | model.summary()
80 |
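81 | # cnn_clf is consumed by pipeline.py, where it forms the final step of
82 | # the sklearn Pipeline; fitting that pipeline triggers the checkpoint
83 | # and learning-rate callbacks defined above.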
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/pipeline.py:
--------------------------------------------------------------------------------
1 | from sklearn.pipeline import Pipeline
2 |
3 | from neural_network_model.config import config
4 | from neural_network_model.processing import preprocessors as pp
5 | from neural_network_model import model
6 |
7 |
8 | pipe = Pipeline([
9 | ('dataset', pp.CreateDataset(config.IMAGE_SIZE)),
10 | ('cnn_model', model.cnn_clf)])
11 |
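12 | # Usage sketch: train_pipeline.run_training() one-hot encodes the
13 | # targets with processing.preprocessors.TargetEncoder and then calls
14 | # pipe.fit(X_train, y_train) on the image paths loaded from disk.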
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/predict.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import pandas as pd
4 |
5 | from neural_network_model import __version__ as _version
6 | from neural_network_model.processing import data_management as dm
7 |
8 | _logger = logging.getLogger(__name__)
9 | KERAS_PIPELINE = dm.load_pipeline_keras()
10 | ENCODER = dm.load_encoder()
11 |
12 |
13 | def make_single_prediction(*, image_name: str, image_directory: str):
14 | """Make a single prediction using the saved model pipeline.
15 |
16 | Args:
17 | image_name: Filename of the image to classify
18 | image_directory: Location of the image to classify
19 |
20 |     Returns:
21 | Dictionary with both raw predictions and readable values.
22 | """
23 |
24 | image_df = dm.load_single_image(
25 | data_folder=image_directory,
26 | filename=image_name)
27 |
28 | prepared_df = image_df['image'].reset_index(drop=True)
29 | _logger.info(f'received input array: {prepared_df}, '
30 | f'filename: {image_name}')
31 |
32 | predictions = KERAS_PIPELINE.predict(prepared_df)
33 | readable_predictions = ENCODER.encoder.inverse_transform(predictions)
34 |
35 | _logger.info(f'Made prediction: {predictions}'
36 | f' with model version: {_version}')
37 |
38 | return dict(predictions=predictions,
39 | readable_predictions=readable_predictions,
40 | version=_version)
41 |
42 |
43 | def make_bulk_prediction(*, images_df: pd.Series) -> dict:
44 | """Make multiple predictions using the saved model pipeline.
45 |
46 | Currently, this function is primarily for testing purposes,
47 | allowing us to pass in a directory of images for running
48 | bulk predictions.
49 |
50 | Args:
51 | images_df: Pandas series of images
52 |
53 |     Returns:
54 | Dictionary with both raw predictions and their classifications.
55 | """
56 |
57 | _logger.info(f'received input df: {images_df}')
58 |
59 | predictions = KERAS_PIPELINE.predict(images_df)
60 | readable_predictions = ENCODER.encoder.inverse_transform(predictions)
61 |
62 | _logger.info(f'Made predictions: {predictions}'
63 | f' with model version: {_version}')
64 |
65 | return dict(predictions=predictions,
66 | readable_predictions=readable_predictions,
67 | version=_version)
68 |
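69 | # Minimal usage sketch (mirrors tests/test_predict.py; the directory
70 | # placeholder below is illustrative):
71 | #
72 | #     result = make_single_prediction(
73 | #         image_name='1.png',
74 | #         image_directory='<path to datasets/test_data/Charlock>')
75 | #     result['readable_predictions'][0]  # -> 'Charlock'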
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/processing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/neural_network_model/neural_network_model/processing/__init__.py
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/processing/data_management.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import typing as t
4 | from glob import glob
5 | from pathlib import Path
6 |
7 | import pandas as pd
8 | from keras.models import load_model
9 | from keras.wrappers.scikit_learn import KerasClassifier
10 | from sklearn.externals import joblib
11 | from sklearn.model_selection import train_test_split
12 | from sklearn.pipeline import Pipeline
13 | from sklearn.preprocessing import LabelEncoder
14 |
15 | from neural_network_model import model as m
16 | from neural_network_model.config import config
17 |
18 | _logger = logging.getLogger(__name__)
19 |
20 |
21 | def load_single_image(data_folder: str, filename: str) -> pd.DataFrame:
22 | """Makes dataframe with image path and target."""
23 |
24 | image_df = []
25 |
26 | # search for specific image in directory
27 | for image_path in glob(os.path.join(data_folder, f'{filename}')):
28 | tmp = pd.DataFrame([image_path, 'unknown']).T
29 | image_df.append(tmp)
30 |
31 | # concatenate the final df
32 | images_df = pd.concat(image_df, axis=0, ignore_index=True)
33 | images_df.columns = ['image', 'target']
34 |
35 | return images_df
36 |
37 |
38 | def load_image_paths(data_folder: str) -> pd.DataFrame:
39 | """Makes dataframe with image path and target."""
40 |
41 | images_df = []
42 |
43 | # navigate within each folder
44 | for class_folder_name in os.listdir(data_folder):
45 | class_folder_path = os.path.join(data_folder, class_folder_name)
46 |
47 | # collect every image path
48 | for image_path in glob(os.path.join(class_folder_path, "*.png")):
49 | tmp = pd.DataFrame([image_path, class_folder_name]).T
50 | images_df.append(tmp)
51 |
52 | # concatenate the final df
53 | images_df = pd.concat(images_df, axis=0, ignore_index=True)
54 | images_df.columns = ['image', 'target']
55 |
56 | return images_df
57 |
58 |
59 | def get_train_test_target(df: pd.DataFrame):
60 | """Split a dataset into train and test segments."""
61 |
62 | X_train, X_test, y_train, y_test = train_test_split(df['image'],
63 | df['target'],
64 | test_size=0.20,
65 | random_state=101)
66 |
67 | X_train.reset_index(drop=True, inplace=True)
68 | X_test.reset_index(drop=True, inplace=True)
69 |
70 | y_train.reset_index(drop=True, inplace=True)
71 | y_test.reset_index(drop=True, inplace=True)
72 |
73 | return X_train, X_test, y_train, y_test
74 |
75 |
76 | def save_pipeline_keras(model) -> None:
77 | """Persist keras model to disk."""
78 |
79 | joblib.dump(model.named_steps['dataset'], config.PIPELINE_PATH)
80 | joblib.dump(model.named_steps['cnn_model'].classes_, config.CLASSES_PATH)
81 | model.named_steps['cnn_model'].model.save(str(config.MODEL_PATH))
82 |
83 | remove_old_pipelines(
84 | files_to_keep=[config.MODEL_FILE_NAME, config.ENCODER_FILE_NAME,
85 | config.PIPELINE_FILE_NAME, config.CLASSES_FILE_NAME])
86 |
87 |
88 | def load_pipeline_keras() -> Pipeline:
89 | """Load a Keras Pipeline from disk."""
90 |
91 | dataset = joblib.load(config.PIPELINE_PATH)
92 |
93 | build_model = lambda: load_model(config.MODEL_PATH)
94 |
95 | classifier = KerasClassifier(build_fn=build_model,
96 | batch_size=config.BATCH_SIZE,
97 |                                  validation_split=0.1,  # hold out 10% of training data
98 | epochs=config.EPOCHS,
99 | verbose=2,
100 | callbacks=m.callbacks_list,
101 | # image_size = config.IMAGE_SIZE
102 | )
103 |
104 | classifier.classes_ = joblib.load(config.CLASSES_PATH)
105 | classifier.model = build_model()
106 |
107 | return Pipeline([
108 | ('dataset', dataset),
109 | ('cnn_model', classifier)
110 | ])
111 |
112 |
113 | def load_encoder() -> LabelEncoder:
114 | encoder = joblib.load(config.ENCODER_PATH)
115 |
116 | return encoder
117 |
118 |
119 | def remove_old_pipelines(*, files_to_keep: t.List[str]) -> None:
120 | """
121 | Remove old model pipelines, models, encoders and classes.
122 |
123 | This is to ensure there is a simple one-to-one
124 | mapping between the package version and the model
125 | version to be imported and used by other applications.
126 | """
127 | do_not_delete = files_to_keep + ['__init__.py']
128 | for model_file in Path(config.TRAINED_MODEL_DIR).iterdir():
129 | if model_file.name not in do_not_delete:
130 | model_file.unlink()
131 |
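132 | # Note the deliberate asymmetry with save_pipeline_keras above: the
133 | # Keras model itself is persisted via model.save (HDF5), while the
134 | # dataset step and classes are pickled with joblib, so loading must
135 | # rebuild the KerasClassifier wrapper around load_model.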
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/processing/errors.py:
--------------------------------------------------------------------------------
1 | class BaseError(Exception):
2 | """Base package error."""
3 |
4 |
5 | class InvalidModelInputError(BaseError):
6 | """Model input contains an error."""
7 |
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/processing/preprocessors.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import cv2
3 | from keras.utils import np_utils
4 | from sklearn.preprocessing import LabelEncoder
5 | from sklearn.base import BaseEstimator, TransformerMixin
6 |
7 |
8 | class TargetEncoder(BaseEstimator, TransformerMixin):
9 |
10 | def __init__(self, encoder=LabelEncoder()):
11 | self.encoder = encoder
12 |
13 | def fit(self, X, y=None):
14 |         # note that X here is the target, not the feature matrix
15 | self.encoder.fit(X)
16 | return self
17 |
18 | def transform(self, X):
19 | X = X.copy()
20 | X = np_utils.to_categorical(self.encoder.transform(X))
21 | return X
22 |
23 |
24 | def _im_resize(df, n, image_size):
25 | im = cv2.imread(df[n])
26 | im = cv2.resize(im, (image_size, image_size))
27 | return im
28 |
29 |
30 | class CreateDataset(BaseEstimator, TransformerMixin):
31 |
32 | def __init__(self, image_size=50):
33 | self.image_size = image_size
34 |
35 | def fit(self, X, y=None):
36 | return self
37 |
38 | def transform(self, X):
39 | X = X.copy()
40 | tmp = np.zeros((len(X),
41 | self.image_size,
42 | self.image_size, 3), dtype='float32')
43 |
44 | for n in range(0, len(X)):
45 | im = _im_resize(X, n, self.image_size)
46 | tmp[n] = im
47 |
48 | print('Dataset Images shape: {} size: {:,}'.format(
49 | tmp.shape, tmp.size))
50 | return tmp
51 |
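52 | # For instance, CreateDataset(image_size=150).transform(images_df['image'])
53 | # returns a float32 array of shape (n_images, 150, 150, 3), matching the
54 | # Conv2D input layer defined in model.py.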
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/train_pipeline.py:
--------------------------------------------------------------------------------
1 | from sklearn.externals import joblib
2 |
3 | from neural_network_model import pipeline as pipe
4 | from neural_network_model.config import config
5 | from neural_network_model.processing import data_management as dm
6 | from neural_network_model.processing import preprocessors as pp
7 |
8 |
9 | def run_training(save_result: bool = True):
10 | """Train a Convolutional Neural Network."""
11 |
12 | images_df = dm.load_image_paths(config.DATA_FOLDER)
13 | X_train, X_test, y_train, y_test = dm.get_train_test_target(images_df)
14 |
15 | enc = pp.TargetEncoder()
16 | enc.fit(y_train)
17 | y_train = enc.transform(y_train)
18 |
19 | pipe.pipe.fit(X_train, y_train)
20 |
21 | if save_result:
22 | joblib.dump(enc, config.ENCODER_PATH)
23 | dm.save_pipeline_keras(pipe.pipe)
24 |
25 |
26 | if __name__ == '__main__':
27 | run_training(save_result=True)
28 |
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/trained_models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/neural_network_model/neural_network_model/trained_models/__init__.py
--------------------------------------------------------------------------------
/packages/neural_network_model/requirements.txt:
--------------------------------------------------------------------------------
1 | # production requirements
2 | pandas==0.23.4
3 | numpy==1.13.3
4 | scikit-learn==0.19.0
5 | Keras==2.1.3
6 | opencv-python==4.0.0.21
7 | h5py==2.9.0
8 | Theano==0.9.0
9 |
10 | # packaging
11 | setuptools==40.6.3
12 | wheel==0.32.3
13 |
14 | # testing requirements
15 | pytest==4.0.2
16 |
17 | # fetching datasets
18 | kaggle==1.5.1.1
--------------------------------------------------------------------------------
/packages/neural_network_model/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import io
5 | import os
6 | from pathlib import Path
7 |
8 | from setuptools import find_packages, setup
9 |
10 |
11 | # Package meta-data.
12 | NAME = 'neural_network_model'
13 | DESCRIPTION = 'Train and deploy neural network model.'
14 | URL = 'your github project'
15 | EMAIL = 'your_email@email.com'
16 | AUTHOR = 'Your name'
17 | REQUIRES_PYTHON = '>=3.6.0'
18 |
19 |
20 | # What packages are required for this module to be executed?
21 | def list_reqs(fname='requirements.txt'):
22 | with open(fname) as fd:
23 | return fd.read().splitlines()
24 |
25 |
26 | # The rest you shouldn't have to touch too much :)
27 | # ------------------------------------------------
28 | # Except, perhaps the License and Trove Classifiers!
29 | # If you do change the License, remember to change the
30 | # Trove Classifier for that!
31 |
32 | here = os.path.abspath(os.path.dirname(__file__))
33 |
34 | # Import the README and use it as the long-description.
35 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file!
36 | try:
37 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
38 | long_description = '\n' + f.read()
39 | except FileNotFoundError:
40 | long_description = DESCRIPTION
41 |
42 |
43 | # Load the package's __version__.py module as a dictionary.
44 | ROOT_DIR = Path(__file__).resolve().parent
45 | PACKAGE_DIR = ROOT_DIR / NAME
46 | about = {}
47 | with open(PACKAGE_DIR / 'VERSION') as f:
48 | _version = f.read().strip()
49 | about['__version__'] = _version
50 |
51 |
52 | # Where the magic happens:
53 | setup(
54 | name=NAME,
55 | version=about['__version__'],
56 | description=DESCRIPTION,
57 | long_description=long_description,
58 | long_description_content_type='text/markdown',
59 | author=AUTHOR,
60 | author_email=EMAIL,
61 | python_requires=REQUIRES_PYTHON,
62 | url=URL,
63 | packages=find_packages(exclude=('tests',)),
64 | package_data={'neural_network_model': ['VERSION']},
65 | install_requires=list_reqs(),
66 | extras_require={},
67 | include_package_data=True,
68 | license='MIT',
69 | classifiers=[
70 | # Trove classifiers
71 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
72 | 'License :: OSI Approved :: MIT License',
73 | 'Programming Language :: Python',
74 | 'Programming Language :: Python :: 3',
75 | 'Programming Language :: Python :: 3.6',
76 | 'Programming Language :: Python :: Implementation :: CPython',
77 | 'Programming Language :: Python :: Implementation :: PyPy'
78 | ],
79 | )
80 |
--------------------------------------------------------------------------------
/packages/neural_network_model/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/neural_network_model/tests/__init__.py
--------------------------------------------------------------------------------
/packages/neural_network_model/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import os
3 |
4 | from neural_network_model.config import config
5 |
6 |
7 | @pytest.fixture
8 | def black_grass_dir():
9 | test_data_dir = os.path.join(config.DATASET_DIR, 'test_data')
10 | black_grass_dir = os.path.join(test_data_dir, 'Black-grass')
11 |
12 | return black_grass_dir
13 |
14 |
15 | @pytest.fixture
16 | def charlock_dir():
17 | test_data_dir = os.path.join(config.DATASET_DIR, 'test_data')
18 | charlock_dir = os.path.join(test_data_dir, 'Charlock')
19 |
20 | return charlock_dir
21 |
--------------------------------------------------------------------------------
/packages/neural_network_model/tests/test_predict.py:
--------------------------------------------------------------------------------
1 | from neural_network_model import __version__ as _version
2 | from neural_network_model.predict import (make_single_prediction)
3 |
4 |
5 | def test_make_prediction_on_sample(charlock_dir):
6 | # Given
7 | filename = '1.png'
8 | expected_classification = 'Charlock'
9 |
10 | # When
11 | results = make_single_prediction(image_directory=charlock_dir,
12 | image_name=filename)
13 |
14 | # Then
15 | assert results['predictions'] is not None
16 | assert results['readable_predictions'][0] == expected_classification
17 | assert results['version'] == _version
18 |
--------------------------------------------------------------------------------
/packages/regression_model/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | include *.md
3 | include *.cfg
4 | include *.pkl
5 | recursive-include regression_model *
6 |
7 | include regression_model/datasets/train.csv
8 | include regression_model/datasets/test.csv
9 | include regression_model/trained_models/*.pkl
10 | include regression_model/VERSION
11 |
12 | include requirements.txt
13 | exclude *.log
14 |
15 | recursive-exclude * __pycache__
16 | recursive-exclude * *.py[co]
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/VERSION:
--------------------------------------------------------------------------------
1 | 1.0.0
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | from regression_model.config import config
5 | from regression_model.config import logging_config
6 |
7 |
8 | # Configure logger for use in package
9 | logger = logging.getLogger(__name__)
10 | logger.setLevel(logging.DEBUG)
11 | logger.addHandler(logging_config.get_console_handler())
12 | logger.propagate = False
13 |
14 |
15 | with open(os.path.join(config.PACKAGE_ROOT, 'VERSION')) as version_file:
16 | __version__ = version_file.read().strip()
17 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/regression_model/regression_model/config/__init__.py
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/config/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pathlib
3 |
4 | import regression_model
5 |
6 | import pandas as pd
7 |
8 |
9 | pd.options.display.max_rows = 10
10 | pd.options.display.max_columns = 10
11 |
12 |
13 | PACKAGE_ROOT = pathlib.Path(regression_model.__file__).resolve().parent
14 | TRAINED_MODEL_DIR = PACKAGE_ROOT / 'trained_models'
15 | DATASET_DIR = PACKAGE_ROOT / 'datasets'
16 |
17 | # data
18 | TESTING_DATA_FILE = 'test.csv'
19 | TRAINING_DATA_FILE = 'train.csv'
20 | TARGET = 'SalePrice'
21 |
22 |
23 | # variables
24 | FEATURES = ['MSSubClass', 'MSZoning', 'Neighborhood',
25 | 'OverallQual', 'OverallCond', 'YearRemodAdd',
26 | 'RoofStyle', 'MasVnrType', 'BsmtQual', 'BsmtExposure',
27 | 'HeatingQC', 'CentralAir', '1stFlrSF', 'GrLivArea',
28 | 'BsmtFullBath', 'KitchenQual', 'Fireplaces', 'FireplaceQu',
29 | 'GarageType', 'GarageFinish', 'GarageCars', 'PavedDrive',
30 | 'LotFrontage',
31 | # this one is only to calculate temporal variable:
32 | 'YrSold']
33 |
34 | # this variable is to calculate the temporal variable,
35 | # can be dropped afterwards
36 | DROP_FEATURES = 'YrSold'
37 |
38 | # numerical variables with NA in train set
39 | NUMERICAL_VARS_WITH_NA = ['LotFrontage']
40 |
41 | # categorical variables with NA in train set
42 | CATEGORICAL_VARS_WITH_NA = ['MasVnrType', 'BsmtQual', 'BsmtExposure',
43 | 'FireplaceQu', 'GarageType', 'GarageFinish']
44 |
45 | TEMPORAL_VARS = 'YearRemodAdd'
46 |
47 | # variables to log transform
48 | NUMERICALS_LOG_VARS = ['LotFrontage', '1stFlrSF', 'GrLivArea']
49 |
50 | # categorical variables to encode
51 | CATEGORICAL_VARS = ['MSZoning', 'Neighborhood', 'RoofStyle', 'MasVnrType',
52 | 'BsmtQual', 'BsmtExposure', 'HeatingQC', 'CentralAir',
53 | 'KitchenQual', 'FireplaceQu', 'GarageType', 'GarageFinish',
54 | 'PavedDrive']
55 |
56 | NUMERICAL_NA_NOT_ALLOWED = [
57 | feature for feature in FEATURES
58 | if feature not in CATEGORICAL_VARS + NUMERICAL_VARS_WITH_NA
59 | ]
60 |
61 | CATEGORICAL_NA_NOT_ALLOWED = [
62 | feature for feature in CATEGORICAL_VARS
63 | if feature not in CATEGORICAL_VARS_WITH_NA
64 | ]
65 |
66 |
67 | PIPELINE_NAME = 'lasso_regression'
68 | PIPELINE_SAVE_FILE = f'{PIPELINE_NAME}_output_v'
69 |
70 | # used for differential testing
71 | ACCEPTABLE_MODEL_DIFFERENCE = 0.05
72 |
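73 | # e.g. with ACCEPTABLE_MODEL_DIFFERENCE = 0.05, the differential test
74 | # permits each new prediction to deviate by at most 5% (relative)
75 | # from the prediction saved for the previous model version.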
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/config/logging_config.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from logging.handlers import TimedRotatingFileHandler
3 | import os
4 | import sys
5 |
6 | from regression_model.config import config
7 |
8 | # Multiple calls to logging.getLogger('someLogger') return a
9 | # reference to the same logger object. This is true not only
10 | # within the same module, but also across modules as long as
11 | # it is in the same Python interpreter process.
12 |
13 | FORMATTER = logging.Formatter(
14 | "%(asctime)s — %(name)s — %(levelname)s —"
15 | "%(funcName)s:%(lineno)d — %(message)s")
16 |
17 |
18 | def get_console_handler():
19 | console_handler = logging.StreamHandler(sys.stdout)
20 | console_handler.setFormatter(FORMATTER)
21 | return console_handler
22 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/regression_model/regression_model/datasets/__init__.py
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/pipeline.py:
--------------------------------------------------------------------------------
1 | from sklearn.linear_model import Lasso
2 | from sklearn.pipeline import Pipeline
3 | from sklearn.preprocessing import MinMaxScaler
4 |
5 | from regression_model.processing import preprocessors as pp
6 | from regression_model.processing import features
7 | from regression_model.config import config
8 |
9 | import logging
10 |
11 |
12 | _logger = logging.getLogger(__name__)
13 |
14 |
15 | price_pipe = Pipeline(
16 | [
17 | ('categorical_imputer',
18 | pp.CategoricalImputer(variables=config.CATEGORICAL_VARS_WITH_NA)),
19 |         ('numerical_imputer',
20 | pp.NumericalImputer(variables=config.NUMERICAL_VARS_WITH_NA)),
21 | ('temporal_variable',
22 | pp.TemporalVariableEstimator(
23 | variables=config.TEMPORAL_VARS,
24 | reference_variable=config.DROP_FEATURES)),
25 | ('rare_label_encoder',
26 | pp.RareLabelCategoricalEncoder(
27 | tol=0.01,
28 | variables=config.CATEGORICAL_VARS)),
29 | ('categorical_encoder',
30 | pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS)),
31 | ('log_transformer',
32 | features.LogTransformer(variables=config.NUMERICALS_LOG_VARS)),
33 | ('drop_features',
34 | pp.DropUnecessaryFeatures(variables_to_drop=config.DROP_FEATURES)),
35 | ('scaler', MinMaxScaler()),
37 |         ('linear_model', Lasso(alpha=0.005, random_state=0))
37 | ]
38 | )
39 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/predict.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from regression_model.processing.data_management import load_pipeline
5 | from regression_model.config import config
6 | from regression_model.processing.validation import validate_inputs
7 | from regression_model import __version__ as _version
8 |
9 | import logging
10 | import typing as t
11 |
12 |
13 | _logger = logging.getLogger(__name__)
14 |
15 | pipeline_file_name = f'{config.PIPELINE_SAVE_FILE}{_version}.pkl'
16 | _price_pipe = load_pipeline(file_name=pipeline_file_name)
17 |
18 |
19 | def make_prediction(*, input_data: t.Union[pd.DataFrame, dict],
20 | ) -> dict:
21 | """Make a prediction using a saved model pipeline.
22 |
23 | Args:
24 |         input_data: DataFrame or dict of model prediction inputs.
25 |
26 | Returns:
27 | Predictions for each input row, as well as the model version.
28 | """
29 |
30 | data = pd.DataFrame(input_data)
31 | validated_data = validate_inputs(input_data=data)
32 |
33 | prediction = _price_pipe.predict(validated_data[config.FEATURES])
34 |
35 | output = np.exp(prediction)
36 |
37 | results = {'predictions': output, 'version': _version}
38 |
39 | _logger.info(
40 | f'Making predictions with model version: {_version} '
41 | f'Inputs: {validated_data} '
42 | f'Predictions: {results}')
43 |
44 | return results
45 |
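46 | # Note: np.exp above maps the model output back to the original
47 | # SalePrice scale, undoing the log transform applied to the target
48 | # at training time.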
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/processing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/regression_model/regression_model/processing/__init__.py
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/processing/data_management.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.externals import joblib
3 | from sklearn.pipeline import Pipeline
4 |
5 | from regression_model.config import config
6 | from regression_model import __version__ as _version
7 |
8 | import logging
9 | import typing as t
10 |
11 |
12 | _logger = logging.getLogger(__name__)
13 |
14 |
15 | def load_dataset(*, file_name: str
16 | ) -> pd.DataFrame:
17 | _data = pd.read_csv(f'{config.DATASET_DIR}/{file_name}')
18 | return _data
19 |
20 |
21 | def save_pipeline(*, pipeline_to_persist) -> None:
22 | """Persist the pipeline.
23 |
24 | Saves the versioned model, and overwrites any previous
25 | saved models. This ensures that when the package is
26 | published, there is only one trained model that can be
27 | called, and we know exactly how it was built.
28 | """
29 |
30 | # Prepare versioned save file name
31 | save_file_name = f'{config.PIPELINE_SAVE_FILE}{_version}.pkl'
32 | save_path = config.TRAINED_MODEL_DIR / save_file_name
33 |
34 | remove_old_pipelines(files_to_keep=[save_file_name])
35 | joblib.dump(pipeline_to_persist, save_path)
36 | _logger.info(f'saved pipeline: {save_file_name}')
37 |
38 |
39 | def load_pipeline(*, file_name: str
40 | ) -> Pipeline:
41 | """Load a persisted pipeline."""
42 |
43 | file_path = config.TRAINED_MODEL_DIR / file_name
44 | trained_model = joblib.load(filename=file_path)
45 | return trained_model
46 |
47 |
48 | def remove_old_pipelines(*, files_to_keep: t.List[str]) -> None:
49 | """
50 | Remove old model pipelines.
51 |
52 | This is to ensure there is a simple one-to-one
53 | mapping between the package version and the model
54 | version to be imported and used by other applications.
55 | However, we do also include the immediate previous
56 | pipeline version for differential testing purposes.
57 | """
58 | do_not_delete = files_to_keep + ['__init__.py']
59 | for model_file in config.TRAINED_MODEL_DIR.iterdir():
60 | if model_file.name not in do_not_delete:
61 | model_file.unlink()
62 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/processing/errors.py:
--------------------------------------------------------------------------------
1 | class BaseError(Exception):
2 | """Base package error."""
3 |
4 |
5 | class InvalidModelInputError(BaseError):
6 | """Model input contains an error."""
7 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/processing/features.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.base import BaseEstimator, TransformerMixin
3 |
4 | from regression_model.processing.errors import InvalidModelInputError
5 |
6 |
7 | class LogTransformer(BaseEstimator, TransformerMixin):
8 | """Logarithm transformer."""
9 |
10 | def __init__(self, variables=None):
11 | if not isinstance(variables, list):
12 | self.variables = [variables]
13 | else:
14 | self.variables = variables
15 |
16 | def fit(self, X, y=None):
17 |         # to accommodate the sklearn pipeline
18 | return self
19 |
20 | def transform(self, X):
21 | X = X.copy()
22 |
23 |         # check that the values are strictly positive for the log transform
24 |         if not (X[self.variables] > 0).all().all():
25 |             vars_ = [var for var in self.variables if (X[var] <= 0).any()]
26 | raise InvalidModelInputError(
27 | f"Variables contain zero or negative values, "
28 | f"can't apply log for vars: {vars_}")
29 |
30 | for feature in self.variables:
31 | X[feature] = np.log(X[feature])
32 |
33 | return X
34 |
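35 | # Illustrative failure mode: if any 'GrLivArea' value were 0, the
36 | # transformer would raise InvalidModelInputError rather than let
37 | # np.log emit -inf values.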
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/processing/preprocessors.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.base import BaseEstimator, TransformerMixin
4 |
5 | from regression_model.processing.errors import InvalidModelInputError
6 |
7 |
8 | class CategoricalImputer(BaseEstimator, TransformerMixin):
9 | """Categorical data missing value imputer."""
10 |
11 | def __init__(self, variables=None) -> None:
12 | if not isinstance(variables, list):
13 | self.variables = [variables]
14 | else:
15 | self.variables = variables
16 |
17 | def fit(self, X: pd.DataFrame, y: pd.Series = None
18 | ) -> 'CategoricalImputer':
19 | """Fit statement to accomodate the sklearn pipeline."""
20 |
21 | return self
22 |
23 | def transform(self, X: pd.DataFrame) -> pd.DataFrame:
24 | """Apply the transforms to the dataframe."""
25 |
26 | X = X.copy()
27 | for feature in self.variables:
28 | X[feature] = X[feature].fillna('Missing')
29 |
30 | return X
31 |
32 |
33 | class NumericalImputer(BaseEstimator, TransformerMixin):
34 | """Numerical missing value imputer."""
35 |
36 | def __init__(self, variables=None):
37 | if not isinstance(variables, list):
38 | self.variables = [variables]
39 | else:
40 | self.variables = variables
41 |
42 | def fit(self, X, y=None):
43 | # persist mode in a dictionary
44 | self.imputer_dict_ = {}
45 | for feature in self.variables:
46 | self.imputer_dict_[feature] = X[feature].mode()[0]
47 | return self
48 |
49 | def transform(self, X):
50 | X = X.copy()
51 | for feature in self.variables:
52 |             X[feature] = X[feature].fillna(self.imputer_dict_[feature])
53 | return X
54 |
55 |
56 | class TemporalVariableEstimator(BaseEstimator, TransformerMixin):
57 | """Temporal variable calculator."""
58 |
59 | def __init__(self, variables=None, reference_variable=None):
60 | if not isinstance(variables, list):
61 | self.variables = [variables]
62 | else:
63 | self.variables = variables
64 |
65 |         self.reference_variable = reference_variable
66 |
67 | def fit(self, X, y=None):
68 | # we need this step to fit the sklearn pipeline
69 | return self
70 |
71 | def transform(self, X):
72 | X = X.copy()
73 | for feature in self.variables:
74 |             X[feature] = X[self.reference_variable] - X[feature]
75 |
76 | return X
77 |
78 |
79 | class RareLabelCategoricalEncoder(BaseEstimator, TransformerMixin):
80 |     """Rare label categorical encoder."""
81 |
82 | def __init__(self, tol=0.05, variables=None):
83 | self.tol = tol
84 | if not isinstance(variables, list):
85 | self.variables = [variables]
86 | else:
87 | self.variables = variables
88 |
89 | def fit(self, X, y=None):
90 | # persist frequent labels in dictionary
91 | self.encoder_dict_ = {}
92 |
93 | for var in self.variables:
94 | # the encoder will learn the most frequent categories
95 |             t = pd.Series(X[var].value_counts() / float(len(X)))
96 | # frequent labels:
97 | self.encoder_dict_[var] = list(t[t >= self.tol].index)
98 |
99 | return self
100 |
101 | def transform(self, X):
102 | X = X.copy()
103 | for feature in self.variables:
104 | X[feature] = np.where(X[feature].isin(
105 | self.encoder_dict_[feature]), X[feature], 'Rare')
106 |
107 | return X
108 |
109 |
110 | class CategoricalEncoder(BaseEstimator, TransformerMixin):
111 | """String to numbers categorical encoder."""
112 |
113 | def __init__(self, variables=None):
114 | if not isinstance(variables, list):
115 | self.variables = [variables]
116 | else:
117 | self.variables = variables
118 |
119 | def fit(self, X, y):
120 | temp = pd.concat([X, y], axis=1)
121 | temp.columns = list(X.columns) + ['target']
122 |
123 | # persist transforming dictionary
124 | self.encoder_dict_ = {}
125 |
126 | for var in self.variables:
127 | t = temp.groupby([var])['target'].mean().sort_values(
128 | ascending=True).index
129 | self.encoder_dict_[var] = {k: i for i, k in enumerate(t, 0)}
130 |
131 | return self
132 |
133 | def transform(self, X):
134 | # encode labels
135 | X = X.copy()
136 | for feature in self.variables:
137 | X[feature] = X[feature].map(self.encoder_dict_[feature])
138 |
139 | # check if transformer introduces NaN
140 | if X[self.variables].isnull().any().any():
141 | null_counts = X[self.variables].isnull().any()
142 |             vars_ = {key: value for (key, value) in null_counts.items()
143 |                      if value}
144 |             raise InvalidModelInputError(
145 |                 f'Categorical encoder has introduced NaN when '
146 |                 f'transforming categorical variables: {list(vars_.keys())}')
147 |
148 | return X
149 |
150 |
151 | class DropUnecessaryFeatures(BaseEstimator, TransformerMixin):
152 |     """Drop features that are not required by the model."""
153 | def __init__(self, variables_to_drop=None):
154 | self.variables = variables_to_drop
155 |
156 | def fit(self, X, y=None):
157 | return self
158 |
159 | def transform(self, X):
160 |         # drop the unneeded variables
161 | X = X.copy()
162 | X = X.drop(self.variables, axis=1)
163 |
164 | return X
165 |
--------------------------------------------------------------------------------
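
Usage note: these transformers are written to chain inside a scikit-learn Pipeline; the package's own pipeline.py wires them to the real variable lists in config.py. The sketch below is illustrative only, with made-up variable lists:

    from sklearn.linear_model import Lasso
    from sklearn.pipeline import Pipeline

    from regression_model.processing import preprocessors as pp

    # Stand-ins for the real lists in regression_model/config/config.py.
    CATEGORICAL_VARS = ['BsmtQual', 'FireplaceQu']

    price_pipe = Pipeline([
        ('categorical_imputer', pp.CategoricalImputer(variables=CATEGORICAL_VARS)),
        ('numerical_imputer', pp.NumericalImputer(variables=['LotFrontage'])),
        ('temporal_variable', pp.TemporalVariableEstimator(
            variables=['YearRemodAdd'], reference_variable='YrSold')),
        ('rare_label_encoder', pp.RareLabelCategoricalEncoder(
            tol=0.01, variables=CATEGORICAL_VARS)),
        ('categorical_encoder', pp.CategoricalEncoder(variables=CATEGORICAL_VARS)),
        ('drop_features', pp.DropUnecessaryFeatures(variables_to_drop=['YrSold'])),
        ('linear_model', Lasso(alpha=0.005, random_state=0)),
    ])
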
/packages/regression_model/regression_model/processing/validation.py:
--------------------------------------------------------------------------------
1 | from regression_model.config import config
2 |
3 | import pandas as pd
4 |
5 |
6 | def validate_inputs(input_data: pd.DataFrame) -> pd.DataFrame:
7 | """Check model inputs for unprocessable values."""
8 |
9 | validated_data = input_data.copy()
10 |
11 | # check for numerical variables with NA not seen during training
12 | if input_data[config.NUMERICAL_NA_NOT_ALLOWED].isnull().any().any():
13 | validated_data = validated_data.dropna(
14 | axis=0, subset=config.NUMERICAL_NA_NOT_ALLOWED)
15 |
16 | # check for categorical variables with NA not seen during training
17 | if input_data[config.CATEGORICAL_NA_NOT_ALLOWED].isnull().any().any():
18 | validated_data = validated_data.dropna(
19 | axis=0, subset=config.CATEGORICAL_NA_NOT_ALLOWED)
20 |
21 |     # check for values <= 0 for the log transformed variables
22 |     if (input_data[config.NUMERICALS_LOG_VARS] <= 0).any().any():
23 |         vars_with_neg_values = [var for var in config.NUMERICALS_LOG_VARS
24 |                                 if (input_data[var] <= 0).any()]
25 |         validated_data = validated_data[
26 |             (validated_data[vars_with_neg_values] > 0).all(axis=1)]
27 |
28 | return validated_data
29 |
--------------------------------------------------------------------------------
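
Usage note: a toy demonstration of the row-dropping behaviour of the third check; the config list is replaced by a stand-in here, since the real one lives in config.py:

    import pandas as pd

    NUMERICALS_LOG_VARS = ['LotArea']  # stand-in for config.NUMERICALS_LOG_VARS

    data = pd.DataFrame({'LotArea': [11622, 0, 9600]})

    # Rows holding a value <= 0 in any log-transformed variable are removed,
    # mirroring validate_inputs above.
    validated = data[(data[NUMERICALS_LOG_VARS] > 0).all(axis=1)]
    print(len(validated))  # 2 -- the row with LotArea == 0 is gone
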
/packages/regression_model/regression_model/train_pipeline.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import numpy as np
4 | from sklearn.model_selection import train_test_split
5 |
6 | from regression_model import __version__ as _version
7 | from regression_model import pipeline
8 | from regression_model.config import config
9 | from regression_model.processing.data_management import (
10 |     load_dataset, save_pipeline)
11 |
12 |
13 | _logger = logging.getLogger(__name__)
14 |
15 |
16 | def run_training() -> None:
17 | """Train the model."""
18 |
19 | # read training data
20 | data = load_dataset(file_name=config.TRAINING_DATA_FILE)
21 |
22 | # divide train and test
23 | X_train, X_test, y_train, y_test = train_test_split(
24 | data[config.FEATURES],
25 | data[config.TARGET],
26 | test_size=0.1,
27 | random_state=0) # we are setting the seed here
28 |
29 | # transform the target
30 | y_train = np.log(y_train)
31 | y_test = np.log(y_test)
32 |
33 | pipeline.price_pipe.fit(X_train[config.FEATURES],
34 | y_train)
35 |
36 | _logger.info(f'saving model version: {_version}')
37 | save_pipeline(pipeline_to_persist=pipeline.price_pipe)
38 |
39 |
40 | if __name__ == '__main__':
41 | run_training()
42 |
--------------------------------------------------------------------------------
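
A note on the target transform: run_training fits the pipeline on np.log(y_train), so raw pipeline outputs are log-prices and must be exponentiated before being reported as sale prices. The package's predict module is expected to apply this inverse transform; the snippet below is only a sketch of the arithmetic:

    import numpy as np

    # Raw pipeline output is in log-space because training used np.log(y).
    log_prediction = 11.63
    price = np.exp(log_prediction)  # back on the original price scale, roughly 112k
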
/packages/regression_model/regression_model/trained_models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/regression_model/regression_model/trained_models/__init__.py
--------------------------------------------------------------------------------
/packages/regression_model/requirements.txt:
--------------------------------------------------------------------------------
1 | # production requirements
2 | numpy==1.15.4
3 | scikit-learn==0.20.2
4 | pandas==0.23.4
5 |
6 | # packaging
7 | setuptools==40.6.3
8 | wheel==0.32.3
9 |
10 | # testing requirements
11 | pytest>=4.6.6,<5.0.0
12 |
13 | # fetching datasets
14 | kaggle==1.5.1.1
15 |
--------------------------------------------------------------------------------
/packages/regression_model/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import io
5 | import os
6 | from pathlib import Path
7 |
8 | from setuptools import find_packages, setup
9 |
10 |
11 | # Package meta-data.
12 | NAME = 'regression_model'
13 | DESCRIPTION = 'Train and deploy regression model.'
14 | URL = 'your github project'
15 | EMAIL = 'your_email@email.com'
16 | AUTHOR = 'Your name'
17 | REQUIRES_PYTHON = '>=3.6.0'
18 |
19 |
20 | # What packages are required for this module to be executed?
21 | def list_reqs(fname='requirements.txt'):
22 | with open(fname) as fd:
23 | return fd.read().splitlines()
24 |
25 |
26 | # The rest you shouldn't have to touch too much :)
27 | # ------------------------------------------------
28 | # Except, perhaps the License and Trove Classifiers!
29 | # If you do change the License, remember to change the
30 | # Trove Classifier for that!
31 |
32 | here = os.path.abspath(os.path.dirname(__file__))
33 |
34 | # Import the README and use it as the long-description.
35 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file!
36 | try:
37 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
38 | long_description = '\n' + f.read()
39 | except FileNotFoundError:
40 | long_description = DESCRIPTION
41 |
42 |
43 | # Load the package's __version__.py module as a dictionary.
44 | ROOT_DIR = Path(__file__).resolve().parent
45 | PACKAGE_DIR = ROOT_DIR / NAME
46 | about = {}
47 | with open(PACKAGE_DIR / 'VERSION') as f:
48 | _version = f.read().strip()
49 | about['__version__'] = _version
50 |
51 |
52 | # Where the magic happens:
53 | setup(
54 | name=NAME,
55 | version=about['__version__'],
56 | description=DESCRIPTION,
57 | long_description=long_description,
58 | long_description_content_type='text/markdown',
59 | author=AUTHOR,
60 | author_email=EMAIL,
61 | python_requires=REQUIRES_PYTHON,
62 | url=URL,
63 | packages=find_packages(exclude=('tests',)),
64 | package_data={'regression_model': ['VERSION']},
65 | install_requires=list_reqs(),
66 | extras_require={},
67 | include_package_data=True,
68 | license='MIT',
69 | classifiers=[
70 | # Trove classifiers
71 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
72 | 'License :: OSI Approved :: MIT License',
73 | 'Programming Language :: Python',
74 | 'Programming Language :: Python :: 3',
75 | 'Programming Language :: Python :: 3.6',
76 | 'Programming Language :: Python :: Implementation :: CPython',
77 | 'Programming Language :: Python :: Implementation :: PyPy'
78 | ],
79 | )
80 |
--------------------------------------------------------------------------------
/packages/regression_model/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/regression_model/tests/__init__.py
--------------------------------------------------------------------------------
/packages/regression_model/tests/test_predict.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | from regression_model.predict import make_prediction
4 | from regression_model.processing.data_management import load_dataset
5 |
6 |
7 | def test_make_single_prediction():
8 | # Given
9 | test_data = load_dataset(file_name='test.csv')
10 | single_test_input = test_data[0:1]
11 |
12 | # When
13 | subject = make_prediction(input_data=single_test_input)
14 |
15 | # Then
16 | assert subject is not None
17 | assert isinstance(subject.get('predictions')[0], float)
18 | assert math.ceil(subject.get('predictions')[0]) == 112476
19 |
20 |
21 | def test_make_multiple_predictions():
22 | # Given
23 | test_data = load_dataset(file_name='test.csv')
24 | original_data_length = len(test_data)
25 | multiple_test_input = test_data
26 |
27 | # When
28 | subject = make_prediction(input_data=multiple_test_input)
29 |
30 | # Then
31 | assert subject is not None
32 | assert len(subject.get('predictions')) == 1451
33 |
34 | # We expect some rows to be filtered out
35 | assert len(subject.get('predictions')) != original_data_length
36 |
--------------------------------------------------------------------------------
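
A note on the tests: both functions above reload test.csv independently; a pytest fixture is the idiomatic way to factor that out. A refactoring sketch (not the repo's conftest.py):

    import math

    import pytest

    from regression_model.predict import make_prediction
    from regression_model.processing.data_management import load_dataset


    @pytest.fixture()
    def test_data():
        # Loaded once per test that requests the fixture.
        return load_dataset(file_name='test.csv')


    def test_make_single_prediction(test_data):
        subject = make_prediction(input_data=test_data[0:1])
        assert math.ceil(subject.get('predictions')[0]) == 112476
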
/requirements.txt:
--------------------------------------------------------------------------------
1 | -r packages/ml_api/requirements.txt
2 |
--------------------------------------------------------------------------------
/scripts/fetch_kaggle_dataset.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | kaggle competitions download -c house-prices-advanced-regression-techniques -p packages/regression_model/regression_model/datasets/
--------------------------------------------------------------------------------
/scripts/fetch_kaggle_large_dataset.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | TRAINING_DATA_URL="vbookshelf/v2-plant-seedlings-dataset"
4 | NOW=$(date)
5 |
6 | kaggle datasets download -d $TRAINING_DATA_URL -p packages/neural_network_model/neural_network_model/datasets/ && \
7 | unzip packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset.zip -d packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset && \
8 | echo $TRAINING_DATA_URL 'retrieved on:' $NOW > packages/neural_network_model/neural_network_model/datasets/training_data_reference.txt && \
9 | mkdir -p "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherds Purse" && \
10 | mv -v "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherd’s Purse/"* "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherds Purse" && \
11 | rm -rf "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherd’s Purse"
--------------------------------------------------------------------------------
/scripts/input_test.json:
--------------------------------------------------------------------------------
1 | [{
2 | "Id": 1461,
3 | "MSSubClass": 20,
4 | "MSZoning": "RH",
5 | "LotFrontage": 80.0,
6 | "LotArea": 11622,
7 | "Street": "Pave",
8 | "Alley": null,
9 | "LotShape": "Reg",
10 | "LandContour": "Lvl",
11 | "Utilities": "AllPub",
12 | "LotConfig": "Inside",
13 | "LandSlope": "Gtl",
14 | "Neighborhood": "NAmes",
15 | "Condition1": "Feedr",
16 | "Condition2": "Norm",
17 | "BldgType": "1Fam",
18 | "HouseStyle": "1Story",
19 | "OverallQual": 5,
20 | "OverallCond": 6,
21 | "YearBuilt": 1961,
22 | "YearRemodAdd": 1961,
23 | "RoofStyle": "Gable",
24 | "RoofMatl": "CompShg",
25 | "Exterior1st": "VinylSd",
26 | "Exterior2nd": "VinylSd",
27 | "MasVnrType": "None",
28 | "MasVnrArea": 0.0,
29 | "ExterQual": "TA",
30 | "ExterCond": "TA",
31 | "Foundation": "CBlock",
32 | "BsmtQual": "TA",
33 | "BsmtCond": "TA",
34 | "BsmtExposure": "No",
35 | "BsmtFinType1": "Rec",
36 | "BsmtFinSF1": 468.0,
37 | "BsmtFinType2": "LwQ",
38 | "BsmtFinSF2": 144.0,
39 | "BsmtUnfSF": 270.0,
40 | "TotalBsmtSF": 882.0,
41 | "Heating": "GasA",
42 | "HeatingQC": "TA",
43 | "CentralAir": "Y",
44 | "Electrical": "SBrkr",
45 | "1stFlrSF": 896,
46 | "2ndFlrSF": 0,
47 | "LowQualFinSF": 0,
48 | "GrLivArea": 896,
49 | "BsmtFullBath": 0.0,
50 | "BsmtHalfBath": 0.0,
51 | "FullBath": 1,
52 | "HalfBath": 0,
53 | "BedroomAbvGr": 2,
54 | "KitchenAbvGr": 1,
55 | "KitchenQual": "TA",
56 | "TotRmsAbvGrd": 5,
57 | "Functional": "Typ",
58 | "Fireplaces": 0,
59 | "FireplaceQu": null,
60 | "GarageType": "Attchd",
61 | "GarageYrBlt": 1961.0,
62 | "GarageFinish": "Unf",
63 | "GarageCars": 1.0,
64 | "GarageArea": 730.0,
65 | "GarageQual": "TA",
66 | "GarageCond": "TA",
67 | "PavedDrive": "Y",
68 | "WoodDeckSF": 140,
69 | "OpenPorchSF": 0,
70 | "EnclosedPorch": 0,
71 | "3SsnPorch": 0,
72 | "ScreenPorch": 120,
73 | "PoolArea": 0,
74 | "PoolQC": null,
75 | "Fence": "MnPrv",
76 | "MiscFeature": null,
77 | "MiscVal": 0,
78 | "MoSold": 6,
79 | "YrSold": 2010,
80 | "SaleType": "WD",
81 | "SaleCondition": "Normal"
82 | }]
--------------------------------------------------------------------------------
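
Usage note: this payload is a single row from the Kaggle test set and can be posted to the running ml_api service. A sketch using requests; the endpoint path and port are assumptions and should be checked against the API's controller:

    import json

    import requests

    with open('scripts/input_test.json') as f:
        payload = json.load(f)

    # Endpoint and port are assumptions -- verify against api/controller.py.
    response = requests.post(
        'http://localhost:5000/v1/predict/regression',
        json=payload,
    )
    print(response.status_code, response.json())
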
/scripts/publish_model.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Building packages and uploading them to a Gemfury repository
4 |
5 | GEMFURY_URL=$GEMFURY_PUSH_URL
6 |
7 | set -e
8 |
9 | DIRS="$@"
10 | BASE_DIR=$(pwd)
11 | SETUP="setup.py"
12 |
13 | warn() {
14 | echo "$@" 1>&2
15 | }
16 |
17 | die() {
18 | warn "$@"
19 | exit 1
20 | }
21 |
22 | build() {
23 | DIR="${1/%\//}"
24 | echo "Checking directory $DIR"
25 | cd "$BASE_DIR/$DIR"
26 | [ ! -e $SETUP ] && warn "No $SETUP file, skipping" && return
27 | PACKAGE_NAME=$(python $SETUP --fullname)
28 | echo "Package $PACKAGE_NAME"
29 | python "$SETUP" sdist bdist_wheel || die "Building package $PACKAGE_NAME failed"
30 | for X in $(ls dist)
31 | do
32 | curl -F package=@"dist/$X" "$GEMFURY_URL" || die "Uploading package $PACKAGE_NAME failed on file dist/$X"
33 | done
34 | }
35 |
36 | if [ -n "$DIRS" ]; then
37 | for dir in $DIRS; do
38 |         build "$dir"
39 | done
40 | else
41 | ls -d */ | while read dir; do
42 |         build "$dir"
43 | done
44 | fi
--------------------------------------------------------------------------------