├── .circleci └── config.yml ├── .dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE ├── Makefile ├── Procfile ├── README.md ├── jupyter_notebooks ├── Section12_DeepLearningModel │ └── CNN_Analysis_and Model.ipynb ├── Section2_MLPipelineOverview │ ├── 02.10_ML_Pipeline-WrapUp_for_Deployment.ipynb │ ├── 02.6_ML_Pipeline_Step1-DataAnalysis.ipynb │ ├── 02.7_ML_Pipeline_Step2-FeatureEngineering.ipynb │ ├── 02.8_ML_Pipeline_Step3-FeatureSelection.ipynb │ ├── 02.9_ML_Pipeline_Step4-MachineLearningModelBuild.ipynb │ └── BONUS_Randomisation_in_ML _and_setting_the_seed.ipynb └── requirements.txt ├── packages ├── ml_api │ ├── VERSION │ ├── api │ │ ├── __init__.py │ │ ├── app.py │ │ ├── config.py │ │ ├── controller.py │ │ └── validation.py │ ├── diff_test_requirements.txt │ ├── requirements.txt │ ├── run.py │ ├── run.sh │ ├── test_data_predictions.csv │ └── tests │ │ ├── __init__.py │ │ ├── capture_model_predictions.py │ │ ├── conftest.py │ │ ├── differential_tests │ │ ├── __init__.py │ │ └── test_differential.py │ │ ├── test_controller.py │ │ └── test_validation.py ├── neural_network_model │ ├── MANIFEST.in │ ├── config.yml │ ├── neural_network_model │ │ ├── VERSION │ │ ├── __init__.py │ │ ├── config │ │ │ ├── __init__.py │ │ │ └── config.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ └── test_data │ │ │ │ ├── Black-grass │ │ │ │ └── 1.png │ │ │ │ ├── Charlock │ │ │ │ └── 1.png │ │ │ │ └── __init__.py │ │ ├── model.py │ │ ├── pipeline.py │ │ ├── predict.py │ │ ├── processing │ │ │ ├── __init__.py │ │ │ ├── data_management.py │ │ │ ├── errors.py │ │ │ └── preprocessors.py │ │ ├── train_pipeline.py │ │ └── trained_models │ │ │ └── __init__.py │ ├── requirements.txt │ ├── setup.py │ └── tests │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── test_predict.py └── regression_model │ ├── MANIFEST.in │ ├── regression_model │ ├── VERSION │ ├── __init__.py │ ├── config │ │ ├── __init__.py │ │ ├── config.py │ │ └── logging_config.py │ ├── datasets │ │ └── __init__.py │ ├── pipeline.py │ ├── predict.py │ ├── processing │ │ ├── __init__.py │ │ ├── data_management.py │ │ ├── errors.py │ │ ├── features.py │ │ ├── preprocessors.py │ │ └── validation.py │ ├── train_pipeline.py │ └── trained_models │ │ └── __init__.py │ ├── requirements.txt │ ├── setup.py │ └── tests │ ├── __init__.py │ └── test_predict.py ├── requirements.txt └── scripts ├── fetch_kaggle_dataset.sh ├── fetch_kaggle_large_dataset.sh ├── input_test.json └── publish_model.sh /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | defaults: &defaults 4 | docker: 5 | - image: circleci/python:3.7.2 6 | working_directory: ~/project 7 | 8 | prepare_venv: &prepare_venv 9 | run: 10 | name: Create venv 11 | command: | 12 | python3 -m venv venv 13 | source venv/bin/activate 14 | pip install --upgrade pip 15 | 16 | fetch_data: &fetch_data 17 | run: 18 | name: Set script permissions and fetch data 19 | command: | 20 | source venv/bin/activate 21 | chmod +x ./scripts/fetch_kaggle_dataset.sh 22 | ./scripts/fetch_kaggle_dataset.sh 23 | 24 | jobs: 25 | test_regression_model: 26 | <<: *defaults 27 | steps: 28 | - checkout 29 | - *prepare_venv 30 | - run: 31 | name: Install requirements 32 | command: | 33 | . venv/bin/activate 34 | pip install -r packages/regression_model/requirements.txt 35 | - *fetch_data 36 | - run: 37 | name: Train model 38 | command: | 39 | . 
venv/bin/activate 40 | PYTHONPATH=./packages/regression_model python3 packages/regression_model/regression_model/train_pipeline.py 41 | - run: 42 | name: Run tests 43 | command: | 44 | . venv/bin/activate 45 | py.test -vv packages/regression_model/tests 46 | 47 | test_ml_api: 48 | <<: *defaults 49 | steps: 50 | - checkout 51 | - restore_cache: 52 | keys: 53 | - py-deps-{{ checksum "packages/ml_api/requirements.txt" }} 54 | - run: 55 | name: Running tests 56 | command: | 57 | python3 -m venv venv 58 | . venv/bin/activate 59 | pip install --upgrade pip 60 | pip install -r packages/ml_api/requirements.txt 61 | py.test -vv packages/ml_api/tests -m "not differential" 62 | - save_cache: 63 | key: py-deps-{{ checksum "packages/ml_api/requirements.txt" }} 64 | paths: 65 | - "/venv" 66 | 67 | train_and_upload_regression_model: 68 | <<: *defaults 69 | steps: 70 | - checkout 71 | - *prepare_venv 72 | - run: 73 | name: Install requirements 74 | command: | 75 | . venv/bin/activate 76 | pip install -r packages/regression_model/requirements.txt 77 | - *fetch_data 78 | - run: 79 | name: Train model 80 | command: | 81 | . venv/bin/activate 82 | PYTHONPATH=./packages/regression_model python3 packages/regression_model/regression_model/train_pipeline.py 83 | - run: 84 | name: Publish model to Gemfury 85 | command: | 86 | . venv/bin/activate 87 | chmod +x ./scripts/publish_model.sh 88 | ./scripts/publish_model.sh ./packages/regression_model/ 89 | 90 | section_9_differential_tests: 91 | <<: *defaults 92 | steps: 93 | - checkout 94 | - *prepare_venv 95 | - run: 96 | name: Capturing previous model predictions 97 | command: | 98 | . venv/bin/activate 99 | pip install -r packages/ml_api/diff_test_requirements.txt 100 | PYTHONPATH=./packages/ml_api python3 packages/ml_api/tests/capture_model_predictions.py 101 | - run: 102 | name: Running differential tests 103 | command: | 104 | .
venv/bin/activate 105 | pip install -r packages/ml_api/requirements.txt 106 | py.test -vv packages/ml_api/tests -m differential 107 | 108 | section_10_deploy_to_heroku: 109 | <<: *defaults 110 | steps: 111 | - checkout 112 | - run: 113 | name: Deploy to Heroku 114 | command: | 115 | git push https://heroku:$HEROKU_API_KEY@git.heroku.com/$HEROKU_APP_NAME.git master 116 | 117 | section_11_build_and_push_to_heroku_docker: 118 | <<: *defaults 119 | steps: 120 | - checkout 121 | - setup_remote_docker: 122 | docker_layer_caching: true 123 | - run: docker login --username=$HEROKU_EMAIL --password=$HEROKU_API_KEY registry.heroku.com 124 | - run: 125 | name: Setup Heroku CLI 126 | command: | 127 | wget -qO- https://cli-assets.heroku.com/install-ubuntu.sh | sh 128 | - run: 129 | name: Build and Push Image 130 | command: | 131 | make build-ml-api-heroku push-ml-api-heroku 132 | - run: 133 | name: Release to Heroku 134 | command: | 135 | heroku container:release web --app $HEROKU_APP_NAME 136 | 137 | section_12_publish_docker_image_to_aws: 138 | <<: *defaults 139 | working_directory: ~/project/packages/ml_models 140 | steps: 141 | - checkout 142 | - setup_remote_docker 143 | - run: 144 | name: Publishing docker image to aws ECR 145 | command: | 146 | sudo pip install awscli 147 | eval $(aws ecr get-login --no-include-email --region us-east-1) 148 | make build-ml-api-aws tag-ml-api push-ml-api-aws 149 | aws ecs update-service --cluster ml-api-cluster --service custom-service --task-definition first-run-task-definition --force-new-deployment 150 | 151 | section_13_train_and_upload_neural_network_model: 152 | docker: 153 | - image: circleci/python:3.6.4-stretch 154 | working_directory: ~/project 155 | steps: 156 | - checkout 157 | - *prepare_venv 158 | - run: 159 | name: Install requirements 160 | command: | 161 | . venv/bin/activate 162 | pip install -r packages/neural_network_model/requirements.txt 163 | - run: 164 | name: Fetch Training data - 2GB 165 | command: | 166 | . venv/bin/activate 167 | chmod +x ./scripts/fetch_kaggle_large_dataset.sh 168 | ./scripts/fetch_kaggle_large_dataset.sh 169 | - run: 170 | name: Train model 171 | command: | 172 | . venv/bin/activate 173 | PYTHONPATH=./packages/neural_network_model python3 packages/neural_network_model/neural_network_model/train_pipeline.py 174 | - run: 175 | name: Publish model to Gemfury 176 | command: | 177 | . 
venv/bin/activate 178 | chmod +x ./scripts/publish_model.sh 179 | ./scripts/publish_model.sh ./packages/neural_network_model/ 180 | 181 | workflows: 182 | version: 2 183 | test-all: 184 | jobs: 185 | - test_regression_model 186 | - test_ml_api 187 | - section_9_differential_tests 188 | - train_and_upload_regression_model: 189 | requires: 190 | - test_regression_model 191 | - test_ml_api 192 | - section_9_differential_tests 193 | filters: 194 | branches: 195 | only: 196 | - master 197 | # - section_10_deploy_to_heroku: 198 | # requires: 199 | # - train_and_upload_regression_model 200 | # filters: 201 | # branches: 202 | # only: 203 | # - master 204 | - section_11_build_and_push_to_heroku_docker: 205 | requires: 206 | - train_and_upload_regression_model 207 | filters: 208 | branches: 209 | only: 210 | - master 211 | # - section_12_publish_docker_image_to_aws: 212 | # requires: 213 | # - train_and_upload_regression_model 214 | # filters: 215 | # branches: 216 | # only: 217 | # - master 218 | - section_13_train_and_upload_neural_network_model: 219 | requires: 220 | - test_regression_model 221 | - test_ml_api 222 | - section_9_differential_tests 223 | # - train_and_upload_regression_model 224 | # filters: 225 | # branches: 226 | # only: 227 | # - master 228 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | jupyter_notebooks* 2 | */env* 3 | */venv* 4 | .circleci* 5 | packages/regression_model 6 | *.env 7 | *.log 8 | .git 9 | .gitignore -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # pycharm 107 | .idea/ 108 | 109 | # datafiles 110 | train.csv 111 | test.csv 112 | test_data_predictions.csv 113 | v2-plant-seedlings-dataset/ 114 | v2-plant-seedlings-dataset.zip 115 | 116 | # all logs 117 | logs/ 118 | 119 | # trained models (will be created in CI) 120 | packages/regression_model/regression_model/trained_models/*.pkl 121 | packages/neural_network_model/neural_network_model/trained_models/*.pkl 122 | packages/neural_network_model/neural_network_model/trained_models/*.h5 123 | *.h5 124 | packages/neural_network_model/neural_network_model/datasets/training_data_reference.txt 125 | 126 | .DS_Store 127 | 128 | kaggle.json 129 | packages/ml_api/uploads/* 130 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6.4 2 | 3 | # Create the user that will run the app 4 | RUN adduser --disabled-password --gecos '' ml-api-user 5 | 6 | WORKDIR /opt/ml_api 7 | 8 | ARG PIP_EXTRA_INDEX_URL 9 | ENV FLASK_APP run.py 10 | 11 | # Install requirements, including from Gemfury 12 | ADD ./packages/ml_api /opt/ml_api/ 13 | RUN pip install --upgrade pip 14 | RUN pip install -r /opt/ml_api/requirements.txt 15 | 16 | RUN chmod +x /opt/ml_api/run.sh 17 | RUN chown -R ml-api-user:ml-api-user ./ 18 | 19 | USER ml-api-user 20 | 21 | EXPOSE 5000 22 | 23 | CMD ["bash", "./run.sh"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Soledad Galli and Christopher Samiullah. Deployment of Machine Learning Models, online course. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. 
Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | NAME=udemy-ml-api 2 | COMMIT_ID=$(shell git rev-parse HEAD) 3 | 4 | 5 | build-ml-api-heroku: 6 | docker build --build-arg PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -t registry.heroku.com/$(NAME)/web:$(COMMIT_ID) . 7 | 8 | push-ml-api-heroku: 9 | docker push registry.heroku.com/${HEROKU_APP_NAME}/web:$(COMMIT_ID) 10 | 11 | build-ml-api-aws: 12 | docker build --build-arg PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -t $(NAME):$(COMMIT_ID) . 13 | 14 | push-ml-api-aws: 15 | docker push ${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/$(NAME):$(COMMIT_ID) 16 | 17 | tag-ml-api: 18 | docker tag $(NAME):$(COMMIT_ID) ${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/$(NAME):$(COMMIT_ID) 19 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: gunicorn --pythonpath packages/ml_api --access-logfile - --error-logfile - run:application -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deployment of Machine Learning Models 2 | Accompanying repo for the online course Deployment of Machine Learning Models. 3 | 4 | For the documentation, visit the [course on Udemy](https://www.udemy.com/deployment-of-machine-learning-models/?couponCode=TIDREPO). 5 | -------------------------------------------------------------------------------- /jupyter_notebooks/Section2_MLPipelineOverview/02.10_ML_Pipeline-WrapUp_for_Deployment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Machine Learning Model Building Pipeline: Wrapping up for Deployment\n", 8 | "\n", 9 | "\n", 10 | "In the previous lectures, we worked through the typical Machine Learning pipeline to build a regression model that allows us to predict house prices. Briefly, we transformed variables in the dataset to make them suitable for use in a Regression model, then we selected the most predictive variables and finally we built our model.\n", 11 | "\n", 12 | "Now, we want to deploy our model. 
We want to create an API that we can call with new data, that is, with the characteristics of new houses, to get an estimate of the SalePrice. In order to do so, we need to write code in a very specific way. We will show you how to write production code in the coming lectures.\n", 13 | "\n", 14 | "Here, we will summarise the key pieces of code that we need to take forward, for this particular project, to put our model in production.\n", 15 | "\n", 16 | "Let's go ahead and get started." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Setting the seed\n", 24 | "\n", 25 | "It is important to note that we are engineering variables and pre-processing data with the idea of deploying the model if we find business value in it. Therefore, from now on, for each step that includes some element of randomness, it is extremely important that we **set the seed**. This way, we can obtain reproducibility between our research and our development code.\n", 26 | "\n", 27 | "This is perhaps one of the most important lessons that you need to take away from this course: **Always set the seeds**.\n", 28 | "\n", 29 | "Let's go ahead and load the dataset." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 1, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "# to handle datasets\n", 41 | "import pandas as pd\n", 42 | "import numpy as np\n", 43 | "\n", 44 | "# to divide train and test set\n", 45 | "from sklearn.model_selection import train_test_split\n", 46 | "\n", 47 | "# feature scaling\n", 48 | "from sklearn.preprocessing import MinMaxScaler\n", 49 | "\n", 50 | "# to build the models\n", 51 | "from sklearn.linear_model import Lasso\n", 52 | "\n", 53 | "# to evaluate the models\n", 54 | "from sklearn.metrics import mean_squared_error\n", 55 | "from math import sqrt\n", 56 | "\n", 57 | "# to persist the model and the scaler\n", 58 | "from sklearn.externals import joblib\n", 59 | "\n", 60 | "# to visualise all the columns in the dataframe\n", 61 | "pd.pandas.set_option('display.max_columns', None)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## Load data\n", 69 | "\n", 70 | "We need the training data to train our model in the production environment. " 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 2, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "(1460, 81)\n" 83 | ] 84 | }, 85 | { 86 | "data": { 87 | "text/html": [ 88 | "
[HTML rendering of the data.head() table stripped during extraction; the same preview follows as text/plain]\n"
" 613 | ], 614 | "text/plain": [ 615 | " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", 616 | "0 1 60 RL 65.0 8450 Pave NaN Reg \n", 617 | "1 2 20 RL 80.0 9600 Pave NaN Reg \n", 618 | "2 3 60 RL 68.0 11250 Pave NaN IR1 \n", 619 | "3 4 70 RL 60.0 9550 Pave NaN IR1 \n", 620 | "4 5 60 RL 84.0 14260 Pave NaN IR1 \n", 621 | "\n", 622 | " LandContour Utilities LotConfig LandSlope Neighborhood Condition1 \\\n", 623 | "0 Lvl AllPub Inside Gtl CollgCr Norm \n", 624 | "1 Lvl AllPub FR2 Gtl Veenker Feedr \n", 625 | "2 Lvl AllPub Inside Gtl CollgCr Norm \n", 626 | "3 Lvl AllPub Corner Gtl Crawfor Norm \n", 627 | "4 Lvl AllPub FR2 Gtl NoRidge Norm \n", 628 | "\n", 629 | " Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt \\\n", 630 | "0 Norm 1Fam 2Story 7 5 2003 \n", 631 | "1 Norm 1Fam 1Story 6 8 1976 \n", 632 | "2 Norm 1Fam 2Story 7 5 2001 \n", 633 | "3 Norm 1Fam 2Story 7 5 1915 \n", 634 | "4 Norm 1Fam 2Story 8 5 2000 \n", 635 | "\n", 636 | " YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType \\\n", 637 | "0 2003 Gable CompShg VinylSd VinylSd BrkFace \n", 638 | "1 1976 Gable CompShg MetalSd MetalSd None \n", 639 | "2 2002 Gable CompShg VinylSd VinylSd BrkFace \n", 640 | "3 1970 Gable CompShg Wd Sdng Wd Shng None \n", 641 | "4 2000 Gable CompShg VinylSd VinylSd BrkFace \n", 642 | "\n", 643 | " MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure \\\n", 644 | "0 196.0 Gd TA PConc Gd TA No \n", 645 | "1 0.0 TA TA CBlock Gd TA Gd \n", 646 | "2 162.0 Gd TA PConc Gd TA Mn \n", 647 | "3 0.0 TA TA BrkTil TA Gd No \n", 648 | "4 350.0 Gd TA PConc Gd TA Av \n", 649 | "\n", 650 | " BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF \\\n", 651 | "0 GLQ 706 Unf 0 150 856 \n", 652 | "1 ALQ 978 Unf 0 284 1262 \n", 653 | "2 GLQ 486 Unf 0 434 920 \n", 654 | "3 ALQ 216 Unf 0 540 756 \n", 655 | "4 GLQ 655 Unf 0 490 1145 \n", 656 | "\n", 657 | " Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF \\\n", 658 | "0 GasA Ex Y SBrkr 856 854 0 \n", 659 | "1 GasA Ex Y SBrkr 1262 0 0 \n", 660 | "2 GasA Ex Y SBrkr 920 866 0 \n", 661 | "3 GasA Gd Y SBrkr 961 756 0 \n", 662 | "4 GasA Ex Y SBrkr 1145 1053 0 \n", 663 | "\n", 664 | " GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr \\\n", 665 | "0 1710 1 0 2 1 3 \n", 666 | "1 1262 0 1 2 0 3 \n", 667 | "2 1786 1 0 2 1 3 \n", 668 | "3 1717 1 0 1 0 3 \n", 669 | "4 2198 1 0 2 1 4 \n", 670 | "\n", 671 | " KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu \\\n", 672 | "0 1 Gd 8 Typ 0 NaN \n", 673 | "1 1 TA 6 Typ 1 TA \n", 674 | "2 1 Gd 6 Typ 1 TA \n", 675 | "3 1 Gd 7 Typ 1 Gd \n", 676 | "4 1 Gd 9 Typ 1 TA \n", 677 | "\n", 678 | " GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual \\\n", 679 | "0 Attchd 2003.0 RFn 2 548 TA \n", 680 | "1 Attchd 1976.0 RFn 2 460 TA \n", 681 | "2 Attchd 2001.0 RFn 2 608 TA \n", 682 | "3 Detchd 1998.0 Unf 3 642 TA \n", 683 | "4 Attchd 2000.0 RFn 3 836 TA \n", 684 | "\n", 685 | " GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch \\\n", 686 | "0 TA Y 0 61 0 0 \n", 687 | "1 TA Y 298 0 0 0 \n", 688 | "2 TA Y 0 42 0 0 \n", 689 | "3 TA Y 0 35 272 0 \n", 690 | "4 TA Y 192 84 0 0 \n", 691 | "\n", 692 | " ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold \\\n", 693 | "0 0 0 NaN NaN NaN 0 2 2008 \n", 694 | "1 0 0 NaN NaN NaN 0 5 2007 \n", 695 | "2 0 0 NaN NaN NaN 0 9 2008 \n", 696 | "3 0 0 NaN NaN NaN 0 2 2006 \n", 697 | "4 0 0 NaN NaN NaN 0 12 2008 \n", 698 | "\n", 
699 | " SaleType SaleCondition SalePrice \n", 700 | "0 WD Normal 208500 \n", 701 | "1 WD Normal 181500 \n", 702 | "2 WD Normal 223500 \n", 703 | "3 WD Abnorml 140000 \n", 704 | "4 WD Normal 250000 " 705 | ] 706 | }, 707 | "execution_count": 2, 708 | "metadata": {}, 709 | "output_type": "execute_result" 710 | } 711 | ], 712 | "source": [ 713 | "# load dataset\n", 714 | "data = pd.read_csv('houseprice.csv')\n", 715 | "print(data.shape)\n", 716 | "data.head()" 717 | ] 718 | }, 719 | { 720 | "cell_type": "markdown", 721 | "metadata": {}, 722 | "source": [ 723 | "## Separate dataset into train and test\n", 724 | "\n", 725 | "Before beginning to engineer our features, it is important to separate our data into training and testing sets. This is to avoid over-fitting. There is an element of randomness in dividing the dataset, so remember to set the seed." 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": 3, 731 | "metadata": {}, 732 | "outputs": [ 733 | { 734 | "data": { 735 | "text/plain": [ 736 | "((1314, 81), (146, 81))" 737 | ] 738 | }, 739 | "execution_count": 3, 740 | "metadata": {}, 741 | "output_type": "execute_result" 742 | } 743 | ], 744 | "source": [ 745 | "# Let's separate into train and test set\n", 746 | "# Remember to set the seed (random_state for this sklearn function)\n", 747 | "\n", 748 | "X_train, X_test, y_train, y_test = train_test_split(data, data.SalePrice,\n", 749 | " test_size=0.1,\n", 750 | " random_state=0) # we are setting the seed here\n", 751 | "X_train.shape, X_test.shape" 752 | ] 753 | }, 754 | { 755 | "cell_type": "markdown", 756 | "metadata": {}, 757 | "source": [ 758 | "## Selected features\n", 759 | "\n", 760 | "Remember that we will deploy our model utilising only a subset of features, the most predictive ones. This is to make simpler models, and hence simpler code for deployment. We will tell you more about this in coming lectures."
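The `selected_features.csv` file loaded in the next cell is produced during the feature selection step (notebook 02.8). As a minimal sketch of how such a list is typically persisted there, assuming a `SelectFromModel` wrapper around the same Lasso used later in this notebook (the alpha value and the `X_train`/`y_train` names are assumptions, not the course's verbatim code):

```python
# Sketch (assumptions noted above): persisting the selected feature list in the
# feature selection notebook, so this notebook can read it back with header=None.
import pandas as pd
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

# assumption: X_train / y_train are the engineered, scaled train set and target
sel_ = SelectFromModel(Lasso(alpha=0.005, random_state=0))  # remember the seed
sel_.fit(X_train, y_train)

# Lasso shrinks the coefficients of non-informative variables to exactly zero,
# so get_support() flags only the features that were kept
selected_feats = X_train.columns[sel_.get_support()]

# persist without header or index, matching the header=None read below
pd.Series(selected_feats).to_csv('selected_features.csv', index=False, header=False)
```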
761 | ] 762 | }, 763 | { 764 | "cell_type": "code", 765 | "execution_count": 4, 766 | "metadata": {}, 767 | "outputs": [ 768 | { 769 | "name": "stdout", 770 | "output_type": "stream", 771 | "text": [ 772 | "Number of features: 23\n" 773 | ] 774 | } 775 | ], 776 | "source": [ 777 | "# load selected features\n", 778 | "features = pd.read_csv('selected_features.csv', header=None)\n", 779 | "\n", 780 | "# Remember that I added the extra feature, to show you how to put\n", 781 | "# an additional feature engineering step into production\n", 782 | "features = [x for x in features[0]] + ['LotFrontage']\n", 783 | "print('Number of features: ', len(features))" 784 | ] 785 | }, 786 | { 787 | "cell_type": "markdown", 788 | "metadata": {}, 789 | "source": [ 790 | "### Missing values\n", 791 | "\n", 792 | "For categorical variables, we will fill missing information by adding an additional category: \"Missing\"" 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": 5, 798 | "metadata": { 799 | "scrolled": true 800 | }, 801 | "outputs": [ 802 | { 803 | "name": "stdout", 804 | "output_type": "stream", 805 | "text": [ 806 | "MasVnrType 0.005 % missing values\n", 807 | "BsmtQual 0.024 % missing values\n", 808 | "BsmtExposure 0.025 % missing values\n", 809 | "FireplaceQu 0.473 % missing values\n", 810 | "GarageType 0.056 % missing values\n", 811 | "GarageFinish 0.056 % missing values\n" 812 | ] 813 | } 814 | ], 815 | "source": [ 816 | "# make a list of the categorical variables that contain missing values\n", 817 | "vars_with_na = [var for var in features if X_train[var].isnull().sum()>1 and X_train[var].dtypes=='O']\n", 818 | "\n", 819 | "# print the variable name and the percentage of missing values\n", 820 | "for var in vars_with_na:\n", 821 | " print(var, np.round(X_train[var].isnull().mean(), 3), ' % missing values')" 822 | ] 823 | }, 824 | { 825 | "cell_type": "markdown", 826 | "metadata": {}, 827 | "source": [ 828 | "Note that we have far fewer categorical variables with missing values than in our original dataset. But we still use categorical variables with NA in the final model, so we need to include this piece of feature engineering logic in the deployment pipeline." 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": 6, 834 | "metadata": {}, 835 | "outputs": [ 836 | { 837 | "data": { 838 | "text/plain": [ 839 | "MasVnrType 0\n", 840 | "BsmtQual 0\n", 841 | "BsmtExposure 0\n", 842 | "FireplaceQu 0\n", 843 | "GarageType 0\n", 844 | "GarageFinish 0\n", 845 | "dtype: int64" 846 | ] 847 | }, 848 | "execution_count": 6, 849 | "metadata": {}, 850 | "output_type": "execute_result" 851 | } 852 | ], 853 | "source": [ 854 | "# I bring forward the functions used in the feature engineering notebook:\n", 855 | "\n", 856 | "# function to replace NA in categorical variables\n", 857 | "def fill_categorical_na(df, var_list):\n", 858 | " X = df.copy()\n", 859 | " X[var_list] = df[var_list].fillna('Missing')\n", 860 | " return X\n", 861 | "\n", 862 | "# replace missing values with new label: \"Missing\"\n", 863 | "X_train = fill_categorical_na(X_train, vars_with_na)\n", 864 | "X_test = fill_categorical_na(X_test, vars_with_na)\n", 865 | "\n", 866 | "# check that we have no missing information in the engineered variables\n", 867 | "X_train[vars_with_na].isnull().sum()" 868 | ] 869 | }, 870 | { 871 | "cell_type": "markdown", 872 | "metadata": {}, 873 | "source": [ 874 | "For numerical variables, we will replace the missing information in the original variable by the mode, or most frequent value (during research we also engineered a binary missing indicator, but it was not selected, so it is dropped from the deployment pipeline):" 875 | ] 876 | }, 877 | { 878 | "cell_type": "code", 879 | "execution_count": 7, 880 | "metadata": {}, 881 | "outputs": [ 882 | { 883 | "name": "stdout", 884 | "output_type": "stream", 885 | "text": [ 886 | "LotFrontage 0.177 % missing values\n" 887 | ] 888 | } 889 | ], 890 | "source": [ 891 | "# make a list of the numerical variables that contain missing values\n", 892 | "vars_with_na = [var for var in features if X_train[var].isnull().sum()>1 and X_train[var].dtypes!='O']\n", 893 | "\n", 894 | "# print the variable name and the percentage of missing values\n", 895 | "for var in vars_with_na:\n", 896 | " print(var, np.round(X_train[var].isnull().mean(), 3), ' % missing values')" 897 | ] 898 | }, 899 | { 900 | "cell_type": "markdown", 901 | "metadata": {}, 902 | "source": [ 903 | "#### Important: persisting the mode value for NA imputation\n", 904 | "\n", 905 | "As you will see in future sections, one of the key pieces of deploying the model is \"Model Validation\". Model validation refers to corroborating that the deployed model and the model built during research are identical. The entire pipeline needs to produce identical results.\n", 906 | "\n", 907 | "Therefore, in order to check at the end of the process that the feature engineering pipelines are identical, we will save (we will persist) the mode of the variable, so that we can use it at the end, to corroborate our models."
908 | ] 909 | }, 910 | { 911 | "cell_type": "code", 912 | "execution_count": 8, 913 | "metadata": {}, 914 | "outputs": [ 915 | { 916 | "data": { 917 | "text/plain": [ 918 | "LotFrontage 0\n", 919 | "dtype: int64" 920 | ] 921 | }, 922 | "execution_count": 8, 923 | "metadata": {}, 924 | "output_type": "execute_result" 925 | } 926 | ], 927 | "source": [ 928 | "# replace the missing values\n", 929 | "\n", 930 | "mean_var_dict = {}\n", 931 | "\n", 932 | "for var in vars_with_na:\n", 933 | " \n", 934 | " # calculate the mode\n", 935 | " mode_val = X_train[var].mode()[0]\n", 936 | " \n", 937 | " # we persist the mode in the dictionary\n", 938 | " mean_var_dict[var] = mode_val\n", 939 | " \n", 940 | " # train\n", 941 | " # note that the additional binary variable was not selected, so we don't need this step any more\n", 942 | " #X_train[var+'_na'] = np.where(X_train[var].isnull(), 1, 0)\n", 943 | " X_train[var].fillna(mode_val, inplace=True)\n", 944 | " \n", 945 | " # test\n", 946 | " # note that the additional binary variable was not selected, so we don't need this step any more\n", 947 | " #X_test[var+'_na'] = np.where(X_test[var].isnull(), 1, 0)\n", 948 | " X_test[var].fillna(mode_val, inplace=True)\n", 949 | "\n", 950 | "# we save the dictionary for later\n", 951 | "np.save('mean_var_dict.npy', mean_var_dict)\n", 952 | "\n", 953 | "# check that we have no more missing values in the engineered variables\n", 954 | "X_train[vars_with_na].isnull().sum()" 955 | ] 956 | }, 957 | { 958 | "cell_type": "markdown", 959 | "metadata": {}, 960 | "source": [ 961 | "### Temporal variables\n", 962 | "\n", 963 | "One of our temporal variables was selected to be used in the final model: 'YearRemodAdd'\n", 964 | "\n", 965 | "So we need to deploy the bit of code that creates it." 966 | ] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "execution_count": 9, 971 | "metadata": { 972 | "collapsed": true 973 | }, 974 | "outputs": [], 975 | "source": [ 976 | "# create the temporal var \"elapsed years\"\n", 977 | "def elapsed_years(df, var):\n", 978 | " # capture difference between year variable and year the house was sold\n", 979 | " df[var] = df['YrSold'] - df[var]\n", 980 | " return df" 981 | ] 982 | }, 983 | { 984 | "cell_type": "code", 985 | "execution_count": 10, 986 | "metadata": { 987 | "collapsed": true 988 | }, 989 | "outputs": [], 990 | "source": [ 991 | "X_train = elapsed_years(X_train, 'YearRemodAdd')\n", 992 | "X_test = elapsed_years(X_test, 'YearRemodAdd')" 993 | ] 994 | }, 995 | { 996 | "cell_type": "markdown", 997 | "metadata": {}, 998 | "source": [ 999 | "### Numerical variables\n", 1000 | "\n", 1001 | "We will log transform the numerical variables that do not contain zeros in order to get a more Gaussian-like distribution. This tends to help linear machine learning models.\n", 1002 | "\n", 1003 | "Originally, we also transformed 'LotArea', but this variable was not selected, so we remove it from the pipeline:" 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "code", 1008 | "execution_count": 11, 1009 | "metadata": { 1010 | "collapsed": true 1011 | }, 1012 | "outputs": [], 1013 | "source": [ 1014 | "for var in ['LotFrontage', '1stFlrSF', 'GrLivArea', 'SalePrice']:\n", 1015 | " X_train[var] = np.log(X_train[var])\n", 1016 | " X_test[var] = np.log(X_test[var])" 1017 | ] 1018 | }, 1019 | { 1020 | "cell_type": "markdown", 1021 | "metadata": {}, 1022 | "source": [ 1023 | "### Categorical variables\n", 1024 | "\n", 1025 | "We do have categorical variables in our final model.
First, we will remove those categories within variables that are present in fewer than 1% of the observations:" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "code", 1030 | "execution_count": 12, 1031 | "metadata": {}, 1032 | "outputs": [ 1033 | { 1034 | "data": { 1035 | "text/plain": [ 1036 | "['MSZoning',\n", 1037 | " 'Neighborhood',\n", 1038 | " 'RoofStyle',\n", 1039 | " 'MasVnrType',\n", 1040 | " 'BsmtQual',\n", 1041 | " 'BsmtExposure',\n", 1042 | " 'HeatingQC',\n", 1043 | " 'CentralAir',\n", 1044 | " 'KitchenQual',\n", 1045 | " 'FireplaceQu',\n", 1046 | " 'GarageType',\n", 1047 | " 'GarageFinish',\n", 1048 | " 'PavedDrive']" 1049 | ] 1050 | }, 1051 | "execution_count": 12, 1052 | "metadata": {}, 1053 | "output_type": "execute_result" 1054 | } 1055 | ], 1056 | "source": [ 1057 | "# let's capture the categorical variables first\n", 1058 | "cat_vars = [var for var in features if X_train[var].dtype == 'O']\n", 1059 | "cat_vars" 1060 | ] 1061 | }, 1062 | { 1063 | "cell_type": "markdown", 1064 | "metadata": {}, 1065 | "source": [ 1066 | "#### Important: persisting the frequent labels\n", 1067 | "\n", 1068 | "As you will see in future sections, one of the key pieces of deploying the model is \"Model Validation\". Model validation refers to corroborating that the deployed model and the model built during research are identical. The entire pipeline needs to produce identical results.\n", 1069 | "\n", 1070 | "Therefore, in order to check at the end of the process that the feature engineering pipelines are identical, we will save (we will persist) the list of frequent labels per variable, so that we can use it at the end, to corroborate our models." 1071 | ] 1072 | }, 1073 | { 1074 | "cell_type": "code", 1075 | "execution_count": 13, 1076 | "metadata": { 1077 | "collapsed": true 1078 | }, 1079 | "outputs": [], 1080 | "source": [ 1081 | "def find_frequent_labels(df, var, rare_perc):\n", 1082 | " # finds the labels that are shared by more than a certain % of the houses in the dataset\n", 1083 | " df = df.copy()\n", 1084 | " tmp = df.groupby(var)['SalePrice'].count() / len(df)\n", 1085 | " return tmp[tmp>rare_perc].index\n", 1086 | "\n", 1087 | "frequent_labels_dict = {}\n", 1088 | "\n", 1089 | "for var in cat_vars:\n", 1090 | " frequent_ls = find_frequent_labels(X_train, var, 0.01)\n", 1091 | " \n", 1092 | " # we save the list in a dictionary\n", 1093 | " frequent_labels_dict[var] = frequent_ls\n", 1094 | " \n", 1095 | " X_train[var] = np.where(X_train[var].isin(frequent_ls), X_train[var], 'Rare')\n", 1096 | " X_test[var] = np.where(X_test[var].isin(frequent_ls), X_test[var], 'Rare')\n", 1097 | " \n", 1098 | "# now we save the dictionary\n", 1099 | "np.save('FrequentLabels.npy', frequent_labels_dict)" 1100 | ] 1101 | }, 1102 | { 1103 | "cell_type": "code", 1104 | "execution_count": 14, 1105 | "metadata": {}, 1106 | "outputs": [ 1107 | { 1108 | "data": { 1109 | "text/plain": [ 1110 | "{'BsmtExposure': Index(['Av', 'Gd', 'Missing', 'Mn', 'No'], dtype='object', name='BsmtExposure'),\n", 1111 | " 'BsmtQual': Index(['Ex', 'Fa', 'Gd', 'Missing', 'TA'], dtype='object', name='BsmtQual'),\n", 1112 | " 'CentralAir': Index(['N', 'Y'], dtype='object', name='CentralAir'),\n", 1113 | " 'FireplaceQu': Index(['Ex', 'Fa', 'Gd', 'Missing', 'Po', 'TA'], dtype='object', name='FireplaceQu'),\n", 1114 | " 'GarageFinish': Index(['Fin', 'Missing', 'RFn', 'Unf'], dtype='object', name='GarageFinish'),\n", 1115 | " 'GarageType': Index(['Attchd', 'Basment', 'BuiltIn', 'Detchd', 'Missing'], dtype='object',
name='GarageType'),\n", 1116 | " 'HeatingQC': Index(['Ex', 'Fa', 'Gd', 'TA'], dtype='object', name='HeatingQC'),\n", 1117 | " 'KitchenQual': Index(['Ex', 'Fa', 'Gd', 'TA'], dtype='object', name='KitchenQual'),\n", 1118 | " 'MSZoning': Index(['FV', 'RH', 'RL', 'RM'], dtype='object', name='MSZoning'),\n", 1119 | " 'MasVnrType': Index(['BrkFace', 'None', 'Stone'], dtype='object', name='MasVnrType'),\n", 1120 | " 'Neighborhood': Index(['Blmngtn', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor',\n", 1121 | " 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NWAmes',\n", 1122 | " 'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW',\n", 1123 | " 'Somerst', 'StoneBr', 'Timber'],\n", 1124 | " dtype='object', name='Neighborhood'),\n", 1125 | " 'PavedDrive': Index(['N', 'P', 'Y'], dtype='object', name='PavedDrive'),\n", 1126 | " 'RoofStyle': Index(['Gable', 'Hip'], dtype='object', name='RoofStyle')}" 1127 | ] 1128 | }, 1129 | "execution_count": 14, 1130 | "metadata": {}, 1131 | "output_type": "execute_result" 1132 | } 1133 | ], 1134 | "source": [ 1135 | "frequent_labels_dict" 1136 | ] 1137 | }, 1138 | { 1139 | "cell_type": "markdown", 1140 | "metadata": {}, 1141 | "source": [ 1142 | "Next, we need to transform the strings of these variables into numbers. We will do it so that we capture the monotonic relationship between the label and the target:" 1143 | ] 1144 | }, 1145 | { 1146 | "cell_type": "code", 1147 | "execution_count": 15, 1148 | "metadata": { 1149 | "collapsed": true 1150 | }, 1151 | "outputs": [], 1152 | "source": [ 1153 | "# this function will assign discrete values to the strings of the variables, \n", 1154 | "# so that the smaller value corresponds to the smaller mean of target\n", 1155 | "\n", 1156 | "def replace_categories(train, test, var, target):\n", 1157 | " train = train.copy()\n", 1158 | " test = test.copy()\n", 1159 | " \n", 1160 | " ordered_labels = train.groupby([var])[target].mean().sort_values().index\n", 1161 | " ordinal_label = {k:i for i, k in enumerate(ordered_labels, 0)} \n", 1162 | " \n", 1163 | " train[var] = train[var].map(ordinal_label)\n", 1164 | " test[var] = test[var].map(ordinal_label)\n", 1165 | " \n", 1166 | " return ordinal_label, train, test" 1167 | ] 1168 | }, 1169 | { 1170 | "cell_type": "code", 1171 | "execution_count": 16, 1172 | "metadata": { 1173 | "scrolled": true 1174 | }, 1175 | "outputs": [], 1176 | "source": [ 1177 | "ordinal_label_dict = {}\n", 1178 | "for var in cat_vars:\n", 1179 | " ordinal_label, X_train, X_test = replace_categories(X_train, X_test, var, 'SalePrice')\n", 1180 | " ordinal_label_dict[var] = ordinal_label\n", 1181 | " \n", 1182 | "# now we save the dictionary\n", 1183 | "np.save('OrdinalLabels.npy', ordinal_label_dict)" 1184 | ] 1185 | }, 1186 | { 1187 | "cell_type": "code", 1188 | "execution_count": 17, 1189 | "metadata": { 1190 | "scrolled": true 1191 | }, 1192 | "outputs": [ 1193 | { 1194 | "data": { 1195 | "text/plain": [ 1196 | "{'BsmtExposure': {'Av': 3, 'Gd': 4, 'Missing': 0, 'Mn': 2, 'No': 1},\n", 1197 | " 'BsmtQual': {'Ex': 4, 'Fa': 1, 'Gd': 3, 'Missing': 0, 'TA': 2},\n", 1198 | " 'CentralAir': {'N': 0, 'Y': 1},\n", 1199 | " 'FireplaceQu': {'Ex': 5, 'Fa': 2, 'Gd': 4, 'Missing': 1, 'Po': 0, 'TA': 3},\n", 1200 | " 'GarageFinish': {'Fin': 3, 'Missing': 0, 'RFn': 2, 'Unf': 1},\n", 1201 | " 'GarageType': {'Attchd': 4,\n", 1202 | " 'Basment': 3,\n", 1203 | " 'BuiltIn': 5,\n", 1204 | " 'Detchd': 2,\n", 1205 | " 'Missing': 0,\n", 1206 | " 'Rare': 1},\n", 1207 | " 'HeatingQC': {'Ex': 4, 'Fa': 1, 
'Gd': 3, 'Rare': 0, 'TA': 2},\n", 1208 | " 'KitchenQual': {'Ex': 3, 'Fa': 0, 'Gd': 2, 'TA': 1},\n", 1209 | " 'MSZoning': {'FV': 4, 'RH': 2, 'RL': 3, 'RM': 1, 'Rare': 0},\n", 1210 | " 'MasVnrType': {'BrkFace': 2, 'None': 0, 'Rare': 1, 'Stone': 3},\n", 1211 | " 'Neighborhood': {'Blmngtn': 14,\n", 1212 | " 'BrDale': 2,\n", 1213 | " 'BrkSide': 4,\n", 1214 | " 'ClearCr': 17,\n", 1215 | " 'CollgCr': 15,\n", 1216 | " 'Crawfor': 16,\n", 1217 | " 'Edwards': 3,\n", 1218 | " 'Gilbert': 13,\n", 1219 | " 'IDOTRR': 0,\n", 1220 | " 'MeadowV': 1,\n", 1221 | " 'Mitchel': 9,\n", 1222 | " 'NAmes': 8,\n", 1223 | " 'NWAmes': 12,\n", 1224 | " 'NoRidge': 22,\n", 1225 | " 'NridgHt': 21,\n", 1226 | " 'OldTown': 5,\n", 1227 | " 'Rare': 11,\n", 1228 | " 'SWISU': 7,\n", 1229 | " 'Sawyer': 6,\n", 1230 | " 'SawyerW': 10,\n", 1231 | " 'Somerst': 18,\n", 1232 | " 'StoneBr': 20,\n", 1233 | " 'Timber': 19},\n", 1234 | " 'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},\n", 1235 | " 'RoofStyle': {'Gable': 0, 'Hip': 2, 'Rare': 1}}" 1236 | ] 1237 | }, 1238 | "execution_count": 17, 1239 | "metadata": {}, 1240 | "output_type": "execute_result" 1241 | } 1242 | ], 1243 | "source": [ 1244 | "ordinal_label_dict" 1245 | ] 1246 | }, 1247 | { 1248 | "cell_type": "code", 1249 | "execution_count": 18, 1250 | "metadata": {}, 1251 | "outputs": [ 1252 | { 1253 | "data": { 1254 | "text/plain": [ 1255 | "[]" 1256 | ] 1257 | }, 1258 | "execution_count": 18, 1259 | "metadata": {}, 1260 | "output_type": "execute_result" 1261 | } 1262 | ], 1263 | "source": [ 1264 | "# check absence of na\n", 1265 | "[var for var in features if X_train[var].isnull().sum()>0]" 1266 | ] 1267 | }, 1268 | { 1269 | "cell_type": "code", 1270 | "execution_count": 19, 1271 | "metadata": {}, 1272 | "outputs": [ 1273 | { 1274 | "data": { 1275 | "text/plain": [ 1276 | "[]" 1277 | ] 1278 | }, 1279 | "execution_count": 19, 1280 | "metadata": {}, 1281 | "output_type": "execute_result" 1282 | } 1283 | ], 1284 | "source": [ 1285 | "# check absence of na\n", 1286 | "[var for var in features if X_test[var].isnull().sum()>0]" 1287 | ] 1288 | }, 1289 | { 1290 | "cell_type": "markdown", 1291 | "metadata": {}, 1292 | "source": [ 1293 | "### Feature Scaling\n", 1294 | "\n", 1295 | "For use in linear models, features need to be either scaled or normalised. 
In the next cells, I will scale features between 0 and 1, using the min and max values learned from the train set:" 1296 | ] 1297 | }, 1298 | { 1299 | "cell_type": "code", 1300 | "execution_count": 20, 1301 | "metadata": { 1302 | "collapsed": true 1303 | }, 1304 | "outputs": [], 1305 | "source": [ 1306 | "# capture the target\n", 1307 | "y_train = X_train['SalePrice']\n", 1308 | "y_test = X_test['SalePrice']" 1309 | ] 1310 | }, 1311 | { 1312 | "cell_type": "code", 1313 | "execution_count": 21, 1314 | "metadata": {}, 1315 | "outputs": [ 1316 | { 1317 | "data": { 1318 | "text/plain": [ 1319 | "['scaler.pkl']" 1320 | ] 1321 | }, 1322 | "execution_count": 21, 1323 | "metadata": {}, 1324 | "output_type": "execute_result" 1325 | } 1326 | ], 1327 | "source": [ 1328 | "# fit scaler\n", 1329 | "scaler = MinMaxScaler() # create an instance\n", 1330 | "scaler.fit(X_train[features]) # fit the scaler to the train set for later use\n", 1331 | "\n", 1332 | "# we persist the model for future use\n", 1333 | "joblib.dump(scaler, 'scaler.pkl')" 1334 | ] 1335 | }, 1336 | { 1337 | "cell_type": "code", 1338 | "execution_count": 22, 1339 | "metadata": { 1340 | "collapsed": true 1341 | }, 1342 | "outputs": [], 1343 | "source": [ 1344 | "# transform the train and test set\n", 1345 | "X_train = pd.DataFrame(scaler.transform(X_train[features]), columns=features)\n", 1346 | "X_test = pd.DataFrame(scaler.transform(X_test[features]), columns=features)" 1347 | ] 1348 | }, 1349 | { 1350 | "cell_type": "code", 1351 | "execution_count": 23, 1352 | "metadata": {}, 1353 | "outputs": [ 1354 | { 1355 | "data": { 1356 | "text/plain": [ 1357 | "['lasso_regression.pkl']" 1358 | ] 1359 | }, 1360 | "execution_count": 23, 1361 | "metadata": {}, 1362 | "output_type": "execute_result" 1363 | } 1364 | ], 1365 | "source": [ 1366 | "# train the model\n", 1367 | "lin_model = Lasso(alpha=0.005, random_state=0) # remember to set the random_state / seed\n", 1368 | "lin_model.fit(X_train, y_train)\n", 1369 | "\n", 1370 | "# we persist the model for future use\n", 1371 | "joblib.dump(lin_model, 'lasso_regression.pkl')" 1372 | ] 1373 | }, 1374 | { 1375 | "cell_type": "code", 1376 | "execution_count": 24, 1377 | "metadata": {}, 1378 | "outputs": [ 1379 | { 1380 | "name": "stdout", 1381 | "output_type": "stream", 1382 | "text": [ 1383 | "linear train mse: 1087435415.4414494\n", 1384 | "linear train rmse: 32976.285652593586\n", 1385 | "\n", 1386 | "linear test mse: 1405259552.259598\n", 1387 | "linear test rmse: 37486.791704006864\n", 1388 | "\n", 1389 | "Average house price: 163000.00000000012\n" 1390 | ] 1391 | } 1392 | ], 1393 | "source": [ 1394 | "# evaluate the model:\n", 1395 | "# remember that we log transformed the output (SalePrice) in our feature engineering notebook / lecture.\n", 1396 | "\n", 1397 | "# In order to get the true performance of the Lasso\n", 1398 | "# we need to transform both the target and the predictions\n", 1399 | "# back to the original house prices values.\n", 1400 | "\n", 1401 | "# We will evaluate performance using the mean squared error and the\n", 1402 | "# root of the mean squared error\n", 1403 | "\n", 1404 | "pred = lin_model.predict(X_train)\n", 1405 | "print('linear train mse: {}'.format(mean_squared_error(np.exp(y_train), np.exp(pred))))\n", 1406 | "print('linear train rmse: {}'.format(sqrt(mean_squared_error(np.exp(y_train), np.exp(pred)))))\n", 1407 | "print()\n", 1408 | "pred = lin_model.predict(X_test)\n", 1409 | "print('linear test mse: {}'.format(mean_squared_error(np.exp(y_test),
np.exp(pred))))\n", 1410 | "print('linear test rmse: {}'.format(sqrt(mean_squared_error(np.exp(y_test), np.exp(pred)))))\n", 1411 | "print()\n", 1412 | "print('Average house price: ', np.exp(y_train).median())" 1413 | ] 1414 | }, 1415 | { 1416 | "cell_type": "markdown", 1417 | "metadata": {}, 1418 | "source": [ 1419 | "That is all for this notebook. And that is all for this section too.\n", 1420 | "\n", 1421 | "**In the next section, we will show you how to productionise this code for model deployment**." 1422 | ] 1423 | }, 1424 | { 1425 | "cell_type": "code", 1426 | "execution_count": null, 1427 | "metadata": { 1428 | "collapsed": true 1429 | }, 1430 | "outputs": [], 1431 | "source": [] 1432 | } 1433 | ], 1434 | "metadata": { 1435 | "kernelspec": { 1436 | "display_name": "Python 3", 1437 | "language": "python", 1438 | "name": "python3" 1439 | }, 1440 | "language_info": { 1441 | "codemirror_mode": { 1442 | "name": "ipython", 1443 | "version": 3 1444 | }, 1445 | "file_extension": ".py", 1446 | "mimetype": "text/x-python", 1447 | "name": "python", 1448 | "nbconvert_exporter": "python", 1449 | "pygments_lexer": "ipython3", 1450 | "version": "3.6.1" 1451 | }, 1452 | "toc": { 1453 | "nav_menu": {}, 1454 | "number_sections": true, 1455 | "sideBar": true, 1456 | "skip_h1_title": false, 1457 | "toc_cell": false, 1458 | "toc_position": { 1459 | "height": "583px", 1460 | "left": "0px", 1461 | "right": "1324px", 1462 | "top": "107px", 1463 | "width": "212px" 1464 | }, 1465 | "toc_section_display": "block", 1466 | "toc_window_display": true 1467 | } 1468 | }, 1469 | "nbformat": 4, 1470 | "nbformat_minor": 2 1471 | } 1472 | -------------------------------------------------------------------------------- /jupyter_notebooks/Section2_MLPipelineOverview/02.8_ML_Pipeline_Step3-FeatureSelection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Machine Learning Model Building Pipeline: Feature Selection\n", 8 | "\n", 9 | "In the following videos, we will take you through a practical example of each one of the steps in the Machine Learning model building pipeline that we described in the previous lectures. There will be a notebook for each one of the Machine Learning Pipeline steps:\n", 10 | "\n", 11 | "1. Data Analysis\n", 12 | "2. Feature Engineering\n", 13 | "3. Feature Selection\n", 14 | "4. Model Building\n", 15 | "\n", 16 | "**This is the notebook for step 3: Feature Selection**\n", 17 | "\n", 18 | "\n", 19 | "We will use the house price dataset available on [Kaggle.com](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data). See below for more details.\n", 20 | "\n", 21 | "===================================================================================================\n", 22 | "\n", 23 | "## Predicting Sale Price of Houses\n", 24 | "\n", 25 | "The aim of the project is to build a machine learning model to predict the sale price of homes based on different explanatory variables describing aspects of residential houses. \n", 26 | "\n", 27 | "### Why is this important? 
\n", 28 | "\n", 29 | "Predicting house prices is useful to identify fruitful investments, or to determine whether the price advertised for a house is over or underestimated, before making a buying judgment.\n", 30 | "\n", 31 | "### What is the objective of the machine learning model?\n", 32 | "\n", 33 | "We aim to minimise the difference between the real price and the price estimated by our model. We will evaluate model performance using the mean squared error (mse) and the root of the mean squared error (rmse).\n", 34 | "\n", 35 | "### How do I download the dataset?\n", 36 | "\n", 37 | "To download the House Price dataset, go to this website:\n", 38 | "https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data\n", 39 | "\n", 40 | "Scroll down to the bottom of the page, click on the link 'train.csv', and then click the blue 'download' button towards the right of the screen, to download the dataset. Rename the file as 'houseprice.csv' and save it to a directory of your choice.\n", 41 | "\n", 42 | "**Note the following:**\n", 43 | "- You need to be logged in to Kaggle in order to download the datasets.\n", 44 | "- You need to accept the terms and conditions of the competition to download the dataset.\n", 45 | "- If you save the file to the same directory where you saved this jupyter notebook, then you can run the code as it is written here.\n", 46 | "\n", 47 | "====================================================================================================" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## House Prices dataset: Feature Selection\n", 55 | "\n", 56 | "In the following cells, we will select a group of variables, the most predictive ones, to build our machine learning models. \n", 57 | "\n", 58 | "### Why do we need to select variables?\n", 59 | "\n", 60 | "1. For production: Fewer variables mean smaller client input requirements (e.g. customers filling out a form on a website or mobile app), and hence less code for error handling. This reduces the chances of bugs.\n", 61 | "2. For model performance: Fewer variables mean simpler, more interpretable, less over-fitted models.\n", 62 | "\n", 63 | "\n", 64 | "**We will select variables using the Lasso regression: Lasso has the property of setting the coefficient of non-informative variables to zero. This way we can identify those variables and remove them from our final models.**\n", 65 | "\n", 66 | "### Setting the seed\n", 67 | "\n", 68 | "It is important to note that we are engineering variables and pre-processing data with the idea of deploying the model if we find business value in it. Therefore, from now on, for each step that includes some element of randomness, it is extremely important that we **set the seed**. This way, we can obtain reproducibility between our research and our development code.\n", 69 | "\n", 70 | "This is perhaps one of the most important lessons that you need to take away from this course: **Always set the seeds**.\n", 71 | "\n", 72 | "Let's go ahead and load the dataset."
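To make this concrete, here is a minimal seed-setting sketch (an illustrative addition, not a cell from the original notebook; the synthetic data and `_demo` variable names are ours):

```python
# Minimal sketch: fix every source of randomness up front so that
# repeated runs of the pipeline give identical results.
import random

import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

SEED = 0

random.seed(SEED)     # Python's built-in RNG
np.random.seed(SEED)  # numpy's global RNG, used internally by scikit-learn

# synthetic stand-in data, just to keep the sketch self-contained
X = np.random.rand(100, 5)
y = np.random.rand(100)

# scikit-learn functions and estimators also accept an explicit
# random_state, which is the form used throughout this course
X_train_demo, X_test_demo, y_train_demo, y_test_demo = train_test_split(
    X, y, test_size=0.1, random_state=SEED)

lin_model = Lasso(alpha=0.005, random_state=SEED)
lin_model.fit(X_train_demo, y_train_demo)
```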
73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 1, 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "# to handle datasets\n", 84 | "import pandas as pd\n", 85 | "import numpy as np\n", 86 | "\n", 87 | "# for plotting\n", 88 | "import matplotlib.pyplot as plt\n", 89 | "%matplotlib inline\n", 90 | "\n", 91 | "# to build the models\n", 92 | "from sklearn.linear_model import Lasso\n", 93 | "from sklearn.feature_selection import SelectFromModel\n", 94 | "\n", 95 | "# to visualise all the columns in the dataframe\n", 96 | "pd.pandas.set_option('display.max_columns', None)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 2, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/html": [ 107 | "
\n", 108 | "\n", 121 | "\n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 
393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | "
IdSalePriceMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilitiesLotConfigLandSlopeNeighborhoodCondition1Condition2BldgTypeHouseStyleOverallQualOverallCondYearBuiltYearRemodAddRoofStyleRoofMatlExterior1stExterior2ndMasVnrTypeMasVnrAreaExterQualExterCondFoundationBsmtQualBsmtCondBsmtExposureBsmtFinType1BsmtFinSF1BsmtFinType2BsmtFinSF2BsmtUnfSFTotalBsmtSFHeatingHeatingQCCentralAirElectrical1stFlrSF2ndFlrSFLowQualFinSFGrLivAreaBsmtFullBathBsmtHalfBathFullBathHalfBathBedroomAbvGrKitchenAbvGrKitchenQualTotRmsAbvGrdFunctionalFireplacesFireplaceQuGarageTypeGarageYrBltGarageFinishGarageCarsGarageAreaGarageQualGarageCondPavedDriveWoodDeckSFOpenPorchSFEnclosedPorch3SsnPorchScreenPorchPoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionLotFrontage_naMasVnrArea_naGarageYrBlt_na
093112.2110600.0000000.750.4611710.3770481.01.00.3333331.0000001.00.00.00.8636360.41.00.750.60.7777780.500.0147060.0491800.00.01.01.00.0000000.000000.6666671.01.00.750.750.751.0000000.0028350.6666670.00.6734790.2399351.01.001.01.00.5597600.00.00.5232500.0000000.00.6666670.00.3750.3333330.6666670.4166671.00.0000000.20.80.0186921.0000000.750.4301830.6666671.01.00.1166860.0329070.00.0000000.00.00.00.751.00.00.5454550.750.6666670.750.00.00.0
165711.8879310.0000000.750.4560660.3994431.01.00.3333330.3333331.00.00.00.3636360.41.00.750.60.4444440.750.3602940.0491800.00.00.60.60.6666670.033750.6666671.00.50.500.750.250.6666670.1428070.6666670.00.1147240.1723401.01.001.01.00.4345390.00.00.4061960.3333330.00.3333330.50.3750.3333330.6666670.2500001.00.0000000.20.80.4579440.6666670.250.2200280.6666671.01.00.0000000.0000000.00.0000000.00.00.00.501.00.00.6363640.500.6666670.750.00.00.0
24612.6757640.5882350.750.3946990.3470821.01.00.0000000.3333331.00.00.00.9545450.41.01.000.60.8888890.500.0367650.0983611.00.00.30.20.6666670.257501.0000001.01.01.000.750.251.0000000.0807940.6666670.00.6019510.2867431.01.001.01.00.6272050.00.00.5862960.3333330.00.6666670.00.2500.3333331.0000000.3333331.00.3333330.80.80.0467290.6666670.500.4062060.6666671.01.00.2287050.1499090.00.0000000.00.00.00.751.00.00.0909091.000.6666670.750.00.00.0
3134912.2783930.0000000.750.3885810.4936771.01.00.6666670.6666671.00.00.00.4545450.41.00.750.60.6666670.500.0661760.1639340.00.01.01.00.0000000.000000.6666671.01.00.750.751.001.0000000.2556700.6666670.00.0181140.2425531.01.001.01.00.5669200.00.00.5299430.3333330.00.6666670.00.3750.3333330.6666670.2500001.00.3333330.40.80.0841120.6666670.500.3624820.6666671.01.00.4690780.0457040.00.0000000.00.00.00.751.00.00.6363640.250.6666670.751.00.00.0
45612.1034860.0000000.750.5776580.4027021.01.00.3333330.3333331.00.00.00.3636360.41.00.750.60.5555560.500.3235290.7377050.00.00.60.70.6666670.170000.3333331.00.50.500.750.250.3333330.0868180.6666670.00.4342780.2332241.00.751.01.00.5490260.00.00.5132160.0000000.00.6666670.00.3750.3333330.3333330.4166671.00.3333330.80.80.4112150.6666670.500.4062060.6666671.01.00.0000000.0000000.00.8011810.00.00.00.751.00.00.5454550.500.6666670.750.00.00.0
\n", 649 | "
" 650 | ], 651 | "text/plain": [ 652 | " Id SalePrice MSSubClass MSZoning LotFrontage LotArea Street \\\n", 653 | "0 931 12.211060 0.000000 0.75 0.461171 0.377048 1.0 \n", 654 | "1 657 11.887931 0.000000 0.75 0.456066 0.399443 1.0 \n", 655 | "2 46 12.675764 0.588235 0.75 0.394699 0.347082 1.0 \n", 656 | "3 1349 12.278393 0.000000 0.75 0.388581 0.493677 1.0 \n", 657 | "4 56 12.103486 0.000000 0.75 0.577658 0.402702 1.0 \n", 658 | "\n", 659 | " Alley LotShape LandContour Utilities LotConfig LandSlope \\\n", 660 | "0 1.0 0.333333 1.000000 1.0 0.0 0.0 \n", 661 | "1 1.0 0.333333 0.333333 1.0 0.0 0.0 \n", 662 | "2 1.0 0.000000 0.333333 1.0 0.0 0.0 \n", 663 | "3 1.0 0.666667 0.666667 1.0 0.0 0.0 \n", 664 | "4 1.0 0.333333 0.333333 1.0 0.0 0.0 \n", 665 | "\n", 666 | " Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual \\\n", 667 | "0 0.863636 0.4 1.0 0.75 0.6 0.777778 \n", 668 | "1 0.363636 0.4 1.0 0.75 0.6 0.444444 \n", 669 | "2 0.954545 0.4 1.0 1.00 0.6 0.888889 \n", 670 | "3 0.454545 0.4 1.0 0.75 0.6 0.666667 \n", 671 | "4 0.363636 0.4 1.0 0.75 0.6 0.555556 \n", 672 | "\n", 673 | " OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st \\\n", 674 | "0 0.50 0.014706 0.049180 0.0 0.0 1.0 \n", 675 | "1 0.75 0.360294 0.049180 0.0 0.0 0.6 \n", 676 | "2 0.50 0.036765 0.098361 1.0 0.0 0.3 \n", 677 | "3 0.50 0.066176 0.163934 0.0 0.0 1.0 \n", 678 | "4 0.50 0.323529 0.737705 0.0 0.0 0.6 \n", 679 | "\n", 680 | " Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation \\\n", 681 | "0 1.0 0.000000 0.00000 0.666667 1.0 1.0 \n", 682 | "1 0.6 0.666667 0.03375 0.666667 1.0 0.5 \n", 683 | "2 0.2 0.666667 0.25750 1.000000 1.0 1.0 \n", 684 | "3 1.0 0.000000 0.00000 0.666667 1.0 1.0 \n", 685 | "4 0.7 0.666667 0.17000 0.333333 1.0 0.5 \n", 686 | "\n", 687 | " BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 \\\n", 688 | "0 0.75 0.75 0.75 1.000000 0.002835 0.666667 \n", 689 | "1 0.50 0.75 0.25 0.666667 0.142807 0.666667 \n", 690 | "2 1.00 0.75 0.25 1.000000 0.080794 0.666667 \n", 691 | "3 0.75 0.75 1.00 1.000000 0.255670 0.666667 \n", 692 | "4 0.50 0.75 0.25 0.333333 0.086818 0.666667 \n", 693 | "\n", 694 | " BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir \\\n", 695 | "0 0.0 0.673479 0.239935 1.0 1.00 1.0 \n", 696 | "1 0.0 0.114724 0.172340 1.0 1.00 1.0 \n", 697 | "2 0.0 0.601951 0.286743 1.0 1.00 1.0 \n", 698 | "3 0.0 0.018114 0.242553 1.0 1.00 1.0 \n", 699 | "4 0.0 0.434278 0.233224 1.0 0.75 1.0 \n", 700 | "\n", 701 | " Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath \\\n", 702 | "0 1.0 0.559760 0.0 0.0 0.523250 0.000000 \n", 703 | "1 1.0 0.434539 0.0 0.0 0.406196 0.333333 \n", 704 | "2 1.0 0.627205 0.0 0.0 0.586296 0.333333 \n", 705 | "3 1.0 0.566920 0.0 0.0 0.529943 0.333333 \n", 706 | "4 1.0 0.549026 0.0 0.0 0.513216 0.000000 \n", 707 | "\n", 708 | " BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual \\\n", 709 | "0 0.0 0.666667 0.0 0.375 0.333333 0.666667 \n", 710 | "1 0.0 0.333333 0.5 0.375 0.333333 0.666667 \n", 711 | "2 0.0 0.666667 0.0 0.250 0.333333 1.000000 \n", 712 | "3 0.0 0.666667 0.0 0.375 0.333333 0.666667 \n", 713 | "4 0.0 0.666667 0.0 0.375 0.333333 0.333333 \n", 714 | "\n", 715 | " TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt \\\n", 716 | "0 0.416667 1.0 0.000000 0.2 0.8 0.018692 \n", 717 | "1 0.250000 1.0 0.000000 0.2 0.8 0.457944 \n", 718 | "2 0.333333 1.0 0.333333 0.8 0.8 0.046729 \n", 719 | "3 0.250000 1.0 0.333333 0.4 0.8 0.084112 \n", 720 | "4 0.416667 1.0 
0.333333 0.8 0.8 0.411215 \n", 721 | "\n", 722 | " GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive \\\n", 723 | "0 1.000000 0.75 0.430183 0.666667 1.0 1.0 \n", 724 | "1 0.666667 0.25 0.220028 0.666667 1.0 1.0 \n", 725 | "2 0.666667 0.50 0.406206 0.666667 1.0 1.0 \n", 726 | "3 0.666667 0.50 0.362482 0.666667 1.0 1.0 \n", 727 | "4 0.666667 0.50 0.406206 0.666667 1.0 1.0 \n", 728 | "\n", 729 | " WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea \\\n", 730 | "0 0.116686 0.032907 0.0 0.000000 0.0 0.0 \n", 731 | "1 0.000000 0.000000 0.0 0.000000 0.0 0.0 \n", 732 | "2 0.228705 0.149909 0.0 0.000000 0.0 0.0 \n", 733 | "3 0.469078 0.045704 0.0 0.000000 0.0 0.0 \n", 734 | "4 0.000000 0.000000 0.0 0.801181 0.0 0.0 \n", 735 | "\n", 736 | " PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType \\\n", 737 | "0 0.0 0.75 1.0 0.0 0.545455 0.75 0.666667 \n", 738 | "1 0.0 0.50 1.0 0.0 0.636364 0.50 0.666667 \n", 739 | "2 0.0 0.75 1.0 0.0 0.090909 1.00 0.666667 \n", 740 | "3 0.0 0.75 1.0 0.0 0.636364 0.25 0.666667 \n", 741 | "4 0.0 0.75 1.0 0.0 0.545455 0.50 0.666667 \n", 742 | "\n", 743 | " SaleCondition LotFrontage_na MasVnrArea_na GarageYrBlt_na \n", 744 | "0 0.75 0.0 0.0 0.0 \n", 745 | "1 0.75 0.0 0.0 0.0 \n", 746 | "2 0.75 0.0 0.0 0.0 \n", 747 | "3 0.75 1.0 0.0 0.0 \n", 748 | "4 0.75 0.0 0.0 0.0 " 749 | ] 750 | }, 751 | "execution_count": 2, 752 | "metadata": {}, 753 | "output_type": "execute_result" 754 | } 755 | ], 756 | "source": [ 757 | "# load dataset\n", 758 | "# We load the datasets with the engineered values: we built and saved these datasets in the previous lecture.\n", 759 | "# If you haven't done so, go ahead and check the previous lecture / notebook to find out how to create these datasets\n", 760 | "\n", 761 | "X_train = pd.read_csv('xtrain.csv')\n", 762 | "X_test = pd.read_csv('xtest.csv')\n", 763 | "\n", 764 | "X_train.head()" 765 | ] 766 | }, 767 | { 768 | "cell_type": "code", 769 | "execution_count": 3, 770 | "metadata": { 771 | "collapsed": true 772 | }, 773 | "outputs": [], 774 | "source": [ 775 | "# capture the target\n", 776 | "y_train = X_train['SalePrice']\n", 777 | "y_test = X_test['SalePrice']\n", 778 | "\n", 779 | "# drop unnecessary variables from our training and testing sets\n", 780 | "X_train.drop(['Id', 'SalePrice'], axis=1, inplace=True)\n", 781 | "X_test.drop(['Id', 'SalePrice'], axis=1, inplace=True)" 782 | ] 783 | }, 784 | { 785 | "cell_type": "markdown", 786 | "metadata": {}, 787 | "source": [ 788 | "### Feature Selection\n", 789 | "\n", 790 | "Let's go ahead and select a subset of the most predictive features. There is an element of randomness in the Lasso regression, so remember to set the seed." 
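Before fitting the selector below, a brief aside (an illustrative addition, not part of the original notebook): the alpha parameter determines how aggressively the Lasso shrinks coefficients to exactly zero, and therefore how many features survive the selection. A self-contained sketch on synthetic data:

```python
# Sketch on synthetic data: the larger the alpha, the more
# coefficients the Lasso drives to exactly zero.
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.RandomState(0)
X_demo = rng.rand(200, 20)
# only the first two columns are informative
y_demo = 3 * X_demo[:, 0] + 2 * X_demo[:, 1] + 0.1 * rng.rand(200)

for alpha in [0.0001, 0.001, 0.01, 0.1]:
    lasso = Lasso(alpha=alpha, random_state=0)
    lasso.fit(X_demo, y_demo)
    print('alpha {}: {} non-zero coefficients'.format(
        alpha, np.sum(lasso.coef_ != 0)))
```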
791 | ] 792 | }, 793 | { 794 | "cell_type": "code", 795 | "execution_count": 4, 796 | "metadata": { 797 | "scrolled": true 798 | }, 799 | "outputs": [ 800 | { 801 | "data": { 802 | "text/plain": [ 803 | "SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True, max_iter=1000,\n", 804 | "   normalize=False, positive=False, precompute=False, random_state=0,\n", 805 | "   selection='cyclic', tol=0.0001, warm_start=False),\n", 806 | "        prefit=False, threshold=None)" 807 | ] 808 | }, 809 | "execution_count": 4, 810 | "metadata": {}, 811 | "output_type": "execute_result" 812 | } 813 | ], 814 | "source": [ 815 | "# here I will do the model fitting and feature selection\n", 816 | "# altogether in one line of code\n", 817 | "\n", 818 | "# first, I specify the Lasso Regression model, and I\n", 819 | "# select a suitable alpha (the regularisation penalty).\n", 820 | "# The bigger the alpha, the fewer features will be selected.\n", 821 | "\n", 822 | "# Then I use the SelectFromModel object from sklearn, which\n", 823 | "# will select the features whose coefficients are non-zero\n", 824 | "\n", 825 | "sel_ = SelectFromModel(Lasso(alpha=0.005, random_state=0)) # remember to set the seed, the random state in this function\n", 826 | "sel_.fit(X_train, y_train)" 827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": 5, 832 | "metadata": {}, 833 | "outputs": [ 834 | { 835 | "data": { 836 | "text/plain": [ 837 | "array([ True, True, False, False, False, False, False, False, False,\n", 838 | " False, False, True, False, False, False, False, True, True,\n", 839 | " False, True, True, False, False, False, True, False, False,\n", 840 | " False, False, True, False, True, False, False, False, False,\n", 841 | " False, False, False, True, True, False, True, False, False,\n", 842 | " True, True, False, False, False, False, False, True, False,\n", 843 | " False, True, True, True, False, True, True, False, False,\n", 844 | " False, True, False, False, False, False, False, False, False,\n", 845 | " False, False, False, False, False, False, False, False, False, False], dtype=bool)" 846 | ] 847 | }, 848 | "execution_count": 5, 849 | "metadata": {}, 850 | "output_type": "execute_result" 851 | } 852 | ], 853 | "source": [ 854 | "# this command lets us visualise those features that were kept.\n", 855 | "# Kept features have a True indicator\n", 856 | "sel_.get_support()" 857 | ] 858 | }, 859 | { 860 | "cell_type": "code", 861 | "execution_count": 6, 862 | "metadata": {}, 863 | "outputs": [ 864 | { 865 | "name": "stdout", 866 | "output_type": "stream", 867 | "text": [ 868 | "total features: 82\n", 869 | "selected features: 22\n", 870 | "features with coefficients shrank to zero: 60\n" 871 | ] 872 | } 873 | ], 874 | "source": [ 875 | "# let's print the number of total and selected features\n", 876 | "\n", 877 | "# this is how we can make a list of the selected features\n", 878 | "selected_feat = X_train.columns[(sel_.get_support())]\n", 879 | "\n", 880 | "# let's print some stats\n", 881 | "print('total features: {}'.format((X_train.shape[1])))\n", 882 | "print('selected features: {}'.format(len(selected_feat)))\n", 883 | "print('features with coefficients shrank to zero: {}'.format(\n", 884 | " np.sum(sel_.estimator_.coef_ == 0)))" 885 | ] 886 | }, 887 | { 888 | "cell_type": "code", 889 | "execution_count": 7, 890 | "metadata": {}, 891 | "outputs": [ 892 | { 893 | "data": { 894 | "text/plain": [ 895 | "Index(['MSSubClass', 'MSZoning', 'Neighborhood', 'OverallQual', 'OverallCond',\n", 896 |
" 'YearRemodAdd', 'RoofStyle', 'MasVnrType', 'BsmtQual', 'BsmtExposure',\n", 897 | " 'HeatingQC', 'CentralAir', '1stFlrSF', 'GrLivArea', 'BsmtFullBath',\n", 898 | " 'KitchenQual', 'Fireplaces', 'FireplaceQu', 'GarageType',\n", 899 | " 'GarageFinish', 'GarageCars', 'PavedDrive'],\n", 900 | " dtype='object')" 901 | ] 902 | }, 903 | "execution_count": 7, 904 | "metadata": {}, 905 | "output_type": "execute_result" 906 | } 907 | ], 908 | "source": [ 909 | "# print the selected features\n", 910 | "selected_feat" 911 | ] 912 | }, 913 | { 914 | "cell_type": "markdown", 915 | "metadata": {}, 916 | "source": [ 917 | "### Identify the selected variables" 918 | ] 919 | }, 920 | { 921 | "cell_type": "code", 922 | "execution_count": 8, 923 | "metadata": {}, 924 | "outputs": [ 925 | { 926 | "data": { 927 | "text/plain": [ 928 | "Index(['MSSubClass', 'MSZoning', 'Neighborhood', 'OverallQual', 'OverallCond',\n", 929 | " 'YearRemodAdd', 'RoofStyle', 'MasVnrType', 'BsmtQual', 'BsmtExposure',\n", 930 | " 'HeatingQC', 'CentralAir', '1stFlrSF', 'GrLivArea', 'BsmtFullBath',\n", 931 | " 'KitchenQual', 'Fireplaces', 'FireplaceQu', 'GarageType',\n", 932 | " 'GarageFinish', 'GarageCars', 'PavedDrive'],\n", 933 | " dtype='object')" 934 | ] 935 | }, 936 | "execution_count": 8, 937 | "metadata": {}, 938 | "output_type": "execute_result" 939 | } 940 | ], 941 | "source": [ 942 | "# this is an alternative way of identifying the selected features \n", 943 | "# based on the non-zero regularisation coefficients:\n", 944 | "selected_feats = X_train.columns[(sel_.estimator_.coef_ != 0).ravel().tolist()]\n", 945 | "selected_feats" 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": 9, 951 | "metadata": { 952 | "collapsed": true 953 | }, 954 | "outputs": [], 955 | "source": [ 956 | "# now we save the selected list of features\n", 957 | "pd.Series(selected_feats).to_csv('selected_features.csv', index=False)" 958 | ] 959 | }, 960 | { 961 | "cell_type": "markdown", 962 | "metadata": { 963 | "collapsed": true 964 | }, 965 | "source": [ 966 | "That is all for this notebook. In the next video, we will go ahead and build the final model using the selected features. See you then!" 
967 | ] 968 | } 969 | ], 970 | "metadata": { 971 | "kernelspec": { 972 | "display_name": "Python 3", 973 | "language": "python", 974 | "name": "python3" 975 | }, 976 | "language_info": { 977 | "codemirror_mode": { 978 | "name": "ipython", 979 | "version": 3 980 | }, 981 | "file_extension": ".py", 982 | "mimetype": "text/x-python", 983 | "name": "python", 984 | "nbconvert_exporter": "python", 985 | "pygments_lexer": "ipython3", 986 | "version": "3.6.1" 987 | }, 988 | "toc": { 989 | "nav_menu": {}, 990 | "number_sections": true, 991 | "sideBar": true, 992 | "skip_h1_title": false, 993 | "toc_cell": false, 994 | "toc_position": { 995 | "height": "583px", 996 | "left": "0px", 997 | "right": "1324px", 998 | "top": "107px", 999 | "width": "212px" 1000 | }, 1001 | "toc_section_display": "block", 1002 | "toc_window_display": true 1003 | } 1004 | }, 1005 | "nbformat": 4, 1006 | "nbformat_minor": 2 1007 | } 1008 | -------------------------------------------------------------------------------- /jupyter_notebooks/Section2_MLPipelineOverview/BONUS_Randomisation_in_ML _and_setting_the_seed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Reproducibility in Machine Learning\n", 8 | "\n", 9 | "Reproducibility in machine learning modeling is an important problem faced by data scientists and companies seeking to put machine learning models into production. Reproducibility means that given the same inputs, we should obtain exactly the same outputs. And this is for both our research and our production environment. In other words, our research models and our deployed models should produce the same score for the same input.\n", 10 | "\n", 11 | "There are tremendous costs to irreproducible machine learning models including:\n", 12 | "\n", 13 | "- Financial costs\n", 14 | "- Time costs (lost time)\n", 15 | "- Reputational costs\n", 16 | "- Compliance costs\n", 17 | "- Regulatory costs\n", 18 | "\n", 19 | "The problems with reproducibility can arise in any and all of the machine learning model building pipeline steps:\n", 20 | "\n", 21 | "- Data gathering\n", 22 | "- Feature extraction and feature engineering\n", 23 | "- Feature selection\n", 24 | "- Model building\n", 25 | "- Data scoring\n", 26 | "\n", 27 | "This is because all these steps involve elements of randomness. For example, if gathering data with SQL, there is an element of randomness when retrieving the rows from the database. During feature engineering, if we replace missing information with a random extraction of non-missing observations, we are introducing another layer of randomness. Machine learning models and feature selection algorithms involve randomness during model fitting. Think, for example, of Random Forests; there is an element of randomness to select the features at each split, as well as to bootstrap a sample of the dataset to fit each tree.
For neural networks there is an element of randomness to initialise the network weights.\n", 28 | "\n", 29 | "In a future section, we will show you how to tackle reproducibility between research and deployment pipelines.\n", 30 | "\n", 31 | "For this section, please go ahead and get familiar with randomness in computer science and machine learning by visiting the following resources:\n", 32 | "\n", 33 | "- [Why do we need randomness?](https://www.kdnuggets.com/2017/06/surprising-complexity-randomness.html)\n", 34 | "- [Embrace Randomness in Machine Learning](https://machinelearningmastery.com/randomness-in-machine-learning/)\n", 35 | "- [Random Number Generators for ML in python](https://machinelearningmastery.com/introduction-to-random-number-generators-for-machine-learning/)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [] 46 | } 47 | ], 48 | "metadata": { 49 | "kernelspec": { 50 | "display_name": "Python 3", 51 | "language": "python", 52 | "name": "python3" 53 | }, 54 | "language_info": { 55 | "codemirror_mode": { 56 | "name": "ipython", 57 | "version": 3 58 | }, 59 | "file_extension": ".py", 60 | "mimetype": "text/x-python", 61 | "name": "python", 62 | "nbconvert_exporter": "python", 63 | "pygments_lexer": "ipython3", 64 | "version": "3.6.1" 65 | }, 66 | "toc": { 67 | "nav_menu": {}, 68 | "number_sections": true, 69 | "sideBar": true, 70 | "skip_h1_title": false, 71 | "toc_cell": false, 72 | "toc_position": {}, 73 | "toc_section_display": "block", 74 | "toc_window_display": false 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 2 79 | } 80 | -------------------------------------------------------------------------------- /jupyter_notebooks/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter==1.0.0 2 | matplotlib==3.0.2 3 | pandas==0.23.4 4 | numpy==1.13.3 5 | scikit-learn==0.19.0 6 | Keras==2.1.3 7 | opencv-python==4.0.0.21 8 | h5py==2.9.0 9 | -------------------------------------------------------------------------------- /packages/ml_api/VERSION: -------------------------------------------------------------------------------- 1 | 0.2.1 -------------------------------------------------------------------------------- /packages/ml_api/api/__init__.py: -------------------------------------------------------------------------------- 1 | from api.config import PACKAGE_ROOT 2 | 3 | with open(PACKAGE_ROOT / 'VERSION') as version_file: 4 | __version__ = version_file.read().strip() 5 | -------------------------------------------------------------------------------- /packages/ml_api/api/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | 3 | from api.config import get_logger 4 | 5 | 6 | _logger = get_logger(logger_name=__name__) 7 | 8 | 9 | def create_app(*, config_object) -> Flask: 10 | """Create a flask app instance.""" 11 | 12 | flask_app = Flask('ml_api') 13 | flask_app.config.from_object(config_object) 14 | 15 | # import blueprints 16 | from api.controller import prediction_app 17 | flask_app.register_blueprint(prediction_app) 18 | _logger.debug('Application instance created') 19 | 20 | return flask_app 21 | -------------------------------------------------------------------------------- /packages/ml_api/api/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from 
logging.handlers import TimedRotatingFileHandler 3 | import pathlib 4 | import os 5 | import sys 6 | 7 | PACKAGE_ROOT = pathlib.Path(__file__).resolve().parent.parent 8 | 9 | FORMATTER = logging.Formatter( 10 | "%(asctime)s — %(name)s — %(levelname)s —" 11 | "%(funcName)s:%(lineno)d — %(message)s") 12 | LOG_DIR = PACKAGE_ROOT / 'logs' 13 | LOG_DIR.mkdir(exist_ok=True) 14 | LOG_FILE = LOG_DIR / 'ml_api.log' 15 | UPLOAD_FOLDER = PACKAGE_ROOT / 'uploads' 16 | UPLOAD_FOLDER.mkdir(exist_ok=True) 17 | 18 | ALLOWED_EXTENSIONS = set(['png', 'jpg', 'jpeg']) 19 | 20 | 21 | def get_console_handler(): 22 | console_handler = logging.StreamHandler(sys.stdout) 23 | console_handler.setFormatter(FORMATTER) 24 | return console_handler 25 | 26 | 27 | def get_file_handler(): 28 | file_handler = TimedRotatingFileHandler( 29 | LOG_FILE, when='midnight') 30 | file_handler.setFormatter(FORMATTER) 31 | file_handler.setLevel(logging.WARNING) 32 | return file_handler 33 | 34 | 35 | def get_logger(*, logger_name): 36 | """Get logger with prepared handlers.""" 37 | 38 | logger = logging.getLogger(logger_name) 39 | 40 | logger.setLevel(logging.INFO) 41 | 42 | logger.addHandler(get_console_handler()) 43 | logger.addHandler(get_file_handler()) 44 | logger.propagate = False 45 | 46 | return logger 47 | 48 | 49 | class Config: 50 | DEBUG = False 51 | TESTING = False 52 | CSRF_ENABLED = True 53 | SECRET_KEY = 'this-really-needs-to-be-changed' 54 | SERVER_PORT = 5000 55 | UPLOAD_FOLDER = UPLOAD_FOLDER 56 | 57 | 58 | class ProductionConfig(Config): 59 | DEBUG = False 60 | SERVER_ADDRESS = os.environ.get('SERVER_ADDRESS', '0.0.0.0') 61 | SERVER_PORT = os.environ.get('SERVER_PORT', '5000') 62 | 63 | 64 | class DevelopmentConfig(Config): 65 | DEVELOPMENT = True 66 | DEBUG = True 67 | 68 | 69 | class TestingConfig(Config): 70 | TESTING = True 71 | -------------------------------------------------------------------------------- /packages/ml_api/api/controller.py: -------------------------------------------------------------------------------- 1 | from flask import Blueprint, request, jsonify 2 | from regression_model.predict import make_prediction 3 | from regression_model import __version__ as _version 4 | from neural_network_model.predict import make_single_prediction 5 | import os 6 | from werkzeug.utils import secure_filename 7 | 8 | from api.config import get_logger, UPLOAD_FOLDER 9 | from api.validation import validate_inputs, allowed_file 10 | from api import __version__ as api_version 11 | 12 | _logger = get_logger(logger_name=__name__) 13 | 14 | 15 | prediction_app = Blueprint('prediction_app', __name__) 16 | 17 | 18 | @prediction_app.route('/health', methods=['GET']) 19 | def health(): 20 | if request.method == 'GET': 21 | _logger.info('health status OK') 22 | return 'ok' 23 | 24 | 25 | @prediction_app.route('/version', methods=['GET']) 26 | def version(): 27 | if request.method == 'GET': 28 | return jsonify({'model_version': _version, 29 | 'api_version': api_version}) 30 | 31 | 32 | @prediction_app.route('/v1/predict/regression', methods=['POST']) 33 | def predict(): 34 | if request.method == 'POST': 35 | # Step 1: Extract POST data from request body as JSON 36 | json_data = request.get_json() 37 | _logger.debug(f'Inputs: {json_data}') 38 | 39 | # Step 2: Validate the input using marshmallow schema 40 | input_data, errors = validate_inputs(input_data=json_data) 41 | 42 | # Step 3: Model prediction 43 | result = make_prediction(input_data=input_data) 44 | _logger.debug(f'Outputs: {result}') 45 | 46 | # Step 4: Convert
numpy ndarray to list 47 | predictions = result.get('predictions').tolist() 48 | version = result.get('version') 49 | 50 | # Step 5: Return the response as JSON 51 | return jsonify({'predictions': predictions, 52 | 'version': version, 53 | 'errors': errors}) 54 | 55 | 56 | @prediction_app.route('/predict/classifier', methods=['POST']) 57 | def predict_image(): 58 | if request.method == 'POST': 59 | # Step 1: check if the post request has the file part 60 | if 'file' not in request.files: 61 | return jsonify('No file found'), 400 62 | 63 | file = request.files['file'] 64 | 65 | # Step 2: Basic file extension validation 66 | if file and allowed_file(file.filename): 67 | filename = secure_filename(file.filename) 68 | 69 | # Step 3: Save the file 70 | # Note, in production, this would require careful 71 | # validation, management and clean up. 72 | file.save(os.path.join(UPLOAD_FOLDER, filename)) 73 | 74 | _logger.debug(f'Inputs: {filename}') 75 | 76 | # Step 4: perform prediction 77 | result = make_single_prediction( 78 | image_name=filename, 79 | image_directory=UPLOAD_FOLDER) 80 | 81 | _logger.debug(f'Outputs: {result}') 82 | 83 | readable_predictions = result.get('readable_predictions') 84 | version = result.get('version') 85 | 86 | # Step 5: Return the response as JSON 87 | return jsonify( 88 | {'readable_predictions': readable_predictions[0], 89 | 'version': version}) 90 | -------------------------------------------------------------------------------- /packages/ml_api/api/validation.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | from marshmallow import Schema, fields 4 | from marshmallow import ValidationError 5 | 6 | from api import config 7 | 8 | 9 | class InvalidInputError(Exception): 10 | """Invalid model input.""" 11 | 12 | 13 | SYNTAX_ERROR_FIELD_MAP = { 14 | '1stFlrSF': 'FirstFlrSF', 15 | '2ndFlrSF': 'SecondFlrSF', 16 | '3SsnPorch': 'ThreeSsnPortch' 17 | } 18 | 19 | 20 | class HouseDataRequestSchema(Schema): 21 | Alley = fields.Str(allow_none=True) 22 | BedroomAbvGr = fields.Integer() 23 | BldgType = fields.Str() 24 | BsmtCond = fields.Str() 25 | BsmtExposure = fields.Str(allow_none=True) 26 | BsmtFinSF1 = fields.Float() 27 | BsmtFinSF2 = fields.Float() 28 | BsmtFinType1 = fields.Str() 29 | BsmtFinType2 = fields.Str() 30 | BsmtFullBath = fields.Float() 31 | BsmtHalfBath = fields.Float() 32 | BsmtQual = fields.Str(allow_none=True) 33 | BsmtUnfSF = fields.Float() 34 | CentralAir = fields.Str() 35 | Condition1 = fields.Str() 36 | Condition2 = fields.Str() 37 | Electrical = fields.Str() 38 | EnclosedPorch = fields.Integer() 39 | ExterCond = fields.Str() 40 | ExterQual = fields.Str() 41 | Exterior1st = fields.Str() 42 | Exterior2nd = fields.Str() 43 | Fence = fields.Str(allow_none=True) 44 | FireplaceQu = fields.Str(allow_none=True) 45 | Fireplaces = fields.Integer() 46 | Foundation = fields.Str() 47 | FullBath = fields.Integer() 48 | Functional = fields.Str() 49 | GarageArea = fields.Float() 50 | GarageCars = fields.Float() 51 | GarageCond = fields.Str() 52 | GarageFinish = fields.Str(allow_none=True) 53 | GarageQual = fields.Str() 54 | GarageType = fields.Str(allow_none=True) 55 | GarageYrBlt = fields.Float() 56 | GrLivArea = fields.Integer() 57 | HalfBath = fields.Integer() 58 | Heating = fields.Str() 59 | HeatingQC = fields.Str() 60 | HouseStyle = fields.Str() 61 | Id = fields.Integer() 62 | KitchenAbvGr = fields.Integer() 63 | KitchenQual = fields.Str() 64 | LandContour = fields.Str() 65 | LandSlope = 
fields.Str() 66 | LotArea = fields.Integer() 67 | LotConfig = fields.Str() 68 | LotFrontage = fields.Float(allow_none=True) 69 | LotShape = fields.Str() 70 | LowQualFinSF = fields.Integer() 71 | MSSubClass = fields.Integer() 72 | MSZoning = fields.Str() 73 | MasVnrArea = fields.Float() 74 | MasVnrType = fields.Str(allow_none=True) 75 | MiscFeature = fields.Str(allow_none=True) 76 | MiscVal = fields.Integer() 77 | MoSold = fields.Integer() 78 | Neighborhood = fields.Str() 79 | OpenPorchSF = fields.Integer() 80 | OverallCond = fields.Integer() 81 | OverallQual = fields.Integer() 82 | PavedDrive = fields.Str() 83 | PoolArea = fields.Integer() 84 | PoolQC = fields.Str(allow_none=True) 85 | RoofMatl = fields.Str() 86 | RoofStyle = fields.Str() 87 | SaleCondition = fields.Str() 88 | SaleType = fields.Str() 89 | ScreenPorch = fields.Integer() 90 | Street = fields.Str() 91 | TotRmsAbvGrd = fields.Integer() 92 | TotalBsmtSF = fields.Float() 93 | Utilities = fields.Str() 94 | WoodDeckSF = fields.Integer() 95 | YearBuilt = fields.Integer() 96 | YearRemodAdd = fields.Integer() 97 | YrSold = fields.Integer() 98 | FirstFlrSF = fields.Integer() 99 | SecondFlrSF = fields.Integer() 100 | ThreeSsnPortch = fields.Integer() 101 | 102 | 103 | def _filter_error_rows(errors: dict, 104 | validated_input: t.List[dict] 105 | ) -> t.List[dict]: 106 | """Remove input data rows with errors.""" 107 | 108 | indexes = errors.keys() 109 | # delete them in reverse order so that you 110 | # don't throw off the subsequent indexes. 111 | for index in sorted(indexes, reverse=True): 112 | del validated_input[index] 113 | 114 | return validated_input 115 | 116 | 117 | def validate_inputs(input_data): 118 | """Check prediction inputs against schema.""" 119 | 120 | # set many=True to allow passing in a list 121 | schema = HouseDataRequestSchema(strict=True, many=True) 122 | 123 | # convert syntax error field names (beginning with numbers) 124 | for dict in input_data: 125 | for key, value in SYNTAX_ERROR_FIELD_MAP.items(): 126 | dict[value] = dict[key] 127 | del dict[key] 128 | 129 | errors = None 130 | try: 131 | schema.load(input_data) 132 | except ValidationError as exc: 133 | errors = exc.messages 134 | 135 | # convert syntax error field names back 136 | # this is a hack - never name your data 137 | # fields with numbers as the first letter. 138 | for dict in input_data: 139 | for key, value in SYNTAX_ERROR_FIELD_MAP.items(): 140 | dict[key] = dict[value] 141 | del dict[value] 142 | 143 | if errors: 144 | validated_input = _filter_error_rows( 145 | errors=errors, 146 | validated_input=input_data) 147 | else: 148 | validated_input = input_data 149 | 150 | return validated_input, errors 151 | 152 | 153 | def allowed_file(filename): 154 | return '.' 
in filename and \ 155 | filename.rsplit('.', 1)[1].lower() in config.ALLOWED_EXTENSIONS 156 | -------------------------------------------------------------------------------- /packages/ml_api/diff_test_requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url=${PIP_EXTRA_INDEX_URL} 2 | 3 | # api 4 | flask==1.0.2 5 | 6 | # schema validation 7 | marshmallow==2.17.0 8 | 9 | # Set this to the previous model version 10 | regression-model==0.1.0 -------------------------------------------------------------------------------- /packages/ml_api/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url=${PIP_EXTRA_INDEX_URL} 2 | 3 | # api 4 | flask==1.0.2 5 | 6 | # schema validation 7 | marshmallow==2.17.0 8 | 9 | # Install from gemfury 10 | regression-model==1.0.0 11 | neural_network_model==0.1.1 12 | 13 | # Deployment 14 | gunicorn==19.9.0 -------------------------------------------------------------------------------- /packages/ml_api/run.py: -------------------------------------------------------------------------------- 1 | from api.app import create_app 2 | from api.config import DevelopmentConfig, ProductionConfig 3 | 4 | 5 | application = create_app( 6 | config_object=ProductionConfig) 7 | 8 | 9 | if __name__ == '__main__': 10 | application.run() 11 | -------------------------------------------------------------------------------- /packages/ml_api/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export IS_DEBUG=${DEBUG:-false} 3 | exec gunicorn --bind 0.0.0.0:5000 --access-logfile - --error-logfile - run:application -------------------------------------------------------------------------------- /packages/ml_api/test_data_predictions.csv: -------------------------------------------------------------------------------- 1 | ,predictions,version 2 | 0,143988.30704997465,0.2.0 3 | 1,116598.08159580332,0.2.0 4 | 2,130128.90560814076,0.2.0 5 | 3,113470.10675716968,0.2.0 6 | 4,159022.48121448176,0.2.0 7 | 5,139861.32732907546,0.2.0 8 | 6,227118.89767805065,0.2.0 9 | 7,91953.99400144782,0.2.0 10 | 8,225573.26579772323,0.2.0 11 | 9,125802.8602526304,0.2.0 12 | 10,137481.49149643493,0.2.0 13 | 11,124990.09839895074,0.2.0 14 | 12,133270.15609091,0.2.0 15 | 13,192143.4530280595,0.2.0 16 | 14,123206.5594461486,0.2.0 17 | 15,201801.77975634683,0.2.0 18 | 16,198027.98470170778,0.2.0 19 | 17,185664.94305866087,0.2.0 20 | 18,146728.39264190392,0.2.0 21 | 19,152443.1572738422,0.2.0 22 | 20,197054.58979409203,0.2.0 23 | 21,146781.9115319493,0.2.0 24 | 22,138838.0050135225,0.2.0 25 | 23,259997.45200360558,0.2.0 26 | 24,220904.18524276977,0.2.0 27 | 25,162760.6578114075,0.2.0 28 | 26,81622.7760115488,0.2.0 29 | 27,104671.50728326188,0.2.0 30 | 28,129551.38264993431,0.2.0 31 | 29,95446.01639989471,0.2.0 32 | 30,129507.4444341237,0.2.0 33 | 31,95477.93516568728,0.2.0 34 | 32,129422.6043698834,0.2.0 35 | 33,128062.38086640426,0.2.0 36 | 34,123419.71922835958,0.2.0 37 | 35,128318.94350485185,0.2.0 38 | 36,207431.6698047325,0.2.0 39 | 37,174685.92854135018,0.2.0 40 | 38,204544.1513220886,0.2.0 41 | 39,188046.15280301377,0.2.0 42 | 40,182971.78532877663,0.2.0 43 | 41,70097.27238622728,0.2.0 44 | 42,110733.2059471847,0.2.0 45 | 43,93994.92500037784,0.2.0 46 | 44,252924.35745892464,0.2.0 47 | 45,214641.99038515135,0.2.0 48 | 46,154979.9669243978,0.2.0 49 | 47,160810.80098181101,0.2.0 50 | 48,230690.236786167,0.2.0 51 | 
49,196243.15614263792,0.2.0 52 | 50,177792.5604951465,0.2.0 53 | 51,150956.42632815256,0.2.0 54 | 52,168211.15880784288,0.2.0 55 | 53,158387.31855224012,0.2.0 56 | 54,114339.5601018531,0.2.0 57 | 55,90052.36198593948,0.2.0 58 | 56,89964.45949954129,0.2.0 59 | 57,98668.89304456668,0.2.0 60 | 58,121518.86270978909,0.2.0 61 | 59,134198.59781615838,0.2.0 62 | 60,163434.02753944616,0.2.0 63 | 61,135542.55508479764,0.2.0 64 | 62,141825.43043982252,0.2.0 65 | 63,227613.38755000453,0.2.0 66 | 64,188761.60830094197,0.2.0 67 | 65,116489.4563051063,0.2.0 68 | 66,167327.47818717395,0.2.0 69 | 67,183019.80781626955,0.2.0 70 | 68,263704.159135985,0.2.0 71 | 69,194109.36377179576,0.2.0 72 | 70,300262.7532032975,0.2.0 73 | 71,223004.09657281314,0.2.0 74 | 72,229985.38944263826,0.2.0 75 | 73,184172.20037350367,0.2.0 76 | 74,188222.84233142118,0.2.0 77 | 75,188097.29339417908,0.2.0 78 | 76,172331.10498565168,0.2.0 79 | 77,174886.6907641111,0.2.0 80 | 78,201441.14534017237,0.2.0 81 | 79,178852.47480584026,0.2.0 82 | 80,225286.87493988863,0.2.0 83 | 81,186618.03844702366,0.2.0 84 | 82,253907.81542043414,0.2.0 85 | 83,240359.90484464006,0.2.0 86 | 84,238601.0921535284,0.2.0 87 | 85,177935.77765021168,0.2.0 88 | 86,162057.79394455065,0.2.0 89 | 87,163514.64562596226,0.2.0 90 | 88,133002.50357947565,0.2.0 91 | 89,126285.82757075419,0.2.0 92 | 90,114122.89197558099,0.2.0 93 | 91,118965.43322308766,0.2.0 94 | 92,107820.17501469971,0.2.0 95 | 93,107672.41260124673,0.2.0 96 | 94,161142.56666974662,0.2.0 97 | 95,155175.112064241,0.2.0 98 | 96,159626.62056220102,0.2.0 99 | 97,159289.85166702382,0.2.0 100 | 98,164753.43823200595,0.2.0 101 | 99,130441.66184067688,0.2.0 102 | 100,150115.21843697876,0.2.0 103 | 101,363780.0225506806,0.2.0 104 | 102,330017.780544809,0.2.0 105 | 103,331883.3191102819,0.2.0 106 | 104,406837.5511403465,0.2.0 107 | 105,292997.10969063273,0.2.0 108 | 106,306609.27632288035,0.2.0 109 | 107,329626.60615839734,0.2.0 110 | 108,311532.52238578524,0.2.0 111 | 109,302589.7805774104,0.2.0 112 | 110,313113.53389941505,0.2.0 113 | 111,255492.2795391536,0.2.0 114 | 112,348040.2630000232,0.2.0 115 | 113,286215.77612206567,0.2.0 116 | 114,257811.3774942191,0.2.0 117 | 115,219056.33504400466,0.2.0 118 | 116,221072.9009001751,0.2.0 119 | 117,227272.5447635412,0.2.0 120 | 118,389000.9584031945,0.2.0 121 | 119,333081.2372066048,0.2.0 122 | 120,301748.2795090072,0.2.0 123 | 121,268886.605541231,0.2.0 124 | 122,292214.7783535345,0.2.0 125 | 123,218893.10534405566,0.2.0 126 | 124,198679.87790616706,0.2.0 127 | 125,198256.12319179106,0.2.0 128 | 126,203810.58008877232,0.2.0 129 | 127,200888.22351579432,0.2.0 130 | 128,208173.15639542375,0.2.0 131 | 129,208236.64492513813,0.2.0 132 | 130,204263.56750308358,0.2.0 133 | 131,194016.82016564548,0.2.0 134 | 132,247220.62121392722,0.2.0 135 | 133,186454.85767170336,0.2.0 136 | 134,183808.3284633914,0.2.0 137 | 135,184105.97903285234,0.2.0 138 | 136,239209.89605894414,0.2.0 139 | 137,184218.80235097196,0.2.0 140 | 138,307821.6280329202,0.2.0 141 | 139,309780.2215794851,0.2.0 142 | 140,250051.75088695402,0.2.0 143 | 141,264234.36472344183,0.2.0 144 | 142,238517.39539507058,0.2.0 145 | 143,253639.64599699862,0.2.0 146 | 144,266777.25555390265,0.2.0 147 | 145,249262.33173072065,0.2.0 148 | 146,354687.6212203011,0.2.0 149 | 147,211718.31772737036,0.2.0 150 | 148,208112.29103266165,0.2.0 151 | 149,269063.04990015837,0.2.0 152 | 150,232554.7387626751,0.2.0 153 | 151,267547.16223942576,0.2.0 154 | 152,259496.4322217068,0.2.0 155 | 153,254987.37388475015,0.2.0 156 | 
154,213297.22522688,0.2.0 157 | 155,209521.4853124122,0.2.0 158 | 156,168400.4848772304,0.2.0 159 | 157,168269.52494463106,0.2.0 160 | 158,138015.7063444789,0.2.0 161 | 159,197692.7497359191,0.2.0 162 | 160,210792.23068435694,0.2.0 163 | 161,160895.21637656086,0.2.0 164 | 162,129967.65699942572,0.2.0 165 | 163,148887.7470968613,0.2.0 166 | 164,189032.60710901304,0.2.0 167 | 165,206354.3720483368,0.2.0 168 | 166,170625.45360343822,0.2.0 169 | 167,161155.2832590772,0.2.0 170 | 168,177241.4857453312,0.2.0 171 | 169,152617.9750132888,0.2.0 172 | 170,164767.3082372813,0.2.0 173 | 171,121689.0145099861,0.2.0 174 | 172,114755.20351999925,0.2.0 175 | 173,109385.54490451732,0.2.0 176 | 174,115908.28531894127,0.2.0 177 | 175,127297.15226141199,0.2.0 178 | 176,111687.7144642378,0.2.0 179 | 177,250341.40946203517,0.2.0 180 | 178,231747.51470786144,0.2.0 181 | 179,273940.75455758354,0.2.0 182 | 180,223840.72800951728,0.2.0 183 | 181,207683.72914446727,0.2.0 184 | 182,185613.50839666792,0.2.0 185 | 183,195932.25270587756,0.2.0 186 | 184,248138.38057655803,0.2.0 187 | 185,188290.29546011682,0.2.0 188 | 186,210444.7210381098,0.2.0 189 | 187,205928.18597414377,0.2.0 190 | 188,210044.0320203481,0.2.0 191 | 189,156787.38785618285,0.2.0 192 | 190,149779.3462459088,0.2.0 193 | 191,222254.2913941949,0.2.0 194 | 192,117338.5782329264,0.2.0 195 | 193,144956.37156722017,0.2.0 196 | 194,190502.7599290919,0.2.0 197 | 195,176058.9300745161,0.2.0 198 | 196,113437.17520996452,0.2.0 199 | 197,113005.87286210393,0.2.0 200 | 198,148396.4974016323,0.2.0 201 | 199,155111.51255427708,0.2.0 202 | 200,160895.4088655705,0.2.0 203 | 201,146811.64156366416,0.2.0 204 | 202,161697.96498210484,0.2.0 205 | 203,175408.29205737467,0.2.0 206 | 204,119486.7853118973,0.2.0 207 | 205,155735.2535739763,0.2.0 208 | 206,161732.25789945782,0.2.0 209 | 207,186302.28474718594,0.2.0 210 | 208,126314.40090076534,0.2.0 211 | 209,161489.29160402366,0.2.0 212 | 210,142192.79730554653,0.2.0 213 | 211,125295.79760954925,0.2.0 214 | 212,133726.54674477206,0.2.0 215 | 213,131402.58297528428,0.2.0 216 | 214,147256.8448434014,0.2.0 217 | 215,130042.3601888925,0.2.0 218 | 216,126109.99661525768,0.2.0 219 | 217,104028.06280588396,0.2.0 220 | 218,139015.86204044707,0.2.0 221 | 219,123915.67823516048,0.2.0 222 | 220,178112.6718654715,0.2.0 223 | 221,125873.4394256058,0.2.0 224 | 222,94911.69337443665,0.2.0 225 | 223,137426.63537243495,0.2.0 226 | 224,110144.45586689096,0.2.0 227 | 225,119424.4928970573,0.2.0 228 | 226,149432.93149379385,0.2.0 229 | 227,163081.24792773716,0.2.0 230 | 228,72754.84825273752,0.2.0 231 | 229,107008.00619034276,0.2.0 232 | 230,97026.69480171583,0.2.0 233 | 231,176624.72236581342,0.2.0 234 | 232,136815.75834336376,0.2.0 235 | 233,136527.98103527437,0.2.0 236 | 234,149254.9171475344,0.2.0 237 | 235,127404.15185928933,0.2.0 238 | 236,150150.4110071018,0.2.0 239 | 237,122947.21890337647,0.2.0 240 | 238,123038.56391694587,0.2.0 241 | 239,106055.04206900226,0.2.0 242 | 240,133737.62620695255,0.2.0 243 | 241,127761.33500718801,0.2.0 244 | 242,148651.3511288533,0.2.0 245 | 243,150394.04939898496,0.2.0 246 | 244,137871.15589031755,0.2.0 247 | 245,137889.2545253325,0.2.0 248 | 246,135021.79176355613,0.2.0 249 | 247,132212.93368155853,0.2.0 250 | 248,132394.6589172383,0.2.0 251 | 249,116451.46796853734,0.2.0 252 | 250,132045.77239979545,0.2.0 253 | 251,93828.92317256187,0.2.0 254 | 252,98304.79957463636,0.2.0 255 | 253,116592.62783055207,0.2.0 256 | 254,98723.66631722648,0.2.0 257 | 255,70121.22021310769,0.2.0 258 | 256,97709.23487001589,0.2.0 
259 | 257,117883.99993469544,0.2.0 260 | 258,145026.28625503322,0.2.0 261 | 259,153912.57618886943,0.2.0 262 | 260,93381.08729006874,0.2.0 263 | 261,123495.69496267234,0.2.0 264 | 262,151217.31007381002,0.2.0 265 | 263,70925.4220942242,0.2.0 266 | 264,134164.7860642941,0.2.0 267 | 265,137115.50773650245,0.2.0 268 | 266,112454.46885682318,0.2.0 269 | 267,113576.35603796394,0.2.0 270 | 268,126311.04816994928,0.2.0 271 | 269,130853.87341430226,0.2.0 272 | 270,134365.47254085648,0.2.0 273 | 271,149331.816504544,0.2.0 274 | 272,113846.4490674583,0.2.0 275 | 273,127309.62370143532,0.2.0 276 | 274,138936.11004121447,0.2.0 277 | 275,126773.14110750334,0.2.0 278 | 276,118674.20763474096,0.2.0 279 | 277,94732.55765810968,0.2.0 280 | 278,115042.27875631058,0.2.0 281 | 279,97413.63757181565,0.2.0 282 | 280,125103.21858739002,0.2.0 283 | 281,127112.78156168538,0.2.0 284 | 282,100712.28345775318,0.2.0 285 | 283,123435.94852302536,0.2.0 286 | 284,146777.37991798244,0.2.0 287 | 285,141324.91303095603,0.2.0 288 | 286,147015.62617541858,0.2.0 289 | 287,182059.49685921244,0.2.0 290 | 288,66635.70748853082,0.2.0 291 | 289,113133.7345902136,0.2.0 292 | 290,115399.86396709623,0.2.0 293 | 291,142613.97712567318,0.2.0 294 | 292,122675.88261778199,0.2.0 295 | 293,128951.35723355877,0.2.0 296 | 294,159633.68071362676,0.2.0 297 | 295,163672.2859152473,0.2.0 298 | 296,200101.77128067127,0.2.0 299 | 297,166260.33914041193,0.2.0 300 | 298,150329.84339014755,0.2.0 301 | 299,140794.76572322496,0.2.0 302 | 300,166102.833620058,0.2.0 303 | 301,140183.19131161584,0.2.0 304 | 302,257819.0508760762,0.2.0 305 | 303,257819.0508760762,0.2.0 306 | 304,257819.0508760762,0.2.0 307 | 305,297489.40422482847,0.2.0 308 | 306,288713.0465842733,0.2.0 309 | 307,238840.80382128613,0.2.0 310 | 308,264054.2118258276,0.2.0 311 | 309,214038.27040784762,0.2.0 312 | 310,216541.14163119273,0.2.0 313 | 311,251482.14382697808,0.2.0 314 | 312,201302.78506297944,0.2.0 315 | 313,221418.6030263962,0.2.0 316 | 314,143245.9627266626,0.2.0 317 | 315,195099.27104358346,0.2.0 318 | 316,194957.58888827328,0.2.0 319 | 317,196553.0339968338,0.2.0 320 | 318,209163.81006532238,0.2.0 321 | 319,137593.75834543034,0.2.0 322 | 320,139886.56269297737,0.2.0 323 | 321,224462.0649769455,0.2.0 324 | 322,249722.4606197197,0.2.0 325 | 323,196221.2726508532,0.2.0 326 | 324,200883.07978660773,0.2.0 327 | 325,236876.5404898464,0.2.0 328 | 326,265449.9719556491,0.2.0 329 | 327,210031.52797804037,0.2.0 330 | 328,250335.16327422266,0.2.0 331 | 329,193702.5517580212,0.2.0 332 | 330,113345.66683243777,0.2.0 333 | 331,141908.87717126816,0.2.0 334 | 332,98061.70102934526,0.2.0 335 | 333,122961.05363435802,0.2.0 336 | 334,117995.15041902235,0.2.0 337 | 335,134068.9122846434,0.2.0 338 | 336,122607.11339521343,0.2.0 339 | 337,128632.12690453106,0.2.0 340 | 338,130665.06200115388,0.2.0 341 | 339,181867.81868509538,0.2.0 342 | 340,172320.99427457084,0.2.0 343 | 341,163115.13448378997,0.2.0 344 | 342,142692.95549842576,0.2.0 345 | 343,204336.63049215134,0.2.0 346 | 344,151865.2725254776,0.2.0 347 | 345,187999.9387459913,0.2.0 348 | 346,153898.50002741258,0.2.0 349 | 347,201370.60175011388,0.2.0 350 | 348,136260.79769104172,0.2.0 351 | 349,167661.378830941,0.2.0 352 | 350,151900.7260108396,0.2.0 353 | 351,203200.5976776774,0.2.0 354 | 352,275987.18626456213,0.2.0 355 | 353,131731.26809609786,0.2.0 356 | 354,72685.59185678526,0.2.0 357 | 355,264769.3677760745,0.2.0 358 | 356,223505.75506482823,0.2.0 359 | 357,140373.47418071458,0.2.0 360 | 358,165740.37720853413,0.2.0 361 | 
359,153501.3958318297,0.2.0 362 | 360,333345.8132030645,0.2.0 363 | 361,284907.13582157245,0.2.0 364 | 362,235976.61331734635,0.2.0 365 | 363,237331.86536503406,0.2.0 366 | 364,222571.43251950064,0.2.0 367 | 365,330547.42125199316,0.2.0 368 | 366,126425.36283381855,0.2.0 369 | 367,150931.15863895716,0.2.0 370 | 368,116973.81860226691,0.2.0 371 | 369,147483.17081444428,0.2.0 372 | 370,137775.93779758728,0.2.0 373 | 371,136213.6538169831,0.2.0 374 | 372,160855.09129555486,0.2.0 375 | 373,180999.95456004038,0.2.0 376 | 374,177875.4323401108,0.2.0 377 | 375,183722.0684301858,0.2.0 378 | 376,183394.03709605164,0.2.0 379 | 377,167171.69796713692,0.2.0 380 | 378,253008.1582497637,0.2.0 381 | 379,208356.18546752,0.2.0 382 | 380,184067.27386951286,0.2.0 383 | 381,184525.57241064525,0.2.0 384 | 382,234914.10484877022,0.2.0 385 | 383,319321.39732491894,0.2.0 386 | 384,329258.81904322456,0.2.0 387 | 385,171807.44667235087,0.2.0 388 | 386,300439.8001753106,0.2.0 389 | 387,168715.42175203658,0.2.0 390 | 388,224083.29347340713,0.2.0 391 | 389,169027.4893700393,0.2.0 392 | 390,219986.76456349975,0.2.0 393 | 391,206599.36694968113,0.2.0 394 | 392,168431.21773772905,0.2.0 395 | 393,198938.11718684685,0.2.0 396 | 394,137044.70162504562,0.2.0 397 | 395,256489.3797086342,0.2.0 398 | 396,169081.6811380493,0.2.0 399 | 397,246159.3182317069,0.2.0 400 | 398,146517.01285907425,0.2.0 401 | 399,115488.93084257792,0.2.0 402 | 400,124226.28849234067,0.2.0 403 | 401,105765.49539858926,0.2.0 404 | 402,105734.63795160982,0.2.0 405 | 403,109307.7618847266,0.2.0 406 | 404,153399.47012489414,0.2.0 407 | 405,148098.79308079585,0.2.0 408 | 406,256865.85340555105,0.2.0 409 | 407,353705.2884855737,0.2.0 410 | 408,339406.68729405693,0.2.0 411 | 409,370934.7245862843,0.2.0 412 | 410,412758.66452745936,0.2.0 413 | 411,337318.9162127192,0.2.0 414 | 412,292636.5292003634,0.2.0 415 | 413,306738.89042618143,0.2.0 416 | 414,395200.33469924616,0.2.0 417 | 415,265420.90751885757,0.2.0 418 | 416,304674.1881521481,0.2.0 419 | 417,322466.11906014563,0.2.0 420 | 418,309583.69640512683,0.2.0 421 | 419,222251.71906371377,0.2.0 422 | 420,305633.12114918296,0.2.0 423 | 421,246068.43249597988,0.2.0 424 | 422,237392.40028237563,0.2.0 425 | 423,211279.01604200783,0.2.0 426 | 424,228094.0196541859,0.2.0 427 | 425,217362.23612708444,0.2.0 428 | 426,212395.21391217507,0.2.0 429 | 427,192157.327626266,0.2.0 430 | 428,210131.93667451647,0.2.0 431 | 429,218479.26431069477,0.2.0 432 | 430,227732.65975321413,0.2.0 433 | 431,207550.8611689138,0.2.0 434 | 432,196406.28233478937,0.2.0 435 | 433,215352.46117706495,0.2.0 436 | 434,195390.69073167298,0.2.0 437 | 435,268095.89486272854,0.2.0 438 | 436,317322.5783410133,0.2.0 439 | 437,292294.5209052129,0.2.0 440 | 438,256214.48067033372,0.2.0 441 | 439,289956.5518384693,0.2.0 442 | 440,285699.6865787319,0.2.0 443 | 441,238369.04431785582,0.2.0 444 | 442,266162.84585317614,0.2.0 445 | 443,276105.07384260837,0.2.0 446 | 444,241944.78930174315,0.2.0 447 | 445,212994.50831895912,0.2.0 448 | 446,266502.50110652676,0.2.0 449 | 447,203362.7111452237,0.2.0 450 | 448,180227.73055119175,0.2.0 451 | 449,188392.39553333411,0.2.0 452 | 450,142481.50831170173,0.2.0 453 | 451,174912.95802564104,0.2.0 454 | 452,168060.24103720946,0.2.0 455 | 453,170840.3065243665,0.2.0 456 | 454,185335.0674102329,0.2.0 457 | 455,175685.71835342573,0.2.0 458 | 456,182131.57134249242,0.2.0 459 | 457,127731.04705949678,0.2.0 460 | 458,130944.89863769621,0.2.0 461 | 459,105125.80701127343,0.2.0 462 | 460,113673.41846707783,0.2.0 463 | 
461,171746.81645701104,0.2.0 464 | 462,147544.47667904384,0.2.0 465 | 463,266570.15210116236,0.2.0 466 | 464,340483.4209594863,0.2.0 467 | 465,193926.64894274823,0.2.0 468 | 466,177273.1783748505,0.2.0 469 | 467,188439.6899965548,0.2.0 470 | 468,179646.3820244513,0.2.0 471 | 469,277801.9107183519,0.2.0 472 | 470,244750.34380769494,0.2.0 473 | 471,264143.13027023565,0.2.0 474 | 472,264084.9900022445,0.2.0 475 | 473,190623.30283373612,0.2.0 476 | 474,218303.47626378198,0.2.0 477 | 475,209178.35576652727,0.2.0 478 | 476,210247.40015571192,0.2.0 479 | 477,305489.9014144604,0.2.0 480 | 478,206548.65094650167,0.2.0 481 | 479,260901.671279582,0.2.0 482 | 480,234130.08563281858,0.2.0 483 | 481,215084.1602052955,0.2.0 484 | 482,162068.0157257143,0.2.0 485 | 483,175403.3655499554,0.2.0 486 | 484,188329.78909449733,0.2.0 487 | 485,148772.6745077038,0.2.0 488 | 486,135234.48910921262,0.2.0 489 | 487,132981.35850945665,0.2.0 490 | 488,142443.15434220844,0.2.0 491 | 489,172322.6219487221,0.2.0 492 | 490,114015.40802504608,0.2.0 493 | 491,131679.82317114327,0.2.0 494 | 492,140830.26421534023,0.2.0 495 | 493,96630.01740632812,0.2.0 496 | 494,146497.76662391485,0.2.0 497 | 495,161384.411998765,0.2.0 498 | 496,122294.75296565886,0.2.0 499 | 497,187349.35839738324,0.2.0 500 | 498,139773.34125411394,0.2.0 501 | 499,151158.00827612064,0.2.0 502 | -------------------------------------------------------------------------------- /packages/ml_api/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/ml_api/tests/__init__.py -------------------------------------------------------------------------------- /packages/ml_api/tests/capture_model_predictions.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script should only be run in CI. 3 | Never run it locally or you will disrupt the 4 | differential test versioning logic. 
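The saved predictions are read back by
tests/differential_tests/test_differential.py, which compares them
against the output of the newly trained model.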
5 | """ 6 | 7 | import pandas as pd 8 | 9 | from regression_model.predict import make_prediction 10 | from regression_model.processing.data_management import load_dataset 11 | 12 | from api import config 13 | 14 | 15 | def capture_predictions() -> None: 16 | """Save the test data predictions to a CSV.""" 17 | 18 | save_file = 'test_data_predictions.csv' 19 | test_data = load_dataset(file_name='test.csv') 20 | 21 | # we take a slice with no input validation issues 22 | multiple_test_input = test_data[99:600] 23 | 24 | predictions = make_prediction(input_data=multiple_test_input) 25 | 26 | # save predictions for the test dataset 27 | predictions_df = pd.DataFrame(predictions) 28 | 29 | # hack here to save the file to the regression model 30 | # package of the repo, not the installed package 31 | predictions_df.to_csv(f'{config.PACKAGE_ROOT}/{save_file}') 32 | 33 | 34 | if __name__ == '__main__': 35 | capture_predictions() 36 | -------------------------------------------------------------------------------- /packages/ml_api/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from api.app import create_app 4 | from api.config import TestingConfig 5 | 6 | 7 | @pytest.fixture 8 | def app(): 9 | app = create_app(config_object=TestingConfig) 10 | 11 | with app.app_context(): 12 | yield app 13 | 14 | 15 | @pytest.fixture 16 | def flask_test_client(app): 17 | with app.test_client() as test_client: 18 | yield test_client 19 | -------------------------------------------------------------------------------- /packages/ml_api/tests/differential_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/ml_api/tests/differential_tests/__init__.py -------------------------------------------------------------------------------- /packages/ml_api/tests/differential_tests/test_differential.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from regression_model.config import config as model_config 4 | from regression_model.predict import make_prediction 5 | from regression_model.processing.data_management import load_dataset 6 | import pandas as pd 7 | import pytest 8 | 9 | 10 | from api import config 11 | 12 | 13 | @pytest.mark.differential 14 | def test_model_prediction_differential( 15 | *, 16 | save_file: str = 'test_data_predictions.csv'): 17 | """ 18 | This test compares the prediction result similarity of 19 | the current model with the previous model's results. 20 | """ 21 | 22 | # Given 23 | # Load the saved previous model predictions 24 | previous_model_df = pd.read_csv(f'{config.PACKAGE_ROOT}/{save_file}') 25 | previous_model_predictions = previous_model_df.predictions.values 26 | 27 | test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE) 28 | multiple_test_input = test_data[99:600] 29 | 30 | # When 31 | current_result = make_prediction(input_data=multiple_test_input) 32 | current_model_predictions = current_result.get('predictions') 33 | 34 | # Then 35 | # diff the current model vs. the old model 36 | assert len(previous_model_predictions) == len( 37 | current_model_predictions) 38 | 39 | # Perform the differential test 40 | for previous_value, current_value in zip( 41 | previous_model_predictions, current_model_predictions): 42 | 43 | # convert numpy float64 to Python float. 
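# (.item() returns the equivalent built-in Python scalar.)
# Worked example: with rel_tol=0.05, 100000 vs 104999 passes
# (diff 4999 <= 0.05 * 104999 = 5249.95), while 100000 vs 106000
# fails (diff 6000 > 0.05 * 106000 = 5300).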
44 | previous_value = previous_value.item() 45 | current_value = current_value.item() 46 | 47 | # rel_tol is the relative tolerance – it is the maximum allowed 48 | # difference between a and b, relative to the larger absolute 49 | # value of a or b. For example, to set a tolerance of 5%, pass 50 | # rel_tol=0.05. 51 | assert math.isclose(previous_value, 52 | current_value, 53 | rel_tol=model_config.ACCEPTABLE_MODEL_DIFFERENCE) 54 | -------------------------------------------------------------------------------- /packages/ml_api/tests/test_controller.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | import math 4 | import os 5 | 6 | from neural_network_model.config import config as ccn_config 7 | from regression_model import __version__ as _version 8 | from regression_model.config import config as model_config 9 | from regression_model.processing.data_management import load_dataset 10 | 11 | from api import __version__ as api_version 12 | 13 | 14 | def test_health_endpoint_returns_200(flask_test_client): 15 | # When 16 | response = flask_test_client.get('/health') 17 | 18 | # Then 19 | assert response.status_code == 200 20 | 21 | 22 | def test_version_endpoint_returns_version(flask_test_client): 23 | # When 24 | response = flask_test_client.get('/version') 25 | 26 | # Then 27 | assert response.status_code == 200 28 | response_json = json.loads(response.data) 29 | assert response_json['model_version'] == _version 30 | assert response_json['api_version'] == api_version 31 | 32 | 33 | def test_prediction_endpoint_returns_prediction(flask_test_client): 34 | # Given 35 | # Load the test data from the regression_model package 36 | # This is important as it makes it harder for the test 37 | # data versions to get confused by not spreading it 38 | # across packages. 39 | test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE) 40 | post_json = test_data[0:1].to_json(orient='records') 41 | 42 | # When 43 | response = flask_test_client.post('/v1/predict/regression', 44 | json=json.loads(post_json)) 45 | 46 | # Then 47 | assert response.status_code == 200 48 | response_json = json.loads(response.data) 49 | prediction = response_json['predictions'] 50 | response_version = response_json['version'] 51 | assert math.ceil(prediction[0]) == 112476 52 | assert response_version == _version 53 | 54 | 55 | def test_classifier_endpoint_returns_prediction(flask_test_client): 56 | # Given 57 | # Load the test data from the neural_network_model package 58 | # This is important as it makes it harder for the test 59 | # data versions to get confused by not spreading it 60 | # across packages. 
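# (ccn_config.DATA_FOLDER points at datasets/v2-plant-seedlings-dataset,
# so its parent is the datasets/ directory that also holds test_data/.)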
61 | data_dir = os.path.abspath(os.path.join(ccn_config.DATA_FOLDER, os.pardir)) 62 | test_dir = os.path.join(data_dir, 'test_data') 63 | black_grass_dir = os.path.join(test_dir, 'Black-grass') 64 | black_grass_image = os.path.join(black_grass_dir, '1.png') 65 | with open(black_grass_image, "rb") as image_file: 66 | file_bytes = image_file.read() 67 | data = dict( 68 | file=(io.BytesIO(bytearray(file_bytes)), "1.png"), ) 69 | ) 70 | 71 | # When 72 | response = flask_test_client.post('/predict/classifier', 73 | content_type='multipart/form-data', 74 | data=data) 75 | 76 | # Then 77 | assert response.status_code == 200 78 | response_json = json.loads(response.data) 79 | assert response_json['readable_predictions'] 80 | -------------------------------------------------------------------------------- /packages/ml_api/tests/test_validation.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from regression_model.config import config 4 | from regression_model.processing.data_management import load_dataset 5 | 6 | 7 | def test_prediction_endpoint_validation_200(flask_test_client): 8 | # Given 9 | # Load the test data from the regression_model package. 10 | # This is important as it makes it harder for the test 11 | # data versions to get confused by not spreading it 12 | # across packages. 13 | test_data = load_dataset(file_name=config.TESTING_DATA_FILE) 14 | post_json = test_data.to_json(orient='records') 15 | 16 | # When 17 | response = flask_test_client.post('/v1/predict/regression', 18 | json=json.loads(post_json)) 19 | 20 | # Then 21 | assert response.status_code == 200 22 | response_json = json.loads(response.data) 23 | 24 | # Check correct number of errors removed 25 | assert len(response_json.get('predictions')) + len( 26 | response_json.get('errors')) == len(test_data) 27 | -------------------------------------------------------------------------------- /packages/neural_network_model/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | include *.cfg 4 | include *.pkl 5 | recursive-include neural_network_model *.py 6 | 7 | include neural_network_model/trained_models/*.pkl 8 | include neural_network_model/trained_models/*.h5 9 | include neural_network_model/VERSION 10 | include neural_network_model/datasets/test_data/Black-grass/1.png 11 | include neural_network_model/datasets/test_data/Charlock/1.png 12 | 13 | include ./requirements.txt 14 | exclude *.log 15 | 16 | recursive-exclude * __pycache__ 17 | recursive-exclude * *.py[co] -------------------------------------------------------------------------------- /packages/neural_network_model/config.yml: -------------------------------------------------------------------------------- 1 | MODEL_NAME: ${MODEL_NAME:cnn_model} 2 | PIPELINE_NAME: ${PIPELINE_NAME:cnn_pipe} 3 | CLASSES_PATH: ${CLASSES_PATH:False} 4 | IMAGE_SIZE: ${IMAGE_SIZE:150} 5 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/VERSION: -------------------------------------------------------------------------------- 1 | 0.1.0 -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from neural_network_model.config import config 4 | 5 | 6 | with 
open(os.path.join(config.PACKAGE_ROOT, 'VERSION')) as version_file: 7 | __version__ = version_file.read().strip() 8 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/neural_network_model/neural_network_model/config/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/config/config.py: -------------------------------------------------------------------------------- 1 | # The Keras model loading function does not play well with 2 | # Pathlib at the moment, so we are using the old os module 3 | # style 4 | 5 | import os 6 | 7 | PWD = os.path.dirname(os.path.abspath(__file__)) 8 | PACKAGE_ROOT = os.path.abspath(os.path.join(PWD, '..')) 9 | DATASET_DIR = os.path.join(PACKAGE_ROOT, 'datasets') 10 | TRAINED_MODEL_DIR = os.path.join(PACKAGE_ROOT, 'trained_models') 11 | DATA_FOLDER = os.path.join(DATASET_DIR, 'v2-plant-seedlings-dataset') 12 | 13 | # MODEL PERSISTING 14 | MODEL_NAME = 'cnn_model' 15 | PIPELINE_NAME = 'cnn_pipe' 16 | CLASSES_NAME = 'classes' 17 | ENCODER_NAME = 'encoder' 18 | 19 | # MODEL FITTING 20 | IMAGE_SIZE = 150 # 50 for testing, 150 for final model 21 | BATCH_SIZE = 10 22 | EPOCHS = int(os.environ.get('EPOCHS', 1)) # 1 for testing, 10 for final model 23 | 24 | 25 | with open(os.path.join(PACKAGE_ROOT, 'VERSION')) as version_file: 26 | _version = version_file.read().strip() 27 | 28 | MODEL_FILE_NAME = f'{MODEL_NAME}_{_version}.h5' 29 | MODEL_PATH = os.path.join(TRAINED_MODEL_DIR, MODEL_FILE_NAME) 30 | 31 | PIPELINE_FILE_NAME = f'{PIPELINE_NAME}_{_version}.pkl' 32 | PIPELINE_PATH = os.path.join(TRAINED_MODEL_DIR, PIPELINE_FILE_NAME) 33 | 34 | CLASSES_FILE_NAME = f'{CLASSES_NAME}_{_version}.pkl' 35 | CLASSES_PATH = os.path.join(TRAINED_MODEL_DIR, CLASSES_FILE_NAME) 36 | 37 | ENCODER_FILE_NAME = f'{ENCODER_NAME}_{_version}.pkl' 38 | ENCODER_PATH = os.path.join(TRAINED_MODEL_DIR, ENCODER_FILE_NAME) 39 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/neural_network_model/neural_network_model/datasets/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/datasets/test_data/Black-grass/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/neural_network_model/neural_network_model/datasets/test_data/Black-grass/1.png -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/datasets/test_data/Charlock/1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/neural_network_model/neural_network_model/datasets/test_data/Charlock/1.png -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/datasets/test_data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/neural_network_model/neural_network_model/datasets/test_data/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/model.py: -------------------------------------------------------------------------------- 1 | # for the convolutional network 2 | from keras.models import Sequential 3 | from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten 4 | from keras.optimizers import Adam 5 | from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint 6 | from keras.wrappers.scikit_learn import KerasClassifier 7 | 8 | from neural_network_model.config import config 9 | 10 | 11 | def cnn_model(kernel_size=(3, 3), 12 | pool_size=(2, 2), 13 | first_filters=32, 14 | second_filters=64, 15 | third_filters=128, 16 | dropout_conv=0.3, 17 | dropout_dense=0.3, 18 | image_size=50): 19 | 20 | model = Sequential() 21 | model.add(Conv2D( 22 | first_filters, 23 | kernel_size, 24 | activation='relu', 25 | input_shape=(image_size, image_size, 3))) 26 | model.add(Conv2D(first_filters, kernel_size, activation='relu')) 27 | model.add(MaxPooling2D(pool_size=pool_size)) 28 | model.add(Dropout(dropout_conv)) 29 | 30 | model.add(Conv2D(second_filters, kernel_size, activation='relu')) 31 | model.add(Conv2D(second_filters, kernel_size, activation='relu')) 32 | model.add(MaxPooling2D(pool_size=pool_size)) 33 | model.add(Dropout(dropout_conv)) 34 | 35 | model.add(Conv2D(third_filters, kernel_size, activation='relu')) 36 | model.add(Conv2D(third_filters, kernel_size, activation='relu')) 37 | model.add(MaxPooling2D(pool_size=pool_size)) 38 | model.add(Dropout(dropout_conv)) 39 | 40 | model.add(Flatten()) 41 | model.add(Dense(256, activation="relu")) 42 | model.add(Dropout(dropout_dense)) 43 | model.add(Dense(12, activation="softmax")) 44 | 45 | model.compile(Adam(lr=0.0001), 46 | loss='categorical_crossentropy',  # multi-class (12-way softmax) target 47 | metrics=['accuracy']) 48 | 49 | return model 50 | 51 | 52 | checkpoint = ModelCheckpoint(config.MODEL_PATH, 53 | monitor='acc', 54 | verbose=1, 55 | save_best_only=True, 56 | mode='max') 57 | 58 | reduce_lr = ReduceLROnPlateau(monitor='acc', 59 | factor=0.5, 60 | patience=2, 61 | verbose=1, 62 | mode='max', 63 | min_lr=0.00001) 64 | 65 | callbacks_list = [checkpoint, reduce_lr] 66 | 67 | cnn_clf = KerasClassifier(build_fn=cnn_model, 68 | batch_size=config.BATCH_SIZE, 69 | validation_split=0.1,  # keras expects a fraction in (0, 1) 70 | epochs=config.EPOCHS, 71 | verbose=1, # progress bar - required for CI job 72 | callbacks=callbacks_list, 73 | image_size=config.IMAGE_SIZE 74 | ) 75 | 76 | 77 | if __name__ == '__main__': 78 | model = cnn_model() 79 | model.summary() 80 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/pipeline.py: -------------------------------------------------------------------------------- 1 | from sklearn.pipeline import Pipeline 2 | 3 | from neural_network_model.config import 
config 4 | from neural_network_model.processing import preprocessors as pp 5 | from neural_network_model import model 6 | 7 | 8 | pipe = Pipeline([ 9 | ('dataset', pp.CreateDataset(config.IMAGE_SIZE)), 10 | ('cnn_model', model.cnn_clf)]) 11 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/predict.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | 5 | from neural_network_model import __version__ as _version 6 | from neural_network_model.processing import data_management as dm 7 | 8 | _logger = logging.getLogger(__name__) 9 | KERAS_PIPELINE = dm.load_pipeline_keras() 10 | ENCODER = dm.load_encoder() 11 | 12 | 13 | def make_single_prediction(*, image_name: str, image_directory: str): 14 | """Make a single prediction using the saved model pipeline. 15 | 16 | Args: 17 | image_name: Filename of the image to classify 18 | image_directory: Location of the image to classify 19 | 20 | Returns 21 | Dictionary with both raw predictions and readable values. 22 | """ 23 | 24 | image_df = dm.load_single_image( 25 | data_folder=image_directory, 26 | filename=image_name) 27 | 28 | prepared_df = image_df['image'].reset_index(drop=True) 29 | _logger.info(f'received input array: {prepared_df}, ' 30 | f'filename: {image_name}') 31 | 32 | predictions = KERAS_PIPELINE.predict(prepared_df) 33 | readable_predictions = ENCODER.encoder.inverse_transform(predictions) 34 | 35 | _logger.info(f'Made prediction: {predictions}' 36 | f' with model version: {_version}') 37 | 38 | return dict(predictions=predictions, 39 | readable_predictions=readable_predictions, 40 | version=_version) 41 | 42 | 43 | def make_bulk_prediction(*, images_df: pd.Series) -> dict: 44 | """Make multiple predictions using the saved model pipeline. 45 | 46 | Currently, this function is primarily for testing purposes, 47 | allowing us to pass in a directory of images for running 48 | bulk predictions. 49 | 50 | Args: 51 | images_df: Pandas series of images 52 | 53 | Returns 54 | Dictionary with both raw predictions and their classifications. 
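Illustrative usage (image_directory here is a stand-in for a folder
of class subdirectories, as produced by the Kaggle fetch script):
images_df = dm.load_image_paths(image_directory)['image']
results = make_bulk_prediction(images_df=images_df)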
55 | """ 56 | 57 | _logger.info(f'received input df: {images_df}') 58 | 59 | predictions = KERAS_PIPELINE.predict(images_df) 60 | readable_predictions = ENCODER.encoder.inverse_transform(predictions) 61 | 62 | _logger.info(f'Made predictions: {predictions}' 63 | f' with model version: {_version}') 64 | 65 | return dict(predictions=predictions, 66 | readable_predictions=readable_predictions, 67 | version=_version) 68 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/neural_network_model/neural_network_model/processing/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/processing/data_management.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import typing as t 4 | from glob import glob 5 | from pathlib import Path 6 | 7 | import pandas as pd 8 | from keras.models import load_model 9 | from keras.wrappers.scikit_learn import KerasClassifier 10 | from sklearn.externals import joblib 11 | from sklearn.model_selection import train_test_split 12 | from sklearn.pipeline import Pipeline 13 | from sklearn.preprocessing import LabelEncoder 14 | 15 | from neural_network_model import model as m 16 | from neural_network_model.config import config 17 | 18 | _logger = logging.getLogger(__name__) 19 | 20 | 21 | def load_single_image(data_folder: str, filename: str) -> pd.DataFrame: 22 | """Makes dataframe with image path and target.""" 23 | 24 | image_df = [] 25 | 26 | # search for specific image in directory 27 | for image_path in glob(os.path.join(data_folder, f'{filename}')): 28 | tmp = pd.DataFrame([image_path, 'unknown']).T 29 | image_df.append(tmp) 30 | 31 | # concatenate the final df 32 | images_df = pd.concat(image_df, axis=0, ignore_index=True) 33 | images_df.columns = ['image', 'target'] 34 | 35 | return images_df 36 | 37 | 38 | def load_image_paths(data_folder: str) -> pd.DataFrame: 39 | """Makes dataframe with image path and target.""" 40 | 41 | images_df = [] 42 | 43 | # navigate within each folder 44 | for class_folder_name in os.listdir(data_folder): 45 | class_folder_path = os.path.join(data_folder, class_folder_name) 46 | 47 | # collect every image path 48 | for image_path in glob(os.path.join(class_folder_path, "*.png")): 49 | tmp = pd.DataFrame([image_path, class_folder_name]).T 50 | images_df.append(tmp) 51 | 52 | # concatenate the final df 53 | images_df = pd.concat(images_df, axis=0, ignore_index=True) 54 | images_df.columns = ['image', 'target'] 55 | 56 | return images_df 57 | 58 | 59 | def get_train_test_target(df: pd.DataFrame): 60 | """Split a dataset into train and test segments.""" 61 | 62 | X_train, X_test, y_train, y_test = train_test_split(df['image'], 63 | df['target'], 64 | test_size=0.20, 65 | random_state=101) 66 | 67 | X_train.reset_index(drop=True, inplace=True) 68 | X_test.reset_index(drop=True, inplace=True) 69 | 70 | y_train.reset_index(drop=True, inplace=True) 71 | y_test.reset_index(drop=True, inplace=True) 72 | 73 | return X_train, X_test, y_train, y_test 74 | 75 | 76 | def save_pipeline_keras(model) -> None: 77 | """Persist keras model to disk.""" 78 | 79 | 
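# The fitted pipeline is persisted as three artifacts: the dataset
# preprocessing step and the fitted class labels via joblib, and the
# Keras model itself via model.save() in h5 format, as shown below.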
joblib.dump(model.named_steps['dataset'], config.PIPELINE_PATH) 80 | joblib.dump(model.named_steps['cnn_model'].classes_, config.CLASSES_PATH) 81 | model.named_steps['cnn_model'].model.save(str(config.MODEL_PATH)) 82 | 83 | remove_old_pipelines( 84 | files_to_keep=[config.MODEL_FILE_NAME, config.ENCODER_FILE_NAME, 85 | config.PIPELINE_FILE_NAME, config.CLASSES_FILE_NAME]) 86 | 87 | 88 | def load_pipeline_keras() -> Pipeline: 89 | """Load a Keras Pipeline from disk.""" 90 | 91 | dataset = joblib.load(config.PIPELINE_PATH) 92 | 93 | build_model = lambda: load_model(config.MODEL_PATH) 94 | 95 | classifier = KerasClassifier(build_fn=build_model, 96 | batch_size=config.BATCH_SIZE, 97 | validation_split=0.1, 98 | epochs=config.EPOCHS, 99 | verbose=2, 100 | callbacks=m.callbacks_list, 101 | # image_size = config.IMAGE_SIZE 102 | ) 103 | 104 | classifier.classes_ = joblib.load(config.CLASSES_PATH) 105 | classifier.model = build_model() 106 | 107 | return Pipeline([ 108 | ('dataset', dataset), 109 | ('cnn_model', classifier) 110 | ]) 111 | 112 | 113 | def load_encoder() -> LabelEncoder: 114 | encoder = joblib.load(config.ENCODER_PATH) 115 | 116 | return encoder 117 | 118 | 119 | def remove_old_pipelines(*, files_to_keep: t.List[str]) -> None: 120 | """ 121 | Remove old model pipelines, models, encoders and classes. 122 | 123 | This is to ensure there is a simple one-to-one 124 | mapping between the package version and the model 125 | version to be imported and used by other applications. 126 | """ 127 | do_not_delete = files_to_keep + ['__init__.py'] 128 | for model_file in Path(config.TRAINED_MODEL_DIR).iterdir(): 129 | if model_file.name not in do_not_delete: 130 | model_file.unlink() 131 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/processing/errors.py: -------------------------------------------------------------------------------- 1 | class BaseError(Exception): 2 | """Base package error.""" 3 | 4 | 5 | class InvalidModelInputError(BaseError): 6 | """Model input contains an error.""" 7 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/processing/preprocessors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | from keras.utils import np_utils 4 | from sklearn.preprocessing import LabelEncoder 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | 7 | 8 | class TargetEncoder(BaseEstimator, TransformerMixin): 9 | 10 | def __init__(self, encoder=LabelEncoder()): 11 | self.encoder = encoder 12 | 13 | def fit(self, X, y=None): 14 | # note that X is the target in this case 15 | self.encoder.fit(X) 16 | return self 17 | 18 | def transform(self, X): 19 | X = X.copy() 20 | X = np_utils.to_categorical(self.encoder.transform(X)) 21 | return X 22 | 23 | 24 | def _im_resize(df, n, image_size): 25 | im = cv2.imread(df[n]) 26 | im = cv2.resize(im, (image_size, image_size)) 27 | return im 28 | 29 | 30 | class CreateDataset(BaseEstimator, TransformerMixin): 31 | 32 | def __init__(self, image_size=50): 33 | self.image_size = image_size 34 | 35 | def fit(self, X, y=None): 36 | return self 37 | 38 | def transform(self, X): 39 | X = X.copy() 40 | tmp = np.zeros((len(X), 41 | self.image_size, 42 | self.image_size, 3), dtype='float32') 43 | 44 | for n in range(0, len(X)): 45 | im = _im_resize(X, n, self.image_size) 46 | tmp[n] = im 47 | 48 | print('Dataset 
Images shape: {} size: {:,}'.format( 49 | tmp.shape, tmp.size)) 50 | return tmp 51 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/train_pipeline.py: -------------------------------------------------------------------------------- 1 | from sklearn.externals import joblib 2 | 3 | from neural_network_model import pipeline as pipe 4 | from neural_network_model.config import config 5 | from neural_network_model.processing import data_management as dm 6 | from neural_network_model.processing import preprocessors as pp 7 | 8 | 9 | def run_training(save_result: bool = True): 10 | """Train a Convolutional Neural Network.""" 11 | 12 | images_df = dm.load_image_paths(config.DATA_FOLDER) 13 | X_train, X_test, y_train, y_test = dm.get_train_test_target(images_df) 14 | 15 | enc = pp.TargetEncoder() 16 | enc.fit(y_train) 17 | y_train = enc.transform(y_train) 18 | 19 | pipe.pipe.fit(X_train, y_train) 20 | 21 | if save_result: 22 | joblib.dump(enc, config.ENCODER_PATH) 23 | dm.save_pipeline_keras(pipe.pipe) 24 | 25 | 26 | if __name__ == '__main__': 27 | run_training(save_result=True) 28 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/trained_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/neural_network_model/neural_network_model/trained_models/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/requirements.txt: -------------------------------------------------------------------------------- 1 | # production requirements 2 | pandas==0.23.4 3 | numpy==1.13.3 4 | scikit-learn==0.19.0 5 | Keras==2.1.3 6 | opencv-python==4.0.0.21 7 | h5py==2.9.0 8 | Theano==0.9.0 9 | 10 | # packaging 11 | setuptools==40.6.3 12 | wheel==0.32.3 13 | 14 | # testing requirements 15 | pytest==4.0.2 16 | 17 | # fetching datasets 18 | kaggle==1.5.1.1 -------------------------------------------------------------------------------- /packages/neural_network_model/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import io 5 | import os 6 | from pathlib import Path 7 | 8 | from setuptools import find_packages, setup 9 | 10 | 11 | # Package meta-data. 12 | NAME = 'neural_network_model' 13 | DESCRIPTION = 'Train and deploy neural network model.' 14 | URL = 'your github project' 15 | EMAIL = 'your_email@email.com' 16 | AUTHOR = 'Your name' 17 | REQUIRES_PYTHON = '>=3.6.0' 18 | 19 | 20 | # What packages are required for this module to be executed? 21 | def list_reqs(fname='requirements.txt'): 22 | with open(fname) as fd: 23 | return fd.read().splitlines() 24 | 25 | 26 | # The rest you shouldn't have to touch too much :) 27 | # ------------------------------------------------ 28 | # Except, perhaps the License and Trove Classifiers! 29 | # If you do change the License, remember to change the 30 | # Trove Classifier for that! 31 | 32 | here = os.path.abspath(os.path.dirname(__file__)) 33 | 34 | # Import the README and use it as the long-description. 35 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 
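# (If README.md is absent, the FileNotFoundError handler below falls
# back to the short DESCRIPTION.)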
36 | try: 37 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: 38 | long_description = '\n' + f.read() 39 | except FileNotFoundError: 40 | long_description = DESCRIPTION 41 | 42 | 43 | # Load the package's __version__.py module as a dictionary. 44 | ROOT_DIR = Path(__file__).resolve().parent 45 | PACKAGE_DIR = ROOT_DIR / NAME 46 | about = {} 47 | with open(PACKAGE_DIR / 'VERSION') as f: 48 | _version = f.read().strip() 49 | about['__version__'] = _version 50 | 51 | 52 | # Where the magic happens: 53 | setup( 54 | name=NAME, 55 | version=about['__version__'], 56 | description=DESCRIPTION, 57 | long_description=long_description, 58 | long_description_content_type='text/markdown', 59 | author=AUTHOR, 60 | author_email=EMAIL, 61 | python_requires=REQUIRES_PYTHON, 62 | url=URL, 63 | packages=find_packages(exclude=('tests',)), 64 | package_data={'neural_network_model': ['VERSION']}, 65 | install_requires=list_reqs(), 66 | extras_require={}, 67 | include_package_data=True, 68 | license='MIT', 69 | classifiers=[ 70 | # Trove classifiers 71 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 72 | 'License :: OSI Approved :: MIT License', 73 | 'Programming Language :: Python', 74 | 'Programming Language :: Python :: 3', 75 | 'Programming Language :: Python :: 3.6', 76 | 'Programming Language :: Python :: Implementation :: CPython', 77 | 'Programming Language :: Python :: Implementation :: PyPy' 78 | ], 79 | ) 80 | -------------------------------------------------------------------------------- /packages/neural_network_model/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/neural_network_model/tests/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | 4 | from neural_network_model.config import config 5 | 6 | 7 | @pytest.fixture 8 | def black_grass_dir(): 9 | test_data_dir = os.path.join(config.DATASET_DIR, 'test_data') 10 | black_grass_dir = os.path.join(test_data_dir, 'Black-grass') 11 | 12 | return black_grass_dir 13 | 14 | 15 | @pytest.fixture 16 | def charlock_dir(): 17 | test_data_dir = os.path.join(config.DATASET_DIR, 'test_data') 18 | charlock_dir = os.path.join(test_data_dir, 'Charlock') 19 | 20 | return charlock_dir 21 | -------------------------------------------------------------------------------- /packages/neural_network_model/tests/test_predict.py: -------------------------------------------------------------------------------- 1 | from neural_network_model import __version__ as _version 2 | from neural_network_model.predict import (make_single_prediction) 3 | 4 | 5 | def test_make_prediction_on_sample(charlock_dir): 6 | # Given 7 | filename = '1.png' 8 | expected_classification = 'Charlock' 9 | 10 | # When 11 | results = make_single_prediction(image_directory=charlock_dir, 12 | image_name=filename) 13 | 14 | # Then 15 | assert results['predictions'] is not None 16 | assert results['readable_predictions'][0] == expected_classification 17 | assert results['version'] == _version 18 | -------------------------------------------------------------------------------- /packages/regression_model/MANIFEST.in: 
-------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | include *.cfg 4 | include *.pkl 5 | recursive-include regression_model * 6 | 7 | include regression_model/datasets/train.csv 8 | include regression_model/datasets/test.csv 9 | include regression_model/trained_models/*.pkl 10 | include regression_model/VERSION 11 | 12 | include ./requirements.txt 13 | exclude *.log 14 | 15 | recursive-exclude * __pycache__ 16 | recursive-exclude * *.py[co] -------------------------------------------------------------------------------- /packages/regression_model/regression_model/VERSION: -------------------------------------------------------------------------------- 1 | 1.0.0 -------------------------------------------------------------------------------- /packages/regression_model/regression_model/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from regression_model.config import config 5 | from regression_model.config import logging_config 6 | 7 | 8 | # Configure logger for use in package 9 | logger = logging.getLogger(__name__) 10 | logger.setLevel(logging.DEBUG) 11 | logger.addHandler(logging_config.get_console_handler()) 12 | logger.propagate = False 13 | 14 | 15 | with open(os.path.join(config.PACKAGE_ROOT, 'VERSION')) as version_file: 16 | __version__ = version_file.read().strip() 17 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/regression_model/regression_model/config/__init__.py -------------------------------------------------------------------------------- /packages/regression_model/regression_model/config/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | 4 | import regression_model 5 | 6 | import pandas as pd 7 | 8 | 9 | pd.options.display.max_rows = 10 10 | pd.options.display.max_columns = 10 11 | 12 | 13 | PACKAGE_ROOT = pathlib.Path(regression_model.__file__).resolve().parent 14 | TRAINED_MODEL_DIR = PACKAGE_ROOT / 'trained_models' 15 | DATASET_DIR = PACKAGE_ROOT / 'datasets' 16 | 17 | # data 18 | TESTING_DATA_FILE = 'test.csv' 19 | TRAINING_DATA_FILE = 'train.csv' 20 | TARGET = 'SalePrice' 21 | 22 | 23 | # variables 24 | FEATURES = ['MSSubClass', 'MSZoning', 'Neighborhood', 25 | 'OverallQual', 'OverallCond', 'YearRemodAdd', 26 | 'RoofStyle', 'MasVnrType', 'BsmtQual', 'BsmtExposure', 27 | 'HeatingQC', 'CentralAir', '1stFlrSF', 'GrLivArea', 28 | 'BsmtFullBath', 'KitchenQual', 'Fireplaces', 'FireplaceQu', 29 | 'GarageType', 'GarageFinish', 'GarageCars', 'PavedDrive', 30 | 'LotFrontage', 31 | # this one is only to calculate temporal variable: 32 | 'YrSold'] 33 | 34 | # this variable is to calculate the temporal variable, 35 | # can be dropped afterwards 36 | DROP_FEATURES = 'YrSold' 37 | 38 | # numerical variables with NA in train set 39 | NUMERICAL_VARS_WITH_NA = ['LotFrontage'] 40 | 41 | # categorical variables with NA in train set 42 | CATEGORICAL_VARS_WITH_NA = ['MasVnrType', 'BsmtQual', 'BsmtExposure', 43 | 'FireplaceQu', 'GarageType', 'GarageFinish'] 44 | 45 | TEMPORAL_VARS = 'YearRemodAdd' 46 | 47 | # variables to log transform 48 | 
NUMERICALS_LOG_VARS = ['LotFrontage', '1stFlrSF', 'GrLivArea'] 49 | 50 | # categorical variables to encode 51 | CATEGORICAL_VARS = ['MSZoning', 'Neighborhood', 'RoofStyle', 'MasVnrType', 52 | 'BsmtQual', 'BsmtExposure', 'HeatingQC', 'CentralAir', 53 | 'KitchenQual', 'FireplaceQu', 'GarageType', 'GarageFinish', 54 | 'PavedDrive'] 55 | 56 | NUMERICAL_NA_NOT_ALLOWED = [ 57 | feature for feature in FEATURES 58 | if feature not in CATEGORICAL_VARS + NUMERICAL_VARS_WITH_NA 59 | ] 60 | 61 | CATEGORICAL_NA_NOT_ALLOWED = [ 62 | feature for feature in CATEGORICAL_VARS 63 | if feature not in CATEGORICAL_VARS_WITH_NA 64 | ] 65 | 66 | 67 | PIPELINE_NAME = 'lasso_regression' 68 | PIPELINE_SAVE_FILE = f'{PIPELINE_NAME}_output_v' 69 | 70 | # used for differential testing 71 | ACCEPTABLE_MODEL_DIFFERENCE = 0.05 72 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/config/logging_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from logging.handlers import TimedRotatingFileHandler 3 | import os 4 | import sys 5 | 6 | from regression_model.config import config 7 | 8 | # Multiple calls to logging.getLogger('someLogger') return a 9 | # reference to the same logger object. This is true not only 10 | # within the same module, but also across modules as long as 11 | # it is in the same Python interpreter process. 12 | 13 | FORMATTER = logging.Formatter( 14 | "%(asctime)s — %(name)s — %(levelname)s — " 15 | "%(funcName)s:%(lineno)d — %(message)s") 16 | 17 | 18 | def get_console_handler(): 19 | console_handler = logging.StreamHandler(sys.stdout) 20 | console_handler.setFormatter(FORMATTER) 21 | return console_handler 22 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/regression_model/regression_model/datasets/__init__.py -------------------------------------------------------------------------------- /packages/regression_model/regression_model/pipeline.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import Lasso 2 | from sklearn.pipeline import Pipeline 3 | from sklearn.preprocessing import MinMaxScaler 4 | 5 | from regression_model.processing import preprocessors as pp 6 | from regression_model.processing import features 7 | from regression_model.config import config 8 | 9 | import logging 10 | 11 | 12 | _logger = logging.getLogger(__name__) 13 | 14 | 15 | price_pipe = Pipeline( 16 | [ 17 | ('categorical_imputer', 18 | pp.CategoricalImputer(variables=config.CATEGORICAL_VARS_WITH_NA)), 19 | ('numerical_imputer', 20 | pp.NumericalImputer(variables=config.NUMERICAL_VARS_WITH_NA)), 21 | ('temporal_variable', 22 | pp.TemporalVariableEstimator( 23 | variables=config.TEMPORAL_VARS, 24 | reference_variable=config.DROP_FEATURES)), 25 | ('rare_label_encoder', 26 | pp.RareLabelCategoricalEncoder( 27 | tol=0.01, 28 | variables=config.CATEGORICAL_VARS)), 29 | ('categorical_encoder', 30 | pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS)), 31 | ('log_transformer', 32 | features.LogTransformer(variables=config.NUMERICALS_LOG_VARS)), 33 | ('drop_features', 34 | 
pp.DropUnecessaryFeatures(variables_to_drop=config.DROP_FEATURES)), 35 | ('scaler', MinMaxScaler()), 36 | ('Linear_model', Lasso(alpha=0.005, random_state=0)) 37 | ] 38 | ) 39 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/predict.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from regression_model.processing.data_management import load_pipeline 5 | from regression_model.config import config 6 | from regression_model.processing.validation import validate_inputs 7 | from regression_model import __version__ as _version 8 | 9 | import logging 10 | import typing as t 11 | 12 | 13 | _logger = logging.getLogger(__name__) 14 | 15 | pipeline_file_name = f'{config.PIPELINE_SAVE_FILE}{_version}.pkl' 16 | _price_pipe = load_pipeline(file_name=pipeline_file_name) 17 | 18 | 19 | def make_prediction(*, input_data: t.Union[pd.DataFrame, dict], 20 | ) -> dict: 21 | """Make a prediction using a saved model pipeline. 22 | 23 | Args: 24 | input_data: Array of model prediction inputs. 25 | 26 | Returns: 27 | Predictions for each input row, as well as the model version. 28 | """ 29 | 30 | data = pd.DataFrame(input_data) 31 | validated_data = validate_inputs(input_data=data) 32 | 33 | prediction = _price_pipe.predict(validated_data[config.FEATURES]) 34 | 35 | output = np.exp(prediction) 36 | 37 | results = {'predictions': output, 'version': _version} 38 | 39 | _logger.info( 40 | f'Making predictions with model version: {_version} ' 41 | f'Inputs: {validated_data} ' 42 | f'Predictions: {results}') 43 | 44 | return results 45 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/regression_model/regression_model/processing/__init__.py -------------------------------------------------------------------------------- /packages/regression_model/regression_model/processing/data_management.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.externals import joblib 3 | from sklearn.pipeline import Pipeline 4 | 5 | from regression_model.config import config 6 | from regression_model import __version__ as _version 7 | 8 | import logging 9 | import typing as t 10 | 11 | 12 | _logger = logging.getLogger(__name__) 13 | 14 | 15 | def load_dataset(*, file_name: str 16 | ) -> pd.DataFrame: 17 | _data = pd.read_csv(f'{config.DATASET_DIR}/{file_name}') 18 | return _data 19 | 20 | 21 | def save_pipeline(*, pipeline_to_persist) -> None: 22 | """Persist the pipeline. 23 | 24 | Saves the versioned model, and overwrites any previous 25 | saved models. This ensures that when the package is 26 | published, there is only one trained model that can be 27 | called, and we know exactly how it was built. 
28 | """ 29 | 30 | # Prepare versioned save file name 31 | save_file_name = f'{config.PIPELINE_SAVE_FILE}{_version}.pkl' 32 | save_path = config.TRAINED_MODEL_DIR / save_file_name 33 | 34 | remove_old_pipelines(files_to_keep=[save_file_name]) 35 | joblib.dump(pipeline_to_persist, save_path) 36 | _logger.info(f'saved pipeline: {save_file_name}') 37 | 38 | 39 | def load_pipeline(*, file_name: str 40 | ) -> Pipeline: 41 | """Load a persisted pipeline.""" 42 | 43 | file_path = config.TRAINED_MODEL_DIR / file_name 44 | trained_model = joblib.load(filename=file_path) 45 | return trained_model 46 | 47 | 48 | def remove_old_pipelines(*, files_to_keep: t.List[str]) -> None: 49 | """ 50 | Remove old model pipelines. 51 | 52 | This is to ensure there is a simple one-to-one 53 | mapping between the package version and the model 54 | version to be imported and used by other applications. 55 | However, we do also include the immediate previous 56 | pipeline version for differential testing purposes. 57 | """ 58 | do_not_delete = files_to_keep + ['__init__.py'] 59 | for model_file in config.TRAINED_MODEL_DIR.iterdir(): 60 | if model_file.name not in do_not_delete: 61 | model_file.unlink() 62 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/processing/errors.py: -------------------------------------------------------------------------------- 1 | class BaseError(Exception): 2 | """Base package error.""" 3 | 4 | 5 | class InvalidModelInputError(BaseError): 6 | """Model input contains an error.""" 7 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/processing/features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.base import BaseEstimator, TransformerMixin 3 | 4 | from regression_model.processing.errors import InvalidModelInputError 5 | 6 | 7 | class LogTransformer(BaseEstimator, TransformerMixin): 8 | """Logarithm transformer.""" 9 | 10 | def __init__(self, variables=None): 11 | if not isinstance(variables, list): 12 | self.variables = [variables] 13 | else: 14 | self.variables = variables 15 | 16 | def fit(self, X, y=None): 17 | # to accommodate the pipeline 18 | return self 19 | 20 | def transform(self, X): 21 | X = X.copy() 22 | 23 | # check that the values are strictly positive before the log transform 24 | if not (X[self.variables] > 0).all().all(): 25 | vars_ = [var for var in self.variables if (X[var] <= 0).any()] 26 | raise InvalidModelInputError( 27 | f"Variables contain zero or negative values, " 28 | f"can't apply log for vars: {vars_}") 29 | 30 | for feature in self.variables: 31 | X[feature] = np.log(X[feature]) 32 | 33 | return X 34 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/processing/preprocessors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | 5 | from regression_model.processing.errors import InvalidModelInputError 6 | 7 | 8 | class CategoricalImputer(BaseEstimator, TransformerMixin): 9 | """Categorical data missing value imputer.""" 10 | 11 | def __init__(self, variables=None) -> None: 12 | if not isinstance(variables, list): 13 | self.variables = [variables] 14 | else: 15 | self.variables = variables 16 | 17 | def fit(self, X: pd.DataFrame, y: 
pd.Series = None 18 | ) -> 'CategoricalImputer': 19 | """Fit statement to accommodate the sklearn pipeline.""" 20 | 21 | return self 22 | 23 | def transform(self, X: pd.DataFrame) -> pd.DataFrame: 24 | """Apply the transforms to the dataframe.""" 25 | 26 | X = X.copy() 27 | for feature in self.variables: 28 | X[feature] = X[feature].fillna('Missing') 29 | 30 | return X 31 | 32 | 33 | class NumericalImputer(BaseEstimator, TransformerMixin): 34 | """Numerical missing value imputer.""" 35 | 36 | def __init__(self, variables=None): 37 | if not isinstance(variables, list): 38 | self.variables = [variables] 39 | else: 40 | self.variables = variables 41 | 42 | def fit(self, X, y=None): 43 | # persist mode in a dictionary 44 | self.imputer_dict_ = {} 45 | for feature in self.variables: 46 | self.imputer_dict_[feature] = X[feature].mode()[0] 47 | return self 48 | 49 | def transform(self, X): 50 | X = X.copy() 51 | for feature in self.variables: 52 | X[feature].fillna(self.imputer_dict_[feature], inplace=True) 53 | return X 54 | 55 | 56 | class TemporalVariableEstimator(BaseEstimator, TransformerMixin): 57 | """Temporal variable calculator.""" 58 | 59 | def __init__(self, variables=None, reference_variable=None): 60 | if not isinstance(variables, list): 61 | self.variables = [variables] 62 | else: 63 | self.variables = variables 64 | 65 | self.reference_variables = reference_variable 66 | 67 | def fit(self, X, y=None): 68 | # we need this step to fit the sklearn pipeline 69 | return self 70 | 71 | def transform(self, X): 72 | X = X.copy() 73 | for feature in self.variables: 74 | X[feature] = X[self.reference_variables] - X[feature] 75 | 76 | return X 77 | 78 | 79 | class RareLabelCategoricalEncoder(BaseEstimator, TransformerMixin): 80 | """Rare label categorical encoder.""" 81 | 82 | def __init__(self, tol=0.05, variables=None): 83 | self.tol = tol 84 | if not isinstance(variables, list): 85 | self.variables = [variables] 86 | else: 87 | self.variables = variables 88 | 89 | def fit(self, X, y=None): 90 | # persist frequent labels in dictionary 91 | self.encoder_dict_ = {} 92 | 93 | for var in self.variables: 94 | # the encoder will learn the most frequent categories 95 | t = pd.Series(X[var].value_counts() / float(len(X))) 96 | # frequent labels: 97 | self.encoder_dict_[var] = list(t[t >= self.tol].index) 98 | 99 | return self 100 | 101 | def transform(self, X): 102 | X = X.copy() 103 | for feature in self.variables: 104 | X[feature] = np.where(X[feature].isin( 105 | self.encoder_dict_[feature]), X[feature], 'Rare') 106 | 107 | return X 108 | 109 | 110 | class CategoricalEncoder(BaseEstimator, TransformerMixin): 111 | """String to numbers categorical encoder.""" 112 | 113 | def __init__(self, variables=None): 114 | if not isinstance(variables, list): 115 | self.variables = [variables] 116 | else: 117 | self.variables = variables 118 | 119 | def fit(self, X, y): 120 | temp = pd.concat([X, y], axis=1) 121 | temp.columns = list(X.columns) + ['target'] 122 | 123 | # persist transforming dictionary 124 | self.encoder_dict_ = {} 125 | 126 | for var in self.variables: 127 | t = temp.groupby([var])['target'].mean().sort_values( 128 | ascending=True).index 129 | self.encoder_dict_[var] = {k: i for i, k in enumerate(t, 0)} 130 | 131 | return self 132 | 133 | def transform(self, X): 134 | # encode labels 135 | X = X.copy() 136 | for feature in self.variables: 137 | X[feature] = X[feature].map(self.encoder_dict_[feature]) 138 | 139 | # check if transformer introduces NaN 140 | if 
X[self.variables].isnull().any().any(): 141 | null_counts = X[self.variables].isnull().any() 142 | vars_ = {key: value for (key, value) in null_counts.items() 143 | if value is True} 144 | raise InvalidModelInputError( 145 | f'Categorical encoder has introduced NaN when ' 146 | f'transforming categorical variables: {list(vars_.keys())}') 147 | 148 | return X 149 | 150 | 151 | class DropUnecessaryFeatures(BaseEstimator, TransformerMixin): 152 | 153 | def __init__(self, variables_to_drop=None): 154 | self.variables = variables_to_drop 155 | 156 | def fit(self, X, y=None): 157 | return self 158 | 159 | def transform(self, X): 160 | # drop unused features 161 | X = X.copy() 162 | X = X.drop(self.variables, axis=1) 163 | 164 | return X 165 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/processing/validation.py: -------------------------------------------------------------------------------- 1 | from regression_model.config import config 2 | 3 | import pandas as pd 4 | 5 | 6 | def validate_inputs(input_data: pd.DataFrame) -> pd.DataFrame: 7 | """Check model inputs for unprocessable values.""" 8 | 9 | validated_data = input_data.copy() 10 | 11 | # check for numerical variables with NA not seen during training 12 | if input_data[config.NUMERICAL_NA_NOT_ALLOWED].isnull().any().any(): 13 | validated_data = validated_data.dropna( 14 | axis=0, subset=config.NUMERICAL_NA_NOT_ALLOWED) 15 | 16 | # check for categorical variables with NA not seen during training 17 | if input_data[config.CATEGORICAL_NA_NOT_ALLOWED].isnull().any().any(): 18 | validated_data = validated_data.dropna( 19 | axis=0, subset=config.CATEGORICAL_NA_NOT_ALLOWED) 20 | 21 | # drop rows with values <= 0 for the log transformed variables 22 | if (input_data[config.NUMERICALS_LOG_VARS] <= 0).any().any(): 23 | vars_with_neg_values = [var for var in config.NUMERICALS_LOG_VARS 24 | if (input_data[var] <= 0).any()] 25 | validated_data = validated_data[ 26 | (validated_data[vars_with_neg_values] > 0).all(axis=1)] 27 | 28 | return validated_data 29 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/train_pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import train_test_split 3 | 4 | from regression_model import pipeline 5 | from regression_model.processing.data_management import ( 6 | load_dataset, save_pipeline) 7 | from regression_model.config import config 8 | from regression_model import __version__ as _version 9 | 10 | import logging 11 | 12 | 13 | _logger = logging.getLogger(__name__) 14 | 15 | 16 | def run_training() -> None: 17 | """Train the model.""" 18 | 19 | # read training data 20 | data = load_dataset(file_name=config.TRAINING_DATA_FILE) 21 | 22 | # divide train and test 23 | X_train, X_test, y_train, y_test = train_test_split( 24 | data[config.FEATURES], 25 | data[config.TARGET], 26 | test_size=0.1, 27 | random_state=0) # we are setting the seed here 28 | 29 | # transform the target 30 | y_train = np.log(y_train) 31 | y_test = np.log(y_test) 32 | 33 | pipeline.price_pipe.fit(X_train[config.FEATURES], 34 | y_train) 35 | 36 | _logger.info(f'saving model version: {_version}') 37 | save_pipeline(pipeline_to_persist=pipeline.price_pipe) 38 | 39 | 40 | if __name__ == '__main__': 41 | run_training() 42 | -------------------------------------------------------------------------------- 
/packages/regression_model/regression_model/trained_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/regression_model/regression_model/trained_models/__init__.py -------------------------------------------------------------------------------- /packages/regression_model/requirements.txt: -------------------------------------------------------------------------------- 1 | # production requirements 2 | numpy==1.15.4 3 | scikit-learn==0.20.2 4 | pandas==0.23.4 5 | 6 | # packaging 7 | setuptools==40.6.3 8 | wheel==0.32.3 9 | 10 | # testing requirements 11 | pytest>=4.6.6,<5.0.0 12 | 13 | # fetching datasets 14 | kaggle==1.5.1.1 15 | -------------------------------------------------------------------------------- /packages/regression_model/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import io 5 | import os 6 | from pathlib import Path 7 | 8 | from setuptools import find_packages, setup 9 | 10 | 11 | # Package meta-data. 12 | NAME = 'regression_model' 13 | DESCRIPTION = 'Train and deploy regression model.' 14 | URL = 'your github project' 15 | EMAIL = 'your_email@email.com' 16 | AUTHOR = 'Your name' 17 | REQUIRES_PYTHON = '>=3.6.0' 18 | 19 | 20 | # What packages are required for this module to be executed? 21 | def list_reqs(fname='requirements.txt'): 22 | with open(fname) as fd: 23 | return fd.read().splitlines() 24 | 25 | 26 | # The rest you shouldn't have to touch too much :) 27 | # ------------------------------------------------ 28 | # Except, perhaps the License and Trove Classifiers! 29 | # If you do change the License, remember to change the 30 | # Trove Classifier for that! 31 | 32 | here = os.path.abspath(os.path.dirname(__file__)) 33 | 34 | # Import the README and use it as the long-description. 35 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 36 | try: 37 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: 38 | long_description = '\n' + f.read() 39 | except FileNotFoundError: 40 | long_description = DESCRIPTION 41 | 42 | 43 | # Load the package's __version__.py module as a dictionary. 
44 | ROOT_DIR = Path(__file__).resolve().parent 45 | PACKAGE_DIR = ROOT_DIR / NAME 46 | about = {} 47 | with open(PACKAGE_DIR / 'VERSION') as f: 48 | _version = f.read().strip() 49 | about['__version__'] = _version 50 | 51 | 52 | # Where the magic happens: 53 | setup( 54 | name=NAME, 55 | version=about['__version__'], 56 | description=DESCRIPTION, 57 | long_description=long_description, 58 | long_description_content_type='text/markdown', 59 | author=AUTHOR, 60 | author_email=EMAIL, 61 | python_requires=REQUIRES_PYTHON, 62 | url=URL, 63 | packages=find_packages(exclude=('tests',)), 64 | package_data={'regression_model': ['VERSION']}, 65 | install_requires=list_reqs(), 66 | extras_require={}, 67 | include_package_data=True, 68 | license='MIT', 69 | classifiers=[ 70 | # Trove classifiers 71 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 72 | 'License :: OSI Approved :: MIT License', 73 | 'Programming Language :: Python', 74 | 'Programming Language :: Python :: 3', 75 | 'Programming Language :: Python :: 3.6', 76 | 'Programming Language :: Python :: Implementation :: CPython', 77 | 'Programming Language :: Python :: Implementation :: PyPy' 78 | ], 79 | ) 80 | -------------------------------------------------------------------------------- /packages/regression_model/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solegalli/deploying-machine-learning-models/e9bd617d763c040b720888b0ecf37502736c6a27/packages/regression_model/tests/__init__.py -------------------------------------------------------------------------------- /packages/regression_model/tests/test_predict.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from regression_model.predict import make_prediction 4 | from regression_model.processing.data_management import load_dataset 5 | 6 | 7 | def test_make_single_prediction(): 8 | # Given 9 | test_data = load_dataset(file_name='test.csv') 10 | single_test_input = test_data[0:1] 11 | 12 | # When 13 | subject = make_prediction(input_data=single_test_input) 14 | 15 | # Then 16 | assert subject is not None 17 | assert isinstance(subject.get('predictions')[0], float) 18 | assert math.ceil(subject.get('predictions')[0]) == 112476 19 | 20 | 21 | def test_make_multiple_predictions(): 22 | # Given 23 | test_data = load_dataset(file_name='test.csv') 24 | original_data_length = len(test_data) 25 | multiple_test_input = test_data 26 | 27 | # When 28 | subject = make_prediction(input_data=multiple_test_input) 29 | 30 | # Then 31 | assert subject is not None 32 | assert len(subject.get('predictions')) == 1451 33 | 34 | # We expect some rows to be filtered out 35 | assert len(subject.get('predictions')) != original_data_length 36 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r packages/ml_api/requirements.txt 2 | -------------------------------------------------------------------------------- /scripts/fetch_kaggle_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | kaggle competitions download -c house-prices-advanced-regression-techniques -p packages/regression_model/regression_model/datasets/ -------------------------------------------------------------------------------- /scripts/fetch_kaggle_large_dataset.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | TRAINING_DATA_URL="vbookshelf/v2-plant-seedlings-dataset" 4 | NOW=$(date) 5 | 6 | # Download and unpack the dataset, record when it was fetched, then rename the 7 | # "Shepherd’s Purse" class directory: the curly apostrophe in the original name 8 | # is awkward to handle downstream, so its images are moved to "Shepherds Purse". 9 | kaggle datasets download -d $TRAINING_DATA_URL -p packages/neural_network_model/neural_network_model/datasets/ && \ 10 | unzip packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset.zip -d packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset && \ 11 | echo $TRAINING_DATA_URL 'retrieved on:' $NOW > packages/neural_network_model/neural_network_model/datasets/training_data_reference.txt && \ 12 | mkdir -p "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherds Purse" && \ 13 | mv -v "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherd’s Purse/"* "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherds Purse" && \ 14 | rm -rf "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherd’s Purse" -------------------------------------------------------------------------------- /scripts/input_test.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "Id": 1461, 3 | "MSSubClass": 20, 4 | "MSZoning": "RH", 5 | "LotFrontage": 80.0, 6 | "LotArea": 11622, 7 | "Street": "Pave", 8 | "Alley": null, 9 | "LotShape": "Reg", 10 | "LandContour": "Lvl", 11 | "Utilities": "AllPub", 12 | "LotConfig": "Inside", 13 | "LandSlope": "Gtl", 14 | "Neighborhood": "NAmes", 15 | "Condition1": "Feedr", 16 | "Condition2": "Norm", 17 | "BldgType": "1Fam", 18 | "HouseStyle": "1Story", 19 | "OverallQual": 5, 20 | "OverallCond": 6, 21 | "YearBuilt": 1961, 22 | "YearRemodAdd": 1961, 23 | "RoofStyle": "Gable", 24 | "RoofMatl": "CompShg", 25 | "Exterior1st": "VinylSd", 26 | "Exterior2nd": "VinylSd", 27 | "MasVnrType": "None", 28 | "MasVnrArea": 0.0, 29 | "ExterQual": "TA", 30 | "ExterCond": "TA", 31 | "Foundation": "CBlock", 32 | "BsmtQual": "TA", 33 | "BsmtCond": "TA", 34 | "BsmtExposure": "No", 35 | "BsmtFinType1": "Rec", 36 | "BsmtFinSF1": 468.0, 37 | "BsmtFinType2": "LwQ", 38 | "BsmtFinSF2": 144.0, 39 | "BsmtUnfSF": 270.0, 40 | "TotalBsmtSF": 882.0, 41 | "Heating": "GasA", 42 | "HeatingQC": "TA", 43 | "CentralAir": "Y", 44 | "Electrical": "SBrkr", 45 | "1stFlrSF": 896, 46 | "2ndFlrSF": 0, 47 | "LowQualFinSF": 0, 48 | "GrLivArea": 896, 49 | "BsmtFullBath": 0.0, 50 | "BsmtHalfBath": 0.0, 51 | "FullBath": 1, 52 | "HalfBath": 0, 53 | "BedroomAbvGr": 2, 54 | "KitchenAbvGr": 1, 55 | "KitchenQual": "TA", 56 | "TotRmsAbvGrd": 5, 57 | "Functional": "Typ", 58 | "Fireplaces": 0, 59 | "FireplaceQu": null, 60 | "GarageType": "Attchd", 61 | "GarageYrBlt": 1961.0, 62 | "GarageFinish": "Unf", 63 | "GarageCars": 1.0, 64 | "GarageArea": 730.0, 65 | "GarageQual": "TA", 66 | "GarageCond": "TA", 67 | "PavedDrive": "Y", 68 | "WoodDeckSF": 140, 69 | "OpenPorchSF": 0, 70 | "EnclosedPorch": 0, 71 | "3SsnPorch": 0, 72 | "ScreenPorch": 120, 73 | "PoolArea": 0, 74 | "PoolQC": null, 75 | "Fence": "MnPrv", 76 | "MiscFeature": null, 77 | "MiscVal": 0, 78 | "MoSold": 6, 79 | "YrSold": 2010, 80 | "SaleType": "WD", 81 | "SaleCondition": "Normal" 82 | }] -------------------------------------------------------------------------------- /scripts/publish_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Building packages and uploading them to a Gemfury repository
4 | 5 | GEMFURY_URL=$GEMFURY_PUSH_URL 6 | 7 | set -e 8 | 9 | DIRS="$@" 10 | BASE_DIR=$(pwd) 11 | SETUP="setup.py" 12 | 13 | warn() { 14 |     echo "$@" 1>&2 15 | } 16 | 17 | die() { 18 |     warn "$@" 19 |     exit 1 20 | } 21 | 22 | build() { 23 |     # strip any trailing slash from the directory argument 24 |     DIR="${1/%\//}" 25 |     echo "Checking directory $DIR" 26 |     cd "$BASE_DIR/$DIR" 27 |     [ ! -e "$SETUP" ] && warn "No $SETUP file, skipping" && return 28 |     PACKAGE_NAME=$(python "$SETUP" --fullname) 29 |     echo "Package $PACKAGE_NAME" 30 |     python "$SETUP" sdist bdist_wheel || die "Building package $PACKAGE_NAME failed" 31 |     for X in dist/* 32 |     do 33 |         curl -F package=@"$X" "$GEMFURY_URL" || die "Uploading package $PACKAGE_NAME failed on file $X" 34 |     done 35 | } 36 | 37 | # publish the directories given on the command line, or every top-level 38 | # directory containing a setup.py when none are given 39 | if [ -n "$DIRS" ]; then 40 |     for dir in $DIRS; do 41 |         build "$dir" 42 |     done 43 | else 44 |     ls -d */ | while read dir; do 45 |         build "$dir" 46 |     done 47 | fi --------------------------------------------------------------------------------
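scripts/input_test.json holds a single Kaggle test row in the shape the ML API expects, which makes it handy for smoke-testing a locally running server. A minimal sketch in Python — the host, port, and route below are placeholders for illustration only (the actual prediction endpoint is defined in packages/ml_api/api/controller.py), and the third-party requests package is assumed to be installed:

    import json

    import requests  # third-party HTTP client, assumed installed

    with open('scripts/input_test.json') as f:
        payload = json.load(f)

    # NOTE: the URL is a hypothetical placeholder; check api/controller.py
    # for the prediction route the Flask app actually exposes
    response = requests.post('http://localhost:5000/v1/predict/regression',
                             json=payload)
    print(response.status_code, response.json())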