├── .circleci └── config.yml ├── .dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── assignment-section-05 ├── MANIFEST.in ├── README.md ├── classification_model │ ├── VERSION │ ├── __init__.py │ ├── config.yml │ ├── config │ │ ├── __init__.py │ │ └── core.py │ ├── datasets │ │ └── __init__.py │ ├── pipeline.py │ ├── predict.py │ ├── processing │ │ ├── __init__.py │ │ ├── data_manager.py │ │ ├── features.py │ │ └── validation.py │ ├── train_pipeline.py │ └── trained_models │ │ └── __init__.py ├── mypy.ini ├── pyproject.toml ├── requirements │ ├── requirements.txt │ ├── test_requirements.txt │ └── typing_requirements.txt ├── setup.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_features.py │ └── test_prediction.py └── tox.ini ├── packages ├── ml_api │ ├── VERSION │ ├── api │ │ ├── __init__.py │ │ ├── app.py │ │ ├── config.py │ │ ├── controller.py │ │ └── validation.py │ ├── diff_test_requirements.txt │ ├── requirements.txt │ ├── run.py │ ├── run.sh │ ├── test_data_predictions.csv │ ├── tests │ │ ├── __init__.py │ │ ├── capture_model_predictions.py │ │ ├── conftest.py │ │ ├── differential_tests │ │ │ ├── __init__.py │ │ │ └── test_differential.py │ │ ├── test_controller.py │ │ └── test_validation.py │ └── tox.ini ├── neural_network_model │ ├── MANIFEST.in │ ├── config.yml │ ├── neural_network_model │ │ ├── VERSION │ │ ├── __init__.py │ │ ├── config │ │ │ ├── __init__.py │ │ │ └── config.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ └── test_data │ │ │ │ ├── Black-grass │ │ │ │ └── 1.png │ │ │ │ ├── Charlock │ │ │ │ └── 1.png │ │ │ │ └── __init__.py │ │ ├── model.py │ │ ├── pipeline.py │ │ ├── predict.py │ │ ├── processing │ │ │ ├── __init__.py │ │ │ ├── data_management.py │ │ │ ├── errors.py │ │ │ └── preprocessors.py │ │ ├── train_pipeline.py │ │ └── trained_models │ │ │ └── __init__.py │ ├── requirements.txt │ ├── setup.py │ └── tests │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── test_predict.py └── regression_model │ ├── MANIFEST.in │ ├── regression_model │ ├── VERSION │ ├── __init__.py │ ├── config │ │ ├── __init__.py │ │ ├── config.py │ │ └── logging_config.py │ ├── datasets │ │ └── __init__.py │ ├── pipeline.py │ ├── predict.py │ ├── processing │ │ ├── __init__.py │ │ ├── data_management.py │ │ ├── errors.py │ │ ├── features.py │ │ ├── preprocessors.py │ │ └── validation.py │ ├── train_pipeline.py │ └── trained_models │ │ └── __init__.py │ ├── requirements.txt │ ├── setup.py │ ├── tests │ ├── __init__.py │ └── test_predict.py │ └── tox.ini ├── scripts ├── fetch_kaggle_dataset.sh ├── fetch_kaggle_large_dataset.sh ├── input_test.json └── publish_model.sh ├── section-04-research-and-development ├── 01-machine-learning-pipeline-data-analysis.ipynb ├── 02-machine-learning-pipeline-feature-engineering.ipynb ├── 03-machine-learning-pipeline-feature-selection.ipynb ├── 04-machine-learning-pipeline-model-training.ipynb ├── 05-machine-learning-pipeline-scoring-new-data.ipynb ├── 06-feature-engineering-with-open-source.ipynb ├── 07-feature-engineering-pipeline.ipynb ├── 08-final-machine-learning-pipeline.ipynb ├── preprocessors.py ├── preprocessors_bonus.py ├── requirements.txt └── titanic-assignment │ ├── 01-predicting-survival-titanic-assignement.ipynb │ ├── 02-predicting-survival-titanic-solution.ipynb │ ├── 03-titanic-survival-pipeline-assignment.ipynb │ └── 04-titanic-survival-pipeline-solution.ipynb ├── section-05-production-model-package ├── MANIFEST.in ├── mypy.ini ├── pyproject.toml ├── regression_model │ ├── VERSION │ ├── __init__.py │ 
├── config.yml │ ├── config │ │ ├── __init__.py │ │ └── core.py │ ├── datasets │ │ └── __init__.py │ ├── pipeline.py │ ├── predict.py │ ├── processing │ │ ├── __init__.py │ │ ├── data_manager.py │ │ ├── features.py │ │ └── validation.py │ ├── train_pipeline.py │ └── trained_models │ │ └── __init__.py ├── requirements │ ├── requirements.txt │ ├── test_requirements.txt │ └── typing_requirements.txt ├── setup.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_features.py │ └── test_prediction.py └── tox.ini ├── section-06-model-serving-api └── house-prices-api │ ├── Procfile │ ├── app │ ├── __init__.py │ ├── api.py │ ├── config.py │ ├── main.py │ ├── schemas │ │ ├── __init__.py │ │ ├── health.py │ │ └── predict.py │ └── tests │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── test_api.py │ ├── mypy.ini │ ├── requirements.txt │ ├── test_requirements.txt │ ├── tox.ini │ └── typing_requirements.txt ├── section-07-ci-and-publishing ├── house-prices-api │ ├── Procfile │ ├── app │ │ ├── __init__.py │ │ ├── api.py │ │ ├── config.py │ │ ├── main.py │ │ ├── schemas │ │ │ ├── __init__.py │ │ │ ├── health.py │ │ │ └── predict.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── conftest.py │ │ │ └── test_api.py │ ├── mypy.ini │ ├── requirements.txt │ ├── test_requirements.txt │ ├── tox.ini │ └── typing_requirements.txt └── model-package │ ├── MANIFEST.in │ ├── mypy.ini │ ├── publish_model.sh │ ├── pyproject.toml │ ├── regression_model │ ├── VERSION │ ├── __init__.py │ ├── config.yml │ ├── config │ │ ├── __init__.py │ │ └── core.py │ ├── datasets │ │ └── __init__.py │ ├── pipeline.py │ ├── predict.py │ ├── processing │ │ ├── __init__.py │ │ ├── data_manager.py │ │ ├── features.py │ │ └── validation.py │ ├── train_pipeline.py │ └── trained_models │ │ └── __init__.py │ ├── requirements │ ├── requirements.txt │ ├── test_requirements.txt │ └── typing_requirements.txt │ ├── setup.py │ ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_features.py │ └── test_prediction.py │ └── tox.ini └── section-08-deploying-with-containers ├── .dockerignore ├── Dockerfile └── house-prices-api ├── Procfile ├── app ├── __init__.py ├── api.py ├── config.py ├── main.py ├── schemas │ ├── __init__.py │ ├── health.py │ └── predict.py └── tests │ ├── __init__.py │ ├── conftest.py │ └── test_api.py ├── mypy.ini ├── requirements.txt ├── run.sh ├── test_requirements.txt ├── tox.ini └── typing_requirements.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | jupyter_notebooks* 2 | */env* 3 | */venv* 4 | .circleci* 5 | packages/regression_model 6 | *.env 7 | *.log 8 | .git 9 | .gitignore -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | env39/ 89 | env311/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # pycharm 109 | .idea/ 110 | 111 | # datafiles 112 | packages/regression_model/regression_model/datasets/*.csv 113 | packages/regression_model/regression_model/datasets/*.zip 114 | packages/regression_model/regression_model/datasets/*.txt 115 | train.csv 116 | test.csv 117 | raw.csv 118 | data_description.txt 119 | house-prices-advanced-regression-techniques.zip 120 | sample_submission.csv 121 | test_data_predictions.csv 122 | v2-plant-seedlings-dataset/ 123 | v2-plant-seedlings-dataset.zip 124 | 125 | # all logs 126 | logs/ 127 | 128 | # trained models (will be created in CI) 129 | section-05-production-model-package/regression_model/trained_models/*.pkl 130 | packages/regression_model/regression_model/trained_models/*.pkl 131 | packages/neural_network_model/neural_network_model/trained_models/*.pkl 132 | packages/neural_network_model/neural_network_model/trained_models/*.h5 133 | *.h5 134 | packages/neural_network_model/neural_network_model/datasets/training_data_reference.txt 135 | *.pkl 136 | 137 | .DS_Store 138 | 139 | kaggle.json 140 | packages/ml_api/uploads/* 141 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6.4 2 | 3 | # Create the user that will run the app 4 | RUN adduser --disabled-password --gecos '' ml-api-user 5 | 6 | WORKDIR /opt/ml_api 7 | 8 | ARG PIP_EXTRA_INDEX_URL 9 | ENV FLASK_APP run.py 10 | 11 | # Install requirements, including from Gemfury 12 | ADD ./packages/ml_api /opt/ml_api/ 13 | RUN pip install --upgrade pip 14 | RUN pip install -r /opt/ml_api/requirements.txt 15 | 16 | RUN chmod +x /opt/ml_api/run.sh 17 | RUN chown -R ml-api-user:ml-api-user ./ 18 | 19 | USER ml-api-user 20 | 21 | EXPOSE 5000 22 | 23 | CMD ["bash", "./run.sh"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Soledad Galli and Christopher Samiullah. Deployment of Machine Learning Models, online course. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. 
Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | NAME=udemy-ml-api 2 | COMMIT_ID=$(shell git rev-parse HEAD) 3 | 4 | 5 | build-ml-api-heroku: 6 | docker build --build-arg PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -t registry.heroku.com/$(NAME)/web:$(COMMIT_ID) . 7 | 8 | push-ml-api-heroku: 9 | docker push registry.heroku.com/${HEROKU_APP_NAME}/web:$(COMMIT_ID) 10 | 11 | build-ml-api-aws: 12 | docker build --build-arg PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -t $(NAME):$(COMMIT_ID) . 13 | 14 | push-ml-api-aws: 15 | docker push ${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/$(NAME):$(COMMIT_ID) 16 | 17 | tag-ml-api: 18 | docker tag $(NAME):$(COMMIT_ID) ${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/$(NAME):$(COMMIT_ID) 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deployment of Machine Learning Models 2 | Accompanying repo for the online course Deployment of Machine Learning Models. 3 | 4 | For the documentation, visit the [course on Udemy](https://www.udemy.com/deployment-of-machine-learning-models/?couponCode=TIDREPO). 
5 | -------------------------------------------------------------------------------- /assignment-section-05/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | include *.pkl 4 | recursive-include ./classification_model/* 5 | 6 | include classification_model/datasets/train.csv 7 | include classification_model/datasets/test.csv 8 | include classification_model/trained_models/*.pkl 9 | include classification_model/VERSION 10 | include classification_model/config.yml 11 | 12 | include ./requirements/requirements.txt 13 | include ./requirements/test_requirements.txt 14 | exclude *.log 15 | exclude *.cfg 16 | 17 | recursive-exclude * __pycache__ 18 | recursive-exclude * *.py[co] -------------------------------------------------------------------------------- /assignment-section-05/README.md: -------------------------------------------------------------------------------- 1 | # Productionized Titanic Classification Model Package 2 | 3 | ## Run With Tox (Recommended) 4 | - Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl 5 | - Save the file as `raw.csv` in the classification_model/datasets directory 6 | - `pip install tox` 7 | - Make sure you are in the assignment-section-05 directory (where the tox.ini file is), then run the command: `tox` (this runs the tests and type checks, and trains the model under the hood). The first time you run this it creates a virtual env and installs 8 | dependencies, so it takes a few minutes. 9 | 10 | ## Run Without Tox 11 | - Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl 12 | - Save the file as `raw.csv` in the classification_model/datasets directory 13 | - Add the assignment-section-05 *and* classification_model paths to your system PYTHONPATH 14 | - `pip install -r requirements/test_requirements.txt` 15 | - Train the model: `python classification_model/train_pipeline.py` 16 | - Run the tests: `pytest tests` -------------------------------------------------------------------------------- /assignment-section-05/classification_model/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.1 2 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from classification_model.config.core import PACKAGE_ROOT, config 4 | 5 | # It is strongly advised that you do not add any handlers other than 6 | # NullHandler to your library’s loggers. This is because the configuration 7 | # of handlers is the prerogative of the application developer who uses your 8 | # library. The application developer knows their target audience and what 9 | # handlers are most appropriate for their application: if you add handlers 10 | # ‘under the hood’, you might well interfere with their ability to carry out 11 | # unit tests and deliver logs which suit their requirements.
12 | # https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library 13 | logging.getLogger(config.app_config.package_name).addHandler(logging.NullHandler()) 14 | 15 | 16 | with open(PACKAGE_ROOT / "VERSION") as version_file: 17 | __version__ = version_file.read().strip() 18 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/config.yml: -------------------------------------------------------------------------------- 1 | # Package Overview 2 | package_name: classification_model 3 | 4 | # Data Files 5 | raw_data_file: raw.csv 6 | training_data_file: train.csv 7 | test_data_file: test.csv 8 | 9 | # Variables 10 | # The variable we are attempting to predict (survival) 11 | target: survived 12 | 13 | pipeline_name: titanic_classification_model 14 | pipeline_save_file: titanic_classification_model_output_v 15 | 16 | features: 17 | - pclass 18 | - sex 19 | - age 20 | - sibsp 21 | - parch 22 | - fare 23 | - cabin 24 | - embarked 25 | - title # generated from name 26 | 27 | # set train/test split 28 | test_size: 0.1 29 | 30 | # to set the random seed 31 | random_state: 0 32 | 33 | unused_fields: 34 | - name 35 | - ticket 36 | - boat 37 | - body 38 | - home.dest 39 | 40 | numerical_vars: 41 | - age 42 | - fare 43 | 44 | categorical_vars: 45 | - sex 46 | - cabin 47 | - embarked 48 | - title 49 | 50 | cabin_vars: 51 | - cabin -------------------------------------------------------------------------------- /assignment-section-05/classification_model/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/assignment-section-05/classification_model/config/__init__.py -------------------------------------------------------------------------------- /assignment-section-05/classification_model/config/core.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Sequence 3 | 4 | from pydantic import BaseModel 5 | from strictyaml import YAML, load 6 | 7 | import classification_model 8 | 9 | # Project Directories 10 | PACKAGE_ROOT = Path(classification_model.__file__).resolve().parent 11 | ROOT = PACKAGE_ROOT.parent 12 | CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml" 13 | DATASET_DIR = PACKAGE_ROOT / "datasets" 14 | TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models" 15 | 16 | 17 | class AppConfig(BaseModel): 18 | """ 19 | Application-level config. 20 | """ 21 | 22 | package_name: str 23 | raw_data_file: str 24 | pipeline_save_file: str 25 | 26 | 27 | class ModelConfig(BaseModel): 28 | """ 29 | All configuration relevant to model 30 | training and feature engineering.
31 | """ 32 | 33 | target: str 34 | unused_fields: Sequence[str] 35 | features: Sequence[str] 36 | test_size: float 37 | random_state: int 38 | numerical_vars: Sequence[str] 39 | categorical_vars: Sequence[str] 40 | cabin_vars: Sequence[str] 41 | 42 | 43 | class Config(BaseModel): 44 | """Master config object.""" 45 | 46 | app_config: AppConfig 47 | model_config: ModelConfig 48 | 49 | 50 | def find_config_file() -> Path: 51 | """Locate the configuration file.""" 52 | if CONFIG_FILE_PATH.is_file(): 53 | return CONFIG_FILE_PATH 54 | raise Exception(f"Config not found at {CONFIG_FILE_PATH!r}") 55 | 56 | 57 | def fetch_config_from_yaml(cfg_path: Path = None) -> YAML: 58 | """Parse YAML containing the package configuration.""" 59 | 60 | if not cfg_path: 61 | cfg_path = find_config_file() 62 | 63 | if cfg_path: 64 | with open(cfg_path, "r") as conf_file: 65 | parsed_config = load(conf_file.read()) 66 | return parsed_config 67 | raise OSError(f"Did not find config file at path: {cfg_path}") 68 | 69 | 70 | def create_and_validate_config(parsed_config: YAML = None) -> Config: 71 | """Run validation on config values.""" 72 | if parsed_config is None: 73 | parsed_config = fetch_config_from_yaml() 74 | 75 | # specify the data attribute from the strictyaml YAML type. 76 | _config = Config( 77 | app_config=AppConfig(**parsed_config.data), 78 | model_config=ModelConfig(**parsed_config.data), 79 | ) 80 | 81 | return _config 82 | 83 | 84 | config = create_and_validate_config() 85 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/assignment-section-05/classification_model/datasets/__init__.py -------------------------------------------------------------------------------- /assignment-section-05/classification_model/pipeline.py: -------------------------------------------------------------------------------- 1 | # for encoding categorical variables 2 | from feature_engine.encoding import OneHotEncoder, RareLabelEncoder 3 | 4 | # for imputation 5 | from feature_engine.imputation import ( 6 | AddMissingIndicator, 7 | CategoricalImputer, 8 | MeanMedianImputer, 9 | ) 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.pipeline import Pipeline 12 | from sklearn.preprocessing import StandardScaler 13 | 14 | from classification_model.config.core import config 15 | from classification_model.processing.features import ExtractLetterTransformer 16 | 17 | titanic_pipe = Pipeline( 18 | [ 19 | # impute categorical variables with string missing 20 | ( 21 | "categorical_imputation", 22 | CategoricalImputer( 23 | imputation_method="missing", 24 | variables=config.model_config.categorical_vars, 25 | ), 26 | ), 27 | # add missing indicator to numerical variables 28 | ( 29 | "missing_indicator", 30 | AddMissingIndicator(variables=config.model_config.numerical_vars), 31 | ), 32 | # impute numerical variables with the median 33 | ( 34 | "median_imputation", 35 | MeanMedianImputer( 36 | imputation_method="median", variables=config.model_config.numerical_vars 37 | ), 38 | ), 39 | # Extract letter from cabin 40 | ( 41 | "extract_letter", 42 | ExtractLetterTransformer(variables=config.model_config.cabin_vars), 43 | ), 44 | # == CATEGORICAL ENCODING ====== 45 | # remove categories present in less than 5% of the 
observations (0.05) 46 | # group them in one category called 'Rare' 47 | ( 48 | "rare_label_encoder", 49 | RareLabelEncoder( 50 | tol=0.05, n_categories=1, variables=config.model_config.categorical_vars 51 | ), 52 | ), 53 | # encode categorical variables using one hot encoding into k-1 variables 54 | ( 55 | "categorical_encoder", 56 | OneHotEncoder( 57 | drop_last=True, variables=config.model_config.categorical_vars 58 | ), 59 | ), 60 | # scale 61 | ("scaler", StandardScaler()), 62 | ("Logit", LogisticRegression(C=0.0005, random_state=0)), 63 | ] 64 | ) 65 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/predict.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import pandas as pd 4 | 5 | from classification_model import __version__ as _version 6 | from classification_model.config.core import config 7 | from classification_model.processing.data_manager import load_pipeline 8 | from classification_model.processing.validation import validate_inputs 9 | 10 | pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl" 11 | _titanic_pipe = load_pipeline(file_name=pipeline_file_name) 12 | 13 | 14 | def make_prediction( 15 | *, 16 | input_data: t.Union[pd.DataFrame, dict], 17 | ) -> dict: 18 | """Make a prediction using a saved model pipeline.""" 19 | 20 | data = pd.DataFrame(input_data) 21 | validated_data, errors = validate_inputs(input_data=data) 22 | results = {"predictions": None, "version": _version, "errors": errors} 23 | 24 | if not errors: 25 | predictions = _titanic_pipe.predict( 26 | X=validated_data[config.model_config.features] 27 | ) 28 | results = { 29 | "predictions": predictions, 30 | "version": _version, 31 | "errors": errors, 32 | } 33 | 34 | return results 35 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/assignment-section-05/classification_model/processing/__init__.py -------------------------------------------------------------------------------- /assignment-section-05/classification_model/processing/data_manager.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from pathlib import Path 4 | from typing import Any, List, Union 5 | 6 | import joblib 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.pipeline import Pipeline 10 | 11 | from classification_model import __version__ as _version 12 | from classification_model.config.core import DATASET_DIR, TRAINED_MODEL_DIR, config 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | # float type for np.nan 18 | def get_first_cabin(row: Any) -> Union[str, float]: 19 | try: 20 | return row.split()[0] 21 | except AttributeError: 22 | return np.nan 23 | 24 | 25 | def get_title(passenger: str) -> str: 26 | """Extracts the title (Mr, Ms, etc) from the name variable.""" 27 | line = passenger 28 | if re.search("Mrs", line): 29 | return "Mrs" 30 | elif re.search("Mr", line): 31 | return "Mr" 32 | elif re.search("Miss", line): 33 | return "Miss" 34 | elif re.search("Master", line): 35 | return "Master" 36 | else: 37 | return "Other" 38 | 39 | 40 | def pre_pipeline_preparation(*, dataframe: 
pd.DataFrame) -> pd.DataFrame: 41 | # replace question marks with NaN values 42 | data = dataframe.replace("?", np.nan) 43 | 44 | # retain only the first cabin if more than 45 | # 1 are available per passenger 46 | data["cabin"] = data["cabin"].apply(get_first_cabin) 47 | 48 | data["title"] = data["name"].apply(get_title) 49 | 50 | # cast numerical variables as floats 51 | data["fare"] = data["fare"].astype("float") 52 | data["age"] = data["age"].astype("float") 53 | 54 | # drop unnecessary variables 55 | data.drop(labels=config.model_config.unused_fields, axis=1, inplace=True) 56 | 57 | return data 58 | 59 | 60 | def _load_raw_dataset(*, file_name: str) -> pd.DataFrame: 61 | dataframe = pd.read_csv(Path(f"{DATASET_DIR}/{file_name}")) 62 | return dataframe 63 | 64 | 65 | def load_dataset(*, file_name: str) -> pd.DataFrame: 66 | dataframe = pd.read_csv(Path(f"{DATASET_DIR}/{file_name}")) 67 | transformed = pre_pipeline_preparation(dataframe=dataframe) 68 | 69 | return transformed 70 | 71 | 72 | def save_pipeline(*, pipeline_to_persist: Pipeline) -> None: 73 | """Persist the pipeline. 74 | Saves the versioned model, and overwrites any previous 75 | saved models. This ensures that when the package is 76 | published, there is only one trained model that can be 77 | called, and we know exactly how it was built. 78 | """ 79 | 80 | # Prepare versioned save file name 81 | save_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl" 82 | save_path = TRAINED_MODEL_DIR / save_file_name 83 | 84 | remove_old_pipelines(files_to_keep=[save_file_name]) 85 | joblib.dump(pipeline_to_persist, save_path) 86 | 87 | 88 | def load_pipeline(*, file_name: str) -> Pipeline: 89 | """Load a persisted pipeline.""" 90 | 91 | file_path = TRAINED_MODEL_DIR / file_name 92 | return joblib.load(filename=file_path) 93 | 94 | 95 | def remove_old_pipelines(*, files_to_keep: List[str]) -> None: 96 | """ 97 | Remove old model pipelines. 98 | This is to ensure there is a simple one-to-one 99 | mapping between the package version and the model 100 | version to be imported and used by other applications. 
101 | """ 102 | do_not_delete = files_to_keep + ["__init__.py"] 103 | for model_file in TRAINED_MODEL_DIR.iterdir(): 104 | if model_file.name not in do_not_delete: 105 | model_file.unlink() 106 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/processing/features.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator, TransformerMixin 2 | 3 | 4 | class ExtractLetterTransformer(BaseEstimator, TransformerMixin): 5 | # Extract first letter of variable 6 | 7 | def __init__(self, variables): 8 | 9 | if not isinstance(variables, list): 10 | raise ValueError("variables should be a list") 11 | 12 | self.variables = variables 13 | 14 | def fit(self, X, y=None): 15 | # we need this step to fit the sklearn pipeline 16 | return self 17 | 18 | def transform(self, X): 19 | 20 | # so that we do not over-write the original dataframe 21 | X = X.copy() 22 | 23 | for feature in self.variables: 24 | X[feature] = X[feature].str[0] 25 | 26 | return X 27 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/processing/validation.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from pydantic import BaseModel, ValidationError 6 | 7 | from classification_model.config.core import config 8 | from classification_model.processing.data_manager import pre_pipeline_preparation 9 | 10 | 11 | def validate_inputs(*, input_data: pd.DataFrame) -> Tuple[pd.DataFrame, Optional[dict]]: 12 | """Check model inputs for unprocessable values.""" 13 | 14 | pre_processed = pre_pipeline_preparation(dataframe=input_data) 15 | validated_data = pre_processed[config.model_config.features].copy() 16 | errors = None 17 | 18 | try: 19 | # replace numpy nans so that pydantic can validate 20 | MultipleTitanicDataInputs( 21 | inputs=validated_data.replace({np.nan: None}).to_dict(orient="records") 22 | ) 23 | except ValidationError as error: 24 | errors = error.json() 25 | 26 | return validated_data, errors 27 | 28 | 29 | class TitanicDataInputSchema(BaseModel): 30 | pclass: Optional[int] 31 | name: Optional[str] 32 | sex: Optional[str] 33 | age: Optional[int] 34 | sibsp: Optional[int] 35 | parch: Optional[int] 36 | ticket: Optional[int] 37 | fare: Optional[float] 38 | cabin: Optional[str] 39 | embarked: Optional[str] 40 | boat: Optional[Union[str, int]] 41 | body: Optional[int] 42 | # TODO: rename home.dest, can get away with it now as it is not used 43 | 44 | 45 | class MultipleTitanicDataInputs(BaseModel): 46 | inputs: List[TitanicDataInputSchema] 47 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/train_pipeline.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split 2 | 3 | from classification_model.config.core import config 4 | from classification_model.pipeline import titanic_pipe 5 | from classification_model.processing.data_manager import load_dataset, save_pipeline 6 | 7 | 8 | def run_training() -> None: 9 | """ 10 | Train the model. 
11 | 12 | Training data can be found here: 13 | https://www.openml.org/data/get_csv/16826755/phpMYEkMl 14 | """ 15 | 16 | # read training data 17 | data = load_dataset(file_name=config.app_config.raw_data_file) 18 | 19 | # divide train and test 20 | X_train, X_test, y_train, y_test = train_test_split( 21 | data[config.model_config.features], # predictors 22 | data[config.model_config.target], 23 | test_size=config.model_config.test_size, 24 | # we are setting the random seed here 25 | # for reproducibility 26 | random_state=config.model_config.random_state, 27 | ) 28 | 29 | # fit model 30 | titanic_pipe.fit(X_train, y_train) 31 | 32 | # persist trained model 33 | save_pipeline(pipeline_to_persist=titanic_pipe) 34 | 35 | 36 | if __name__ == "__main__": 37 | run_training() 38 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/trained_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/assignment-section-05/classification_model/trained_models/__init__.py -------------------------------------------------------------------------------- /assignment-section-05/mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | warn_unreachable = False 3 | warn_unused_ignores = True 4 | follow_imports = skip 5 | show_error_context = True 6 | warn_incomplete_stub = True 7 | ignore_missing_imports = True 8 | check_untyped_defs = True 9 | cache_dir = /dev/null 10 | # Allow defining functions without any types. 11 | disallow_untyped_defs = False 12 | warn_redundant_casts = True 13 | warn_unused_configs = True 14 | strict_optional = True -------------------------------------------------------------------------------- /assignment-section-05/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [tool.pytest.ini_options] 9 | minversion = "2.0" 10 | addopts = "-rfEX -p pytester --strict-markers" 11 | python_files = ["test_*.py", "*_test.py"] 12 | python_classes = ["Test", "Acceptance"] 13 | python_functions = ["test"] 14 | # NOTE: "doc" is not included here, but gets tested explicitly via "doctesting". 15 | testpaths = ["tests"] 16 | xfail_strict = true 17 | filterwarnings = [ 18 | "error", 19 | "default:Using or importing the ABCs:DeprecationWarning:unittest2.*", 20 | # produced by older pyparsing<=2.2.0. 21 | "default:Using or importing the ABCs:DeprecationWarning:pyparsing.*", 22 | "default:the imp module is deprecated in favour of importlib:DeprecationWarning:nose.*", 23 | # distutils is deprecated in 3.10, scheduled for removal in 3.12 24 | "ignore:The distutils package is deprecated:DeprecationWarning", 25 | # produced by python3.6/site.py itself (3.6.7 on Travis, could not trigger it with 3.6.8)." 
26 | "ignore:.*U.*mode is deprecated:DeprecationWarning:(?!(pytest|_pytest))", 27 | # produced by pytest-xdist 28 | "ignore:.*type argument to addoption.*:DeprecationWarning", 29 | # produced on execnet (pytest-xdist) 30 | "ignore:.*inspect.getargspec.*deprecated, use inspect.signature.*:DeprecationWarning", 31 | # pytest's own futurewarnings 32 | "ignore::pytest.PytestExperimentalApiWarning", 33 | # Do not cause SyntaxError for invalid escape sequences in py37. 34 | # Those are caught/handled by pyupgrade, and not easy to filter with the 35 | # module being the filename (with .py removed). 36 | "default:invalid escape sequence:DeprecationWarning", 37 | # ignore use of unregistered marks, because we use many to test the implementation 38 | "ignore::_pytest.warning_types.PytestUnknownMarkWarning", 39 | ] 40 | 41 | [tool.black] 42 | target-version = ['py311'] 43 | 44 | [tool.isort] 45 | profile = "black" 46 | line_length = 100 47 | lines_between_sections = 1 48 | skip = "migrations" 49 | -------------------------------------------------------------------------------- /assignment-section-05/requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | # We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release) 2 | # to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small 3 | # updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes. 4 | numpy>=1.21.0,<2.0.0 5 | pandas>=1.3.5,<2.0.0 6 | pydantic>=1.8.1,<2.0.0 7 | scikit-learn>=1.1.3,<2.0.0 8 | strictyaml>=1.3.2,<2.0.0 9 | ruamel.yaml>=0.16.12,<1.0.0 10 | feature-engine>=1.0.2,<2.0.0 11 | joblib>=1.0.1,<2.0.0 -------------------------------------------------------------------------------- /assignment-section-05/requirements/test_requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | 3 | # testing requirements 4 | pytest>=7.2.0,<8.0.0 5 | -------------------------------------------------------------------------------- /assignment-section-05/requirements/typing_requirements.txt: -------------------------------------------------------------------------------- 1 | # repo maintenance tooling 2 | black>=22.12.0,<23.0.0 3 | flake8>=6.0.0,<7.0.0 4 | mypy>=0.991,<1.0.0 5 | isort>=5.11.4,<6.0.0 -------------------------------------------------------------------------------- /assignment-section-05/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | 6 | from setuptools import find_packages, setup 7 | 8 | # Package meta-data. 9 | NAME = 'tid-titanic-classification-model' 10 | DESCRIPTION = "Example Titanic dataset classification model package from Train In Data." 11 | URL = "https://github.com/trainindata/deploying-machine-learning-models" 12 | EMAIL = "christopher.samiullah@protonmail.com" 13 | AUTHOR = "ChristopherGS" 14 | REQUIRES_PYTHON = ">=3.7.0" 15 | 16 | 17 | # The rest you shouldn't have to touch too much :) 18 | # ------------------------------------------------ 19 | # Except, perhaps the License and Trove Classifiers! 20 | # Trove Classifiers: https://pypi.org/classifiers/ 21 | # If you do change the License, remember to change the 22 | # Trove Classifier for that! 
23 | long_description = DESCRIPTION 24 | 25 | # Load the package's VERSION file as a dictionary. 26 | about = {} 27 | ROOT_DIR = Path(__file__).resolve().parent 28 | REQUIREMENTS_DIR = ROOT_DIR / 'requirements' 29 | PACKAGE_DIR = ROOT_DIR / 'classification_model' 30 | with open(PACKAGE_DIR / "VERSION") as f: 31 | _version = f.read().strip() 32 | about["__version__"] = _version 33 | 34 | 35 | # What packages are required for this module to be executed? 36 | def list_reqs(fname="requirements.txt"): 37 | with open(REQUIREMENTS_DIR / fname) as fd: 38 | return fd.read().splitlines() 39 | 40 | # Where the magic happens: 41 | setup( 42 | name=NAME, 43 | version=about["__version__"], 44 | description=DESCRIPTION, 45 | long_description=long_description, 46 | long_description_content_type="text/markdown", 47 | author=AUTHOR, 48 | author_email=EMAIL, 49 | python_requires=REQUIRES_PYTHON, 50 | url=URL, 51 | packages=find_packages(exclude=("tests",)), 52 | package_data={"classification_model": ["VERSION"]}, 53 | install_requires=list_reqs(), 54 | extras_require={}, 55 | include_package_data=True, 56 | license="BSD-3", 57 | classifiers=[ 58 | # Trove classifiers 59 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 60 | "License :: OSI Approved :: BSD License", 61 | "Programming Language :: Python", 62 | "Programming Language :: Python :: 3", 63 | "Programming Language :: Python :: 3.7", 64 | "Programming Language :: Python :: 3.8", 65 | "Programming Language :: Python :: 3.9", 66 | "Programming Language :: Python :: 3.10", 67 | "Programming Language :: Python :: 3.11", 68 | "Programming Language :: Python :: Implementation :: CPython", 69 | "Programming Language :: Python :: Implementation :: PyPy", 70 | ], 71 | ) -------------------------------------------------------------------------------- /assignment-section-05/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/assignment-section-05/tests/__init__.py -------------------------------------------------------------------------------- /assignment-section-05/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pytest 4 | from sklearn.model_selection import train_test_split 5 | 6 | from classification_model.config.core import config 7 | from classification_model.processing.data_manager import _load_raw_dataset 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | @pytest.fixture 13 | def sample_input_data(): 14 | data = _load_raw_dataset(file_name=config.app_config.raw_data_file) 15 | 16 | # divide train and test 17 | X_train, X_test, y_train, y_test = train_test_split( 18 | data, # predictors 19 | data[config.model_config.target], 20 | test_size=config.model_config.test_size, 21 | # we are setting the random seed here 22 | # for reproducibility 23 | random_state=config.model_config.random_state, 24 | ) 25 | 26 | return X_test 27 | -------------------------------------------------------------------------------- /assignment-section-05/tests/test_features.py: -------------------------------------------------------------------------------- 1 | from classification_model.config.core import config 2 | from classification_model.processing.features import ExtractLetterTransformer 3 | 4 | 5 | def test_extract_letter_transformer(sample_input_data): 6 | # Given 7 | transformer =
ExtractLetterTransformer( 8 | variables=config.model_config.cabin_vars, # cabin 9 | ) 10 | assert sample_input_data["cabin"].iat[6] == "E12" 11 | 12 | # When 13 | subject = transformer.fit_transform(sample_input_data) 14 | 15 | # Then 16 | assert subject["cabin"].iat[6] == "E" 17 | -------------------------------------------------------------------------------- /assignment-section-05/tests/test_prediction.py: -------------------------------------------------------------------------------- 1 | """ 2 | Note: These tests will fail if you have not first trained the model. 3 | """ 4 | 5 | import numpy as np 6 | from sklearn.metrics import accuracy_score 7 | 8 | from classification_model.predict import make_prediction 9 | 10 | 11 | def test_make_prediction(sample_input_data): 12 | # Given 13 | expected_no_predictions = 131 14 | 15 | # When 16 | result = make_prediction(input_data=sample_input_data) 17 | 18 | # Then 19 | predictions = result.get("predictions") 20 | assert isinstance(predictions, np.ndarray) 21 | assert isinstance(predictions[0], np.int64) 22 | assert result.get("errors") is None 23 | assert len(predictions) == expected_no_predictions 24 | _predictions = list(predictions) 25 | y_true = sample_input_data["survived"] 26 | accuracy = accuracy_score(_predictions, y_true) 27 | assert accuracy > 0.7 28 | -------------------------------------------------------------------------------- /assignment-section-05/tox.ini: -------------------------------------------------------------------------------- 1 | # Tox is a generic virtualenv management and test command line tool. Its goal is to 2 | # standardize testing in Python. We will be using it extensively in this course. 3 | 4 | # Using Tox we can (on multiple operating systems): 5 | # + Eliminate PYTHONPATH challenges when running scripts/tests 6 | # + Eliminate virtualenv setup confusion 7 | # + Streamline steps such as model training, model publishing 8 | 9 | 10 | [tox] 11 | envlist = test_package, checks 12 | skipsdist = True 13 | 14 | [testenv] 15 | install_command = pip install {opts} {packages} 16 | 17 | [testenv:test_package] 18 | deps = 19 | -rrequirements/test_requirements.txt 20 | 21 | setenv = 22 | PYTHONPATH=. 
23 | PYTHONHASHSEED=0 24 | 25 | commands= 26 | python classification_model/train_pipeline.py 27 | pytest \ 28 | -s \ 29 | -vv \ 30 | {posargs:tests/} 31 | 32 | 33 | [testenv:train] 34 | envdir = {toxworkdir}/test_package 35 | 36 | deps = 37 | {[testenv:test_package]deps} 38 | 39 | setenv = 40 | {[testenv:test_package]setenv} 41 | commands= 42 | python classification_model/train_pipeline.py 43 | 44 | 45 | [testenv:checks] 46 | envdir = {toxworkdir}/checks 47 | deps = 48 | -r{toxinidir}/requirements/typing_requirements.txt 49 | commands = 50 | flake8 classification_model tests 51 | isort classification_model tests 52 | black classification_model tests 53 | {posargs:mypy classification_model} 54 | 55 | 56 | [flake8] 57 | exclude = .git,env 58 | max-line-length = 90 -------------------------------------------------------------------------------- /packages/ml_api/VERSION: -------------------------------------------------------------------------------- 1 | 0.3.0 -------------------------------------------------------------------------------- /packages/ml_api/api/__init__.py: -------------------------------------------------------------------------------- 1 | from api.config import PACKAGE_ROOT 2 | 3 | with open(PACKAGE_ROOT / 'VERSION') as version_file: 4 | __version__ = version_file.read().strip() 5 | -------------------------------------------------------------------------------- /packages/ml_api/api/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | 3 | from api.config import get_logger 4 | 5 | 6 | _logger = get_logger(logger_name=__name__) 7 | 8 | 9 | def create_app(*, config_object) -> Flask: 10 | """Create a flask app instance.""" 11 | 12 | flask_app = Flask('ml_api') 13 | flask_app.config.from_object(config_object) 14 | 15 | # import blueprints 16 | from api.controller import prediction_app 17 | flask_app.register_blueprint(prediction_app) 18 | _logger.debug('Application instance created') 19 | 20 | return flask_app 21 | -------------------------------------------------------------------------------- /packages/ml_api/api/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from logging.handlers import TimedRotatingFileHandler 3 | import pathlib 4 | import os 5 | import sys 6 | 7 | PACKAGE_ROOT = pathlib.Path(__file__).resolve().parent.parent 8 | 9 | FORMATTER = logging.Formatter( 10 | "%(asctime)s — %(name)s — %(levelname)s —" 11 | "%(funcName)s:%(lineno)d — %(message)s") 12 | LOG_DIR = PACKAGE_ROOT / 'logs' 13 | LOG_DIR.mkdir(exist_ok=True) 14 | LOG_FILE = LOG_DIR / 'ml_api.log' 15 | UPLOAD_FOLDER = PACKAGE_ROOT / 'uploads' 16 | UPLOAD_FOLDER.mkdir(exist_ok=True) 17 | 18 | ALLOWED_EXTENSIONS = set(['png', 'jpg', 'jpeg']) 19 | 20 | 21 | def get_console_handler(): 22 | console_handler = logging.StreamHandler(sys.stdout) 23 | console_handler.setFormatter(FORMATTER) 24 | return console_handler 25 | 26 | 27 | def get_file_handler(): 28 | file_handler = TimedRotatingFileHandler( 29 | LOG_FILE, when='midnight') 30 | file_handler.setFormatter(FORMATTER) 31 | file_handler.setLevel(logging.WARNING) 32 | return file_handler 33 | 34 | 35 | def get_logger(*, logger_name): 36 | """Get logger with prepared handlers.""" 37 | 38 | logger = logging.getLogger(logger_name) 39 | 40 | logger.setLevel(logging.INFO) 41 | 42 | logger.addHandler(get_console_handler()) 43 | logger.addHandler(get_file_handler()) 44 | logger.propagate = False 45 | 46 | return logger 47 | 48 | 49 | class 
Config: 50 | DEBUG = False 51 | TESTING = False 52 | CSRF_ENABLED = True 53 | SECRET_KEY = 'this-really-needs-to-be-changed' 54 | SERVER_PORT = 5000 55 | UPLOAD_FOLDER = UPLOAD_FOLDER 56 | 57 | 58 | class ProductionConfig(Config): 59 | DEBUG = False 60 | SERVER_ADDRESS = os.environ.get('SERVER_ADDRESS', '0.0.0.0') 61 | SERVER_PORT = os.environ.get('SERVER_PORT', '5000') 62 | 63 | 64 | class DevelopmentConfig(Config): 65 | DEVELOPMENT = True 66 | DEBUG = True 67 | 68 | 69 | class TestingConfig(Config): 70 | TESTING = True 71 | -------------------------------------------------------------------------------- /packages/ml_api/api/controller.py: -------------------------------------------------------------------------------- 1 | from flask import Blueprint, request, jsonify 2 | from regression_model.predict import make_prediction 3 | from regression_model import __version__ as _version 4 | from neural_network_model.predict import make_single_prediction 5 | import os 6 | from werkzeug.utils import secure_filename 7 | 8 | from api.config import get_logger, UPLOAD_FOLDER 9 | from api.validation import validate_inputs, allowed_file 10 | from api import __version__ as api_version 11 | 12 | _logger = get_logger(logger_name=__name__) 13 | 14 | 15 | prediction_app = Blueprint('prediction_app', __name__) 16 | 17 | 18 | @prediction_app.route('/health', methods=['GET']) 19 | def health(): 20 | if request.method == 'GET': 21 | _logger.info('health status OK') 22 | return 'ok' 23 | 24 | 25 | @prediction_app.route('/version', methods=['GET']) 26 | def version(): 27 | if request.method == 'GET': 28 | return jsonify({'model_version': _version, 29 | 'api_version': api_version}) 30 | 31 | 32 | @prediction_app.route('/v1/predict/regression', methods=['POST']) 33 | def predict(): 34 | if request.method == 'POST': 35 | # Step 1: Extract POST data from request body as JSON 36 | json_data = request.get_json() 37 | _logger.debug(f'Inputs: {json_data}') 38 | 39 | # Step 2: Validate the input using marshmallow schema 40 | input_data, errors = validate_inputs(input_data=json_data) 41 | 42 | # Step 3: Model prediction 43 | result = make_prediction(input_data=input_data) 44 | _logger.debug(f'Outputs: {result}') 45 | 46 | # Step 4: Convert numpy ndarray to list 47 | predictions = result.get('predictions').tolist() 48 | version = result.get('version') 49 | 50 | # Step 5: Return the response as JSON 51 | return jsonify({'predictions': predictions, 52 | 'version': version, 53 | 'errors': errors}) 54 | 55 | 56 | @prediction_app.route('/predict/classifier', methods=['POST']) 57 | def predict_image(): 58 | if request.method == 'POST': 59 | # Step 1: check if the post request has the file part 60 | if 'file' not in request.files: 61 | return jsonify('No file found'), 400 62 | 63 | file = request.files['file'] 64 | 65 | # Step 2: Basic file extension validation 66 | if file and allowed_file(file.filename): 67 | filename = secure_filename(file.filename) 68 | 69 | # Step 3: Save the file 70 | # Note, in production, this would require careful 71 | # validation, management and clean up.
72 | file.save(os.path.join(UPLOAD_FOLDER, filename)) 73 | 74 | _logger.debug(f'Inputs: {filename}') 75 | 76 | # Step 4: perform prediction 77 | result = make_single_prediction( 78 | image_name=filename, 79 | image_directory=UPLOAD_FOLDER) 80 | 81 | _logger.debug(f'Outputs: {result}') 82 | 83 | readable_predictions = result.get('readable_predictions') 84 | version = result.get('version') 85 | 86 | # Step 5: Return the response as JSON 87 | return jsonify( 88 | {'readable_predictions': readable_predictions[0], 89 | 'version': version}) 90 | -------------------------------------------------------------------------------- /packages/ml_api/diff_test_requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url=${PIP_EXTRA_INDEX_URL} 2 | 3 | # api 4 | flask>=1.1.1,<1.2.0 5 | 6 | # schema validation 7 | marshmallow==2.17.0 8 | 9 | # Set this to the previous model version 10 | regression-model==2.0.19 11 | 12 | # temporarily necessary as we update sklearn 13 | joblib>=0.14.1,<0.15.0 -------------------------------------------------------------------------------- /packages/ml_api/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url=${PIP_EXTRA_INDEX_URL} 2 | 3 | # api 4 | flask>=1.1.1,<1.2.0 5 | 6 | # schema validation 7 | marshmallow==2.17.0 8 | 9 | # Install from gemfury 10 | regression-model==2.0.20 11 | neural_network_model==0.1.1 12 | 13 | # Deployment 14 | gunicorn==19.9.0 -------------------------------------------------------------------------------- /packages/ml_api/run.py: -------------------------------------------------------------------------------- 1 | from api.app import create_app 2 | from api.config import DevelopmentConfig, ProductionConfig 3 | 4 | 5 | application = create_app( 6 | config_object=ProductionConfig) 7 | 8 | 9 | if __name__ == '__main__': 10 | application.run() 11 | -------------------------------------------------------------------------------- /packages/ml_api/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export IS_DEBUG=${DEBUG:-false} 3 | exec gunicorn --bind 0.0.0.0:5000 --access-logfile - --error-logfile - run:application -------------------------------------------------------------------------------- /packages/ml_api/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/ml_api/tests/__init__.py -------------------------------------------------------------------------------- /packages/ml_api/tests/capture_model_predictions.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script should only be run in CI. 3 | Never run it locally or you will disrupt the 4 | differential test versioning logic. 
5 | """ 6 | 7 | import pandas as pd 8 | 9 | from regression_model.predict import make_prediction 10 | from regression_model.processing.data_management import load_dataset 11 | 12 | from api import config 13 | 14 | 15 | def capture_predictions() -> None: 16 | """Save the test data predictions to a CSV.""" 17 | 18 | save_file = 'test_data_predictions.csv' 19 | test_data = load_dataset(file_name='test.csv') 20 | 21 | # we take a slice with no input validation issues 22 | multiple_test_input = test_data[99:600] 23 | 24 | predictions = make_prediction(input_data=multiple_test_input) 25 | 26 | # save predictions for the test dataset 27 | predictions_df = pd.DataFrame(predictions) 28 | 29 | # hack here to save the file to the regression model 30 | # package of the repo, not the installed package 31 | predictions_df.to_csv(f'{config.PACKAGE_ROOT}/{save_file}') 32 | 33 | 34 | if __name__ == '__main__': 35 | capture_predictions() 36 | -------------------------------------------------------------------------------- /packages/ml_api/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from api.app import create_app 4 | from api.config import TestingConfig 5 | 6 | 7 | @pytest.fixture 8 | def app(): 9 | app = create_app(config_object=TestingConfig) 10 | 11 | with app.app_context(): 12 | yield app 13 | 14 | 15 | @pytest.fixture 16 | def flask_test_client(app): 17 | with app.test_client() as test_client: 18 | yield test_client 19 | -------------------------------------------------------------------------------- /packages/ml_api/tests/differential_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/ml_api/tests/differential_tests/__init__.py -------------------------------------------------------------------------------- /packages/ml_api/tests/differential_tests/test_differential.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from regression_model.config import config as model_config 4 | from regression_model.predict import make_prediction 5 | from regression_model.processing.data_management import load_dataset 6 | import pandas as pd 7 | import pytest 8 | 9 | 10 | from api import config 11 | 12 | 13 | @pytest.mark.differential 14 | def test_model_prediction_differential( 15 | *, 16 | save_file: str = 'test_data_predictions.csv'): 17 | """ 18 | This test compares the prediction result similarity of 19 | the current model with the previous model's results. 20 | """ 21 | 22 | # Given 23 | # Load the saved previous model predictions 24 | previous_model_df = pd.read_csv(f'{config.PACKAGE_ROOT}/{save_file}') 25 | previous_model_predictions = previous_model_df.predictions.values 26 | 27 | test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE) 28 | multiple_test_input = test_data[99:600] 29 | 30 | # When 31 | current_result = make_prediction(input_data=multiple_test_input) 32 | current_model_predictions = current_result.get('predictions') 33 | 34 | # Then 35 | # diff the current model vs. the old model 36 | assert len(previous_model_predictions) == len( 37 | current_model_predictions) 38 | 39 | # Perform the differential test 40 | for previous_value, current_value in zip( 41 | previous_model_predictions, current_model_predictions): 42 | 43 | # convert numpy float64 to Python float. 
44 | previous_value = previous_value.item() 45 | current_value = current_value.item() 46 | 47 | # rel_tol is the relative tolerance – it is the maximum allowed 48 | # difference between a and b, relative to the larger absolute 49 | # value of a or b. For example, to set a tolerance of 5%, pass 50 | # rel_tol=0.05. 51 | assert math.isclose(previous_value, 52 | current_value, 53 | rel_tol=model_config.ACCEPTABLE_MODEL_DIFFERENCE) 54 | -------------------------------------------------------------------------------- /packages/ml_api/tests/test_controller.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | import math 4 | import os 5 | 6 | from neural_network_model.config import config as ccn_config 7 | from regression_model import __version__ as _version 8 | from regression_model.config import config as model_config 9 | from regression_model.processing.data_management import load_dataset 10 | 11 | from api import __version__ as api_version 12 | 13 | 14 | def test_health_endpoint_returns_200(flask_test_client): 15 | # When 16 | response = flask_test_client.get('/health') 17 | 18 | # Then 19 | assert response.status_code == 200 20 | 21 | 22 | def test_version_endpoint_returns_version(flask_test_client): 23 | # When 24 | response = flask_test_client.get('/version') 25 | 26 | # Then 27 | assert response.status_code == 200 28 | response_json = json.loads(response.data) 29 | assert response_json['model_version'] == _version 30 | assert response_json['api_version'] == api_version 31 | 32 | 33 | def test_prediction_endpoint_returns_prediction(flask_test_client): 34 | # Given 35 | # Load the test data from the regression_model package 36 | # This is important as it makes it harder for the test 37 | # data versions to get confused by not spreading it 38 | # across packages. 39 | test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE) 40 | post_json = test_data[0:1].to_json(orient='records') 41 | 42 | # When 43 | response = flask_test_client.post('/v1/predict/regression', 44 | json=json.loads(post_json)) 45 | 46 | # Then 47 | assert response.status_code == 200 48 | response_json = json.loads(response.data) 49 | prediction = response_json['predictions'] 50 | response_version = response_json['version'] 51 | assert math.ceil(prediction[0]) == 112476 52 | assert response_version == _version 53 | 54 | 55 | def test_classifier_endpoint_returns_prediction(flask_test_client): 56 | # Given 57 | # Load the test data from the neural_network_model package 58 | # This is important as it makes it harder for the test 59 | # data versions to get confused by not spreading it 60 | # across packages. 
61 | data_dir = os.path.abspath(os.path.join(ccn_config.DATA_FOLDER, os.pardir)) 62 | test_dir = os.path.join(data_dir, 'test_data') 63 | black_grass_dir = os.path.join(test_dir, 'Black-grass') 64 | black_grass_image = os.path.join(black_grass_dir, '1.png') 65 | with open(black_grass_image, "rb") as image_file: 66 | file_bytes = image_file.read() 67 | data = dict( 68 | file=(io.BytesIO(bytearray(file_bytes)), "1.png"), 69 | ) 70 | 71 | # When 72 | response = flask_test_client.post('/predict/classifier', 73 | content_type='multipart/form-data', 74 | data=data) 75 | 76 | # Then 77 | assert response.status_code == 200 78 | response_json = json.loads(response.data) 79 | assert response_json['readable_predictions'] 80 | -------------------------------------------------------------------------------- /packages/ml_api/tests/test_validation.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from regression_model.config import config 4 | from regression_model.processing.data_management import load_dataset 5 | 6 | 7 | def test_prediction_endpoint_validation_200(flask_test_client): 8 | # Given 9 | # Load the test data from the regression_model package. 10 | # This is important as it makes it harder for the test 11 | # data versions to get confused by not spreading it 12 | # across packages. 13 | test_data = load_dataset(file_name=config.TESTING_DATA_FILE) 14 | post_json = test_data.to_json(orient='records') 15 | 16 | # When 17 | response = flask_test_client.post('/v1/predict/regression', 18 | json=json.loads(post_json)) 19 | 20 | # Then 21 | assert response.status_code == 200 22 | response_json = json.loads(response.data) 23 | 24 | # Check correct number of errors removed 25 | assert len(response_json.get('predictions')) + len( 26 | response_json.get('errors')) == len(test_data) 27 | -------------------------------------------------------------------------------- /packages/ml_api/tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36, py37, py38 3 | skipsdist = True 4 | 5 | 6 | [testenv] 7 | install_command = pip install --pre {opts} {packages} 8 | deps = 9 | -rrequirements.txt 10 | 11 | passenv = 12 | PIP_EXTRA_INDEX_URL 13 | KERAS_BACKEND 14 | 15 | setenv = 16 | PYTHONPATH=. 17 | 18 | commands = 19 | pytest \ 20 | -s \ 21 | -v \ 22 | -m "not differential" \ 23 | {posargs:tests} 24 | 25 | 26 | # content of pytest.ini 27 | [pytest] 28 | markers = 29 | integration: mark a test as an integration test. 30 | differential: mark a test as a differential test. 
31 | filterwarnings = 32 | ignore::DeprecationWarning -------------------------------------------------------------------------------- /packages/neural_network_model/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | include *.cfg 4 | include *.pkl 5 | recursive-include ./neural_network_model/*.py 6 | 7 | include neural_network_model/trained_models/*.pkl 8 | include neural_network_model/trained_models/*.h5 9 | include neural_network_model/VERSION 10 | include neural_network_model/datasets/test_data/Black-grass/1.png 11 | include neural_network_model/datasets/test_data/Charlock/1.png 12 | 13 | include ./requirements.txt 14 | exclude *.log 15 | 16 | recursive-exclude * __pycache__ 17 | recursive-exclude * *.py[co] -------------------------------------------------------------------------------- /packages/neural_network_model/config.yml: -------------------------------------------------------------------------------- 1 | MODEL_NAME: ${MODEL_NAME:cnn_model} 2 | PIPELINE_NAME: ${PIPELINE_NAME:cnn_pipe} 3 | CLASSES_PATH: ${CLASSES_PATH:False} 4 | IMAGE_SIZE: ${IMAGE_SIZE:150} 5 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/VERSION: -------------------------------------------------------------------------------- 1 | 0.1.0 -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from neural_network_model.config import config 4 | 5 | 6 | with open(os.path.join(config.PACKAGE_ROOT, 'VERSION')) as version_file: 7 | __version__ = version_file.read().strip() 8 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/neural_network_model/neural_network_model/config/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/config/config.py: -------------------------------------------------------------------------------- 1 | # The Keras model loading function does not play well with 2 | # Pathlib at the moment, so we are using the old os module 3 | # style 4 | 5 | import os 6 | 7 | PWD = os.path.dirname(os.path.abspath(__file__)) 8 | PACKAGE_ROOT = os.path.abspath(os.path.join(PWD, '..')) 9 | DATASET_DIR = os.path.join(PACKAGE_ROOT, 'datasets') 10 | TRAINED_MODEL_DIR = os.path.join(PACKAGE_ROOT, 'trained_models') 11 | DATA_FOLDER = os.path.join(DATASET_DIR, 'v2-plant-seedlings-dataset') 12 | 13 | # MODEL PERSISTING 14 | MODEL_NAME = 'cnn_model' 15 | PIPELINE_NAME = 'cnn_pipe' 16 | CLASSES_NAME = 'classes' 17 | ENCODER_NAME = 'encoder' 18 | 19 | # MODEL FITTING 20 | IMAGE_SIZE = 150 # 50 for testing, 150 for final model 21 | BATCH_SIZE = 10 22 | EPOCHS = int(os.environ.get('EPOCHS', 1)) # 1 for testing, 10 for final model 23 | 24 | 25 | with open(os.path.join(PACKAGE_ROOT, 'VERSION')) as version_file: 26 | _version = version_file.read().strip() 27 | 28 | MODEL_FILE_NAME = f'{MODEL_NAME}_{_version}.h5' 29 | MODEL_PATH = os.path.join(TRAINED_MODEL_DIR, 
MODEL_FILE_NAME) 30 | 31 | PIPELINE_FILE_NAME = f'{PIPELINE_NAME}_{_version}.pkl' 32 | PIPELINE_PATH = os.path.join(TRAINED_MODEL_DIR, PIPELINE_FILE_NAME) 33 | 34 | CLASSES_FILE_NAME = f'{CLASSES_NAME}_{_version}.pkl' 35 | CLASSES_PATH = os.path.join(TRAINED_MODEL_DIR, CLASSES_FILE_NAME) 36 | 37 | ENCODER_FILE_NAME = f'{ENCODER_NAME}_{_version}.pkl' 38 | ENCODER_PATH = os.path.join(TRAINED_MODEL_DIR, ENCODER_FILE_NAME) 39 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/neural_network_model/neural_network_model/datasets/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/datasets/test_data/Black-grass/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/neural_network_model/neural_network_model/datasets/test_data/Black-grass/1.png -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/datasets/test_data/Charlock/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/neural_network_model/neural_network_model/datasets/test_data/Charlock/1.png -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/datasets/test_data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/neural_network_model/neural_network_model/datasets/test_data/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/model.py: -------------------------------------------------------------------------------- 1 | # for the convolutional network 2 | from keras.models import Sequential 3 | from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten 4 | from keras.optimizers import Adam 5 | from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint 6 | from keras.wrappers.scikit_learn import KerasClassifier 7 | 8 | from neural_network_model.config import config 9 | 10 | 11 | def cnn_model(kernel_size=(3, 3), 12 | pool_size=(2, 2), 13 | first_filters=32, 14 | second_filters=64, 15 | third_filters=128, 16 | dropout_conv=0.3, 17 | dropout_dense=0.3, 18 | image_size=50): 19 | 20 | model = Sequential() 21 | model.add(Conv2D( 22 | first_filters, 23 | kernel_size, 24 | activation='relu', 25 | input_shape=(image_size, image_size, 3))) 26 | model.add(Conv2D(first_filters, kernel_size, activation = 'relu')) 27 | model.add(MaxPooling2D(pool_size=pool_size)) 28 | model.add(Dropout(dropout_conv)) 29 | 30 | model.add(Conv2D(second_filters, kernel_size, activation='relu')) 31 | model.add(Conv2D(second_filters, kernel_size, activation ='relu')) 32 | model.add(MaxPooling2D(pool_size=pool_size)) 33 | 
model.add(Dropout(dropout_conv)) 34 | 35 | model.add(Conv2D(third_filters, kernel_size, activation='relu')) 36 | model.add(Conv2D(third_filters, kernel_size, activation ='relu')) 37 | model.add(MaxPooling2D(pool_size=pool_size)) 38 | model.add(Dropout(dropout_conv)) 39 | 40 | model.add(Flatten()) 41 | model.add(Dense(256, activation="relu")) 42 | model.add(Dropout(dropout_dense)) 43 | model.add(Dense(12, activation="softmax")) 44 | 45 | model.compile(Adam(lr=0.0001), 46 | loss='binary_crossentropy', 47 | metrics=['accuracy']) 48 | 49 | return model 50 | 51 | 52 | checkpoint = ModelCheckpoint(config.MODEL_PATH, 53 | monitor='acc', 54 | verbose=1, 55 | save_best_only=True, 56 | mode='max') 57 | 58 | reduce_lr = ReduceLROnPlateau(monitor='acc', 59 | factor=0.5, 60 | patience=2, 61 | verbose=1, 62 | mode='max', 63 | min_lr=0.00001) 64 | 65 | callbacks_list = [checkpoint, reduce_lr] 66 | 67 | cnn_clf = KerasClassifier(build_fn=cnn_model, 68 | batch_size=config.BATCH_SIZE, 69 | validation_split=10, 70 | epochs=config.EPOCHS, 71 | verbose=1, # progress bar - required for CI job 72 | callbacks=callbacks_list, 73 | image_size=config.IMAGE_SIZE 74 | ) 75 | 76 | 77 | if __name__ == '__main__': 78 | model = cnn_model() 79 | model.summary() 80 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/pipeline.py: -------------------------------------------------------------------------------- 1 | from sklearn.pipeline import Pipeline 2 | 3 | from neural_network_model.config import config 4 | from neural_network_model.processing import preprocessors as pp 5 | from neural_network_model import model 6 | 7 | 8 | pipe = Pipeline([ 9 | ('dataset', pp.CreateDataset(config.IMAGE_SIZE)), 10 | ('cnn_model', model.cnn_clf)]) 11 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/predict.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | 5 | from neural_network_model import __version__ as _version 6 | from neural_network_model.processing import data_management as dm 7 | 8 | _logger = logging.getLogger(__name__) 9 | KERAS_PIPELINE = dm.load_pipeline_keras() 10 | ENCODER = dm.load_encoder() 11 | 12 | 13 | def make_single_prediction(*, image_name: str, image_directory: str): 14 | """Make a single prediction using the saved model pipeline. 15 | 16 | Args: 17 | image_name: Filename of the image to classify 18 | image_directory: Location of the image to classify 19 | 20 | Returns 21 | Dictionary with both raw predictions and readable values. 22 | """ 23 | 24 | image_df = dm.load_single_image( 25 | data_folder=image_directory, 26 | filename=image_name) 27 | 28 | prepared_df = image_df['image'].reset_index(drop=True) 29 | _logger.info(f'received input array: {prepared_df}, ' 30 | f'filename: {image_name}') 31 | 32 | predictions = KERAS_PIPELINE.predict(prepared_df) 33 | readable_predictions = ENCODER.encoder.inverse_transform(predictions) 34 | 35 | _logger.info(f'Made prediction: {predictions}' 36 | f' with model version: {_version}') 37 | 38 | return dict(predictions=predictions, 39 | readable_predictions=readable_predictions, 40 | version=_version) 41 | 42 | 43 | def make_bulk_prediction(*, images_df: pd.Series) -> dict: 44 | """Make multiple predictions using the saved model pipeline. 
45 | 46 | Currently, this function is primarily for testing purposes, 47 | allowing us to pass in a directory of images for running 48 | bulk predictions. 49 | 50 | Args: 51 | images_df: Pandas series of images 52 | 53 | Returns 54 | Dictionary with both raw predictions and their classifications. 55 | """ 56 | 57 | _logger.info(f'received input df: {images_df}') 58 | 59 | predictions = KERAS_PIPELINE.predict(images_df) 60 | readable_predictions = ENCODER.encoder.inverse_transform(predictions) 61 | 62 | _logger.info(f'Made predictions: {predictions}' 63 | f' with model version: {_version}') 64 | 65 | return dict(predictions=predictions, 66 | readable_predictions=readable_predictions, 67 | version=_version) 68 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/neural_network_model/neural_network_model/processing/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/processing/errors.py: -------------------------------------------------------------------------------- 1 | class BaseError(Exception): 2 | """Base package error.""" 3 | 4 | 5 | class InvalidModelInputError(BaseError): 6 | """Model input contains an error.""" 7 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/processing/preprocessors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | from keras.utils import np_utils 4 | from sklearn.preprocessing import LabelEncoder 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | 7 | 8 | class TargetEncoder(BaseEstimator, TransformerMixin): 9 | 10 | def __init__(self, encoder=LabelEncoder()): 11 | self.encoder = encoder 12 | 13 | def fit(self, X, y=None): 14 | # note that x is the target in this case 15 | self.encoder.fit(X) 16 | return self 17 | 18 | def transform(self, X): 19 | X = X.copy() 20 | X = np_utils.to_categorical(self.encoder.transform(X)) 21 | return X 22 | 23 | 24 | def _im_resize(df, n, image_size): 25 | im = cv2.imread(df[n]) 26 | im = cv2.resize(im, (image_size, image_size)) 27 | return im 28 | 29 | 30 | class CreateDataset(BaseEstimator, TransformerMixin): 31 | 32 | def __init__(self, image_size=50): 33 | self.image_size = image_size 34 | 35 | def fit(self, X, y=None): 36 | return self 37 | 38 | def transform(self, X): 39 | X = X.copy() 40 | tmp = np.zeros((len(X), 41 | self.image_size, 42 | self.image_size, 3), dtype='float32') 43 | 44 | for n in range(0, len(X)): 45 | im = _im_resize(X, n, self.image_size) 46 | tmp[n] = im 47 | 48 | print('Dataset Images shape: {} size: {:,}'.format( 49 | tmp.shape, tmp.size)) 50 | return tmp 51 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/train_pipeline.py: -------------------------------------------------------------------------------- 1 | from sklearn.externals import joblib 2 | 3 | from neural_network_model import pipeline as pipe 4 | from neural_network_model.config import config 5 | from neural_network_model.processing import data_management as dm 6 | from 
neural_network_model.processing import preprocessors as pp 7 | 8 | 9 | def run_training(save_result: bool = True): 10 | """Train a Convolutional Neural Network.""" 11 | 12 | images_df = dm.load_image_paths(config.DATA_FOLDER) 13 | X_train, X_test, y_train, y_test = dm.get_train_test_target(images_df) 14 | 15 | enc = pp.TargetEncoder() 16 | enc.fit(y_train) 17 | y_train = enc.transform(y_train) 18 | 19 | pipe.pipe.fit(X_train, y_train) 20 | 21 | if save_result: 22 | joblib.dump(enc, config.ENCODER_PATH) 23 | dm.save_pipeline_keras(pipe.pipe) 24 | 25 | 26 | if __name__ == '__main__': 27 | run_training(save_result=True) 28 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/trained_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/neural_network_model/neural_network_model/trained_models/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/requirements.txt: -------------------------------------------------------------------------------- 1 | # production requirements 2 | pandas==0.23.4 3 | numpy==1.13.3 4 | scikit-learn==0.19.0 5 | Keras==2.1.3 6 | opencv-python==4.0.0.21 7 | h5py==2.9.0 8 | Theano==0.9.0 9 | 10 | # packaging 11 | setuptools==40.6.3 12 | wheel==0.32.3 13 | 14 | # testing requirements 15 | pytest==4.0.2 16 | 17 | # fetching datasets 18 | kaggle==1.5.1.1 -------------------------------------------------------------------------------- /packages/neural_network_model/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import io 5 | import os 6 | from pathlib import Path 7 | 8 | from setuptools import find_packages, setup 9 | 10 | 11 | # Package meta-data. 12 | NAME = 'neural_network_model' 13 | DESCRIPTION = 'Train and deploy neural network model.' 14 | URL = 'your github project' 15 | EMAIL = 'your_email@email.com' 16 | AUTHOR = 'Your name' 17 | REQUIRES_PYTHON = '>=3.6.0' 18 | 19 | 20 | # What packages are required for this module to be executed? 21 | def list_reqs(fname='requirements.txt'): 22 | with open(fname) as fd: 23 | return fd.read().splitlines() 24 | 25 | 26 | # The rest you shouldn't have to touch too much :) 27 | # ------------------------------------------------ 28 | # Except, perhaps the License and Trove Classifiers! 29 | # If you do change the License, remember to change the 30 | # Trove Classifier for that! 31 | 32 | here = os.path.abspath(os.path.dirname(__file__)) 33 | 34 | # Import the README and use it as the long-description. 35 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 36 | try: 37 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: 38 | long_description = '\n' + f.read() 39 | except FileNotFoundError: 40 | long_description = DESCRIPTION 41 | 42 | 43 | # Load the package's __version__.py module as a dictionary. 
44 | ROOT_DIR = Path(__file__).resolve().parent 45 | PACKAGE_DIR = ROOT_DIR / NAME 46 | about = {} 47 | with open(PACKAGE_DIR / 'VERSION') as f: 48 | _version = f.read().strip() 49 | about['__version__'] = _version 50 | 51 | 52 | # Where the magic happens: 53 | setup( 54 | name=NAME, 55 | version=about['__version__'], 56 | description=DESCRIPTION, 57 | long_description=long_description, 58 | long_description_content_type='text/markdown', 59 | author=AUTHOR, 60 | author_email=EMAIL, 61 | python_requires=REQUIRES_PYTHON, 62 | url=URL, 63 | packages=find_packages(exclude=('tests',)), 64 | package_data={'neural_network_model': ['VERSION']}, 65 | install_requires=list_reqs(), 66 | extras_require={}, 67 | include_package_data=True, 68 | license='MIT', 69 | classifiers=[ 70 | # Trove classifiers 71 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 72 | 'License :: OSI Approved :: MIT License', 73 | 'Programming Language :: Python', 74 | 'Programming Language :: Python :: 3', 75 | 'Programming Language :: Python :: 3.6', 76 | 'Programming Language :: Python :: Implementation :: CPython', 77 | 'Programming Language :: Python :: Implementation :: PyPy' 78 | ], 79 | ) 80 | -------------------------------------------------------------------------------- /packages/neural_network_model/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/neural_network_model/tests/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | 4 | from neural_network_model.config import config 5 | 6 | 7 | @pytest.fixture 8 | def black_grass_dir(): 9 | test_data_dir = os.path.join(config.DATASET_DIR, 'test_data') 10 | black_grass_dir = os.path.join(test_data_dir, 'Black-grass') 11 | 12 | return black_grass_dir 13 | 14 | 15 | @pytest.fixture 16 | def charlock_dir(): 17 | test_data_dir = os.path.join(config.DATASET_DIR, 'test_data') 18 | charlock_dir = os.path.join(test_data_dir, 'Charlock') 19 | 20 | return charlock_dir 21 | -------------------------------------------------------------------------------- /packages/neural_network_model/tests/test_predict.py: -------------------------------------------------------------------------------- 1 | from neural_network_model import __version__ as _version 2 | from neural_network_model.predict import (make_single_prediction) 3 | 4 | 5 | def test_make_prediction_on_sample(charlock_dir): 6 | # Given 7 | filename = '1.png' 8 | expected_classification = 'Charlock' 9 | 10 | # When 11 | results = make_single_prediction(image_directory=charlock_dir, 12 | image_name=filename) 13 | 14 | # Then 15 | assert results['predictions'] is not None 16 | assert results['readable_predictions'][0] == expected_classification 17 | assert results['version'] == _version 18 | -------------------------------------------------------------------------------- /packages/regression_model/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | include *.cfg 4 | include *.pkl 5 | recursive-include ./regression_model/* 6 | 7 | include regression_model/datasets/train.csv 8 | include regression_model/datasets/test.csv 9 | include 
regression_model/trained_models/*.pkl 10 | include regression_model/VERSION 11 | 12 | include ./requirements.txt 13 | exclude *.log 14 | 15 | recursive-exclude * __pycache__ 16 | recursive-exclude * *.py[co] -------------------------------------------------------------------------------- /packages/regression_model/regression_model/VERSION: -------------------------------------------------------------------------------- 1 | 2.0.20 -------------------------------------------------------------------------------- /packages/regression_model/regression_model/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from regression_model.config import config 4 | from regression_model.config import logging_config 5 | 6 | 7 | VERSION_PATH = config.PACKAGE_ROOT / 'VERSION' 8 | 9 | # Configure logger for use in package 10 | logger = logging.getLogger(__name__) 11 | logger.setLevel(logging.DEBUG) 12 | logger.addHandler(logging_config.get_console_handler()) 13 | logger.propagate = False 14 | 15 | 16 | with open(VERSION_PATH, 'r') as version_file: 17 | __version__ = version_file.read().strip() 18 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/regression_model/regression_model/config/__init__.py -------------------------------------------------------------------------------- /packages/regression_model/regression_model/config/config.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import regression_model 4 | 5 | import pandas as pd 6 | 7 | 8 | pd.options.display.max_rows = 10 9 | pd.options.display.max_columns = 10 10 | 11 | 12 | PACKAGE_ROOT = pathlib.Path(regression_model.__file__).resolve().parent 13 | TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models" 14 | DATASET_DIR = PACKAGE_ROOT / "datasets" 15 | 16 | # data 17 | TESTING_DATA_FILE = "test.csv" 18 | TRAINING_DATA_FILE = "train.csv" 19 | TARGET = "SalePrice" 20 | 21 | 22 | # variables 23 | FEATURES = [ 24 | "MSSubClass", 25 | "MSZoning", 26 | "Neighborhood", 27 | "OverallQual", 28 | "OverallCond", 29 | "YearRemodAdd", 30 | "RoofStyle", 31 | "MasVnrType", 32 | "BsmtQual", 33 | "BsmtExposure", 34 | "HeatingQC", 35 | "CentralAir", 36 | "1stFlrSF", 37 | "GrLivArea", 38 | "BsmtFullBath", 39 | "KitchenQual", 40 | "Fireplaces", 41 | "FireplaceQu", 42 | "GarageType", 43 | "GarageFinish", 44 | "GarageCars", 45 | "PavedDrive", 46 | "LotFrontage", 47 | # this one is only to calculate temporal variable: 48 | "YrSold", 49 | ] 50 | 51 | # this variable is to calculate the temporal variable, 52 | # can be dropped afterwards 53 | DROP_FEATURES = "YrSold" 54 | 55 | # numerical variables with NA in train set 56 | NUMERICAL_VARS_WITH_NA = ["LotFrontage"] 57 | 58 | # categorical variables with NA in train set 59 | CATEGORICAL_VARS_WITH_NA = [ 60 | "MasVnrType", 61 | "BsmtQual", 62 | "BsmtExposure", 63 | "FireplaceQu", 64 | "GarageType", 65 | "GarageFinish", 66 | ] 67 | 68 | TEMPORAL_VARS = "YearRemodAdd" 69 | 70 | # variables to log transform 71 | NUMERICALS_LOG_VARS = ["LotFrontage", "1stFlrSF", "GrLivArea"] 72 | 73 | # categorical variables to encode 74 | CATEGORICAL_VARS = [ 75 | "MSZoning", 76 | "Neighborhood", 77 | "RoofStyle", 78 | 
"MasVnrType", 79 | "BsmtQual", 80 | "BsmtExposure", 81 | "HeatingQC", 82 | "CentralAir", 83 | "KitchenQual", 84 | "FireplaceQu", 85 | "GarageType", 86 | "GarageFinish", 87 | "PavedDrive", 88 | ] 89 | 90 | NUMERICAL_NA_NOT_ALLOWED = [ 91 | feature 92 | for feature in FEATURES 93 | if feature not in CATEGORICAL_VARS + NUMERICAL_VARS_WITH_NA 94 | ] 95 | 96 | CATEGORICAL_NA_NOT_ALLOWED = [ 97 | feature for feature in CATEGORICAL_VARS if feature not in CATEGORICAL_VARS_WITH_NA 98 | ] 99 | 100 | 101 | PIPELINE_NAME = "lasso_regression" 102 | PIPELINE_SAVE_FILE = f"{PIPELINE_NAME}_output_v" 103 | 104 | # used for differential testing 105 | ACCEPTABLE_MODEL_DIFFERENCE = 0.05 106 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/config/logging_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | 5 | # Multiple calls to logging.getLogger('someLogger') return a 6 | # reference to the same logger object. This is true not only 7 | # within the same module, but also across modules as long as 8 | # it is in the same Python interpreter process. 9 | 10 | FORMATTER = logging.Formatter( 11 | "%(asctime)s — %(name)s — %(levelname)s —" "%(funcName)s:%(lineno)d — %(message)s" 12 | ) 13 | 14 | 15 | def get_console_handler(): 16 | console_handler = logging.StreamHandler(sys.stdout) 17 | console_handler.setFormatter(FORMATTER) 18 | return console_handler 19 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/regression_model/regression_model/datasets/__init__.py -------------------------------------------------------------------------------- /packages/regression_model/regression_model/pipeline.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import Lasso 2 | from sklearn.pipeline import Pipeline 3 | from sklearn.preprocessing import MinMaxScaler 4 | 5 | from regression_model.processing import preprocessors as pp 6 | from regression_model.processing import features 7 | from regression_model.config import config 8 | 9 | import logging 10 | 11 | 12 | _logger = logging.getLogger(__name__) 13 | 14 | 15 | price_pipe = Pipeline( 16 | [ 17 | ( 18 | "categorical_imputer", 19 | pp.CategoricalImputer(variables=config.CATEGORICAL_VARS_WITH_NA), 20 | ), 21 | ( 22 | "numerical_inputer", 23 | pp.NumericalImputer(variables=config.NUMERICAL_VARS_WITH_NA), 24 | ), 25 | ( 26 | "temporal_variable", 27 | pp.TemporalVariableEstimator( 28 | variables=config.TEMPORAL_VARS, reference_variable=config.DROP_FEATURES 29 | ), 30 | ), 31 | ( 32 | "rare_label_encoder", 33 | pp.RareLabelCategoricalEncoder(tol=0.01, variables=config.CATEGORICAL_VARS), 34 | ), 35 | ( 36 | "categorical_encoder", 37 | pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS), 38 | ), 39 | ( 40 | "log_transformer", 41 | features.LogTransformer(variables=config.NUMERICALS_LOG_VARS), 42 | ), 43 | ( 44 | "drop_features", 45 | pp.DropUnecessaryFeatures(variables_to_drop=config.DROP_FEATURES), 46 | ), 47 | ("scaler", MinMaxScaler()), 48 | ("Linear_model", Lasso(alpha=0.005, random_state=0)), 49 | ] 50 | ) 51 | 
-------------------------------------------------------------------------------- /packages/regression_model/regression_model/predict.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from regression_model.processing.data_management import load_pipeline 5 | from regression_model.config import config 6 | from regression_model.processing.validation import validate_inputs 7 | from regression_model import __version__ as _version 8 | 9 | import logging 10 | import typing as t 11 | 12 | 13 | _logger = logging.getLogger(__name__) 14 | 15 | pipeline_file_name = f"{config.PIPELINE_SAVE_FILE}{_version}.pkl" 16 | _price_pipe = load_pipeline(file_name=pipeline_file_name) 17 | 18 | 19 | def make_prediction(*, input_data: t.Union[pd.DataFrame, dict], 20 | ) -> dict: 21 | """Make a prediction using a saved model pipeline. 22 | 23 | Args: 24 | input_data: Array of model prediction inputs. 25 | 26 | Returns: 27 | Predictions for each input row, as well as the model version. 28 | """ 29 | 30 | data = pd.DataFrame(input_data) 31 | validated_data = validate_inputs(input_data=data) 32 | 33 | prediction = _price_pipe.predict(validated_data[config.FEATURES]) 34 | 35 | output = np.exp(prediction) 36 | 37 | results = {"predictions": output, "version": _version} 38 | 39 | _logger.info( 40 | f"Making predictions with model version: {_version} " 41 | f"Inputs: {validated_data} " 42 | f"Predictions: {results}" 43 | ) 44 | 45 | return results 46 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/regression_model/regression_model/processing/__init__.py -------------------------------------------------------------------------------- /packages/regression_model/regression_model/processing/data_management.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import joblib 3 | from sklearn.pipeline import Pipeline 4 | 5 | from regression_model.config import config 6 | from regression_model import __version__ as _version 7 | 8 | import logging 9 | import typing as t 10 | 11 | 12 | _logger = logging.getLogger(__name__) 13 | 14 | 15 | def load_dataset(*, file_name: str) -> pd.DataFrame: 16 | _data = pd.read_csv(f"{config.DATASET_DIR}/{file_name}") 17 | return _data 18 | 19 | 20 | def save_pipeline(*, pipeline_to_persist) -> None: 21 | """Persist the pipeline. 22 | Saves the versioned model, and overwrites any previous 23 | saved models. This ensures that when the package is 24 | published, there is only one trained model that can be 25 | called, and we know exactly how it was built. 
26 | """ 27 | 28 | # Prepare versioned save file name 29 | save_file_name = f"{config.PIPELINE_SAVE_FILE}{_version}.pkl" 30 | save_path = config.TRAINED_MODEL_DIR / save_file_name 31 | 32 | remove_old_pipelines(files_to_keep=[save_file_name]) 33 | joblib.dump(pipeline_to_persist, save_path) 34 | _logger.info(f"saved pipeline: {save_file_name}") 35 | 36 | 37 | def load_pipeline(*, file_name: str) -> Pipeline: 38 | """Load a persisted pipeline.""" 39 | 40 | file_path = config.TRAINED_MODEL_DIR / file_name 41 | trained_model = joblib.load(filename=file_path) 42 | return trained_model 43 | 44 | 45 | def remove_old_pipelines(*, files_to_keep: t.List[str]) -> None: 46 | """ 47 | Remove old model pipelines. 48 | 49 | This is to ensure there is a simple one-to-one 50 | mapping between the package version and the model 51 | version to be imported and used by other applications. 52 | However, we do also include the immediate previous 53 | pipeline version for differential testing purposes. 54 | """ 55 | do_not_delete = files_to_keep + ['__init__.py'] 56 | for model_file in config.TRAINED_MODEL_DIR.iterdir(): 57 | if model_file.name not in do_not_delete: 58 | model_file.unlink() 59 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/processing/errors.py: -------------------------------------------------------------------------------- 1 | class BaseError(Exception): 2 | """Base package error.""" 3 | 4 | 5 | class InvalidModelInputError(BaseError): 6 | """Model input contains an error.""" 7 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/processing/features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.base import BaseEstimator, TransformerMixin 3 | 4 | from regression_model.processing.errors import InvalidModelInputError 5 | 6 | 7 | class LogTransformer(BaseEstimator, TransformerMixin): 8 | """Logarithm transformer.""" 9 | 10 | def __init__(self, variables=None): 11 | if not isinstance(variables, list): 12 | self.variables = [variables] 13 | else: 14 | self.variables = variables 15 | 16 | def fit(self, X, y=None): 17 | # to accomodate the pipeline 18 | return self 19 | 20 | def transform(self, X): 21 | X = X.copy() 22 | 23 | # check that the values are non-negative for log transform 24 | if not (X[self.variables] > 0).all().all(): 25 | vars_ = self.variables[(X[self.variables] <= 0).any()] 26 | raise InvalidModelInputError( 27 | f"Variables contain zero or negative values, " 28 | f"can't apply log for vars: {vars_}" 29 | ) 30 | 31 | for feature in self.variables: 32 | X[feature] = np.log(X[feature]) 33 | 34 | return X 35 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/processing/validation.py: -------------------------------------------------------------------------------- 1 | from regression_model.config import config 2 | 3 | import pandas as pd 4 | 5 | 6 | def validate_inputs(input_data: pd.DataFrame) -> pd.DataFrame: 7 | """Check model inputs for unprocessable values.""" 8 | 9 | validated_data = input_data.copy() 10 | 11 | # check for numerical variables with NA not seen during training 12 | if input_data[config.NUMERICAL_NA_NOT_ALLOWED].isnull().any().any(): 13 | validated_data = validated_data.dropna( 14 | axis=0, subset=config.NUMERICAL_NA_NOT_ALLOWED 15 | ) 16 | 17 | # check for 
categorical variables with NA not seen during training 18 | if input_data[config.CATEGORICAL_NA_NOT_ALLOWED].isnull().any().any(): 19 | validated_data = validated_data.dropna( 20 | axis=0, subset=config.CATEGORICAL_NA_NOT_ALLOWED 21 | ) 22 | 23 | # check for values <= 0 for the log transformed variables 24 | if (input_data[config.NUMERICALS_LOG_VARS] <= 0).any().any(): 25 | vars_with_neg_values = config.NUMERICALS_LOG_VARS[ 26 | (input_data[config.NUMERICALS_LOG_VARS] <= 0).any() 27 | ] 28 | validated_data = validated_data[validated_data[vars_with_neg_values] > 0] 29 | 30 | return validated_data 31 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/train_pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import train_test_split 3 | 4 | from regression_model import pipeline 5 | from regression_model.processing.data_management import load_dataset, save_pipeline 6 | from regression_model.config import config 7 | from regression_model import __version__ as _version 8 | 9 | import logging 10 | 11 | 12 | _logger = logging.getLogger(__name__) 13 | 14 | 15 | def run_training() -> None: 16 | """Train the model.""" 17 | 18 | # read training data 19 | data = load_dataset(file_name=config.TRAINING_DATA_FILE) 20 | 21 | # divide train and test 22 | X_train, X_test, y_train, y_test = train_test_split( 23 | data[config.FEATURES], data[config.TARGET], test_size=0.1, random_state=0 24 | ) # we are setting the seed here 25 | 26 | # transform the target 27 | y_train = np.log(y_train) 28 | 29 | pipeline.price_pipe.fit(X_train[config.FEATURES], y_train) 30 | 31 | _logger.info(f"saving model version: {_version}") 32 | save_pipeline(pipeline_to_persist=pipeline.price_pipe) 33 | 34 | 35 | if __name__ == "__main__": 36 | run_training() 37 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/trained_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/regression_model/regression_model/trained_models/__init__.py -------------------------------------------------------------------------------- /packages/regression_model/requirements.txt: -------------------------------------------------------------------------------- 1 | # We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release) 2 | # to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small 3 | # updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes. 
4 | 5 | # Model Building Requirements 6 | numpy>=1.18.1,<1.19.0 7 | pandas>=0.25.3,<0.26.0 8 | scikit-learn>=0.22.1,<0.23.0 9 | joblib>=0.14.1,<0.15.0 10 | 11 | # testing requirements 12 | pytest>=5.3.2,<6.0.0 13 | 14 | # packaging 15 | setuptools>=41.4.0,<42.0.0 16 | wheel>=0.33.6,<0.34.0 17 | 18 | # fetching datasets 19 | kaggle>=1.5.6,<1.6.0 20 | -------------------------------------------------------------------------------- /packages/regression_model/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import io 5 | import os 6 | from pathlib import Path 7 | 8 | from setuptools import find_packages, setup 9 | 10 | 11 | # Package meta-data. 12 | NAME = 'regression_model' 13 | DESCRIPTION = 'Regression model for using in the Train In Data online course "Deployment of Machine Learning Models".' 14 | URL = 'https://github.com/trainindata/deploying-machine-learning-models' 15 | EMAIL = 'christopher.samiullah@protonmail.com' 16 | AUTHOR = 'ChristopherGS' 17 | REQUIRES_PYTHON = '>=3.6.0' 18 | 19 | 20 | # Packages that are required for this module to be executed 21 | def list_reqs(fname='requirements.txt'): 22 | with open(fname) as fd: 23 | return fd.read().splitlines() 24 | 25 | 26 | # The rest you shouldn't have to touch too much :) 27 | # ------------------------------------------------ 28 | # Except, perhaps the License and Trove Classifiers! 29 | # If you do change the License, remember to change the 30 | # Trove Classifier for that! 31 | 32 | here = os.path.abspath(os.path.dirname(__file__)) 33 | 34 | # Import the README and use it as the long-description. 35 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 36 | try: 37 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: 38 | long_description = '\n' + f.read() 39 | except FileNotFoundError: 40 | long_description = DESCRIPTION 41 | 42 | 43 | # Load the package's __version__.py module as a dictionary. 
44 | ROOT_DIR = Path(__file__).resolve().parent 45 | PACKAGE_DIR = ROOT_DIR / 'regression_model' 46 | about = {} 47 | with open(PACKAGE_DIR / 'VERSION') as f: 48 | _version = f.read().strip() 49 | about['__version__'] = _version 50 | 51 | 52 | # Where the magic happens: 53 | setup( 54 | name=NAME, 55 | version=about['__version__'], 56 | description=DESCRIPTION, 57 | long_description=long_description, 58 | long_description_content_type='text/markdown', 59 | author=AUTHOR, 60 | author_email=EMAIL, 61 | python_requires=REQUIRES_PYTHON, 62 | url=URL, 63 | packages=find_packages(exclude=('tests',)), 64 | package_data={'regression_model': ['VERSION']}, 65 | install_requires=list_reqs(), 66 | extras_require={}, 67 | include_package_data=True, 68 | license='BSD 3', 69 | classifiers=[ 70 | # Trove classifiers 71 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 72 | 'License :: OSI Approved :: MIT License', 73 | 'Programming Language :: Python', 74 | 'Programming Language :: Python :: 3', 75 | 'Programming Language :: Python :: 3.6', 76 | 'Programming Language :: Python :: 3.7', 77 | 'Programming Language :: Python :: 3.8', 78 | 'Programming Language :: Python :: Implementation :: CPython', 79 | 'Programming Language :: Python :: Implementation :: PyPy' 80 | ], 81 | ) 82 | -------------------------------------------------------------------------------- /packages/regression_model/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/regression_model/tests/__init__.py -------------------------------------------------------------------------------- /packages/regression_model/tests/test_predict.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from regression_model.predict import make_prediction 4 | from regression_model.processing.data_management import load_dataset 5 | 6 | 7 | def test_make_single_prediction(): 8 | # Given 9 | test_data = load_dataset(file_name='test.csv') 10 | single_test_input = test_data[0:1] 11 | 12 | # When 13 | subject = make_prediction(input_data=single_test_input) 14 | 15 | # Then 16 | assert subject is not None 17 | assert isinstance(subject.get('predictions')[0], float) 18 | assert math.ceil(subject.get('predictions')[0]) == 112476 19 | 20 | 21 | def test_make_multiple_predictions(): 22 | # Given 23 | test_data = load_dataset(file_name='test.csv') 24 | original_data_length = len(test_data) 25 | multiple_test_input = test_data 26 | 27 | # When 28 | subject = make_prediction(input_data=multiple_test_input) 29 | 30 | # Then 31 | assert subject is not None 32 | assert len(subject.get('predictions')) == 1451 33 | 34 | # We expect some rows to be filtered out 35 | assert len(subject.get('predictions')) != original_data_length 36 | -------------------------------------------------------------------------------- /packages/regression_model/tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36, py37, py38 3 | 4 | 5 | [testenv] 6 | install_command = pip install --pre {opts} {packages} 7 | whitelist_externals = unzip 8 | deps = 9 | -rrequirements.txt 10 | 11 | passenv = 12 | KAGGLE_USERNAME 13 | KAGGLE_KEY 14 | 15 | setenv = 16 | PYTHONPATH=. 
17 | 18 | commands = 19 | kaggle competitions download -c house-prices-advanced-regression-techniques -p regression_model/datasets/ 20 | unzip -o regression_model/datasets/house-prices-advanced-regression-techniques.zip -d regression_model/datasets 21 | python regression_model/train_pipeline.py 22 | pytest \ 23 | -s \ 24 | -v \ 25 | {posargs:tests} 26 | -------------------------------------------------------------------------------- /scripts/fetch_kaggle_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | kaggle competitions download -c house-prices-advanced-regression-techniques -p packages/regression_model/regression_model/datasets/ -------------------------------------------------------------------------------- /scripts/fetch_kaggle_large_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | TRAINING_DATA_URL="vbookshelf/v2-plant-seedlings-dataset" 4 | NOW=$(date) 5 | 6 | kaggle datasets download -d $TRAINING_DATA_URL -p packages/neural_network_model/neural_network_model/datasets/ && \ 7 | unzip packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset.zip -d packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset && \ 8 | echo $TRAINING_DATA_URL 'retrieved on:' $NOW > packages/neural_network_model/neural_network_model/datasets/training_data_reference.txt && \ 9 | mkdir -p "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherds Purse" && \ 10 | mv -v "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherd’s Purse/"* "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherds Purse" 11 | rm -rf "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherd’s Purse" -------------------------------------------------------------------------------- /scripts/input_test.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "Id": 1461, 3 | "MSSubClass": 20, 4 | "MSZoning": "RH", 5 | "LotFrontage": 80.0, 6 | "LotArea": 11622, 7 | "Street": "Pave", 8 | "Alley": null, 9 | "LotShape": "Reg", 10 | "LandContour": "Lvl", 11 | "Utilities": "AllPub", 12 | "LotConfig": "Inside", 13 | "LandSlope": "Gtl", 14 | "Neighborhood": "NAmes", 15 | "Condition1": "Feedr", 16 | "Condition2": "Norm", 17 | "BldgType": "1Fam", 18 | "HouseStyle": "1Story", 19 | "OverallQual": 5, 20 | "OverallCond": 6, 21 | "YearBuilt": 1961, 22 | "YearRemodAdd": 1961, 23 | "RoofStyle": "Gable", 24 | "RoofMatl": "CompShg", 25 | "Exterior1st": "VinylSd", 26 | "Exterior2nd": "VinylSd", 27 | "MasVnrType": "None", 28 | "MasVnrArea": 0.0, 29 | "ExterQual": "TA", 30 | "ExterCond": "TA", 31 | "Foundation": "CBlock", 32 | "BsmtQual": "TA", 33 | "BsmtCond": "TA", 34 | "BsmtExposure": "No", 35 | "BsmtFinType1": "Rec", 36 | "BsmtFinSF1": 468.0, 37 | "BsmtFinType2": "LwQ", 38 | "BsmtFinSF2": 144.0, 39 | "BsmtUnfSF": 270.0, 40 | "TotalBsmtSF": 882.0, 41 | "Heating": "GasA", 42 | "HeatingQC": "TA", 43 | "CentralAir": "Y", 44 | "Electrical": "SBrkr", 45 | "1stFlrSF": 896, 46 | "2ndFlrSF": 0, 47 | "LowQualFinSF": 0, 48 | "GrLivArea": 896, 49 | "BsmtFullBath": 0.0, 50 | "BsmtHalfBath": 0.0, 51 | "FullBath": 1, 52 | "HalfBath": 0, 53 | "BedroomAbvGr": 2, 54 | "KitchenAbvGr": 1, 55 | "KitchenQual": "TA", 56 | "TotRmsAbvGrd": 5, 57 | "Functional": "Typ", 
58 | "Fireplaces": 0, 59 | "FireplaceQu": null, 60 | "GarageType": "Attchd", 61 | "GarageYrBlt": 1961.0, 62 | "GarageFinish": "Unf", 63 | "GarageCars": 1.0, 64 | "GarageArea": 730.0, 65 | "GarageQual": "TA", 66 | "GarageCond": "TA", 67 | "PavedDrive": "Y", 68 | "WoodDeckSF": 140, 69 | "OpenPorchSF": 0, 70 | "EnclosedPorch": 0, 71 | "3SsnPorch": 0, 72 | "ScreenPorch": 120, 73 | "PoolArea": 0, 74 | "PoolQC": null, 75 | "Fence": "MnPrv", 76 | "MiscFeature": null, 77 | "MiscVal": 0, 78 | "MoSold": 6, 79 | "YrSold": 2010, 80 | "SaleType": "WD", 81 | "SaleCondition": "Normal" 82 | }] -------------------------------------------------------------------------------- /scripts/publish_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Building packages and uploading them to a Gemfury repository 4 | 5 | GEMFURY_URL=$GEMFURY_PUSH_URL 6 | 7 | set -e 8 | 9 | DIRS="$@" 10 | BASE_DIR=$(pwd) 11 | SETUP="setup.py" 12 | 13 | warn() { 14 | echo "$@" 1>&2 15 | } 16 | 17 | die() { 18 | warn "$@" 19 | exit 1 20 | } 21 | 22 | build() { 23 | DIR="${1/%\//}" 24 | echo "Checking directory $DIR" 25 | cd "$BASE_DIR/$DIR" 26 | [ ! -e $SETUP ] && warn "No $SETUP file, skipping" && return 27 | PACKAGE_NAME=$(python $SETUP --fullname) 28 | echo "Package $PACKAGE_NAME" 29 | python "$SETUP" sdist bdist_wheel || die "Building package $PACKAGE_NAME failed" 30 | for X in $(ls dist) 31 | do 32 | curl -F package=@"dist/$X" "$GEMFURY_URL" || die "Uploading package $PACKAGE_NAME failed on file dist/$X" 33 | done 34 | } 35 | 36 | if [ -n "$DIRS" ]; then 37 | for dir in $DIRS; do 38 | build $dir 39 | done 40 | else 41 | ls -d */ | while read dir; do 42 | build $dir 43 | done 44 | fi -------------------------------------------------------------------------------- /section-04-research-and-development/preprocessors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | 6 | 7 | 8 | class TemporalVariableTransformer(BaseEstimator, TransformerMixin): 9 | # Temporal elapsed time transformer 10 | 11 | def __init__(self, variables, reference_variable): 12 | 13 | if not isinstance(variables, list): 14 | raise ValueError('variables should be a list') 15 | 16 | self.variables = variables 17 | self.reference_variable = reference_variable 18 | 19 | def fit(self, X, y=None): 20 | # we need this step to fit the sklearn pipeline 21 | return self 22 | 23 | def transform(self, X): 24 | 25 | # so that we do not over-write the original dataframe 26 | X = X.copy() 27 | 28 | for feature in self.variables: 29 | X[feature] = X[self.reference_variable] - X[feature] 30 | 31 | return X 32 | 33 | 34 | 35 | # categorical missing value imputer 36 | class Mapper(BaseEstimator, TransformerMixin): 37 | 38 | def __init__(self, variables, mappings): 39 | 40 | if not isinstance(variables, list): 41 | raise ValueError('variables should be a list') 42 | 43 | self.variables = variables 44 | self.mappings = mappings 45 | 46 | def fit(self, X, y=None): 47 | # we need the fit statement to accomodate the sklearn pipeline 48 | return self 49 | 50 | def transform(self, X): 51 | X = X.copy() 52 | for feature in self.variables: 53 | X[feature] = X[feature].map(self.mappings) 54 | 55 | return X -------------------------------------------------------------------------------- /section-04-research-and-development/preprocessors_bonus.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | 5 | 6 | class MeanImputer(BaseEstimator, TransformerMixin): 7 | """Numerical missing value imputer.""" 8 | 9 | def __init__(self, variables): 10 | if not isinstance(variables, list): 11 | raise ValueError('variables should be a list') 12 | self.variables = variables 13 | 14 | def fit(self, X, y=None): 15 | # persist mean values in a dictionary 16 | self.imputer_dict_ = X[self.variables].mean().to_dict() 17 | return self 18 | 19 | def transform(self, X): 20 | X = X.copy() 21 | for feature in self.variables: 22 | X[feature].fillna(self.imputer_dict_[feature], 23 | inplace=True) 24 | return X 25 | 26 | 27 | 28 | class RareLabelCategoricalEncoder(BaseEstimator, TransformerMixin): 29 | """Groups infrequent categories into a single string""" 30 | 31 | def __init__(self, variables, tol=0.05): 32 | 33 | if not isinstance(variables, list): 34 | raise ValueError('variables should be a list') 35 | 36 | self.tol = tol 37 | self.variables = variables 38 | 39 | def fit(self, X, y=None): 40 | # persist frequent labels in dictionary 41 | self.encoder_dict_ = {} 42 | 43 | for var in self.variables: 44 | # the encoder will learn the most frequent categories 45 | t = pd.Series(X[var].value_counts(normalize=True)) 46 | # frequent labels: 47 | self.encoder_dict_[var] = list(t[t >= self.tol].index) 48 | 49 | return self 50 | 51 | def transform(self, X): 52 | X = X.copy() 53 | for feature in self.variables: 54 | X[feature] = np.where( 55 | X[feature].isin(self.encoder_dict_[feature]), 56 | X[feature], "Rare") 57 | 58 | return X 59 | 60 | 61 | class CategoricalEncoder(BaseEstimator, TransformerMixin): 62 | """String to numbers categorical encoder.""" 63 | 64 | def __init__(self, variables): 65 | 66 | if not isinstance(variables, list): 67 | raise ValueError('variables should be a list') 68 | 69 | self.variables = variables 70 | 71 | def fit(self, X, y): 72 | temp = pd.concat([X, y], axis=1) 73 | temp.columns = list(X.columns) + ["target"] 74 | 75 | # persist transforming dictionary 76 | self.encoder_dict_ = {} 77 | 78 | for var in self.variables: 79 | t = temp.groupby([var])["target"].mean().sort_values(ascending=True).index 80 | self.encoder_dict_[var] = {k: i for i, k in enumerate(t, 0)} 81 | 82 | return self 83 | 84 | def transform(self, X): 85 | # encode labels 86 | X = X.copy() 87 | for feature in self.variables: 88 | X[feature] = X[feature].map(self.encoder_dict_[feature]) 89 | 90 | return X -------------------------------------------------------------------------------- /section-04-research-and-development/requirements.txt: -------------------------------------------------------------------------------- 1 | feature-engine==1.0.2 2 | joblib==1.0.1 3 | matplotlib==3.3.4 4 | numpy==1.20.1 5 | pandas==1.2.2 6 | scikit-learn==0.24.1 7 | scipy==1.6.0 8 | seaborn==0.11.1 9 | statsmodels==0.12.2 -------------------------------------------------------------------------------- /section-05-production-model-package/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | include *.pkl 4 | recursive-include ./regression_model/* 5 | 6 | include regression_model/datasets/train.csv 7 | include regression_model/datasets/test.csv 8 | include regression_model/trained_models/*.pkl 9 | include regression_model/VERSION 10 | include regression_model/config.yml 11 | 
12 | include ./requirements/requirements.txt 13 | include ./requirements/test_requirements.txt 14 | exclude *.log 15 | exclude *.cfg 16 | 17 | recursive-exclude * __pycache__ 18 | recursive-exclude * *.py[co] -------------------------------------------------------------------------------- /section-05-production-model-package/mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | # warn_unreachable = True 3 | warn_unused_ignores = True 4 | follow_imports = skip 5 | show_error_context = True 6 | warn_incomplete_stub = True 7 | ignore_missing_imports = True 8 | check_untyped_defs = True 9 | cache_dir = /dev/null 10 | # Cannot enable this one as we still allow defining functions without any types. 11 | # disallow_untyped_defs = True 12 | warn_redundant_casts = True 13 | warn_unused_configs = True 14 | strict_optional = True -------------------------------------------------------------------------------- /section-05-production-model-package/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [tool.pytest.ini_options] 9 | minversion = "2.0" 10 | addopts = "-rfEX -p pytester --strict-markers" 11 | python_files = ["test_*.py", "*_test.py"] 12 | python_classes = ["Test", "Acceptance"] 13 | python_functions = ["test"] 14 | # NOTE: "doc" is not included here, but gets tested explicitly via "doctesting". 15 | testpaths = ["tests"] 16 | xfail_strict = true 17 | filterwarnings = [ 18 | "error", 19 | "default:Using or importing the ABCs:DeprecationWarning:unittest2.*", 20 | # produced by older pyparsing<=2.2.0. 21 | "default:Using or importing the ABCs:DeprecationWarning:pyparsing.*", 22 | "default:the imp module is deprecated in favour of importlib:DeprecationWarning:nose.*", 23 | # distutils is deprecated in 3.10, scheduled for removal in 3.12 24 | "ignore:The distutils package is deprecated:DeprecationWarning", 25 | # produced by python3.6/site.py itself (3.6.7 on Travis, could not trigger it with 3.6.8)." 26 | "ignore:.*U.*mode is deprecated:DeprecationWarning:(?!(pytest|_pytest))", 27 | # produced by pytest-xdist 28 | "ignore:.*type argument to addoption.*:DeprecationWarning", 29 | # produced on execnet (pytest-xdist) 30 | "ignore:.*inspect.getargspec.*deprecated, use inspect.signature.*:DeprecationWarning", 31 | # pytest's own futurewarnings 32 | "ignore::pytest.PytestExperimentalApiWarning", 33 | # Do not cause SyntaxError for invalid escape sequences in py37. 34 | # Those are caught/handled by pyupgrade, and not easy to filter with the 35 | # module being the filename (with .py removed). 
36 | "default:invalid escape sequence:DeprecationWarning", 37 | # ignore use of unregistered marks, because we use many to test the implementation 38 | "ignore::_pytest.warning_types.PytestUnknownMarkWarning", 39 | ] 40 | 41 | [tool.black] 42 | target-version = ['py311'] 43 | 44 | [tool.isort] 45 | profile = "black" 46 | line_length = 100 47 | lines_between_sections = 1 48 | skip = "migrations" 49 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.1 2 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from regression_model.config.core import PACKAGE_ROOT, config 4 | 5 | # It is strongly advised that you do not add any handlers other than 6 | # NullHandler to your library’s loggers. This is because the configuration 7 | # of handlers is the prerogative of the application developer who uses your 8 | # library. The application developer knows their target audience and what 9 | # handlers are most appropriate for their application: if you add handlers 10 | # ‘under the hood’, you might well interfere with their ability to carry out 11 | # unit tests and deliver logs which suit their requirements. 12 | # https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library 13 | logging.getLogger(config.app_config.package_name).addHandler(logging.NullHandler()) 14 | 15 | 16 | with open(PACKAGE_ROOT / "VERSION") as version_file: 17 | __version__ = version_file.read().strip() 18 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/config.yml: -------------------------------------------------------------------------------- 1 | # Package Overview 2 | package_name: regression_model 3 | 4 | # Data Files 5 | training_data_file: train.csv 6 | test_data_file: test.csv 7 | 8 | # Variables 9 | # The variable we are attempting to predict (sale price) 10 | target: SalePrice 11 | 12 | pipeline_name: regression_model 13 | pipeline_save_file: regression_model_output_v 14 | 15 | # Will cause syntax errors since they begin with numbers 16 | variables_to_rename: 17 | 1stFlrSF: FirstFlrSF 18 | 2ndFlrSF: SecondFlrSF 19 | 3SsnPorch: ThreeSsnPortch 20 | 21 | features: 22 | - MSSubClass 23 | - MSZoning 24 | - LotFrontage 25 | - LotShape 26 | - LandContour 27 | - LotConfig 28 | - Neighborhood 29 | - OverallQual 30 | - OverallCond 31 | - YearRemodAdd 32 | - RoofStyle 33 | - Exterior1st 34 | - ExterQual 35 | - Foundation 36 | - BsmtQual 37 | - BsmtExposure 38 | - BsmtFinType1 39 | - HeatingQC 40 | - CentralAir 41 | - FirstFlrSF # renamed 42 | - SecondFlrSF # renamed 43 | - GrLivArea 44 | - BsmtFullBath 45 | - HalfBath 46 | - KitchenQual 47 | - TotRmsAbvGrd 48 | - Functional 49 | - Fireplaces 50 | - FireplaceQu 51 | - GarageFinish 52 | - GarageCars 53 | - GarageArea 54 | - PavedDrive 55 | - WoodDeckSF 56 | - ScreenPorch 57 | - SaleCondition 58 | # this one is only to calculate temporal variable: 59 | - YrSold 60 | 61 | # set train/test split 62 | test_size: 0.1 63 | 64 | # to set the random seed 65 | random_state: 0 66 | 67 | alpha: 0.001 68 | 69 | # categorical variables with NA in train set 70 | categorical_vars_with_na_frequent: 71 | - BsmtQual 
72 | - BsmtExposure 73 | - BsmtFinType1 74 | - GarageFinish 75 | 76 | categorical_vars_with_na_missing: 77 | - FireplaceQu 78 | 79 | numerical_vars_with_na: 80 | - LotFrontage 81 | 82 | temporal_vars: 83 | - YearRemodAdd 84 | 85 | ref_var: YrSold 86 | 87 | 88 | # variables to log transform 89 | numericals_log_vars: 90 | - LotFrontage 91 | - FirstFlrSF 92 | - GrLivArea 93 | 94 | binarize_vars: 95 | - ScreenPorch 96 | 97 | # variables to map 98 | qual_vars: 99 | - ExterQual 100 | - BsmtQual 101 | - HeatingQC 102 | - KitchenQual 103 | - FireplaceQu 104 | 105 | exposure_vars: 106 | - BsmtExposure 107 | 108 | finish_vars: 109 | - BsmtFinType1 110 | 111 | garage_vars: 112 | - GarageFinish 113 | 114 | categorical_vars: 115 | - MSSubClass 116 | - MSZoning 117 | - LotShape 118 | - LandContour 119 | - LotConfig 120 | - Neighborhood 121 | - RoofStyle 122 | - Exterior1st 123 | - Foundation 124 | - CentralAir 125 | - Functional 126 | - PavedDrive 127 | - SaleCondition 128 | 129 | # variable mappings 130 | qual_mappings: 131 | Po: 1 132 | Fa: 2 133 | TA: 3 134 | Gd: 4 135 | Ex: 5 136 | Missing: 0 137 | NA: 0 138 | 139 | exposure_mappings: 140 | No: 1 141 | Mn: 2 142 | Av: 3 143 | Gd: 4 144 | 145 | 146 | finish_mappings: 147 | Missing: 0 148 | NA: 0 149 | Unf: 1 150 | LwQ: 2 151 | Rec: 3 152 | BLQ: 4 153 | ALQ: 5 154 | GLQ: 6 155 | 156 | 157 | garage_mappings: 158 | Missing: 0 159 | NA: 0 160 | Unf: 1 161 | RFn: 2 162 | Fin: 3 163 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/section-05-production-model-package/regression_model/config/__init__.py -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/config/core.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Dict, List, Optional, Sequence 3 | 4 | from pydantic import BaseModel 5 | from strictyaml import YAML, load 6 | 7 | import regression_model 8 | 9 | # Project Directories 10 | PACKAGE_ROOT = Path(regression_model.__file__).resolve().parent 11 | ROOT = PACKAGE_ROOT.parent 12 | CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml" 13 | DATASET_DIR = PACKAGE_ROOT / "datasets" 14 | TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models" 15 | 16 | 17 | class AppConfig(BaseModel): 18 | """ 19 | Application-level config. 20 | """ 21 | 22 | package_name: str 23 | training_data_file: str 24 | test_data_file: str 25 | pipeline_save_file: str 26 | 27 | 28 | class ModelConfig(BaseModel): 29 | """ 30 | All configuration relevant to model 31 | training and feature engineering. 
32 | """ 33 | 34 | target: str 35 | variables_to_rename: Dict 36 | features: List[str] 37 | test_size: float 38 | random_state: int 39 | alpha: float 40 | categorical_vars_with_na_frequent: List[str] 41 | categorical_vars_with_na_missing: List[str] 42 | numerical_vars_with_na: List[str] 43 | temporal_vars: List[str] 44 | ref_var: str 45 | numericals_log_vars: Sequence[str] 46 | binarize_vars: Sequence[str] 47 | qual_vars: List[str] 48 | exposure_vars: List[str] 49 | finish_vars: List[str] 50 | garage_vars: List[str] 51 | categorical_vars: Sequence[str] 52 | qual_mappings: Dict[str, int] 53 | exposure_mappings: Dict[str, int] 54 | garage_mappings: Dict[str, int] 55 | finish_mappings: Dict[str, int] 56 | 57 | 58 | class Config(BaseModel): 59 | """Master config object.""" 60 | 61 | app_config: AppConfig 62 | model_config: ModelConfig 63 | 64 | 65 | def find_config_file() -> Path: 66 | """Locate the configuration file.""" 67 | if CONFIG_FILE_PATH.is_file(): 68 | return CONFIG_FILE_PATH 69 | raise Exception(f"Config not found at {CONFIG_FILE_PATH!r}") 70 | 71 | 72 | def fetch_config_from_yaml(cfg_path: Optional[Path] = None) -> YAML: 73 | """Parse YAML containing the package configuration.""" 74 | 75 | if not cfg_path: 76 | cfg_path = find_config_file() 77 | 78 | if cfg_path: 79 | with open(cfg_path, "r") as conf_file: 80 | parsed_config = load(conf_file.read()) 81 | return parsed_config 82 | raise OSError(f"Did not find config file at path: {cfg_path}") 83 | 84 | 85 | def create_and_validate_config(parsed_config: YAML = None) -> Config: 86 | """Run validation on config values.""" 87 | if parsed_config is None: 88 | parsed_config = fetch_config_from_yaml() 89 | 90 | # specify the data attribute from the strictyaml YAML type. 91 | _config = Config( 92 | app_config=AppConfig(**parsed_config.data), 93 | model_config=ModelConfig(**parsed_config.data), 94 | ) 95 | 96 | return _config 97 | 98 | 99 | config = create_and_validate_config() 100 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/section-05-production-model-package/regression_model/datasets/__init__.py -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/pipeline.py: -------------------------------------------------------------------------------- 1 | from feature_engine.encoding import OrdinalEncoder, RareLabelEncoder 2 | from feature_engine.imputation import AddMissingIndicator, CategoricalImputer, MeanMedianImputer 3 | from feature_engine.selection import DropFeatures 4 | from feature_engine.transformation import LogTransformer 5 | from feature_engine.wrappers import SklearnTransformerWrapper 6 | from sklearn.linear_model import Lasso 7 | from sklearn.pipeline import Pipeline 8 | from sklearn.preprocessing import Binarizer, MinMaxScaler 9 | 10 | from regression_model.config.core import config 11 | from regression_model.processing import features as pp 12 | 13 | price_pipe = Pipeline( 14 | [ 15 | # ===== IMPUTATION ===== 16 | # impute categorical variables with string missing 17 | ( 18 | "missing_imputation", 19 | CategoricalImputer( 20 | imputation_method="missing", 21 | variables=config.model_config.categorical_vars_with_na_missing, 22 | ), 23 | ), 24 
| ( 25 | "frequent_imputation", 26 | CategoricalImputer( 27 | imputation_method="frequent", 28 | variables=config.model_config.categorical_vars_with_na_frequent, 29 | ), 30 | ), 31 | # add missing indicator 32 | ( 33 | "missing_indicator", 34 | AddMissingIndicator(variables=config.model_config.numerical_vars_with_na), 35 | ), 36 | # impute numerical variables with the mean 37 | ( 38 | "mean_imputation", 39 | MeanMedianImputer( 40 | imputation_method="mean", 41 | variables=config.model_config.numerical_vars_with_na, 42 | ), 43 | ), 44 | # == TEMPORAL VARIABLES ==== 45 | ( 46 | "elapsed_time", 47 | pp.TemporalVariableTransformer( 48 | variables=config.model_config.temporal_vars, 49 | reference_variable=config.model_config.ref_var, 50 | ), 51 | ), 52 | ("drop_features", DropFeatures(features_to_drop=[config.model_config.ref_var])), 53 | # ==== VARIABLE TRANSFORMATION ===== 54 | ("log", LogTransformer(variables=config.model_config.numericals_log_vars)), 55 | ( 56 | "binarizer", 57 | SklearnTransformerWrapper( 58 | transformer=Binarizer(threshold=0), 59 | variables=config.model_config.binarize_vars, 60 | ), 61 | ), 62 | # === mappers === 63 | ( 64 | "mapper_qual", 65 | pp.Mapper( 66 | variables=config.model_config.qual_vars, 67 | mappings=config.model_config.qual_mappings, 68 | ), 69 | ), 70 | ( 71 | "mapper_exposure", 72 | pp.Mapper( 73 | variables=config.model_config.exposure_vars, 74 | mappings=config.model_config.exposure_mappings, 75 | ), 76 | ), 77 | ( 78 | "mapper_finish", 79 | pp.Mapper( 80 | variables=config.model_config.finish_vars, 81 | mappings=config.model_config.finish_mappings, 82 | ), 83 | ), 84 | ( 85 | "mapper_garage", 86 | pp.Mapper( 87 | variables=config.model_config.garage_vars, 88 | mappings=config.model_config.garage_mappings, 89 | ), 90 | ), 91 | # == CATEGORICAL ENCODING 92 | ( 93 | "rare_label_encoder", 94 | RareLabelEncoder( 95 | tol=0.01, n_categories=1, variables=config.model_config.categorical_vars 96 | ), 97 | ), 98 | # encode categorical variables using the target mean 99 | ( 100 | "categorical_encoder", 101 | OrdinalEncoder( 102 | encoding_method="ordered", 103 | variables=config.model_config.categorical_vars, 104 | ), 105 | ), 106 | ("scaler", MinMaxScaler()), 107 | ( 108 | "Lasso", 109 | Lasso( 110 | alpha=config.model_config.alpha, 111 | random_state=config.model_config.random_state, 112 | ), 113 | ), 114 | ] 115 | ) 116 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/predict.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from regression_model import __version__ as _version 7 | from regression_model.config.core import config 8 | from regression_model.processing.data_manager import load_pipeline 9 | from regression_model.processing.validation import validate_inputs 10 | 11 | pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl" 12 | _price_pipe = load_pipeline(file_name=pipeline_file_name) 13 | 14 | 15 | def make_prediction( 16 | *, 17 | input_data: t.Union[pd.DataFrame, dict], 18 | ) -> dict: 19 | """Make a prediction using a saved model pipeline.""" 20 | 21 | data = pd.DataFrame(input_data) 22 | validated_data, errors = validate_inputs(input_data=data) 23 | results = {"predictions": None, "version": _version, "errors": errors} 24 | 25 | if not errors: 26 | predictions = _price_pipe.predict( 27 | 
X=validated_data[config.model_config.features] 28 | ) 29 | results = { 30 | "predictions": [np.exp(pred) for pred in predictions], # type: ignore 31 | "version": _version, 32 | "errors": errors, 33 | } 34 | 35 | return results 36 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/section-05-production-model-package/regression_model/processing/__init__.py -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/processing/data_manager.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from pathlib import Path 3 | 4 | import joblib 5 | import pandas as pd 6 | from sklearn.pipeline import Pipeline 7 | 8 | from regression_model import __version__ as _version 9 | from regression_model.config.core import DATASET_DIR, TRAINED_MODEL_DIR, config 10 | 11 | 12 | def load_dataset(*, file_name: str) -> pd.DataFrame: 13 | dataframe = pd.read_csv(Path(f"{DATASET_DIR}/{file_name}")) 14 | dataframe["MSSubClass"] = dataframe["MSSubClass"].astype("O") 15 | 16 | # rename variables beginning with numbers to avoid syntax errors later 17 | transformed = dataframe.rename(columns=config.model_config.variables_to_rename) 18 | return transformed 19 | 20 | 21 | def save_pipeline(*, pipeline_to_persist: Pipeline) -> None: 22 | """Persist the pipeline. 23 | Saves the versioned model, and overwrites any previous 24 | saved models. This ensures that when the package is 25 | published, there is only one trained model that can be 26 | called, and we know exactly how it was built. 27 | """ 28 | 29 | # Prepare versioned save file name 30 | save_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl" 31 | save_path = TRAINED_MODEL_DIR / save_file_name 32 | 33 | remove_old_pipelines(files_to_keep=[save_file_name]) 34 | joblib.dump(pipeline_to_persist, save_path) 35 | 36 | 37 | def load_pipeline(*, file_name: str) -> Pipeline: 38 | """Load a persisted pipeline.""" 39 | 40 | file_path = TRAINED_MODEL_DIR / file_name 41 | trained_model = joblib.load(filename=file_path) 42 | return trained_model 43 | 44 | 45 | def remove_old_pipelines(*, files_to_keep: t.List[str]) -> None: 46 | """ 47 | Remove old model pipelines. 48 | This is to ensure there is a simple one-to-one 49 | mapping between the package version and the model 50 | version to be imported and used by other applications. 
51 | """ 52 | do_not_delete = files_to_keep + ["__init__.py"] 53 | for model_file in TRAINED_MODEL_DIR.iterdir(): 54 | if model_file.name not in do_not_delete: 55 | model_file.unlink() 56 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/processing/features.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import pandas as pd 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | 6 | 7 | class TemporalVariableTransformer(BaseEstimator, TransformerMixin): 8 | """Temporal elapsed time transformer.""" 9 | 10 | def __init__(self, variables: List[str], reference_variable: str): 11 | 12 | if not isinstance(variables, list): 13 | raise ValueError("variables should be a list") 14 | 15 | self.variables = variables 16 | self.reference_variable = reference_variable 17 | 18 | def fit(self, X: pd.DataFrame, y: pd.Series = None): 19 | # we need this step to fit the sklearn pipeline 20 | return self 21 | 22 | def transform(self, X: pd.DataFrame) -> pd.DataFrame: 23 | 24 | # so that we do not overwrite the original dataframe 25 | X = X.copy() 26 | 27 | for feature in self.variables: 28 | X[feature] = X[self.reference_variable] - X[feature] 29 | 30 | return X 31 | 32 | 33 | class Mapper(BaseEstimator, TransformerMixin): 34 | """Categorical variable mapper.""" 35 | 36 | def __init__(self, variables: List[str], mappings: dict): 37 | 38 | if not isinstance(variables, list): 39 | raise ValueError("variables should be a list") 40 | 41 | self.variables = variables 42 | self.mappings = mappings 43 | 44 | def fit(self, X: pd.DataFrame, y: pd.Series = None): 45 | # we need the fit statement to accommodate the sklearn pipeline 46 | return self 47 | 48 | def transform(self, X: pd.DataFrame) -> pd.DataFrame: 49 | X = X.copy() 50 | for feature in self.variables: 51 | X[feature] = X[feature].map(self.mappings) 52 | 53 | return X 54 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/train_pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from config.core import config 3 | from pipeline import price_pipe 4 | from processing.data_manager import load_dataset, save_pipeline 5 | from sklearn.model_selection import train_test_split 6 | 7 | 8 | def run_training() -> None: 9 | """Train the model.""" 10 | 11 | # read training data 12 | data = load_dataset(file_name=config.app_config.training_data_file) 13 | 14 | # divide train and test 15 | X_train, X_test, y_train, y_test = train_test_split( 16 | data[config.model_config.features], # predictors 17 | data[config.model_config.target], 18 | test_size=config.model_config.test_size, 19 | # we are setting the random seed here 20 | # for reproducibility 21 | random_state=config.model_config.random_state, 22 | ) 23 | y_train = np.log(y_train) 24 | 25 | # fit model 26 | price_pipe.fit(X_train, y_train) 27 | 28 | # persist trained model 29 | save_pipeline(pipeline_to_persist=price_pipe) 30 | 31 | 32 | if __name__ == "__main__": 33 | run_training() 34 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/trained_models/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/section-05-production-model-package/regression_model/trained_models/__init__.py -------------------------------------------------------------------------------- /section-05-production-model-package/requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | # We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release) 2 | # to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small 3 | # updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes. 4 | numpy>=1.21.0,<2.0.0 5 | pandas>=1.3.5,<2.0.0 6 | pydantic>=1.8.1,<2.0.0 7 | scikit-learn>=1.1.3,<2.0.0 8 | strictyaml>=1.3.2,<2.0.0 9 | ruamel.yaml>=0.16.12,<1.0.0 10 | feature-engine>=1.0.2,<1.6.0 # breaking change in v1.6.0 11 | joblib>=1.0.1,<2.0.0 -------------------------------------------------------------------------------- /section-05-production-model-package/requirements/test_requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | 3 | # testing requirements 4 | pytest>=7.2.0,<8.0.0 5 | -------------------------------------------------------------------------------- /section-05-production-model-package/requirements/typing_requirements.txt: -------------------------------------------------------------------------------- 1 | # repo maintenance tooling 2 | black>=22.12.0,<23.0.0 3 | flake8>=6.0.0,<7.0.0 4 | mypy>=0.991,<1.0.0 5 | isort>=5.11.4,<6.0.0 -------------------------------------------------------------------------------- /section-05-production-model-package/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | 6 | from setuptools import find_packages, setup 7 | 8 | # Package meta-data. 9 | NAME = 'tid-regression-model' 10 | DESCRIPTION = "Example regression model package from Train In Data." 11 | URL = "https://github.com/trainindata/testing-and-monitoring-ml-deployments" 12 | EMAIL = "christopher.samiullah@protonmail.com" 13 | AUTHOR = "ChristopherGS" 14 | REQUIRES_PYTHON = ">=3.6.0" 15 | 16 | 17 | # The rest you shouldn't have to touch too much :) 18 | # ------------------------------------------------ 19 | # Except, perhaps the License and Trove Classifiers! 20 | # If you do change the License, remember to change the 21 | # Trove Classifier for that! 22 | long_description = DESCRIPTION 23 | 24 | # Load the package's VERSION file as a dictionary. 25 | about = {} 26 | ROOT_DIR = Path(__file__).resolve().parent 27 | REQUIREMENTS_DIR = ROOT_DIR / 'requirements' 28 | PACKAGE_DIR = ROOT_DIR / 'regression_model' 29 | with open(PACKAGE_DIR / "VERSION") as f: 30 | _version = f.read().strip() 31 | about["__version__"] = _version 32 | 33 | 34 | # What packages are required for this module to be executed? 
35 | def list_reqs(fname="requirements.txt"): 36 | with open(REQUIREMENTS_DIR / fname) as fd: 37 | return fd.read().splitlines() 38 | 39 | # Where the magic happens: 40 | setup( 41 | name=NAME, 42 | version=about["__version__"], 43 | description=DESCRIPTION, 44 | long_description=long_description, 45 | long_description_content_type="text/markdown", 46 | author=AUTHOR, 47 | author_email=EMAIL, 48 | python_requires=REQUIRES_PYTHON, 49 | url=URL, 50 | packages=find_packages(exclude=("tests",)), 51 | package_data={"regression_model": ["VERSION"]}, 52 | install_requires=list_reqs(), 53 | extras_require={}, 54 | include_package_data=True, 55 | license="BSD-3", 56 | classifiers=[ 57 | # Trove classifiers 58 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 59 | "License :: OSI Approved :: BSD License", 60 | "Programming Language :: Python", 61 | "Programming Language :: Python :: 3", 62 | "Programming Language :: Python :: 3.6", 63 | "Programming Language :: Python :: 3.7", 64 | "Programming Language :: Python :: 3.8", 65 | "Programming Language :: Python :: 3.9", 66 | "Programming Language :: Python :: Implementation :: CPython", 67 | "Programming Language :: Python :: Implementation :: PyPy", 68 | ], 69 | ) -------------------------------------------------------------------------------- /section-05-production-model-package/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/section-05-production-model-package/tests/__init__.py -------------------------------------------------------------------------------- /section-05-production-model-package/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from regression_model.config.core import config 4 | from regression_model.processing.data_manager import load_dataset 5 | 6 | 7 | @pytest.fixture() 8 | def sample_input_data(): 9 | return load_dataset(file_name=config.app_config.test_data_file) 10 | -------------------------------------------------------------------------------- /section-05-production-model-package/tests/test_features.py: -------------------------------------------------------------------------------- 1 | from regression_model.config.core import config 2 | from regression_model.processing.features import TemporalVariableTransformer 3 | 4 | 5 | def test_temporal_variable_transformer(sample_input_data): 6 | # Given 7 | transformer = TemporalVariableTransformer( 8 | variables=config.model_config.temporal_vars, # YearRemodAdd 9 | reference_variable=config.model_config.ref_var, 10 | ) 11 | assert sample_input_data["YearRemodAdd"].iat[0] == 1961 12 | 13 | # When 14 | subject = transformer.fit_transform(sample_input_data) 15 | 16 | # Then 17 | assert subject["YearRemodAdd"].iat[0] == 49 18 | -------------------------------------------------------------------------------- /section-05-production-model-package/tests/test_prediction.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | 5 | from regression_model.predict import make_prediction 6 | 7 | 8 | def test_make_prediction(sample_input_data): 9 | # Given 10 | expected_first_prediction_value = 113422 11 | expected_no_predictions = 1449 12 | 13 | # When 14 | result = make_prediction(input_data=sample_input_data) 15 | 16 | # Then 17 | predictions =
result.get("predictions") 18 | assert isinstance(predictions, list) 19 | assert isinstance(predictions[0], np.float64) 20 | assert result.get("errors") is None 21 | assert len(predictions) == expected_no_predictions 22 | assert math.isclose(predictions[0], expected_first_prediction_value, abs_tol=100) 23 | -------------------------------------------------------------------------------- /section-05-production-model-package/tox.ini: -------------------------------------------------------------------------------- 1 | # Tox is a generic virtualenv management and test command line tool. Its goal is to 2 | # standardize testing in Python. We will be using it extensively in this course. 3 | 4 | # Using Tox we can (on multiple operating systems): 5 | # + Eliminate PYTHONPATH challenges when running scripts/tests 6 | # + Eliminate virtualenv setup confusion 7 | # + Streamline steps such as model training, model publishing 8 | 9 | 10 | [tox] 11 | min_version = 4 12 | envlist = test_package, checks 13 | skipsdist = True 14 | 15 | [testenv] 16 | basepython = python 17 | install_command = pip install {opts} {packages} 18 | allowlist_externals = train 19 | 20 | setenv = 21 | PYTHONPATH=. 22 | PYTHONHASHSEED=0 23 | 24 | [testenv:test_package] 25 | envdir = {toxworkdir}/test_package 26 | deps = 27 | -r{toxinidir}/requirements/test_requirements.txt 28 | commands= 29 | python regression_model/train_pipeline.py 30 | pytest \ 31 | -s \ 32 | -vv \ 33 | {posargs:tests/} 34 | 35 | [testenv:train] 36 | envdir = {toxworkdir}/test_package 37 | deps = 38 | {[testenv:test_package]deps} 39 | commands= 40 | python regression_model/train_pipeline.py 41 | 42 | 43 | [testenv:checks] 44 | envdir = {toxworkdir}/checks 45 | deps = 46 | -r{toxinidir}/requirements/typing_requirements.txt 47 | commands = 48 | flake8 regression_model tests 49 | isort regression_model tests 50 | {posargs:mypy regression_model} 51 | 52 | 53 | [flake8] 54 | exclude = .git,env 55 | max-line-length = 100 -------------------------------------------------------------------------------- /section-06-model-serving-api/house-prices-api/Procfile: -------------------------------------------------------------------------------- 1 | web: uvicorn app.main:app --host 0.0.0.0 --port $PORT -------------------------------------------------------------------------------- /section-06-model-serving-api/house-prices-api/app/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.2" 2 | -------------------------------------------------------------------------------- /section-06-model-serving-api/house-prices-api/app/api.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from fastapi import APIRouter, HTTPException 7 | from fastapi.encoders import jsonable_encoder 8 | from loguru import logger 9 | from regression_model import __version__ as model_version 10 | from regression_model.predict import make_prediction 11 | 12 | from app import __version__, schemas 13 | from app.config import settings 14 | 15 | api_router = APIRouter() 16 | 17 | 18 | @api_router.get("/health", response_model=schemas.Health, status_code=200) 19 | def health() -> dict: 20 | """ 21 | Root Get 22 | """ 23 | health = schemas.Health( 24 | name=settings.PROJECT_NAME, api_version=__version__, model_version=model_version 25 | ) 26 | 27 | return health.dict() 28 | 29 | 30 | @api_router.post("/predict", 
response_model=schemas.PredictionResults, status_code=200) 31 | async def predict(input_data: schemas.MultipleHouseDataInputs) -> Any: 32 | """ 33 | Make house price predictions with the TID regression model 34 | """ 35 | 36 | input_df = pd.DataFrame(jsonable_encoder(input_data.inputs)) 37 | 38 | # Advanced: You can improve performance of your API by rewriting the 39 | # `make prediction` function to be async and using await here. 40 | logger.info(f"Making prediction on inputs: {input_data.inputs}") 41 | results = make_prediction(input_data=input_df.replace({np.nan: None})) 42 | 43 | if results["errors"] is not None: 44 | logger.warning(f"Prediction validation error: {results.get('errors')}") 45 | raise HTTPException(status_code=400, detail=json.loads(results["errors"])) 46 | 47 | logger.info(f"Prediction results: {results.get('predictions')}") 48 | 49 | return results 50 | -------------------------------------------------------------------------------- /section-06-model-serving-api/house-prices-api/app/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from types import FrameType 4 | from typing import List, cast 5 | 6 | from loguru import logger 7 | from pydantic import AnyHttpUrl, BaseSettings 8 | 9 | 10 | class LoggingSettings(BaseSettings): 11 | LOGGING_LEVEL: int = logging.INFO # logging levels are type int 12 | 13 | 14 | class Settings(BaseSettings): 15 | API_V1_STR: str = "/api/v1" 16 | 17 | # Meta 18 | logging: LoggingSettings = LoggingSettings() 19 | 20 | # BACKEND_CORS_ORIGINS is a comma-separated list of origins 21 | # e.g: http://localhost,http://localhost:4200,http://localhost:3000 22 | BACKEND_CORS_ORIGINS: List[AnyHttpUrl] = [ 23 | "http://localhost:3000", # type: ignore 24 | "http://localhost:8000", # type: ignore 25 | "https://localhost:3000", # type: ignore 26 | "https://localhost:8000", # type: ignore 27 | ] 28 | 29 | PROJECT_NAME: str = "House Price Prediction API" 30 | 31 | class Config: 32 | case_sensitive = True 33 | 34 | 35 | # See: https://loguru.readthedocs.io/en/stable/overview.html#entirely-compatible-with-standard-logging # noqa 36 | class InterceptHandler(logging.Handler): 37 | def emit(self, record: logging.LogRecord) -> None: # pragma: no cover 38 | # Get corresponding Loguru level if it exists 39 | try: 40 | level = logger.level(record.levelname).name 41 | except ValueError: 42 | level = str(record.levelno) 43 | 44 | # Find caller from where originated the logged message 45 | frame, depth = logging.currentframe(), 2 46 | while frame.f_code.co_filename == logging.__file__: # noqa: WPS609 47 | frame = cast(FrameType, frame.f_back) 48 | depth += 1 49 | 50 | logger.opt(depth=depth, exception=record.exc_info).log( 51 | level, 52 | record.getMessage(), 53 | ) 54 | 55 | 56 | def setup_app_logging(config: Settings) -> None: 57 | """Prepare custom logging for our application.""" 58 | 59 | LOGGERS = ("uvicorn.asgi", "uvicorn.access") 60 | logging.getLogger().handlers = [InterceptHandler()] 61 | for logger_name in LOGGERS: 62 | logging_logger = logging.getLogger(logger_name) 63 | logging_logger.handlers = [InterceptHandler(level=config.logging.LOGGING_LEVEL)] 64 | 65 | logger.configure( 66 | handlers=[{"sink": sys.stderr, "level": config.logging.LOGGING_LEVEL}] 67 | ) 68 | 69 | 70 | settings = Settings() 71 | -------------------------------------------------------------------------------- /section-06-model-serving-api/house-prices-api/app/main.py: 
-------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from fastapi import APIRouter, FastAPI, Request 4 | from fastapi.middleware.cors import CORSMiddleware 5 | from fastapi.responses import HTMLResponse 6 | from loguru import logger 7 | 8 | from app.api import api_router 9 | from app.config import settings, setup_app_logging 10 | 11 | # setup logging as early as possible 12 | setup_app_logging(config=settings) 13 | 14 | 15 | app = FastAPI( 16 | title=settings.PROJECT_NAME, openapi_url=f"{settings.API_V1_STR}/openapi.json" 17 | ) 18 | 19 | root_router = APIRouter() 20 | 21 | 22 | @root_router.get("/") 23 | def index(request: Request) -> Any: 24 | """Basic HTML response.""" 25 | body = ( 26 | "" 27 | "
" 28 | "