├── .circleci └── config.yml ├── .dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── assignment-section-05 ├── MANIFEST.in ├── README.md ├── classification_model │ ├── VERSION │ ├── __init__.py │ ├── config.yml │ ├── config │ │ ├── __init__.py │ │ └── core.py │ ├── datasets │ │ └── __init__.py │ ├── pipeline.py │ ├── predict.py │ ├── processing │ │ ├── __init__.py │ │ ├── data_manager.py │ │ ├── features.py │ │ └── validation.py │ ├── train_pipeline.py │ └── trained_models │ │ └── __init__.py ├── mypy.ini ├── pyproject.toml ├── requirements │ ├── requirements.txt │ ├── test_requirements.txt │ └── typing_requirements.txt ├── setup.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_features.py │ └── test_prediction.py └── tox.ini ├── packages ├── ml_api │ ├── VERSION │ ├── api │ │ ├── __init__.py │ │ ├── app.py │ │ ├── config.py │ │ ├── controller.py │ │ └── validation.py │ ├── diff_test_requirements.txt │ ├── requirements.txt │ ├── run.py │ ├── run.sh │ ├── test_data_predictions.csv │ ├── tests │ │ ├── __init__.py │ │ ├── capture_model_predictions.py │ │ ├── conftest.py │ │ ├── differential_tests │ │ │ ├── __init__.py │ │ │ └── test_differential.py │ │ ├── test_controller.py │ │ └── test_validation.py │ └── tox.ini ├── neural_network_model │ ├── MANIFEST.in │ ├── config.yml │ ├── neural_network_model │ │ ├── VERSION │ │ ├── __init__.py │ │ ├── config │ │ │ ├── __init__.py │ │ │ └── config.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ └── test_data │ │ │ │ ├── Black-grass │ │ │ │ └── 1.png │ │ │ │ ├── Charlock │ │ │ │ └── 1.png │ │ │ │ └── __init__.py │ │ ├── model.py │ │ ├── pipeline.py │ │ ├── predict.py │ │ ├── processing │ │ │ ├── __init__.py │ │ │ ├── data_management.py │ │ │ ├── errors.py │ │ │ └── preprocessors.py │ │ ├── train_pipeline.py │ │ └── trained_models │ │ │ └── __init__.py │ ├── requirements.txt │ ├── setup.py │ └── tests │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── test_predict.py └── regression_model │ ├── MANIFEST.in │ ├── regression_model │ ├── VERSION │ ├── __init__.py │ ├── config │ │ ├── __init__.py │ │ ├── config.py │ │ └── logging_config.py │ ├── datasets │ │ └── __init__.py │ ├── pipeline.py │ ├── predict.py │ ├── processing │ │ ├── __init__.py │ │ ├── data_management.py │ │ ├── errors.py │ │ ├── features.py │ │ ├── preprocessors.py │ │ └── validation.py │ ├── train_pipeline.py │ └── trained_models │ │ └── __init__.py │ ├── requirements.txt │ ├── setup.py │ ├── tests │ ├── __init__.py │ └── test_predict.py │ └── tox.ini ├── scripts ├── fetch_kaggle_dataset.sh ├── fetch_kaggle_large_dataset.sh ├── input_test.json └── publish_model.sh ├── section-04-research-and-development ├── 01-machine-learning-pipeline-data-analysis.ipynb ├── 02-machine-learning-pipeline-feature-engineering.ipynb ├── 03-machine-learning-pipeline-feature-selection.ipynb ├── 04-machine-learning-pipeline-model-training.ipynb ├── 05-machine-learning-pipeline-scoring-new-data.ipynb ├── 06-feature-engineering-with-open-source.ipynb ├── 07-feature-engineering-pipeline.ipynb ├── 08-final-machine-learning-pipeline.ipynb ├── preprocessors.py ├── preprocessors_bonus.py ├── requirements.txt └── titanic-assignment │ ├── 01-predicting-survival-titanic-assignement.ipynb │ ├── 02-predicting-survival-titanic-solution.ipynb │ ├── 03-titanic-survival-pipeline-assignment.ipynb │ └── 04-titanic-survival-pipeline-solution.ipynb ├── section-05-production-model-package ├── MANIFEST.in ├── mypy.ini ├── pyproject.toml ├── regression_model │ ├── VERSION │ ├── __init__.py │ 
├── config.yml │ ├── config │ │ ├── __init__.py │ │ └── core.py │ ├── datasets │ │ └── __init__.py │ ├── pipeline.py │ ├── predict.py │ ├── processing │ │ ├── __init__.py │ │ ├── data_manager.py │ │ ├── features.py │ │ └── validation.py │ ├── train_pipeline.py │ └── trained_models │ │ └── __init__.py ├── requirements │ ├── requirements.txt │ ├── test_requirements.txt │ └── typing_requirements.txt ├── setup.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_features.py │ └── test_prediction.py └── tox.ini ├── section-06-model-serving-api └── house-prices-api │ ├── Procfile │ ├── app │ ├── __init__.py │ ├── api.py │ ├── config.py │ ├── main.py │ ├── schemas │ │ ├── __init__.py │ │ ├── health.py │ │ └── predict.py │ └── tests │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── test_api.py │ ├── mypy.ini │ ├── requirements.txt │ ├── test_requirements.txt │ ├── tox.ini │ └── typing_requirements.txt ├── section-07-ci-and-publishing ├── house-prices-api │ ├── Procfile │ ├── app │ │ ├── __init__.py │ │ ├── api.py │ │ ├── config.py │ │ ├── main.py │ │ ├── schemas │ │ │ ├── __init__.py │ │ │ ├── health.py │ │ │ └── predict.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── conftest.py │ │ │ └── test_api.py │ ├── mypy.ini │ ├── requirements.txt │ ├── test_requirements.txt │ ├── tox.ini │ └── typing_requirements.txt └── model-package │ ├── MANIFEST.in │ ├── mypy.ini │ ├── publish_model.sh │ ├── pyproject.toml │ ├── regression_model │ ├── VERSION │ ├── __init__.py │ ├── config.yml │ ├── config │ │ ├── __init__.py │ │ └── core.py │ ├── datasets │ │ └── __init__.py │ ├── pipeline.py │ ├── predict.py │ ├── processing │ │ ├── __init__.py │ │ ├── data_manager.py │ │ ├── features.py │ │ └── validation.py │ ├── train_pipeline.py │ └── trained_models │ │ └── __init__.py │ ├── requirements │ ├── requirements.txt │ ├── test_requirements.txt │ └── typing_requirements.txt │ ├── setup.py │ ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_features.py │ └── test_prediction.py │ └── tox.ini └── section-08-deploying-with-containers ├── .dockerignore ├── Dockerfile └── house-prices-api ├── Procfile ├── app ├── __init__.py ├── api.py ├── config.py ├── main.py ├── schemas │ ├── __init__.py │ ├── health.py │ └── predict.py └── tests │ ├── __init__.py │ ├── conftest.py │ └── test_api.py ├── mypy.ini ├── requirements.txt ├── run.sh ├── test_requirements.txt ├── tox.ini └── typing_requirements.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | jupyter_notebooks* 2 | */env* 3 | */venv* 4 | .circleci* 5 | packages/regression_model 6 | *.env 7 | *.log 8 | .git 9 | .gitignore -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | env39/ 89 | env311/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # pycharm 109 | .idea/ 110 | 111 | # datafiles 112 | packages/regression_model/regression_model/datasets/*.csv 113 | packages/regression_model/regression_model/datasets/*.zip 114 | packages/regression_model/regression_model/datasets/*.txt 115 | train.csv 116 | test.csv 117 | raw.csv 118 | data_description.txt 119 | house-prices-advanced-regression-techniques.zip 120 | sample_submission.csv 121 | test_data_predictions.csv 122 | v2-plant-seedlings-dataset/ 123 | v2-plant-seedlings-dataset.zip 124 | 125 | # all logs 126 | logs/ 127 | 128 | # trained models (will be created in CI) 129 | section-05-production-model-package/regression_model/trained_models/*.pkl 130 | packages/regression_model/regression_model/trained_models/*.pkl 131 | packages/neural_network_model/neural_network_model/trained_models/*.pkl 132 | packages/neural_network_model/neural_network_model/trained_models/*.h5 133 | *.h5 134 | packages/neural_network_model/neural_network_model/datasets/training_data_reference.txt 135 | *.pkl 136 | 137 | .DS_Store 138 | 139 | kaggle.json 140 | packages/ml_api/uploads/* 141 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6.4 2 | 3 | # Create the user that will run the app 4 | RUN adduser --disabled-password --gecos '' ml-api-user 5 | 6 | WORKDIR /opt/ml_api 7 | 8 | ARG PIP_EXTRA_INDEX_URL 9 | ENV FLASK_APP run.py 10 | 11 | # Install requirements, including from Gemfury 12 | ADD ./packages/ml_api /opt/ml_api/ 13 | RUN pip install --upgrade pip 14 | RUN pip install -r /opt/ml_api/requirements.txt 15 | 16 | RUN chmod +x /opt/ml_api/run.sh 17 | RUN chown -R ml-api-user:ml-api-user ./ 18 | 19 | USER ml-api-user 20 | 21 | EXPOSE 5000 22 | 23 | CMD ["bash", "./run.sh"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Soledad Galli and Christopher Samiullah. Deployment of Machine Learning Models, online course. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. 
Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | NAME=udemy-ml-api 2 | COMMIT_ID=$(shell git rev-parse HEAD) 3 | 4 | 5 | build-ml-api-heroku: 6 | docker build --build-arg PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -t registry.heroku.com/$(NAME)/web:$(COMMIT_ID) . 7 | 8 | push-ml-api-heroku: 9 | docker push registry.heroku.com/${HEROKU_APP_NAME}/web:$(COMMIT_ID) 10 | 11 | build-ml-api-aws: 12 | docker build --build-arg PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -t $(NAME):$(COMMIT_ID) . 13 | 14 | push-ml-api-aws: 15 | docker push ${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/$(NAME):$(COMMIT_ID) 16 | 17 | tag-ml-api: 18 | docker tag $(NAME):$(COMMIT_ID) ${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/$(NAME):$(COMMIT_ID) 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deployment of Machine Learning Models 2 | Accompanying repo for the online course Deployment of Machine Learning Models. 3 | 4 | For the documentation, visit the [course on Udemy](https://www.udemy.com/deployment-of-machine-learning-models/?couponCode=TIDREPO). 
5 | -------------------------------------------------------------------------------- /assignment-section-05/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | include *.pkl 4 | recursive-include ./classification_model/* 5 | 6 | include classification_model/datasets/train.csv 7 | include classification_model/datasets/test.csv 8 | include classification_model/trained_models/*.pkl 9 | include classification_model/VERSION 10 | include classification_model/config.yml 11 | 12 | include ./requirements/requirements.txt 13 | include ./requirements/test_requirements.txt 14 | exclude *.log 15 | exclude *.cfg 16 | 17 | recursive-exclude * __pycache__ 18 | recursive-exclude * *.py[co] -------------------------------------------------------------------------------- /assignment-section-05/README.md: -------------------------------------------------------------------------------- 1 | # Productionized Titanic Classification Model Package 2 | 3 | ## Run With Tox (Recommended) 4 | - Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl 5 | - Save the file as `raw.csv` in the classification_model/datasets directory 6 | - `pip install tox` 7 | - Make sure you are in the assignment-section-05 directory (where the tox.ini file is), then run the command: `tox` (this runs the tests and type checks, and trains the model under the hood). The first time you run this it creates a virtual env and installs 8 | dependencies, so it takes a few minutes. 9 | 10 | ## Run Without Tox 11 | - Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl 12 | - Save the file as `raw.csv` in the classification_model/datasets directory 13 | - Add the assignment-section-05 *and* classification_model paths to your system PYTHONPATH 14 | - `pip install -r requirements/test_requirements.txt` 15 | - Train the model: `python classification_model/train_pipeline.py` 16 | - Run the tests: `pytest tests` -------------------------------------------------------------------------------- /assignment-section-05/classification_model/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.1 2 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from classification_model.config.core import PACKAGE_ROOT, config 4 | 5 | # It is strongly advised that you do not add any handlers other than 6 | # NullHandler to your library’s loggers. This is because the configuration 7 | # of handlers is the prerogative of the application developer who uses your 8 | # library. The application developer knows their target audience and what 9 | # handlers are most appropriate for their application: if you add handlers 10 | # ‘under the hood’, you might well interfere with their ability to carry out 11 | # unit tests and deliver logs which suit their requirements.
12 | # https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library 13 | logging.getLogger(config.app_config.package_name).addHandler(logging.NullHandler()) 14 | 15 | 16 | with open(PACKAGE_ROOT / "VERSION") as version_file: 17 | __version__ = version_file.read().strip() 18 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/config.yml: -------------------------------------------------------------------------------- 1 | # Package Overview 2 | package_name: classification_model 3 | 4 | # Data Files 5 | raw_data_file: raw.csv 6 | training_data_file: train.csv 7 | test_data_file: test.csv 8 | 9 | # Variables 10 | # The variable we are attempting to predict (survival) 11 | target: survived 12 | 13 | pipeline_name: titanic_classification_model 14 | pipeline_save_file: titanic_classification_model_output_v 15 | 16 | features: 17 | - pclass 18 | - sex 19 | - age 20 | - sibsp 21 | - parch 22 | - fare 23 | - cabin 24 | - embarked 25 | - title # generated from name 26 | 27 | # set train/test split 28 | test_size: 0.1 29 | 30 | # to set the random seed 31 | random_state: 0 32 | 33 | unused_fields: 34 | - name 35 | - ticket 36 | - boat 37 | - body 38 | - home.dest 39 | 40 | numerical_vars: 41 | - age 42 | - fare 43 | 44 | categorical_vars: 45 | - sex 46 | - cabin 47 | - embarked 48 | - title 49 | 50 | cabin_vars: 51 | - cabin -------------------------------------------------------------------------------- /assignment-section-05/classification_model/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/assignment-section-05/classification_model/config/__init__.py -------------------------------------------------------------------------------- /assignment-section-05/classification_model/config/core.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Sequence 3 | 4 | from pydantic import BaseModel 5 | from strictyaml import YAML, load 6 | 7 | import classification_model 8 | 9 | # Project Directories 10 | PACKAGE_ROOT = Path(classification_model.__file__).resolve().parent 11 | ROOT = PACKAGE_ROOT.parent 12 | CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml" 13 | DATASET_DIR = PACKAGE_ROOT / "datasets" 14 | TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models" 15 | 16 | 17 | class AppConfig(BaseModel): 18 | """ 19 | Application-level config. 20 | """ 21 | 22 | package_name: str 23 | raw_data_file: str 24 | pipeline_save_file: str 25 | 26 | 27 | class ModelConfig(BaseModel): 28 | """ 29 | All configuration relevant to model 30 | training and feature engineering.
31 | """ 32 | 33 | target: str 34 | unused_fields: Sequence[str] 35 | features: Sequence[str] 36 | test_size: float 37 | random_state: int 38 | numerical_vars: Sequence[str] 39 | categorical_vars: Sequence[str] 40 | cabin_vars: Sequence[str] 41 | 42 | 43 | class Config(BaseModel): 44 | """Master config object.""" 45 | 46 | app_config: AppConfig 47 | model_config: ModelConfig 48 | 49 | 50 | def find_config_file() -> Path: 51 | """Locate the configuration file.""" 52 | if CONFIG_FILE_PATH.is_file(): 53 | return CONFIG_FILE_PATH 54 | raise Exception(f"Config not found at {CONFIG_FILE_PATH!r}") 55 | 56 | 57 | def fetch_config_from_yaml(cfg_path: Path = None) -> YAML: 58 | """Parse YAML containing the package configuration.""" 59 | 60 | if not cfg_path: 61 | cfg_path = find_config_file() 62 | 63 | if cfg_path: 64 | with open(cfg_path, "r") as conf_file: 65 | parsed_config = load(conf_file.read()) 66 | return parsed_config 67 | raise OSError(f"Did not find config file at path: {cfg_path}") 68 | 69 | 70 | def create_and_validate_config(parsed_config: YAML = None) -> Config: 71 | """Run validation on config values.""" 72 | if parsed_config is None: 73 | parsed_config = fetch_config_from_yaml() 74 | 75 | # specify the data attribute from the strictyaml YAML type. 76 | _config = Config( 77 | app_config=AppConfig(**parsed_config.data), 78 | model_config=ModelConfig(**parsed_config.data), 79 | ) 80 | 81 | return _config 82 | 83 | 84 | config = create_and_validate_config() 85 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/assignment-section-05/classification_model/datasets/__init__.py -------------------------------------------------------------------------------- /assignment-section-05/classification_model/pipeline.py: -------------------------------------------------------------------------------- 1 | # for encoding categorical variables 2 | from feature_engine.encoding import OneHotEncoder, RareLabelEncoder 3 | 4 | # for imputation 5 | from feature_engine.imputation import ( 6 | AddMissingIndicator, 7 | CategoricalImputer, 8 | MeanMedianImputer, 9 | ) 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.pipeline import Pipeline 12 | from sklearn.preprocessing import StandardScaler 13 | 14 | from classification_model.config.core import config 15 | from classification_model.processing.features import ExtractLetterTransformer 16 | 17 | titanic_pipe = Pipeline( 18 | [ 19 | # impute categorical variables with string missing 20 | ( 21 | "categorical_imputation", 22 | CategoricalImputer( 23 | imputation_method="missing", 24 | variables=config.model_config.categorical_vars, 25 | ), 26 | ), 27 | # add missing indicator to numerical variables 28 | ( 29 | "missing_indicator", 30 | AddMissingIndicator(variables=config.model_config.numerical_vars), 31 | ), 32 | # impute numerical variables with the median 33 | ( 34 | "median_imputation", 35 | MeanMedianImputer( 36 | imputation_method="median", variables=config.model_config.numerical_vars 37 | ), 38 | ), 39 | # Extract letter from cabin 40 | ( 41 | "extract_letter", 42 | ExtractLetterTransformer(variables=config.model_config.cabin_vars), 43 | ), 44 | # == CATEGORICAL ENCODING ====== 45 | # remove categories present in less than 5% of the 
observations (0.05) 46 | # group them in one category called 'Rare' 47 | ( 48 | "rare_label_encoder", 49 | RareLabelEncoder( 50 | tol=0.05, n_categories=1, variables=config.model_config.categorical_vars 51 | ), 52 | ), 53 | # encode categorical variables using one hot encoding into k-1 variables 54 | ( 55 | "categorical_encoder", 56 | OneHotEncoder( 57 | drop_last=True, variables=config.model_config.categorical_vars 58 | ), 59 | ), 60 | # scale 61 | ("scaler", StandardScaler()), 62 | ("Logit", LogisticRegression(C=0.0005, random_state=0)), 63 | ] 64 | ) 65 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/predict.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import pandas as pd 4 | 5 | from classification_model import __version__ as _version 6 | from classification_model.config.core import config 7 | from classification_model.processing.data_manager import load_pipeline 8 | from classification_model.processing.validation import validate_inputs 9 | 10 | pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl" 11 | _titanic_pipe = load_pipeline(file_name=pipeline_file_name) 12 | 13 | 14 | def make_prediction( 15 | *, 16 | input_data: t.Union[pd.DataFrame, dict], 17 | ) -> dict: 18 | """Make a prediction using a saved model pipeline.""" 19 | 20 | data = pd.DataFrame(input_data) 21 | validated_data, errors = validate_inputs(input_data=data) 22 | results = {"predictions": None, "version": _version, "errors": errors} 23 | 24 | if not errors: 25 | predictions = _titanic_pipe.predict( 26 | X=validated_data[config.model_config.features] 27 | ) 28 | results = { 29 | "predictions": predictions, 30 | "version": _version, 31 | "errors": errors, 32 | } 33 | 34 | return results 35 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/assignment-section-05/classification_model/processing/__init__.py -------------------------------------------------------------------------------- /assignment-section-05/classification_model/processing/data_manager.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from pathlib import Path 4 | from typing import Any, List, Union 5 | 6 | import joblib 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.pipeline import Pipeline 10 | 11 | from classification_model import __version__ as _version 12 | from classification_model.config.core import DATASET_DIR, TRAINED_MODEL_DIR, config 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | # float type for np.nan 18 | def get_first_cabin(row: Any) -> Union[str, float]: 19 | try: 20 | return row.split()[0] 21 | except AttributeError: 22 | return np.nan 23 | 24 | 25 | def get_title(passenger: str) -> str: 26 | """Extracts the title (Mr, Ms, etc) from the name variable.""" 27 | line = passenger 28 | if re.search("Mrs", line): 29 | return "Mrs" 30 | elif re.search("Mr", line): 31 | return "Mr" 32 | elif re.search("Miss", line): 33 | return "Miss" 34 | elif re.search("Master", line): 35 | return "Master" 36 | else: 37 | return "Other" 38 | 39 | 40 | def pre_pipeline_preparation(*, dataframe: 
pd.DataFrame) -> pd.DataFrame: 41 | # replace question marks with NaN values 42 | data = dataframe.replace("?", np.nan) 43 | 44 | # retain only the first cabin if more than 45 | # 1 are available per passenger 46 | data["cabin"] = data["cabin"].apply(get_first_cabin) 47 | 48 | data["title"] = data["name"].apply(get_title) 49 | 50 | # cast numerical variables as floats 51 | data["fare"] = data["fare"].astype("float") 52 | data["age"] = data["age"].astype("float") 53 | 54 | # drop unnecessary variables 55 | data.drop(labels=config.model_config.unused_fields, axis=1, inplace=True) 56 | 57 | return data 58 | 59 | 60 | def _load_raw_dataset(*, file_name: str) -> pd.DataFrame: 61 | dataframe = pd.read_csv(Path(f"{DATASET_DIR}/{file_name}")) 62 | return dataframe 63 | 64 | 65 | def load_dataset(*, file_name: str) -> pd.DataFrame: 66 | dataframe = pd.read_csv(Path(f"{DATASET_DIR}/{file_name}")) 67 | transformed = pre_pipeline_preparation(dataframe=dataframe) 68 | 69 | return transformed 70 | 71 | 72 | def save_pipeline(*, pipeline_to_persist: Pipeline) -> None: 73 | """Persist the pipeline. 74 | Saves the versioned model, and overwrites any previous 75 | saved models. This ensures that when the package is 76 | published, there is only one trained model that can be 77 | called, and we know exactly how it was built. 78 | """ 79 | 80 | # Prepare versioned save file name 81 | save_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl" 82 | save_path = TRAINED_MODEL_DIR / save_file_name 83 | 84 | remove_old_pipelines(files_to_keep=[save_file_name]) 85 | joblib.dump(pipeline_to_persist, save_path) 86 | 87 | 88 | def load_pipeline(*, file_name: str) -> Pipeline: 89 | """Load a persisted pipeline.""" 90 | 91 | file_path = TRAINED_MODEL_DIR / file_name 92 | return joblib.load(filename=file_path) 93 | 94 | 95 | def remove_old_pipelines(*, files_to_keep: List[str]) -> None: 96 | """ 97 | Remove old model pipelines. 98 | This is to ensure there is a simple one-to-one 99 | mapping between the package version and the model 100 | version to be imported and used by other applications. 
101 | """ 102 | do_not_delete = files_to_keep + ["__init__.py"] 103 | for model_file in TRAINED_MODEL_DIR.iterdir(): 104 | if model_file.name not in do_not_delete: 105 | model_file.unlink() 106 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/processing/features.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator, TransformerMixin 2 | 3 | 4 | class ExtractLetterTransformer(BaseEstimator, TransformerMixin): 5 | # Extract first letter of variable 6 | 7 | def __init__(self, variables): 8 | 9 | if not isinstance(variables, list): 10 | raise ValueError("variables should be a list") 11 | 12 | self.variables = variables 13 | 14 | def fit(self, X, y=None): 15 | # we need this step to fit the sklearn pipeline 16 | return self 17 | 18 | def transform(self, X): 19 | 20 | # so that we do not over-write the original dataframe 21 | X = X.copy() 22 | 23 | for feature in self.variables: 24 | X[feature] = X[feature].str[0] 25 | 26 | return X 27 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/processing/validation.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from pydantic import BaseModel, ValidationError 6 | 7 | from classification_model.config.core import config 8 | from classification_model.processing.data_manager import pre_pipeline_preparation 9 | 10 | 11 | def validate_inputs(*, input_data: pd.DataFrame) -> Tuple[pd.DataFrame, Optional[dict]]: 12 | """Check model inputs for unprocessable values.""" 13 | 14 | pre_processed = pre_pipeline_preparation(dataframe=input_data) 15 | validated_data = pre_processed[config.model_config.features].copy() 16 | errors = None 17 | 18 | try: 19 | # replace numpy nans so that pydantic can validate 20 | MultipleTitanicDataInputs( 21 | inputs=validated_data.replace({np.nan: None}).to_dict(orient="records") 22 | ) 23 | except ValidationError as error: 24 | errors = error.json() 25 | 26 | return validated_data, errors 27 | 28 | 29 | class TitanicDataInputSchema(BaseModel): 30 | pclass: Optional[int] 31 | name: Optional[str] 32 | sex: Optional[str] 33 | age: Optional[int] 34 | sibsp: Optional[int] 35 | parch: Optional[int] 36 | ticket: Optional[int] 37 | fare: Optional[float] 38 | cabin: Optional[str] 39 | embarked: Optional[str] 40 | boat: Optional[Union[str, int]] 41 | body: Optional[int] 42 | # TODO: rename home.dest, can get away with it now as it is not used 43 | 44 | 45 | class MultipleTitanicDataInputs(BaseModel): 46 | inputs: List[TitanicDataInputSchema] 47 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/train_pipeline.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split 2 | 3 | from classification_model.config.core import config 4 | from classification_model.pipeline import titanic_pipe 5 | from classification_model.processing.data_manager import load_dataset, save_pipeline 6 | 7 | 8 | def run_training() -> None: 9 | """ 10 | Train the model. 
11 | 12 | Training data can be found here: 13 | https://www.openml.org/data/get_csv/16826755/phpMYEkMl 14 | """ 15 | 16 | # read training data 17 | data = load_dataset(file_name=config.app_config.raw_data_file) 18 | 19 | # divide train and test 20 | X_train, X_test, y_train, y_test = train_test_split( 21 | data[config.model_config.features], # predictors 22 | data[config.model_config.target], 23 | test_size=config.model_config.test_size, 24 | # we are setting the random seed here 25 | # for reproducibility 26 | random_state=config.model_config.random_state, 27 | ) 28 | 29 | # fit model 30 | titanic_pipe.fit(X_train, y_train) 31 | 32 | # persist trained model 33 | save_pipeline(pipeline_to_persist=titanic_pipe) 34 | 35 | 36 | if __name__ == "__main__": 37 | run_training() 38 | -------------------------------------------------------------------------------- /assignment-section-05/classification_model/trained_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/assignment-section-05/classification_model/trained_models/__init__.py -------------------------------------------------------------------------------- /assignment-section-05/mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | warn_unreachable = False 3 | warn_unused_ignores = True 4 | follow_imports = skip 5 | show_error_context = True 6 | warn_incomplete_stub = True 7 | ignore_missing_imports = True 8 | check_untyped_defs = True 9 | cache_dir = /dev/null 10 | # Allow defining functions without any types. 11 | disallow_untyped_defs = False 12 | warn_redundant_casts = True 13 | warn_unused_configs = True 14 | strict_optional = True -------------------------------------------------------------------------------- /assignment-section-05/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [tool.pytest.ini_options] 9 | minversion = "2.0" 10 | addopts = "-rfEX -p pytester --strict-markers" 11 | python_files = ["test_*.py", "*_test.py"] 12 | python_classes = ["Test", "Acceptance"] 13 | python_functions = ["test"] 14 | # NOTE: "doc" is not included here, but gets tested explicitly via "doctesting". 15 | testpaths = ["tests"] 16 | xfail_strict = true 17 | filterwarnings = [ 18 | "error", 19 | "default:Using or importing the ABCs:DeprecationWarning:unittest2.*", 20 | # produced by older pyparsing<=2.2.0. 21 | "default:Using or importing the ABCs:DeprecationWarning:pyparsing.*", 22 | "default:the imp module is deprecated in favour of importlib:DeprecationWarning:nose.*", 23 | # distutils is deprecated in 3.10, scheduled for removal in 3.12 24 | "ignore:The distutils package is deprecated:DeprecationWarning", 25 | # produced by python3.6/site.py itself (3.6.7 on Travis, could not trigger it with 3.6.8)." 
26 | "ignore:.*U.*mode is deprecated:DeprecationWarning:(?!(pytest|_pytest))", 27 | # produced by pytest-xdist 28 | "ignore:.*type argument to addoption.*:DeprecationWarning", 29 | # produced on execnet (pytest-xdist) 30 | "ignore:.*inspect.getargspec.*deprecated, use inspect.signature.*:DeprecationWarning", 31 | # pytest's own futurewarnings 32 | "ignore::pytest.PytestExperimentalApiWarning", 33 | # Do not cause SyntaxError for invalid escape sequences in py37. 34 | # Those are caught/handled by pyupgrade, and not easy to filter with the 35 | # module being the filename (with .py removed). 36 | "default:invalid escape sequence:DeprecationWarning", 37 | # ignore use of unregistered marks, because we use many to test the implementation 38 | "ignore::_pytest.warning_types.PytestUnknownMarkWarning", 39 | ] 40 | 41 | [tool.black] 42 | target-version = ['py311'] 43 | 44 | [tool.isort] 45 | profile = "black" 46 | line_length = 100 47 | lines_between_sections = 1 48 | skip = "migrations" 49 | -------------------------------------------------------------------------------- /assignment-section-05/requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | # We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release) 2 | # to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small 3 | # updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes. 4 | numpy>=1.21.0,<2.0.0 5 | pandas>=1.3.5,<2.0.0 6 | pydantic>=1.8.1,<2.0.0 7 | scikit-learn>=1.1.3,<2.0.0 8 | strictyaml>=1.3.2,<2.0.0 9 | ruamel.yaml>=0.16.12,<1.0.0 10 | feature-engine>=1.0.2,<2.0.0 11 | joblib>=1.0.1,<2.0.0 -------------------------------------------------------------------------------- /assignment-section-05/requirements/test_requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | 3 | # testing requirements 4 | pytest>=7.2.0,<8.0.0 5 | -------------------------------------------------------------------------------- /assignment-section-05/requirements/typing_requirements.txt: -------------------------------------------------------------------------------- 1 | # repo maintenance tooling 2 | black>=22.12.0,<23.0.0 3 | flake8>=6.0.0,<7.0.0 4 | mypy>=0.991,<1.0.0 5 | isort>=5.11.4,<6.0.0 -------------------------------------------------------------------------------- /assignment-section-05/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | 6 | from setuptools import find_packages, setup 7 | 8 | # Package meta-data. 9 | NAME = 'tid-titanic-classification-model' 10 | DESCRIPTION = "Example Titanic dataset classification model package from Train In Data." 11 | URL = "https://github.com/trainindata/deploying-machine-learning-models" 12 | EMAIL = "christopher.samiullah@protonmail.com" 13 | AUTHOR = "ChristopherGS" 14 | REQUIRES_PYTHON = ">=3.7.0" 15 | 16 | 17 | # The rest you shouldn't have to touch too much :) 18 | # ------------------------------------------------ 19 | # Except, perhaps the License and Trove Classifiers! 20 | # Trove Classifiers: https://pypi.org/classifiers/ 21 | # If you do change the License, remember to change the 22 | # Trove Classifier for that! 
23 | long_description = DESCRIPTION 24 | 25 | # Load the package's VERSION file as a dictionary. 26 | about = {} 27 | ROOT_DIR = Path(__file__).resolve().parent 28 | REQUIREMENTS_DIR = ROOT_DIR / 'requirements' 29 | PACKAGE_DIR = ROOT_DIR / 'classification_model' 30 | with open(PACKAGE_DIR / "VERSION") as f: 31 | _version = f.read().strip() 32 | about["__version__"] = _version 33 | 34 | 35 | # What packages are required for this module to be executed? 36 | def list_reqs(fname="requirements.txt"): 37 | with open(REQUIREMENTS_DIR / fname) as fd: 38 | return fd.read().splitlines() 39 | 40 | # Where the magic happens: 41 | setup( 42 | name=NAME, 43 | version=about["__version__"], 44 | description=DESCRIPTION, 45 | long_description=long_description, 46 | long_description_content_type="text/markdown", 47 | author=AUTHOR, 48 | author_email=EMAIL, 49 | python_requires=REQUIRES_PYTHON, 50 | url=URL, 51 | packages=find_packages(exclude=("tests",)), 52 | package_data={"classification_model": ["VERSION"]}, 53 | install_requires=list_reqs(), 54 | extras_require={}, 55 | include_package_data=True, 56 | license="BSD-3", 57 | classifiers=[ 58 | # Trove classifiers 59 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 60 | "License :: OSI Approved :: BSD License", 61 | "Programming Language :: Python", 62 | "Programming Language :: Python :: 3", 63 | "Programming Language :: Python :: 3.7", 64 | "Programming Language :: Python :: 3.8", 65 | "Programming Language :: Python :: 3.9", 66 | "Programming Language :: Python :: 3.10", 67 | "Programming Language :: Python :: 3.11", 68 | "Programming Language :: Python :: Implementation :: CPython", 69 | "Programming Language :: Python :: Implementation :: PyPy", 70 | ], 71 | ) -------------------------------------------------------------------------------- /assignment-section-05/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/assignment-section-05/tests/__init__.py -------------------------------------------------------------------------------- /assignment-section-05/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pytest 4 | from sklearn.model_selection import train_test_split 5 | 6 | from classification_model.config.core import config 7 | from classification_model.processing.data_manager import _load_raw_dataset 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | @pytest.fixture 13 | def sample_input_data(): 14 | data = _load_raw_dataset(file_name=config.app_config.raw_data_file) 15 | 16 | # divide train and test 17 | X_train, X_test, y_train, y_test = train_test_split( 18 | data, # predictors 19 | data[config.model_config.target], 20 | test_size=config.model_config.test_size, 21 | # we are setting the random seed here 22 | # for reproducibility 23 | random_state=config.model_config.random_state, 24 | ) 25 | 26 | return X_test 27 | -------------------------------------------------------------------------------- /assignment-section-05/tests/test_features.py: -------------------------------------------------------------------------------- 1 | from classification_model.config.core import config 2 | from classification_model.processing.features import ExtractLetterTransformer 3 | 4 | 5 | def test_extract_letter_transformer(sample_input_data): 6 | # Given 7 | transformer =
ExtractLetterTransformer( 8 | variables=config.model_config.cabin_vars, # cabin 9 | ) 10 | assert sample_input_data["cabin"].iat[6] == "E12" 11 | 12 | # When 13 | subject = transformer.fit_transform(sample_input_data) 14 | 15 | # Then 16 | assert subject["cabin"].iat[6] == "E" 17 | -------------------------------------------------------------------------------- /assignment-section-05/tests/test_prediction.py: -------------------------------------------------------------------------------- 1 | """ 2 | Note: These tests will fail if you have not first trained the model. 3 | """ 4 | 5 | import numpy as np 6 | from sklearn.metrics import accuracy_score 7 | 8 | from classification_model.predict import make_prediction 9 | 10 | 11 | def test_make_prediction(sample_input_data): 12 | # Given 13 | expected_no_predictions = 131 14 | 15 | # When 16 | result = make_prediction(input_data=sample_input_data) 17 | 18 | # Then 19 | predictions = result.get("predictions") 20 | assert isinstance(predictions, np.ndarray) 21 | assert isinstance(predictions[0], np.int64) 22 | assert result.get("errors") is None 23 | assert len(predictions) == expected_no_predictions 24 | _predictions = list(predictions) 25 | y_true = sample_input_data["survived"] 26 | accuracy = accuracy_score(_predictions, y_true) 27 | assert accuracy > 0.7 28 | -------------------------------------------------------------------------------- /assignment-section-05/tox.ini: -------------------------------------------------------------------------------- 1 | # Tox is a generic virtualenv management and test command line tool. Its goal is to 2 | # standardize testing in Python. We will be using it extensively in this course. 3 | 4 | # Using Tox we can (on multiple operating systems): 5 | # + Eliminate PYTHONPATH challenges when running scripts/tests 6 | # + Eliminate virtualenv setup confusion 7 | # + Streamline steps such as model training, model publishing 8 | 9 | 10 | [tox] 11 | envlist = test_package, checks 12 | skipsdist = True 13 | 14 | [testenv] 15 | install_command = pip install {opts} {packages} 16 | 17 | [testenv:test_package] 18 | deps = 19 | -rrequirements/test_requirements.txt 20 | 21 | setenv = 22 | PYTHONPATH=. 
23 | PYTHONHASHSEED=0 24 | 25 | commands= 26 | python classification_model/train_pipeline.py 27 | pytest \ 28 | -s \ 29 | -vv \ 30 | {posargs:tests/} 31 | 32 | 33 | [testenv:train] 34 | envdir = {toxworkdir}/test_package 35 | 36 | deps = 37 | {[testenv:test_package]deps} 38 | 39 | setenv = 40 | {[testenv:test_package]setenv} 41 | commands= 42 | python classification_model/train_pipeline.py 43 | 44 | 45 | [testenv:checks] 46 | envdir = {toxworkdir}/checks 47 | deps = 48 | -r{toxinidir}/requirements/typing_requirements.txt 49 | commands = 50 | flake8 classification_model tests 51 | isort classification_model tests 52 | black classification_model tests 53 | {posargs:mypy classification_model} 54 | 55 | 56 | [flake8] 57 | exclude = .git,env 58 | max-line-length = 90 -------------------------------------------------------------------------------- /packages/ml_api/VERSION: -------------------------------------------------------------------------------- 1 | 0.3.0 -------------------------------------------------------------------------------- /packages/ml_api/api/__init__.py: -------------------------------------------------------------------------------- 1 | from api.config import PACKAGE_ROOT 2 | 3 | with open(PACKAGE_ROOT / 'VERSION') as version_file: 4 | __version__ = version_file.read().strip() 5 | -------------------------------------------------------------------------------- /packages/ml_api/api/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | 3 | from api.config import get_logger 4 | 5 | 6 | _logger = get_logger(logger_name=__name__) 7 | 8 | 9 | def create_app(*, config_object) -> Flask: 10 | """Create a flask app instance.""" 11 | 12 | flask_app = Flask('ml_api') 13 | flask_app.config.from_object(config_object) 14 | 15 | # import blueprints 16 | from api.controller import prediction_app 17 | flask_app.register_blueprint(prediction_app) 18 | _logger.debug('Application instance created') 19 | 20 | return flask_app 21 | -------------------------------------------------------------------------------- /packages/ml_api/api/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from logging.handlers import TimedRotatingFileHandler 3 | import pathlib 4 | import os 5 | import sys 6 | 7 | PACKAGE_ROOT = pathlib.Path(__file__).resolve().parent.parent 8 | 9 | FORMATTER = logging.Formatter( 10 | "%(asctime)s — %(name)s — %(levelname)s —" 11 | "%(funcName)s:%(lineno)d — %(message)s") 12 | LOG_DIR = PACKAGE_ROOT / 'logs' 13 | LOG_DIR.mkdir(exist_ok=True) 14 | LOG_FILE = LOG_DIR / 'ml_api.log' 15 | UPLOAD_FOLDER = PACKAGE_ROOT / 'uploads' 16 | UPLOAD_FOLDER.mkdir(exist_ok=True) 17 | 18 | ALLOWED_EXTENSIONS = set(['png', 'jpg', 'jpeg']) 19 | 20 | 21 | def get_console_handler(): 22 | console_handler = logging.StreamHandler(sys.stdout) 23 | console_handler.setFormatter(FORMATTER) 24 | return console_handler 25 | 26 | 27 | def get_file_handler(): 28 | file_handler = TimedRotatingFileHandler( 29 | LOG_FILE, when='midnight') 30 | file_handler.setFormatter(FORMATTER) 31 | file_handler.setLevel(logging.WARNING) 32 | return file_handler 33 | 34 | 35 | def get_logger(*, logger_name): 36 | """Get logger with prepared handlers.""" 37 | 38 | logger = logging.getLogger(logger_name) 39 | 40 | logger.setLevel(logging.INFO) 41 | 42 | logger.addHandler(get_console_handler()) 43 | logger.addHandler(get_file_handler()) 44 | logger.propagate = False 45 | 46 | return logger 47 | 48 | 49 | class 
Config: 50 | DEBUG = False 51 | TESTING = False 52 | CSRF_ENABLED = True 53 | SECRET_KEY = 'this-really-needs-to-be-changed' 54 | SERVER_PORT = 5000 55 | UPLOAD_FOLDER = UPLOAD_FOLDER 56 | 57 | 58 | class ProductionConfig(Config): 59 | DEBUG = False 60 | SERVER_ADDRESS = os.environ.get('SERVER_ADDRESS', '0.0.0.0') 61 | SERVER_PORT = os.environ.get('SERVER_PORT', '5000') 62 | 63 | 64 | class DevelopmentConfig(Config): 65 | DEVELOPMENT = True 66 | DEBUG = True 67 | 68 | 69 | class TestingConfig(Config): 70 | TESTING = True 71 | -------------------------------------------------------------------------------- /packages/ml_api/api/controller.py: -------------------------------------------------------------------------------- 1 | from flask import Blueprint, request, jsonify 2 | from regression_model.predict import make_prediction 3 | from regression_model import __version__ as _version 4 | from neural_network_model.predict import make_single_prediction 5 | import os 6 | from werkzeug.utils import secure_filename 7 | 8 | from api.config import get_logger, UPLOAD_FOLDER 9 | from api.validation import validate_inputs, allowed_file 10 | from api import __version__ as api_version 11 | 12 | _logger = get_logger(logger_name=__name__) 13 | 14 | 15 | prediction_app = Blueprint('prediction_app', __name__) 16 | 17 | 18 | @prediction_app.route('/health', methods=['GET']) 19 | def health(): 20 | if request.method == 'GET': 21 | _logger.info('health status OK') 22 | return 'ok' 23 | 24 | 25 | @prediction_app.route('/version', methods=['GET']) 26 | def version(): 27 | if request.method == 'GET': 28 | return jsonify({'model_version': _version, 29 | 'api_version': api_version}) 30 | 31 | 32 | @prediction_app.route('/v1/predict/regression', methods=['POST']) 33 | def predict(): 34 | if request.method == 'POST': 35 | # Step 1: Extract POST data from request body as JSON 36 | json_data = request.get_json() 37 | _logger.debug(f'Inputs: {json_data}') 38 | 39 | # Step 2: Validate the input using marshmallow schema 40 | input_data, errors = validate_inputs(input_data=json_data) 41 | 42 | # Step 3: Model prediction 43 | result = make_prediction(input_data=input_data) 44 | _logger.debug(f'Outputs: {result}') 45 | 46 | # Step 4: Convert numpy ndarray to list 47 | predictions = result.get('predictions').tolist() 48 | version = result.get('version') 49 | 50 | # Step 5: Return the response as JSON 51 | return jsonify({'predictions': predictions, 52 | 'version': version, 53 | 'errors': errors}) 54 | 55 | 56 | @prediction_app.route('/predict/classifier', methods=['POST']) 57 | def predict_image(): 58 | if request.method == 'POST': 59 | # Step 1: check if the post request has the file part 60 | if 'file' not in request.files: 61 | return jsonify('No file found'), 400 62 | 63 | file = request.files['file'] 64 | 65 | # Step 2: Basic file extension validation 66 | if file and allowed_file(file.filename): 67 | filename = secure_filename(file.filename) 68 | 69 | # Step 3: Save the file 70 | # Note, in production, this would require careful 71 | # validation, management and clean up.
72 | file.save(os.path.join(UPLOAD_FOLDER, filename)) 73 | 74 | _logger.debug(f'Inputs: {filename}') 75 | 76 | # Step 4: perform prediction 77 | result = make_single_prediction( 78 | image_name=filename, 79 | image_directory=UPLOAD_FOLDER) 80 | 81 | _logger.debug(f'Outputs: {result}') 82 | 83 | readable_predictions = result.get('readable_predictions') 84 | version = result.get('version') 85 | 86 | # Step 5: Return the response as JSON 87 | return jsonify( 88 | {'readable_predictions': readable_predictions[0], 89 | 'version': version}) 90 | -------------------------------------------------------------------------------- /packages/ml_api/diff_test_requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url=${PIP_EXTRA_INDEX_URL} 2 | 3 | # api 4 | flask>=1.1.1,<1.2.0 5 | 6 | # schema validation 7 | marshmallow==2.17.0 8 | 9 | # Set this to the previous model version 10 | regression-model==2.0.19 11 | 12 | # temporarily necessary as we update sklearn 13 | joblib>=0.14.1,<0.15.0 -------------------------------------------------------------------------------- /packages/ml_api/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url=${PIP_EXTRA_INDEX_URL} 2 | 3 | # api 4 | flask>=1.1.1,<1.2.0 5 | 6 | # schema validation 7 | marshmallow==2.17.0 8 | 9 | # Install from gemfury 10 | regression-model==2.0.20 11 | neural_network_model==0.1.1 12 | 13 | # Deployment 14 | gunicorn==19.9.0 -------------------------------------------------------------------------------- /packages/ml_api/run.py: -------------------------------------------------------------------------------- 1 | from api.app import create_app 2 | from api.config import DevelopmentConfig, ProductionConfig 3 | 4 | 5 | application = create_app( 6 | config_object=ProductionConfig) 7 | 8 | 9 | if __name__ == '__main__': 10 | application.run() 11 | -------------------------------------------------------------------------------- /packages/ml_api/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export IS_DEBUG=${DEBUG:-false} 3 | exec gunicorn --bind 0.0.0.0:5000 --access-logfile - --error-logfile - run:application -------------------------------------------------------------------------------- /packages/ml_api/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/ml_api/tests/__init__.py -------------------------------------------------------------------------------- /packages/ml_api/tests/capture_model_predictions.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script should only be run in CI. 3 | Never run it locally or you will disrupt the 4 | differential test versioning logic. 
5 | """ 6 | 7 | import pandas as pd 8 | 9 | from regression_model.predict import make_prediction 10 | from regression_model.processing.data_management import load_dataset 11 | 12 | from api import config 13 | 14 | 15 | def capture_predictions() -> None: 16 | """Save the test data predictions to a CSV.""" 17 | 18 | save_file = 'test_data_predictions.csv' 19 | test_data = load_dataset(file_name='test.csv') 20 | 21 | # we take a slice with no input validation issues 22 | multiple_test_input = test_data[99:600] 23 | 24 | predictions = make_prediction(input_data=multiple_test_input) 25 | 26 | # save predictions for the test dataset 27 | predictions_df = pd.DataFrame(predictions) 28 | 29 | # hack here to save the file to the regression model 30 | # package of the repo, not the installed package 31 | predictions_df.to_csv(f'{config.PACKAGE_ROOT}/{save_file}') 32 | 33 | 34 | if __name__ == '__main__': 35 | capture_predictions() 36 | -------------------------------------------------------------------------------- /packages/ml_api/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from api.app import create_app 4 | from api.config import TestingConfig 5 | 6 | 7 | @pytest.fixture 8 | def app(): 9 | app = create_app(config_object=TestingConfig) 10 | 11 | with app.app_context(): 12 | yield app 13 | 14 | 15 | @pytest.fixture 16 | def flask_test_client(app): 17 | with app.test_client() as test_client: 18 | yield test_client 19 | -------------------------------------------------------------------------------- /packages/ml_api/tests/differential_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/ml_api/tests/differential_tests/__init__.py -------------------------------------------------------------------------------- /packages/ml_api/tests/differential_tests/test_differential.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from regression_model.config import config as model_config 4 | from regression_model.predict import make_prediction 5 | from regression_model.processing.data_management import load_dataset 6 | import pandas as pd 7 | import pytest 8 | 9 | 10 | from api import config 11 | 12 | 13 | @pytest.mark.differential 14 | def test_model_prediction_differential( 15 | *, 16 | save_file: str = 'test_data_predictions.csv'): 17 | """ 18 | This test compares the prediction result similarity of 19 | the current model with the previous model's results. 20 | """ 21 | 22 | # Given 23 | # Load the saved previous model predictions 24 | previous_model_df = pd.read_csv(f'{config.PACKAGE_ROOT}/{save_file}') 25 | previous_model_predictions = previous_model_df.predictions.values 26 | 27 | test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE) 28 | multiple_test_input = test_data[99:600] 29 | 30 | # When 31 | current_result = make_prediction(input_data=multiple_test_input) 32 | current_model_predictions = current_result.get('predictions') 33 | 34 | # Then 35 | # diff the current model vs. the old model 36 | assert len(previous_model_predictions) == len( 37 | current_model_predictions) 38 | 39 | # Perform the differential test 40 | for previous_value, current_value in zip( 41 | previous_model_predictions, current_model_predictions): 42 | 43 | # convert numpy float64 to Python float. 
44 | previous_value = previous_value.item() 45 | current_value = current_value.item() 46 | 47 | # rel_tol is the relative tolerance – it is the maximum allowed 48 | # difference between a and b, relative to the larger absolute 49 | # value of a or b. For example, to set a tolerance of 5%, pass 50 | # rel_tol=0.05. 51 | assert math.isclose(previous_value, 52 | current_value, 53 | rel_tol=model_config.ACCEPTABLE_MODEL_DIFFERENCE) 54 | -------------------------------------------------------------------------------- /packages/ml_api/tests/test_controller.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | import math 4 | import os 5 | 6 | from neural_network_model.config import config as ccn_config 7 | from regression_model import __version__ as _version 8 | from regression_model.config import config as model_config 9 | from regression_model.processing.data_management import load_dataset 10 | 11 | from api import __version__ as api_version 12 | 13 | 14 | def test_health_endpoint_returns_200(flask_test_client): 15 | # When 16 | response = flask_test_client.get('/health') 17 | 18 | # Then 19 | assert response.status_code == 200 20 | 21 | 22 | def test_version_endpoint_returns_version(flask_test_client): 23 | # When 24 | response = flask_test_client.get('/version') 25 | 26 | # Then 27 | assert response.status_code == 200 28 | response_json = json.loads(response.data) 29 | assert response_json['model_version'] == _version 30 | assert response_json['api_version'] == api_version 31 | 32 | 33 | def test_prediction_endpoint_returns_prediction(flask_test_client): 34 | # Given 35 | # Load the test data from the regression_model package 36 | # This is important as it makes it harder for the test 37 | # data versions to get confused by not spreading it 38 | # across packages. 39 | test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE) 40 | post_json = test_data[0:1].to_json(orient='records') 41 | 42 | # When 43 | response = flask_test_client.post('/v1/predict/regression', 44 | json=json.loads(post_json)) 45 | 46 | # Then 47 | assert response.status_code == 200 48 | response_json = json.loads(response.data) 49 | prediction = response_json['predictions'] 50 | response_version = response_json['version'] 51 | assert math.ceil(prediction[0]) == 112476 52 | assert response_version == _version 53 | 54 | 55 | def test_classifier_endpoint_returns_prediction(flask_test_client): 56 | # Given 57 | # Load the test data from the neural_network_model package 58 | # This is important as it makes it harder for the test 59 | # data versions to get confused by not spreading it 60 | # across packages. 
61 | data_dir = os.path.abspath(os.path.join(ccn_config.DATA_FOLDER, os.pardir)) 62 | test_dir = os.path.join(data_dir, 'test_data') 63 | black_grass_dir = os.path.join(test_dir, 'Black-grass') 64 | black_grass_image = os.path.join(black_grass_dir, '1.png') 65 | with open(black_grass_image, "rb") as image_file: 66 | file_bytes = image_file.read() 67 | data = dict( 68 | file=(io.BytesIO(bytearray(file_bytes)), "1.png"), 69 | ) 70 | 71 | # When 72 | response = flask_test_client.post('/predict/classifier', 73 | content_type='multipart/form-data', 74 | data=data) 75 | 76 | # Then 77 | assert response.status_code == 200 78 | response_json = json.loads(response.data) 79 | assert response_json['readable_predictions'] 80 | -------------------------------------------------------------------------------- /packages/ml_api/tests/test_validation.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from regression_model.config import config 4 | from regression_model.processing.data_management import load_dataset 5 | 6 | 7 | def test_prediction_endpoint_validation_200(flask_test_client): 8 | # Given 9 | # Load the test data from the regression_model package. 10 | # This is important as it makes it harder for the test 11 | # data versions to get confused by not spreading it 12 | # across packages. 13 | test_data = load_dataset(file_name=config.TESTING_DATA_FILE) 14 | post_json = test_data.to_json(orient='records') 15 | 16 | # When 17 | response = flask_test_client.post('/v1/predict/regression', 18 | json=json.loads(post_json)) 19 | 20 | # Then 21 | assert response.status_code == 200 22 | response_json = json.loads(response.data) 23 | 24 | # Check correct number of errors removed 25 | assert len(response_json.get('predictions')) + len( 26 | response_json.get('errors')) == len(test_data) 27 | -------------------------------------------------------------------------------- /packages/ml_api/tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36, py37, py38 3 | skipsdist = True 4 | 5 | 6 | [testenv] 7 | install_command = pip install --pre {opts} {packages} 8 | deps = 9 | -rrequirements.txt 10 | 11 | passenv = 12 | PIP_EXTRA_INDEX_URL 13 | KERAS_BACKEND 14 | 15 | setenv = 16 | PYTHONPATH=. 17 | 18 | commands = 19 | pytest \ 20 | -s \ 21 | -v \ 22 | -m "not differential" \ 23 | {posargs:tests} 24 | 25 | 26 | # content of pytest.ini 27 | [pytest] 28 | markers = 29 | integration: mark a test as an integration test. 30 | differential: mark a test as a differential test. 
31 | filterwarnings = 32 | ignore::DeprecationWarning -------------------------------------------------------------------------------- /packages/neural_network_model/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | include *.cfg 4 | include *.pkl 5 | recursive-include ./neural_network_model/*.py 6 | 7 | include neural_network_model/trained_models/*.pkl 8 | include neural_network_model/trained_models/*.h5 9 | include neural_network_model/VERSION 10 | include neural_network_model/datasets/test_data/Black-grass/1.png 11 | include neural_network_model/datasets/test_data/Charlock/1.png 12 | 13 | include ./requirements.txt 14 | exclude *.log 15 | 16 | recursive-exclude * __pycache__ 17 | recursive-exclude * *.py[co] -------------------------------------------------------------------------------- /packages/neural_network_model/config.yml: -------------------------------------------------------------------------------- 1 | MODEL_NAME: ${MODEL_NAME:cnn_model} 2 | PIPELINE_NAME: ${PIPELINE_NAME:cnn_pipe} 3 | CLASSES_PATH: ${CLASSES_PATH:False} 4 | IMAGE_SIZE: ${IMAGE_SIZE:150} 5 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/VERSION: -------------------------------------------------------------------------------- 1 | 0.1.0 -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from neural_network_model.config import config 4 | 5 | 6 | with open(os.path.join(config.PACKAGE_ROOT, 'VERSION')) as version_file: 7 | __version__ = version_file.read().strip() 8 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/neural_network_model/neural_network_model/config/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/config/config.py: -------------------------------------------------------------------------------- 1 | # The Keras model loading function does not play well with 2 | # Pathlib at the moment, so we are using the old os module 3 | # style 4 | 5 | import os 6 | 7 | PWD = os.path.dirname(os.path.abspath(__file__)) 8 | PACKAGE_ROOT = os.path.abspath(os.path.join(PWD, '..')) 9 | DATASET_DIR = os.path.join(PACKAGE_ROOT, 'datasets') 10 | TRAINED_MODEL_DIR = os.path.join(PACKAGE_ROOT, 'trained_models') 11 | DATA_FOLDER = os.path.join(DATASET_DIR, 'v2-plant-seedlings-dataset') 12 | 13 | # MODEL PERSISTING 14 | MODEL_NAME = 'cnn_model' 15 | PIPELINE_NAME = 'cnn_pipe' 16 | CLASSES_NAME = 'classes' 17 | ENCODER_NAME = 'encoder' 18 | 19 | # MODEL FITTING 20 | IMAGE_SIZE = 150 # 50 for testing, 150 for final model 21 | BATCH_SIZE = 10 22 | EPOCHS = int(os.environ.get('EPOCHS', 1)) # 1 for testing, 10 for final model 23 | 24 | 25 | with open(os.path.join(PACKAGE_ROOT, 'VERSION')) as version_file: 26 | _version = version_file.read().strip() 27 | 28 | MODEL_FILE_NAME = f'{MODEL_NAME}_{_version}.h5' 29 | MODEL_PATH = os.path.join(TRAINED_MODEL_DIR, 
MODEL_FILE_NAME) 30 | 31 | PIPELINE_FILE_NAME = f'{PIPELINE_NAME}_{_version}.pkl' 32 | PIPELINE_PATH = os.path.join(TRAINED_MODEL_DIR, PIPELINE_FILE_NAME) 33 | 34 | CLASSES_FILE_NAME = f'{CLASSES_NAME}_{_version}.pkl' 35 | CLASSES_PATH = os.path.join(TRAINED_MODEL_DIR, CLASSES_FILE_NAME) 36 | 37 | ENCODER_FILE_NAME = f'{ENCODER_NAME}_{_version}.pkl' 38 | ENCODER_PATH = os.path.join(TRAINED_MODEL_DIR, ENCODER_FILE_NAME) 39 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/neural_network_model/neural_network_model/datasets/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/datasets/test_data/Black-grass/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/neural_network_model/neural_network_model/datasets/test_data/Black-grass/1.png -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/datasets/test_data/Charlock/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/neural_network_model/neural_network_model/datasets/test_data/Charlock/1.png -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/datasets/test_data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/neural_network_model/neural_network_model/datasets/test_data/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/model.py: -------------------------------------------------------------------------------- 1 | # for the convolutional network 2 | from keras.models import Sequential 3 | from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten 4 | from keras.optimizers import Adam 5 | from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint 6 | from keras.wrappers.scikit_learn import KerasClassifier 7 | 8 | from neural_network_model.config import config 9 | 10 | 11 | def cnn_model(kernel_size=(3, 3), 12 | pool_size=(2, 2), 13 | first_filters=32, 14 | second_filters=64, 15 | third_filters=128, 16 | dropout_conv=0.3, 17 | dropout_dense=0.3, 18 | image_size=50): 19 | 20 | model = Sequential() 21 | model.add(Conv2D( 22 | first_filters, 23 | kernel_size, 24 | activation='relu', 25 | input_shape=(image_size, image_size, 3))) 26 | model.add(Conv2D(first_filters, kernel_size, activation = 'relu')) 27 | model.add(MaxPooling2D(pool_size=pool_size)) 28 | model.add(Dropout(dropout_conv)) 29 | 30 | model.add(Conv2D(second_filters, kernel_size, activation='relu')) 31 | model.add(Conv2D(second_filters, kernel_size, activation ='relu')) 32 | model.add(MaxPooling2D(pool_size=pool_size)) 33 | 
model.add(Dropout(dropout_conv)) 34 | 35 | model.add(Conv2D(third_filters, kernel_size, activation='relu')) 36 | model.add(Conv2D(third_filters, kernel_size, activation ='relu')) 37 | model.add(MaxPooling2D(pool_size=pool_size)) 38 | model.add(Dropout(dropout_conv)) 39 | 40 | model.add(Flatten()) 41 | model.add(Dense(256, activation="relu")) 42 | model.add(Dropout(dropout_dense)) 43 | model.add(Dense(12, activation="softmax")) 44 | 45 | model.compile(Adam(lr=0.0001), 46 | loss='binary_crossentropy', 47 | metrics=['accuracy']) 48 | 49 | return model 50 | 51 | 52 | checkpoint = ModelCheckpoint(config.MODEL_PATH, 53 | monitor='acc', 54 | verbose=1, 55 | save_best_only=True, 56 | mode='max') 57 | 58 | reduce_lr = ReduceLROnPlateau(monitor='acc', 59 | factor=0.5, 60 | patience=2, 61 | verbose=1, 62 | mode='max', 63 | min_lr=0.00001) 64 | 65 | callbacks_list = [checkpoint, reduce_lr] 66 | 67 | cnn_clf = KerasClassifier(build_fn=cnn_model, 68 | batch_size=config.BATCH_SIZE, 69 | validation_split=10, 70 | epochs=config.EPOCHS, 71 | verbose=1, # progress bar - required for CI job 72 | callbacks=callbacks_list, 73 | image_size=config.IMAGE_SIZE 74 | ) 75 | 76 | 77 | if __name__ == '__main__': 78 | model = cnn_model() 79 | model.summary() 80 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/pipeline.py: -------------------------------------------------------------------------------- 1 | from sklearn.pipeline import Pipeline 2 | 3 | from neural_network_model.config import config 4 | from neural_network_model.processing import preprocessors as pp 5 | from neural_network_model import model 6 | 7 | 8 | pipe = Pipeline([ 9 | ('dataset', pp.CreateDataset(config.IMAGE_SIZE)), 10 | ('cnn_model', model.cnn_clf)]) 11 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/predict.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | 5 | from neural_network_model import __version__ as _version 6 | from neural_network_model.processing import data_management as dm 7 | 8 | _logger = logging.getLogger(__name__) 9 | KERAS_PIPELINE = dm.load_pipeline_keras() 10 | ENCODER = dm.load_encoder() 11 | 12 | 13 | def make_single_prediction(*, image_name: str, image_directory: str): 14 | """Make a single prediction using the saved model pipeline. 15 | 16 | Args: 17 | image_name: Filename of the image to classify 18 | image_directory: Location of the image to classify 19 | 20 | Returns 21 | Dictionary with both raw predictions and readable values. 22 | """ 23 | 24 | image_df = dm.load_single_image( 25 | data_folder=image_directory, 26 | filename=image_name) 27 | 28 | prepared_df = image_df['image'].reset_index(drop=True) 29 | _logger.info(f'received input array: {prepared_df}, ' 30 | f'filename: {image_name}') 31 | 32 | predictions = KERAS_PIPELINE.predict(prepared_df) 33 | readable_predictions = ENCODER.encoder.inverse_transform(predictions) 34 | 35 | _logger.info(f'Made prediction: {predictions}' 36 | f' with model version: {_version}') 37 | 38 | return dict(predictions=predictions, 39 | readable_predictions=readable_predictions, 40 | version=_version) 41 | 42 | 43 | def make_bulk_prediction(*, images_df: pd.Series) -> dict: 44 | """Make multiple predictions using the saved model pipeline. 
45 | 46 | Currently, this function is primarily for testing purposes, 47 | allowing us to pass in a directory of images for running 48 | bulk predictions. 49 | 50 | Args: 51 | images_df: Pandas series of images 52 | 53 | Returns 54 | Dictionary with both raw predictions and their classifications. 55 | """ 56 | 57 | _logger.info(f'received input df: {images_df}') 58 | 59 | predictions = KERAS_PIPELINE.predict(images_df) 60 | readable_predictions = ENCODER.encoder.inverse_transform(predictions) 61 | 62 | _logger.info(f'Made predictions: {predictions}' 63 | f' with model version: {_version}') 64 | 65 | return dict(predictions=predictions, 66 | readable_predictions=readable_predictions, 67 | version=_version) 68 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/neural_network_model/neural_network_model/processing/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/processing/errors.py: -------------------------------------------------------------------------------- 1 | class BaseError(Exception): 2 | """Base package error.""" 3 | 4 | 5 | class InvalidModelInputError(BaseError): 6 | """Model input contains an error.""" 7 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/processing/preprocessors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | from keras.utils import np_utils 4 | from sklearn.preprocessing import LabelEncoder 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | 7 | 8 | class TargetEncoder(BaseEstimator, TransformerMixin): 9 | 10 | def __init__(self, encoder=LabelEncoder()): 11 | self.encoder = encoder 12 | 13 | def fit(self, X, y=None): 14 | # note that x is the target in this case 15 | self.encoder.fit(X) 16 | return self 17 | 18 | def transform(self, X): 19 | X = X.copy() 20 | X = np_utils.to_categorical(self.encoder.transform(X)) 21 | return X 22 | 23 | 24 | def _im_resize(df, n, image_size): 25 | im = cv2.imread(df[n]) 26 | im = cv2.resize(im, (image_size, image_size)) 27 | return im 28 | 29 | 30 | class CreateDataset(BaseEstimator, TransformerMixin): 31 | 32 | def __init__(self, image_size=50): 33 | self.image_size = image_size 34 | 35 | def fit(self, X, y=None): 36 | return self 37 | 38 | def transform(self, X): 39 | X = X.copy() 40 | tmp = np.zeros((len(X), 41 | self.image_size, 42 | self.image_size, 3), dtype='float32') 43 | 44 | for n in range(0, len(X)): 45 | im = _im_resize(X, n, self.image_size) 46 | tmp[n] = im 47 | 48 | print('Dataset Images shape: {} size: {:,}'.format( 49 | tmp.shape, tmp.size)) 50 | return tmp 51 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/train_pipeline.py: -------------------------------------------------------------------------------- 1 | from sklearn.externals import joblib 2 | 3 | from neural_network_model import pipeline as pipe 4 | from neural_network_model.config import config 5 | from neural_network_model.processing import data_management as dm 6 | from 
neural_network_model.processing import preprocessors as pp 7 | 8 | 9 | def run_training(save_result: bool = True): 10 | """Train a Convolutional Neural Network.""" 11 | 12 | images_df = dm.load_image_paths(config.DATA_FOLDER) 13 | X_train, X_test, y_train, y_test = dm.get_train_test_target(images_df) 14 | 15 | enc = pp.TargetEncoder() 16 | enc.fit(y_train) 17 | y_train = enc.transform(y_train) 18 | 19 | pipe.pipe.fit(X_train, y_train) 20 | 21 | if save_result: 22 | joblib.dump(enc, config.ENCODER_PATH) 23 | dm.save_pipeline_keras(pipe.pipe) 24 | 25 | 26 | if __name__ == '__main__': 27 | run_training(save_result=True) 28 | -------------------------------------------------------------------------------- /packages/neural_network_model/neural_network_model/trained_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/neural_network_model/neural_network_model/trained_models/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/requirements.txt: -------------------------------------------------------------------------------- 1 | # production requirements 2 | pandas==0.23.4 3 | numpy==1.13.3 4 | scikit-learn==0.19.0 5 | Keras==2.1.3 6 | opencv-python==4.0.0.21 7 | h5py==2.9.0 8 | Theano==0.9.0 9 | 10 | # packaging 11 | setuptools==40.6.3 12 | wheel==0.32.3 13 | 14 | # testing requirements 15 | pytest==4.0.2 16 | 17 | # fetching datasets 18 | kaggle==1.5.1.1 -------------------------------------------------------------------------------- /packages/neural_network_model/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import io 5 | import os 6 | from pathlib import Path 7 | 8 | from setuptools import find_packages, setup 9 | 10 | 11 | # Package meta-data. 12 | NAME = 'neural_network_model' 13 | DESCRIPTION = 'Train and deploy neural network model.' 14 | URL = 'your github project' 15 | EMAIL = 'your_email@email.com' 16 | AUTHOR = 'Your name' 17 | REQUIRES_PYTHON = '>=3.6.0' 18 | 19 | 20 | # What packages are required for this module to be executed? 21 | def list_reqs(fname='requirements.txt'): 22 | with open(fname) as fd: 23 | return fd.read().splitlines() 24 | 25 | 26 | # The rest you shouldn't have to touch too much :) 27 | # ------------------------------------------------ 28 | # Except, perhaps the License and Trove Classifiers! 29 | # If you do change the License, remember to change the 30 | # Trove Classifier for that! 31 | 32 | here = os.path.abspath(os.path.dirname(__file__)) 33 | 34 | # Import the README and use it as the long-description. 35 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 36 | try: 37 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: 38 | long_description = '\n' + f.read() 39 | except FileNotFoundError: 40 | long_description = DESCRIPTION 41 | 42 | 43 | # Load the package's __version__.py module as a dictionary. 
44 | ROOT_DIR = Path(__file__).resolve().parent 45 | PACKAGE_DIR = ROOT_DIR / NAME 46 | about = {} 47 | with open(PACKAGE_DIR / 'VERSION') as f: 48 | _version = f.read().strip() 49 | about['__version__'] = _version 50 | 51 | 52 | # Where the magic happens: 53 | setup( 54 | name=NAME, 55 | version=about['__version__'], 56 | description=DESCRIPTION, 57 | long_description=long_description, 58 | long_description_content_type='text/markdown', 59 | author=AUTHOR, 60 | author_email=EMAIL, 61 | python_requires=REQUIRES_PYTHON, 62 | url=URL, 63 | packages=find_packages(exclude=('tests',)), 64 | package_data={'neural_network_model': ['VERSION']}, 65 | install_requires=list_reqs(), 66 | extras_require={}, 67 | include_package_data=True, 68 | license='MIT', 69 | classifiers=[ 70 | # Trove classifiers 71 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 72 | 'License :: OSI Approved :: MIT License', 73 | 'Programming Language :: Python', 74 | 'Programming Language :: Python :: 3', 75 | 'Programming Language :: Python :: 3.6', 76 | 'Programming Language :: Python :: Implementation :: CPython', 77 | 'Programming Language :: Python :: Implementation :: PyPy' 78 | ], 79 | ) 80 | -------------------------------------------------------------------------------- /packages/neural_network_model/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/neural_network_model/tests/__init__.py -------------------------------------------------------------------------------- /packages/neural_network_model/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | 4 | from neural_network_model.config import config 5 | 6 | 7 | @pytest.fixture 8 | def black_grass_dir(): 9 | test_data_dir = os.path.join(config.DATASET_DIR, 'test_data') 10 | black_grass_dir = os.path.join(test_data_dir, 'Black-grass') 11 | 12 | return black_grass_dir 13 | 14 | 15 | @pytest.fixture 16 | def charlock_dir(): 17 | test_data_dir = os.path.join(config.DATASET_DIR, 'test_data') 18 | charlock_dir = os.path.join(test_data_dir, 'Charlock') 19 | 20 | return charlock_dir 21 | -------------------------------------------------------------------------------- /packages/neural_network_model/tests/test_predict.py: -------------------------------------------------------------------------------- 1 | from neural_network_model import __version__ as _version 2 | from neural_network_model.predict import (make_single_prediction) 3 | 4 | 5 | def test_make_prediction_on_sample(charlock_dir): 6 | # Given 7 | filename = '1.png' 8 | expected_classification = 'Charlock' 9 | 10 | # When 11 | results = make_single_prediction(image_directory=charlock_dir, 12 | image_name=filename) 13 | 14 | # Then 15 | assert results['predictions'] is not None 16 | assert results['readable_predictions'][0] == expected_classification 17 | assert results['version'] == _version 18 | -------------------------------------------------------------------------------- /packages/regression_model/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | include *.cfg 4 | include *.pkl 5 | recursive-include ./regression_model/* 6 | 7 | include regression_model/datasets/train.csv 8 | include regression_model/datasets/test.csv 9 | include 
regression_model/trained_models/*.pkl 10 | include regression_model/VERSION 11 | 12 | include ./requirements.txt 13 | exclude *.log 14 | 15 | recursive-exclude * __pycache__ 16 | recursive-exclude * *.py[co] -------------------------------------------------------------------------------- /packages/regression_model/regression_model/VERSION: -------------------------------------------------------------------------------- 1 | 2.0.20 -------------------------------------------------------------------------------- /packages/regression_model/regression_model/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from regression_model.config import config 4 | from regression_model.config import logging_config 5 | 6 | 7 | VERSION_PATH = config.PACKAGE_ROOT / 'VERSION' 8 | 9 | # Configure logger for use in package 10 | logger = logging.getLogger(__name__) 11 | logger.setLevel(logging.DEBUG) 12 | logger.addHandler(logging_config.get_console_handler()) 13 | logger.propagate = False 14 | 15 | 16 | with open(VERSION_PATH, 'r') as version_file: 17 | __version__ = version_file.read().strip() 18 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/regression_model/regression_model/config/__init__.py -------------------------------------------------------------------------------- /packages/regression_model/regression_model/config/config.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import regression_model 4 | 5 | import pandas as pd 6 | 7 | 8 | pd.options.display.max_rows = 10 9 | pd.options.display.max_columns = 10 10 | 11 | 12 | PACKAGE_ROOT = pathlib.Path(regression_model.__file__).resolve().parent 13 | TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models" 14 | DATASET_DIR = PACKAGE_ROOT / "datasets" 15 | 16 | # data 17 | TESTING_DATA_FILE = "test.csv" 18 | TRAINING_DATA_FILE = "train.csv" 19 | TARGET = "SalePrice" 20 | 21 | 22 | # variables 23 | FEATURES = [ 24 | "MSSubClass", 25 | "MSZoning", 26 | "Neighborhood", 27 | "OverallQual", 28 | "OverallCond", 29 | "YearRemodAdd", 30 | "RoofStyle", 31 | "MasVnrType", 32 | "BsmtQual", 33 | "BsmtExposure", 34 | "HeatingQC", 35 | "CentralAir", 36 | "1stFlrSF", 37 | "GrLivArea", 38 | "BsmtFullBath", 39 | "KitchenQual", 40 | "Fireplaces", 41 | "FireplaceQu", 42 | "GarageType", 43 | "GarageFinish", 44 | "GarageCars", 45 | "PavedDrive", 46 | "LotFrontage", 47 | # this one is only to calculate temporal variable: 48 | "YrSold", 49 | ] 50 | 51 | # this variable is to calculate the temporal variable, 52 | # can be dropped afterwards 53 | DROP_FEATURES = "YrSold" 54 | 55 | # numerical variables with NA in train set 56 | NUMERICAL_VARS_WITH_NA = ["LotFrontage"] 57 | 58 | # categorical variables with NA in train set 59 | CATEGORICAL_VARS_WITH_NA = [ 60 | "MasVnrType", 61 | "BsmtQual", 62 | "BsmtExposure", 63 | "FireplaceQu", 64 | "GarageType", 65 | "GarageFinish", 66 | ] 67 | 68 | TEMPORAL_VARS = "YearRemodAdd" 69 | 70 | # variables to log transform 71 | NUMERICALS_LOG_VARS = ["LotFrontage", "1stFlrSF", "GrLivArea"] 72 | 73 | # categorical variables to encode 74 | CATEGORICAL_VARS = [ 75 | "MSZoning", 76 | "Neighborhood", 77 | "RoofStyle", 78 | 
"MasVnrType", 79 | "BsmtQual", 80 | "BsmtExposure", 81 | "HeatingQC", 82 | "CentralAir", 83 | "KitchenQual", 84 | "FireplaceQu", 85 | "GarageType", 86 | "GarageFinish", 87 | "PavedDrive", 88 | ] 89 | 90 | NUMERICAL_NA_NOT_ALLOWED = [ 91 | feature 92 | for feature in FEATURES 93 | if feature not in CATEGORICAL_VARS + NUMERICAL_VARS_WITH_NA 94 | ] 95 | 96 | CATEGORICAL_NA_NOT_ALLOWED = [ 97 | feature for feature in CATEGORICAL_VARS if feature not in CATEGORICAL_VARS_WITH_NA 98 | ] 99 | 100 | 101 | PIPELINE_NAME = "lasso_regression" 102 | PIPELINE_SAVE_FILE = f"{PIPELINE_NAME}_output_v" 103 | 104 | # used for differential testing 105 | ACCEPTABLE_MODEL_DIFFERENCE = 0.05 106 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/config/logging_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | 5 | # Multiple calls to logging.getLogger('someLogger') return a 6 | # reference to the same logger object. This is true not only 7 | # within the same module, but also across modules as long as 8 | # it is in the same Python interpreter process. 9 | 10 | FORMATTER = logging.Formatter( 11 | "%(asctime)s — %(name)s — %(levelname)s —" "%(funcName)s:%(lineno)d — %(message)s" 12 | ) 13 | 14 | 15 | def get_console_handler(): 16 | console_handler = logging.StreamHandler(sys.stdout) 17 | console_handler.setFormatter(FORMATTER) 18 | return console_handler 19 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/regression_model/regression_model/datasets/__init__.py -------------------------------------------------------------------------------- /packages/regression_model/regression_model/pipeline.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import Lasso 2 | from sklearn.pipeline import Pipeline 3 | from sklearn.preprocessing import MinMaxScaler 4 | 5 | from regression_model.processing import preprocessors as pp 6 | from regression_model.processing import features 7 | from regression_model.config import config 8 | 9 | import logging 10 | 11 | 12 | _logger = logging.getLogger(__name__) 13 | 14 | 15 | price_pipe = Pipeline( 16 | [ 17 | ( 18 | "categorical_imputer", 19 | pp.CategoricalImputer(variables=config.CATEGORICAL_VARS_WITH_NA), 20 | ), 21 | ( 22 | "numerical_inputer", 23 | pp.NumericalImputer(variables=config.NUMERICAL_VARS_WITH_NA), 24 | ), 25 | ( 26 | "temporal_variable", 27 | pp.TemporalVariableEstimator( 28 | variables=config.TEMPORAL_VARS, reference_variable=config.DROP_FEATURES 29 | ), 30 | ), 31 | ( 32 | "rare_label_encoder", 33 | pp.RareLabelCategoricalEncoder(tol=0.01, variables=config.CATEGORICAL_VARS), 34 | ), 35 | ( 36 | "categorical_encoder", 37 | pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS), 38 | ), 39 | ( 40 | "log_transformer", 41 | features.LogTransformer(variables=config.NUMERICALS_LOG_VARS), 42 | ), 43 | ( 44 | "drop_features", 45 | pp.DropUnecessaryFeatures(variables_to_drop=config.DROP_FEATURES), 46 | ), 47 | ("scaler", MinMaxScaler()), 48 | ("Linear_model", Lasso(alpha=0.005, random_state=0)), 49 | ] 50 | ) 51 | 
-------------------------------------------------------------------------------- /packages/regression_model/regression_model/predict.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from regression_model.processing.data_management import load_pipeline 5 | from regression_model.config import config 6 | from regression_model.processing.validation import validate_inputs 7 | from regression_model import __version__ as _version 8 | 9 | import logging 10 | import typing as t 11 | 12 | 13 | _logger = logging.getLogger(__name__) 14 | 15 | pipeline_file_name = f"{config.PIPELINE_SAVE_FILE}{_version}.pkl" 16 | _price_pipe = load_pipeline(file_name=pipeline_file_name) 17 | 18 | 19 | def make_prediction(*, input_data: t.Union[pd.DataFrame, dict], 20 | ) -> dict: 21 | """Make a prediction using a saved model pipeline. 22 | 23 | Args: 24 | input_data: Array of model prediction inputs. 25 | 26 | Returns: 27 | Predictions for each input row, as well as the model version. 28 | """ 29 | 30 | data = pd.DataFrame(input_data) 31 | validated_data = validate_inputs(input_data=data) 32 | 33 | prediction = _price_pipe.predict(validated_data[config.FEATURES]) 34 | 35 | output = np.exp(prediction) 36 | 37 | results = {"predictions": output, "version": _version} 38 | 39 | _logger.info( 40 | f"Making predictions with model version: {_version} " 41 | f"Inputs: {validated_data} " 42 | f"Predictions: {results}" 43 | ) 44 | 45 | return results 46 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/regression_model/regression_model/processing/__init__.py -------------------------------------------------------------------------------- /packages/regression_model/regression_model/processing/data_management.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import joblib 3 | from sklearn.pipeline import Pipeline 4 | 5 | from regression_model.config import config 6 | from regression_model import __version__ as _version 7 | 8 | import logging 9 | import typing as t 10 | 11 | 12 | _logger = logging.getLogger(__name__) 13 | 14 | 15 | def load_dataset(*, file_name: str) -> pd.DataFrame: 16 | _data = pd.read_csv(f"{config.DATASET_DIR}/{file_name}") 17 | return _data 18 | 19 | 20 | def save_pipeline(*, pipeline_to_persist) -> None: 21 | """Persist the pipeline. 22 | Saves the versioned model, and overwrites any previous 23 | saved models. This ensures that when the package is 24 | published, there is only one trained model that can be 25 | called, and we know exactly how it was built. 
26 | """ 27 | 28 | # Prepare versioned save file name 29 | save_file_name = f"{config.PIPELINE_SAVE_FILE}{_version}.pkl" 30 | save_path = config.TRAINED_MODEL_DIR / save_file_name 31 | 32 | remove_old_pipelines(files_to_keep=[save_file_name]) 33 | joblib.dump(pipeline_to_persist, save_path) 34 | _logger.info(f"saved pipeline: {save_file_name}") 35 | 36 | 37 | def load_pipeline(*, file_name: str) -> Pipeline: 38 | """Load a persisted pipeline.""" 39 | 40 | file_path = config.TRAINED_MODEL_DIR / file_name 41 | trained_model = joblib.load(filename=file_path) 42 | return trained_model 43 | 44 | 45 | def remove_old_pipelines(*, files_to_keep: t.List[str]) -> None: 46 | """ 47 | Remove old model pipelines. 48 | 49 | This is to ensure there is a simple one-to-one 50 | mapping between the package version and the model 51 | version to be imported and used by other applications. 52 | However, we do also include the immediate previous 53 | pipeline version for differential testing purposes. 54 | """ 55 | do_not_delete = files_to_keep + ['__init__.py'] 56 | for model_file in config.TRAINED_MODEL_DIR.iterdir(): 57 | if model_file.name not in do_not_delete: 58 | model_file.unlink() 59 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/processing/errors.py: -------------------------------------------------------------------------------- 1 | class BaseError(Exception): 2 | """Base package error.""" 3 | 4 | 5 | class InvalidModelInputError(BaseError): 6 | """Model input contains an error.""" 7 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/processing/features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.base import BaseEstimator, TransformerMixin 3 | 4 | from regression_model.processing.errors import InvalidModelInputError 5 | 6 | 7 | class LogTransformer(BaseEstimator, TransformerMixin): 8 | """Logarithm transformer.""" 9 | 10 | def __init__(self, variables=None): 11 | if not isinstance(variables, list): 12 | self.variables = [variables] 13 | else: 14 | self.variables = variables 15 | 16 | def fit(self, X, y=None): 17 | # to accomodate the pipeline 18 | return self 19 | 20 | def transform(self, X): 21 | X = X.copy() 22 | 23 | # check that the values are non-negative for log transform 24 | if not (X[self.variables] > 0).all().all(): 25 | vars_ = self.variables[(X[self.variables] <= 0).any()] 26 | raise InvalidModelInputError( 27 | f"Variables contain zero or negative values, " 28 | f"can't apply log for vars: {vars_}" 29 | ) 30 | 31 | for feature in self.variables: 32 | X[feature] = np.log(X[feature]) 33 | 34 | return X 35 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/processing/validation.py: -------------------------------------------------------------------------------- 1 | from regression_model.config import config 2 | 3 | import pandas as pd 4 | 5 | 6 | def validate_inputs(input_data: pd.DataFrame) -> pd.DataFrame: 7 | """Check model inputs for unprocessable values.""" 8 | 9 | validated_data = input_data.copy() 10 | 11 | # check for numerical variables with NA not seen during training 12 | if input_data[config.NUMERICAL_NA_NOT_ALLOWED].isnull().any().any(): 13 | validated_data = validated_data.dropna( 14 | axis=0, subset=config.NUMERICAL_NA_NOT_ALLOWED 15 | ) 16 | 17 | # check for 
categorical variables with NA not seen during training 18 | if input_data[config.CATEGORICAL_NA_NOT_ALLOWED].isnull().any().any(): 19 | validated_data = validated_data.dropna( 20 | axis=0, subset=config.CATEGORICAL_NA_NOT_ALLOWED 21 | ) 22 | 23 | # check for values <= 0 for the log transformed variables 24 | if (input_data[config.NUMERICALS_LOG_VARS] <= 0).any().any(): 25 | vars_with_neg_values = config.NUMERICALS_LOG_VARS[ 26 | (input_data[config.NUMERICALS_LOG_VARS] <= 0).any() 27 | ] 28 | validated_data = validated_data[validated_data[vars_with_neg_values] > 0] 29 | 30 | return validated_data 31 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/train_pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import train_test_split 3 | 4 | from regression_model import pipeline 5 | from regression_model.processing.data_management import load_dataset, save_pipeline 6 | from regression_model.config import config 7 | from regression_model import __version__ as _version 8 | 9 | import logging 10 | 11 | 12 | _logger = logging.getLogger(__name__) 13 | 14 | 15 | def run_training() -> None: 16 | """Train the model.""" 17 | 18 | # read training data 19 | data = load_dataset(file_name=config.TRAINING_DATA_FILE) 20 | 21 | # divide train and test 22 | X_train, X_test, y_train, y_test = train_test_split( 23 | data[config.FEATURES], data[config.TARGET], test_size=0.1, random_state=0 24 | ) # we are setting the seed here 25 | 26 | # transform the target 27 | y_train = np.log(y_train) 28 | 29 | pipeline.price_pipe.fit(X_train[config.FEATURES], y_train) 30 | 31 | _logger.info(f"saving model version: {_version}") 32 | save_pipeline(pipeline_to_persist=pipeline.price_pipe) 33 | 34 | 35 | if __name__ == "__main__": 36 | run_training() 37 | -------------------------------------------------------------------------------- /packages/regression_model/regression_model/trained_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/regression_model/regression_model/trained_models/__init__.py -------------------------------------------------------------------------------- /packages/regression_model/requirements.txt: -------------------------------------------------------------------------------- 1 | # We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release) 2 | # to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small 3 | # updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes. 
4 | 5 | # Model Building Requirements 6 | numpy>=1.18.1,<1.19.0 7 | pandas>=0.25.3,<0.26.0 8 | scikit-learn>=0.22.1,<0.23.0 9 | joblib>=0.14.1,<0.15.0 10 | 11 | # testing requirements 12 | pytest>=5.3.2,<6.0.0 13 | 14 | # packaging 15 | setuptools>=41.4.0,<42.0.0 16 | wheel>=0.33.6,<0.34.0 17 | 18 | # fetching datasets 19 | kaggle>=1.5.6,<1.6.0 20 | -------------------------------------------------------------------------------- /packages/regression_model/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import io 5 | import os 6 | from pathlib import Path 7 | 8 | from setuptools import find_packages, setup 9 | 10 | 11 | # Package meta-data. 12 | NAME = 'regression_model' 13 | DESCRIPTION = 'Regression model for using in the Train In Data online course "Deployment of Machine Learning Models".' 14 | URL = 'https://github.com/trainindata/deploying-machine-learning-models' 15 | EMAIL = 'christopher.samiullah@protonmail.com' 16 | AUTHOR = 'ChristopherGS' 17 | REQUIRES_PYTHON = '>=3.6.0' 18 | 19 | 20 | # Packages that are required for this module to be executed 21 | def list_reqs(fname='requirements.txt'): 22 | with open(fname) as fd: 23 | return fd.read().splitlines() 24 | 25 | 26 | # The rest you shouldn't have to touch too much :) 27 | # ------------------------------------------------ 28 | # Except, perhaps the License and Trove Classifiers! 29 | # If you do change the License, remember to change the 30 | # Trove Classifier for that! 31 | 32 | here = os.path.abspath(os.path.dirname(__file__)) 33 | 34 | # Import the README and use it as the long-description. 35 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 36 | try: 37 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: 38 | long_description = '\n' + f.read() 39 | except FileNotFoundError: 40 | long_description = DESCRIPTION 41 | 42 | 43 | # Load the package's __version__.py module as a dictionary. 
44 | ROOT_DIR = Path(__file__).resolve().parent 45 | PACKAGE_DIR = ROOT_DIR / 'regression_model' 46 | about = {} 47 | with open(PACKAGE_DIR / 'VERSION') as f: 48 | _version = f.read().strip() 49 | about['__version__'] = _version 50 | 51 | 52 | # Where the magic happens: 53 | setup( 54 | name=NAME, 55 | version=about['__version__'], 56 | description=DESCRIPTION, 57 | long_description=long_description, 58 | long_description_content_type='text/markdown', 59 | author=AUTHOR, 60 | author_email=EMAIL, 61 | python_requires=REQUIRES_PYTHON, 62 | url=URL, 63 | packages=find_packages(exclude=('tests',)), 64 | package_data={'regression_model': ['VERSION']}, 65 | install_requires=list_reqs(), 66 | extras_require={}, 67 | include_package_data=True, 68 | license='BSD 3', 69 | classifiers=[ 70 | # Trove classifiers 71 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 72 | 'License :: OSI Approved :: MIT License', 73 | 'Programming Language :: Python', 74 | 'Programming Language :: Python :: 3', 75 | 'Programming Language :: Python :: 3.6', 76 | 'Programming Language :: Python :: 3.7', 77 | 'Programming Language :: Python :: 3.8', 78 | 'Programming Language :: Python :: Implementation :: CPython', 79 | 'Programming Language :: Python :: Implementation :: PyPy' 80 | ], 81 | ) 82 | -------------------------------------------------------------------------------- /packages/regression_model/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/packages/regression_model/tests/__init__.py -------------------------------------------------------------------------------- /packages/regression_model/tests/test_predict.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from regression_model.predict import make_prediction 4 | from regression_model.processing.data_management import load_dataset 5 | 6 | 7 | def test_make_single_prediction(): 8 | # Given 9 | test_data = load_dataset(file_name='test.csv') 10 | single_test_input = test_data[0:1] 11 | 12 | # When 13 | subject = make_prediction(input_data=single_test_input) 14 | 15 | # Then 16 | assert subject is not None 17 | assert isinstance(subject.get('predictions')[0], float) 18 | assert math.ceil(subject.get('predictions')[0]) == 112476 19 | 20 | 21 | def test_make_multiple_predictions(): 22 | # Given 23 | test_data = load_dataset(file_name='test.csv') 24 | original_data_length = len(test_data) 25 | multiple_test_input = test_data 26 | 27 | # When 28 | subject = make_prediction(input_data=multiple_test_input) 29 | 30 | # Then 31 | assert subject is not None 32 | assert len(subject.get('predictions')) == 1451 33 | 34 | # We expect some rows to be filtered out 35 | assert len(subject.get('predictions')) != original_data_length 36 | -------------------------------------------------------------------------------- /packages/regression_model/tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36, py37, py38 3 | 4 | 5 | [testenv] 6 | install_command = pip install --pre {opts} {packages} 7 | whitelist_externals = unzip 8 | deps = 9 | -rrequirements.txt 10 | 11 | passenv = 12 | KAGGLE_USERNAME 13 | KAGGLE_KEY 14 | 15 | setenv = 16 | PYTHONPATH=. 
17 | 18 | commands = 19 | kaggle competitions download -c house-prices-advanced-regression-techniques -p regression_model/datasets/ 20 | unzip -o regression_model/datasets/house-prices-advanced-regression-techniques.zip -d regression_model/datasets 21 | python regression_model/train_pipeline.py 22 | pytest \ 23 | -s \ 24 | -v \ 25 | {posargs:tests} 26 | -------------------------------------------------------------------------------- /scripts/fetch_kaggle_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | kaggle competitions download -c house-prices-advanced-regression-techniques -p packages/regression_model/regression_model/datasets/ -------------------------------------------------------------------------------- /scripts/fetch_kaggle_large_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | TRAINING_DATA_URL="vbookshelf/v2-plant-seedlings-dataset" 4 | NOW=$(date) 5 | 6 | kaggle datasets download -d $TRAINING_DATA_URL -p packages/neural_network_model/neural_network_model/datasets/ && \ 7 | unzip packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset.zip -d packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset && \ 8 | echo $TRAINING_DATA_URL 'retrieved on:' $NOW > packages/neural_network_model/neural_network_model/datasets/training_data_reference.txt && \ 9 | mkdir -p "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherds Purse" && \ 10 | mv -v "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherd’s Purse/"* "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherds Purse" 11 | rm -rf "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherd’s Purse" -------------------------------------------------------------------------------- /scripts/input_test.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "Id": 1461, 3 | "MSSubClass": 20, 4 | "MSZoning": "RH", 5 | "LotFrontage": 80.0, 6 | "LotArea": 11622, 7 | "Street": "Pave", 8 | "Alley": null, 9 | "LotShape": "Reg", 10 | "LandContour": "Lvl", 11 | "Utilities": "AllPub", 12 | "LotConfig": "Inside", 13 | "LandSlope": "Gtl", 14 | "Neighborhood": "NAmes", 15 | "Condition1": "Feedr", 16 | "Condition2": "Norm", 17 | "BldgType": "1Fam", 18 | "HouseStyle": "1Story", 19 | "OverallQual": 5, 20 | "OverallCond": 6, 21 | "YearBuilt": 1961, 22 | "YearRemodAdd": 1961, 23 | "RoofStyle": "Gable", 24 | "RoofMatl": "CompShg", 25 | "Exterior1st": "VinylSd", 26 | "Exterior2nd": "VinylSd", 27 | "MasVnrType": "None", 28 | "MasVnrArea": 0.0, 29 | "ExterQual": "TA", 30 | "ExterCond": "TA", 31 | "Foundation": "CBlock", 32 | "BsmtQual": "TA", 33 | "BsmtCond": "TA", 34 | "BsmtExposure": "No", 35 | "BsmtFinType1": "Rec", 36 | "BsmtFinSF1": 468.0, 37 | "BsmtFinType2": "LwQ", 38 | "BsmtFinSF2": 144.0, 39 | "BsmtUnfSF": 270.0, 40 | "TotalBsmtSF": 882.0, 41 | "Heating": "GasA", 42 | "HeatingQC": "TA", 43 | "CentralAir": "Y", 44 | "Electrical": "SBrkr", 45 | "1stFlrSF": 896, 46 | "2ndFlrSF": 0, 47 | "LowQualFinSF": 0, 48 | "GrLivArea": 896, 49 | "BsmtFullBath": 0.0, 50 | "BsmtHalfBath": 0.0, 51 | "FullBath": 1, 52 | "HalfBath": 0, 53 | "BedroomAbvGr": 2, 54 | "KitchenAbvGr": 1, 55 | "KitchenQual": "TA", 56 | "TotRmsAbvGrd": 5, 57 | "Functional": "Typ", 
58 | "Fireplaces": 0, 59 | "FireplaceQu": null, 60 | "GarageType": "Attchd", 61 | "GarageYrBlt": 1961.0, 62 | "GarageFinish": "Unf", 63 | "GarageCars": 1.0, 64 | "GarageArea": 730.0, 65 | "GarageQual": "TA", 66 | "GarageCond": "TA", 67 | "PavedDrive": "Y", 68 | "WoodDeckSF": 140, 69 | "OpenPorchSF": 0, 70 | "EnclosedPorch": 0, 71 | "3SsnPorch": 0, 72 | "ScreenPorch": 120, 73 | "PoolArea": 0, 74 | "PoolQC": null, 75 | "Fence": "MnPrv", 76 | "MiscFeature": null, 77 | "MiscVal": 0, 78 | "MoSold": 6, 79 | "YrSold": 2010, 80 | "SaleType": "WD", 81 | "SaleCondition": "Normal" 82 | }] -------------------------------------------------------------------------------- /scripts/publish_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Building packages and uploading them to a Gemfury repository 4 | 5 | GEMFURY_URL=$GEMFURY_PUSH_URL 6 | 7 | set -e 8 | 9 | DIRS="$@" 10 | BASE_DIR=$(pwd) 11 | SETUP="setup.py" 12 | 13 | warn() { 14 | echo "$@" 1>&2 15 | } 16 | 17 | die() { 18 | warn "$@" 19 | exit 1 20 | } 21 | 22 | build() { 23 | DIR="${1/%\//}" 24 | echo "Checking directory $DIR" 25 | cd "$BASE_DIR/$DIR" 26 | [ ! -e $SETUP ] && warn "No $SETUP file, skipping" && return 27 | PACKAGE_NAME=$(python $SETUP --fullname) 28 | echo "Package $PACKAGE_NAME" 29 | python "$SETUP" sdist bdist_wheel || die "Building package $PACKAGE_NAME failed" 30 | for X in $(ls dist) 31 | do 32 | curl -F package=@"dist/$X" "$GEMFURY_URL" || die "Uploading package $PACKAGE_NAME failed on file dist/$X" 33 | done 34 | } 35 | 36 | if [ -n "$DIRS" ]; then 37 | for dir in $DIRS; do 38 | build $dir 39 | done 40 | else 41 | ls -d */ | while read dir; do 42 | build $dir 43 | done 44 | fi -------------------------------------------------------------------------------- /section-04-research-and-development/preprocessors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | 6 | 7 | 8 | class TemporalVariableTransformer(BaseEstimator, TransformerMixin): 9 | # Temporal elapsed time transformer 10 | 11 | def __init__(self, variables, reference_variable): 12 | 13 | if not isinstance(variables, list): 14 | raise ValueError('variables should be a list') 15 | 16 | self.variables = variables 17 | self.reference_variable = reference_variable 18 | 19 | def fit(self, X, y=None): 20 | # we need this step to fit the sklearn pipeline 21 | return self 22 | 23 | def transform(self, X): 24 | 25 | # so that we do not over-write the original dataframe 26 | X = X.copy() 27 | 28 | for feature in self.variables: 29 | X[feature] = X[self.reference_variable] - X[feature] 30 | 31 | return X 32 | 33 | 34 | 35 | # categorical missing value imputer 36 | class Mapper(BaseEstimator, TransformerMixin): 37 | 38 | def __init__(self, variables, mappings): 39 | 40 | if not isinstance(variables, list): 41 | raise ValueError('variables should be a list') 42 | 43 | self.variables = variables 44 | self.mappings = mappings 45 | 46 | def fit(self, X, y=None): 47 | # we need the fit statement to accomodate the sklearn pipeline 48 | return self 49 | 50 | def transform(self, X): 51 | X = X.copy() 52 | for feature in self.variables: 53 | X[feature] = X[feature].map(self.mappings) 54 | 55 | return X -------------------------------------------------------------------------------- /section-04-research-and-development/preprocessors_bonus.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | 5 | 6 | class MeanImputer(BaseEstimator, TransformerMixin): 7 | """Numerical missing value imputer.""" 8 | 9 | def __init__(self, variables): 10 | if not isinstance(variables, list): 11 | raise ValueError('variables should be a list') 12 | self.variables = variables 13 | 14 | def fit(self, X, y=None): 15 | # persist mean values in a dictionary 16 | self.imputer_dict_ = X[self.variables].mean().to_dict() 17 | return self 18 | 19 | def transform(self, X): 20 | X = X.copy() 21 | for feature in self.variables: 22 | X[feature].fillna(self.imputer_dict_[feature], 23 | inplace=True) 24 | return X 25 | 26 | 27 | 28 | class RareLabelCategoricalEncoder(BaseEstimator, TransformerMixin): 29 | """Groups infrequent categories into a single string""" 30 | 31 | def __init__(self, variables, tol=0.05): 32 | 33 | if not isinstance(variables, list): 34 | raise ValueError('variables should be a list') 35 | 36 | self.tol = tol 37 | self.variables = variables 38 | 39 | def fit(self, X, y=None): 40 | # persist frequent labels in dictionary 41 | self.encoder_dict_ = {} 42 | 43 | for var in self.variables: 44 | # the encoder will learn the most frequent categories 45 | t = pd.Series(X[var].value_counts(normalize=True)) 46 | # frequent labels: 47 | self.encoder_dict_[var] = list(t[t >= self.tol].index) 48 | 49 | return self 50 | 51 | def transform(self, X): 52 | X = X.copy() 53 | for feature in self.variables: 54 | X[feature] = np.where( 55 | X[feature].isin(self.encoder_dict_[feature]), 56 | X[feature], "Rare") 57 | 58 | return X 59 | 60 | 61 | class CategoricalEncoder(BaseEstimator, TransformerMixin): 62 | """String to numbers categorical encoder.""" 63 | 64 | def __init__(self, variables): 65 | 66 | if not isinstance(variables, list): 67 | raise ValueError('variables should be a list') 68 | 69 | self.variables = variables 70 | 71 | def fit(self, X, y): 72 | temp = pd.concat([X, y], axis=1) 73 | temp.columns = list(X.columns) + ["target"] 74 | 75 | # persist transforming dictionary 76 | self.encoder_dict_ = {} 77 | 78 | for var in self.variables: 79 | t = temp.groupby([var])["target"].mean().sort_values(ascending=True).index 80 | self.encoder_dict_[var] = {k: i for i, k in enumerate(t, 0)} 81 | 82 | return self 83 | 84 | def transform(self, X): 85 | # encode labels 86 | X = X.copy() 87 | for feature in self.variables: 88 | X[feature] = X[feature].map(self.encoder_dict_[feature]) 89 | 90 | return X -------------------------------------------------------------------------------- /section-04-research-and-development/requirements.txt: -------------------------------------------------------------------------------- 1 | feature-engine==1.0.2 2 | joblib==1.0.1 3 | matplotlib==3.3.4 4 | numpy==1.20.1 5 | pandas==1.2.2 6 | scikit-learn==0.24.1 7 | scipy==1.6.0 8 | seaborn==0.11.1 9 | statsmodels==0.12.2 -------------------------------------------------------------------------------- /section-05-production-model-package/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | include *.pkl 4 | recursive-include ./regression_model/* 5 | 6 | include regression_model/datasets/train.csv 7 | include regression_model/datasets/test.csv 8 | include regression_model/trained_models/*.pkl 9 | include regression_model/VERSION 10 | include regression_model/config.yml 11 | 
12 | include ./requirements/requirements.txt 13 | include ./requirements/test_requirements.txt 14 | exclude *.log 15 | exclude *.cfg 16 | 17 | recursive-exclude * __pycache__ 18 | recursive-exclude * *.py[co] -------------------------------------------------------------------------------- /section-05-production-model-package/mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | # warn_unreachable = True 3 | warn_unused_ignores = True 4 | follow_imports = skip 5 | show_error_context = True 6 | warn_incomplete_stub = True 7 | ignore_missing_imports = True 8 | check_untyped_defs = True 9 | cache_dir = /dev/null 10 | # Cannot enable this one as we still allow defining functions without any types. 11 | # disallow_untyped_defs = True 12 | warn_redundant_casts = True 13 | warn_unused_configs = True 14 | strict_optional = True -------------------------------------------------------------------------------- /section-05-production-model-package/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [tool.pytest.ini_options] 9 | minversion = "2.0" 10 | addopts = "-rfEX -p pytester --strict-markers" 11 | python_files = ["test_*.py", "*_test.py"] 12 | python_classes = ["Test", "Acceptance"] 13 | python_functions = ["test"] 14 | # NOTE: "doc" is not included here, but gets tested explicitly via "doctesting". 15 | testpaths = ["tests"] 16 | xfail_strict = true 17 | filterwarnings = [ 18 | "error", 19 | "default:Using or importing the ABCs:DeprecationWarning:unittest2.*", 20 | # produced by older pyparsing<=2.2.0. 21 | "default:Using or importing the ABCs:DeprecationWarning:pyparsing.*", 22 | "default:the imp module is deprecated in favour of importlib:DeprecationWarning:nose.*", 23 | # distutils is deprecated in 3.10, scheduled for removal in 3.12 24 | "ignore:The distutils package is deprecated:DeprecationWarning", 25 | # produced by python3.6/site.py itself (3.6.7 on Travis, could not trigger it with 3.6.8)." 26 | "ignore:.*U.*mode is deprecated:DeprecationWarning:(?!(pytest|_pytest))", 27 | # produced by pytest-xdist 28 | "ignore:.*type argument to addoption.*:DeprecationWarning", 29 | # produced on execnet (pytest-xdist) 30 | "ignore:.*inspect.getargspec.*deprecated, use inspect.signature.*:DeprecationWarning", 31 | # pytest's own futurewarnings 32 | "ignore::pytest.PytestExperimentalApiWarning", 33 | # Do not cause SyntaxError for invalid escape sequences in py37. 34 | # Those are caught/handled by pyupgrade, and not easy to filter with the 35 | # module being the filename (with .py removed). 
36 | "default:invalid escape sequence:DeprecationWarning", 37 | # ignore use of unregistered marks, because we use many to test the implementation 38 | "ignore::_pytest.warning_types.PytestUnknownMarkWarning", 39 | ] 40 | 41 | [tool.black] 42 | target-version = ['py311'] 43 | 44 | [tool.isort] 45 | profile = "black" 46 | line_length = 100 47 | lines_between_sections = 1 48 | skip = "migrations" 49 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.1 2 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from regression_model.config.core import PACKAGE_ROOT, config 4 | 5 | # It is strongly advised that you do not add any handlers other than 6 | # NullHandler to your library’s loggers. This is because the configuration 7 | # of handlers is the prerogative of the application developer who uses your 8 | # library. The application developer knows their target audience and what 9 | # handlers are most appropriate for their application: if you add handlers 10 | # ‘under the hood’, you might well interfere with their ability to carry out 11 | # unit tests and deliver logs which suit their requirements. 12 | # https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library 13 | logging.getLogger(config.app_config.package_name).addHandler(logging.NullHandler()) 14 | 15 | 16 | with open(PACKAGE_ROOT / "VERSION") as version_file: 17 | __version__ = version_file.read().strip() 18 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/config.yml: -------------------------------------------------------------------------------- 1 | # Package Overview 2 | package_name: regression_model 3 | 4 | # Data Files 5 | training_data_file: train.csv 6 | test_data_file: test.csv 7 | 8 | # Variables 9 | # The variable we are attempting to predict (sale price) 10 | target: SalePrice 11 | 12 | pipeline_name: regression_model 13 | pipeline_save_file: regression_model_output_v 14 | 15 | # Will cause syntax errors since they begin with numbers 16 | variables_to_rename: 17 | 1stFlrSF: FirstFlrSF 18 | 2ndFlrSF: SecondFlrSF 19 | 3SsnPorch: ThreeSsnPortch 20 | 21 | features: 22 | - MSSubClass 23 | - MSZoning 24 | - LotFrontage 25 | - LotShape 26 | - LandContour 27 | - LotConfig 28 | - Neighborhood 29 | - OverallQual 30 | - OverallCond 31 | - YearRemodAdd 32 | - RoofStyle 33 | - Exterior1st 34 | - ExterQual 35 | - Foundation 36 | - BsmtQual 37 | - BsmtExposure 38 | - BsmtFinType1 39 | - HeatingQC 40 | - CentralAir 41 | - FirstFlrSF # renamed 42 | - SecondFlrSF # renamed 43 | - GrLivArea 44 | - BsmtFullBath 45 | - HalfBath 46 | - KitchenQual 47 | - TotRmsAbvGrd 48 | - Functional 49 | - Fireplaces 50 | - FireplaceQu 51 | - GarageFinish 52 | - GarageCars 53 | - GarageArea 54 | - PavedDrive 55 | - WoodDeckSF 56 | - ScreenPorch 57 | - SaleCondition 58 | # this one is only to calculate temporal variable: 59 | - YrSold 60 | 61 | # set train/test split 62 | test_size: 0.1 63 | 64 | # to set the random seed 65 | random_state: 0 66 | 67 | alpha: 0.001 68 | 69 | # categorical variables with NA in train set 70 | categorical_vars_with_na_frequent: 71 | - BsmtQual 
72 | - BsmtExposure 73 | - BsmtFinType1 74 | - GarageFinish 75 | 76 | categorical_vars_with_na_missing: 77 | - FireplaceQu 78 | 79 | numerical_vars_with_na: 80 | - LotFrontage 81 | 82 | temporal_vars: 83 | - YearRemodAdd 84 | 85 | ref_var: YrSold 86 | 87 | 88 | # variables to log transform 89 | numericals_log_vars: 90 | - LotFrontage 91 | - FirstFlrSF 92 | - GrLivArea 93 | 94 | binarize_vars: 95 | - ScreenPorch 96 | 97 | # variables to map 98 | qual_vars: 99 | - ExterQual 100 | - BsmtQual 101 | - HeatingQC 102 | - KitchenQual 103 | - FireplaceQu 104 | 105 | exposure_vars: 106 | - BsmtExposure 107 | 108 | finish_vars: 109 | - BsmtFinType1 110 | 111 | garage_vars: 112 | - GarageFinish 113 | 114 | categorical_vars: 115 | - MSSubClass 116 | - MSZoning 117 | - LotShape 118 | - LandContour 119 | - LotConfig 120 | - Neighborhood 121 | - RoofStyle 122 | - Exterior1st 123 | - Foundation 124 | - CentralAir 125 | - Functional 126 | - PavedDrive 127 | - SaleCondition 128 | 129 | # variable mappings 130 | qual_mappings: 131 | Po: 1 132 | Fa: 2 133 | TA: 3 134 | Gd: 4 135 | Ex: 5 136 | Missing: 0 137 | NA: 0 138 | 139 | exposure_mappings: 140 | No: 1 141 | Mn: 2 142 | Av: 3 143 | Gd: 4 144 | 145 | 146 | finish_mappings: 147 | Missing: 0 148 | NA: 0 149 | Unf: 1 150 | LwQ: 2 151 | Rec: 3 152 | BLQ: 4 153 | ALQ: 5 154 | GLQ: 6 155 | 156 | 157 | garage_mappings: 158 | Missing: 0 159 | NA: 0 160 | Unf: 1 161 | RFn: 2 162 | Fin: 3 163 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/section-05-production-model-package/regression_model/config/__init__.py -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/config/core.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Dict, List, Optional, Sequence 3 | 4 | from pydantic import BaseModel 5 | from strictyaml import YAML, load 6 | 7 | import regression_model 8 | 9 | # Project Directories 10 | PACKAGE_ROOT = Path(regression_model.__file__).resolve().parent 11 | ROOT = PACKAGE_ROOT.parent 12 | CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml" 13 | DATASET_DIR = PACKAGE_ROOT / "datasets" 14 | TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models" 15 | 16 | 17 | class AppConfig(BaseModel): 18 | """ 19 | Application-level config. 20 | """ 21 | 22 | package_name: str 23 | training_data_file: str 24 | test_data_file: str 25 | pipeline_save_file: str 26 | 27 | 28 | class ModelConfig(BaseModel): 29 | """ 30 | All configuration relevant to model 31 | training and feature engineering. 
32 | """ 33 | 34 | target: str 35 | variables_to_rename: Dict 36 | features: List[str] 37 | test_size: float 38 | random_state: int 39 | alpha: float 40 | categorical_vars_with_na_frequent: List[str] 41 | categorical_vars_with_na_missing: List[str] 42 | numerical_vars_with_na: List[str] 43 | temporal_vars: List[str] 44 | ref_var: str 45 | numericals_log_vars: Sequence[str] 46 | binarize_vars: Sequence[str] 47 | qual_vars: List[str] 48 | exposure_vars: List[str] 49 | finish_vars: List[str] 50 | garage_vars: List[str] 51 | categorical_vars: Sequence[str] 52 | qual_mappings: Dict[str, int] 53 | exposure_mappings: Dict[str, int] 54 | garage_mappings: Dict[str, int] 55 | finish_mappings: Dict[str, int] 56 | 57 | 58 | class Config(BaseModel): 59 | """Master config object.""" 60 | 61 | app_config: AppConfig 62 | model_config: ModelConfig 63 | 64 | 65 | def find_config_file() -> Path: 66 | """Locate the configuration file.""" 67 | if CONFIG_FILE_PATH.is_file(): 68 | return CONFIG_FILE_PATH 69 | raise Exception(f"Config not found at {CONFIG_FILE_PATH!r}") 70 | 71 | 72 | def fetch_config_from_yaml(cfg_path: Optional[Path] = None) -> YAML: 73 | """Parse YAML containing the package configuration.""" 74 | 75 | if not cfg_path: 76 | cfg_path = find_config_file() 77 | 78 | if cfg_path: 79 | with open(cfg_path, "r") as conf_file: 80 | parsed_config = load(conf_file.read()) 81 | return parsed_config 82 | raise OSError(f"Did not find config file at path: {cfg_path}") 83 | 84 | 85 | def create_and_validate_config(parsed_config: YAML = None) -> Config: 86 | """Run validation on config values.""" 87 | if parsed_config is None: 88 | parsed_config = fetch_config_from_yaml() 89 | 90 | # specify the data attribute from the strictyaml YAML type. 91 | _config = Config( 92 | app_config=AppConfig(**parsed_config.data), 93 | model_config=ModelConfig(**parsed_config.data), 94 | ) 95 | 96 | return _config 97 | 98 | 99 | config = create_and_validate_config() 100 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/section-05-production-model-package/regression_model/datasets/__init__.py -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/pipeline.py: -------------------------------------------------------------------------------- 1 | from feature_engine.encoding import OrdinalEncoder, RareLabelEncoder 2 | from feature_engine.imputation import AddMissingIndicator, CategoricalImputer, MeanMedianImputer 3 | from feature_engine.selection import DropFeatures 4 | from feature_engine.transformation import LogTransformer 5 | from feature_engine.wrappers import SklearnTransformerWrapper 6 | from sklearn.linear_model import Lasso 7 | from sklearn.pipeline import Pipeline 8 | from sklearn.preprocessing import Binarizer, MinMaxScaler 9 | 10 | from regression_model.config.core import config 11 | from regression_model.processing import features as pp 12 | 13 | price_pipe = Pipeline( 14 | [ 15 | # ===== IMPUTATION ===== 16 | # impute categorical variables with string missing 17 | ( 18 | "missing_imputation", 19 | CategoricalImputer( 20 | imputation_method="missing", 21 | variables=config.model_config.categorical_vars_with_na_missing, 22 | ), 23 | ), 24 
| ( 25 | "frequent_imputation", 26 | CategoricalImputer( 27 | imputation_method="frequent", 28 | variables=config.model_config.categorical_vars_with_na_frequent, 29 | ), 30 | ), 31 | # add missing indicator 32 | ( 33 | "missing_indicator", 34 | AddMissingIndicator(variables=config.model_config.numerical_vars_with_na), 35 | ), 36 | # impute numerical variables with the mean 37 | ( 38 | "mean_imputation", 39 | MeanMedianImputer( 40 | imputation_method="mean", 41 | variables=config.model_config.numerical_vars_with_na, 42 | ), 43 | ), 44 | # == TEMPORAL VARIABLES ==== 45 | ( 46 | "elapsed_time", 47 | pp.TemporalVariableTransformer( 48 | variables=config.model_config.temporal_vars, 49 | reference_variable=config.model_config.ref_var, 50 | ), 51 | ), 52 | ("drop_features", DropFeatures(features_to_drop=[config.model_config.ref_var])), 53 | # ==== VARIABLE TRANSFORMATION ===== 54 | ("log", LogTransformer(variables=config.model_config.numericals_log_vars)), 55 | ( 56 | "binarizer", 57 | SklearnTransformerWrapper( 58 | transformer=Binarizer(threshold=0), 59 | variables=config.model_config.binarize_vars, 60 | ), 61 | ), 62 | # === mappers === 63 | ( 64 | "mapper_qual", 65 | pp.Mapper( 66 | variables=config.model_config.qual_vars, 67 | mappings=config.model_config.qual_mappings, 68 | ), 69 | ), 70 | ( 71 | "mapper_exposure", 72 | pp.Mapper( 73 | variables=config.model_config.exposure_vars, 74 | mappings=config.model_config.exposure_mappings, 75 | ), 76 | ), 77 | ( 78 | "mapper_finish", 79 | pp.Mapper( 80 | variables=config.model_config.finish_vars, 81 | mappings=config.model_config.finish_mappings, 82 | ), 83 | ), 84 | ( 85 | "mapper_garage", 86 | pp.Mapper( 87 | variables=config.model_config.garage_vars, 88 | mappings=config.model_config.garage_mappings, 89 | ), 90 | ), 91 | # == CATEGORICAL ENCODING 92 | ( 93 | "rare_label_encoder", 94 | RareLabelEncoder( 95 | tol=0.01, n_categories=1, variables=config.model_config.categorical_vars 96 | ), 97 | ), 98 | # encode categorical variables using the target mean 99 | ( 100 | "categorical_encoder", 101 | OrdinalEncoder( 102 | encoding_method="ordered", 103 | variables=config.model_config.categorical_vars, 104 | ), 105 | ), 106 | ("scaler", MinMaxScaler()), 107 | ( 108 | "Lasso", 109 | Lasso( 110 | alpha=config.model_config.alpha, 111 | random_state=config.model_config.random_state, 112 | ), 113 | ), 114 | ] 115 | ) 116 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/predict.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from regression_model import __version__ as _version 7 | from regression_model.config.core import config 8 | from regression_model.processing.data_manager import load_pipeline 9 | from regression_model.processing.validation import validate_inputs 10 | 11 | pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl" 12 | _price_pipe = load_pipeline(file_name=pipeline_file_name) 13 | 14 | 15 | def make_prediction( 16 | *, 17 | input_data: t.Union[pd.DataFrame, dict], 18 | ) -> dict: 19 | """Make a prediction using a saved model pipeline.""" 20 | 21 | data = pd.DataFrame(input_data) 22 | validated_data, errors = validate_inputs(input_data=data) 23 | results = {"predictions": None, "version": _version, "errors": errors} 24 | 25 | if not errors: 26 | predictions = _price_pipe.predict( 27 | 
X=validated_data[config.model_config.features] 28 | ) 29 | results = { 30 | "predictions": [np.exp(pred) for pred in predictions], # type: ignore 31 | "version": _version, 32 | "errors": errors, 33 | } 34 | 35 | return results 36 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/section-05-production-model-package/regression_model/processing/__init__.py -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/processing/data_manager.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from pathlib import Path 3 | 4 | import joblib 5 | import pandas as pd 6 | from sklearn.pipeline import Pipeline 7 | 8 | from regression_model import __version__ as _version 9 | from regression_model.config.core import DATASET_DIR, TRAINED_MODEL_DIR, config 10 | 11 | 12 | def load_dataset(*, file_name: str) -> pd.DataFrame: 13 | dataframe = pd.read_csv(Path(f"{DATASET_DIR}/{file_name}")) 14 | dataframe["MSSubClass"] = dataframe["MSSubClass"].astype("O") 15 | 16 | # rename variables beginning with numbers to avoid syntax errors later 17 | transformed = dataframe.rename(columns=config.model_config.variables_to_rename) 18 | return transformed 19 | 20 | 21 | def save_pipeline(*, pipeline_to_persist: Pipeline) -> None: 22 | """Persist the pipeline. 23 | Saves the versioned model, and overwrites any previous 24 | saved models. This ensures that when the package is 25 | published, there is only one trained model that can be 26 | called, and we know exactly how it was built. 27 | """ 28 | 29 | # Prepare versioned save file name 30 | save_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl" 31 | save_path = TRAINED_MODEL_DIR / save_file_name 32 | 33 | remove_old_pipelines(files_to_keep=[save_file_name]) 34 | joblib.dump(pipeline_to_persist, save_path) 35 | 36 | 37 | def load_pipeline(*, file_name: str) -> Pipeline: 38 | """Load a persisted pipeline.""" 39 | 40 | file_path = TRAINED_MODEL_DIR / file_name 41 | trained_model = joblib.load(filename=file_path) 42 | return trained_model 43 | 44 | 45 | def remove_old_pipelines(*, files_to_keep: t.List[str]) -> None: 46 | """ 47 | Remove old model pipelines. 48 | This is to ensure there is a simple one-to-one 49 | mapping between the package version and the model 50 | version to be imported and used by other applications. 
51 | """ 52 | do_not_delete = files_to_keep + ["__init__.py"] 53 | for model_file in TRAINED_MODEL_DIR.iterdir(): 54 | if model_file.name not in do_not_delete: 55 | model_file.unlink() 56 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/processing/features.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import pandas as pd 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | 6 | 7 | class TemporalVariableTransformer(BaseEstimator, TransformerMixin): 8 | """Temporal elapsed time transformer.""" 9 | 10 | def __init__(self, variables: List[str], reference_variable: str): 11 | 12 | if not isinstance(variables, list): 13 | raise ValueError("variables should be a list") 14 | 15 | self.variables = variables 16 | self.reference_variable = reference_variable 17 | 18 | def fit(self, X: pd.DataFrame, y: pd.Series = None): 19 | # we need this step to fit the sklearn pipeline 20 | return self 21 | 22 | def transform(self, X: pd.DataFrame) -> pd.DataFrame: 23 | 24 | # so that we do not overwrite the original dataframe 25 | X = X.copy() 26 | 27 | for feature in self.variables: 28 | X[feature] = X[self.reference_variable] - X[feature] 29 | 30 | return X 31 | 32 | 33 | class Mapper(BaseEstimator, TransformerMixin): 34 | """Categorical variable mapper.""" 35 | 36 | def __init__(self, variables: List[str], mappings: dict): 37 | 38 | if not isinstance(variables, list): 39 | raise ValueError("variables should be a list") 40 | 41 | self.variables = variables 42 | self.mappings = mappings 43 | 44 | def fit(self, X: pd.DataFrame, y: pd.Series = None): 45 | # we need the fit statement to accommodate the sklearn pipeline 46 | return self 47 | 48 | def transform(self, X: pd.DataFrame) -> pd.DataFrame: 49 | X = X.copy() 50 | for feature in self.variables: 51 | X[feature] = X[feature].map(self.mappings) 52 | 53 | return X 54 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/train_pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from config.core import config 3 | from pipeline import price_pipe 4 | from processing.data_manager import load_dataset, save_pipeline 5 | from sklearn.model_selection import train_test_split 6 | 7 | 8 | def run_training() -> None: 9 | """Train the model.""" 10 | 11 | # read training data 12 | data = load_dataset(file_name=config.app_config.training_data_file) 13 | 14 | # divide train and test 15 | X_train, X_test, y_train, y_test = train_test_split( 16 | data[config.model_config.features], # predictors 17 | data[config.model_config.target], 18 | test_size=config.model_config.test_size, 19 | # we are setting the random seed here 20 | # for reproducibility 21 | random_state=config.model_config.random_state, 22 | ) 23 | y_train = np.log(y_train) 24 | 25 | # fit model 26 | price_pipe.fit(X_train, y_train) 27 | 28 | # persist trained model 29 | save_pipeline(pipeline_to_persist=price_pipe) 30 | 31 | 32 | if __name__ == "__main__": 33 | run_training() 34 | -------------------------------------------------------------------------------- /section-05-production-model-package/regression_model/trained_models/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/section-05-production-model-package/regression_model/trained_models/__init__.py -------------------------------------------------------------------------------- /section-05-production-model-package/requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | # We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release) 2 | # to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small 3 | # updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes. 4 | numpy>=1.21.0,<2.0.0 5 | pandas>=1.3.5,<2.0.0 6 | pydantic>=1.8.1,<2.0.0 7 | scikit-learn>=1.1.3,<2.0.0 8 | strictyaml>=1.3.2,<2.0.0 9 | ruamel.yaml>=0.16.12,<1.0.0 10 | feature-engine>=1.0.2,<1.6.0 # breaking change in v1.6.0 11 | joblib>=1.0.1,<2.0.0 -------------------------------------------------------------------------------- /section-05-production-model-package/requirements/test_requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | 3 | # testing requirements 4 | pytest>=7.2.0,<8.0.0 5 | -------------------------------------------------------------------------------- /section-05-production-model-package/requirements/typing_requirements.txt: -------------------------------------------------------------------------------- 1 | # repo maintenance tooling 2 | black>=22.12.0,<23.0.0 3 | flake8>=6.0.0,<7.0.0 4 | mypy>=0.991,<1.0.0 5 | isort>=5.11.4,<6.0.0 -------------------------------------------------------------------------------- /section-05-production-model-package/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | 6 | from setuptools import find_packages, setup 7 | 8 | # Package meta-data. 9 | NAME = 'tid-regression-model' 10 | DESCRIPTION = "Example regression model package from Train In Data." 11 | URL = "https://github.com/trainindata/testing-and-monitoring-ml-deployments" 12 | EMAIL = "christopher.samiullah@protonmail.com" 13 | AUTHOR = "ChristopherGS" 14 | REQUIRES_PYTHON = ">=3.6.0" 15 | 16 | 17 | # The rest you shouldn't have to touch too much :) 18 | # ------------------------------------------------ 19 | # Except, perhaps the License and Trove Classifiers! 20 | # If you do change the License, remember to change the 21 | # Trove Classifier for that! 22 | long_description = DESCRIPTION 23 | 24 | # Load the package's VERSION file as a dictionary. 25 | about = {} 26 | ROOT_DIR = Path(__file__).resolve().parent 27 | REQUIREMENTS_DIR = ROOT_DIR / 'requirements' 28 | PACKAGE_DIR = ROOT_DIR / 'regression_model' 29 | with open(PACKAGE_DIR / "VERSION") as f: 30 | _version = f.read().strip() 31 | about["__version__"] = _version 32 | 33 | 34 | # What packages are required for this module to be executed? 
35 | def list_reqs(fname="requirements.txt"): 36 | with open(REQUIREMENTS_DIR / fname) as fd: 37 | return fd.read().splitlines() 38 | 39 | # Where the magic happens: 40 | setup( 41 | name=NAME, 42 | version=about["__version__"], 43 | description=DESCRIPTION, 44 | long_description=long_description, 45 | long_description_content_type="text/markdown", 46 | author=AUTHOR, 47 | author_email=EMAIL, 48 | python_requires=REQUIRES_PYTHON, 49 | url=URL, 50 | packages=find_packages(exclude=("tests",)), 51 | package_data={"regression_model": ["VERSION"]}, 52 | install_requires=list_reqs(), 53 | extras_require={}, 54 | include_package_data=True, 55 | license="BSD-3", 56 | classifiers=[ 57 | # Trove classifiers 58 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 59 | "License :: OSI Approved :: BSD License", 60 | "Programming Language :: Python", 61 | "Programming Language :: Python :: 3", 62 | "Programming Language :: Python :: 3.6", 63 | "Programming Language :: Python :: 3.7", 64 | "Programming Language :: Python :: 3.8", 65 | "Programming Language :: Python :: 3.9", 66 | "Programming Language :: Python :: Implementation :: CPython", 67 | "Programming Language :: Python :: Implementation :: PyPy", 68 | ], 69 | ) -------------------------------------------------------------------------------- /section-05-production-model-package/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/deploying-machine-learning-models/594bebbc451250d61e0a9b836c0a2ce794426d4f/section-05-production-model-package/tests/__init__.py -------------------------------------------------------------------------------- /section-05-production-model-package/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from regression_model.config.core import config 4 | from regression_model.processing.data_manager import load_dataset 5 | 6 | 7 | @pytest.fixture() 8 | def sample_input_data(): 9 | return load_dataset(file_name=config.app_config.test_data_file) 10 | -------------------------------------------------------------------------------- /section-05-production-model-package/tests/test_features.py: -------------------------------------------------------------------------------- 1 | from regression_model.config.core import config 2 | from regression_model.processing.features import TemporalVariableTransformer 3 | 4 | 5 | def test_temporal_variable_transformer(sample_input_data): 6 | # Given 7 | transformer = TemporalVariableTransformer( 8 | variables=config.model_config.temporal_vars, # YearRemodAdd 9 | reference_variable=config.model_config.ref_var, 10 | ) 11 | assert sample_input_data["YearRemodAdd"].iat[0] == 1961 12 | 13 | # When 14 | subject = transformer.fit_transform(sample_input_data) 15 | 16 | # Then 17 | assert subject["YearRemodAdd"].iat[0] == 49 18 | -------------------------------------------------------------------------------- /section-05-production-model-package/tests/test_prediction.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | 5 | from regression_model.predict import make_prediction 6 | 7 | 8 | def test_make_prediction(sample_input_data): 9 | # Given 10 | expected_first_prediction_value = 113422 11 | expected_no_predictions = 1449 12 | 13 | # When 14 | result = make_prediction(input_data=sample_input_data) 15 | 16 | # Then 17 | predictions =
result.get("predictions") 18 | assert isinstance(predictions, list) 19 | assert isinstance(predictions[0], np.float64) 20 | assert result.get("errors") is None 21 | assert len(predictions) == expected_no_predictions 22 | assert math.isclose(predictions[0], expected_first_prediction_value, abs_tol=100) 23 | -------------------------------------------------------------------------------- /section-05-production-model-package/tox.ini: -------------------------------------------------------------------------------- 1 | # Tox is a generic virtualenv management and test command line tool. Its goal is to 2 | # standardize testing in Python. We will be using it extensively in this course. 3 | 4 | # Using Tox we can (on multiple operating systems): 5 | # + Eliminate PYTHONPATH challenges when running scripts/tests 6 | # + Eliminate virtualenv setup confusion 7 | # + Streamline steps such as model training, model publishing 8 | 9 | 10 | [tox] 11 | min_version = 4 12 | envlist = test_package, checks 13 | skipsdist = True 14 | 15 | [testenv] 16 | basepython = python 17 | install_command = pip install {opts} {packages} 18 | allowlist_externals = train 19 | 20 | setenv = 21 | PYTHONPATH=. 22 | PYTHONHASHSEED=0 23 | 24 | [testenv:test_package] 25 | envdir = {toxworkdir}/test_package 26 | deps = 27 | -r{toxinidir}/requirements/test_requirements.txt 28 | commands= 29 | python regression_model/train_pipeline.py 30 | pytest \ 31 | -s \ 32 | -vv \ 33 | {posargs:tests/} 34 | 35 | [testenv:train] 36 | envdir = {toxworkdir}/test_package 37 | deps = 38 | {[testenv:test_package]deps} 39 | commands= 40 | python regression_model/train_pipeline.py 41 | 42 | 43 | [testenv:checks] 44 | envdir = {toxworkdir}/checks 45 | deps = 46 | -r{toxinidir}/requirements/typing_requirements.txt 47 | commands = 48 | flake8 regression_model tests 49 | isort regression_model tests 50 | {posargs:mypy regression_model} 51 | 52 | 53 | [flake8] 54 | exclude = .git,env 55 | max-line-length = 100 -------------------------------------------------------------------------------- /section-06-model-serving-api/house-prices-api/Procfile: -------------------------------------------------------------------------------- 1 | web: uvicorn app.main:app --host 0.0.0.0 --port $PORT -------------------------------------------------------------------------------- /section-06-model-serving-api/house-prices-api/app/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.2" 2 | -------------------------------------------------------------------------------- /section-06-model-serving-api/house-prices-api/app/api.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from fastapi import APIRouter, HTTPException 7 | from fastapi.encoders import jsonable_encoder 8 | from loguru import logger 9 | from regression_model import __version__ as model_version 10 | from regression_model.predict import make_prediction 11 | 12 | from app import __version__, schemas 13 | from app.config import settings 14 | 15 | api_router = APIRouter() 16 | 17 | 18 | @api_router.get("/health", response_model=schemas.Health, status_code=200) 19 | def health() -> dict: 20 | """ 21 | Root Get 22 | """ 23 | health = schemas.Health( 24 | name=settings.PROJECT_NAME, api_version=__version__, model_version=model_version 25 | ) 26 | 27 | return health.dict() 28 | 29 | 30 | @api_router.post("/predict", 
response_model=schemas.PredictionResults, status_code=200) 31 | async def predict(input_data: schemas.MultipleHouseDataInputs) -> Any: 32 | """ 33 | Make house price predictions with the TID regression model 34 | """ 35 | 36 | input_df = pd.DataFrame(jsonable_encoder(input_data.inputs)) 37 | 38 | # Advanced: You can improve performance of your API by rewriting the 39 | # `make prediction` function to be async and using await here. 40 | logger.info(f"Making prediction on inputs: {input_data.inputs}") 41 | results = make_prediction(input_data=input_df.replace({np.nan: None})) 42 | 43 | if results["errors"] is not None: 44 | logger.warning(f"Prediction validation error: {results.get('errors')}") 45 | raise HTTPException(status_code=400, detail=json.loads(results["errors"])) 46 | 47 | logger.info(f"Prediction results: {results.get('predictions')}") 48 | 49 | return results 50 | -------------------------------------------------------------------------------- /section-06-model-serving-api/house-prices-api/app/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from types import FrameType 4 | from typing import List, cast 5 | 6 | from loguru import logger 7 | from pydantic import AnyHttpUrl, BaseSettings 8 | 9 | 10 | class LoggingSettings(BaseSettings): 11 | LOGGING_LEVEL: int = logging.INFO # logging levels are type int 12 | 13 | 14 | class Settings(BaseSettings): 15 | API_V1_STR: str = "/api/v1" 16 | 17 | # Meta 18 | logging: LoggingSettings = LoggingSettings() 19 | 20 | # BACKEND_CORS_ORIGINS is a comma-separated list of origins 21 | # e.g: http://localhost,http://localhost:4200,http://localhost:3000 22 | BACKEND_CORS_ORIGINS: List[AnyHttpUrl] = [ 23 | "http://localhost:3000", # type: ignore 24 | "http://localhost:8000", # type: ignore 25 | "https://localhost:3000", # type: ignore 26 | "https://localhost:8000", # type: ignore 27 | ] 28 | 29 | PROJECT_NAME: str = "House Price Prediction API" 30 | 31 | class Config: 32 | case_sensitive = True 33 | 34 | 35 | # See: https://loguru.readthedocs.io/en/stable/overview.html#entirely-compatible-with-standard-logging # noqa 36 | class InterceptHandler(logging.Handler): 37 | def emit(self, record: logging.LogRecord) -> None: # pragma: no cover 38 | # Get corresponding Loguru level if it exists 39 | try: 40 | level = logger.level(record.levelname).name 41 | except ValueError: 42 | level = str(record.levelno) 43 | 44 | # Find caller from where originated the logged message 45 | frame, depth = logging.currentframe(), 2 46 | while frame.f_code.co_filename == logging.__file__: # noqa: WPS609 47 | frame = cast(FrameType, frame.f_back) 48 | depth += 1 49 | 50 | logger.opt(depth=depth, exception=record.exc_info).log( 51 | level, 52 | record.getMessage(), 53 | ) 54 | 55 | 56 | def setup_app_logging(config: Settings) -> None: 57 | """Prepare custom logging for our application.""" 58 | 59 | LOGGERS = ("uvicorn.asgi", "uvicorn.access") 60 | logging.getLogger().handlers = [InterceptHandler()] 61 | for logger_name in LOGGERS: 62 | logging_logger = logging.getLogger(logger_name) 63 | logging_logger.handlers = [InterceptHandler(level=config.logging.LOGGING_LEVEL)] 64 | 65 | logger.configure( 66 | handlers=[{"sink": sys.stderr, "level": config.logging.LOGGING_LEVEL}] 67 | ) 68 | 69 | 70 | settings = Settings() 71 | -------------------------------------------------------------------------------- /section-06-model-serving-api/house-prices-api/app/main.py: 
-------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from fastapi import APIRouter, FastAPI, Request 4 | from fastapi.middleware.cors import CORSMiddleware 5 | from fastapi.responses import HTMLResponse 6 | from loguru import logger 7 | 8 | from app.api import api_router 9 | from app.config import settings, setup_app_logging 10 | 11 | # setup logging as early as possible 12 | setup_app_logging(config=settings) 13 | 14 | 15 | app = FastAPI( 16 | title=settings.PROJECT_NAME, openapi_url=f"{settings.API_V1_STR}/openapi.json" 17 | ) 18 | 19 | root_router = APIRouter() 20 | 21 | 22 | @root_router.get("/") 23 | def index(request: Request) -> Any: 24 | """Basic HTML response.""" 25 | body = ( 26 | "" 27 | "
" 28 | "