├── packages
│ ├── ml_api
│ │ ├── VERSION
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── differential_tests
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_differential.py
│ │ │ ├── conftest.py
│ │ │ ├── test_validation.py
│ │ │ ├── capture_model_predictions.py
│ │ │ └── test_controller.py
│ │ ├── run.sh
│ │ ├── api
│ │ │ ├── __init__.py
│ │ │ ├── app.py
│ │ │ ├── config.py
│ │ │ └── controller.py
│ │ ├── run.py
│ │ ├── requirements.txt
│ │ ├── diff_test_requirements.txt
│ │ └── tox.ini
│ ├── regression_model
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ └── test_predict.py
│ │ ├── regression_model
│ │ │ ├── VERSION
│ │ │ ├── config
│ │ │ │ ├── __init__.py
│ │ │ │ ├── logging_config.py
│ │ │ │ └── config.py
│ │ │ ├── datasets
│ │ │ │ └── __init__.py
│ │ │ ├── processing
│ │ │ │ ├── __init__.py
│ │ │ │ ├── errors.py
│ │ │ │ ├── features.py
│ │ │ │ ├── validation.py
│ │ │ │ └── data_management.py
│ │ │ ├── trained_models
│ │ │ │ └── __init__.py
│ │ │ ├── __init__.py
│ │ │ ├── train_pipeline.py
│ │ │ ├── predict.py
│ │ │ └── pipeline.py
│ │ ├── MANIFEST.in
│ │ ├── tox.ini
│ │ ├── requirements.txt
│ │ └── setup.py
│ └── neural_network_model
│   ├── tests
│   │ ├── __init__.py
│   │ ├── conftest.py
│   │ └── test_predict.py
│   ├── neural_network_model
│   │ ├── VERSION
│   │ ├── config
│   │ │ ├── __init__.py
│   │ │ └── config.py
│   │ ├── datasets
│   │ │ ├── __init__.py
│   │ │ └── test_data
│   │ │   ├── __init__.py
│   │ │   ├── Charlock
│   │ │   │ └── 1.png
│   │ │   └── Black-grass
│   │ │     └── 1.png
│   │ ├── processing
│   │ │ ├── __init__.py
│   │ │ ├── errors.py
│   │ │ └── preprocessors.py
│   │ ├── trained_models
│   │ │ └── __init__.py
│   │ ├── __init__.py
│   │ ├── pipeline.py
│   │ ├── train_pipeline.py
│   │ ├── predict.py
│   │ └── model.py
│   ├── config.yml
│   ├── requirements.txt
│   ├── MANIFEST.in
│   └── setup.py
├── assignment-section-05
│ ├── tests
│ │ ├── __init__.py
│ │ ├── test_features.py
│ │ ├── conftest.py
│ │ └── test_prediction.py
│ ├── classification_model
│ │ ├── VERSION
│ │ ├── config
│ │ │ ├── __init__.py
│ │ │ └── core.py
│ │ ├── datasets
│ │ │ └── __init__.py
│ │ ├── processing
│ │ │ ├── __init__.py
│ │ │ ├── features.py
│ │ │ ├── validation.py
│ │ │ └── data_manager.py
│ │ ├── trained_models
│ │ │ └── __init__.py
│ │ ├── __init__.py
│ │ ├── config.yml
│ │ ├── predict.py
│ │ ├── train_pipeline.py
│ │ └── pipeline.py
│ ├── requirements
│ │ ├── test_requirements.txt
│ │ └── requirements.txt
│ ├── mypy.ini
│ ├── MANIFEST.in
│ ├── README.md
│ ├── tox.ini
│ ├── pyproject.toml
│ └── setup.py
├── section-05-production-model-package
│ ├── tests
│ │ ├── __init__.py
│ │ ├── conftest.py
│ │ ├── test_features.py
│ │ └── test_prediction.py
│ ├── regression_model
│ │ ├── VERSION
│ │ ├── config
│ │ │ ├── __init__.py
│ │ │ └── core.py
│ │ ├── datasets
│ │ │ └── __init__.py
│ │ ├── processing
│ │ │ ├── __init__.py
│ │ │ ├── features.py
│ │ │ ├── data_manager.py
│ │ │ └── validation.py
│ │ ├── trained_models
│ │ │ └── __init__.py
│ │ ├── __init__.py
│ │ ├── train_pipeline.py
│ │ ├── predict.py
│ │ ├── config.yml
│ │ └── pipeline.py
│ ├── requirements
│ │ ├── test_requirements.txt
│ │ └── requirements.txt
│ ├── mypy.ini
│ ├── MANIFEST.in
│ ├── tox.ini
│ ├── pyproject.toml
│ └── setup.py
├── section-07-ci-and-publishing
│ ├── model-package
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── conftest.py
│ │ │ ├── test_features.py
│ │ │ └── test_prediction.py
│ │ ├── regression_model
│ │ │ ├── VERSION
│ │ │ ├── config
│ │ │ │ ├── __init__.py
│ │ │ │ └── core.py
│ │ │ ├── datasets
│ │ │ │ └── __init__.py
│ │ │ ├── processing
│ │ │ │ ├── __init__.py
│ │ │ │ ├── features.py
│ │ │ │ └── data_manager.py
│ │ │ ├── trained_models
│ │ │ │ └── __init__.py
│ │ │ ├── __init__.py
│ │ │ ├── train_pipeline.py
│ │ │ ├── predict.py
│ │ │ ├── config.yml
│ │ │ └── pipeline.py
│ │ ├── requirements
│ │ │ ├── test_requirements.txt
│ │ │ └── requirements.txt
│ │ ├── mypy.ini
│ │ ├── MANIFEST.in
│ │ ├── publish_model.sh
│ │ ├── pyproject.toml
│ │ ├── setup.py
│ │ └── tox.ini
│ └── house-prices-api
│   ├── app
│   │ ├── tests
│   │ │ ├── __init__.py
│   │ │ ├── conftest.py
│   │ │ └── test_api.py
│   │ ├── __init__.py
│   │ ├── schemas
│   │ │ ├── __init__.py
│   │ │ ├── health.py
│   │ │ └── predict.py
│   │ ├── main.py
│   │ ├── api.py
│   │ └── config.py
│   ├── runtime.txt
│   ├── Procfile
│   ├── mypy.ini
│   ├── test_requirements.txt
│   ├── requirements.txt
│   └── tox.ini
├── section-06-model-serving-api
│ └── house-prices-api
│   ├── app
│   │ ├── tests
│   │ │ ├── __init__.py
│   │ │ ├── conftest.py
│   │ │ └── test_api.py
│   │ ├── __init__.py
│   │ ├── schemas
│   │ │ ├── __init__.py
│   │ │ ├── health.py
│   │ │ └── predict.py
│   │ ├── main.py
│   │ ├── api.py
│   │ └── config.py
│   ├── runtime.txt
│   ├── Procfile
│   ├── mypy.ini
│   ├── requirements.txt
│   ├── test_requirements.txt
│   └── tox.ini
├── section-08-deploying-with-containers
│ ├── house-prices-api
│ │ ├── app
│ │ │ ├── tests
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conftest.py
│ │ │ │ └── test_api.py
│ │ │ ├── __init__.py
│ │ │ ├── schemas
│ │ │ │ ├── __init__.py
│ │ │ │ ├── health.py
│ │ │ │ └── predict.py
│ │ │ ├── main.py
│ │ │ ├── api.py
│ │ │ └── config.py
│ │ ├── runtime.txt
│ │ ├── run.sh
│ │ ├── Procfile
│ │ ├── mypy.ini
│ │ ├── test_requirements.txt
│ │ ├── requirements.txt
│ │ └── tox.ini
│ ├── .dockerignore
│ ├── Makefile
│ └── Dockerfile
├── .dockerignore
├── scripts
│ ├── fetch_kaggle_dataset.sh
│ ├── fetch_kaggle_large_dataset.sh
│ ├── publish_model.sh
│ └── input_test.json
├── section-04-research-and-development
│ ├── requirements.txt
│ ├── preprocessors.py
│ └── preprocessors_bonus.py
├── README.md
├── Dockerfile
├── Makefile
├── LICENSE
└── .gitignore
/packages/ml_api/VERSION:
--------------------------------------------------------------------------------
1 | 0.3.0
--------------------------------------------------------------------------------
/packages/ml_api/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/assignment-section-05/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/packages/regression_model/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/packages/neural_network_model/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/packages/ml_api/tests/differential_tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/section-05-production-model-package/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/VERSION:
--------------------------------------------------------------------------------
1 | 2.0.20
--------------------------------------------------------------------------------
/assignment-section-05/classification_model/VERSION:
--------------------------------------------------------------------------------
1 | 0.0.1
2 |
--------------------------------------------------------------------------------
/assignment-section-05/classification_model/config/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/assignment-section-05/classification_model/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/VERSION:
--------------------------------------------------------------------------------
1 | 0.1.0
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/config/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/assignment-section-05/classification_model/processing/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/processing/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/section-06-model-serving-api/house-prices-api/app/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/house-prices-api/app/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/assignment-section-05/classification_model/trained_models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/config/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/processing/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/trained_models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/section-05-production-model-package/regression_model/VERSION:
--------------------------------------------------------------------------------
1 | 0.0.1
2 |
--------------------------------------------------------------------------------
/section-05-production-model-package/regression_model/config/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/section-05-production-model-package/regression_model/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/section-06-model-serving-api/house-prices-api/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.9.5
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/house-prices-api/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.9.5
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/trained_models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/section-05-production-model-package/regression_model/processing/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/regression_model/VERSION:
--------------------------------------------------------------------------------
1 | 4.0.3
2 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/regression_model/config/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/house-prices-api/app/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/house-prices-api/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.9.5
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/datasets/test_data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/section-05-production-model-package/regression_model/trained_models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/regression_model/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/regression_model/processing/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/section-06-model-serving-api/house-prices-api/app/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.0.2"
2 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/house-prices-api/app/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.0.2"
2 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/regression_model/trained_models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/house-prices-api/app/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.0.2"
2 |
--------------------------------------------------------------------------------
/section-06-model-serving-api/house-prices-api/Procfile:
--------------------------------------------------------------------------------
1 | web: uvicorn app.main:app --host 0.0.0.0 --port $PORT
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/house-prices-api/Procfile:
--------------------------------------------------------------------------------
1 | web: uvicorn app.main:app --host 0.0.0.0 --port $PORT
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/house-prices-api/run.sh:
--------------------------------------------------------------------------------
1 | uvicorn app.main:app --host 0.0.0.0 --port $PORT
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/house-prices-api/Procfile:
--------------------------------------------------------------------------------
1 | web: uvicorn app.main:app --host 0.0.0.0 --port $PORT
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | jupyter_notebooks*
2 | */env*
3 | */venv*
4 | .circleci*
5 | packages/regression_model
6 | *.env
7 | *.log
8 | .git
9 | .gitignore
--------------------------------------------------------------------------------
/section-06-model-serving-api/house-prices-api/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | plugins = pydantic.mypy
3 | ignore_missing_imports = True
4 | disallow_untyped_defs = True
5 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/house-prices-api/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | plugins = pydantic.mypy
3 | ignore_missing_imports = True
4 | disallow_untyped_defs = True
5 |
--------------------------------------------------------------------------------
/packages/ml_api/run.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export IS_DEBUG=${DEBUG:-false}
3 | exec gunicorn --bind 0.0.0.0:5000 --access-logfile - --error-logfile - run:application
--------------------------------------------------------------------------------
/section-06-model-serving-api/house-prices-api/app/schemas/__init__.py:
--------------------------------------------------------------------------------
1 | from .health import Health
2 | from .predict import MultipleHouseDataInputs, PredictionResults
3 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/house-prices-api/app/schemas/__init__.py:
--------------------------------------------------------------------------------
1 | from .health import Health
2 | from .predict import MultipleHouseDataInputs, PredictionResults
3 |
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/house-prices-api/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | plugins = pydantic.mypy
3 | ignore_missing_imports = True
4 | disallow_untyped_defs = True
5 |
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/house-prices-api/app/schemas/__init__.py:
--------------------------------------------------------------------------------
1 | from .health import Health
2 | from .predict import MultipleHouseDataInputs, PredictionResults
3 |
--------------------------------------------------------------------------------
/packages/ml_api/api/__init__.py:
--------------------------------------------------------------------------------
1 | from api.config import PACKAGE_ROOT
2 |
3 | with open(PACKAGE_ROOT / 'VERSION') as version_file:
4 |     __version__ = version_file.read().strip()
5 |
--------------------------------------------------------------------------------
/scripts/fetch_kaggle_dataset.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | kaggle competitions download -c house-prices-advanced-regression-techniques -p packages/regression_model/regression_model/datasets/
--------------------------------------------------------------------------------
/packages/neural_network_model/config.yml:
--------------------------------------------------------------------------------
1 | MODEL_NAME: ${MODEL_NAME:cnn_model}
2 | PIPELINE_NAME: ${PIPELINE_NAME:cnn_pipe}
3 | CLASSES_PATH: ${CLASSES_PATH:False}
4 | IMAGE_SIZE: ${IMAGE_SIZE:150}
5 |
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/.dockerignore:
--------------------------------------------------------------------------------
1 | jupyter_notebooks*
2 | */env*
3 | */venv*
4 | .circleci*
5 | packages/regression_model
6 | *.env
7 | *.log
8 | .git
9 | .gitignore
10 | .tox
--------------------------------------------------------------------------------
/section-06-model-serving-api/house-prices-api/app/schemas/health.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 |
4 | class Health(BaseModel):
5 |     name: str
6 |     api_version: str
7 |     model_version: str
8 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/house-prices-api/app/schemas/health.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 |
4 | class Health(BaseModel):
5 |     name: str
6 |     api_version: str
7 |     model_version: str
8 |
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/house-prices-api/app/schemas/health.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 |
4 | class Health(BaseModel):
5 |     name: str
6 |     api_version: str
7 |     model_version: str
8 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/processing/errors.py:
--------------------------------------------------------------------------------
1 | class BaseError(Exception):
2 | """Base package error."""
3 |
4 |
5 | class InvalidModelInputError(BaseError):
6 | """Model input contains an error."""
7 |
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/processing/errors.py:
--------------------------------------------------------------------------------
1 | class BaseError(Exception):
2 | """Base package error."""
3 |
4 |
5 | class InvalidModelInputError(BaseError):
6 | """Model input contains an error."""
7 |
--------------------------------------------------------------------------------
/section-04-research-and-development/requirements.txt:
--------------------------------------------------------------------------------
1 | feature-engine==1.0.2
2 | joblib==1.0.1
3 | matplotlib==3.3.4
4 | numpy==1.20.1
5 | pandas==1.2.2
6 | scikit-learn==0.24.1
7 | scipy==1.6.0
8 | seaborn==0.11.1
9 | statsmodels==0.12.2
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/datasets/test_data/Charlock/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rochitasundar/deploying-machine-learning-models/master/packages/neural_network_model/neural_network_model/datasets/test_data/Charlock/1.png
--------------------------------------------------------------------------------
/assignment-section-05/requirements/test_requirements.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 |
3 | # testing requirements
4 | pytest>=6.2.3,<6.3.0
5 |
6 | # repo maintenance tooling
7 | black==20.8b1
8 | flake8>=3.9.0,<3.10.0
9 | mypy==0.812
10 | isort==5.8.0
11 |
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/datasets/test_data/Black-grass/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rochitasundar/deploying-machine-learning-models/master/packages/neural_network_model/neural_network_model/datasets/test_data/Black-grass/1.png
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from neural_network_model.config import config
4 |
5 |
6 | with open(os.path.join(config.PACKAGE_ROOT, 'VERSION')) as version_file:
7 |     __version__ = version_file.read().strip()
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Deployment of Machine Learning Models
2 | Accompanying repo for the online course Deployment of Machine Learning Models.
3 |
4 | For the documentation, visit the [course on Udemy](https://www.udemy.com/deployment-of-machine-learning-models/?couponCode=TIDREPO).
5 |
--------------------------------------------------------------------------------
/packages/ml_api/run.py:
--------------------------------------------------------------------------------
1 | from api.app import create_app
2 | from api.config import DevelopmentConfig, ProductionConfig
3 |
4 |
5 | application = create_app(
6 |     config_object=ProductionConfig)
7 |
8 |
9 | if __name__ == '__main__':
10 |     application.run()
11 |
--------------------------------------------------------------------------------
/section-05-production-model-package/requirements/test_requirements.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 |
3 | # testing requirements
4 | pytest>=6.2.3,<6.3.0
5 |
6 | # repo maintenance tooling
7 | black>=22.0.0,<23.0.0
8 | flake8>=3.9.0,<3.10.0
9 | mypy==0.812
10 | isort==5.8.0
11 |
--------------------------------------------------------------------------------
/section-06-model-serving-api/house-prices-api/requirements.txt:
--------------------------------------------------------------------------------
1 | uvicorn>=0.16.0,<0.18.0
2 | fastapi>=0.64.0,<1.0.0
3 | python-multipart>=0.0.5,<0.1.0
4 | pydantic>=1.8.1,<1.10.0
5 | typing_extensions>=3.7.4,<4.0.0
6 | loguru>=0.5.3,<0.6.0
7 | # We will explain this in the course
8 | tid-regression-model==3.0.3
--------------------------------------------------------------------------------
/section-06-model-serving-api/house-prices-api/test_requirements.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 |
3 | # testing requirements
4 | pytest>=6.2.3,<6.3.0
5 | requests>=2.23.0,<2.24.0
6 |
7 | # repo maintenance tooling
8 | black>=22.0.0,<23.0.0
9 | flake8>=3.9.0,<3.10.0
10 | mypy==0.812
11 | isort==5.8.0
12 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/house-prices-api/test_requirements.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 |
3 | # testing requirements
4 | pytest>=6.2.3,<6.3.0
5 | requests>=2.23.0,<2.24.0
6 |
7 | # repo maintenance tooling
8 | black>=22.0.0,<23.0.0
9 | flake8>=3.9.0,<3.10.0
10 | mypy==0.812
11 | isort==5.8.0
12 |
--------------------------------------------------------------------------------
/packages/ml_api/requirements.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url=${PIP_EXTRA_INDEX_URL}
2 |
3 | # api
4 | flask>=1.1.1,<1.2.0
5 |
6 | # schema validation
7 | marshmallow==2.17.0
8 |
9 | # Install from gemfury
10 | regression-model==2.0.20
11 | neural_network_model==0.1.1
12 |
13 | # Deployment
14 | gunicorn==19.9.0
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/house-prices-api/test_requirements.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 |
3 | # testing requirements
4 | pytest>=6.2.3,<6.3.0
5 | requests>=2.23.0,<2.24.0
6 |
7 | # repo maintenance tooling
8 | black>=22.0.0,<23.0.0
9 | flake8>=3.9.0,<3.10.0
10 | mypy==0.812
11 | isort==5.8.0
12 |
--------------------------------------------------------------------------------
/section-05-production-model-package/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from regression_model.config.core import config
4 | from regression_model.processing.data_manager import load_dataset
5 |
6 |
7 | @pytest.fixture()
8 | def sample_input_data():
9 |     return load_dataset(file_name=config.app_config.test_data_file)
10 |
--------------------------------------------------------------------------------
/packages/ml_api/diff_test_requirements.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url=${PIP_EXTRA_INDEX_URL}
2 |
3 | # api
4 | flask>=1.1.1,<1.2.0
5 |
6 | # schema validation
7 | marshmallow==2.17.0
8 |
9 | # Set this to the previous model version
10 | regression-model==2.0.19
11 |
12 | # temporarily necessary as we update sklearn
13 | joblib>=0.14.1,<0.15.0
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from regression_model.config.core import config
4 | from regression_model.processing.data_manager import load_dataset
5 |
6 |
7 | @pytest.fixture()
8 | def sample_input_data():
9 |     return load_dataset(file_name=config.app_config.test_data_file)
10 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/house-prices-api/requirements.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url=${PIP_EXTRA_INDEX_URL}
2 |
3 | uvicorn>=0.16.0,<0.18.0
4 | fastapi>=0.64.0,<1.0.0
5 | python-multipart>=0.0.5,<0.1.0
6 | pydantic>=1.8.1,<1.9.0
7 | typing_extensions>=3.7.4,<4.0.0
8 | loguru>=0.5.3,<0.6.0
9 | # fetched from gemfury
10 | tid-regression-model==4.0.2
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/house-prices-api/requirements.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url=${PIP_EXTRA_INDEX_URL}
2 |
3 | uvicorn>=0.16.0,<0.18.0
4 | fastapi>=0.64.0,<1.0.0
5 | python-multipart>=0.0.5,<0.1.0
6 | pydantic>=1.8.1,<1.10.0
7 | typing_extensions>=3.7.4,<4.0.0
8 | loguru>=0.5.3,<0.6.0
9 | # fetched from gemfury
10 | tid-regression-model==4.0.2
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/requirements/test_requirements.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 |
3 | # testing requirements
4 | pytest>=6.2.3,<6.3.0
5 |
6 | # repo maintenance tooling
7 | black>=22.0.0,<23.0.0
8 | flake8>=3.9.0,<3.10.0
9 | mypy==0.812
10 | isort==5.8.0
11 |
12 | # new in section 7: for fetching data in CI
13 | kaggle>=1.5.12,<1.6.0
14 |
--------------------------------------------------------------------------------
/packages/neural_network_model/requirements.txt:
--------------------------------------------------------------------------------
1 | # production requirements
2 | pandas==0.23.4
3 | numpy==1.13.3
4 | scikit-learn==0.19.0
5 | Keras==2.1.3
6 | opencv-python==4.0.0.21
7 | h5py==2.9.0
8 | Theano==0.9.0
9 |
10 | # packaging
11 | setuptools==40.6.3
12 | wheel==0.32.3
13 |
14 | # testing requirements
15 | pytest==4.0.2
16 |
17 | # fetching datasets
18 | kaggle==1.5.1.1
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/pipeline.py:
--------------------------------------------------------------------------------
1 | from sklearn.pipeline import Pipeline
2 |
3 | from neural_network_model.config import config
4 | from neural_network_model.processing import preprocessors as pp
5 | from neural_network_model import model
6 |
7 |
8 | pipe = Pipeline([
9 |     ('dataset', pp.CreateDataset(config.IMAGE_SIZE)),
10 |     ('cnn_model', model.cnn_clf)])
11 |
--------------------------------------------------------------------------------
/packages/ml_api/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from api.app import create_app
4 | from api.config import TestingConfig
5 |
6 |
7 | @pytest.fixture
8 | def app():
9 |     app = create_app(config_object=TestingConfig)
10 |
11 |     with app.app_context():
12 |         yield app
13 |
14 |
15 | @pytest.fixture
16 | def flask_test_client(app):
17 |     with app.test_client() as test_client:
18 |         yield test_client
19 |
--------------------------------------------------------------------------------
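A minimal sketch of how the flask_test_client fixture above is typically consumed. The endpoint path here is hypothetical, for illustration only; the real request-level tests live in tests/test_controller.py, which is not reproduced in this dump:

    def test_health_endpoint_returns_200(flask_test_client):
        # When: issue a request against the in-process test client
        response = flask_test_client.get('/health')

        # Then: the app responds successfully
        assert response.status_code == 200
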
/assignment-section-05/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | warn_unreachable = False
3 | warn_unused_ignores = True
4 | follow_imports = skip
5 | show_error_context = True
6 | warn_incomplete_stub = True
7 | ignore_missing_imports = True
8 | check_untyped_defs = True
9 | cache_dir = /dev/null
10 | # Allow defining functions without any types.
11 | disallow_untyped_defs = False
12 | warn_redundant_casts = True
13 | warn_unused_configs = True
14 | strict_optional = True
--------------------------------------------------------------------------------
/packages/regression_model/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | include *.md
3 | include *.cfg
4 | include *.pkl
5 | recursive-include ./regression_model/*
6 |
7 | include regression_model/datasets/train.csv
8 | include regression_model/datasets/test.csv
9 | include regression_model/trained_models/*.pkl
10 | include regression_model/VERSION
11 |
12 | include ./requirements.txt
13 | exclude *.log
14 |
15 | recursive-exclude * __pycache__
16 | recursive-exclude * *.py[co]
--------------------------------------------------------------------------------
/section-05-production-model-package/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | # warn_unreachable = True
3 | warn_unused_ignores = True
4 | follow_imports = skip
5 | show_error_context = True
6 | warn_incomplete_stub = True
7 | ignore_missing_imports = True
8 | check_untyped_defs = True
9 | cache_dir = /dev/null
10 | # Cannot enable this one as we still allow defining functions without any types.
11 | # disallow_untyped_defs = True
12 | warn_redundant_casts = True
13 | warn_unused_configs = True
14 | strict_optional = True
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | # warn_unreachable = True
3 | warn_unused_ignores = True
4 | follow_imports = skip
5 | show_error_context = True
6 | warn_incomplete_stub = True
7 | ignore_missing_imports = True
8 | check_untyped_defs = True
9 | cache_dir = /dev/null
10 | # Cannot enable this one as we still allow defining functions without any types.
11 | # disallow_untyped_defs = True
12 | warn_redundant_casts = True
13 | warn_unused_configs = True
14 | strict_optional = True
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/Makefile:
--------------------------------------------------------------------------------
1 | heroku-login:
2 | 	HEROKU_API_KEY=${HEROKU_API_KEY} heroku container:login
3 |
4 | build-ml-api-heroku: heroku-login
5 | 	docker build --build-arg PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -t registry.heroku.com/${HEROKU_APP_NAME}/web .
6 |
7 | push-ml-api-heroku: heroku-login
8 | 	docker push registry.heroku.com/${HEROKU_APP_NAME}/web
9 |
10 | release-heroku: heroku-login
11 | 	heroku container:release web --app ${HEROKU_APP_NAME}
12 |
13 | .PHONY: heroku-login build-ml-api-heroku push-ml-api-heroku release-heroku
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from regression_model.config import config
4 | from regression_model.config import logging_config
5 |
6 |
7 | VERSION_PATH = config.PACKAGE_ROOT / 'VERSION'
8 |
9 | # Configure logger for use in package
10 | logger = logging.getLogger(__name__)
11 | logger.setLevel(logging.DEBUG)
12 | logger.addHandler(logging_config.get_console_handler())
13 | logger.propagate = False
14 |
15 |
16 | with open(VERSION_PATH, 'r') as version_file:
17 |     __version__ = version_file.read().strip()
18 |
--------------------------------------------------------------------------------
/packages/ml_api/api/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask
2 |
3 | from api.config import get_logger
4 |
5 |
6 | _logger = get_logger(logger_name=__name__)
7 |
8 |
9 | def create_app(*, config_object) -> Flask:
10 | """Create a flask app instance."""
11 |
12 | flask_app = Flask('ml_api')
13 | flask_app.config.from_object(config_object)
14 |
15 | # import blueprints
16 | from api.controller import prediction_app
17 | flask_app.register_blueprint(prediction_app)
18 | _logger.debug('Application instance created')
19 |
20 | return flask_app
21 |
--------------------------------------------------------------------------------
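For orientation: create_app registers a blueprint imported from api.controller, a module not reproduced in this dump. A hedged sketch of the general shape such a blueprint plausibly takes (the route and handler are illustrative assumptions, not the real controller code):

    from flask import Blueprint, jsonify

    # Hypothetical blueprint shape; the real api/controller.py may differ.
    prediction_app = Blueprint('prediction_app', __name__)

    @prediction_app.route('/health', methods=['GET'])
    def health():
        return jsonify({'status': 'ok'})
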
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.6.4
2 |
3 | # Create the user that will run the app
4 | RUN adduser --disabled-password --gecos '' ml-api-user
5 |
6 | WORKDIR /opt/ml_api
7 |
8 | ARG PIP_EXTRA_INDEX_URL
9 | ENV FLASK_APP run.py
10 |
11 | # Install requirements, including from Gemfury
12 | ADD ./packages/ml_api /opt/ml_api/
13 | RUN pip install --upgrade pip
14 | RUN pip install -r /opt/ml_api/requirements.txt
15 |
16 | RUN chmod +x /opt/ml_api/run.sh
17 | RUN chown -R ml-api-user:ml-api-user ./
18 |
19 | USER ml-api-user
20 |
21 | EXPOSE 5000
22 |
23 | CMD ["bash", "./run.sh"]
--------------------------------------------------------------------------------
/packages/neural_network_model/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | include *.md
3 | include *.cfg
4 | include *.pkl
5 | recursive-include ./neural_network_model/*.py
6 |
7 | include neural_network_model/trained_models/*.pkl
8 | include neural_network_model/trained_models/*.h5
9 | include neural_network_model/VERSION
10 | include neural_network_model/datasets/test_data/Black-grass/1.png
11 | include neural_network_model/datasets/test_data/Charlock/1.png
12 |
13 | include ./requirements.txt
14 | exclude *.log
15 |
16 | recursive-exclude * __pycache__
17 | recursive-exclude * *.py[co]
--------------------------------------------------------------------------------
/section-05-production-model-package/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | include *.md
3 | include *.pkl
4 | recursive-include ./regression_model/*
5 |
6 | include regression_model/datasets/train.csv
7 | include regression_model/datasets/test.csv
8 | include regression_model/trained_models/*.pkl
9 | include regression_model/VERSION
10 | include regression_model/config.yml
11 |
12 | include ./requirements/requirements.txt
13 | include ./requirements/test_requirements.txt
14 | exclude *.log
15 | exclude *.cfg
16 |
17 | recursive-exclude * __pycache__
18 | recursive-exclude * *.py[co]
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | include *.md
3 | include *.pkl
4 | recursive-include ./regression_model/*
5 |
6 | include regression_model/datasets/train.csv
7 | include regression_model/datasets/test.csv
8 | include regression_model/trained_models/*.pkl
9 | include regression_model/VERSION
10 | include regression_model/config.yml
11 |
12 | include ./requirements/requirements.txt
13 | include ./requirements/test_requirements.txt
14 | exclude *.log
15 | exclude *.cfg
16 |
17 | recursive-exclude * __pycache__
18 | recursive-exclude * *.py[co]
--------------------------------------------------------------------------------
/assignment-section-05/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | include *.md
3 | include *.pkl
4 | recursive-include ./classification_model/*
5 |
6 | include classification_model/datasets/train.csv
7 | include classification_model/datasets/test.csv
8 | include classification_model/trained_models/*.pkl
9 | include classification_model/VERSION
10 | include classification_model/config.yml
11 |
12 | include ./requirements/requirements.txt
13 | include ./requirements/test_requirements.txt
14 | exclude *.log
15 | exclude *.cfg
16 |
17 | recursive-exclude * __pycache__
18 | recursive-exclude * *.py[co]
--------------------------------------------------------------------------------
/packages/neural_network_model/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import os
3 |
4 | from neural_network_model.config import config
5 |
6 |
7 | @pytest.fixture
8 | def black_grass_dir():
9 |     test_data_dir = os.path.join(config.DATASET_DIR, 'test_data')
10 |     black_grass_dir = os.path.join(test_data_dir, 'Black-grass')
11 |
12 |     return black_grass_dir
13 |
14 |
15 | @pytest.fixture
16 | def charlock_dir():
17 |     test_data_dir = os.path.join(config.DATASET_DIR, 'test_data')
18 |     charlock_dir = os.path.join(test_data_dir, 'Charlock')
19 |
20 |     return charlock_dir
21 |
--------------------------------------------------------------------------------
/assignment-section-05/tests/test_features.py:
--------------------------------------------------------------------------------
1 | from classification_model.config.core import config
2 | from classification_model.processing.features import ExtractLetterTransformer
3 |
4 |
5 | def test_extract_letter_transformer(sample_input_data):
6 |     # Given
7 |     transformer = ExtractLetterTransformer(
8 |         variables=config.model_config.cabin_vars,  # cabin
9 |     )
10 |     assert sample_input_data["cabin"].iat[6] == "E12"
11 |
12 |     # When
13 |     subject = transformer.fit_transform(sample_input_data)
14 |
15 |     # Then
16 |     assert subject["cabin"].iat[6] == "E"
17 |
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9.4
2 |
3 | # Create the user that will run the app
4 | RUN adduser --disabled-password --gecos '' ml-api-user
5 |
6 | WORKDIR /opt/house-prices-api
7 |
8 | ARG PIP_EXTRA_INDEX_URL
9 |
10 | # Install requirements, including from Gemfury
11 | ADD ./house-prices-api /opt/house-prices-api/
12 | RUN pip install --upgrade pip
13 | RUN pip install -r /opt/house-prices-api/requirements.txt
14 |
15 | RUN chmod +x /opt/house-prices-api/run.sh
16 | RUN chown -R ml-api-user:ml-api-user ./
17 |
18 | USER ml-api-user
19 |
20 | EXPOSE 8001
21 |
22 | CMD ["bash", "./run.sh"]
23 |
--------------------------------------------------------------------------------
/assignment-section-05/requirements/requirements.txt:
--------------------------------------------------------------------------------
1 | # We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release)
2 | # to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small
3 | # updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes.
4 | numpy>=1.20.0,<1.21.0
5 | pandas>=1.3.5,<1.4.0
6 | pydantic>=1.8.1,<1.9.0
7 | scikit-learn>=0.24.2,<0.25.0
8 | strictyaml>=1.3.2,<1.4.0
9 | ruamel.yaml==0.16.12
10 | feature-engine>=1.0.2,<1.3.0
11 | joblib>=1.0.1,<1.1.0
--------------------------------------------------------------------------------
/section-05-production-model-package/requirements/requirements.txt:
--------------------------------------------------------------------------------
1 | # We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release)
2 | # to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small
3 | # updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes.
4 | numpy>=1.20.0,<1.21.0
5 | pandas>=1.3.5,<1.4.0
6 | pydantic>=1.8.1,<1.9.0
7 | scikit-learn>=1.0.2,<1.1.0
8 | strictyaml>=1.3.2,<1.4.0
9 | ruamel.yaml==0.16.12
10 | feature-engine>=1.0.2,<1.1.0
11 | joblib>=1.0.1,<1.1.0
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/requirements/requirements.txt:
--------------------------------------------------------------------------------
1 | # We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release)
2 | # to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small
3 | # updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes.
4 | numpy>=1.20.0,<1.21.0
5 | pandas>=1.3.5,<1.4.0
6 | pydantic>=1.8.1,<1.9.0
7 | scikit-learn>=1.0.2,<1.1.0
8 | strictyaml>=1.3.2,<1.4.0
9 | ruamel.yaml==0.16.12
10 | feature-engine>=1.0.2,<1.1.0
11 | joblib>=1.0.1,<1.1.0
--------------------------------------------------------------------------------
/packages/neural_network_model/tests/test_predict.py:
--------------------------------------------------------------------------------
1 | from neural_network_model import __version__ as _version
2 | from neural_network_model.predict import make_single_prediction
3 |
4 |
5 | def test_make_prediction_on_sample(charlock_dir):
6 |     # Given
7 |     filename = '1.png'
8 |     expected_classification = 'Charlock'
9 |
10 |     # When
11 |     results = make_single_prediction(image_directory=charlock_dir,
12 |                                      image_name=filename)
13 |
14 |     # Then
15 |     assert results['predictions'] is not None
16 |     assert results['readable_predictions'][0] == expected_classification
17 |     assert results['version'] == _version
18 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/config/logging_config.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 |
4 |
5 | # Multiple calls to logging.getLogger('someLogger') return a
6 | # reference to the same logger object. This is true not only
7 | # within the same module, but also across modules as long as
8 | # it is in the same Python interpreter process.
9 |
10 | FORMATTER = logging.Formatter(
11 |     "%(asctime)s — %(name)s — %(levelname)s — %(funcName)s:%(lineno)d — %(message)s"
12 | )
13 |
14 |
15 | def get_console_handler():
16 |     console_handler = logging.StreamHandler(sys.stdout)
17 |     console_handler.setFormatter(FORMATTER)
18 |     return console_handler
19 |
--------------------------------------------------------------------------------
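Usage note: get_console_handler is attached to the package-level logger in regression_model/__init__.py (shown earlier in this dump). A minimal standalone example of the same wiring:

    import logging

    from regression_model.config import logging_config

    # Attach the shared console handler, mirroring regression_model/__init__.py
    logger = logging.getLogger("example")
    logger.setLevel(logging.DEBUG)
    logger.addHandler(logging_config.get_console_handler())
    logger.info("records are written to stdout with the shared format")
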
/section-06-model-serving-api/house-prices-api/app/tests/conftest.py:
--------------------------------------------------------------------------------
1 | from typing import Generator
2 |
3 | import pandas as pd
4 | import pytest
5 | from fastapi.testclient import TestClient
6 | from regression_model.config.core import config
7 | from regression_model.processing.data_manager import load_dataset
8 |
9 | from app.main import app
10 |
11 |
12 | @pytest.fixture(scope="module")
13 | def test_data() -> pd.DataFrame:
14 |     return load_dataset(file_name=config.app_config.test_data_file)
15 |
16 |
17 | @pytest.fixture()
18 | def client() -> Generator:
19 |     with TestClient(app) as _client:
20 |         yield _client
21 |         app.dependency_overrides = {}
22 |
--------------------------------------------------------------------------------
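A note on the client fixture above: resetting app.dependency_overrides after the yield ensures that any FastAPI dependency a test stubs out does not leak into later tests. A hedged sketch of such an override (the dependency name is hypothetical, for illustration only):

    from app.main import app

    def fake_dependency():
        # Stand-in returned instead of the real dependency during a test
        return {"model_version": "test"}

    # Hypothetical dependency name; the fixture's cleanup removes it again:
    # app.dependency_overrides[get_example_dependency] = fake_dependency
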
/section-07-ci-and-publishing/house-prices-api/app/tests/conftest.py:
--------------------------------------------------------------------------------
1 | from typing import Generator
2 |
3 | import pandas as pd
4 | import pytest
5 | from fastapi.testclient import TestClient
6 | from regression_model.config.core import config
7 | from regression_model.processing.data_manager import load_dataset
8 |
9 | from app.main import app
10 |
11 |
12 | @pytest.fixture(scope="module")
13 | def test_data() -> pd.DataFrame:
14 |     return load_dataset(file_name=config.app_config.test_data_file)
15 |
16 |
17 | @pytest.fixture()
18 | def client() -> Generator:
19 |     with TestClient(app) as _client:
20 |         yield _client
21 |         app.dependency_overrides = {}
22 |
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/house-prices-api/app/tests/conftest.py:
--------------------------------------------------------------------------------
1 | from typing import Generator
2 |
3 | import pandas as pd
4 | import pytest
5 | from fastapi.testclient import TestClient
6 | from regression_model.config.core import config
7 | from regression_model.processing.data_manager import load_dataset
8 |
9 | from app.main import app
10 |
11 |
12 | @pytest.fixture(scope="module")
13 | def test_data() -> pd.DataFrame:
14 |     return load_dataset(file_name=config.app_config.test_data_file)
15 |
16 |
17 | @pytest.fixture()
18 | def client() -> Generator:
19 |     with TestClient(app) as _client:
20 |         yield _client
21 |         app.dependency_overrides = {}
22 |
--------------------------------------------------------------------------------
/section-05-production-model-package/tests/test_features.py:
--------------------------------------------------------------------------------
1 | from regression_model.config.core import config
2 | from regression_model.processing.features import TemporalVariableTransformer
3 |
4 |
5 | def test_temporal_variable_transformer(sample_input_data):
6 |     # Given
7 |     transformer = TemporalVariableTransformer(
8 |         variables=config.model_config.temporal_vars,  # YearRemodAdd
9 |         reference_variable=config.model_config.ref_var,
10 |     )
11 |     assert sample_input_data["YearRemodAdd"].iat[0] == 1961
12 |
13 |     # When
14 |     subject = transformer.fit_transform(sample_input_data)
15 |
16 |     # Then
17 |     assert subject["YearRemodAdd"].iat[0] == 49
18 |
--------------------------------------------------------------------------------
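The transformer under test is defined in regression_model/processing/features.py, which is not reproduced in this dump. A hedged sketch consistent with the assertions above (1961 becoming 49 implies each temporal variable is replaced by its distance from a reference year, here YrSold = 2010):

    from sklearn.base import BaseEstimator, TransformerMixin

    class TemporalVariableTransformer(BaseEstimator, TransformerMixin):
        """Sketch: replace each temporal variable with reference - variable."""

        def __init__(self, variables, reference_variable):
            if not isinstance(variables, list):
                raise ValueError("variables should be a list")
            self.variables = variables
            self.reference_variable = reference_variable

        def fit(self, X, y=None):
            # nothing to learn; present to satisfy the sklearn pipeline API
            return self

        def transform(self, X):
            X = X.copy()
            for feature in self.variables:
                X[feature] = X[self.reference_variable] - X[feature]
            return X
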
/section-07-ci-and-publishing/model-package/tests/test_features.py:
--------------------------------------------------------------------------------
1 | from regression_model.config.core import config
2 | from regression_model.processing.features import TemporalVariableTransformer
3 |
4 |
5 | def test_temporal_variable_transformer(sample_input_data):
6 |     # Given
7 |     transformer = TemporalVariableTransformer(
8 |         variables=config.model_config.temporal_vars,  # YearRemodAdd
9 |         reference_variable=config.model_config.ref_var,
10 |     )
11 |     assert sample_input_data["YearRemodAdd"].iat[0] == 1961
12 |
13 |     # When
14 |     subject = transformer.fit_transform(sample_input_data)
15 |
16 |     # Then
17 |     assert subject["YearRemodAdd"].iat[0] == 49
18 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | NAME=udemy-ml-api
2 | COMMIT_ID=$(shell git rev-parse HEAD)
3 |
4 |
5 | build-ml-api-heroku:
6 | 	docker build --build-arg PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -t registry.heroku.com/$(NAME)/web:$(COMMIT_ID) .
7 |
8 | push-ml-api-heroku:
9 | 	docker push registry.heroku.com/${HEROKU_APP_NAME}/web:$(COMMIT_ID)
10 |
11 | build-ml-api-aws:
12 | 	docker build --build-arg PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} -t $(NAME):$(COMMIT_ID) .
13 |
14 | push-ml-api-aws:
15 | 	docker push ${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/$(NAME):$(COMMIT_ID)
16 |
17 | tag-ml-api:
18 | 	docker tag $(NAME):$(COMMIT_ID) ${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/$(NAME):$(COMMIT_ID)
19 |
--------------------------------------------------------------------------------
/packages/ml_api/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py36, py37, py38
3 | skipsdist = True
4 |
5 |
6 | [testenv]
7 | install_command = pip install --pre {opts} {packages}
8 | deps =
9 |     -rrequirements.txt
10 |
11 | passenv =
12 |     PIP_EXTRA_INDEX_URL
13 |     KERAS_BACKEND
14 |
15 | setenv =
16 |     PYTHONPATH=.
17 |
18 | commands =
19 |     pytest \
20 |     -s \
21 |     -v \
22 |     -m "not differential" \
23 |     {posargs:tests}
24 |
25 |
26 | # content of pytest.ini
27 | [pytest]
28 | markers =
29 |     integration: mark a test as an integration test.
30 |     differential: mark a test as a differential test.
31 | filterwarnings =
32 |     ignore::DeprecationWarning
--------------------------------------------------------------------------------
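The markers declared in the [pytest] section are applied with pytest.mark, which is how the -m "not differential" filter above excludes the slow differential suite from the default run. A minimal hypothetical example (the real differential tests live in tests/differential_tests/test_differential.py):

    import pytest

    @pytest.mark.differential
    def test_predictions_match_previous_model_version():
        ...  # in a real test, compared against captured predictions
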
/packages/regression_model/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py36, py37, py38
3 |
4 |
5 | [testenv]
6 | install_command = pip install --pre {opts} {packages}
7 | whitelist_externals = unzip
8 | deps =
9 |     -rrequirements.txt
10 |
11 | passenv =
12 |     KAGGLE_USERNAME
13 |     KAGGLE_KEY
14 |
15 | setenv =
16 |     PYTHONPATH=.
17 |
18 | commands =
19 |     kaggle competitions download -c house-prices-advanced-regression-techniques -p regression_model/datasets/
20 |     unzip -o regression_model/datasets/house-prices-advanced-regression-techniques.zip -d regression_model/datasets
21 |     python regression_model/train_pipeline.py
22 |     pytest \
23 |     -s \
24 |     -v \
25 |     {posargs:tests}
26 |
--------------------------------------------------------------------------------
/packages/regression_model/requirements.txt:
--------------------------------------------------------------------------------
1 | # We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release)
2 | # to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small
3 | # updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes.
4 |
5 | # Model Building Requirements
6 | numpy>=1.18.1,<1.19.0
7 | pandas>=0.25.3,<0.26.0
8 | scikit-learn>=0.22.1,<0.23.0
9 | joblib>=0.14.1,<0.15.0
10 |
11 | # testing requirements
12 | pytest>=5.3.2,<6.0.0
13 |
14 | # packaging
15 | setuptools>=41.4.0,<42.0.0
16 | wheel>=0.33.6,<0.34.0
17 |
18 | # fetching datasets
19 | kaggle>=1.5.6,<1.6.0
20 |
--------------------------------------------------------------------------------
/section-05-production-model-package/tests/test_prediction.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import numpy as np
4 |
5 | from regression_model.predict import make_prediction
6 |
7 |
8 | def test_make_prediction(sample_input_data):
9 |     # Given
10 |     expected_first_prediction_value = 113422
11 |     expected_no_predictions = 1449
12 |
13 |     # When
14 |     result = make_prediction(input_data=sample_input_data)
15 |
16 |     # Then
17 |     predictions = result.get("predictions")
18 |     assert isinstance(predictions, list)
19 |     assert isinstance(predictions[0], np.float64)
20 |     assert result.get("errors") is None
21 |     assert len(predictions) == expected_no_predictions
22 |     assert math.isclose(predictions[0], expected_first_prediction_value, abs_tol=100)
23 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/tests/test_prediction.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import numpy as np
4 |
5 | from regression_model.predict import make_prediction
6 |
7 |
8 | def test_make_prediction(sample_input_data):
9 |     # Given
10 |     expected_first_prediction_value = 113422
11 |     expected_no_predictions = 1449
12 |
13 |     # When
14 |     result = make_prediction(input_data=sample_input_data)
15 |
16 |     # Then
17 |     predictions = result.get("predictions")
18 |     assert isinstance(predictions, list)
19 |     assert isinstance(predictions[0], np.float64)
20 |     assert result.get("errors") is None
21 |     assert len(predictions) == expected_no_predictions
22 |     assert math.isclose(predictions[0], expected_first_prediction_value, abs_tol=100)
23 |
--------------------------------------------------------------------------------
/assignment-section-05/classification_model/processing/features.py:
--------------------------------------------------------------------------------
1 | from sklearn.base import BaseEstimator, TransformerMixin
2 |
3 |
4 | class ExtractLetterTransformer(BaseEstimator, TransformerMixin):
5 | # Extract first letter of variable
6 |
7 | def __init__(self, variables):
8 |
9 | if not isinstance(variables, list):
10 | raise ValueError("variables should be a list")
11 |
12 | self.variables = variables
13 |
14 | def fit(self, X, y=None):
15 | # we need this step to fit the sklearn pipeline
16 | return self
17 |
18 | def transform(self, X):
19 |
20 | # so that we do not over-write the original dataframe
21 | X = X.copy()
22 |
23 | for feature in self.variables:
24 | X[feature] = X[feature].str[0]
25 |
26 | return X
27 |
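Note: a hypothetical usage sketch (data invented for illustration), extracting the deck letter from a cabin code via the fit_transform that TransformerMixin provides:

    import pandas as pd

    df = pd.DataFrame({"cabin": ["C22", "B5", "E12"]})
    transformer = ExtractLetterTransformer(variables=["cabin"])
    print(transformer.fit_transform(df)["cabin"].tolist())  # ['C', 'B', 'E']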
--------------------------------------------------------------------------------
/assignment-section-05/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import pytest
4 | from sklearn.model_selection import train_test_split
5 |
6 | from classification_model.config.core import config
7 | from classification_model.processing.data_manager import _load_raw_dataset
8 |
9 | logger = logging.getLogger(__name__)
10 |
11 |
12 | @pytest.fixture
13 | def sample_input_data():
14 | data = _load_raw_dataset(file_name=config.app_config.raw_data_file)
15 |
16 | # divide train and test
17 | X_train, X_test, y_train, y_test = train_test_split(
18 | data, # predictors
19 | data[config.model_config.target],
20 | test_size=config.model_config.test_size,
21 | # we are setting the random seed here
22 | # for reproducibility
23 | random_state=config.model_config.random_state,
24 | )
25 |
26 | return X_test
27 |
--------------------------------------------------------------------------------
/section-06-model-serving-api/house-prices-api/app/tests/test_api.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from fastapi.testclient import TestClient
6 |
7 |
8 | def test_make_prediction(client: TestClient, test_data: pd.DataFrame) -> None:
9 | # Given
10 | payload = {
11 | # ensure pydantic plays well with np.nan
12 | "inputs": test_data.replace({np.nan: None}).to_dict(orient="records")
13 | }
14 |
15 | # When
16 | response = client.post(
17 | "http://localhost:8001/api/v1/predict",
18 | json=payload,
19 | )
20 |
21 | # Then
22 | assert response.status_code == 200
23 | prediction_data = response.json()
24 | assert prediction_data["predictions"]
25 | assert prediction_data["errors"] is None
26 | assert math.isclose(prediction_data["predictions"][0], 113422, abs_tol=100)
27 |
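Note the abs_tol keyword in the final assertion: rel_tol=100 would scale the allowed difference by the larger operand (a tolerance of 100x its magnitude), so almost any prediction would pass, whereas abs_tol=100 enforces the intended band of 100 units, matching the package-level prediction tests. To see the difference:

    import math

    # rel_tol scales with magnitude: 100 * 113422 permits a huge difference
    assert math.isclose(5, 113_422, rel_tol=100)
    # abs_tol is an absolute band of 100 units
    assert not math.isclose(5, 113_422, abs_tol=100)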
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/house-prices-api/app/tests/test_api.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from fastapi.testclient import TestClient
6 |
7 |
8 | def test_make_prediction(client: TestClient, test_data: pd.DataFrame) -> None:
9 | # Given
10 | payload = {
11 | # ensure pydantic plays well with np.nan
12 | "inputs": test_data.replace({np.nan: None}).to_dict(orient="records")
13 | }
14 |
15 | # When
16 | response = client.post(
17 | "http://localhost:8001/api/v1/predict",
18 | json=payload,
19 | )
20 |
21 | # Then
22 | assert response.status_code == 200
23 | prediction_data = response.json()
24 | assert prediction_data["predictions"]
25 | assert prediction_data["errors"] is None
26 | assert math.isclose(prediction_data["predictions"][0], 113422, abs_tol=100)
27 |
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/house-prices-api/app/tests/test_api.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from fastapi.testclient import TestClient
6 |
7 |
8 | def test_make_prediction(client: TestClient, test_data: pd.DataFrame) -> None:
9 | # Given
10 | payload = {
11 | # ensure pydantic plays well with np.nan
12 | "inputs": test_data.replace({np.nan: None}).to_dict(orient="records")
13 | }
14 |
15 | # When
16 | response = client.post(
17 | "http://localhost:8001/api/v1/predict",
18 | json=payload,
19 | )
20 |
21 | # Then
22 | assert response.status_code == 200
23 | prediction_data = response.json()
24 | assert prediction_data["predictions"]
25 | assert prediction_data["errors"] is None
26 | assert math.isclose(prediction_data["predictions"][0], 113422, abs_tol=100)
27 |
--------------------------------------------------------------------------------
/assignment-section-05/tests/test_prediction.py:
--------------------------------------------------------------------------------
1 | """
2 | Note: These tests will fail if you have not first trained the model.
3 | """
4 |
5 | import numpy as np
6 | from sklearn.metrics import accuracy_score
7 |
8 | from classification_model.predict import make_prediction
9 |
10 |
11 | def test_make_prediction(sample_input_data):
12 | # Given
13 | expected_no_predictions = 131
14 |
15 | # When
16 | result = make_prediction(input_data=sample_input_data)
17 |
18 | # Then
19 | predictions = result.get("predictions")
20 | assert isinstance(predictions, np.ndarray)
21 | assert isinstance(predictions[0], np.int64)
22 | assert result.get("errors") is None
23 | assert len(predictions) == expected_no_predictions
24 | _predictions = list(predictions)
25 | y_true = sample_input_data["survived"]
26 | accuracy = accuracy_score(y_true, _predictions)
27 | assert accuracy > 0.7
28 |
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/train_pipeline.py:
--------------------------------------------------------------------------------
1 | from sklearn.externals import joblib
2 |
3 | from neural_network_model import pipeline as pipe
4 | from neural_network_model.config import config
5 | from neural_network_model.processing import data_management as dm
6 | from neural_network_model.processing import preprocessors as pp
7 |
8 |
9 | def run_training(save_result: bool = True):
10 | """Train a Convolutional Neural Network."""
11 |
12 | images_df = dm.load_image_paths(config.DATA_FOLDER)
13 | X_train, X_test, y_train, y_test = dm.get_train_test_target(images_df)
14 |
15 | enc = pp.TargetEncoder()
16 | enc.fit(y_train)
17 | y_train = enc.transform(y_train)
18 |
19 | pipe.pipe.fit(X_train, y_train)
20 |
21 | if save_result:
22 | joblib.dump(enc, config.ENCODER_PATH)
23 | dm.save_pipeline_keras(pipe.pipe)
24 |
25 |
26 | if __name__ == '__main__':
27 | run_training(save_result=True)
28 |
--------------------------------------------------------------------------------
/assignment-section-05/classification_model/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from classification_model.config.core import PACKAGE_ROOT, config
4 |
5 | # It is strongly advised that you do not add any handlers other than
6 | # NullHandler to your library’s loggers. This is because the configuration
7 | # of handlers is the prerogative of the application developer who uses your
8 | # library. The application developer knows their target audience and what
9 | # handlers are most appropriate for their application: if you add handlers
10 | # ‘under the hood’, you might well interfere with their ability to carry out
11 | # unit tests and deliver logs which suit their requirements.
12 | # https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library
13 | logging.getLogger(config.app_config.package_name).addHandler(logging.NullHandler())
14 |
15 |
16 | with open(PACKAGE_ROOT / "VERSION") as version_file:
17 | __version__ = version_file.read().strip()
18 |
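Note: because the library only attaches a NullHandler, log output is the consuming application's decision. A minimal sketch of that application-side configuration (the logger name matches package_name from config.yml):

    import logging

    # application code, not library code: route records to the root handler
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("classification_model").info("now visible")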
--------------------------------------------------------------------------------
/section-05-production-model-package/regression_model/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from regression_model.config.core import PACKAGE_ROOT, config
4 |
5 | # It is strongly advised that you do not add any handlers other than
6 | # NullHandler to your library’s loggers. This is because the configuration
7 | # of handlers is the prerogative of the application developer who uses your
8 | # library. The application developer knows their target audience and what
9 | # handlers are most appropriate for their application: if you add handlers
10 | # ‘under the hood’, you might well interfere with their ability to carry out
11 | # unit tests and deliver logs which suit their requirements.
12 | # https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library
13 | logging.getLogger(config.app_config.package_name).addHandler(logging.NullHandler())
14 |
15 |
16 | with open(PACKAGE_ROOT / "VERSION") as version_file:
17 | __version__ = version_file.read().strip()
18 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/regression_model/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from regression_model.config.core import PACKAGE_ROOT, config
4 |
5 | # It is strongly advised that you do not add any handlers other than
6 | # NullHandler to your library’s loggers. This is because the configuration
7 | # of handlers is the prerogative of the application developer who uses your
8 | # library. The application developer knows their target audience and what
9 | # handlers are most appropriate for their application: if you add handlers
10 | # ‘under the hood’, you might well interfere with their ability to carry out
11 | # unit tests and deliver logs which suit their requirements.
12 | # https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library
13 | logging.getLogger(config.app_config.package_name).addHandler(logging.NullHandler())
14 |
15 |
16 | with open(PACKAGE_ROOT / "VERSION") as version_file:
17 | __version__ = version_file.read().strip()
18 |
--------------------------------------------------------------------------------
/assignment-section-05/classification_model/config.yml:
--------------------------------------------------------------------------------
1 | # Package Overview
2 | package_name: classification_model
3 |
4 | # Data Files
5 | raw_data_file: raw.csv
6 | training_data_file: train.csv
7 | test_data_file: test.csv
8 |
9 | # Variables
10 | # The variable we are attempting to predict (survival)
11 | target: survived
12 |
13 | pipeline_name: titanic_classification_model
14 | pipeline_save_file: titanic_classification_model_output_v
15 |
16 | features:
17 | - pclass
18 | - sex
19 | - age
20 | - sibsp
21 | - parch
22 | - fare
23 | - cabin
24 | - embarked
25 | - title # generated from name
26 |
27 | # set train/test split
28 | test_size: 0.1
29 |
30 | # to set the random seed
31 | random_state: 0
32 |
33 | unused_fields:
34 | - name
35 | - ticket
36 | - boat
37 | - body
38 | - home.dest
39 |
40 | numerical_vars:
41 | - age
42 | - fare
43 |
44 | categorical_vars:
45 | - sex
46 | - cabin
47 | - embarked
48 | - title
49 |
50 | cabin_vars:
51 | - cabin
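Note: this file is loaded by the pydantic config objects in config/core.py. A minimal sketch of the raw parse, assuming PyYAML is available:

    import yaml

    with open("classification_model/config.yml") as f:
        cfg = yaml.safe_load(f)

    print(cfg["target"])        # survived
    print(cfg["features"][:3])  # ['pclass', 'sex', 'age']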
--------------------------------------------------------------------------------
/assignment-section-05/README.md:
--------------------------------------------------------------------------------
1 | # Productionized Titanic Classification Model Package
2 |
3 | ## Run With Tox (Recommended)
4 | - Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl
5 | - Save the file as `raw.csv` in the classification_model/datasets directory
6 | - `pip install tox`
7 | - Make sure you are in the assignment-section-05 directory (where the tox.ini file is), then run the command: `tox`. This runs the tests and typechecks, and trains the model under the hood. The first time you run it, tox creates a virtual env and
8 | installs the dependencies, so it takes a few minutes.
9 |
10 | ## Run Without Tox
11 | - Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl
12 | - Save the file as `raw.csv` in the classification_model/datasets directory
13 | - Add assignment-section-05 *and* classification_model paths to your system PYTHONPATH
14 | - `pip install -r requirements/test_requirements.txt`
15 | - Train the model: `python classification_model/train_pipeline.py`
16 | - Run the tests `pytest tests`
--------------------------------------------------------------------------------
/packages/ml_api/tests/test_validation.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from regression_model.config import config
4 | from regression_model.processing.data_management import load_dataset
5 |
6 |
7 | def test_prediction_endpoint_validation_200(flask_test_client):
8 | # Given
9 | # Load the test data from the regression_model package.
10 | # This is important as it makes it harder for the test
11 | # data versions to get confused by not spreading it
12 | # across packages.
13 | test_data = load_dataset(file_name=config.TESTING_DATA_FILE)
14 | post_json = test_data.to_json(orient='records')
15 |
16 | # When
17 | response = flask_test_client.post('/v1/predict/regression',
18 | json=json.loads(post_json))
19 |
20 | # Then
21 | assert response.status_code == 200
22 | response_json = json.loads(response.data)
23 |
24 | # Check correct number of errors removed
25 | assert len(response_json.get('predictions')) + len(
26 | response_json.get('errors')) == len(test_data)
27 |
--------------------------------------------------------------------------------
/scripts/fetch_kaggle_large_dataset.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | TRAINING_DATA_URL="vbookshelf/v2-plant-seedlings-dataset"
4 | NOW=$(date)
5 |
6 | kaggle datasets download -d $TRAINING_DATA_URL -p packages/neural_network_model/neural_network_model/datasets/ && \
7 | unzip packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset.zip -d packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset && \
8 | echo $TRAINING_DATA_URL 'retrieved on:' $NOW > packages/neural_network_model/neural_network_model/datasets/training_data_reference.txt && \
9 | mkdir -p "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherds Purse" && \
10 | mv -v "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherd’s Purse/"* "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherds Purse"
11 | rm -rf "./packages/neural_network_model/neural_network_model/datasets/v2-plant-seedlings-dataset/Shepherd’s Purse"
--------------------------------------------------------------------------------
/scripts/publish_model.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Building packages and uploading them to a Gemfury repository
4 |
5 | GEMFURY_URL=$GEMFURY_PUSH_URL
6 |
7 | set -e
8 |
9 | DIRS="$@"
10 | BASE_DIR=$(pwd)
11 | SETUP="setup.py"
12 |
13 | warn() {
14 | echo "$@" 1>&2
15 | }
16 |
17 | die() {
18 | warn "$@"
19 | exit 1
20 | }
21 |
22 | build() {
23 | DIR="${1/%\//}"
24 | echo "Checking directory $DIR"
25 | cd "$BASE_DIR/$DIR"
26 | [ ! -e $SETUP ] && warn "No $SETUP file, skipping" && return
27 | PACKAGE_NAME=$(python $SETUP --fullname)
28 | echo "Package $PACKAGE_NAME"
29 | python "$SETUP" sdist bdist_wheel || die "Building package $PACKAGE_NAME failed"
30 | for X in $(ls dist)
31 | do
32 | curl -F package=@"dist/$X" "$GEMFURY_URL" || die "Uploading package $PACKAGE_NAME failed on file dist/$X"
33 | done
34 | }
35 |
36 | if [ -n "$DIRS" ]; then
37 | for dir in $DIRS; do
38 | build $dir
39 | done
40 | else
41 | ls -d */ | while read dir; do
42 | build $dir
43 | done
44 | fi
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/publish_model.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Building packages and uploading them to a Gemfury repository
4 |
5 | GEMFURY_URL=$GEMFURY_PUSH_URL
6 |
7 | set -e
8 |
9 | DIRS="$@"
10 | BASE_DIR=$(pwd)
11 | SETUP="setup.py"
12 |
13 | warn() {
14 | echo "$@" 1>&2
15 | }
16 |
17 | die() {
18 | warn "$@"
19 | exit 1
20 | }
21 |
22 | build() {
23 | DIR="${1/%\//}"
24 | echo "Checking directory $DIR"
25 | cd "$BASE_DIR/$DIR"
26 | [ ! -e $SETUP ] && warn "No $SETUP file, skipping" && return
27 | PACKAGE_NAME=$(python $SETUP --fullname)
28 | echo "Package $PACKAGE_NAME"
29 | python "$SETUP" sdist bdist_wheel || die "Building package $PACKAGE_NAME failed"
30 | for X in $(ls dist)
31 | do
32 | curl -F package=@"dist/$X" "$GEMFURY_URL" || die "Uploading package $PACKAGE_NAME failed on file dist/$X"
33 | done
34 | }
35 |
36 | if [ -n "$DIRS" ]; then
37 | for dir in $DIRS; do
38 | build $dir
39 | done
40 | else
41 | ls -d */ | while read dir; do
42 | build $dir
43 | done
44 | fi
--------------------------------------------------------------------------------
/section-05-production-model-package/regression_model/train_pipeline.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from config.core import config
3 | from pipeline import price_pipe
4 | from processing.data_manager import load_dataset, save_pipeline
5 | from sklearn.model_selection import train_test_split
6 |
7 |
8 | def run_training() -> None:
9 | """Train the model."""
10 |
11 | # read training data
12 | data = load_dataset(file_name=config.app_config.training_data_file)
13 |
14 | # divide train and test
15 | X_train, X_test, y_train, y_test = train_test_split(
16 | data[config.model_config.features], # predictors
17 | data[config.model_config.target],
18 | test_size=config.model_config.test_size,
19 | # we are setting the random seed here
20 | # for reproducibility
21 | random_state=config.model_config.random_state,
22 | )
23 | y_train = np.log(y_train)
24 |
25 | # fit model
26 | price_pipe.fit(X_train, y_train)
27 |
28 | # persist trained model
29 | save_pipeline(pipeline_to_persist=price_pipe)
30 |
31 |
32 | if __name__ == "__main__":
33 | run_training()
34 |
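Note: the np.log on line 23 means the pipeline is fitted against log(price); predict.py inverts it with np.exp so callers receive prices in the original units. The round trip is exact up to floating point:

    import numpy as np

    price = 113_422.0
    assert np.isclose(np.exp(np.log(price)), price)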
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/regression_model/train_pipeline.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from config.core import config
3 | from pipeline import price_pipe
4 | from processing.data_manager import load_dataset, save_pipeline
5 | from sklearn.model_selection import train_test_split
6 |
7 |
8 | def run_training() -> None:
9 | """Train the model."""
10 |
11 | # read training data
12 | data = load_dataset(file_name=config.app_config.training_data_file)
13 |
14 | # divide train and test
15 | X_train, X_test, y_train, y_test = train_test_split(
16 | data[config.model_config.features], # predictors
17 | data[config.model_config.target],
18 | test_size=config.model_config.test_size,
19 | # we are setting the random seed here
20 | # for reproducibility
21 | random_state=config.model_config.random_state,
22 | )
23 | y_train = np.log(y_train)
24 |
25 | # fit model
26 | price_pipe.fit(X_train, y_train)
27 |
28 | # persist trained model
29 | save_pipeline(pipeline_to_persist=price_pipe)
30 |
31 |
32 | if __name__ == "__main__":
33 | run_training()
34 |
--------------------------------------------------------------------------------
/packages/ml_api/tests/capture_model_predictions.py:
--------------------------------------------------------------------------------
1 | """
2 | This script should only be run in CI.
3 | Never run it locally or you will disrupt the
4 | differential test versioning logic.
5 | """
6 |
7 | import pandas as pd
8 |
9 | from regression_model.predict import make_prediction
10 | from regression_model.processing.data_management import load_dataset
11 |
12 | from api import config
13 |
14 |
15 | def capture_predictions() -> None:
16 | """Save the test data predictions to a CSV."""
17 |
18 | save_file = 'test_data_predictions.csv'
19 | test_data = load_dataset(file_name='test.csv')
20 |
21 | # we take a slice with no input validation issues
22 | multiple_test_input = test_data[99:600]
23 |
24 | predictions = make_prediction(input_data=multiple_test_input)
25 |
26 | # save predictions for the test dataset
27 | predictions_df = pd.DataFrame(predictions)
28 |
29 | # hack here to save the file to the regression model
30 | # package of the repo, not the installed package
31 | predictions_df.to_csv(f'{config.PACKAGE_ROOT}/{save_file}')
32 |
33 |
34 | if __name__ == '__main__':
35 | capture_predictions()
36 |
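Note: the saved CSV becomes the baseline that the differential tests compare fresh predictions against. A self-contained sketch of that comparison (names and tolerance are illustrative, not the actual test code):

    import math

    def assert_no_drift(old_preds, new_preds, abs_tol=50.0):
        """Fail if any prediction moved more than abs_tol between versions."""
        for old, new in zip(old_preds, new_preds):
            assert math.isclose(old, new, abs_tol=abs_tol)

    assert_no_drift([113_400.0, 99_000.0], [113_410.0, 99_020.0])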
--------------------------------------------------------------------------------
/packages/regression_model/tests/test_predict.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | from regression_model.predict import make_prediction
4 | from regression_model.processing.data_management import load_dataset
5 |
6 |
7 | def test_make_single_prediction():
8 | # Given
9 | test_data = load_dataset(file_name='test.csv')
10 | single_test_input = test_data[0:1]
11 |
12 | # When
13 | subject = make_prediction(input_data=single_test_input)
14 |
15 | # Then
16 | assert subject is not None
17 | assert isinstance(subject.get('predictions')[0], float)
18 | assert math.ceil(subject.get('predictions')[0]) == 112476
19 |
20 |
21 | def test_make_multiple_predictions():
22 | # Given
23 | test_data = load_dataset(file_name='test.csv')
24 | original_data_length = len(test_data)
25 | multiple_test_input = test_data
26 |
27 | # When
28 | subject = make_prediction(input_data=multiple_test_input)
29 |
30 | # Then
31 | assert subject is not None
32 | assert len(subject.get('predictions')) == 1451
33 |
34 | # We expect some rows to be filtered out
35 | assert len(subject.get('predictions')) != original_data_length
36 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/train_pipeline.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.model_selection import train_test_split
3 |
4 | from regression_model import pipeline
5 | from regression_model.processing.data_management import load_dataset, save_pipeline
6 | from regression_model.config import config
7 | from regression_model import __version__ as _version
8 |
9 | import logging
10 |
11 |
12 | _logger = logging.getLogger(__name__)
13 |
14 |
15 | def run_training() -> None:
16 | """Train the model."""
17 |
18 | # read training data
19 | data = load_dataset(file_name=config.TRAINING_DATA_FILE)
20 |
21 | # divide train and test
22 | X_train, X_test, y_train, y_test = train_test_split(
23 | data[config.FEATURES], data[config.TARGET], test_size=0.1, random_state=0
24 | ) # we are setting the seed here
25 |
26 | # transform the target
27 | y_train = np.log(y_train)
28 |
29 | pipeline.price_pipe.fit(X_train[config.FEATURES], y_train)
30 |
31 | _logger.info(f"saving model version: {_version}")
32 | save_pipeline(pipeline_to_persist=pipeline.price_pipe)
33 |
34 |
35 | if __name__ == "__main__":
36 | run_training()
37 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/processing/features.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.base import BaseEstimator, TransformerMixin
3 |
4 | from regression_model.processing.errors import InvalidModelInputError
5 |
6 |
7 | class LogTransformer(BaseEstimator, TransformerMixin):
8 | """Logarithm transformer."""
9 |
10 | def __init__(self, variables=None):
11 | if not isinstance(variables, list):
12 | self.variables = [variables]
13 | else:
14 | self.variables = variables
15 |
16 | def fit(self, X, y=None):
17 | # to accommodate the pipeline
18 | return self
19 |
20 | def transform(self, X):
21 | X = X.copy()
22 |
23 | # check that the values are non-negative for log transform
24 | if not (X[self.variables] > 0).all().all():
25 | vars_ = [var for var in self.variables if (X[var] <= 0).any()]
26 | raise InvalidModelInputError(
27 | f"Variables contain zero or negative values, "
28 | f"can't apply log for vars: {vars_}"
29 | )
30 |
31 | for feature in self.variables:
32 | X[feature] = np.log(X[feature])
33 |
34 | return X
35 |
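Note: hypothetical usage with invented values; a zero or negative input would raise InvalidModelInputError instead:

    import pandas as pd

    df = pd.DataFrame({"LotArea": [8450, 9600, 11250]})
    out = LogTransformer(variables=["LotArea"]).fit_transform(df)
    print(out["LotArea"].round(2).tolist())  # [9.04, 9.17, 9.33]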
--------------------------------------------------------------------------------
/assignment-section-05/classification_model/predict.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 |
3 | import pandas as pd
4 |
5 | from classification_model import __version__ as _version
6 | from classification_model.config.core import config
7 | from classification_model.processing.data_manager import load_pipeline
8 | from classification_model.processing.validation import validate_inputs
9 |
10 | pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
11 | _titanic_pipe = load_pipeline(file_name=pipeline_file_name)
12 |
13 |
14 | def make_prediction(
15 | *,
16 | input_data: t.Union[pd.DataFrame, dict],
17 | ) -> dict:
18 | """Make a prediction using a saved model pipeline."""
19 |
20 | data = pd.DataFrame(input_data)
21 | validated_data, errors = validate_inputs(input_data=data)
22 | results = {"predictions": None, "version": _version, "errors": errors}
23 |
24 | if not errors:
25 | predictions = _titanic_pipe.predict(
26 | X=validated_data[config.model_config.features]
27 | )
28 | results = {
29 | "predictions": predictions,
30 | "version": _version,
31 | "errors": errors,
32 | }
33 |
34 | return results
35 |
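Note: a hypothetical call, assuming the pipeline has been trained and persisted first (python classification_model/train_pipeline.py). The record follows the raw openml Titanic schema; validate_inputs derives the engineered fields (such as title, generated from name) and drops the unused ones before prediction:

    sample = {
        "pclass": [1], "name": ["Doe, Mrs. Jane"], "sex": ["female"],
        "age": [30], "sibsp": [0], "parch": [0], "ticket": ["12345"],
        "fare": [80.0], "cabin": ["C22"], "embarked": ["S"],
        "boat": [None], "body": [None], "home.dest": [None],
    }
    result = make_prediction(input_data=sample)
    print(result["predictions"], result["errors"])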
--------------------------------------------------------------------------------
/assignment-section-05/classification_model/train_pipeline.py:
--------------------------------------------------------------------------------
1 | from sklearn.model_selection import train_test_split
2 |
3 | from classification_model.config.core import config
4 | from classification_model.pipeline import titanic_pipe
5 | from classification_model.processing.data_manager import load_dataset, save_pipeline
6 |
7 |
8 | def run_training() -> None:
9 | """
10 | Train the model.
11 |
12 | Training data can be found here:
13 | https://www.openml.org/data/get_csv/16826755/phpMYEkMl
14 | """
15 |
16 | # read training data
17 | data = load_dataset(file_name=config.app_config.raw_data_file)
18 |
19 | # divide train and test
20 | X_train, X_test, y_train, y_test = train_test_split(
21 | data[config.model_config.features], # predictors
22 | data[config.model_config.target],
23 | test_size=config.model_config.test_size,
24 | # we are setting the random seed here
25 | # for reproducibility
26 | random_state=config.model_config.random_state,
27 | )
28 |
29 | # fit model
30 | titanic_pipe.fit(X_train, y_train)
31 |
32 | # persist trained model
33 | save_pipeline(pipeline_to_persist=titanic_pipe)
34 |
35 |
36 | if __name__ == "__main__":
37 | run_training()
38 |
--------------------------------------------------------------------------------
/section-05-production-model-package/regression_model/predict.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 | from regression_model import __version__ as _version
7 | from regression_model.config.core import config
8 | from regression_model.processing.data_manager import load_pipeline
9 | from regression_model.processing.validation import validate_inputs
10 |
11 | pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
12 | _price_pipe = load_pipeline(file_name=pipeline_file_name)
13 |
14 |
15 | def make_prediction(
16 | *,
17 | input_data: t.Union[pd.DataFrame, dict],
18 | ) -> dict:
19 | """Make a prediction using a saved model pipeline."""
20 |
21 | data = pd.DataFrame(input_data)
22 | validated_data, errors = validate_inputs(input_data=data)
23 | results = {"predictions": None, "version": _version, "errors": errors}
24 |
25 | if not errors:
26 | predictions = _price_pipe.predict(
27 | X=validated_data[config.model_config.features]
28 | )
29 | results = {
30 | "predictions": [np.exp(pred) for pred in predictions], # type: ignore
31 | "version": _version,
32 | "errors": errors,
33 | }
34 |
35 | return results
36 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/regression_model/predict.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 | from regression_model import __version__ as _version
7 | from regression_model.config.core import config
8 | from regression_model.processing.data_manager import load_pipeline
9 | from regression_model.processing.validation import validate_inputs
10 |
11 | pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
12 | _price_pipe = load_pipeline(file_name=pipeline_file_name)
13 |
14 |
15 | def make_prediction(
16 | *,
17 | input_data: t.Union[pd.DataFrame, dict],
18 | ) -> dict:
19 | """Make a prediction using a saved model pipeline."""
20 |
21 | data = pd.DataFrame(input_data)
22 | validated_data, errors = validate_inputs(input_data=data)
23 | results = {"predictions": None, "version": _version, "errors": errors}
24 |
25 | if not errors:
26 | predictions = _price_pipe.predict(
27 | X=validated_data[config.model_config.features]
28 | )
29 | results = {
30 | "predictions": [np.exp(pred) for pred in predictions], # type: ignore
31 | "version": _version,
32 | "errors": errors,
33 | }
34 |
35 | return results
36 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/processing/validation.py:
--------------------------------------------------------------------------------
1 | from regression_model.config import config
2 |
3 | import pandas as pd
4 |
5 |
6 | def validate_inputs(input_data: pd.DataFrame) -> pd.DataFrame:
7 | """Check model inputs for unprocessable values."""
8 |
9 | validated_data = input_data.copy()
10 |
11 | # check for numerical variables with NA not seen during training
12 | if input_data[config.NUMERICAL_NA_NOT_ALLOWED].isnull().any().any():
13 | validated_data = validated_data.dropna(
14 | axis=0, subset=config.NUMERICAL_NA_NOT_ALLOWED
15 | )
16 |
17 | # check for categorical variables with NA not seen during training
18 | if input_data[config.CATEGORICAL_NA_NOT_ALLOWED].isnull().any().any():
19 | validated_data = validated_data.dropna(
20 | axis=0, subset=config.CATEGORICAL_NA_NOT_ALLOWED
21 | )
22 |
23 | # check for values <= 0 for the log transformed variables
24 | if (input_data[config.NUMERICALS_LOG_VARS] <= 0).any().any():
25 | # keep only the rows where every log-transform variable is positive
26 | validated_data = validated_data[
27 | (validated_data[config.NUMERICALS_LOG_VARS] > 0).all(axis=1)
28 | ]
29 |
30 | return validated_data
31 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/predict.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from regression_model.processing.data_management import load_pipeline
5 | from regression_model.config import config
6 | from regression_model.processing.validation import validate_inputs
7 | from regression_model import __version__ as _version
8 |
9 | import logging
10 | import typing as t
11 |
12 |
13 | _logger = logging.getLogger(__name__)
14 |
15 | pipeline_file_name = f"{config.PIPELINE_SAVE_FILE}{_version}.pkl"
16 | _price_pipe = load_pipeline(file_name=pipeline_file_name)
17 |
18 |
19 | def make_prediction(*, input_data: t.Union[pd.DataFrame, dict],
20 | ) -> dict:
21 | """Make a prediction using a saved model pipeline.
22 |
23 | Args:
24 | input_data: Array of model prediction inputs.
25 |
26 | Returns:
27 | Predictions for each input row, as well as the model version.
28 | """
29 |
30 | data = pd.DataFrame(input_data)
31 | validated_data = validate_inputs(input_data=data)
32 |
33 | prediction = _price_pipe.predict(validated_data[config.FEATURES])
34 |
35 | output = np.exp(prediction)
36 |
37 | results = {"predictions": output, "version": _version}
38 |
39 | _logger.info(
40 | f"Making predictions with model version: {_version} "
41 | f"Inputs: {validated_data} "
42 | f"Predictions: {results}"
43 | )
44 |
45 | return results
46 |
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/config/config.py:
--------------------------------------------------------------------------------
1 | # The Keras model loading function does not play well with
2 | # Pathlib at the moment, so we are using the old os module
3 | # style
4 |
5 | import os
6 |
7 | PWD = os.path.dirname(os.path.abspath(__file__))
8 | PACKAGE_ROOT = os.path.abspath(os.path.join(PWD, '..'))
9 | DATASET_DIR = os.path.join(PACKAGE_ROOT, 'datasets')
10 | TRAINED_MODEL_DIR = os.path.join(PACKAGE_ROOT, 'trained_models')
11 | DATA_FOLDER = os.path.join(DATASET_DIR, 'v2-plant-seedlings-dataset')
12 |
13 | # MODEL PERSISTING
14 | MODEL_NAME = 'cnn_model'
15 | PIPELINE_NAME = 'cnn_pipe'
16 | CLASSES_NAME = 'classes'
17 | ENCODER_NAME = 'encoder'
18 |
19 | # MODEL FITTING
20 | IMAGE_SIZE = 150 # 50 for testing, 150 for final model
21 | BATCH_SIZE = 10
22 | EPOCHS = int(os.environ.get('EPOCHS', 1)) # 1 for testing, 10 for final model
23 |
24 |
25 | with open(os.path.join(PACKAGE_ROOT, 'VERSION')) as version_file:
26 | _version = version_file.read().strip()
27 |
28 | MODEL_FILE_NAME = f'{MODEL_NAME}_{_version}.h5'
29 | MODEL_PATH = os.path.join(TRAINED_MODEL_DIR, MODEL_FILE_NAME)
30 |
31 | PIPELINE_FILE_NAME = f'{PIPELINE_NAME}_{_version}.pkl'
32 | PIPELINE_PATH = os.path.join(TRAINED_MODEL_DIR, PIPELINE_FILE_NAME)
33 |
34 | CLASSES_FILE_NAME = f'{CLASSES_NAME}_{_version}.pkl'
35 | CLASSES_PATH = os.path.join(TRAINED_MODEL_DIR, CLASSES_FILE_NAME)
36 |
37 | ENCODER_FILE_NAME = f'{ENCODER_NAME}_{_version}.pkl'
38 | ENCODER_PATH = os.path.join(TRAINED_MODEL_DIR, ENCODER_FILE_NAME)
39 |
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/processing/preprocessors.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import cv2
3 | from keras.utils import np_utils
4 | from sklearn.preprocessing import LabelEncoder
5 | from sklearn.base import BaseEstimator, TransformerMixin
6 |
7 |
8 | class TargetEncoder(BaseEstimator, TransformerMixin):
9 |
10 | def __init__(self, encoder=LabelEncoder()):
11 | self.encoder = encoder
12 |
13 | def fit(self, X, y=None):
14 | # note that X holds the target labels in this case
15 | self.encoder.fit(X)
16 | return self
17 |
18 | def transform(self, X):
19 | X = X.copy()
20 | X = np_utils.to_categorical(self.encoder.transform(X))
21 | return X
22 |
23 |
24 | def _im_resize(df, n, image_size):
25 | im = cv2.imread(df[n])
26 | im = cv2.resize(im, (image_size, image_size))
27 | return im
28 |
29 |
30 | class CreateDataset(BaseEstimator, TransformerMixin):
31 |
32 | def __init__(self, image_size=50):
33 | self.image_size = image_size
34 |
35 | def fit(self, X, y=None):
36 | return self
37 |
38 | def transform(self, X):
39 | X = X.copy()
40 | tmp = np.zeros((len(X),
41 | self.image_size,
42 | self.image_size, 3), dtype='float32')
43 |
44 | for n in range(0, len(X)):
45 | im = _im_resize(X, n, self.image_size)
46 | tmp[n] = im
47 |
48 | print('Dataset Images shape: {} size: {:,}'.format(
49 | tmp.shape, tmp.size))
50 | return tmp
51 |
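Note: hypothetical usage; a dummy image is synthesised on disk so the sketch is self-contained (assumes cv2, i.e. opencv-python, is installed):

    import cv2
    import numpy as np
    import pandas as pd

    # write a 100x100 black RGB image, then build a (1, 50, 50, 3) batch
    cv2.imwrite("tmp_plant.png", np.zeros((100, 100, 3), dtype="uint8"))
    batch = CreateDataset(image_size=50).fit_transform(pd.Series(["tmp_plant.png"]))
    print(batch.shape)  # (1, 50, 50, 3)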
--------------------------------------------------------------------------------
/assignment-section-05/classification_model/processing/validation.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Tuple, Union
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from pydantic import BaseModel, ValidationError
6 |
7 | from classification_model.config.core import config
8 | from classification_model.processing.data_manager import pre_pipeline_preparation
9 |
10 |
11 | def validate_inputs(*, input_data: pd.DataFrame) -> Tuple[pd.DataFrame, Optional[dict]]:
12 | """Check model inputs for unprocessable values."""
13 |
14 | pre_processed = pre_pipeline_preparation(dataframe=input_data)
15 | validated_data = pre_processed[config.model_config.features].copy()
16 | errors = None
17 |
18 | try:
19 | # replace numpy nans so that pydantic can validate
20 | MultipleTitanicDataInputs(
21 | inputs=validated_data.replace({np.nan: None}).to_dict(orient="records")
22 | )
23 | except ValidationError as error:
24 | errors = error.json()
25 |
26 | return validated_data, errors
27 |
28 |
29 | class TitanicDataInputSchema(BaseModel):
30 | pclass: Optional[int]
31 | name: Optional[str]
32 | sex: Optional[str]
33 | age: Optional[int]
34 | sibsp: Optional[int]
35 | parch: Optional[int]
36 | ticket: Optional[int]
37 | fare: Optional[float]
38 | cabin: Optional[str]
39 | embarked: Optional[str]
40 | boat: Optional[Union[str, int]]
41 | body: Optional[int]
42 | # TODO: rename home.dest, can get away with it now as it is not used
43 |
44 |
45 | class MultipleTitanicDataInputs(BaseModel):
46 | inputs: List[TitanicDataInputSchema]
47 |
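Note: a sketch of the failure path (pydantic v1 semantics, as used here): a record whose age cannot be coerced to an int fails validation, and the error serialises to the JSON string that validate_inputs returns:

    from pydantic import ValidationError

    try:
        MultipleTitanicDataInputs(inputs=[{"age": "not-a-number"}])
    except ValidationError as error:
        print(error.json())  # [{"loc": ["inputs", 0, "age"], "msg": ..., ...}]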
--------------------------------------------------------------------------------
/section-06-model-serving-api/house-prices-api/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox is a generic virtualenv management and test command line tool. Its goal is to
2 | # standardize testing in Python. We will be using it extensively in this course.
3 |
4 | # Using Tox we can (on multiple operating systems):
5 | # + Eliminate PYTHONPATH challenges when running scripts/tests
6 | # + Eliminate virtualenv setup confusion
7 | # + Streamline steps such as model training, model publishing
8 |
9 | [pytest]
10 | log_cli_level=WARNING
11 |
12 | [tox]
13 | envlist = test_app, typechecks, stylechecks, lint
14 | skipsdist = True
15 |
16 | [testenv]
17 | install_command = pip install {opts} {packages}
18 |
19 | [testenv:test_app]
20 | deps =
21 | -rtest_requirements.txt
22 |
23 | setenv =
24 | PYTHONPATH=.
25 | PYTHONHASHSEED=0
26 |
27 | commands=
28 | pytest \
29 | -vv \
30 | {posargs:app/tests/}
31 |
32 | [testenv:run]
33 | envdir = {toxworkdir}/test_app
34 | deps =
35 | {[testenv:test_app]deps}
36 |
37 | setenv =
38 | {[testenv:test_app]setenv}
39 |
40 | commands=
41 | python app/main.py
42 |
43 |
44 | [testenv:typechecks]
45 | envdir = {toxworkdir}/test_app
46 |
47 | deps =
48 | {[testenv:test_app]deps}
49 |
50 | commands = {posargs:mypy app}
51 |
52 |
53 | [testenv:stylechecks]
54 | envdir = {toxworkdir}/test_app
55 |
56 | deps =
57 | {[testenv:test_app]deps}
58 |
59 | commands = {posargs:flake8 app}
60 |
61 |
62 | [testenv:lint]
63 | envdir = {toxworkdir}/test_app
64 |
65 | deps =
66 | {[testenv:test_app]deps}
67 |
68 | commands =
69 | isort app
70 | black app
71 |
72 | [flake8]
73 | exclude = .git,__pycache__,__init__.py,.mypy_cache,.pytest_cache,.venv,alembic
74 | max-line-length = 88
--------------------------------------------------------------------------------
/section-04-research-and-development/preprocessors.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from sklearn.base import BaseEstimator, TransformerMixin
5 |
6 |
7 |
8 | class TemporalVariableTransformer(BaseEstimator, TransformerMixin):
9 | # Temporal elapsed time transformer
10 |
11 | def __init__(self, variables, reference_variable):
12 |
13 | if not isinstance(variables, list):
14 | raise ValueError('variables should be a list')
15 |
16 | self.variables = variables
17 | self.reference_variable = reference_variable
18 |
19 | def fit(self, X, y=None):
20 | # we need this step to fit the sklearn pipeline
21 | return self
22 |
23 | def transform(self, X):
24 |
25 | # so that we do not over-write the original dataframe
26 | X = X.copy()
27 |
28 | for feature in self.variables:
29 | X[feature] = X[self.reference_variable] - X[feature]
30 |
31 | return X
32 |
33 |
34 |
35 | # categorical missing value imputer
36 | class Mapper(BaseEstimator, TransformerMixin):
37 |
38 | def __init__(self, variables, mappings):
39 |
40 | if not isinstance(variables, list):
41 | raise ValueError('variables should be a list')
42 |
43 | self.variables = variables
44 | self.mappings = mappings
45 |
46 | def fit(self, X, y=None):
47 | # we need the fit statement to accommodate the sklearn pipeline
48 | return self
49 |
50 | def transform(self, X):
51 | X = X.copy()
52 | for feature in self.variables:
53 | X[feature] = X[feature].map(self.mappings)
54 |
55 | return X
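Note: hypothetical usage of both transformers with invented data:

    import pandas as pd

    df = pd.DataFrame({"YrSold": [2010, 2012], "YearBuilt": [2000, 1990],
                       "ExterQual": ["Gd", "Ex"]})
    df = TemporalVariableTransformer(
        variables=["YearBuilt"], reference_variable="YrSold").fit_transform(df)
    df = Mapper(variables=["ExterQual"], mappings={"Gd": 4, "Ex": 5}).fit_transform(df)
    print(df["YearBuilt"].tolist())   # [10, 22]
    print(df["ExterQual"].tolist())   # [4, 5]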
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2019, Soledad Galli and Christopher Samiullah. Deployment of Machine Learning Models, online course.
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/pipeline.py:
--------------------------------------------------------------------------------
1 | from sklearn.linear_model import Lasso
2 | from sklearn.pipeline import Pipeline
3 | from sklearn.preprocessing import MinMaxScaler
4 |
5 | from regression_model.processing import preprocessors as pp
6 | from regression_model.processing import features
7 | from regression_model.config import config
8 |
9 | import logging
10 |
11 |
12 | _logger = logging.getLogger(__name__)
13 |
14 |
15 | price_pipe = Pipeline(
16 | [
17 | (
18 | "categorical_imputer",
19 | pp.CategoricalImputer(variables=config.CATEGORICAL_VARS_WITH_NA),
20 | ),
21 | (
22 | "numerical_inputer",
23 | pp.NumericalImputer(variables=config.NUMERICAL_VARS_WITH_NA),
24 | ),
25 | (
26 | "temporal_variable",
27 | pp.TemporalVariableEstimator(
28 | variables=config.TEMPORAL_VARS, reference_variable=config.DROP_FEATURES
29 | ),
30 | ),
31 | (
32 | "rare_label_encoder",
33 | pp.RareLabelCategoricalEncoder(tol=0.01, variables=config.CATEGORICAL_VARS),
34 | ),
35 | (
36 | "categorical_encoder",
37 | pp.CategoricalEncoder(variables=config.CATEGORICAL_VARS),
38 | ),
39 | (
40 | "log_transformer",
41 | features.LogTransformer(variables=config.NUMERICALS_LOG_VARS),
42 | ),
43 | (
44 | "drop_features",
45 | pp.DropUnecessaryFeatures(variables_to_drop=config.DROP_FEATURES),
46 | ),
47 | ("scaler", MinMaxScaler()),
48 | ("Linear_model", Lasso(alpha=0.005, random_state=0)),
49 | ]
50 | )
51 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/house-prices-api/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox is a generic virtualenv management and test command line tool. Its goal is to
2 | # standardize testing in Python. We will be using it extensively in this course.
3 |
4 | # Using Tox we can (on multiple operating systems):
5 | # + Eliminate PYTHONPATH challenges when running scripts/tests
6 | # + Eliminate virtualenv setup confusion
7 | # + Streamline steps such as model training, model publishing
8 |
9 | [pytest]
10 | log_cli_level=WARNING
11 |
12 | [tox]
13 | envlist = test_app, typechecks, stylechecks, lint
14 | skipsdist = True
15 |
16 | [testenv]
17 | install_command = pip install {opts} {packages}
18 |
19 | passenv =
20 | PIP_EXTRA_INDEX_URL
21 |
22 | [testenv:test_app]
23 | deps =
24 | -rtest_requirements.txt
25 |
26 | setenv =
27 | PYTHONPATH=.
28 | PYTHONHASHSEED=0
29 |
30 | commands=
31 | pytest \
32 | -vv \
33 | {posargs:app/tests/}
34 |
35 | [testenv:run]
36 | envdir = {toxworkdir}/test_app
37 | deps =
38 | {[testenv:test_app]deps}
39 |
40 | setenv =
41 | {[testenv:test_app]setenv}
42 |
43 | commands=
44 | python app/main.py
45 |
46 |
47 | [testenv:typechecks]
48 | envdir = {toxworkdir}/test_app
49 |
50 | deps =
51 | {[testenv:test_app]deps}
52 |
53 | commands = {posargs:mypy app}
54 |
55 |
56 | [testenv:stylechecks]
57 | envdir = {toxworkdir}/test_app
58 |
59 | deps =
60 | {[testenv:test_app]deps}
61 |
62 | commands = {posargs:flake8 app}
63 |
64 |
65 | [testenv:lint]
66 | envdir = {toxworkdir}/test_app
67 |
68 | deps =
69 | {[testenv:test_app]deps}
70 |
71 | commands =
72 | isort app
73 | black app
74 | mypy app
75 | flake8 app
76 |
77 | [flake8]
78 | exclude = .git,__pycache__,__init__.py,.mypy_cache,.pytest_cache,.venv,alembic
79 | max-line-length = 88
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/house-prices-api/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox is a generic virtualenv management and test command line tool. Its goal is to
2 | # standardize testing in Python. We will be using it extensively in this course.
3 |
4 | # Using Tox we can (on multiple operating systems):
5 | # + Eliminate PYTHONPATH challenges when running scripts/tests
6 | # + Eliminate virtualenv setup confusion
7 | # + Streamline steps such as model training, model publishing
8 |
9 | [pytest]
10 | log_cli_level=WARNING
11 |
12 | [tox]
13 | envlist = test_app, typechecks, stylechecks, lint
14 | skipsdist = True
15 |
16 | [testenv]
17 | install_command = pip install {opts} {packages}
18 |
19 | passenv =
20 | PIP_EXTRA_INDEX_URL
21 |
22 | [testenv:test_app]
23 | deps =
24 | -rtest_requirements.txt
25 |
26 | setenv =
27 | PYTHONPATH=.
28 | PYTHONHASHSEED=0
29 |
30 | commands=
31 | pytest \
32 | -vv \
33 | {posargs:app/tests/}
34 |
35 | [testenv:run]
36 | envdir = {toxworkdir}/test_app
37 | deps =
38 | {[testenv:test_app]deps}
39 |
40 | setenv =
41 | {[testenv:test_app]setenv}
42 |
43 | commands=
44 | python app/main.py
45 |
46 |
47 | [testenv:typechecks]
48 | envdir = {toxworkdir}/test_app
49 |
50 | deps =
51 | {[testenv:test_app]deps}
52 |
53 | commands = {posargs:mypy app}
54 |
55 |
56 | [testenv:stylechecks]
57 | envdir = {toxworkdir}/test_app
58 |
59 | deps =
60 | {[testenv:test_app]deps}
61 |
62 | commands = {posargs:flake8 app}
63 |
64 |
65 | [testenv:lint]
66 | envdir = {toxworkdir}/test_app
67 |
68 | deps =
69 | {[testenv:test_app]deps}
70 |
71 | commands =
72 | isort app
73 | black app
74 | mypy app
75 | flake8 app
76 |
77 | [flake8]
78 | exclude = .git,__pycache__,__init__.py,.mypy_cache,.pytest_cache,.venv,alembic
79 | max-line-length = 88
--------------------------------------------------------------------------------
/section-05-production-model-package/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox is a generic virtualenv management and test command line tool. Its goal is to
2 | # standardize testing in Python. We will be using it extensively in this course.
3 |
4 | # Using Tox we can (on multiple operating systems):
5 | # + Eliminate PYTHONPATH challenges when running scripts/tests
6 | # + Eliminate virtualenv setup confusion
7 | # + Streamline steps such as model training, model publishing
8 |
9 |
10 | [tox]
11 | envlist = test_package, typechecks, lint, stylechecks
12 | skipsdist = True
13 |
14 | [testenv]
15 | install_command = pip install {opts} {packages}
16 |
17 | [testenv:test_package]
18 | deps =
19 | -rrequirements/test_requirements.txt
20 |
21 | setenv =
22 | PYTHONPATH=.
23 | PYTHONHASHSEED=0
24 |
25 | commands=
26 | python regression_model/train_pipeline.py
27 | pytest \
28 | -s \
29 | -vv \
30 | {posargs:tests/}
31 |
32 | [testenv:train]
33 | envdir = {toxworkdir}/test_package
34 | deps =
35 | {[testenv:test_package]deps}
36 |
37 | setenv =
38 | {[testenv:test_package]setenv}
39 |
40 | commands=
41 | python regression_model/train_pipeline.py
42 |
43 |
44 | [testenv:typechecks]
45 | envdir = {toxworkdir}/test_package
46 |
47 | deps =
48 | {[testenv:test_package]deps}
49 |
50 | commands = {posargs:mypy regression_model}
51 |
52 |
53 | [testenv:stylechecks]
54 | envdir = {toxworkdir}/test_package
55 |
56 | deps =
57 | {[testenv:test_package]deps}
58 |
59 | commands = {posargs:flake8 regression_model tests}
60 |
61 |
62 | [testenv:lint]
63 | envdir = {toxworkdir}/test_package
64 |
65 | deps =
66 | {[testenv:test_package]deps}
67 |
68 | commands =
69 | isort regression_model tests
70 | black regression_model tests
71 |
72 | [flake8]
73 | exclude = .git,env
74 | max-line-length = 100
--------------------------------------------------------------------------------
/section-06-model-serving-api/house-prices-api/app/main.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from fastapi import APIRouter, FastAPI, Request
4 | from fastapi.middleware.cors import CORSMiddleware
5 | from fastapi.responses import HTMLResponse
6 | from loguru import logger
7 |
8 | from app.api import api_router
9 | from app.config import settings, setup_app_logging
10 |
11 | # setup logging as early as possible
12 | setup_app_logging(config=settings)
13 |
14 |
15 | app = FastAPI(
16 | title=settings.PROJECT_NAME, openapi_url=f"{settings.API_V1_STR}/openapi.json"
17 | )
18 |
19 | root_router = APIRouter()
20 |
21 |
22 | @root_router.get("/")
23 | def index(request: Request) -> Any:
24 | """Basic HTML response."""
25 | body = (
26 | ""
27 | "
"
28 | "Welcome to the API
"
29 | ""
30 | "Check the docs:
here"
31 | "
"
32 | ""
33 | ""
34 | )
35 |
36 | return HTMLResponse(content=body)
37 |
38 |
39 | app.include_router(api_router, prefix=settings.API_V1_STR)
40 | app.include_router(root_router)
41 |
42 | # Set all CORS enabled origins
43 | if settings.BACKEND_CORS_ORIGINS:
44 | app.add_middleware(
45 | CORSMiddleware,
46 | allow_origins=[str(origin) for origin in settings.BACKEND_CORS_ORIGINS],
47 | allow_credentials=True,
48 | allow_methods=["*"],
49 | allow_headers=["*"],
50 | )
51 |
52 |
53 | if __name__ == "__main__":
54 | # Use this for debugging purposes only
55 | logger.warning("Running in development mode. Do not run like this in production.")
56 | import uvicorn
57 |
58 | uvicorn.run(app, host="localhost", port=8001, log_level="debug")
59 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/house-prices-api/app/main.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from fastapi import APIRouter, FastAPI, Request
4 | from fastapi.middleware.cors import CORSMiddleware
5 | from fastapi.responses import HTMLResponse
6 | from loguru import logger
7 |
8 | from app.api import api_router
9 | from app.config import settings, setup_app_logging
10 |
11 | # setup logging as early as possible
12 | setup_app_logging(config=settings)
13 |
14 |
15 | app = FastAPI(
16 | title=settings.PROJECT_NAME, openapi_url=f"{settings.API_V1_STR}/openapi.json"
17 | )
18 |
19 | root_router = APIRouter()
20 |
21 |
22 | @root_router.get("/")
23 | def index(request: Request) -> Any:
24 | """Basic HTML response."""
25 | body = (
26 | ""
27 | ""
28 | "Welcome to the API
"
29 | ""
30 | "Check the docs:
here"
31 | "
"
32 | ""
33 | ""
34 | )
35 |
36 | return HTMLResponse(content=body)
37 |
38 |
39 | app.include_router(api_router, prefix=settings.API_V1_STR)
40 | app.include_router(root_router)
41 |
42 | # Set all CORS enabled origins
43 | if settings.BACKEND_CORS_ORIGINS:
44 | app.add_middleware(
45 | CORSMiddleware,
46 | allow_origins=[str(origin) for origin in settings.BACKEND_CORS_ORIGINS],
47 | allow_credentials=True,
48 | allow_methods=["*"],
49 | allow_headers=["*"],
50 | )
51 |
52 |
53 | if __name__ == "__main__":
54 | # Use this for debugging purposes only
55 | logger.warning("Running in development mode. Do not run like this in production.")
56 | import uvicorn
57 |
58 | uvicorn.run(app, host="localhost", port=8001, log_level="debug")
59 |
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/house-prices-api/app/main.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from fastapi import APIRouter, FastAPI, Request
4 | from fastapi.middleware.cors import CORSMiddleware
5 | from fastapi.responses import HTMLResponse
6 | from loguru import logger
7 |
8 | from app.api import api_router
9 | from app.config import settings, setup_app_logging
10 |
11 | # setup logging as early as possible
12 | setup_app_logging(config=settings)
13 |
14 |
15 | app = FastAPI(
16 | title=settings.PROJECT_NAME, openapi_url=f"{settings.API_V1_STR}/openapi.json"
17 | )
18 |
19 | root_router = APIRouter()
20 |
21 |
22 | @root_router.get("/")
23 | def index(request: Request) -> Any:
24 | """Basic HTML response."""
25 | body = (
26 | ""
27 | ""
28 | "Welcome to the API
"
29 | ""
30 | "Check the docs:
here"
31 | "
"
32 | ""
33 | ""
34 | )
35 |
36 | return HTMLResponse(content=body)
37 |
38 |
39 | app.include_router(api_router, prefix=settings.API_V1_STR)
40 | app.include_router(root_router)
41 |
42 | # Set all CORS enabled origins
43 | if settings.BACKEND_CORS_ORIGINS:
44 | app.add_middleware(
45 | CORSMiddleware,
46 | allow_origins=[str(origin) for origin in settings.BACKEND_CORS_ORIGINS],
47 | allow_credentials=True,
48 | allow_methods=["*"],
49 | allow_headers=["*"],
50 | )
51 |
52 |
53 | if __name__ == "__main__":
54 | # Use this for debugging purposes only
55 | logger.warning("Running in development mode. Do not run like this in production.")
56 | import uvicorn
57 |
58 | uvicorn.run(app, host="localhost", port=8001, log_level="debug")
59 |
--------------------------------------------------------------------------------
/section-06-model-serving-api/house-prices-api/app/api.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import Any
3 |
4 | import numpy as np
5 | import pandas as pd
6 | from fastapi import APIRouter, HTTPException
7 | from fastapi.encoders import jsonable_encoder
8 | from loguru import logger
9 | from regression_model import __version__ as model_version
10 | from regression_model.predict import make_prediction
11 |
12 | from app import __version__, schemas
13 | from app.config import settings
14 |
15 | api_router = APIRouter()
16 |
17 |
18 | @api_router.get("/health", response_model=schemas.Health, status_code=200)
19 | def health() -> dict:
20 | """
21 |     Health check endpoint.
22 | """
23 | health = schemas.Health(
24 | name=settings.PROJECT_NAME, api_version=__version__, model_version=model_version
25 | )
26 |
27 | return health.dict()
28 |
29 |
30 | @api_router.post("/predict", response_model=schemas.PredictionResults, status_code=200)
31 | async def predict(input_data: schemas.MultipleHouseDataInputs) -> Any:
32 | """
33 | Make house price predictions with the TID regression model
34 | """
35 |
36 | input_df = pd.DataFrame(jsonable_encoder(input_data.inputs))
37 |
38 | # Advanced: You can improve performance of your API by rewriting the
39 |     # `make_prediction` function to be async and using await here.
40 | logger.info(f"Making prediction on inputs: {input_data.inputs}")
41 | results = make_prediction(input_data=input_df.replace({np.nan: None}))
42 |
43 | if results["errors"] is not None:
44 | logger.warning(f"Prediction validation error: {results.get('errors')}")
45 | raise HTTPException(status_code=400, detail=json.loads(results["errors"]))
46 |
47 | logger.info(f"Prediction results: {results.get('predictions')}")
48 |
49 | return results
50 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/house-prices-api/app/api.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import Any
3 |
4 | import numpy as np
5 | import pandas as pd
6 | from fastapi import APIRouter, HTTPException
7 | from fastapi.encoders import jsonable_encoder
8 | from loguru import logger
9 | from regression_model import __version__ as model_version
10 | from regression_model.predict import make_prediction
11 |
12 | from app import __version__, schemas
13 | from app.config import settings
14 |
15 | api_router = APIRouter()
16 |
17 |
18 | @api_router.get("/health", response_model=schemas.Health, status_code=200)
19 | def health() -> dict:
20 | """
21 |     Health check endpoint.
22 | """
23 | health = schemas.Health(
24 | name=settings.PROJECT_NAME, api_version=__version__, model_version=model_version
25 | )
26 |
27 | return health.dict()
28 |
29 |
30 | @api_router.post("/predict", response_model=schemas.PredictionResults, status_code=200)
31 | async def predict(input_data: schemas.MultipleHouseDataInputs) -> Any:
32 | """
33 | Make house price predictions with the TID regression model
34 | """
35 |
36 | input_df = pd.DataFrame(jsonable_encoder(input_data.inputs))
37 |
38 | # Advanced: You can improve performance of your API by rewriting the
39 |     # `make_prediction` function to be async and using await here.
40 | logger.info(f"Making prediction on inputs: {input_data.inputs}")
41 | results = make_prediction(input_data=input_df.replace({np.nan: None}))
42 |
43 | if results["errors"] is not None:
44 | logger.warning(f"Prediction validation error: {results.get('errors')}")
45 | raise HTTPException(status_code=400, detail=json.loads(results["errors"]))
46 |
47 | logger.info(f"Prediction results: {results.get('predictions')}")
48 |
49 | return results
50 |
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/house-prices-api/app/api.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import Any
3 |
4 | import numpy as np
5 | import pandas as pd
6 | from fastapi import APIRouter, HTTPException
7 | from fastapi.encoders import jsonable_encoder
8 | from loguru import logger
9 | from regression_model import __version__ as model_version
10 | from regression_model.predict import make_prediction
11 |
12 | from app import __version__, schemas
13 | from app.config import settings
14 |
15 | api_router = APIRouter()
16 |
17 |
18 | @api_router.get("/health", response_model=schemas.Health, status_code=200)
19 | def health() -> dict:
20 | """
21 |     Health check endpoint.
22 | """
23 | health = schemas.Health(
24 | name=settings.PROJECT_NAME, api_version=__version__, model_version=model_version
25 | )
26 |
27 | return health.dict()
28 |
29 |
30 | @api_router.post("/predict", response_model=schemas.PredictionResults, status_code=200)
31 | async def predict(input_data: schemas.MultipleHouseDataInputs) -> Any:
32 | """
33 | Make house price predictions with the TID regression model
34 | """
35 |
36 | input_df = pd.DataFrame(jsonable_encoder(input_data.inputs))
37 |
38 | # Advanced: You can improve performance of your API by rewriting the
39 |     # `make_prediction` function to be async and using await here.
40 | logger.info(f"Making prediction on inputs: {input_data.inputs}")
41 | results = make_prediction(input_data=input_df.replace({np.nan: None}))
42 |
43 | if results["errors"] is not None:
44 | logger.warning(f"Prediction validation error: {results.get('errors')}")
45 | raise HTTPException(status_code=400, detail=json.loads(results["errors"]))
46 |
47 | logger.info(f"Prediction results: {results.get('predictions')}")
48 |
49 | return results
50 |
--------------------------------------------------------------------------------
/section-05-production-model-package/regression_model/processing/features.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import pandas as pd
4 | from sklearn.base import BaseEstimator, TransformerMixin
5 |
6 |
7 | class TemporalVariableTransformer(BaseEstimator, TransformerMixin):
8 | """Temporal elapsed time transformer."""
9 |
10 | def __init__(self, variables: List[str], reference_variable: str):
11 |
12 | if not isinstance(variables, list):
13 | raise ValueError("variables should be a list")
14 |
15 | self.variables = variables
16 | self.reference_variable = reference_variable
17 |
18 | def fit(self, X: pd.DataFrame, y: pd.Series = None):
19 | # we need this step to fit the sklearn pipeline
20 | return self
21 |
22 | def transform(self, X: pd.DataFrame) -> pd.DataFrame:
23 |
24 |         # so that we do not overwrite the original dataframe
25 | X = X.copy()
26 |
27 | for feature in self.variables:
28 | X[feature] = X[self.reference_variable] - X[feature]
29 |
30 | return X
31 |
32 |
33 | class Mapper(BaseEstimator, TransformerMixin):
34 | """Categorical variable mapper."""
35 |
36 | def __init__(self, variables: List[str], mappings: dict):
37 |
38 | if not isinstance(variables, list):
39 | raise ValueError("variables should be a list")
40 |
41 | self.variables = variables
42 | self.mappings = mappings
43 |
44 | def fit(self, X: pd.DataFrame, y: pd.Series = None):
45 |         # we need the fit statement to accommodate the sklearn pipeline
46 | return self
47 |
48 | def transform(self, X: pd.DataFrame) -> pd.DataFrame:
49 | X = X.copy()
50 | for feature in self.variables:
51 | X[feature] = X[feature].map(self.mappings)
52 |
53 | return X
54 |
--------------------------------------------------------------------------------
/assignment-section-05/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox is a generic virtualenv management and test command line tool. Its goal is to
2 | # standardize testing in Python. We will be using it extensively in this course.
3 |
4 | # Using Tox we can (on multiple operating systems):
5 | # + Eliminate PYTHONPATH challenges when running scripts/tests
6 | # + Eliminate virtualenv setup confusion
7 | # + Streamline steps such as model training, model publishing
8 |
9 |
10 | [tox]
11 | envlist = test_package, typechecks, stylechecks, lint
12 | skipsdist = True
13 |
14 | [testenv]
15 | install_command = pip install {opts} {packages}
16 |
17 | [testenv:test_package]
18 | deps =
19 | -rrequirements/test_requirements.txt
20 |
21 | setenv =
22 | PYTHONPATH=.
23 | PYTHONHASHSEED=0
24 |
25 | commands=
26 | python classification_model/train_pipeline.py
27 | pytest \
28 | -s \
29 | -vv \
30 | {posargs:tests/}
31 |
32 | [testenv:train]
33 | envdir = {toxworkdir}/test_package
34 | deps =
35 | {[testenv:test_package]deps}
36 |
37 | setenv =
38 | {[testenv:test_package]setenv}
39 |
40 | commands=
41 | python classification_model/train_pipeline.py
42 |
43 |
44 | [testenv:typechecks]
45 | envdir = {toxworkdir}/test_package
46 |
47 | deps =
48 | {[testenv:test_package]deps}
49 |
50 | commands = {posargs:mypy classification_model}
51 |
52 |
53 | [testenv:stylechecks]
54 | envdir = {toxworkdir}/test_package
55 |
56 | deps =
57 | {[testenv:test_package]deps}
58 |
59 | commands = {posargs:flake8 classification_model tests}
60 |
61 |
62 | [testenv:lint]
63 | envdir = {toxworkdir}/test_package
64 |
65 | deps =
66 | {[testenv:test_package]deps}
67 |
68 | commands =
69 | isort classification_model tests
70 | black classification_model tests
71 | mypy classification_model
72 | flake8 classification_model
73 |
74 | [flake8]
75 | exclude = .git,env
76 | max-line-length = 90
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/regression_model/processing/features.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import pandas as pd
4 | from sklearn.base import BaseEstimator, TransformerMixin
5 |
6 |
7 | class TemporalVariableTransformer(BaseEstimator, TransformerMixin):
8 | """Temporal elapsed time transformer."""
9 |
10 | def __init__(self, variables: List[str], reference_variable: str):
11 |
12 | if not isinstance(variables, list):
13 | raise ValueError("variables should be a list")
14 |
15 | self.variables = variables
16 | self.reference_variable = reference_variable
17 |
18 | def fit(self, X: pd.DataFrame, y: pd.Series = None):
19 | # we need this step to fit the sklearn pipeline
20 | return self
21 |
22 | def transform(self, X: pd.DataFrame) -> pd.DataFrame:
23 |
24 |         # so that we do not overwrite the original dataframe
25 | X = X.copy()
26 |
27 | for feature in self.variables:
28 | X[feature] = X[self.reference_variable] - X[feature]
29 |
30 | return X
31 |
32 |
33 | class Mapper(BaseEstimator, TransformerMixin):
34 | """Categorical variable mapper."""
35 |
36 | def __init__(self, variables: List[str], mappings: dict):
37 |
38 | if not isinstance(variables, list):
39 | raise ValueError("variables should be a list")
40 |
41 | self.variables = variables
42 | self.mappings = mappings
43 |
44 | def fit(self, X: pd.DataFrame, y: pd.Series = None):
45 |         # we need the fit statement to accommodate the sklearn pipeline
46 | return self
47 |
48 | def transform(self, X: pd.DataFrame) -> pd.DataFrame:
49 | X = X.copy()
50 | for feature in self.variables:
51 | X[feature] = X[feature].map(self.mappings)
52 |
53 | return X
54 |
--------------------------------------------------------------------------------
/packages/ml_api/api/config.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from logging.handlers import TimedRotatingFileHandler
3 | import pathlib
4 | import os
5 | import sys
6 |
7 | PACKAGE_ROOT = pathlib.Path(__file__).resolve().parent.parent
8 |
9 | FORMATTER = logging.Formatter(
10 |     "%(asctime)s — %(name)s — %(levelname)s — "
11 | "%(funcName)s:%(lineno)d — %(message)s")
12 | LOG_DIR = PACKAGE_ROOT / 'logs'
13 | LOG_DIR.mkdir(exist_ok=True)
14 | LOG_FILE = LOG_DIR / 'ml_api.log'
15 | UPLOAD_FOLDER = PACKAGE_ROOT / 'uploads'
16 | UPLOAD_FOLDER.mkdir(exist_ok=True)
17 |
18 | ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg'}
19 |
20 |
21 | def get_console_handler():
22 | console_handler = logging.StreamHandler(sys.stdout)
23 | console_handler.setFormatter(FORMATTER)
24 | return console_handler
25 |
26 |
27 | def get_file_handler():
28 | file_handler = TimedRotatingFileHandler(
29 | LOG_FILE, when='midnight')
30 | file_handler.setFormatter(FORMATTER)
31 | file_handler.setLevel(logging.WARNING)
32 | return file_handler
33 |
34 |
35 | def get_logger(*, logger_name):
36 | """Get logger with prepared handlers."""
37 |
38 | logger = logging.getLogger(logger_name)
39 |
40 | logger.setLevel(logging.INFO)
41 |
42 | logger.addHandler(get_console_handler())
43 | logger.addHandler(get_file_handler())
44 | logger.propagate = False
45 |
46 | return logger
47 |
48 |
49 | class Config:
50 | DEBUG = False
51 | TESTING = False
52 | CSRF_ENABLED = True
53 | SECRET_KEY = 'this-really-needs-to-be-changed'
54 | SERVER_PORT = 5000
55 | UPLOAD_FOLDER = UPLOAD_FOLDER
56 |
57 |
58 | class ProductionConfig(Config):
59 | DEBUG = False
60 |     SERVER_ADDRESS = os.environ.get('SERVER_ADDRESS', '0.0.0.0')
61 |     SERVER_PORT = os.environ.get('SERVER_PORT', '5000')
62 |
63 |
64 | class DevelopmentConfig(Config):
65 | DEVELOPMENT = True
66 | DEBUG = True
67 |
68 |
69 | class TestingConfig(Config):
70 | TESTING = True
71 |
--------------------------------------------------------------------------------
/scripts/input_test.json:
--------------------------------------------------------------------------------
1 | [{
2 | "Id": 1461,
3 | "MSSubClass": 20,
4 | "MSZoning": "RH",
5 | "LotFrontage": 80.0,
6 | "LotArea": 11622,
7 | "Street": "Pave",
8 | "Alley": null,
9 | "LotShape": "Reg",
10 | "LandContour": "Lvl",
11 | "Utilities": "AllPub",
12 | "LotConfig": "Inside",
13 | "LandSlope": "Gtl",
14 | "Neighborhood": "NAmes",
15 | "Condition1": "Feedr",
16 | "Condition2": "Norm",
17 | "BldgType": "1Fam",
18 | "HouseStyle": "1Story",
19 | "OverallQual": 5,
20 | "OverallCond": 6,
21 | "YearBuilt": 1961,
22 | "YearRemodAdd": 1961,
23 | "RoofStyle": "Gable",
24 | "RoofMatl": "CompShg",
25 | "Exterior1st": "VinylSd",
26 | "Exterior2nd": "VinylSd",
27 | "MasVnrType": "None",
28 | "MasVnrArea": 0.0,
29 | "ExterQual": "TA",
30 | "ExterCond": "TA",
31 | "Foundation": "CBlock",
32 | "BsmtQual": "TA",
33 | "BsmtCond": "TA",
34 | "BsmtExposure": "No",
35 | "BsmtFinType1": "Rec",
36 | "BsmtFinSF1": 468.0,
37 | "BsmtFinType2": "LwQ",
38 | "BsmtFinSF2": 144.0,
39 | "BsmtUnfSF": 270.0,
40 | "TotalBsmtSF": 882.0,
41 | "Heating": "GasA",
42 | "HeatingQC": "TA",
43 | "CentralAir": "Y",
44 | "Electrical": "SBrkr",
45 | "1stFlrSF": 896,
46 | "2ndFlrSF": 0,
47 | "LowQualFinSF": 0,
48 | "GrLivArea": 896,
49 | "BsmtFullBath": 0.0,
50 | "BsmtHalfBath": 0.0,
51 | "FullBath": 1,
52 | "HalfBath": 0,
53 | "BedroomAbvGr": 2,
54 | "KitchenAbvGr": 1,
55 | "KitchenQual": "TA",
56 | "TotRmsAbvGrd": 5,
57 | "Functional": "Typ",
58 | "Fireplaces": 0,
59 | "FireplaceQu": null,
60 | "GarageType": "Attchd",
61 | "GarageYrBlt": 1961.0,
62 | "GarageFinish": "Unf",
63 | "GarageCars": 1.0,
64 | "GarageArea": 730.0,
65 | "GarageQual": "TA",
66 | "GarageCond": "TA",
67 | "PavedDrive": "Y",
68 | "WoodDeckSF": 140,
69 | "OpenPorchSF": 0,
70 | "EnclosedPorch": 0,
71 | "3SsnPorch": 0,
72 | "ScreenPorch": 120,
73 | "PoolArea": 0,
74 | "PoolQC": null,
75 | "Fence": "MnPrv",
76 | "MiscFeature": null,
77 | "MiscVal": 0,
78 | "MoSold": 6,
79 | "YrSold": 2010,
80 | "SaleType": "WD",
81 | "SaleCondition": "Normal"
82 | }]
--------------------------------------------------------------------------------
/packages/ml_api/tests/differential_tests/test_differential.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | from regression_model.config import config as model_config
4 | from regression_model.predict import make_prediction
5 | from regression_model.processing.data_management import load_dataset
6 | import pandas as pd
7 | import pytest
8 |
9 |
10 | from api import config
11 |
12 |
13 | @pytest.mark.differential
14 | def test_model_prediction_differential(
15 | *,
16 | save_file: str = 'test_data_predictions.csv'):
17 | """
18 | This test compares the prediction result similarity of
19 | the current model with the previous model's results.
20 | """
21 |
22 | # Given
23 | # Load the saved previous model predictions
24 | previous_model_df = pd.read_csv(f'{config.PACKAGE_ROOT}/{save_file}')
25 | previous_model_predictions = previous_model_df.predictions.values
26 |
27 | test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE)
28 | multiple_test_input = test_data[99:600]
29 |
30 | # When
31 | current_result = make_prediction(input_data=multiple_test_input)
32 | current_model_predictions = current_result.get('predictions')
33 |
34 | # Then
35 | # diff the current model vs. the old model
36 | assert len(previous_model_predictions) == len(
37 | current_model_predictions)
38 |
39 | # Perform the differential test
40 | for previous_value, current_value in zip(
41 | previous_model_predictions, current_model_predictions):
42 |
43 | # convert numpy float64 to Python float.
44 | previous_value = previous_value.item()
45 | current_value = current_value.item()
46 |
47 | # rel_tol is the relative tolerance – it is the maximum allowed
48 | # difference between a and b, relative to the larger absolute
49 | # value of a or b. For example, to set a tolerance of 5%, pass
50 | # rel_tol=0.05.
51 | assert math.isclose(previous_value,
52 | current_value,
53 | rel_tol=model_config.ACCEPTABLE_MODEL_DIFFERENCE)
54 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/processing/data_management.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import joblib
3 | from sklearn.pipeline import Pipeline
4 |
5 | from regression_model.config import config
6 | from regression_model import __version__ as _version
7 |
8 | import logging
9 | import typing as t
10 |
11 |
12 | _logger = logging.getLogger(__name__)
13 |
14 |
15 | def load_dataset(*, file_name: str) -> pd.DataFrame:
16 | _data = pd.read_csv(f"{config.DATASET_DIR}/{file_name}")
17 | return _data
18 |
19 |
20 | def save_pipeline(*, pipeline_to_persist) -> None:
21 | """Persist the pipeline.
22 | Saves the versioned model, and overwrites any previous
23 | saved models. This ensures that when the package is
24 | published, there is only one trained model that can be
25 | called, and we know exactly how it was built.
26 | """
27 |
28 | # Prepare versioned save file name
29 | save_file_name = f"{config.PIPELINE_SAVE_FILE}{_version}.pkl"
30 | save_path = config.TRAINED_MODEL_DIR / save_file_name
31 |
32 | remove_old_pipelines(files_to_keep=[save_file_name])
33 | joblib.dump(pipeline_to_persist, save_path)
34 | _logger.info(f"saved pipeline: {save_file_name}")
35 |
36 |
37 | def load_pipeline(*, file_name: str) -> Pipeline:
38 | """Load a persisted pipeline."""
39 |
40 | file_path = config.TRAINED_MODEL_DIR / file_name
41 | trained_model = joblib.load(filename=file_path)
42 | return trained_model
43 |
44 |
45 | def remove_old_pipelines(*, files_to_keep: t.List[str]) -> None:
46 | """
47 | Remove old model pipelines.
48 |
49 | This is to ensure there is a simple one-to-one
50 | mapping between the package version and the model
51 | version to be imported and used by other applications.
52 | However, we do also include the immediate previous
53 | pipeline version for differential testing purposes.
54 | """
55 | do_not_delete = files_to_keep + ['__init__.py']
56 | for model_file in config.TRAINED_MODEL_DIR.iterdir():
57 | if model_file.name not in do_not_delete:
58 | model_file.unlink()
59 |
--------------------------------------------------------------------------------
/assignment-section-05/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools>=42",
4 | "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"
7 |
8 | [tool.pytest.ini_options]
9 | minversion = "2.0"
10 | addopts = "-rfEX -p pytester --strict-markers"
11 | python_files = ["test_*.py", "*_test.py"]
12 | python_classes = ["Test", "Acceptance"]
13 | python_functions = ["test"]
14 | # NOTE: "doc" is not included here, but gets tested explicitly via "doctesting".
15 | testpaths = ["tests"]
16 | xfail_strict = true
17 | filterwarnings = [
18 | "error",
19 | "default:Using or importing the ABCs:DeprecationWarning:unittest2.*",
20 | # produced by older pyparsing<=2.2.0.
21 | "default:Using or importing the ABCs:DeprecationWarning:pyparsing.*",
22 | "default:the imp module is deprecated in favour of importlib:DeprecationWarning:nose.*",
23 | # distutils is deprecated in 3.10, scheduled for removal in 3.12
24 | "ignore:The distutils package is deprecated:DeprecationWarning",
25 |     # produced by python3.6/site.py itself (3.6.7 on Travis, could not trigger it with 3.6.8).
26 | "ignore:.*U.*mode is deprecated:DeprecationWarning:(?!(pytest|_pytest))",
27 | # produced by pytest-xdist
28 | "ignore:.*type argument to addoption.*:DeprecationWarning",
29 | # produced on execnet (pytest-xdist)
30 | "ignore:.*inspect.getargspec.*deprecated, use inspect.signature.*:DeprecationWarning",
31 | # pytest's own futurewarnings
32 | "ignore::pytest.PytestExperimentalApiWarning",
33 | # Do not cause SyntaxError for invalid escape sequences in py37.
34 | # Those are caught/handled by pyupgrade, and not easy to filter with the
35 | # module being the filename (with .py removed).
36 | "default:invalid escape sequence:DeprecationWarning",
37 | # ignore use of unregistered marks, because we use many to test the implementation
38 | "ignore::_pytest.warning_types.PytestUnknownMarkWarning",
39 | ]
40 |
41 | [tool.black]
42 | target-version = ['py36']
43 |
44 | [tool.isort]
45 | profile = "black"
46 | line_length = 100
47 | lines_between_sections = 1
48 | known_first_party = "classification_model"
49 | skip = "migrations"
50 |
--------------------------------------------------------------------------------
/section-05-production-model-package/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools>=42",
4 | "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"
7 |
8 | [tool.pytest.ini_options]
9 | minversion = "2.0"
10 | addopts = "-rfEX -p pytester --strict-markers"
11 | python_files = ["test_*.py", "*_test.py"]
12 | python_classes = ["Test", "Acceptance"]
13 | python_functions = ["test"]
14 | # NOTE: "doc" is not included here, but gets tested explicitly via "doctesting".
15 | testpaths = ["tests"]
16 | xfail_strict = true
17 | filterwarnings = [
18 | "error",
19 | "default:Using or importing the ABCs:DeprecationWarning:unittest2.*",
20 | # produced by older pyparsing<=2.2.0.
21 | "default:Using or importing the ABCs:DeprecationWarning:pyparsing.*",
22 | "default:the imp module is deprecated in favour of importlib:DeprecationWarning:nose.*",
23 | # distutils is deprecated in 3.10, scheduled for removal in 3.12
24 | "ignore:The distutils package is deprecated:DeprecationWarning",
25 |     # produced by python3.6/site.py itself (3.6.7 on Travis, could not trigger it with 3.6.8).
26 | "ignore:.*U.*mode is deprecated:DeprecationWarning:(?!(pytest|_pytest))",
27 | # produced by pytest-xdist
28 | "ignore:.*type argument to addoption.*:DeprecationWarning",
29 | # produced on execnet (pytest-xdist)
30 | "ignore:.*inspect.getargspec.*deprecated, use inspect.signature.*:DeprecationWarning",
31 | # pytest's own futurewarnings
32 | "ignore::pytest.PytestExperimentalApiWarning",
33 | # Do not cause SyntaxError for invalid escape sequences in py37.
34 | # Those are caught/handled by pyupgrade, and not easy to filter with the
35 | # module being the filename (with .py removed).
36 | "default:invalid escape sequence:DeprecationWarning",
37 | # ignore use of unregistered marks, because we use many to test the implementation
38 | "ignore::_pytest.warning_types.PytestUnknownMarkWarning",
39 | ]
40 |
41 | [tool.black]
42 | target-version = ['py37']
43 |
44 | [tool.isort]
45 | profile = "black"
46 | line_length = 100
47 | lines_between_sections = 1
48 | known_first_party = "regression_model"
49 | skip = "migrations"
50 |
--------------------------------------------------------------------------------
/section-05-production-model-package/regression_model/processing/data_manager.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 | from pathlib import Path
3 |
4 | import joblib
5 | import pandas as pd
6 | from sklearn.pipeline import Pipeline
7 |
8 | from regression_model import __version__ as _version
9 | from regression_model.config.core import DATASET_DIR, TRAINED_MODEL_DIR, config
10 |
11 |
12 | def load_dataset(*, file_name: str) -> pd.DataFrame:
13 | dataframe = pd.read_csv(Path(f"{DATASET_DIR}/{file_name}"))
14 | dataframe["MSSubClass"] = dataframe["MSSubClass"].astype("O")
15 |
16 | # rename variables beginning with numbers to avoid syntax errors later
17 | transformed = dataframe.rename(columns=config.model_config.variables_to_rename)
18 | return transformed
19 |
20 |
21 | def save_pipeline(*, pipeline_to_persist: Pipeline) -> None:
22 | """Persist the pipeline.
23 | Saves the versioned model, and overwrites any previous
24 | saved models. This ensures that when the package is
25 | published, there is only one trained model that can be
26 | called, and we know exactly how it was built.
27 | """
28 |
29 | # Prepare versioned save file name
30 | save_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
31 | save_path = TRAINED_MODEL_DIR / save_file_name
32 |
33 | remove_old_pipelines(files_to_keep=[save_file_name])
34 | joblib.dump(pipeline_to_persist, save_path)
35 |
36 |
37 | def load_pipeline(*, file_name: str) -> Pipeline:
38 | """Load a persisted pipeline."""
39 |
40 | file_path = TRAINED_MODEL_DIR / file_name
41 | trained_model = joblib.load(filename=file_path)
42 | return trained_model
43 |
44 |
45 | def remove_old_pipelines(*, files_to_keep: t.List[str]) -> None:
46 | """
47 | Remove old model pipelines.
48 | This is to ensure there is a simple one-to-one
49 | mapping between the package version and the model
50 | version to be imported and used by other applications.
51 | """
52 | do_not_delete = files_to_keep + ["__init__.py"]
53 | for model_file in TRAINED_MODEL_DIR.iterdir():
54 | if model_file.name not in do_not_delete:
55 | model_file.unlink()
56 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools>=42",
4 | "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"
7 |
8 | [tool.pytest.ini_options]
9 | minversion = "2.0"
10 | addopts = "-rfEX -p pytester --strict-markers"
11 | python_files = ["test_*.py", "*_test.py"]
12 | python_classes = ["Test", "Acceptance"]
13 | python_functions = ["test"]
14 | # NOTE: "doc" is not included here, but gets tested explicitly via "doctesting".
15 | testpaths = ["tests"]
16 | xfail_strict = true
17 | filterwarnings = [
18 | "error",
19 | "default:Using or importing the ABCs:DeprecationWarning:unittest2.*",
20 | # produced by older pyparsing<=2.2.0.
21 | "default:Using or importing the ABCs:DeprecationWarning:pyparsing.*",
22 | "default:the imp module is deprecated in favour of importlib:DeprecationWarning:nose.*",
23 | # distutils is deprecated in 3.10, scheduled for removal in 3.12
24 | "ignore:The distutils package is deprecated:DeprecationWarning",
25 |     # produced by python3.6/site.py itself (3.6.7 on Travis, could not trigger it with 3.6.8).
26 | "ignore:.*U.*mode is deprecated:DeprecationWarning:(?!(pytest|_pytest))",
27 | # produced by pytest-xdist
28 | "ignore:.*type argument to addoption.*:DeprecationWarning",
29 | # produced on execnet (pytest-xdist)
30 | "ignore:.*inspect.getargspec.*deprecated, use inspect.signature.*:DeprecationWarning",
31 | # pytest's own futurewarnings
32 | "ignore::pytest.PytestExperimentalApiWarning",
33 | # Do not cause SyntaxError for invalid escape sequences in py37.
34 | # Those are caught/handled by pyupgrade, and not easy to filter with the
35 | # module being the filename (with .py removed).
36 | "default:invalid escape sequence:DeprecationWarning",
37 | # ignore use of unregistered marks, because we use many to test the implementation
38 | "ignore::_pytest.warning_types.PytestUnknownMarkWarning",
39 | ]
40 |
41 | [tool.black]
42 | target-version = ['py36']
43 |
44 | [tool.isort]
45 | profile = "black"
46 | line_length = 100
47 | lines_between_sections = 1
48 | known_first_party = "regression_model"
49 | skip = "migrations"
50 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/regression_model/processing/data_manager.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 | from pathlib import Path
3 |
4 | import joblib
5 | import pandas as pd
6 | from sklearn.pipeline import Pipeline
7 |
8 | from regression_model import __version__ as _version
9 | from regression_model.config.core import DATASET_DIR, TRAINED_MODEL_DIR, config
10 |
11 |
12 | def load_dataset(*, file_name: str) -> pd.DataFrame:
13 | dataframe = pd.read_csv(Path(f"{DATASET_DIR}/{file_name}"))
14 | dataframe["MSSubClass"] = dataframe["MSSubClass"].astype("O")
15 |
16 | # rename variables beginning with numbers to avoid syntax errors later
17 | transformed = dataframe.rename(columns=config.model_config.variables_to_rename)
18 | return transformed
19 |
20 |
21 | def save_pipeline(*, pipeline_to_persist: Pipeline) -> None:
22 | """Persist the pipeline.
23 | Saves the versioned model, and overwrites any previous
24 | saved models. This ensures that when the package is
25 | published, there is only one trained model that can be
26 | called, and we know exactly how it was built.
27 | """
28 |
29 | # Prepare versioned save file name
30 | save_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
31 | save_path = TRAINED_MODEL_DIR / save_file_name
32 |
33 | remove_old_pipelines(files_to_keep=[save_file_name])
34 | joblib.dump(pipeline_to_persist, save_path)
35 |
36 |
37 | def load_pipeline(*, file_name: str) -> Pipeline:
38 | """Load a persisted pipeline."""
39 |
40 | file_path = TRAINED_MODEL_DIR / file_name
41 | trained_model = joblib.load(filename=file_path)
42 | return trained_model
43 |
44 |
45 | def remove_old_pipelines(*, files_to_keep: t.List[str]) -> None:
46 | """
47 | Remove old model pipelines.
48 | This is to ensure there is a simple one-to-one
49 | mapping between the package version and the model
50 | version to be imported and used by other applications.
51 | """
52 | do_not_delete = files_to_keep + ["__init__.py"]
53 | for model_file in TRAINED_MODEL_DIR.iterdir():
54 | if model_file.name not in do_not_delete:
55 | model_file.unlink()
56 |
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/predict.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import pandas as pd
4 |
5 | from neural_network_model import __version__ as _version
6 | from neural_network_model.processing import data_management as dm
7 |
8 | _logger = logging.getLogger(__name__)
9 | KERAS_PIPELINE = dm.load_pipeline_keras()
10 | ENCODER = dm.load_encoder()
11 |
12 |
13 | def make_single_prediction(*, image_name: str, image_directory: str):
14 | """Make a single prediction using the saved model pipeline.
15 |
16 | Args:
17 | image_name: Filename of the image to classify
18 | image_directory: Location of the image to classify
19 |
20 |     Returns:
21 | Dictionary with both raw predictions and readable values.
22 | """
23 |
24 | image_df = dm.load_single_image(
25 | data_folder=image_directory,
26 | filename=image_name)
27 |
28 | prepared_df = image_df['image'].reset_index(drop=True)
29 | _logger.info(f'received input array: {prepared_df}, '
30 | f'filename: {image_name}')
31 |
32 | predictions = KERAS_PIPELINE.predict(prepared_df)
33 | readable_predictions = ENCODER.encoder.inverse_transform(predictions)
34 |
35 | _logger.info(f'Made prediction: {predictions}'
36 | f' with model version: {_version}')
37 |
38 | return dict(predictions=predictions,
39 | readable_predictions=readable_predictions,
40 | version=_version)
41 |
42 |
43 | def make_bulk_prediction(*, images_df: pd.Series) -> dict:
44 | """Make multiple predictions using the saved model pipeline.
45 |
46 | Currently, this function is primarily for testing purposes,
47 | allowing us to pass in a directory of images for running
48 | bulk predictions.
49 |
50 | Args:
51 | images_df: Pandas series of images
52 |
53 |     Returns:
54 | Dictionary with both raw predictions and their classifications.
55 | """
56 |
57 | _logger.info(f'received input df: {images_df}')
58 |
59 | predictions = KERAS_PIPELINE.predict(images_df)
60 | readable_predictions = ENCODER.encoder.inverse_transform(predictions)
61 |
62 | _logger.info(f'Made predictions: {predictions}'
63 | f' with model version: {_version}')
64 |
65 | return dict(predictions=predictions,
66 | readable_predictions=readable_predictions,
67 | version=_version)
68 |
--------------------------------------------------------------------------------
/assignment-section-05/classification_model/pipeline.py:
--------------------------------------------------------------------------------
1 | # for encoding categorical variables
2 | from feature_engine.encoding import OneHotEncoder, RareLabelEncoder
3 |
4 | # for imputation
5 | from feature_engine.imputation import (
6 | AddMissingIndicator,
7 | CategoricalImputer,
8 | MeanMedianImputer,
9 | )
10 | from sklearn.linear_model import LogisticRegression
11 | from sklearn.pipeline import Pipeline
12 | from sklearn.preprocessing import StandardScaler
13 |
14 | from classification_model.config.core import config
15 | from classification_model.processing.features import ExtractLetterTransformer
16 |
17 | titanic_pipe = Pipeline(
18 | [
19 | # impute categorical variables with string missing
20 | (
21 | "categorical_imputation",
22 | CategoricalImputer(
23 | imputation_method="missing",
24 | variables=config.model_config.categorical_vars,
25 | ),
26 | ),
27 | # add missing indicator to numerical variables
28 | (
29 | "missing_indicator",
30 | AddMissingIndicator(variables=config.model_config.numerical_vars),
31 | ),
32 | # impute numerical variables with the median
33 | (
34 | "median_imputation",
35 | MeanMedianImputer(
36 | imputation_method="median", variables=config.model_config.numerical_vars
37 | ),
38 | ),
39 | # Extract letter from cabin
40 | (
41 | "extract_letter",
42 | ExtractLetterTransformer(variables=config.model_config.cabin_vars),
43 | ),
44 | # == CATEGORICAL ENCODING ======
45 | # remove categories present in less than 5% of the observations (0.05)
46 | # group them in one category called 'Rare'
47 | (
48 | "rare_label_encoder",
49 | RareLabelEncoder(
50 | tol=0.05, n_categories=1, variables=config.model_config.categorical_vars
51 | ),
52 | ),
53 | # encode categorical variables using one hot encoding into k-1 variables
54 | (
55 | "categorical_encoder",
56 | OneHotEncoder(
57 | drop_last=True, variables=config.model_config.categorical_vars
58 | ),
59 | ),
60 | # scale
61 | ("scaler", StandardScaler()),
62 | ("Logit", LogisticRegression(C=0.0005, random_state=0)),
63 | ]
64 | )
65 |
--------------------------------------------------------------------------------
/assignment-section-05/classification_model/config/core.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Optional, Sequence
3 |
4 | from pydantic import BaseModel
5 | from strictyaml import YAML, load
6 |
7 | import classification_model
8 |
9 | # Project Directories
10 | PACKAGE_ROOT = Path(classification_model.__file__).resolve().parent
11 | ROOT = PACKAGE_ROOT.parent
12 | CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml"
13 | DATASET_DIR = PACKAGE_ROOT / "datasets"
14 | TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models"
15 |
16 |
17 | class AppConfig(BaseModel):
18 | """
19 | Application-level config.
20 | """
21 |
22 | package_name: str
23 | raw_data_file: str
24 | pipeline_save_file: str
25 |
26 |
27 | class ModelConfig(BaseModel):
28 | """
29 | All configuration relevant to model
30 | training and feature engineering.
31 | """
32 |
33 | target: str
34 | unused_fields: Sequence[str]
35 | features: Sequence[str]
36 | test_size: float
37 | random_state: int
38 | numerical_vars: Sequence[str]
39 | categorical_vars: Sequence[str]
40 | cabin_vars: Sequence[str]
41 |
42 |
43 | class Config(BaseModel):
44 | """Master config object."""
45 |
46 | app_config: AppConfig
47 | model_config: ModelConfig
48 |
49 |
50 | def find_config_file() -> Path:
51 | """Locate the configuration file."""
52 | if CONFIG_FILE_PATH.is_file():
53 | return CONFIG_FILE_PATH
54 | raise Exception(f"Config not found at {CONFIG_FILE_PATH!r}")
55 |
56 |
57 | def fetch_config_from_yaml(cfg_path: Optional[Path] = None) -> YAML:
58 | """Parse YAML containing the package configuration."""
59 |
60 | if not cfg_path:
61 | cfg_path = find_config_file()
62 |
63 | if cfg_path:
64 | with open(cfg_path, "r") as conf_file:
65 | parsed_config = load(conf_file.read())
66 | return parsed_config
67 | raise OSError(f"Did not find config file at path: {cfg_path}")
68 |
69 |
70 | def create_and_validate_config(parsed_config: Optional[YAML] = None) -> Config:
71 | """Run validation on config values."""
72 | if parsed_config is None:
73 | parsed_config = fetch_config_from_yaml()
74 |
75 | # specify the data attribute from the strictyaml YAML type.
76 | _config = Config(
77 | app_config=AppConfig(**parsed_config.data),
78 | model_config=ModelConfig(**parsed_config.data),
79 | )
80 |
81 | return _config
82 |
83 |
84 | config = create_and_validate_config()
85 |
--------------------------------------------------------------------------------
/section-06-model-serving-api/house-prices-api/app/config.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 | from types import FrameType
4 | from typing import List, cast
5 |
6 | from loguru import logger
7 | from pydantic import AnyHttpUrl, BaseSettings
8 |
9 |
10 | class LoggingSettings(BaseSettings):
11 | LOGGING_LEVEL: int = logging.INFO # logging levels are type int
12 |
13 |
14 | class Settings(BaseSettings):
15 | API_V1_STR: str = "/api/v1"
16 |
17 | # Meta
18 | logging: LoggingSettings = LoggingSettings()
19 |
20 | # BACKEND_CORS_ORIGINS is a comma-separated list of origins
21 | # e.g: http://localhost,http://localhost:4200,http://localhost:3000
22 | BACKEND_CORS_ORIGINS: List[AnyHttpUrl] = [
23 | "http://localhost:3000", # type: ignore
24 | "http://localhost:8000", # type: ignore
25 | "https://localhost:3000", # type: ignore
26 | "https://localhost:8000", # type: ignore
27 | ]
28 |
29 | PROJECT_NAME: str = "House Price Prediction API"
30 |
31 | class Config:
32 | case_sensitive = True
33 |
34 |
35 | # See: https://loguru.readthedocs.io/en/stable/overview.html#entirely-compatible-with-standard-logging # noqa
36 | class InterceptHandler(logging.Handler):
37 | def emit(self, record: logging.LogRecord) -> None: # pragma: no cover
38 | # Get corresponding Loguru level if it exists
39 | try:
40 | level = logger.level(record.levelname).name
41 | except ValueError:
42 | level = str(record.levelno)
43 |
44 | # Find caller from where originated the logged message
45 | frame, depth = logging.currentframe(), 2
46 | while frame.f_code.co_filename == logging.__file__: # noqa: WPS609
47 | frame = cast(FrameType, frame.f_back)
48 | depth += 1
49 |
50 | logger.opt(depth=depth, exception=record.exc_info).log(
51 | level,
52 | record.getMessage(),
53 | )
54 |
55 |
56 | def setup_app_logging(config: Settings) -> None:
57 | """Prepare custom logging for our application."""
58 |
59 | LOGGERS = ("uvicorn.asgi", "uvicorn.access")
60 | logging.getLogger().handlers = [InterceptHandler()]
61 | for logger_name in LOGGERS:
62 | logging_logger = logging.getLogger(logger_name)
63 | logging_logger.handlers = [InterceptHandler(level=config.logging.LOGGING_LEVEL)]
64 |
65 | logger.configure(
66 | handlers=[{"sink": sys.stderr, "level": config.logging.LOGGING_LEVEL}]
67 | )
68 |
69 |
70 | settings = Settings()
71 |
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/house-prices-api/app/config.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 | from types import FrameType
4 | from typing import List, cast
5 |
6 | from loguru import logger
7 | from pydantic import AnyHttpUrl, BaseSettings
8 |
9 |
10 | class LoggingSettings(BaseSettings):
11 | LOGGING_LEVEL: int = logging.INFO # logging levels are type int
12 |
13 |
14 | class Settings(BaseSettings):
15 | API_V1_STR: str = "/api/v1"
16 |
17 | # Meta
18 | logging: LoggingSettings = LoggingSettings()
19 |
20 | # BACKEND_CORS_ORIGINS is a comma-separated list of origins
21 | # e.g: http://localhost,http://localhost:4200,http://localhost:3000
22 | BACKEND_CORS_ORIGINS: List[AnyHttpUrl] = [
23 | "http://localhost:3000", # type: ignore
24 | "http://localhost:8000", # type: ignore
25 | "https://localhost:3000", # type: ignore
26 | "https://localhost:8000", # type: ignore
27 | ]
28 |
29 | PROJECT_NAME: str = "House Price Prediction API"
30 |
31 | class Config:
32 | case_sensitive = True
33 |
34 |
35 | # See: https://loguru.readthedocs.io/en/stable/overview.html#entirely-compatible-with-standard-logging # noqa
36 | class InterceptHandler(logging.Handler):
37 | def emit(self, record: logging.LogRecord) -> None: # pragma: no cover
38 | # Get corresponding Loguru level if it exists
39 | try:
40 | level = logger.level(record.levelname).name
41 | except ValueError:
42 | level = str(record.levelno)
43 |
44 | # Find caller from where originated the logged message
45 | frame, depth = logging.currentframe(), 2
46 | while frame.f_code.co_filename == logging.__file__: # noqa: WPS609
47 | frame = cast(FrameType, frame.f_back)
48 | depth += 1
49 |
50 | logger.opt(depth=depth, exception=record.exc_info).log(
51 | level,
52 | record.getMessage(),
53 | )
54 |
55 |
56 | def setup_app_logging(config: Settings) -> None:
57 | """Prepare custom logging for our application."""
58 |
59 | LOGGERS = ("uvicorn.asgi", "uvicorn.access")
60 | logging.getLogger().handlers = [InterceptHandler()]
61 | for logger_name in LOGGERS:
62 | logging_logger = logging.getLogger(logger_name)
63 | logging_logger.handlers = [InterceptHandler(level=config.logging.LOGGING_LEVEL)]
64 |
65 | logger.configure(
66 | handlers=[{"sink": sys.stderr, "level": config.logging.LOGGING_LEVEL}]
67 | )
68 |
69 |
70 | settings = Settings()
71 |
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/house-prices-api/app/config.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 | from types import FrameType
4 | from typing import List, cast
5 |
6 | from loguru import logger
7 | from pydantic import AnyHttpUrl, BaseSettings
8 |
9 |
10 | class LoggingSettings(BaseSettings):
11 | LOGGING_LEVEL: int = logging.INFO # logging levels are type int
12 |
13 |
14 | class Settings(BaseSettings):
15 | API_V1_STR: str = "/api/v1"
16 |
17 | # Meta
18 | logging: LoggingSettings = LoggingSettings()
19 |
20 | # BACKEND_CORS_ORIGINS is a comma-separated list of origins
21 | # e.g: http://localhost,http://localhost:4200,http://localhost:3000
22 | BACKEND_CORS_ORIGINS: List[AnyHttpUrl] = [
23 | "http://localhost:3000", # type: ignore
24 | "http://localhost:8000", # type: ignore
25 | "https://localhost:3000", # type: ignore
26 | "https://localhost:8000", # type: ignore
27 | ]
28 |
29 | PROJECT_NAME: str = "House Price Prediction API"
30 |
31 | class Config:
32 | case_sensitive = True
33 |
34 |
35 | # See: https://loguru.readthedocs.io/en/stable/overview.html#entirely-compatible-with-standard-logging # noqa
36 | class InterceptHandler(logging.Handler):
37 | def emit(self, record: logging.LogRecord) -> None: # pragma: no cover
38 | # Get corresponding Loguru level if it exists
39 | try:
40 | level = logger.level(record.levelname).name
41 | except ValueError:
42 | level = str(record.levelno)
43 |
44 | # Find caller from where originated the logged message
45 | frame, depth = logging.currentframe(), 2
46 | while frame.f_code.co_filename == logging.__file__: # noqa: WPS609
47 | frame = cast(FrameType, frame.f_back)
48 | depth += 1
49 |
50 | logger.opt(depth=depth, exception=record.exc_info).log(
51 | level,
52 | record.getMessage(),
53 | )
54 |
55 |
56 | def setup_app_logging(config: Settings) -> None:
57 | """Prepare custom logging for our application."""
58 |
59 | LOGGERS = ("uvicorn.asgi", "uvicorn.access")
60 | logging.getLogger().handlers = [InterceptHandler()]
61 | for logger_name in LOGGERS:
62 | logging_logger = logging.getLogger(logger_name)
63 | logging_logger.handlers = [InterceptHandler(level=config.logging.LOGGING_LEVEL)]
64 |
65 | logger.configure(
66 | handlers=[{"sink": sys.stderr, "level": config.logging.LOGGING_LEVEL}]
67 | )
68 |
69 |
70 | settings = Settings()
71 |
--------------------------------------------------------------------------------
/section-05-production-model-package/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from pathlib import Path
5 |
6 | from setuptools import find_packages, setup
7 |
8 | # Package meta-data.
9 | NAME = 'tid-regression-model'
10 | DESCRIPTION = "Example regression model package from Train In Data."
11 | URL = "https://github.com/trainindata/testing-and-monitoring-ml-deployments"
12 | EMAIL = "christopher.samiullah@protonmail.com"
13 | AUTHOR = "ChristopherGS"
14 | REQUIRES_PYTHON = ">=3.6.0"
15 |
16 |
17 | # The rest you shouldn't have to touch too much :)
18 | # ------------------------------------------------
19 | # Except, perhaps the License and Trove Classifiers!
20 | # If you do change the License, remember to change the
21 | # Trove Classifier for that!
22 | long_description = DESCRIPTION
23 |
24 | # Load the package's VERSION file as a dictionary.
25 | about = {}
26 | ROOT_DIR = Path(__file__).resolve().parent
27 | REQUIREMENTS_DIR = ROOT_DIR / 'requirements'
28 | PACKAGE_DIR = ROOT_DIR / 'regression_model'
29 | with open(PACKAGE_DIR / "VERSION") as f:
30 | _version = f.read().strip()
31 | about["__version__"] = _version
32 |
33 |
34 | # What packages are required for this module to be executed?
35 | def list_reqs(fname="requirements.txt"):
36 | with open(REQUIREMENTS_DIR / fname) as fd:
37 | return fd.read().splitlines()
38 |
39 | # Where the magic happens:
40 | setup(
41 | name=NAME,
42 | version=about["__version__"],
43 | description=DESCRIPTION,
44 | long_description=long_description,
45 | long_description_content_type="text/markdown",
46 | author=AUTHOR,
47 | author_email=EMAIL,
48 | python_requires=REQUIRES_PYTHON,
49 | url=URL,
50 | packages=find_packages(exclude=("tests",)),
51 | package_data={"regression_model": ["VERSION"]},
52 | install_requires=list_reqs(),
53 | extras_require={},
54 | include_package_data=True,
55 | license="BSD-3",
56 | classifiers=[
57 | # Trove classifiers
58 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
59 |         "License :: OSI Approved :: BSD License",
60 | "Programming Language :: Python",
61 | "Programming Language :: Python :: 3",
62 | "Programming Language :: Python :: 3.6",
63 | "Programming Language :: Python :: 3.7",
64 | "Programming Language :: Python :: 3.8",
65 | "Programming Language :: Python :: 3.9",
66 | "Programming Language :: Python :: Implementation :: CPython",
67 | "Programming Language :: Python :: Implementation :: PyPy",
68 | ],
69 | )
--------------------------------------------------------------------------------
/section-07-ci-and-publishing/model-package/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from pathlib import Path
5 |
6 | from setuptools import find_packages, setup
7 |
8 | # Package meta-data.
9 | NAME = 'tid-regression-model'
10 | DESCRIPTION = "Example regression model package from Train In Data."
11 | URL = "https://github.com/trainindata/testing-and-monitoring-ml-deployments"
12 | EMAIL = "christopher.samiullah@protonmail.com"
13 | AUTHOR = "ChristopherGS"
14 | REQUIRES_PYTHON = ">=3.6.0"
15 |
16 |
17 | # The rest you shouldn't have to touch too much :)
18 | # ------------------------------------------------
19 | # Except, perhaps the License and Trove Classifiers!
20 | # If you do change the License, remember to change the
21 | # Trove Classifier for that!
22 | long_description = DESCRIPTION
23 |
24 | # Load the package's VERSION file as a dictionary.
25 | about = {}
26 | ROOT_DIR = Path(__file__).resolve().parent
27 | REQUIREMENTS_DIR = ROOT_DIR / 'requirements'
28 | PACKAGE_DIR = ROOT_DIR / 'regression_model'
29 | with open(PACKAGE_DIR / "VERSION") as f:
30 | _version = f.read().strip()
31 | about["__version__"] = _version
32 |
33 |
34 | # What packages are required for this module to be executed?
35 | def list_reqs(fname="requirements.txt"):
36 | with open(REQUIREMENTS_DIR / fname) as fd:
37 | return fd.read().splitlines()
38 |
39 | # Where the magic happens:
40 | setup(
41 | name=NAME,
42 | version=about["__version__"],
43 | description=DESCRIPTION,
44 | long_description=long_description,
45 | long_description_content_type="text/markdown",
46 | author=AUTHOR,
47 | author_email=EMAIL,
48 | python_requires=REQUIRES_PYTHON,
49 | url=URL,
50 | packages=find_packages(exclude=("tests",)),
51 | package_data={"regression_model": ["VERSION"]},
52 | install_requires=list_reqs(),
53 | extras_require={},
54 | include_package_data=True,
55 | license="BSD-3",
56 | classifiers=[
57 | # Trove classifiers
58 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
59 |         "License :: OSI Approved :: BSD License",
60 | "Programming Language :: Python",
61 | "Programming Language :: Python :: 3",
62 | "Programming Language :: Python :: 3.6",
63 | "Programming Language :: Python :: 3.7",
64 | "Programming Language :: Python :: 3.8",
65 | "Programming Language :: Python :: 3.9",
66 | "Programming Language :: Python :: Implementation :: CPython",
67 | "Programming Language :: Python :: Implementation :: PyPy",
68 | ],
69 | )
--------------------------------------------------------------------------------
/assignment-section-05/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from pathlib import Path
5 |
6 | from setuptools import find_packages, setup
7 |
8 | # Package meta-data.
9 | NAME = 'tid-titanic-classification-model'
10 | DESCRIPTION = "Example Titanic dataset classification model package from Train In Data."
11 | URL = "https://github.com/trainindata/deploying-machine-learning-models"
12 | EMAIL = "christopher.samiullah@protonmail.com"
13 | AUTHOR = "ChristopherGS"
14 | REQUIRES_PYTHON = ">=3.7.0"
15 |
16 |
17 | # The rest you shouldn't have to touch too much :)
18 | # ------------------------------------------------
19 | # Except, perhaps the License and Trove Classifiers!
20 | # Trove Classifiers: https://pypi.org/classifiers/
21 | # If you do change the License, remember to change the
22 | # Trove Classifier for that!
23 | long_description = DESCRIPTION
24 |
25 | # Load the package's VERSION file as a dictionary.
26 | about = {}
27 | ROOT_DIR = Path(__file__).resolve().parent
28 | REQUIREMENTS_DIR = ROOT_DIR / 'requirements'
29 | PACKAGE_DIR = ROOT_DIR / 'classification_model'
30 | with open(PACKAGE_DIR / "VERSION") as f:
31 | _version = f.read().strip()
32 | about["__version__"] = _version
33 |
34 |
35 | # What packages are required for this module to be executed?
36 | def list_reqs(fname="requirements.txt"):
37 | with open(REQUIREMENTS_DIR / fname) as fd:
38 | return fd.read().splitlines()
39 |
40 | # Where the magic happens:
41 | setup(
42 | name=NAME,
43 | version=about["__version__"],
44 | description=DESCRIPTION,
45 | long_description=long_description,
46 | long_description_content_type="text/markdown",
47 | author=AUTHOR,
48 | author_email=EMAIL,
49 | python_requires=REQUIRES_PYTHON,
50 | url=URL,
51 | packages=find_packages(exclude=("tests",)),
52 | package_data={"classification_model": ["VERSION"]},
53 | install_requires=list_reqs(),
54 | extras_require={},
55 | include_package_data=True,
56 | license="BSD-3",
57 | classifiers=[
58 | # Trove classifiers
59 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
60 |         "License :: OSI Approved :: BSD License",
61 | "Programming Language :: Python",
62 | "Programming Language :: Python :: 3",
63 | "Programming Language :: Python :: 3.7",
64 | "Programming Language :: Python :: 3.8",
65 | "Programming Language :: Python :: 3.9",
66 | "Programming Language :: Python :: Implementation :: CPython",
67 | "Programming Language :: Python :: Implementation :: PyPy",
68 | ],
69 | )
--------------------------------------------------------------------------------
/packages/neural_network_model/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import io
5 | import os
6 | from pathlib import Path
7 |
8 | from setuptools import find_packages, setup
9 |
10 |
11 | # Package meta-data.
12 | NAME = 'neural_network_model'
13 | DESCRIPTION = 'Train and deploy neural network model.'
14 | URL = 'your github project'
15 | EMAIL = 'your_email@email.com'
16 | AUTHOR = 'Your name'
17 | REQUIRES_PYTHON = '>=3.6.0'
18 |
19 |
20 | # What packages are required for this module to be executed?
21 | def list_reqs(fname='requirements.txt'):
22 | with open(fname) as fd:
23 | return fd.read().splitlines()
24 |
25 |
26 | # The rest you shouldn't have to touch too much :)
27 | # ------------------------------------------------
28 | # Except, perhaps the License and Trove Classifiers!
29 | # If you do change the License, remember to change the
30 | # Trove Classifier for that!
31 |
32 | here = os.path.abspath(os.path.dirname(__file__))
33 |
34 | # Import the README and use it as the long-description.
35 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file!
36 | try:
37 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
38 | long_description = '\n' + f.read()
39 | except FileNotFoundError:
40 | long_description = DESCRIPTION
41 |
42 |
43 | # Load the package's __version__.py module as a dictionary.
44 | ROOT_DIR = Path(__file__).resolve().parent
45 | PACKAGE_DIR = ROOT_DIR / NAME
46 | about = {}
47 | with open(PACKAGE_DIR / 'VERSION') as f:
48 | _version = f.read().strip()
49 | about['__version__'] = _version
50 |
51 |
52 | # Where the magic happens:
53 | setup(
54 | name=NAME,
55 | version=about['__version__'],
56 | description=DESCRIPTION,
57 | long_description=long_description,
58 | long_description_content_type='text/markdown',
59 | author=AUTHOR,
60 | author_email=EMAIL,
61 | python_requires=REQUIRES_PYTHON,
62 | url=URL,
63 | packages=find_packages(exclude=('tests',)),
64 | package_data={'neural_network_model': ['VERSION']},
65 | install_requires=list_reqs(),
66 | extras_require={},
67 | include_package_data=True,
68 | license='MIT',
69 | classifiers=[
70 | # Trove classifiers
71 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
72 | 'License :: OSI Approved :: MIT License',
73 | 'Programming Language :: Python',
74 | 'Programming Language :: Python :: 3',
75 | 'Programming Language :: Python :: 3.6',
76 | 'Programming Language :: Python :: Implementation :: CPython',
77 | 'Programming Language :: Python :: Implementation :: PyPy'
78 | ],
79 | )
80 |
--------------------------------------------------------------------------------
/packages/regression_model/regression_model/config/config.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 |
3 | import regression_model
4 |
5 | import pandas as pd
6 |
7 |
8 | pd.options.display.max_rows = 10
9 | pd.options.display.max_columns = 10
10 |
11 |
12 | PACKAGE_ROOT = pathlib.Path(regression_model.__file__).resolve().parent
13 | TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models"
14 | DATASET_DIR = PACKAGE_ROOT / "datasets"
15 |
16 | # data
17 | TESTING_DATA_FILE = "test.csv"
18 | TRAINING_DATA_FILE = "train.csv"
19 | TARGET = "SalePrice"
20 |
21 |
22 | # variables
23 | FEATURES = [
24 | "MSSubClass",
25 | "MSZoning",
26 | "Neighborhood",
27 | "OverallQual",
28 | "OverallCond",
29 | "YearRemodAdd",
30 | "RoofStyle",
31 | "MasVnrType",
32 | "BsmtQual",
33 | "BsmtExposure",
34 | "HeatingQC",
35 | "CentralAir",
36 | "1stFlrSF",
37 | "GrLivArea",
38 | "BsmtFullBath",
39 | "KitchenQual",
40 | "Fireplaces",
41 | "FireplaceQu",
42 | "GarageType",
43 | "GarageFinish",
44 | "GarageCars",
45 | "PavedDrive",
46 | "LotFrontage",
47 |     # this one is only used to calculate the temporal variable:
48 | "YrSold",
49 | ]
50 |
51 | # this variable is used to calculate the temporal variable,
52 | # and can be dropped afterwards
53 | DROP_FEATURES = "YrSold"
54 |
55 | # numerical variables with NA in train set
56 | NUMERICAL_VARS_WITH_NA = ["LotFrontage"]
57 |
58 | # categorical variables with NA in train set
59 | CATEGORICAL_VARS_WITH_NA = [
60 | "MasVnrType",
61 | "BsmtQual",
62 | "BsmtExposure",
63 | "FireplaceQu",
64 | "GarageType",
65 | "GarageFinish",
66 | ]
67 |
68 | TEMPORAL_VARS = "YearRemodAdd"
69 |
70 | # variables to log transform
71 | NUMERICALS_LOG_VARS = ["LotFrontage", "1stFlrSF", "GrLivArea"]
72 |
73 | # categorical variables to encode
74 | CATEGORICAL_VARS = [
75 | "MSZoning",
76 | "Neighborhood",
77 | "RoofStyle",
78 | "MasVnrType",
79 | "BsmtQual",
80 | "BsmtExposure",
81 | "HeatingQC",
82 | "CentralAir",
83 | "KitchenQual",
84 | "FireplaceQu",
85 | "GarageType",
86 | "GarageFinish",
87 | "PavedDrive",
88 | ]
89 |
90 | NUMERICAL_NA_NOT_ALLOWED = [
91 | feature
92 | for feature in FEATURES
93 | if feature not in CATEGORICAL_VARS + NUMERICAL_VARS_WITH_NA
94 | ]
95 |
96 | CATEGORICAL_NA_NOT_ALLOWED = [
97 | feature for feature in CATEGORICAL_VARS if feature not in CATEGORICAL_VARS_WITH_NA
98 | ]
99 |
100 |
101 | PIPELINE_NAME = "lasso_regression"
102 | PIPELINE_SAVE_FILE = f"{PIPELINE_NAME}_output_v"
103 |
104 | # used for differential testing
105 | ACCEPTABLE_MODEL_DIFFERENCE = 0.05
106 |
--------------------------------------------------------------------------------
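The two *_NA_NOT_ALLOWED lists are derived for input validation: the pipeline can only impute NA in the variables listed under *_VARS_WITH_NA, so missing values anywhere else must be rejected before prediction. A minimal sketch of such a check (the helper name is hypothetical; it assumes a pandas DataFrame `df` of raw model inputs):

    import pandas as pd
    from regression_model.config import config

    def has_unexpected_na(df: pd.DataFrame) -> bool:
        # NA in these columns cannot be handled by the trained pipeline
        numerical_bad = df[config.NUMERICAL_NA_NOT_ALLOWED].isnull().any().any()
        categorical_bad = df[config.CATEGORICAL_NA_NOT_ALLOWED].isnull().any().any()
        return bool(numerical_bad or categorical_bad)
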
/section-07-ci-and-publishing/model-package/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox is a generic virtualenv management and test command line tool. Its goal is to
2 | # standardize testing in Python. We will be using it extensively in this course.
3 |
4 | # Using Tox we can (on multiple operating systems):
5 | # + Eliminate PYTHONPATH challenges when running scripts/tests
6 | # + Eliminate virtualenv setup confusion
7 | # + Streamline steps such as model training, model publishing
8 |
9 |
10 | [tox]
11 | envlist = test_package, typechecks, stylechecks, lint
12 | skipsdist = True
13 |
14 | [testenv]
15 | install_command = pip install {opts} {packages}
16 |
17 | passenv =
18 | KAGGLE_USERNAME
19 | KAGGLE_KEY
20 | GEMFURY_PUSH_URL
21 |
22 | [testenv:test_package]
23 | deps =
24 | -rrequirements/test_requirements.txt
25 |
26 | setenv =
27 | PYTHONPATH=.
28 | PYTHONHASHSEED=0
29 |
30 | commands=
31 | python regression_model/train_pipeline.py
32 | pytest \
33 | -s \
34 | -vv \
35 | {posargs:tests/}
36 |
37 | [testenv:train]
38 | envdir = {toxworkdir}/test_package
39 | deps =
40 | {[testenv:test_package]deps}
41 |
42 | setenv =
43 | {[testenv:test_package]setenv}
44 |
45 | commands=
46 | python regression_model/train_pipeline.py
47 |
48 | [testenv:fetch_data]
49 | envdir = {toxworkdir}/test_package
50 | deps =
51 | {[testenv:test_package]deps}
52 |
53 | setenv =
54 | {[testenv:test_package]setenv}
55 |
56 | commands=
57 | # fetch
58 | kaggle competitions download -c house-prices-advanced-regression-techniques -p ./regression_model/datasets
59 | # unzip
60 | unzip ./regression_model/datasets/house-prices-advanced-regression-techniques.zip -d ./regression_model/datasets
61 |
62 |
63 | [testenv:publish_model]
64 | envdir = {toxworkdir}/test_package
65 | deps =
66 | {[testenv:test_package]deps}
67 |
68 | setenv =
69 | {[testenv:test_package]setenv}
70 |
71 | commands=
72 | python regression_model/train_pipeline.py
73 | ./publish_model.sh .
74 |
75 |
76 | [testenv:typechecks]
77 | envdir = {toxworkdir}/test_package
78 |
79 | deps =
80 | {[testenv:test_package]deps}
81 |
82 | commands = {posargs:mypy regression_model}
83 |
84 |
85 | [testenv:stylechecks]
86 | envdir = {toxworkdir}/test_package
87 |
88 | deps =
89 | {[testenv:test_package]deps}
90 |
91 | commands = {posargs:flake8 regression_model tests}
92 |
93 |
94 | [testenv:lint]
95 | envdir = {toxworkdir}/test_package
96 |
97 | deps =
98 | {[testenv:test_package]deps}
99 |
100 | commands =
101 | isort regression_model tests
102 | black regression_model tests
103 | mypy regression_model
104 | flake8 regression_model
105 |
106 | [flake8]
107 | exclude = .git,env
108 | max-line-length = 90
--------------------------------------------------------------------------------
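Typical invocations for the environments defined above (standard tox commands, run from the model-package directory):

    tox                    # run the full envlist: test_package, typechecks, stylechecks, lint
    tox -e fetch_data      # download and unzip the Kaggle dataset (needs KAGGLE_USERNAME / KAGGLE_KEY)
    tox -e train           # train the regression pipeline only
    tox -e publish_model   # retrain, then push the package via publish_model.sh (needs GEMFURY_PUSH_URL)
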
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other info into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | # pycharm
107 | .idea/
108 |
109 | # datafiles
110 | packages/regression_model/regression_model/datasets/*.csv
111 | packages/regression_model/regression_model/datasets/*.zip
112 | packages/regression_model/regression_model/datasets/*.txt
113 | train.csv
114 | test.csv
115 | raw.csv
116 | data_description.txt
117 | house-prices-advanced-regression-techniques.zip
118 | sample_submission.csv
119 | test_data_predictions.csv
120 | v2-plant-seedlings-dataset/
121 | v2-plant-seedlings-dataset.zip
122 |
123 | # all logs
124 | logs/
125 |
126 | # trained models (will be created in CI)
127 | section-05-production-model-package/regression_model/trained_models/*.pkl
128 | packages/regression_model/regression_model/trained_models/*.pkl
129 | packages/neural_network_model/neural_network_model/trained_models/*.pkl
130 | packages/neural_network_model/neural_network_model/trained_models/*.h5
131 | *.h5
132 | packages/neural_network_model/neural_network_model/datasets/training_data_reference.txt
133 | *.pkl
134 |
135 | .DS_Store
136 |
137 | kaggle.json
138 | packages/ml_api/uploads/*
139 |
--------------------------------------------------------------------------------
/packages/regression_model/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import io
5 | import os
6 | from pathlib import Path
7 |
8 | from setuptools import find_packages, setup
9 |
10 |
11 | # Package meta-data.
12 | NAME = 'regression_model'
13 | DESCRIPTION = 'Regression model for use in the Train In Data online course "Deployment of Machine Learning Models".'
14 | URL = 'https://github.com/trainindata/deploying-machine-learning-models'
15 | EMAIL = 'christopher.samiullah@protonmail.com'
16 | AUTHOR = 'ChristopherGS'
17 | REQUIRES_PYTHON = '>=3.6.0'
18 |
19 |
20 | # Packages that are required for this module to be executed
21 | def list_reqs(fname='requirements.txt'):
22 | with open(fname) as fd:
23 | return fd.read().splitlines()
24 |
25 |
26 | # The rest you shouldn't have to touch too much :)
27 | # ------------------------------------------------
28 | # Except, perhaps the License and Trove Classifiers!
29 | # If you do change the License, remember to change the
30 | # Trove Classifier for that!
31 |
32 | here = os.path.abspath(os.path.dirname(__file__))
33 |
34 | # Import the README and use it as the long-description.
35 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file!
36 | try:
37 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
38 | long_description = '\n' + f.read()
39 | except FileNotFoundError:
40 | long_description = DESCRIPTION
41 |
42 |
43 | # Load the package's version from the VERSION file into the about dict.
44 | ROOT_DIR = Path(__file__).resolve().parent
45 | PACKAGE_DIR = ROOT_DIR / 'regression_model'
46 | about = {}
47 | with open(PACKAGE_DIR / 'VERSION') as f:
48 | _version = f.read().strip()
49 | about['__version__'] = _version
50 |
51 |
52 | # Where the magic happens:
53 | setup(
54 | name=NAME,
55 | version=about['__version__'],
56 | description=DESCRIPTION,
57 | long_description=long_description,
58 | long_description_content_type='text/markdown',
59 | author=AUTHOR,
60 | author_email=EMAIL,
61 | python_requires=REQUIRES_PYTHON,
62 | url=URL,
63 | packages=find_packages(exclude=('tests',)),
64 | package_data={'regression_model': ['VERSION']},
65 | install_requires=list_reqs(),
66 | extras_require={},
67 | include_package_data=True,
68 | license='BSD 3',
69 | classifiers=[
70 | # Trove classifiers
71 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
72 |         'License :: OSI Approved :: BSD License',  # keep in sync with license='BSD 3' above
73 | 'Programming Language :: Python',
74 | 'Programming Language :: Python :: 3',
75 | 'Programming Language :: Python :: 3.6',
76 | 'Programming Language :: Python :: 3.7',
77 | 'Programming Language :: Python :: 3.8',
78 | 'Programming Language :: Python :: Implementation :: CPython',
79 | 'Programming Language :: Python :: Implementation :: PyPy'
80 | ],
81 | )
82 |
--------------------------------------------------------------------------------
/packages/neural_network_model/neural_network_model/model.py:
--------------------------------------------------------------------------------
1 | # for the convolutional network
2 | from keras.models import Sequential
3 | from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
4 | from keras.optimizers import Adam
5 | from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
6 | from keras.wrappers.scikit_learn import KerasClassifier
7 |
8 | from neural_network_model.config import config
9 |
10 |
11 | def cnn_model(kernel_size=(3, 3),
12 | pool_size=(2, 2),
13 | first_filters=32,
14 | second_filters=64,
15 | third_filters=128,
16 | dropout_conv=0.3,
17 | dropout_dense=0.3,
18 | image_size=50):
19 |
20 | model = Sequential()
21 | model.add(Conv2D(
22 | first_filters,
23 | kernel_size,
24 | activation='relu',
25 | input_shape=(image_size, image_size, 3)))
26 |     model.add(Conv2D(first_filters, kernel_size, activation='relu'))
27 | model.add(MaxPooling2D(pool_size=pool_size))
28 | model.add(Dropout(dropout_conv))
29 |
30 | model.add(Conv2D(second_filters, kernel_size, activation='relu'))
31 |     model.add(Conv2D(second_filters, kernel_size, activation='relu'))
32 | model.add(MaxPooling2D(pool_size=pool_size))
33 | model.add(Dropout(dropout_conv))
34 |
35 | model.add(Conv2D(third_filters, kernel_size, activation='relu'))
36 |     model.add(Conv2D(third_filters, kernel_size, activation='relu'))
37 | model.add(MaxPooling2D(pool_size=pool_size))
38 | model.add(Dropout(dropout_conv))
39 |
40 | model.add(Flatten())
41 | model.add(Dense(256, activation="relu"))
42 | model.add(Dropout(dropout_dense))
43 | model.add(Dense(12, activation="softmax"))
44 |
45 | model.compile(Adam(lr=0.0001),
46 |                   loss='categorical_crossentropy',  # 12-class softmax output needs categorical, not binary, cross-entropy
47 | metrics=['accuracy'])
48 |
49 | return model
50 |
51 |
52 | checkpoint = ModelCheckpoint(config.MODEL_PATH,
53 | monitor='acc',
54 | verbose=1,
55 | save_best_only=True,
56 | mode='max')
57 |
58 | reduce_lr = ReduceLROnPlateau(monitor='acc',
59 | factor=0.5,
60 | patience=2,
61 | verbose=1,
62 | mode='max',
63 | min_lr=0.00001)
64 |
65 | callbacks_list = [checkpoint, reduce_lr]
66 |
67 | cnn_clf = KerasClassifier(build_fn=cnn_model,
68 | batch_size=config.BATCH_SIZE,
69 |                           validation_split=0.1,  # must be a fraction in [0, 1)
70 | epochs=config.EPOCHS,
71 | verbose=1, # progress bar - required for CI job
72 | callbacks=callbacks_list,
73 | image_size=config.IMAGE_SIZE
74 | )
75 |
76 |
77 | if __name__ == '__main__':
78 | model = cnn_model()
79 | model.summary()
80 |
--------------------------------------------------------------------------------
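A minimal usage sketch for the wrapped classifier, assuming `X` is an array of shape (n_samples, IMAGE_SIZE, IMAGE_SIZE, 3) with scaled pixel values and `y` is one-hot encoded over the 12 classes (in the package itself the classifier is presumably fitted inside the pipeline defined in pipeline.py):

    # illustrative only
    cnn_clf.fit(X, y)                          # trains with the checkpoint / reduce-LR callbacks
    probabilities = cnn_clf.predict_proba(X[:5])
    classes = cnn_clf.predict(X[:5])
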
/section-04-research-and-development/preprocessors_bonus.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.base import BaseEstimator, TransformerMixin
4 |
5 |
6 | class MeanImputer(BaseEstimator, TransformerMixin):
7 | """Numerical missing value imputer."""
8 |
9 | def __init__(self, variables):
10 | if not isinstance(variables, list):
11 | raise ValueError('variables should be a list')
12 | self.variables = variables
13 |
14 | def fit(self, X, y=None):
15 | # persist mean values in a dictionary
16 | self.imputer_dict_ = X[self.variables].mean().to_dict()
17 | return self
18 |
19 | def transform(self, X):
20 | X = X.copy()
21 | for feature in self.variables:
22 | X[feature].fillna(self.imputer_dict_[feature],
23 | inplace=True)
24 | return X
25 |
26 |
27 |
28 | class RareLabelCategoricalEncoder(BaseEstimator, TransformerMixin):
29 | """Groups infrequent categories into a single string"""
30 |
31 |     def __init__(self, variables, tol=0.05):
32 |
33 | if not isinstance(variables, list):
34 | raise ValueError('variables should be a list')
35 |
36 | self.tol = tol
37 | self.variables = variables
38 |
39 | def fit(self, X, y=None):
40 | # persist frequent labels in dictionary
41 | self.encoder_dict_ = {}
42 |
43 | for var in self.variables:
44 | # the encoder will learn the most frequent categories
45 |             t = pd.Series(X[var].value_counts(normalize=True))
46 | # frequent labels:
47 | self.encoder_dict_[var] = list(t[t >= self.tol].index)
48 |
49 | return self
50 |
51 | def transform(self, X):
52 | X = X.copy()
53 | for feature in self.variables:
54 | X[feature] = np.where(
55 | X[feature].isin(self.encoder_dict_[feature]),
56 | X[feature], "Rare")
57 |
58 | return X
59 |
60 |
61 | class CategoricalEncoder(BaseEstimator, TransformerMixin):
62 | """String to numbers categorical encoder."""
63 |
64 | def __init__(self, variables):
65 |
66 | if not isinstance(variables, list):
67 | raise ValueError('variables should be a list')
68 |
69 | self.variables = variables
70 |
71 | def fit(self, X, y):
72 | temp = pd.concat([X, y], axis=1)
73 | temp.columns = list(X.columns) + ["target"]
74 |
75 | # persist transforming dictionary
76 | self.encoder_dict_ = {}
77 |
78 | for var in self.variables:
79 | t = temp.groupby([var])["target"].mean().sort_values(ascending=True).index
80 | self.encoder_dict_[var] = {k: i for i, k in enumerate(t, 0)}
81 |
82 | return self
83 |
84 | def transform(self, X):
85 | # encode labels
86 | X = X.copy()
87 | for feature in self.variables:
88 | X[feature] = X[feature].map(self.encoder_dict_[feature])
89 |
90 | return X
--------------------------------------------------------------------------------
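These transformers follow the scikit-learn fit/transform contract, so they compose inside a Pipeline. A sketch with hypothetical training and test frames `X_train`, `y_train`, `X_test` (the column names come from the house-prices feature lists elsewhere in this repo):

    from sklearn.pipeline import Pipeline

    pipe = Pipeline([
        ('mean_imputer', MeanImputer(variables=['LotFrontage'])),
        ('rare_encoder', RareLabelCategoricalEncoder(variables=['MSZoning'], tol=0.05)),
        ('cat_encoder', CategoricalEncoder(variables=['MSZoning'])),
    ])
    pipe.fit(X_train, y_train)       # y is needed by CategoricalEncoder's target-mean ordering
    X_transformed = pipe.transform(X_test)
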
/packages/ml_api/tests/test_controller.py:
--------------------------------------------------------------------------------
1 | import io
2 | import json
3 | import math
4 | import os
5 |
6 | from neural_network_model.config import config as cnn_config
7 | from regression_model import __version__ as _version
8 | from regression_model.config import config as model_config
9 | from regression_model.processing.data_management import load_dataset
10 |
11 | from api import __version__ as api_version
12 |
13 |
14 | def test_health_endpoint_returns_200(flask_test_client):
15 | # When
16 | response = flask_test_client.get('/health')
17 |
18 | # Then
19 | assert response.status_code == 200
20 |
21 |
22 | def test_version_endpoint_returns_version(flask_test_client):
23 | # When
24 | response = flask_test_client.get('/version')
25 |
26 | # Then
27 | assert response.status_code == 200
28 | response_json = json.loads(response.data)
29 | assert response_json['model_version'] == _version
30 | assert response_json['api_version'] == api_version
31 |
32 |
33 | def test_prediction_endpoint_returns_prediction(flask_test_client):
34 | # Given
35 |     # Load the test data from the regression_model package.
36 |     # This is important: keeping the test data inside the model
37 |     # package prevents test data versions from drifting apart
38 |     # across packages.
39 | test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE)
40 | post_json = test_data[0:1].to_json(orient='records')
41 |
42 | # When
43 | response = flask_test_client.post('/v1/predict/regression',
44 | json=json.loads(post_json))
45 |
46 | # Then
47 | assert response.status_code == 200
48 | response_json = json.loads(response.data)
49 | prediction = response_json['predictions']
50 | response_version = response_json['version']
51 | assert math.ceil(prediction[0]) == 112476
52 | assert response_version == _version
53 |
54 |
55 | def test_classifier_endpoint_returns_prediction(flask_test_client):
56 | # Given
57 |     # Load the test data from the neural_network_model package.
58 |     # This is important: keeping the test data inside the model
59 |     # package prevents test data versions from drifting apart
60 |     # across packages.
61 |     data_dir = os.path.abspath(os.path.join(cnn_config.DATA_FOLDER, os.pardir))
62 | test_dir = os.path.join(data_dir, 'test_data')
63 | black_grass_dir = os.path.join(test_dir, 'Black-grass')
64 | black_grass_image = os.path.join(black_grass_dir, '1.png')
65 | with open(black_grass_image, "rb") as image_file:
66 | file_bytes = image_file.read()
67 | data = dict(
68 | file=(io.BytesIO(bytearray(file_bytes)), "1.png"),
69 | )
70 |
71 | # When
72 | response = flask_test_client.post('/predict/classifier',
73 | content_type='multipart/form-data',
74 | data=data)
75 |
76 | # Then
77 | assert response.status_code == 200
78 | response_json = json.loads(response.data)
79 | assert response_json['readable_predictions']
80 |
--------------------------------------------------------------------------------
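The flask_test_client fixture is supplied by the package's conftest.py via the standard pytest mechanism. With the ml_api requirements installed, the suite can be run directly, for example:

    cd packages/ml_api
    pytest tests/test_controller.py -vv

(or via tox, which also takes care of PYTHONPATH and environment variables).
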
/section-05-production-model-package/regression_model/config/core.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Dict, List, Optional, Sequence
3 |
4 | from pydantic import BaseModel
5 | from strictyaml import YAML, load
6 |
7 | import regression_model
8 |
9 | # Project Directories
10 | PACKAGE_ROOT = Path(regression_model.__file__).resolve().parent
11 | ROOT = PACKAGE_ROOT.parent
12 | CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml"
13 | DATASET_DIR = PACKAGE_ROOT / "datasets"
14 | TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models"
15 |
16 |
17 | class AppConfig(BaseModel):
18 | """
19 | Application-level config.
20 | """
21 |
22 | package_name: str
23 | training_data_file: str
24 | test_data_file: str
25 | pipeline_save_file: str
26 |
27 |
28 | class ModelConfig(BaseModel):
29 | """
30 | All configuration relevant to model
31 | training and feature engineering.
32 | """
33 |
34 | target: str
35 | variables_to_rename: Dict
36 | features: List[str]
37 | test_size: float
38 | random_state: int
39 | alpha: float
40 | categorical_vars_with_na_frequent: List[str]
41 | categorical_vars_with_na_missing: List[str]
42 | numerical_vars_with_na: List[str]
43 | temporal_vars: List[str]
44 | ref_var: str
45 | numericals_log_vars: Sequence[str]
46 | binarize_vars: Sequence[str]
47 | qual_vars: List[str]
48 | exposure_vars: List[str]
49 | finish_vars: List[str]
50 | garage_vars: List[str]
51 | categorical_vars: Sequence[str]
52 | qual_mappings: Dict[str, int]
53 | exposure_mappings: Dict[str, int]
54 | garage_mappings: Dict[str, int]
55 | finish_mappings: Dict[str, int]
56 |
57 |
58 | class Config(BaseModel):
59 | """Master config object."""
60 |
61 | app_config: AppConfig
62 | model_config: ModelConfig
63 |
64 |
65 | def find_config_file() -> Path:
66 | """Locate the configuration file."""
67 | if CONFIG_FILE_PATH.is_file():
68 | return CONFIG_FILE_PATH
69 | raise Exception(f"Config not found at {CONFIG_FILE_PATH!r}")
70 |
71 |
72 | def fetch_config_from_yaml(cfg_path: Optional[Path] = None) -> YAML:
73 | """Parse YAML containing the package configuration."""
74 |
75 | if not cfg_path:
76 | cfg_path = find_config_file()
77 |
78 | if cfg_path:
79 | with open(cfg_path, "r") as conf_file:
80 | parsed_config = load(conf_file.read())
81 | return parsed_config
82 | raise OSError(f"Did not find config file at path: {cfg_path}")
83 |
84 |
85 | def create_and_validate_config(parsed_config: Optional[YAML] = None) -> Config:
86 | """Run validation on config values."""
87 | if parsed_config is None:
88 | parsed_config = fetch_config_from_yaml()
89 |
90 | # specify the data attribute from the strictyaml YAML type.
91 | _config = Config(
92 | app_config=AppConfig(**parsed_config.data),
93 | model_config=ModelConfig(**parsed_config.data),
94 | )
95 |
96 | return _config
97 |
98 |
99 | config = create_and_validate_config()
100 |
--------------------------------------------------------------------------------
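Once imported, the validated config is plain attribute access. A quick sketch, using values from the config.yml shown later in this dump:

    from regression_model.config.core import config

    config.app_config.package_name    # 'regression_model'
    config.model_config.alpha         # 0.001
    config.model_config.target        # 'SalePrice'
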
/section-07-ci-and-publishing/model-package/regression_model/config/core.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Dict, List, Optional, Sequence
3 |
4 | from pydantic import BaseModel
5 | from strictyaml import YAML, load
6 |
7 | import regression_model
8 |
9 | # Project Directories
10 | PACKAGE_ROOT = Path(regression_model.__file__).resolve().parent
11 | ROOT = PACKAGE_ROOT.parent
12 | CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml"
13 | DATASET_DIR = PACKAGE_ROOT / "datasets"
14 | TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models"
15 |
16 |
17 | class AppConfig(BaseModel):
18 | """
19 | Application-level config.
20 | """
21 |
22 | package_name: str
23 | training_data_file: str
24 | test_data_file: str
25 | pipeline_save_file: str
26 |
27 |
28 | class ModelConfig(BaseModel):
29 | """
30 | All configuration relevant to model
31 | training and feature engineering.
32 | """
33 |
34 | target: str
35 | variables_to_rename: Dict
36 | features: List[str]
37 | test_size: float
38 | random_state: int
39 | alpha: float
40 | categorical_vars_with_na_frequent: List[str]
41 | categorical_vars_with_na_missing: List[str]
42 | numerical_vars_with_na: List[str]
43 | temporal_vars: List[str]
44 | ref_var: str
45 | numericals_log_vars: Sequence[str]
46 | binarize_vars: Sequence[str]
47 | qual_vars: List[str]
48 | exposure_vars: List[str]
49 | finish_vars: List[str]
50 | garage_vars: List[str]
51 | categorical_vars: Sequence[str]
52 | qual_mappings: Dict[str, int]
53 | exposure_mappings: Dict[str, int]
54 | garage_mappings: Dict[str, int]
55 | finish_mappings: Dict[str, int]
56 |
57 |
58 | class Config(BaseModel):
59 | """Master config object."""
60 |
61 | app_config: AppConfig
62 | model_config: ModelConfig
63 |
64 |
65 | def find_config_file() -> Path:
66 | """Locate the configuration file."""
67 | if CONFIG_FILE_PATH.is_file():
68 | return CONFIG_FILE_PATH
69 | raise Exception(f"Config not found at {CONFIG_FILE_PATH!r}")
70 |
71 |
72 | def fetch_config_from_yaml(cfg_path: Optional[Path] = None) -> YAML:
73 | """Parse YAML containing the package configuration."""
74 |
75 | if not cfg_path:
76 | cfg_path = find_config_file()
77 |
78 | if cfg_path:
79 | with open(cfg_path, "r") as conf_file:
80 | parsed_config = load(conf_file.read())
81 | return parsed_config
82 | raise OSError(f"Did not find config file at path: {cfg_path}")
83 |
84 |
85 | def create_and_validate_config(parsed_config: Optional[YAML] = None) -> Config:
86 | """Run validation on config values."""
87 | if parsed_config is None:
88 | parsed_config = fetch_config_from_yaml()
89 |
90 | # specify the data attribute from the strictyaml YAML type.
91 | _config = Config(
92 | app_config=AppConfig(**parsed_config.data),
93 | model_config=ModelConfig(**parsed_config.data),
94 | )
95 |
96 | return _config
97 |
98 |
99 | config = create_and_validate_config()
100 |
--------------------------------------------------------------------------------
/section-05-production-model-package/regression_model/config.yml:
--------------------------------------------------------------------------------
1 | # Package Overview
2 | package_name: regression_model
3 |
4 | # Data Files
5 | training_data_file: train.csv
6 | test_data_file: test.csv
7 |
8 | # Variables
9 | # The variable we are attempting to predict (sale price)
10 | target: SalePrice
11 |
12 | pipeline_name: regression_model
13 | pipeline_save_file: regression_model_output_v
14 |
15 | # Renamed because identifiers beginning with numbers are invalid Python syntax
16 | variables_to_rename:
17 | 1stFlrSF: FirstFlrSF
18 | 2ndFlrSF: SecondFlrSF
19 | 3SsnPorch: ThreeSsnPortch
20 |
21 | features:
22 | - MSSubClass
23 | - MSZoning
24 | - LotFrontage
25 | - LotShape
26 | - LandContour
27 | - LotConfig
28 | - Neighborhood
29 | - OverallQual
30 | - OverallCond
31 | - YearRemodAdd
32 | - RoofStyle
33 | - Exterior1st
34 | - ExterQual
35 | - Foundation
36 | - BsmtQual
37 | - BsmtExposure
38 | - BsmtFinType1
39 | - HeatingQC
40 | - CentralAir
41 | - FirstFlrSF # renamed
42 | - SecondFlrSF # renamed
43 | - GrLivArea
44 | - BsmtFullBath
45 | - HalfBath
46 | - KitchenQual
47 | - TotRmsAbvGrd
48 | - Functional
49 | - Fireplaces
50 | - FireplaceQu
51 | - GarageFinish
52 | - GarageCars
53 | - GarageArea
54 | - PavedDrive
55 | - WoodDeckSF
56 | - ScreenPorch
57 | - SaleCondition
58 | # this one is only to calculate temporal variable:
59 | - YrSold
60 |
61 | # set train/test split
62 | test_size: 0.1
63 |
64 | # to set the random seed
65 | random_state: 0
66 |
67 | alpha: 0.001
68 |
69 | # categorical variables with NA in train set
70 | categorical_vars_with_na_frequent:
71 | - BsmtQual
72 | - BsmtExposure
73 | - BsmtFinType1
74 | - GarageFinish
75 |
76 | categorical_vars_with_na_missing:
77 | - FireplaceQu
78 |
79 | numerical_vars_with_na:
80 | - LotFrontage
81 |
82 | temporal_vars:
83 | - YearRemodAdd
84 |
85 | ref_var: YrSold
86 |
87 |
88 | # variables to log transform
89 | numericals_log_vars:
90 | - LotFrontage
91 | - FirstFlrSF
92 | - GrLivArea
93 |
94 | binarize_vars:
95 | - ScreenPorch
96 |
97 | # variables to map
98 | qual_vars:
99 | - ExterQual
100 | - BsmtQual
101 | - HeatingQC
102 | - KitchenQual
103 | - FireplaceQu
104 |
105 | exposure_vars:
106 | - BsmtExposure
107 |
108 | finish_vars:
109 | - BsmtFinType1
110 |
111 | garage_vars:
112 | - GarageFinish
113 |
114 | categorical_vars:
115 | - MSSubClass
116 | - MSZoning
117 | - LotShape
118 | - LandContour
119 | - LotConfig
120 | - Neighborhood
121 | - RoofStyle
122 | - Exterior1st
123 | - Foundation
124 | - CentralAir
125 | - Functional
126 | - PavedDrive
127 | - SaleCondition
128 |
129 | # variable mappings
130 | qual_mappings:
131 | Po: 1
132 | Fa: 2
133 | TA: 3
134 | Gd: 4
135 | Ex: 5
136 | Missing: 0
137 | NA: 0
138 |
139 | exposure_mappings:
140 | No: 1
141 | Mn: 2
142 | Av: 3
143 | Gd: 4
144 |
145 |
146 | finish_mappings:
147 | Missing: 0
148 | NA: 0
149 | Unf: 1
150 | LwQ: 2
151 | Rec: 3
152 | BLQ: 4
153 | ALQ: 5
154 | GLQ: 6
155 |
156 |
157 | garage_mappings:
158 | Missing: 0
159 | NA: 0
160 | Unf: 1
161 | RFn: 2
162 | Fin: 3
163 |
--------------------------------------------------------------------------------
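The variables_to_rename mapping exists because keys like 1stFlrSF cannot become Python identifiers or pydantic field names; validate_inputs applies it before schema validation. The effect, sketched on a hypothetical raw frame `df`:

    df = df.rename(columns=config.model_config.variables_to_rename)
    # '1stFlrSF' -> 'FirstFlrSF', '2ndFlrSF' -> 'SecondFlrSF', '3SsnPorch' -> 'ThreeSsnPortch'
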
/section-07-ci-and-publishing/model-package/regression_model/config.yml:
--------------------------------------------------------------------------------
1 | # Package Overview
2 | package_name: regression_model
3 |
4 | # Data Files
5 | training_data_file: train.csv
6 | test_data_file: test.csv
7 |
8 | # Variables
9 | # The variable we are attempting to predict (sale price)
10 | target: SalePrice
11 |
12 | pipeline_name: regression_model
13 | pipeline_save_file: regression_model_output_v
14 |
15 | # Renamed because identifiers beginning with numbers are invalid Python syntax
16 | variables_to_rename:
17 | 1stFlrSF: FirstFlrSF
18 | 2ndFlrSF: SecondFlrSF
19 | 3SsnPorch: ThreeSsnPortch
20 |
21 | features:
22 | - MSSubClass
23 | - MSZoning
24 | - LotFrontage
25 | - LotShape
26 | - LandContour
27 | - LotConfig
28 | - Neighborhood
29 | - OverallQual
30 | - OverallCond
31 | - YearRemodAdd
32 | - RoofStyle
33 | - Exterior1st
34 | - ExterQual
35 | - Foundation
36 | - BsmtQual
37 | - BsmtExposure
38 | - BsmtFinType1
39 | - HeatingQC
40 | - CentralAir
41 | - FirstFlrSF # renamed
42 | - SecondFlrSF # renamed
43 | - GrLivArea
44 | - BsmtFullBath
45 | - HalfBath
46 | - KitchenQual
47 | - TotRmsAbvGrd
48 | - Functional
49 | - Fireplaces
50 | - FireplaceQu
51 | - GarageFinish
52 | - GarageCars
53 | - GarageArea
54 | - PavedDrive
55 | - WoodDeckSF
56 | - ScreenPorch
57 | - SaleCondition
58 | # this one is only to calculate temporal variable:
59 | - YrSold
60 |
61 | # set train/test split
62 | test_size: 0.1
63 |
64 | # to set the random seed
65 | random_state: 0
66 |
67 | alpha: 0.001
68 |
69 | # categorical variables with NA in train set
70 | categorical_vars_with_na_frequent:
71 | - BsmtQual
72 | - BsmtExposure
73 | - BsmtFinType1
74 | - GarageFinish
75 |
76 | categorical_vars_with_na_missing:
77 | - FireplaceQu
78 |
79 | numerical_vars_with_na:
80 | - LotFrontage
81 |
82 | temporal_vars:
83 | - YearRemodAdd
84 |
85 | ref_var: YrSold
86 |
87 |
88 | # variables to log transform
89 | numericals_log_vars:
90 | - LotFrontage
91 | - FirstFlrSF
92 | - GrLivArea
93 |
94 | binarize_vars:
95 | - ScreenPorch
96 |
97 | # variables to map
98 | qual_vars:
99 | - ExterQual
100 | - BsmtQual
101 | - HeatingQC
102 | - KitchenQual
103 | - FireplaceQu
104 |
105 | exposure_vars:
106 | - BsmtExposure
107 |
108 | finish_vars:
109 | - BsmtFinType1
110 |
111 | garage_vars:
112 | - GarageFinish
113 |
114 | categorical_vars:
115 | - MSSubClass
116 | - MSZoning
117 | - LotShape
118 | - LandContour
119 | - LotConfig
120 | - Neighborhood
121 | - RoofStyle
122 | - Exterior1st
123 | - Foundation
124 | - CentralAir
125 | - Functional
126 | - PavedDrive
127 | - SaleCondition
128 |
129 | # variable mappings
130 | qual_mappings:
131 | Po: 1
132 | Fa: 2
133 | TA: 3
134 | Gd: 4
135 | Ex: 5
136 | Missing: 0
137 | NA: 0
138 |
139 | exposure_mappings:
140 | No: 1
141 | Mn: 2
142 | Av: 3
143 | Gd: 4
144 |
145 |
146 | finish_mappings:
147 | Missing: 0
148 | NA: 0
149 | Unf: 1
150 | LwQ: 2
151 | Rec: 3
152 | BLQ: 4
153 | ALQ: 5
154 | GLQ: 6
155 |
156 |
157 | garage_mappings:
158 | Missing: 0
159 | NA: 0
160 | Unf: 1
161 | RFn: 2
162 | Fin: 3
163 |
--------------------------------------------------------------------------------
/packages/ml_api/api/controller.py:
--------------------------------------------------------------------------------
1 | from flask import Blueprint, request, jsonify
2 | from regression_model.predict import make_prediction
3 | from regression_model import __version__ as _version
4 | from neural_network_model.predict import make_single_prediction
5 | import os
6 | from werkzeug.utils import secure_filename
7 |
8 | from api.config import get_logger, UPLOAD_FOLDER
9 | from api.validation import validate_inputs, allowed_file
10 | from api import __version__ as api_version
11 |
12 | _logger = get_logger(logger_name=__name__)
13 |
14 |
15 | prediction_app = Blueprint('prediction_app', __name__)
16 |
17 |
18 | @prediction_app.route('/health', methods=['GET'])
19 | def health():
20 | if request.method == 'GET':
21 | _logger.info('health status OK')
22 | return 'ok'
23 |
24 |
25 | @prediction_app.route('/version', methods=['GET'])
26 | def version():
27 | if request.method == 'GET':
28 | return jsonify({'model_version': _version,
29 | 'api_version': api_version})
30 |
31 |
32 | @prediction_app.route('/v1/predict/regression', methods=['POST'])
33 | def predict():
34 | if request.method == 'POST':
35 | # Step 1: Extract POST data from request body as JSON
36 | json_data = request.get_json()
37 | _logger.debug(f'Inputs: {json_data}')
38 |
39 | # Step 2: Validate the input using marshmallow schema
40 | input_data, errors = validate_inputs(input_data=json_data)
41 |
42 | # Step 3: Model prediction
43 | result = make_prediction(input_data=input_data)
44 | _logger.debug(f'Outputs: {result}')
45 |
46 | # Step 4: Convert numpy ndarray to list
47 | predictions = result.get('predictions').tolist()
48 | version = result.get('version')
49 |
50 | # Step 5: Return the response as JSON
51 | return jsonify({'predictions': predictions,
52 | 'version': version,
53 | 'errors': errors})
54 |
55 |
56 | @prediction_app.route('/predict/classifier', methods=['POST'])
57 | def predict_image():
58 | if request.method == 'POST':
59 | # Step 1: check if the post request has the file part
60 | if 'file' not in request.files:
61 | return jsonify('No file found'), 400
62 |
63 | file = request.files['file']
64 |
65 | # Step 2: Basic file extension validation
66 | if file and allowed_file(file.filename):
67 | filename = secure_filename(file.filename)
68 |
69 | # Step 3: Save the file
70 | # Note, in production, this would require careful
71 | # validation, management and clean up.
72 | file.save(os.path.join(UPLOAD_FOLDER, filename))
73 |
74 | _logger.debug(f'Inputs: {filename}')
75 |
76 | # Step 4: perform prediction
77 | result = make_single_prediction(
78 | image_name=filename,
79 | image_directory=UPLOAD_FOLDER)
80 |
81 | _logger.debug(f'Outputs: {result}')
82 |
83 | readable_predictions = result.get('readable_predictions')
84 | version = result.get('version')
85 |
86 | # Step 5: Return the response as JSON
87 | return jsonify(
88 | {'readable_predictions': readable_predictions[0],
89 | 'version': version})
90 |
--------------------------------------------------------------------------------
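With the Flask app running (via run.py / run.sh), the endpoints can be exercised from the command line; a sketch assuming the default Flask port 5000 (the regression payload is truncated here for brevity — a real request needs the full feature set, e.g. one record of the packaged test set serialised with to_json(orient='records')):

    curl http://localhost:5000/health
    curl http://localhost:5000/version
    curl -X POST -H 'Content-Type: application/json' \
         -d '[{"Id": 1, "MSSubClass": 20, "MSZoning": "RH"}]' \
         http://localhost:5000/v1/predict/regression
    curl -X POST -F 'file=@1.png' http://localhost:5000/predict/classifier
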
/assignment-section-05/classification_model/processing/data_manager.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | from pathlib import Path
4 | from typing import Any, List, Union
5 |
6 | import joblib
7 | import numpy as np
8 | import pandas as pd
9 | from sklearn.pipeline import Pipeline
10 |
11 | from classification_model import __version__ as _version
12 | from classification_model.config.core import DATASET_DIR, TRAINED_MODEL_DIR, config
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 |
17 | # returns Union[str, float] because np.nan is of float type
18 | def get_first_cabin(row: Any) -> Union[str, float]:
19 | try:
20 | return row.split()[0]
21 | except AttributeError:
22 | return np.nan
23 |
24 |
25 | def get_title(passenger: str) -> str:
26 | """Extracts the title (Mr, Ms, etc) from the name variable."""
27 | line = passenger
28 | if re.search("Mrs", line):
29 | return "Mrs"
30 | elif re.search("Mr", line):
31 | return "Mr"
32 | elif re.search("Miss", line):
33 | return "Miss"
34 | elif re.search("Master", line):
35 | return "Master"
36 | else:
37 | return "Other"
38 |
39 |
40 | def pre_pipeline_preparation(*, dataframe: pd.DataFrame) -> pd.DataFrame:
41 | # replace question marks with NaN values
42 | data = dataframe.replace("?", np.nan)
43 |
44 | # retain only the first cabin if more than
45 | # 1 are available per passenger
46 | data["cabin"] = data["cabin"].apply(get_first_cabin)
47 |
48 | data["title"] = data["name"].apply(get_title)
49 |
50 | # cast numerical variables as floats
51 | data["fare"] = data["fare"].astype("float")
52 | data["age"] = data["age"].astype("float")
53 |
54 | # drop unnecessary variables
55 | data.drop(labels=config.model_config.unused_fields, axis=1, inplace=True)
56 |
57 | return data
58 |
59 |
60 | def _load_raw_dataset(*, file_name: str) -> pd.DataFrame:
61 | dataframe = pd.read_csv(Path(f"{DATASET_DIR}/{file_name}"))
62 | return dataframe
63 |
64 |
65 | def load_dataset(*, file_name: str) -> pd.DataFrame:
66 | dataframe = pd.read_csv(Path(f"{DATASET_DIR}/{file_name}"))
67 | transformed = pre_pipeline_preparation(dataframe=dataframe)
68 |
69 | return transformed
70 |
71 |
72 | def save_pipeline(*, pipeline_to_persist: Pipeline) -> None:
73 | """Persist the pipeline.
74 | Saves the versioned model, and overwrites any previous
75 | saved models. This ensures that when the package is
76 | published, there is only one trained model that can be
77 | called, and we know exactly how it was built.
78 | """
79 |
80 | # Prepare versioned save file name
81 | save_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
82 | save_path = TRAINED_MODEL_DIR / save_file_name
83 |
84 | remove_old_pipelines(files_to_keep=[save_file_name])
85 | joblib.dump(pipeline_to_persist, save_path)
86 |
87 |
88 | def load_pipeline(*, file_name: str) -> Pipeline:
89 | """Load a persisted pipeline."""
90 |
91 | file_path = TRAINED_MODEL_DIR / file_name
92 | return joblib.load(filename=file_path)
93 |
94 |
95 | def remove_old_pipelines(*, files_to_keep: List[str]) -> None:
96 | """
97 | Remove old model pipelines.
98 | This is to ensure there is a simple one-to-one
99 | mapping between the package version and the model
100 | version to be imported and used by other applications.
101 | """
102 | do_not_delete = files_to_keep + ["__init__.py"]
103 | for model_file in TRAINED_MODEL_DIR.iterdir():
104 | if model_file.name not in do_not_delete:
105 | model_file.unlink()
106 |
--------------------------------------------------------------------------------
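save_pipeline and load_pipeline give a versioned round-trip keyed to the package version, so only one trained artefact exists at a time. A sketch:

    from classification_model import __version__ as _version
    from classification_model.config.core import config
    from classification_model.processing.data_manager import load_pipeline, save_pipeline

    # after fitting some sklearn Pipeline `pipe`:
    save_pipeline(pipeline_to_persist=pipe)

    # later, e.g. at prediction time:
    pipeline_file = f"{config.app_config.pipeline_save_file}{_version}.pkl"
    pipe = load_pipeline(file_name=pipeline_file)
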
/section-05-production-model-package/regression_model/pipeline.py:
--------------------------------------------------------------------------------
1 | from feature_engine.encoding import OrdinalEncoder, RareLabelEncoder
2 | from feature_engine.imputation import (
3 | AddMissingIndicator,
4 | CategoricalImputer,
5 | MeanMedianImputer,
6 | )
7 | from feature_engine.selection import DropFeatures
8 | from feature_engine.transformation import LogTransformer
9 | from feature_engine.wrappers import SklearnTransformerWrapper
10 | from sklearn.linear_model import Lasso
11 | from sklearn.pipeline import Pipeline
12 | from sklearn.preprocessing import Binarizer, MinMaxScaler
13 |
14 | from regression_model.config.core import config
15 | from regression_model.processing import features as pp
16 |
17 | price_pipe = Pipeline(
18 | [
19 | # ===== IMPUTATION =====
20 | # impute categorical variables with string missing
21 | (
22 | "missing_imputation",
23 | CategoricalImputer(
24 | imputation_method="missing",
25 | variables=config.model_config.categorical_vars_with_na_missing,
26 | ),
27 | ),
28 | (
29 | "frequent_imputation",
30 | CategoricalImputer(
31 | imputation_method="frequent",
32 | variables=config.model_config.categorical_vars_with_na_frequent,
33 | ),
34 | ),
35 | # add missing indicator
36 | (
37 | "missing_indicator",
38 | AddMissingIndicator(variables=config.model_config.numerical_vars_with_na),
39 | ),
40 | # impute numerical variables with the mean
41 | (
42 | "mean_imputation",
43 | MeanMedianImputer(
44 | imputation_method="mean",
45 | variables=config.model_config.numerical_vars_with_na,
46 | ),
47 | ),
48 |         # ===== TEMPORAL VARIABLES =====
49 | (
50 | "elapsed_time",
51 | pp.TemporalVariableTransformer(
52 | variables=config.model_config.temporal_vars,
53 | reference_variable=config.model_config.ref_var,
54 | ),
55 | ),
56 | ("drop_features", DropFeatures(features_to_drop=[config.model_config.ref_var])),
57 |         # ===== VARIABLE TRANSFORMATION =====
58 | ("log", LogTransformer(variables=config.model_config.numericals_log_vars)),
59 | (
60 | "binarizer",
61 | SklearnTransformerWrapper(
62 | transformer=Binarizer(threshold=0),
63 | variables=config.model_config.binarize_vars,
64 | ),
65 | ),
66 | # === mappers ===
67 | (
68 | "mapper_qual",
69 | pp.Mapper(
70 | variables=config.model_config.qual_vars,
71 | mappings=config.model_config.qual_mappings,
72 | ),
73 | ),
74 | (
75 | "mapper_exposure",
76 | pp.Mapper(
77 | variables=config.model_config.exposure_vars,
78 | mappings=config.model_config.exposure_mappings,
79 | ),
80 | ),
81 | (
82 | "mapper_finish",
83 | pp.Mapper(
84 | variables=config.model_config.finish_vars,
85 | mappings=config.model_config.finish_mappings,
86 | ),
87 | ),
88 | (
89 | "mapper_garage",
90 | pp.Mapper(
91 | variables=config.model_config.garage_vars,
92 | mappings=config.model_config.garage_mappings,
93 | ),
94 | ),
95 |         # ===== CATEGORICAL ENCODING =====
96 | (
97 | "rare_label_encoder",
98 | RareLabelEncoder(
99 | tol=0.01, n_categories=1, variables=config.model_config.categorical_vars
100 | ),
101 | ),
102 | # encode categorical variables using the target mean
103 | (
104 | "categorical_encoder",
105 | OrdinalEncoder(
106 | encoding_method="ordered",
107 | variables=config.model_config.categorical_vars,
108 | ),
109 | ),
110 | ("scaler", MinMaxScaler()),
111 | (
112 | "Lasso",
113 | Lasso(
114 | alpha=config.model_config.alpha,
115 | random_state=config.model_config.random_state,
116 | ),
117 | ),
118 | ]
119 | )
120 |
--------------------------------------------------------------------------------
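End to end, the pipeline is fitted on the prepared training frame and then persisted (train_pipeline.py presumably does exactly this in the package). The core calls are just the usual scikit-learn ones; a minimal sketch, assuming X_train / y_train / X_test come from load_dataset and a train/test split:

    price_pipe.fit(X_train, y_train)
    preds = price_pipe.predict(X_test)

Because the final step is a Lasso estimator, the fitted pipeline exposes predict directly; everything upstream is imputation, transformation, and encoding.
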
/section-07-ci-and-publishing/model-package/regression_model/pipeline.py:
--------------------------------------------------------------------------------
1 | from feature_engine.encoding import OrdinalEncoder, RareLabelEncoder
2 | from feature_engine.imputation import (
3 | AddMissingIndicator,
4 | CategoricalImputer,
5 | MeanMedianImputer,
6 | )
7 | from feature_engine.selection import DropFeatures
8 | from feature_engine.transformation import LogTransformer
9 | from feature_engine.wrappers import SklearnTransformerWrapper
10 | from sklearn.linear_model import Lasso
11 | from sklearn.pipeline import Pipeline
12 | from sklearn.preprocessing import Binarizer, MinMaxScaler
13 |
14 | from regression_model.config.core import config
15 | from regression_model.processing import features as pp
16 |
17 | price_pipe = Pipeline(
18 | [
19 | # ===== IMPUTATION =====
20 | # impute categorical variables with string missing
21 | (
22 | "missing_imputation",
23 | CategoricalImputer(
24 | imputation_method="missing",
25 | variables=config.model_config.categorical_vars_with_na_missing,
26 | ),
27 | ),
28 | (
29 | "frequent_imputation",
30 | CategoricalImputer(
31 | imputation_method="frequent",
32 | variables=config.model_config.categorical_vars_with_na_frequent,
33 | ),
34 | ),
35 | # add missing indicator
36 | (
37 | "missing_indicator",
38 | AddMissingIndicator(variables=config.model_config.numerical_vars_with_na),
39 | ),
40 | # impute numerical variables with the mean
41 | (
42 | "mean_imputation",
43 | MeanMedianImputer(
44 | imputation_method="mean",
45 | variables=config.model_config.numerical_vars_with_na,
46 | ),
47 | ),
48 |         # ===== TEMPORAL VARIABLES =====
49 | (
50 | "elapsed_time",
51 | pp.TemporalVariableTransformer(
52 | variables=config.model_config.temporal_vars,
53 | reference_variable=config.model_config.ref_var,
54 | ),
55 | ),
56 | ("drop_features", DropFeatures(features_to_drop=[config.model_config.ref_var])),
57 |         # ===== VARIABLE TRANSFORMATION =====
58 | ("log", LogTransformer(variables=config.model_config.numericals_log_vars)),
59 | (
60 | "binarizer",
61 | SklearnTransformerWrapper(
62 | transformer=Binarizer(threshold=0),
63 | variables=config.model_config.binarize_vars,
64 | ),
65 | ),
66 | # === mappers ===
67 | (
68 | "mapper_qual",
69 | pp.Mapper(
70 | variables=config.model_config.qual_vars,
71 | mappings=config.model_config.qual_mappings,
72 | ),
73 | ),
74 | (
75 | "mapper_exposure",
76 | pp.Mapper(
77 | variables=config.model_config.exposure_vars,
78 | mappings=config.model_config.exposure_mappings,
79 | ),
80 | ),
81 | (
82 | "mapper_finish",
83 | pp.Mapper(
84 | variables=config.model_config.finish_vars,
85 | mappings=config.model_config.finish_mappings,
86 | ),
87 | ),
88 | (
89 | "mapper_garage",
90 | pp.Mapper(
91 | variables=config.model_config.garage_vars,
92 | mappings=config.model_config.garage_mappings,
93 | ),
94 | ),
95 |         # ===== CATEGORICAL ENCODING =====
96 | (
97 | "rare_label_encoder",
98 | RareLabelEncoder(
99 | tol=0.01, n_categories=1, variables=config.model_config.categorical_vars
100 | ),
101 | ),
102 | # encode categorical variables using the target mean
103 | (
104 | "categorical_encoder",
105 | OrdinalEncoder(
106 | encoding_method="ordered",
107 | variables=config.model_config.categorical_vars,
108 | ),
109 | ),
110 | ("scaler", MinMaxScaler()),
111 | (
112 | "Lasso",
113 | Lasso(
114 | alpha=config.model_config.alpha,
115 | random_state=config.model_config.random_state,
116 | ),
117 | ),
118 | ]
119 | )
120 |
--------------------------------------------------------------------------------
/section-06-model-serving-api/house-prices-api/app/schemas/predict.py:
--------------------------------------------------------------------------------
1 | from typing import Any, List, Optional
2 |
3 | from pydantic import BaseModel
4 | from regression_model.processing.validation import HouseDataInputSchema
5 |
6 |
7 | class PredictionResults(BaseModel):
8 | errors: Optional[Any]
9 | version: str
10 | predictions: Optional[List[float]]
11 |
12 |
13 | class MultipleHouseDataInputs(BaseModel):
14 | inputs: List[HouseDataInputSchema]
15 |
16 | class Config:
17 | schema_extra = {
18 | "example": {
19 | "inputs": [
20 | {
21 | "MSSubClass": 20,
22 | "MSZoning": "RH",
23 | "LotFrontage": 80.0,
24 | "LotArea": 11622,
25 | "Street": "Pave",
26 | "Alley": None,
27 | "LotShape": "Reg",
28 | "LandContour": "Lvl",
29 | "Utilities": "AllPub",
30 | "LotConfig": "Inside",
31 | "LandSlope": "Gtl",
32 | "Neighborhood": "NAmes",
33 | "Condition1": "Feedr",
34 | "Condition2": "Norm",
35 | "BldgType": "1Fam",
36 | "HouseStyle": "1Story",
37 | "OverallQual": 5,
38 | "OverallCond": 6,
39 | "YearBuilt": 1961,
40 | "YearRemodAdd": 1961,
41 | "RoofStyle": "Gable",
42 | "RoofMatl": "CompShg",
43 | "Exterior1st": "VinylSd",
44 | "Exterior2nd": "VinylSd",
45 | "MasVnrType": "None",
46 | "MasVnrArea": 0.0,
47 | "ExterQual": "TA",
48 | "ExterCond": "TA",
49 | "Foundation": "CBlock",
50 | "BsmtQual": "TA",
51 | "BsmtCond": "TA",
52 | "BsmtExposure": "No",
53 | "BsmtFinType1": "Rec",
54 | "BsmtFinSF1": 468.0,
55 | "BsmtFinType2": "LwQ",
56 | "BsmtFinSF2": 144.0,
57 | "BsmtUnfSF": 270.0,
58 | "TotalBsmtSF": 882.0,
59 | "Heating": "GasA",
60 | "HeatingQC": "TA",
61 | "CentralAir": "Y",
62 | "Electrical": "SBrkr",
63 | "FirstFlrSF": 896,
64 | "SecondFlrSF": 0,
65 | "LowQualFinSF": 0,
66 | "GrLivArea": 896,
67 | "BsmtFullBath": 0.0,
68 | "BsmtHalfBath": 0.0,
69 | "FullBath": 1,
70 | "HalfBath": 0,
71 | "BedroomAbvGr": 2,
72 | "KitchenAbvGr": 1,
73 | "KitchenQual": "TA",
74 | "TotRmsAbvGrd": 5,
75 | "Functional": "Typ",
76 | "Fireplaces": 0,
77 | "FireplaceQu": None,
78 | "GarageType": "Attchd",
79 | "GarageYrBlt": 1961.0,
80 | "GarageFinish": "Unf",
81 | "GarageCars": 1.0,
82 | "GarageArea": 730.0,
83 | "GarageQual": "TA",
84 | "GarageCond": "TA",
85 | "PavedDrive": "Y",
86 | "WoodDeckSF": 140,
87 | "OpenPorchSF": 0,
88 | "EnclosedPorch": 0,
89 | "ThreeSsnPortch": 0,
90 | "ScreenPorch": 120,
91 | "PoolArea": 0,
92 | "PoolQC": None,
93 | "Fence": "MnPrv",
94 | "MiscFeature": None,
95 | "MiscVal": 0,
96 | "MoSold": 6,
97 | "YrSold": 2010,
98 | "SaleType": "WD",
99 | "SaleCondition": "Normal",
100 | }
101 | ]
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
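pydantic's Config.schema_extra feeds the generated OpenAPI document, so the example payload above appears pre-filled in the interactive /docs page. A hypothetical FastAPI endpoint consuming these schemas (a sketch, not the app's actual route):

    from fastapi import APIRouter

    router = APIRouter()

    @router.post("/predict", response_model=PredictionResults)
    async def predict(input_data: MultipleHouseDataInputs) -> PredictionResults:
        ...  # call the regression_model package and map its output onto PredictionResults
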
/section-07-ci-and-publishing/house-prices-api/app/schemas/predict.py:
--------------------------------------------------------------------------------
1 | from typing import Any, List, Optional
2 |
3 | from pydantic import BaseModel
4 | from regression_model.processing.validation import HouseDataInputSchema
5 |
6 |
7 | class PredictionResults(BaseModel):
8 | errors: Optional[Any]
9 | version: str
10 | predictions: Optional[List[float]]
11 |
12 |
13 | class MultipleHouseDataInputs(BaseModel):
14 | inputs: List[HouseDataInputSchema]
15 |
16 | class Config:
17 | schema_extra = {
18 | "example": {
19 | "inputs": [
20 | {
21 | "MSSubClass": 20,
22 | "MSZoning": "RH",
23 | "LotFrontage": 80.0,
24 | "LotArea": 11622,
25 | "Street": "Pave",
26 | "Alley": None,
27 | "LotShape": "Reg",
28 | "LandContour": "Lvl",
29 | "Utilities": "AllPub",
30 | "LotConfig": "Inside",
31 | "LandSlope": "Gtl",
32 | "Neighborhood": "NAmes",
33 | "Condition1": "Feedr",
34 | "Condition2": "Norm",
35 | "BldgType": "1Fam",
36 | "HouseStyle": "1Story",
37 | "OverallQual": 5,
38 | "OverallCond": 6,
39 | "YearBuilt": 1961,
40 | "YearRemodAdd": 1961,
41 | "RoofStyle": "Gable",
42 | "RoofMatl": "CompShg",
43 | "Exterior1st": "VinylSd",
44 | "Exterior2nd": "VinylSd",
45 | "MasVnrType": "None",
46 | "MasVnrArea": 0.0,
47 | "ExterQual": "TA",
48 | "ExterCond": "TA",
49 | "Foundation": "CBlock",
50 | "BsmtQual": "TA",
51 | "BsmtCond": "TA",
52 | "BsmtExposure": "No",
53 | "BsmtFinType1": "Rec",
54 | "BsmtFinSF1": 468.0,
55 | "BsmtFinType2": "LwQ",
56 | "BsmtFinSF2": 144.0,
57 | "BsmtUnfSF": 270.0,
58 | "TotalBsmtSF": 882.0,
59 | "Heating": "GasA",
60 | "HeatingQC": "TA",
61 | "CentralAir": "Y",
62 | "Electrical": "SBrkr",
63 | "FirstFlrSF": 896,
64 | "SecondFlrSF": 0,
65 | "LowQualFinSF": 0,
66 | "GrLivArea": 896,
67 | "BsmtFullBath": 0.0,
68 | "BsmtHalfBath": 0.0,
69 | "FullBath": 1,
70 | "HalfBath": 0,
71 | "BedroomAbvGr": 2,
72 | "KitchenAbvGr": 1,
73 | "KitchenQual": "TA",
74 | "TotRmsAbvGrd": 5,
75 | "Functional": "Typ",
76 | "Fireplaces": 0,
77 | "FireplaceQu": None,
78 | "GarageType": "Attchd",
79 | "GarageYrBlt": 1961.0,
80 | "GarageFinish": "Unf",
81 | "GarageCars": 1.0,
82 | "GarageArea": 730.0,
83 | "GarageQual": "TA",
84 | "GarageCond": "TA",
85 | "PavedDrive": "Y",
86 | "WoodDeckSF": 140,
87 | "OpenPorchSF": 0,
88 | "EnclosedPorch": 0,
89 | "ThreeSsnPortch": 0,
90 | "ScreenPorch": 120,
91 | "PoolArea": 0,
92 | "PoolQC": None,
93 | "Fence": "MnPrv",
94 | "MiscFeature": None,
95 | "MiscVal": 0,
96 | "MoSold": 6,
97 | "YrSold": 2010,
98 | "SaleType": "WD",
99 | "SaleCondition": "Normal",
100 | }
101 | ]
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/section-08-deploying-with-containers/house-prices-api/app/schemas/predict.py:
--------------------------------------------------------------------------------
1 | from typing import Any, List, Optional
2 |
3 | from pydantic import BaseModel
4 | from regression_model.processing.validation import HouseDataInputSchema
5 |
6 |
7 | class PredictionResults(BaseModel):
8 | errors: Optional[Any]
9 | version: str
10 | predictions: Optional[List[float]]
11 |
12 |
13 | class MultipleHouseDataInputs(BaseModel):
14 | inputs: List[HouseDataInputSchema]
15 |
16 | class Config:
17 | schema_extra = {
18 | "example": {
19 | "inputs": [
20 | {
21 | "MSSubClass": 20,
22 | "MSZoning": "RH",
23 | "LotFrontage": 80.0,
24 | "LotArea": 11622,
25 | "Street": "Pave",
26 | "Alley": None,
27 | "LotShape": "Reg",
28 | "LandContour": "Lvl",
29 | "Utilities": "AllPub",
30 | "LotConfig": "Inside",
31 | "LandSlope": "Gtl",
32 | "Neighborhood": "NAmes",
33 | "Condition1": "Feedr",
34 | "Condition2": "Norm",
35 | "BldgType": "1Fam",
36 | "HouseStyle": "1Story",
37 | "OverallQual": 5,
38 | "OverallCond": 6,
39 | "YearBuilt": 1961,
40 | "YearRemodAdd": 1961,
41 | "RoofStyle": "Gable",
42 | "RoofMatl": "CompShg",
43 | "Exterior1st": "VinylSd",
44 | "Exterior2nd": "VinylSd",
45 | "MasVnrType": "None",
46 | "MasVnrArea": 0.0,
47 | "ExterQual": "TA",
48 | "ExterCond": "TA",
49 | "Foundation": "CBlock",
50 | "BsmtQual": "TA",
51 | "BsmtCond": "TA",
52 | "BsmtExposure": "No",
53 | "BsmtFinType1": "Rec",
54 | "BsmtFinSF1": 468.0,
55 | "BsmtFinType2": "LwQ",
56 | "BsmtFinSF2": 144.0,
57 | "BsmtUnfSF": 270.0,
58 | "TotalBsmtSF": 882.0,
59 | "Heating": "GasA",
60 | "HeatingQC": "TA",
61 | "CentralAir": "Y",
62 | "Electrical": "SBrkr",
63 | "FirstFlrSF": 896,
64 | "SecondFlrSF": 0,
65 | "LowQualFinSF": 0,
66 | "GrLivArea": 896,
67 | "BsmtFullBath": 0.0,
68 | "BsmtHalfBath": 0.0,
69 | "FullBath": 1,
70 | "HalfBath": 0,
71 | "BedroomAbvGr": 2,
72 | "KitchenAbvGr": 1,
73 | "KitchenQual": "TA",
74 | "TotRmsAbvGrd": 5,
75 | "Functional": "Typ",
76 | "Fireplaces": 0,
77 | "FireplaceQu": None,
78 | "GarageType": "Attchd",
79 | "GarageYrBlt": 1961.0,
80 | "GarageFinish": "Unf",
81 | "GarageCars": 1.0,
82 | "GarageArea": 730.0,
83 | "GarageQual": "TA",
84 | "GarageCond": "TA",
85 | "PavedDrive": "Y",
86 | "WoodDeckSF": 140,
87 | "OpenPorchSF": 0,
88 | "EnclosedPorch": 0,
89 | "ThreeSsnPortch": 0,
90 | "ScreenPorch": 120,
91 | "PoolArea": 0,
92 | "PoolQC": None,
93 | "Fence": "MnPrv",
94 | "MiscFeature": None,
95 | "MiscVal": 0,
96 | "MoSold": 6,
97 | "YrSold": 2010,
98 | "SaleType": "WD",
99 | "SaleCondition": "Normal",
100 | }
101 | ]
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/section-05-production-model-package/regression_model/processing/validation.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Tuple
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from pydantic import BaseModel, ValidationError
6 |
7 | from regression_model.config.core import config
8 |
9 |
10 | def drop_na_inputs(*, input_data: pd.DataFrame) -> pd.DataFrame:
11 |     """Drop rows with NA values in variables the pipeline cannot impute."""
12 | validated_data = input_data.copy()
13 | new_vars_with_na = [
14 | var
15 | for var in config.model_config.features
16 | if var
17 | not in config.model_config.categorical_vars_with_na_frequent
18 | + config.model_config.categorical_vars_with_na_missing
19 | + config.model_config.numerical_vars_with_na
20 | and validated_data[var].isnull().sum() > 0
21 | ]
22 | validated_data.dropna(subset=new_vars_with_na, inplace=True)
23 |
24 | return validated_data
25 |
26 |
27 | def validate_inputs(*, input_data: pd.DataFrame) -> Tuple[pd.DataFrame, Optional[dict]]:
28 | """Check model inputs for unprocessable values."""
29 |
30 | # convert syntax error field names (beginning with numbers)
31 | input_data.rename(columns=config.model_config.variables_to_rename, inplace=True)
32 | input_data["MSSubClass"] = input_data["MSSubClass"].astype("O")
33 | relevant_data = input_data[config.model_config.features].copy()
34 | validated_data = drop_na_inputs(input_data=relevant_data)
35 | errors = None
36 |
37 | try:
38 | # replace numpy nans so that pydantic can validate
39 | MultipleHouseDataInputs(
40 | inputs=validated_data.replace({np.nan: None}).to_dict(orient="records")
41 | )
42 | except ValidationError as error:
43 | errors = error.json()
44 |
45 | return validated_data, errors
46 |
47 |
48 | class HouseDataInputSchema(BaseModel):
49 | Alley: Optional[str]
50 | BedroomAbvGr: Optional[int]
51 | BldgType: Optional[str]
52 | BsmtCond: Optional[str]
53 | BsmtExposure: Optional[str]
54 | BsmtFinSF1: Optional[float]
55 | BsmtFinSF2: Optional[float]
56 | BsmtFinType1: Optional[str]
57 | BsmtFinType2: Optional[str]
58 | BsmtFullBath: Optional[float]
59 | BsmtHalfBath: Optional[float]
60 | BsmtQual: Optional[str]
61 | BsmtUnfSF: Optional[float]
62 | CentralAir: Optional[str]
63 | Condition1: Optional[str]
64 | Condition2: Optional[str]
65 | Electrical: Optional[str]
66 | EnclosedPorch: Optional[int]
67 | ExterCond: Optional[str]
68 | ExterQual: Optional[str]
69 | Exterior1st: Optional[str]
70 | Exterior2nd: Optional[str]
71 | Fence: Optional[str]
72 | FireplaceQu: Optional[str]
73 | Fireplaces: Optional[int]
74 | Foundation: Optional[str]
75 | FullBath: Optional[int]
76 | Functional: Optional[str]
77 | GarageArea: Optional[float]
78 | GarageCars: Optional[float]
79 | GarageCond: Optional[str]
80 | GarageFinish: Optional[str]
81 | GarageQual: Optional[str]
82 | GarageType: Optional[str]
83 | GarageYrBlt: Optional[float]
84 | GrLivArea: Optional[int]
85 | HalfBath: Optional[int]
86 | Heating: Optional[str]
87 | HeatingQC: Optional[str]
88 | HouseStyle: Optional[str]
89 | Id: Optional[int]
90 | KitchenAbvGr: Optional[int]
91 | KitchenQual: Optional[str]
92 | LandContour: Optional[str]
93 | LandSlope: Optional[str]
94 | LotArea: Optional[int]
95 | LotConfig: Optional[str]
96 | LotFrontage: Optional[float]
97 | LotShape: Optional[str]
98 | LowQualFinSF: Optional[int]
99 | MSSubClass: Optional[int]
100 | MSZoning: Optional[str]
101 | MasVnrArea: Optional[float]
102 | MasVnrType: Optional[str]
103 | MiscFeature: Optional[str]
104 | MiscVal: Optional[int]
105 | MoSold: Optional[int]
106 | Neighborhood: Optional[str]
107 | OpenPorchSF: Optional[int]
108 | OverallCond: Optional[int]
109 | OverallQual: Optional[int]
110 | PavedDrive: Optional[str]
111 | PoolArea: Optional[int]
112 | PoolQC: Optional[str]
113 | RoofMatl: Optional[str]
114 | RoofStyle: Optional[str]
115 | SaleCondition: Optional[str]
116 | SaleType: Optional[str]
117 | ScreenPorch: Optional[int]
118 | Street: Optional[str]
119 | TotRmsAbvGrd: Optional[int]
120 | TotalBsmtSF: Optional[float]
121 | Utilities: Optional[str]
122 | WoodDeckSF: Optional[int]
123 | YearBuilt: Optional[int]
124 | YearRemodAdd: Optional[int]
125 | YrSold: Optional[int]
126 | FirstFlrSF: Optional[int] # renamed
127 | SecondFlrSF: Optional[int] # renamed
128 | ThreeSsnPortch: Optional[int] # renamed
129 |
130 |
131 | class MultipleHouseDataInputs(BaseModel):
132 | inputs: List[HouseDataInputSchema]
133 |
--------------------------------------------------------------------------------
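validate_inputs is the single entry point that prediction code needs before calling the pipeline. A usage sketch on a hypothetical raw frame:

    import pandas as pd

    df = pd.read_csv("test.csv")   # hypothetical raw inputs, e.g. the packaged test set
    validated, errors = validate_inputs(input_data=df)
    if errors:
        print(errors)              # pydantic error report as a JSON string
    else:
        ...                        # safe to pass `validated` to the fitted pipeline
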