├── .circleci └── config.yml ├── .dockerignore ├── .gitignore ├── LICENSE ├── README.md ├── exercise_notebooks ├── .gitkeep ├── docker_exercise │ ├── Dockerfile │ ├── app.py │ ├── docker-compose.yml │ └── requirements.txt ├── elk_exercise │ ├── Dockerfile │ ├── app │ │ ├── __init__.py │ │ └── flask_app.py │ ├── application.py │ ├── docker-compose.yml │ ├── elasticsearch │ │ └── config │ │ │ └── elasticsearch.yml │ ├── gunicorn_logging.conf │ ├── kibana │ │ └── config │ │ │ └── kibana.yml │ ├── logstash │ │ ├── config │ │ │ └── logstash.yml │ │ └── pipeline │ │ │ └── logstash.conf │ └── requirements.txt ├── prometheus_exercise │ ├── Dockerfile │ ├── app │ │ ├── __init__.py │ │ ├── flask_app.py │ │ └── helpers │ │ │ ├── __init__.py │ │ │ └── middleware.py │ ├── application.py │ ├── config │ │ ├── grafana │ │ │ ├── basic_cadvisor_dashboard.json │ │ │ └── grafana_flask_basic_dashboard.json │ │ └── prometheus │ │ │ └── prometheus.yml │ ├── docker-compose.yml │ └── requirements.txt ├── shadow_mode_exercise │ ├── assessing_model_results.ipynb │ └── requirements.txt ├── unit_testing_exercise │ ├── requirements.txt │ ├── unit_testing_data_engineering.ipynb │ ├── unit_testing_input_data.ipynb │ ├── unit_testing_model_configuration.ipynb │ └── unit_testing_model_predictions_quality.ipynb └── utility_scripts │ └── MapPortsForDocker.cmd ├── packages ├── gradient_boosting_model │ ├── MANIFEST.in │ ├── gradient_boosting_model │ │ ├── VERSION │ │ ├── __init__.py │ │ ├── config.yml │ │ ├── config │ │ │ ├── __init__.py │ │ │ └── core.py │ │ ├── datasets │ │ │ ├── .gitkeep │ │ │ └── __init__.py │ │ ├── pipeline.py │ │ ├── predict.py │ │ ├── processing │ │ │ ├── __init__.py │ │ │ ├── data_management.py │ │ │ ├── errors.py │ │ │ ├── preprocessors.py │ │ │ └── validation.py │ │ ├── train_pipeline.py │ │ └── trained_models │ │ │ └── __init__.py │ ├── mypy.ini │ ├── requirements.txt │ ├── setup.py │ ├── test_requirements.txt │ ├── tests │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── 
test_config.py │ │ ├── test_pipeline.py │ │ ├── test_predict.py │ │ ├── test_preprocessors.py │ │ └── test_validation.py │ └── tox.ini └── ml_api │ ├── .dockerignore │ ├── Makefile │ ├── __init__.py │ ├── alembic.ini │ ├── alembic │ ├── env.py │ ├── script.py.mako │ └── versions │ │ └── cf4abb13368d_create_prediction_tables.py │ ├── api │ ├── __init__.py │ ├── app.py │ ├── config.py │ ├── controller.py │ ├── monitoring │ │ ├── __init__.py │ │ └── middleware.py │ ├── persistence │ │ ├── __init__.py │ │ ├── core.py │ │ ├── data_access.py │ │ └── models.py │ └── spec │ │ ├── __init__.py │ │ └── api.yaml │ ├── differential_tests │ ├── __init__.py │ ├── __main__.py │ ├── compare.py │ └── sample_payloads │ │ └── sample_input1.json │ ├── docker │ ├── Dockerfile │ ├── Dockerfile.test │ ├── config │ │ ├── grafana │ │ │ ├── basic_cadvisor_dashboard_ml_api.json │ │ │ ├── grafana_flask_basic_dashboard_ml_api.json │ │ │ └── ml_api_dashboard.json │ │ └── prometheus │ │ │ └── prometheus.yml │ ├── docker-compose-ci-candidate.yml │ ├── docker-compose-ci-master.yml │ ├── docker-compose-elk.yml │ ├── docker-compose.test.yml │ ├── docker-compose.yml │ ├── elasticsearch │ │ └── config │ │ │ └── elasticsearch.yml │ ├── kibana │ │ └── config │ │ │ ├── kibana.yml │ │ │ └── kibana_example_inputs_dashboard.ndjson │ ├── logstash │ │ ├── config │ │ │ └── logstash.yml │ │ └── pipeline │ │ │ └── logstash.conf │ └── workaround_32_os │ │ ├── Dockerfile.workaround │ │ └── docker-compose-workaround.yml │ ├── gunicorn_logging.conf │ ├── mypy.ini │ ├── requirements │ ├── requirements.txt │ └── test_requirements.txt │ ├── run.py │ ├── scripts │ ├── differential_tests.sh │ └── populate_database.py │ ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_api.py │ ├── test_back_to_back_models.py │ └── test_persistence.py │ └── tox.ini └── research_phase ├── gradient_boosting_model.ipynb └── requirements.txt /.circleci/config.yml: 
-------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | jobs: 4 | test_gradient_model_py36: 5 | docker: 6 | - image: circleci/python:3.6.9 7 | working_directory: ~/project/packages/gradient_boosting_model 8 | steps: 9 | - checkout: 10 | path: ~/project 11 | - run: 12 | name: Run tests with Python 3.6 13 | command: | 14 | sudo pip install --upgrade pip 15 | pip install --user tox 16 | tox -e py36 17 | test_gradient_model_py37: 18 | docker: 19 | - image: circleci/python:3.7.6 20 | working_directory: ~/project/packages/gradient_boosting_model 21 | steps: 22 | - checkout: 23 | path: ~/project 24 | - run: 25 | name: Run tests with Python 3.7 26 | command: | 27 | sudo pip install --upgrade pip 28 | pip install --user tox 29 | tox -e py37 30 | test_gradient_model_py38: 31 | docker: 32 | - image: circleci/python:3.8.0 33 | working_directory: ~/project/packages/gradient_boosting_model 34 | steps: 35 | - checkout: 36 | path: ~/project 37 | - run: 38 | name: Run tests with Python 3.8 39 | command: | 40 | sudo pip install --upgrade pip 41 | pip install --user tox 42 | tox -e py38 43 | test_ml_api_py36: 44 | docker: 45 | - image: circleci/python:3.6.9 46 | - image: postgres 47 | environment: 48 | POSTGRES_USER: test_user 49 | POSTGRES_PASSWORD: password 50 | POSTGRES_DB: ml_api_test 51 | environment: 52 | DB_HOST: localhost 53 | DB_PORT: 5432 54 | DB_USER: test_user 55 | DB_PASSWORD: password 56 | DB_NAME: ml_api_test 57 | SHADOW_MODE_ACTIVE: true 58 | working_directory: ~/project/packages/ml_api 59 | steps: 60 | - checkout: 61 | path: ~/project 62 | - run: 63 | name: Run API tests with Python 3.6 64 | command: | 65 | sudo pip install --upgrade pip 66 | pip install --user tox 67 | tox -e py36 68 | test_ml_api_py37: 69 | docker: 70 | - image: circleci/python:3.7.6 71 | - image: postgres 72 | environment: 73 | POSTGRES_USER: test_user 74 | POSTGRES_PASSWORD: password 75 | POSTGRES_DB: ml_api_test 76 | environment: 77 | DB_HOST: 
localhost 78 | DB_PORT: 5432 79 | DB_USER: test_user 80 | DB_PASSWORD: password 81 | DB_NAME: ml_api_test 82 | SHADOW_MODE_ACTIVE: true 83 | working_directory: ~/project/packages/ml_api 84 | steps: 85 | - checkout: 86 | path: ~/project 87 | - run: 88 | name: Run API tests with Python 3.7 89 | command: | 90 | sudo pip install --upgrade pip 91 | pip install --user tox 92 | tox -e py37 93 | test_ml_api_py38: 94 | docker: 95 | - image: circleci/python:3.8.1 96 | - image: postgres 97 | environment: 98 | POSTGRES_USER: test_user 99 | POSTGRES_PASSWORD: password 100 | POSTGRES_DB: ml_api_test 101 | environment: 102 | DB_HOST: localhost 103 | DB_PORT: 5432 104 | DB_USER: test_user 105 | DB_PASSWORD: password 106 | DB_NAME: ml_api_test 107 | SHADOW_MODE_ACTIVE: true 108 | working_directory: ~/project/packages/ml_api 109 | steps: 110 | - checkout: 111 | path: ~/project 112 | - run: 113 | name: Run API tests with Python 3.8 114 | command: | 115 | sudo pip install --upgrade pip 116 | pip install --user tox 117 | tox -e py38 118 | workflows: 119 | version: 2 120 | test-all: 121 | jobs: 122 | - test_gradient_model_py36 123 | - test_gradient_model_py37 124 | - test_gradient_model_py38 125 | - test_ml_api_py36 126 | - test_ml_api_py37 127 | - test_ml_api_py38 128 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | exercise_notebooks/* 2 | */env* 3 | */venv* 4 | .circleci* 5 | packages/gradient_boosting_model 6 | *.env 7 | *.log 8 | .git 9 | .gitignore 10 | .dockerignore 11 | *.mypy_cache 12 | *.pytest_cache 13 | 14 | ### Python ### 15 | 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 
| *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | .tox/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | # pycharm 108 | .idea/ 109 | 110 | # OSX 111 | .DS_Store 112 | 113 | # all logs 114 | logs/ 115 | 116 | # training data 117 | packages/gradient_boosting_model/gradient_boosting_model/datasets/*.csv 118 | packages/gradient_boosting_model/gradient_boosting_model/datasets/*.txt 119 | 
packages/gradient_boosting_model/gradient_boosting_model/datasets/*.zip 120 | houseprice.csv 121 | train.csv 122 | test.csv 123 | !/packages/gradient_boosting_model/gradient_boosting_model/datasets/.gitkeep 124 | 125 | # trained models 126 | packages/gradient_boosting_model/gradient_boosting_model/trained_models/*.pkl 127 | *.h5 128 | 129 | # differential test artifacts 130 | packages/ml_api/differential_tests/expected_results/ 131 | packages/ml_api/differential_tests/actual_results/ 132 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Train In Data 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Example project for the course "Testing & Monitoring Machine Learning Model Deployments". For setup instructions, see the course lectures. 2 | -------------------------------------------------------------------------------- /exercise_notebooks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/testing-and-monitoring-ml-deployments/88d0fdac0898178f5ed9611e44a56ea5e53c9dd5/exercise_notebooks/.gitkeep -------------------------------------------------------------------------------- /exercise_notebooks/docker_exercise/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-alpine 2 | WORKDIR /code 3 | 4 | # Set env vars required by Flask 5 | ENV FLASK_APP app.py 6 | ENV FLASK_RUN_HOST 0.0.0.0 7 | 8 | # Install gcc so Python packages such as MarkupSafe 9 | # and SQLAlchemy can compile speedups. 10 | RUN apk add --no-cache gcc musl-dev linux-headers 11 | 12 | # copy local requirements.txt into container 13 | # doing this separately from the main copy 14 | # operation makes more efficient use of docker 15 | # layer caching. 
16 | COPY requirements.txt requirements.txt 17 | 18 | # install requirements inside the container 19 | RUN pip install -r requirements.txt 20 | 21 | # Copy the current directory . in the project 22 | # to the workdir . in the image 23 | COPY . . 24 | 25 | # Set the default command for the container to flask run 26 | CMD ["flask", "run"] 27 | -------------------------------------------------------------------------------- /exercise_notebooks/docker_exercise/app.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import redis 4 | from flask import Flask 5 | 6 | app = Flask(__name__) 7 | cache = redis.Redis(host='redis', port=6379) 8 | 9 | 10 | def get_hit_count(): 11 | retries = 5 12 | while True: 13 | try: 14 | return cache.incr('hits') 15 | except redis.exceptions.ConnectionError as exc: 16 | if retries == 0: 17 | raise exc 18 | retries -= 1 19 | time.sleep(0.5) 20 | 21 | 22 | @app.route('/') 23 | def hello(): 24 | count = get_hit_count() 25 | return f'Hello World! I have been seen {count} times.\n' 26 | -------------------------------------------------------------------------------- /exercise_notebooks/docker_exercise/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | web: 4 | build: . 
5 | ports: 6 | - "5000:5000" 7 | redis: 8 | image: "redis:alpine" 9 | -------------------------------------------------------------------------------- /exercise_notebooks/docker_exercise/requirements.txt: -------------------------------------------------------------------------------- 1 | flask>=1.1.1,<1.2.0 2 | markupsafe==2.0.1 # https://github.com/aws/aws-sam-cli/issues/3661 3 | redis>=3.3.11,<3.4 4 | -------------------------------------------------------------------------------- /exercise_notebooks/elk_exercise/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-alpine 2 | WORKDIR /application 3 | 4 | COPY ./requirements.txt requirements.txt 5 | RUN apk add --no-cache \ 6 | gcc \ 7 | libc-dev \ 8 | linux-headers \ 9 | bash; \ 10 | pip install -r requirements.txt; 11 | 12 | COPY . /application 13 | 14 | 15 | EXPOSE 5000 16 | VOLUME /application 17 | CMD gunicorn --bind 0.0.0.0:5000 \ 18 | --workers=1 \ 19 | --log-config gunicorn_logging.conf \ 20 | --log-level=DEBUG \ 21 | --access-logfile=- \ 22 | --error-logfile=- \ 23 | application:application 24 | -------------------------------------------------------------------------------- /exercise_notebooks/elk_exercise/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/testing-and-monitoring-ml-deployments/88d0fdac0898178f5ed9611e44a56ea5e53c9dd5/exercise_notebooks/elk_exercise/app/__init__.py -------------------------------------------------------------------------------- /exercise_notebooks/elk_exercise/app/flask_app.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from flask import Flask, current_app 4 | 5 | 6 | def index(): 7 | current_app.logger.info('home') 8 | return 'home' 9 | 10 | 11 | def create_app(): 12 | main_app = Flask(__name__) 13 | main_app.add_url_rule('/', 'index', index) 14 | 
gunicorn_error_logger = logging.getLogger('gunicorn.error') 15 | main_app.logger.addHandler(gunicorn_error_logger) 16 | main_app.logger.setLevel(logging.DEBUG) 17 | 18 | return main_app 19 | -------------------------------------------------------------------------------- /exercise_notebooks/elk_exercise/application.py: -------------------------------------------------------------------------------- 1 | from app.flask_app import create_app 2 | 3 | 4 | application = create_app() 5 | 6 | if __name__ == '__main__': 7 | application.run() 8 | -------------------------------------------------------------------------------- /exercise_notebooks/elk_exercise/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.2' 2 | 3 | services: 4 | # The environment variable "ELK_VERSION" is used throughout this file to 5 | # specify the version of the images to run. The default is set in the 6 | # '.env' file in this folder. It can be overridden with any normal 7 | # technique for setting environment variables, for example: 8 | # 9 | # ELK_VERSION=7.0.0-beta1 docker-compose up 10 | # 11 | # REF: https://docs.docker.com/compose/compose-file/#variable-substitution 12 | webapp: 13 | build: . 
14 | container_name: webapp 15 | expose: 16 | - 5000 17 | ports: 18 | - 5000:5000 19 | links: 20 | - logstash 21 | networks: 22 | - elk 23 | depends_on: 24 | - logstash 25 | - kibana 26 | - elasticsearch 27 | volumes: 28 | - ./:/application 29 | elasticsearch: 30 | image: docker.elastic.co/elasticsearch/elasticsearch:${ELK_VERSION} 31 | volumes: 32 | - type: bind 33 | source: ./elasticsearch/config/elasticsearch.yml 34 | target: /usr/share/elasticsearch/config/elasticsearch.yml 35 | read_only: true 36 | - type: volume 37 | source: elasticsearch 38 | target: /usr/share/elasticsearch/data 39 | ports: 40 | - "9200:9200" 41 | - "9300:9300" 42 | environment: 43 | ES_JAVA_OPTS: "-Xmx256m -Xms256m" 44 | ELASTIC_PASSWORD: changeme 45 | # Use single node discovery in order to disable production mode and avoid bootstrap checks 46 | # see https://www.elastic.co/guide/en/elasticsearch/reference/current/bootstrap-checks.html 47 | discovery.type: single-node 48 | networks: 49 | - elk 50 | 51 | logstash: 52 | image: docker.elastic.co/logstash/logstash:${ELK_VERSION} 53 | volumes: 54 | - type: bind 55 | source: ./logstash/config/logstash.yml 56 | target: /usr/share/logstash/config/logstash.yml 57 | read_only: true 58 | - type: bind 59 | source: ./logstash/pipeline 60 | target: /usr/share/logstash/pipeline 61 | read_only: true 62 | ports: 63 | - "5001:5001" 64 | - "9600:9600" 65 | environment: 66 | LS_JAVA_OPTS: "-Xmx256m -Xms256m" 67 | networks: 68 | - elk 69 | depends_on: 70 | - elasticsearch 71 | 72 | kibana: 73 | image: docker.elastic.co/kibana/kibana:${ELK_VERSION} 74 | volumes: 75 | - type: bind 76 | source: ./kibana/config/kibana.yml 77 | target: /usr/share/kibana/config/kibana.yml 78 | read_only: true 79 | ports: 80 | - "5601:5601" 81 | networks: 82 | - elk 83 | depends_on: 84 | - elasticsearch 85 | 86 | networks: 87 | elk: 88 | driver: bridge 89 | 90 | volumes: 91 | elasticsearch: -------------------------------------------------------------------------------- 
/exercise_notebooks/elk_exercise/elasticsearch/config/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | --- 2 | ## Default Elasticsearch configuration from Elasticsearch base image. 3 | ## https://github.com/elastic/elasticsearch/blob/master/distribution/docker/src/docker/config/elasticsearch.yml 4 | cluster.name: "docker-cluster" 5 | network.host: 0.0.0.0 6 | 7 | ## X-Pack settings 8 | ## see https://www.elastic.co/guide/en/elasticsearch/reference/current/setup-xpack.html 9 | xpack.license.self_generated.type: basic 10 | xpack.security.enabled: true 11 | xpack.monitoring.collection.enabled: true 12 | -------------------------------------------------------------------------------- /exercise_notebooks/elk_exercise/gunicorn_logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root, logstash.error, logstash.access 3 | 4 | [handlers] 5 | keys=console, logstash 6 | 7 | [formatters] 8 | keys=generic, access, json 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=console 13 | 14 | [logger_logstash.error] 15 | level=INFO 16 | handlers=logstash 17 | propagate=1 18 | qualname=gunicorn.error 19 | 20 | [logger_logstash.access] 21 | level=INFO 22 | handlers=logstash 23 | propagate=0 24 | qualname=gunicorn.access 25 | 26 | [handler_console] 27 | class=StreamHandler 28 | formatter=generic 29 | args=(sys.stdout, ) 30 | 31 | [handler_logstash] 32 | class=logstash.TCPLogstashHandler 33 | formatter=json 34 | args=('logstash', 5001) 35 | 36 | [formatter_generic] 37 | format=%(asctime)s [%(process)d] [%(levelname)s] %(message)s 38 | datefmt=%Y-%m-%d %H:%M:%S 39 | class=logging.Formatter 40 | 41 | [formatter_access] 42 | format=%(message)s 43 | class=logging.Formatter 44 | 45 | [formatter_json] 46 | class=pythonjsonlogger.jsonlogger.JsonFormatter -------------------------------------------------------------------------------- 
/exercise_notebooks/elk_exercise/kibana/config/kibana.yml: -------------------------------------------------------------------------------- 1 | --- 2 | ## Default Kibana configuration from Kibana base image. 3 | ## https://github.com/elastic/kibana/blob/master/src/dev/build/tasks/os_packages/docker_generator/templates/kibana_yml.template.js 4 | # 5 | server.name: kibana 6 | server.host: "0" 7 | elasticsearch.hosts: [ "http://elasticsearch:9200" ] 8 | xpack.monitoring.ui.container.elasticsearch.enabled: true 9 | 10 | ## X-Pack security credentials 11 | # 12 | elasticsearch.username: elastic 13 | elasticsearch.password: changeme -------------------------------------------------------------------------------- /exercise_notebooks/elk_exercise/logstash/config/logstash.yml: -------------------------------------------------------------------------------- 1 | --- 2 | ## Default Logstash configuration from Logstash base image. 3 | ## https://github.com/elastic/logstash/blob/master/docker/data/logstash/config/logstash-full.yml 4 | # 5 | http.host: "0.0.0.0" 6 | xpack.monitoring.elasticsearch.hosts: [ "http://elasticsearch:9200" ] 7 | 8 | ## X-Pack security credentials 9 | # 10 | xpack.monitoring.enabled: true 11 | xpack.monitoring.elasticsearch.username: elastic 12 | xpack.monitoring.elasticsearch.password: changeme 13 | -------------------------------------------------------------------------------- /exercise_notebooks/elk_exercise/logstash/pipeline/logstash.conf: -------------------------------------------------------------------------------- 1 | input { 2 | tcp { 3 | port => 5001 4 | tags => ["webapp_logs"] 5 | type => "webapp_logs" 6 | codec => json 7 | } 8 | } 9 | 10 | output { 11 | elasticsearch { 12 | hosts => "elasticsearch:9200" 13 | user => "elastic" 14 | password => "changeme" 15 | index => "webapp_logs-%{+YYYY.MM.dd}" 16 | } 17 | } -------------------------------------------------------------------------------- /exercise_notebooks/elk_exercise/requirements.txt: 
-------------------------------------------------------------------------------- 1 | Flask>=1.1.1,<1.2.0 2 | markupsafe==2.0.1 # https://github.com/aws/aws-sam-cli/issues/3661 3 | python3-logstash>=0.4.80,<0.5.0 4 | python-json-logger>=0.1.11,<0.2.0 5 | gunicorn>=20.0.4,<20.1.0 6 | 7 | -------------------------------------------------------------------------------- /exercise_notebooks/prometheus_exercise/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-alpine 2 | WORKDIR /application 3 | 4 | COPY ./requirements.txt requirements.txt 5 | RUN apk add --no-cache \ 6 | gcc \ 7 | libc-dev \ 8 | linux-headers \ 9 | bash; \ 10 | pip install -r requirements.txt; 11 | 12 | COPY . /application 13 | 14 | 15 | EXPOSE 5000 16 | VOLUME /application 17 | CMD gunicorn --workers=1 --bind 0.0.0.0:5000 application:application 18 | -------------------------------------------------------------------------------- /exercise_notebooks/prometheus_exercise/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/testing-and-monitoring-ml-deployments/88d0fdac0898178f5ed9611e44a56ea5e53c9dd5/exercise_notebooks/prometheus_exercise/app/__init__.py -------------------------------------------------------------------------------- /exercise_notebooks/prometheus_exercise/app/flask_app.py: -------------------------------------------------------------------------------- 1 | import prometheus_client 2 | from flask import Flask 3 | from werkzeug.middleware.dispatcher import DispatcherMiddleware 4 | from app.helpers.middleware import setup_metrics 5 | 6 | 7 | def index(): 8 | return 'home' 9 | 10 | 11 | def cpu(): 12 | # For older machines, you may want to lower 13 | # this range to prevent timeouts. 
14 | for i in range(10000): 15 | i**i 16 | 17 | return 'cpu intensive operation complete' 18 | 19 | 20 | def memory(): 21 | d = {} 22 | # For older machines, you may want to lower 23 | # this range to prevent timeouts. 24 | for i in range(10000000): 25 | i = str(i) 26 | i += "xyz" 27 | d[i] = i 28 | 29 | return 'memory intensive operation complete' 30 | 31 | 32 | def create_app(): 33 | main_app = Flask(__name__) 34 | main_app.add_url_rule('/', 'index', index) 35 | main_app.add_url_rule('/cpu', 'cpu', cpu) 36 | main_app.add_url_rule('/memory', 'memory', memory) 37 | setup_metrics(main_app) 38 | 39 | # Add prometheus wsgi middleware to route /metrics requests 40 | app = DispatcherMiddleware( 41 | app=main_app.wsgi_app, 42 | mounts={'/metrics': prometheus_client.make_wsgi_app()} 43 | ) 44 | 45 | return app 46 | -------------------------------------------------------------------------------- /exercise_notebooks/prometheus_exercise/app/helpers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/testing-and-monitoring-ml-deployments/88d0fdac0898178f5ed9611e44a56ea5e53c9dd5/exercise_notebooks/prometheus_exercise/app/helpers/__init__.py -------------------------------------------------------------------------------- /exercise_notebooks/prometheus_exercise/app/helpers/middleware.py: -------------------------------------------------------------------------------- 1 | from flask import request, Flask 2 | from flask.wrappers import Response 3 | from prometheus_client import Counter, Histogram 4 | import time 5 | 6 | 7 | # Counter and Histogram are examples of default metrics 8 | # available from the prometheus Python client. 
9 | REQUEST_COUNT = Counter( 10 | name='http_request_count', 11 | documentation='App Request Count', 12 | labelnames=['app_name', 'method', 'endpoint', 'http_status'] 13 | ) 14 | REQUEST_LATENCY = Histogram( 15 | name='http_request_latency_seconds', 16 | documentation='Request latency', 17 | labelnames=['app_name', 'endpoint'] 18 | ) 19 | 20 | 21 | def start_timer() -> None: 22 | """Get start time of a request.""" 23 | request._prometheus_metrics_request_start_time = time.time() 24 | 25 | 26 | def stop_timer(response: Response) -> Response: 27 | """Get stop time of a request..""" 28 | request_latency = time.time() - request._prometheus_metrics_request_start_time 29 | REQUEST_LATENCY.labels( 30 | app_name='webapp', 31 | endpoint=request.path).observe(request_latency) 32 | return response 33 | 34 | 35 | def record_request_data(response: Response) -> Response: 36 | """Capture request data. 37 | 38 | Uses the flask request object to extract information such as 39 | the HTTP request method, endpoint and HTTP status. 40 | """ 41 | REQUEST_COUNT.labels( 42 | app_name='webapp', 43 | method=request.method, 44 | endpoint=request.path, 45 | http_status=response.status_code).inc() 46 | return response 47 | 48 | 49 | def setup_metrics(app: Flask) -> None: 50 | """Setup Prometheus metrics. 51 | 52 | This function uses the flask before_request 53 | and after_request hooks to capture metrics 54 | with each HTTP request to the application. 
55 | """ 56 | app.before_request(start_timer) 57 | app.after_request(record_request_data) 58 | app.after_request(stop_timer) 59 | -------------------------------------------------------------------------------- /exercise_notebooks/prometheus_exercise/application.py: -------------------------------------------------------------------------------- 1 | from app.flask_app import create_app 2 | 3 | 4 | application = create_app() 5 | 6 | if __name__ == '__main__': 7 | application.run() 8 | -------------------------------------------------------------------------------- /exercise_notebooks/prometheus_exercise/config/grafana/grafana_flask_basic_dashboard.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "editable": true, 16 | "gnetId": null, 17 | "graphTooltip": 0, 18 | "id": 1, 19 | "links": [], 20 | "panels": [ 21 | { 22 | "aliasColors": {}, 23 | "bars": false, 24 | "dashLength": 10, 25 | "dashes": false, 26 | "datasource": "Prometheus", 27 | "fill": 1, 28 | "fillGradient": 0, 29 | "gridPos": { 30 | "h": 9, 31 | "w": 12, 32 | "x": 0, 33 | "y": 0 34 | }, 35 | "hiddenSeries": false, 36 | "id": 2, 37 | "legend": { 38 | "avg": false, 39 | "current": false, 40 | "max": false, 41 | "min": false, 42 | "show": true, 43 | "total": false, 44 | "values": false 45 | }, 46 | "lines": true, 47 | "linewidth": 1, 48 | "nullPointMode": "null", 49 | "options": { 50 | "dataLinks": [] 51 | }, 52 | "percentage": false, 53 | "pointradius": 2, 54 | "points": false, 55 | "renderer": "flot", 56 | "seriesOverrides": [], 57 | "spaceLength": 10, 58 | "stack": false, 59 | "steppedLine": false, 60 | "targets": [ 61 | { 62 | "expr": "rate(http_request_count_total{job=\"webapp\"}[1m])", 63 | 
"legendFormat": "{{app_name}} {{endpoint}} {{http_status}}", 64 | "refId": "A" 65 | } 66 | ], 67 | "thresholds": [], 68 | "timeFrom": null, 69 | "timeRegions": [], 70 | "timeShift": null, 71 | "title": "Requests Rate", 72 | "tooltip": { 73 | "shared": true, 74 | "sort": 0, 75 | "value_type": "individual" 76 | }, 77 | "type": "graph", 78 | "xaxis": { 79 | "buckets": null, 80 | "mode": "time", 81 | "name": null, 82 | "show": true, 83 | "values": [] 84 | }, 85 | "yaxes": [ 86 | { 87 | "format": "short", 88 | "label": null, 89 | "logBase": 1, 90 | "max": null, 91 | "min": null, 92 | "show": true 93 | }, 94 | { 95 | "format": "short", 96 | "label": null, 97 | "logBase": 1, 98 | "max": null, 99 | "min": null, 100 | "show": true 101 | } 102 | ], 103 | "yaxis": { 104 | "align": false, 105 | "alignLevel": null 106 | } 107 | }, 108 | { 109 | "aliasColors": {}, 110 | "bars": false, 111 | "dashLength": 10, 112 | "dashes": false, 113 | "datasource": "Prometheus", 114 | "fill": 1, 115 | "fillGradient": 0, 116 | "gridPos": { 117 | "h": 9, 118 | "w": 12, 119 | "x": 12, 120 | "y": 0 121 | }, 122 | "hiddenSeries": false, 123 | "id": 3, 124 | "legend": { 125 | "avg": false, 126 | "current": false, 127 | "max": false, 128 | "min": false, 129 | "show": true, 130 | "total": false, 131 | "values": false 132 | }, 133 | "lines": true, 134 | "linewidth": 1, 135 | "nullPointMode": "null", 136 | "options": { 137 | "dataLinks": [] 138 | }, 139 | "percentage": false, 140 | "pointradius": 2, 141 | "points": false, 142 | "renderer": "flot", 143 | "seriesOverrides": [], 144 | "spaceLength": 10, 145 | "stack": false, 146 | "steppedLine": false, 147 | "targets": [ 148 | { 149 | "expr": "rate(http_request_latency_seconds_sum{job=\"webapp\"}[1m]) / rate(http_request_latency_seconds_count{job=\"webapp\"}[1m])", 150 | "legendFormat": "{{endpoint}} (seconds)", 151 | "refId": "B" 152 | } 153 | ], 154 | "thresholds": [], 155 | "timeFrom": null, 156 | "timeRegions": [], 157 | "timeShift": null, 158 | 
"title": "Latency", 159 | "tooltip": { 160 | "shared": true, 161 | "sort": 0, 162 | "value_type": "individual" 163 | }, 164 | "type": "graph", 165 | "xaxis": { 166 | "buckets": null, 167 | "mode": "time", 168 | "name": null, 169 | "show": true, 170 | "values": [] 171 | }, 172 | "yaxes": [ 173 | { 174 | "format": "short", 175 | "label": null, 176 | "logBase": 1, 177 | "max": null, 178 | "min": null, 179 | "show": true 180 | }, 181 | { 182 | "format": "short", 183 | "label": null, 184 | "logBase": 1, 185 | "max": null, 186 | "min": null, 187 | "show": true 188 | } 189 | ], 190 | "yaxis": { 191 | "align": false, 192 | "alignLevel": null 193 | } 194 | } 195 | ], 196 | "schemaVersion": 21, 197 | "style": "dark", 198 | "tags": [], 199 | "templating": { 200 | "list": [] 201 | }, 202 | "time": { 203 | "from": "now-5m", 204 | "to": "now" 205 | }, 206 | "timepicker": { 207 | "refresh_intervals": [ 208 | "5s", 209 | "10s", 210 | "30s", 211 | "1m", 212 | "5m", 213 | "15m", 214 | "30m", 215 | "1h", 216 | "2h", 217 | "1d" 218 | ] 219 | }, 220 | "timezone": "", 221 | "title": "Really Simple Flask Dashboard", 222 | "uid": "q8vgEpLZk", 223 | "version": 4 224 | } -------------------------------------------------------------------------------- /exercise_notebooks/prometheus_exercise/config/prometheus/prometheus.yml: -------------------------------------------------------------------------------- 1 | # my global config 2 | global: 3 | scrape_interval: 15s # By default, scrape targets every 15 seconds. 4 | evaluation_interval: 15s # By default, scrape targets every 15 seconds. 5 | # scrape_timeout is set to the global default (10s). 6 | 7 | # Attach these labels to any time series or alerts when communicating with 8 | # external systems (federation, remote storage, Alertmanager). 9 | external_labels: 10 | monitor: 'my-project' 11 | 12 | # A scrape configuration containing exactly one endpoint to scrape: 13 | # Here it's Prometheus itself. 
14 | scrape_configs: 15 | # The job name is added as a label `job=` to any timeseries scraped from this config. 16 | - job_name: 'prometheus' 17 | 18 | # Override the global default and scrape targets from this job every 5 seconds. 19 | scrape_interval: 5s 20 | 21 | # metrics_path defaults to '/metrics' 22 | # scheme defaults to 'http'. 23 | 24 | static_configs: 25 | - targets: ['prometheus:9090'] 26 | - job_name: 'webapp' 27 | 28 | # Override the global default and scrape targets from this job every 5 seconds. 29 | scrape_interval: 5s 30 | 31 | # metrics_path defaults to '/metrics' 32 | # scheme defaults to 'http'. 33 | static_configs: 34 | - targets: ['webapp:5000'] 35 | 36 | - job_name: 'cadvisor' 37 | 38 | # Override the global default and scrape targets from this job every 5 seconds. 39 | scrape_interval: 5s 40 | 41 | static_configs: 42 | - targets: ['cadvisor:8080'] 43 | -------------------------------------------------------------------------------- /exercise_notebooks/prometheus_exercise/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | volumes: 4 | prometheus_data: {} 5 | grafana_data: {} 6 | 7 | services: 8 | webapp: 9 | build: . 
10 | container_name: webapp 11 | expose: 12 | - 5000 13 | ports: 14 | - 5000:5000 15 | volumes: 16 | - ./:/application 17 | prometheus: 18 | image: prom/prometheus 19 | container_name: prometheus 20 | volumes: 21 | - ./config/prometheus/:/etc/prometheus/ 22 | - prometheus_data:/prometheus 23 | command: 24 | - '--config.file=/etc/prometheus/prometheus.yml' 25 | expose: 26 | - 9090 27 | ports: 28 | - 9090:9090 29 | depends_on: 30 | - cadvisor 31 | grafana: 32 | image: grafana/grafana 33 | depends_on: 34 | - prometheus 35 | ports: 36 | - 3000:3000 37 | volumes: 38 | - grafana_data:/var/lib/grafana 39 | environment: 40 | - GF_SECURITY_ADMIN_PASSWORD=foobar 41 | - GF_USERS_ALLOW_SIGN_UP=false 42 | 43 | cadvisor: 44 | image: google/cadvisor 45 | volumes: 46 | - /:/rootfs:ro 47 | - /var/run:/var/run:rw 48 | - /sys:/sys:ro 49 | - /var/lib/docker/:/var/lib/docker:ro 50 | ports: 51 | - 8080:8080 52 | -------------------------------------------------------------------------------- /exercise_notebooks/prometheus_exercise/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask>=1.1.1,<1.2.0 2 | markupsafe==2.0.1 # https://github.com/aws/aws-sam-cli/issues/3661 3 | prometheus_client>=0.7.1,<0.8.0 4 | gunicorn>=20.0.4,<20.1.0 5 | 6 | -------------------------------------------------------------------------------- /exercise_notebooks/shadow_mode_exercise/requirements.txt: -------------------------------------------------------------------------------- 1 | # ML requirements 2 | numpy>=1.20.0,<1.21.0 3 | pandas>=1.3.5,<1.4.0 4 | scikit-learn>=1.0.2,<1.1.0 5 | jupyter>=1.0.0,<1.1.0 6 | feature_engine>=0.3.1,<0.4.0 7 | joblib>=1.0.1,<1.1.0 8 | matplotlib>=3.1.3,<3.2.0 9 | seaborn>=0.10.0,<0.11.0 10 | jupyter>=1.0.0,<1.1.0 11 | 12 | # Persistence 13 | sqlalchemy>=1.3.11,<1.4.0 # ORM 14 | psycopg2>=2.8.4,<2.9.0 # DB Driver 15 | alembic>=1.3.1,<1.4.0 # DB Migrations 16 | sqlalchemy_utils>=0.36.0,<0.37.0 # DB Utils 
-------------------------------------------------------------------------------- /exercise_notebooks/unit_testing_exercise/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.20.0,<1.21.0 2 | pandas>=1.3.5,<1.4.0 3 | scikit-learn>=1.0.2,<1.1.0 4 | jupyter>=1.0.0,<1.1.0 -------------------------------------------------------------------------------- /exercise_notebooks/unit_testing_exercise/unit_testing_data_engineering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit Testing ML Code: Hands-on Exercise (Data Engineering)\n", 8 | "\n", 9 | "## In this notebook we will explore unit tests for data engineering\n", 10 | "\n", 11 | "#### We will use a classic toy dataset: the Iris plants dataset, which comes included with scikit-learn\n", 12 | "Dataset details: https://scikit-learn.org/stable/datasets/index.html#iris-plants-dataset\n", 13 | "\n", 14 | "As we progress through the course, the complexity of examples will increase, but we will start with something basic. This notebook is designed so that it can be run in isolation, once the setup steps described below are complete.\n", 15 | "\n", 16 | "### Setup\n", 17 | "\n", 18 | "Let's begin by importing the dataset and the libraries we are going to use. Make sure you have run `pip install -r requirements.txt` on requirements file located in the same directory as this notebook. We recommend doing this in a separate virtual environment (see dedicated setup lecture).\n", 19 | "\n", 20 | "If you need a refresher on jupyter, pandas or numpy, there are some links to resources in the section notes." 
21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 15, 26 | "metadata": { 27 | "pycharm": { 28 | "is_executing": false, 29 | "name": "#%%\n" 30 | } 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "from sklearn import datasets\n", 35 | "import pandas as pd\n", 36 | "import numpy as np\n", 37 | "\n", 38 | "# Access the iris dataset from sklearn\n", 39 | "iris = datasets.load_iris()\n", 40 | "\n", 41 | "# Load the iris data into a pandas dataframe. The `data` and `feature_names`\n", 42 | "# attributes of the dataset are added by default by sklearn. We use them to\n", 43 | "# specify the columns of our dataframes.\n", 44 | "iris_frame = pd.DataFrame(iris.data, columns=iris.feature_names)\n", 45 | "\n", 46 | "# Create a \"target\" column in our dataframe, and set the values to the correct\n", 47 | "# classifications from the dataset.\n", 48 | "iris_frame['target'] = iris.target" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "### Add the `SimplePipeline` from the Test Input Values notebook (same as previous lecture, no changes here)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 16, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "from sklearn.linear_model import LogisticRegression\n", 65 | "from sklearn.model_selection import train_test_split\n", 66 | "\n", 67 | "\n", 68 | "class SimplePipeline:\n", 69 | " def __init__(self):\n", 70 | " self.frame = None\n", 71 | " # Shorthand to specify that each value should start out as\n", 72 | " # None when the class is instantiated.\n", 73 | " self.X_train, self.X_test, self.y_train, self.Y_test = None, None, None, None\n", 74 | " self.model = None\n", 75 | " self.load_dataset()\n", 76 | " \n", 77 | " def load_dataset(self):\n", 78 | " \"\"\"Load the dataset and perform train test split.\"\"\"\n", 79 | " # fetch from sklearn\n", 80 | " dataset = datasets.load_iris()\n", 81 | " \n", 82 | " # remove units ' (cm)' from 
variable names\n", 83 | "        self.feature_names = [fn[:-5] for fn in dataset.feature_names]\n", 84 | "        self.frame = pd.DataFrame(dataset.data, columns=self.feature_names)\n", 85 | "        self.frame['target'] = dataset.target\n", 86 | "        \n", 87 | "        # we divide the data set using the train_test_split function from sklearn, \n", 88 | "        # which takes as parameters, the dataframe with the predictor variables, \n", 89 | "        # then the target, then the percentage of data to assign to the test set, \n", 90 | "        # and finally the random_state to ensure reproducibility.\n", 91 | "        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(\n", 92 | "            self.frame[self.feature_names], self.frame.target, test_size=0.65, random_state=42)\n", 93 | "    \n", 94 | "    def train(self, algorithm=LogisticRegression):\n", 95 | "        \n", 96 | "        # we set up a LogisticRegression classifier with default parameters\n", 97 | "        self.model = algorithm(solver='lbfgs', multi_class='auto')\n", 98 | "        self.model.fit(self.X_train, self.y_train)\n", 99 | "    \n", 100 | "    def predict(self, input_data):\n", 101 | "        return self.model.predict(input_data)\n", 102 | "    \n", 103 | "    def get_accuracy(self):\n", 104 | "        \n", 105 | "        # use our X_test and y_test values generated when we used\n", 106 | "        # `train_test_split` to test accuracy.\n", 107 | "        # score is a method on the Logistic Regression that \n", 108 | "        # returns the accuracy by default, but can be changed to other metrics, see: \n", 109 | "        # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.score\n", 110 | "        return self.model.score(X=self.X_test, y=self.y_test)\n", 111 | "    \n", 112 | "    def run_pipeline(self):\n", 113 | "        \"\"\"Helper method to run multiple pipeline methods with one call.\"\"\"\n", 114 | "        self.load_dataset()\n", 115 | "        self.train()" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "### Test 
Engineered Data (preprocessing)\n", 123 | "\n", 124 | "Below we create an updated pipeline which inherits from the SimplePipeline but has new functionality to preprocess the data by applying a scaler. Linear models are sensitive to the scale of the features. For example, features with bigger magnitudes tend to dominate if we do not apply a scaler." 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 17, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "from sklearn.preprocessing import StandardScaler\n", 134 | "\n", 135 | "\n", 136 | "class PipelineWithDataEngineering(SimplePipeline):\n", 137 | "    def __init__(self):\n", 138 | "        # Call the inherited SimplePipeline __init__ method first.\n", 139 | "        super().__init__()\n", 140 | "        \n", 141 | "        # scaler to standardize the variables in the dataset\n", 142 | "        self.scaler = StandardScaler()\n", 143 | "        # Train the scaler once upon pipeline instantiation:\n", 144 | "        # Compute the mean and standard deviation based on the training data\n", 145 | "        self.scaler.fit(self.X_train)\n", 146 | "    \n", 147 | "    def apply_scaler(self):\n", 148 | "        # Scale the test and training data to be of mean 0 and of unit variance\n", 149 | "        self.X_train = self.scaler.transform(self.X_train)\n", 150 | "        self.X_test = self.scaler.transform(self.X_test)\n", 151 | "        \n", 152 | "    def predict(self, input_data):\n", 153 | "        # apply scaler transform on inputs before predictions\n", 154 | "        scaled_input_data = self.scaler.transform(input_data)\n", 155 | "        return self.model.predict(scaled_input_data)\n", 156 | "    \n", 157 | "    def run_pipeline(self):\n", 158 | "        \"\"\"Helper method to run multiple pipeline methods with one call.\"\"\"\n", 159 | "        self.load_dataset()\n", 160 | "        self.apply_scaler()  # updated in this class\n", 161 | "        self.train()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 18, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | 
"output_type": "stream", 172 | "text": [ 173 | "current model accuracy is: 0.9591836734693877\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "pipeline = PipelineWithDataEngineering()\n", 179 | "pipeline.run_pipeline()\n", 180 | "accuracy_score = pipeline.get_accuracy()\n", 181 | "print(f'current model accuracy is: {accuracy_score}')" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "### Now we Unit Test\n", 189 | "We focus specifically on the feature engineering step" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 19, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "import unittest\n", 199 | "\n", 200 | "\n", 201 | "class TestIrisDataEngineering(unittest.TestCase):\n", 202 | " def setUp(self):\n", 203 | " self.pipeline = PipelineWithDataEngineering()\n", 204 | " self.pipeline.load_dataset()\n", 205 | " \n", 206 | " def test_scaler_preprocessing_brings_x_train_mean_near_zero(self):\n", 207 | " # Given\n", 208 | " # convert the dataframe to be a single column with pandas stack\n", 209 | " original_mean = self.pipeline.X_train.stack().mean()\n", 210 | " \n", 211 | " # When\n", 212 | " self.pipeline.apply_scaler()\n", 213 | " \n", 214 | " # Then\n", 215 | " # The idea behind StandardScaler is that it will transform your data \n", 216 | " # to center the distribution at 0 and scale the variance at 1.\n", 217 | " # Therefore we test that the mean has shifted to be less than the original\n", 218 | " # and close to 0 using assertAlmostEqual to check to 3 decimal places:\n", 219 | " # https://docs.python.org/3/library/unittest.html#unittest.TestCase.assertAlmostEqual\n", 220 | " self.assertTrue(original_mean > self.pipeline.X_train.mean()) # X_train is a numpy array at this point.\n", 221 | " self.assertAlmostEqual(self.pipeline.X_train.mean(), 0.0, places=3)\n", 222 | " print(f'Original X train mean: {original_mean}')\n", 223 | " print(f'Transformed X train mean: 
{self.pipeline.X_train.mean()}')\n", 224 | " \n", 225 | " def test_scaler_preprocessing_brings_x_train_std_near_one(self):\n", 226 | " # When\n", 227 | " self.pipeline.apply_scaler()\n", 228 | " \n", 229 | " # Then\n", 230 | " # We also check that the standard deviation is close to 1\n", 231 | " self.assertAlmostEqual(self.pipeline.X_train.std(), 1.0, places=3)\n", 232 | " print(f'Transformed X train standard deviation : {self.pipeline.X_train.std()}')\n", 233 | " " 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 20, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "name": "stderr", 243 | "output_type": "stream", 244 | "text": [ 245 | ".." 246 | ] 247 | }, 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | "Original X train mean: 3.5889423076923075\n", 253 | "Transformed X train mean: -5.978123978750843e-17\n", 254 | "Transformed X train standard deviation : 1.0\n" 255 | ] 256 | }, 257 | { 258 | "name": "stderr", 259 | "output_type": "stream", 260 | "text": [ 261 | "\n", 262 | "----------------------------------------------------------------------\n", 263 | "Ran 2 tests in 0.029s\n", 264 | "\n", 265 | "OK\n" 266 | ] 267 | }, 268 | { 269 | "data": { 270 | "text/plain": [ 271 | "" 272 | ] 273 | }, 274 | "execution_count": 20, 275 | "metadata": {}, 276 | "output_type": "execute_result" 277 | } 278 | ], 279 | "source": [ 280 | "import sys\n", 281 | "\n", 282 | "\n", 283 | "suite = unittest.TestLoader().loadTestsFromTestCase(TestIrisDataEngineering)\n", 284 | "unittest.TextTestRunner(verbosity=1, stream=sys.stderr).run(suite)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "## Data Engineering Test: Hands-on Exercise\n", 292 | "Change the pipeline class preprocessing so that the test fails. Do you understand why the test is failing?" 
293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [] 301 | } 302 | ], 303 | "metadata": { 304 | "kernelspec": { 305 | "display_name": "Python 3", 306 | "language": "python", 307 | "name": "python3" 308 | }, 309 | "language_info": { 310 | "codemirror_mode": { 311 | "name": "ipython", 312 | "version": 3 313 | }, 314 | "file_extension": ".py", 315 | "mimetype": "text/x-python", 316 | "name": "python", 317 | "nbconvert_exporter": "python", 318 | "pygments_lexer": "ipython3", 319 | "version": "3.7.6" 320 | }, 321 | "pycharm": { 322 | "stem_cell": { 323 | "cell_type": "raw", 324 | "metadata": { 325 | "collapsed": false 326 | }, 327 | "source": [] 328 | } 329 | } 330 | }, 331 | "nbformat": 4, 332 | "nbformat_minor": 1 333 | } 334 | -------------------------------------------------------------------------------- /exercise_notebooks/unit_testing_exercise/unit_testing_model_configuration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit Testing ML Code: Hands-on Exercise (Configuration)\n", 8 | "\n", 9 | "## In this notebook we will explore unit tests for *model configuration*\n", 10 | "\n", 11 | "#### We will use a classic toy dataset: the Iris plants dataset, which comes included with scikit-learn\n", 12 | "Dataset details: https://scikit-learn.org/stable/datasets/index.html#iris-plants-dataset\n", 13 | "\n", 14 | "As we progress through the course, the complexity of examples will increase, but we will start with something basic. This notebook is designed so that it can be run in isolation, once the setup steps described below are complete. Cells should be run one after the other without skipping any.\n", 15 | "\n", 16 | "### Setup\n", 17 | "\n", 18 | "Let's begin by importing the dataset and the libraries we are going to use. 
Make sure you have run `pip install -r requirements.txt` on requirements file located in the same directory as this notebook. We recommend doing this in a separate virtual environment (see dedicated setup lecture).\n", 19 | "\n", 20 | "If you need a refresher on jupyter, pandas or numpy, there are some links to resources in the section notes." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": { 27 | "pycharm": { 28 | "is_executing": false, 29 | "name": "#%%\n" 30 | } 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "from sklearn import datasets\n", 35 | "import pandas as pd\n", 36 | "import numpy as np\n", 37 | "\n", 38 | "# Access the iris dataset from sklearn\n", 39 | "iris = datasets.load_iris()\n", 40 | "\n", 41 | "# Load the iris data into a pandas dataframe. The `data` and `feature_names`\n", 42 | "# attributes of the dataset are added by default by sklearn. We use them to\n", 43 | "# specify the columns of our dataframes.\n", 44 | "iris_frame = pd.DataFrame(iris.data, columns=iris.feature_names)\n", 45 | "\n", 46 | "# Create a \"target\" column in our dataframe, and set the values to the correct\n", 47 | "# classifications from the dataset.\n", 48 | "iris_frame['target'] = iris.target" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "### Add the `SimplePipeline` from the Test Input Values notebook (same as first exercise, no changes here)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "from sklearn.linear_model import LogisticRegression\n", 65 | "from sklearn.model_selection import train_test_split\n", 66 | "\n", 67 | "\n", 68 | "class SimplePipeline:\n", 69 | " def __init__(self):\n", 70 | " self.frame = None\n", 71 | " # Shorthand to specify that each value should start out as\n", 72 | " # None when the class is instantiated.\n", 73 | " self.X_train, self.X_test, self.y_train, 
self.y_test = None, None, None, None\n", 74 | "        self.model = None\n", 75 | "        self.load_dataset()\n", 76 | "    \n", 77 | "    def load_dataset(self):\n", 78 | "        \"\"\"Load the dataset and perform train test split.\"\"\"\n", 79 | "        # fetch from sklearn\n", 80 | "        dataset = datasets.load_iris()\n", 81 | "        \n", 82 | "        # remove units ' (cm)' from variable names\n", 83 | "        self.feature_names = [fn[:-5] for fn in dataset.feature_names]\n", 84 | "        self.frame = pd.DataFrame(dataset.data, columns=self.feature_names)\n", 85 | "        self.frame['target'] = dataset.target\n", 86 | "        \n", 87 | "        # we divide the data set using the train_test_split function from sklearn, \n", 88 | "        # which takes as parameters, the dataframe with the predictor variables, \n", 89 | "        # then the target, then the percentage of data to assign to the test set, \n", 90 | "        # and finally the random_state to ensure reproducibility.\n", 91 | "        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(\n", 92 | "            self.frame[self.feature_names], self.frame.target, test_size=0.65, random_state=42)\n", 93 | "    \n", 94 | "    def train(self, algorithm=LogisticRegression):\n", 95 | "        \n", 96 | "        # we set up a LogisticRegression classifier with default parameters\n", 97 | "        self.model = algorithm(solver='lbfgs', multi_class='auto')\n", 98 | "        self.model.fit(self.X_train, self.y_train)\n", 99 | "    \n", 100 | "    def predict(self, input_data):\n", 101 | "        return self.model.predict(input_data)\n", 102 | "    \n", 103 | "    def get_accuracy(self):\n", 104 | "        \n", 105 | "        # use our X_test and y_test values generated when we used\n", 106 | "        # `train_test_split` to test accuracy.\n", 107 | "        # score is a method on the Logistic Regression that \n", 108 | "        # returns the accuracy by default, but can be changed to other metrics, see: \n", 109 | "        # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.score\n", 110 | "        return 
self.model.score(X=self.X_test, y=self.y_test)\n", 111 | " \n", 112 | " def run_pipeline(self):\n", 113 | " \"\"\"Helper method to run multiple pipeline methods with one call.\"\"\"\n", 114 | " self.load_dataset()\n", 115 | " self.train()" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "### Update the Pipeline\n", 123 | "\n", 124 | "We now create a new pipeline class which inherits from the `SimplePipeline` with one important modification: The configuration for the model is passed in as an argument when the pipeline object is instantiated. This means that configuration can be set via an external object or file." 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 3, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "class PipelineWithConfig(SimplePipeline):\n", 134 | " def __init__(self, config):\n", 135 | " # Call the inherited SimplePipeline __init__ method first.\n", 136 | " super().__init__()\n", 137 | " # Pass in a config object which we use during the train method.\n", 138 | " self.config = config\n", 139 | " \n", 140 | " def train(self, algorithm=LogisticRegression):\n", 141 | " # note that we instantiate the LogisticRegression classifier \n", 142 | " # with params from the pipeline config\n", 143 | " self.model = algorithm(solver=self.config.get('solver'),\n", 144 | " multi_class=self.config.get('multi_class'))\n", 145 | " self.model.fit(self.X_train, self.y_train)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "### Now we Unit Test\n", 153 | "\n", 154 | "We will employ a simple unit test to check the configuration values.\n", 155 | "\n", 156 | "Let's say that after extensive testing in the research environment, we deduce that certain types of configuration (parameters passed to the model, preprocessing settings, GPU configurations etc.) are optimal, or that certain configurations tend to be a bad idea. 
We should then test our configuration is validated against this understanding." 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 4, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "import unittest\n", 166 | "\n", 167 | "\n", 168 | "# arbitrarily selected for demonstration purposes. In a real\n", 169 | "# system you would define this in config and import into your\n", 170 | "# tests so you didn't have to update config and tests when\n", 171 | "# the values changed.\n", 172 | "ENABLED_MODEL_SOLVERS = {'lbfgs', 'newton-cg'}\n", 173 | "\n", 174 | "\n", 175 | "class TestIrisConfig(unittest.TestCase):\n", 176 | " def setUp(self):\n", 177 | " # We prepare the pipeline for use in the tests\n", 178 | " config = {'solver': 'lbfgs', 'multi_class': 'auto'}\n", 179 | " self.pipeline = PipelineWithConfig(config=config)\n", 180 | " self.pipeline.run_pipeline()\n", 181 | " \n", 182 | " def test_pipeline_config(self):\n", 183 | " # Given\n", 184 | " # fetch model config using sklearn get_params()\n", 185 | " # https://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html#sklearn.base.BaseEstimator.get_params\n", 186 | " model_params = self.pipeline.model.get_params()\n", 187 | " \n", 188 | " # Then\n", 189 | " self.assertTrue(model_params['solver'] in ENABLED_MODEL_SOLVERS)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 5, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "name": "stderr", 199 | "output_type": "stream", 200 | "text": [ 201 | ".\n", 202 | "----------------------------------------------------------------------\n", 203 | "Ran 1 test in 0.034s\n", 204 | "\n", 205 | "OK\n" 206 | ] 207 | }, 208 | { 209 | "data": { 210 | "text/plain": [ 211 | "" 212 | ] 213 | }, 214 | "execution_count": 5, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "import sys\n", 221 | "\n", 222 | "\n", 223 | "suite = 
unittest.TestLoader().loadTestsFromTestCase(TestIrisConfig)\n", 224 | "unittest.TextTestRunner(verbosity=1, stream=sys.stderr).run(suite)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "## Model Configuration Testing: Hands-on Exercise\n", 232 | "Change the model config so that the test fails. Do you understand why the test is failing?" 233 | ] 234 | } 235 | ], 236 | "metadata": { 237 | "kernelspec": { 238 | "display_name": "Python 3", 239 | "language": "python", 240 | "name": "python3" 241 | }, 242 | "language_info": { 243 | "codemirror_mode": { 244 | "name": "ipython", 245 | "version": 3 246 | }, 247 | "file_extension": ".py", 248 | "mimetype": "text/x-python", 249 | "name": "python", 250 | "nbconvert_exporter": "python", 251 | "pygments_lexer": "ipython3", 252 | "version": "3.7.6" 253 | }, 254 | "pycharm": { 255 | "stem_cell": { 256 | "cell_type": "raw", 257 | "metadata": { 258 | "collapsed": false 259 | }, 260 | "source": [] 261 | } 262 | } 263 | }, 264 | "nbformat": 4, 265 | "nbformat_minor": 1 266 | } 267 | -------------------------------------------------------------------------------- /exercise_notebooks/unit_testing_exercise/unit_testing_model_predictions_quality.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit Testing ML Code: Hands-on Exercise (Model Quality)\n", 8 | "\n", 9 | "## In this notebook we will explore unit tests for *model prediction quality*\n", 10 | "\n", 11 | "#### We will use a classic toy dataset: the Iris plants dataset, which comes included with scikit-learn\n", 12 | "Dataset details: https://scikit-learn.org/stable/datasets/index.html#iris-plants-dataset\n", 13 | "\n", 14 | "As we progress through the course, the complexity of examples will increase, but we will start with something basic. 
This notebook is designed so that it can be run in isolation, once the setup steps described below are complete. Cells should be run one after the other without skipping any.\n", 15 | "\n", 16 | "### Setup\n", 17 | "\n", 18 | "Let's begin by importing the dataset and the libraries we are going to use. Make sure you have run `pip install -r requirements.txt` on requirements file located in the same directory as this notebook. We recommend doing this in a separate virtual environment (see dedicated setup lecture).\n", 19 | "\n", 20 | "If you need a refresher on jupyter, pandas or numpy, there are some links to resources in the section notes." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": { 27 | "pycharm": { 28 | "is_executing": false, 29 | "name": "#%%\n" 30 | } 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "from sklearn import datasets\n", 35 | "import pandas as pd\n", 36 | "import numpy as np\n", 37 | "\n", 38 | "# Access the iris dataset from sklearn\n", 39 | "iris = datasets.load_iris()\n", 40 | "\n", 41 | "# Load the iris data into a pandas dataframe. The `data` and `feature_names`\n", 42 | "# attributes of the dataset are added by default by sklearn. We use them to\n", 43 | "# specify the columns of our dataframes.\n", 44 | "iris_frame = pd.DataFrame(iris.data, columns=iris.feature_names)\n", 45 | "\n", 46 | "# Create a \"target\" column in our dataframe, and set the values to the correct\n", 47 | "# classifications from the dataset.\n", 48 | "iris_frame['target'] = iris.target" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "### Create the Pipelines\n", 56 | "\n", 57 | "Below we use both pipelines from the previous exercises:\n", 58 | "\n", 59 | "- `SimplePipeline` from the testing inputs lecture\n", 60 | "- `PipelineWithFeatureEngineering` from the testing data engineering lecture\n", 61 | "\n", 62 | "The pipelines have not been changed. 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


class SimplePipeline:
    """Minimal load/train/predict pipeline for the iris dataset."""

    def __init__(self):
        self.frame = None
        # Shorthand to specify that each value should start out as
        # None when the class is instantiated.
        # FIX: was `self.Y_test` (capital Y) — a dead attribute that never
        # matched the `self.y_test` assigned in `load_dataset`. Lowercased
        # so the placeholder and the real attribute are one and the same.
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        self.model = None
        self.load_dataset()

    def load_dataset(self):
        """Load the dataset and perform train test split."""
        # fetch from sklearn
        dataset = datasets.load_iris()

        # remove units ' (cm)' from variable names
        self.feature_names = [fn[:-5] for fn in dataset.feature_names]
        self.frame = pd.DataFrame(dataset.data, columns=self.feature_names)
        self.frame['target'] = dataset.target

        # we divide the data set using the train_test_split function from sklearn,
        # which takes as parameters, the dataframe with the predictor variables,
        # then the target, then the percentage of data to assign to the test set,
        # and finally the random_state to ensure reproducibility.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.frame[self.feature_names], self.frame.target, test_size=0.65, random_state=42)

    def train(self, algorithm=LogisticRegression):
        """Fit a classifier (LogisticRegression by default) on the train split."""
        self.model = algorithm(solver='lbfgs', multi_class='auto')
        self.model.fit(self.X_train, self.y_train)

    def predict(self, input_data):
        """Return model predictions for `input_data`."""
        return self.model.predict(input_data)

    def get_accuracy(self):
        """Return accuracy on the held-out test split.

        `score` on LogisticRegression returns accuracy by default, but can
        be changed to other metrics, see:
        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.score
        """
        return self.model.score(X=self.X_test, y=self.y_test)

    def run_pipeline(self):
        """Helper method to run multiple pipeline methods with one call."""
        self.load_dataset()
        self.train()
self.scaler.transform(input_data)\n", 153 | " return self.model.predict(scaled_input_data)\n", 154 | " \n", 155 | " def run_pipeline(self):\n", 156 | " \"\"\"Helper method to run multiple pipeline methods with one call.\"\"\"\n", 157 | " self.load_dataset()\n", 158 | " self.apply_scaler() # updated in the this class\n", 159 | " self.train()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "### Now we Unit Test\n", 167 | "\n", 168 | "We will employ a few different tests for model prediction quality:\n", 169 | "\n", 170 | "1. A benchmark test: checking model accuracy against a simple benchmark\n", 171 | "2. A differential test: checking model accuracy from one version to the next\n", 172 | "\n", 173 | "To begin, let's establish a base line. The simplest baseline is predicting the most common class. If we run: " 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 4, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/plain": [ 184 | "2 50\n", 185 | "1 50\n", 186 | "0 50\n", 187 | "Name: target, dtype: int64" 188 | ] 189 | }, 190 | "execution_count": 4, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "iris_frame['target'].value_counts()" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "We can see that there an equal number of classifications for the 3 flower types. Let's check the accuracy when always predicting classification 1. Obviously this is a very low benchmark (circa 33% accuracy on the dataset), but it serves to illustrate the sort of checks you should be running with your models. If this test fails, then our model accuracy is terrible and we have probably introduced a severe bug into our code." 
import unittest
from sklearn.metrics import mean_squared_error, accuracy_score


class TestIrisPredictions(unittest.TestCase):
    """Model-quality checks: a benchmark test and a differential test."""

    def setUp(self):
        # Build and fully run both pipeline versions so each test can
        # query accuracies and predictions directly.
        self.pipeline_v1 = SimplePipeline()
        self.pipeline_v2 = PipelineWithDataEngineering()
        for pipeline in (self.pipeline_v1, self.pipeline_v2):
            pipeline.run_pipeline()

        # The naive benchmark: predict class 1 for every test entry.
        self.benchmark_predictions = [1.0] * len(self.pipeline_v1.y_test)

    def test_accuracy_higher_than_benchmark(self):
        # Given
        predictions = self.pipeline_v1.predict(self.pipeline_v1.X_test)
        benchmark_accuracy = accuracy_score(
            y_true=self.pipeline_v1.y_test,
            y_pred=self.benchmark_predictions)

        # When
        actual_accuracy = accuracy_score(
            y_true=self.pipeline_v1.y_test,
            y_pred=predictions)

        # Then
        print(f'model accuracy: {actual_accuracy}, benchmark accuracy: {benchmark_accuracy}')
        self.assertTrue(actual_accuracy > benchmark_accuracy)

    def test_accuracy_compared_to_previous_version(self):
        # When
        v1_accuracy = self.pipeline_v1.get_accuracy()
        v2_accuracy = self.pipeline_v2.get_accuracy()
        print(f'pipeline v1 accuracy: {v1_accuracy}')
        print(f'pipeline v2 accuracy: {v2_accuracy}')

        # Then
        self.assertTrue(v2_accuracy >= v1_accuracy)
import sys


# Collect the test case into a suite and run it, streaming the unittest
# report to stderr so it shows up in the notebook output.
runner = unittest.TextTestRunner(verbosity=1, stream=sys.stderr)
runner.run(unittest.TestLoader().loadTestsFromTestCase(TestIrisPredictions))
Change either the SimplePipeline or the PipelineWithDataEngineering classes so that `test_accuracy_compared_to_previous_version` **passes**. \n", 323 | "\n", 324 | "These tests are a little more open ended than others we have looked at, don't worry if you find them tricky!" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [] 333 | } 334 | ], 335 | "metadata": { 336 | "kernelspec": { 337 | "display_name": "Python 3", 338 | "language": "python", 339 | "name": "python3" 340 | }, 341 | "language_info": { 342 | "codemirror_mode": { 343 | "name": "ipython", 344 | "version": 3 345 | }, 346 | "file_extension": ".py", 347 | "mimetype": "text/x-python", 348 | "name": "python", 349 | "nbconvert_exporter": "python", 350 | "pygments_lexer": "ipython3", 351 | "version": "3.7.6" 352 | }, 353 | "pycharm": { 354 | "stem_cell": { 355 | "cell_type": "raw", 356 | "metadata": { 357 | "collapsed": false 358 | }, 359 | "source": [] 360 | } 361 | } 362 | }, 363 | "nbformat": 4, 364 | "nbformat_minor": 1 365 | } 366 | -------------------------------------------------------------------------------- /exercise_notebooks/utility_scripts/MapPortsForDocker.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | SETLOCAL ENABLEEXTENSIONS ENABLEDELAYEDEXPANSION 3 | ::%1 is mean first script parameter or switch 4 | if "%~1"=="" ( 5 | echo You need to pass a port as a parameter. 
Example %~n0 port 6 | pause 7 | goto :EOF 8 | ) 9 | 10 | set hostFilePath="c:\Windows\System32\drivers\etc\hosts" 11 | set ContainerPort=%1 12 | 13 | for /f "USEBACKQ" %%a in (`docker-machine ip`) do set DockerIP=%%a 14 | for /f "tokens=3 delims=: USEBACKQ" %%b in (`find /c "%DockerIP%" %hostFilePath%`) do ( 15 | if /I "%%b"==" 0" (echo %DockerIP% localhost >> %hostFilePath%) 16 | ) 17 | netsh interface portproxy add v4tov4 listenport=%ContainerPort% listenaddress=127.0.0.1 connectaddress=%DockerIP% connectport=%ContainerPort% 18 | netsh interface portproxy add v6tov4 listenport=%ContainerPort% listenaddress=::1 connectaddress=%DockerIP% connectport=%ContainerPort% 19 | netsh interface portproxy show v4tov4 20 | netsh interface portproxy show v6tov4 21 | ::"netsh interface portproxy show v4tov4" allows you to view current port redirection 22 | ::"netsh interface portproxy delete v4tov4 listenport=%ContainerPort% listenaddress=127.0.0.1" allows you to remove port redirection 23 | ping -n 10 127.0.0.1 > nul -------------------------------------------------------------------------------- /packages/gradient_boosting_model/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | include *.pkl 4 | recursive-include ./gradient_boosting_model/* 5 | 6 | include gradient_boosting_model/datasets/houseprice.csv 7 | include gradient_boosting_model/datasets/test.csv 8 | include gradient_boosting_model/trained_models/*.pkl 9 | include gradient_boosting_model/VERSION 10 | include gradient_boosting_model/config.yml 11 | 12 | include ./requirements.txt 13 | exclude *.log 14 | exclude *.cfg 15 | 16 | recursive-exclude * __pycache__ 17 | recursive-exclude * *.py[co] 18 | -------------------------------------------------------------------------------- /packages/gradient_boosting_model/gradient_boosting_model/VERSION: -------------------------------------------------------------------------------- 1 | 0.3.0 2 | 
"""Top-level package for the gradient boosting model.

Exposes ``__version__`` (read from the packaged VERSION file) and attaches
a NullHandler to the package logger, leaving handler configuration to the
consuming application.
"""
import logging

from gradient_boosting_model.config.core import config, PACKAGE_ROOT

# It is strongly advised that you do not add any handlers other than
# NullHandler to your library’s loggers. This is because the configuration
# of handlers is the prerogative of the application developer who uses your
# library. The application developer knows their target audience and what
# handlers are most appropriate for their application: if you add handlers
# ‘under the hood’, you might well interfere with their ability to carry out
# unit tests and deliver logs which suit their requirements.
# https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library
logging.getLogger(config.app_config.package_name).addHandler(logging.NullHandler())


# Single source of truth for the package version: the plain-text VERSION
# file shipped inside the package (presumably also read at build time by
# setup.py — that file is outside this view, so confirm before relying on it).
with open(PACKAGE_ROOT / "VERSION") as version_file:
    __version__ = version_file.read().strip()
10 | drop_features: YrSold 11 | 12 | pipeline_name: gb_regression 13 | pipeline_save_file: gb_regression_output_v 14 | 15 | # Variables 16 | # The variable we are attempting to predict (sale price) 17 | target: SalePrice 18 | 19 | # Will cause syntax errors since they begin with numbers 20 | variables_to_rename: 21 | 1stFlrSF: FirstFlrSF 22 | 2ndFlrSF: SecondFlrSF 23 | 3SsnPorch: ThreeSsnPortch 24 | 25 | features: 26 | - LotArea 27 | - OverallQual 28 | - YearRemodAdd 29 | - BsmtQual 30 | - BsmtFinSF1 31 | - TotalBsmtSF 32 | - FirstFlrSF 33 | - SecondFlrSF 34 | - GrLivArea 35 | - GarageCars 36 | # this one is only to calculate temporal variable: 37 | - YrSold 38 | 39 | numerical_vars: 40 | - LotArea 41 | - OverallQual 42 | - YearRemodAdd 43 | - BsmtQual 44 | - BsmtFinSF1 45 | - TotalBsmtSF 46 | - FirstFlrSF 47 | - SecondFlrSF 48 | - GrLivArea 49 | - GarageCars 50 | 51 | categorical_vars: 52 | - BsmtQual 53 | 54 | temporal_vars: YearRemodAdd 55 | 56 | # Validation 57 | # numerical variables with NA in train set 58 | numerical_vars_with_na: 59 | - LotFrontage 60 | 61 | numerical_na_not_allowed: 62 | - LotArea 63 | - OverallQual 64 | - YearRemodAdd 65 | - BsmtFinSF1 66 | - TotalBsmtSF 67 | - FirstFlrSF 68 | - SecondFlrSF 69 | - GrLivArea 70 | - GarageCars 71 | - YrSold 72 | 73 | # set train/test split 74 | test_size: 0.1 75 | 76 | # to set the random seed 77 | random_state: 0 78 | 79 | # The number of boosting stages to perform 80 | n_estimators: 50 81 | 82 | # the minimum frequency a label should have to be considered frequent 83 | # and not be removed. 
class AppConfig(BaseModel):
    """
    Application-level config.

    Parsed from config.yml; pydantic validates every field is a string.
    """

    package_name: str  # importable package name; used to scope the package logger
    pipeline_name: str  # human-readable name of the pipeline
    pipeline_save_file: str  # file-name prefix for persisted versioned .pkl pipelines
    training_data_file: str  # CSV file name resolved inside the datasets directory
    test_data_file: str  # CSV file name resolved inside the datasets directory
def fetch_config_from_yaml(cfg_path: Path = None) -> YAML:
    """Parse YAML containing the package configuration.

    Args:
        cfg_path: Optional explicit path to a YAML config file. When
            omitted, the packaged ``config.yml`` is located with
            ``find_config_file`` (which raises if it is missing).

    Returns:
        The parsed ``strictyaml`` YAML document.
    """
    if not cfg_path:
        # find_config_file either returns an existing Path or raises, so
        # cfg_path is always set (and truthy) beyond this point. The old
        # trailing ``raise OSError`` guarded by ``if cfg_path:`` was
        # therefore unreachable dead code and has been removed; a bad
        # explicit path still surfaces as an OSError from open().
        cfg_path = find_config_file()

    with open(cfg_path, "r") as conf_file:
        return load(conf_file.read())
def create_and_validate_config(parsed_config: YAML = None) -> Config:
    """Build the master Config object, validating all values via pydantic."""
    if parsed_config is None:
        parsed_config = fetch_config_from_yaml()

    # `.data` unwraps the strictyaml YAML type into plain Python values;
    # each sub-config picks out the keys it declares from the same mapping.
    config_data = parsed_config.data
    return Config(
        app_config=AppConfig(**config_data),
        model_config=ModelConfig(**config_data),
    )
# End-to-end regression pipeline: impute -> derive temporal feature ->
# encode categoricals -> drop helper columns -> gradient boosting model.
# All hyper-parameters are sourced from config.yml via the validated config.
price_pipe = Pipeline(
    [
        (
            "numerical_imputer",
            # fill numeric NAs with each column's most frequent value
            pp.SklearnTransformerWrapper(
                variables=config.model_config.numerical_vars,
                transformer=SimpleImputer(strategy="most_frequent"),
            ),
        ),
        (
            "categorical_imputer",
            # fill categorical NAs with the literal string "missing"
            pp.SklearnTransformerWrapper(
                variables=config.model_config.categorical_vars,
                transformer=SimpleImputer(strategy="constant", fill_value="missing"),
            ),
        ),
        (
            "temporal_variable",
            # replaces each temporal var with (reference_variable - var);
            # the reference is the drop_features column (YrSold in config.yml)
            pp.TemporalVariableEstimator(
                variables=config.model_config.temporal_vars,
                reference_variable=config.model_config.drop_features,
            ),
        ),
        (
            "rare_label_encoder",
            # merge category labels rarer than rare_label_tol (see config.yml)
            RareLabelEncoder(
                tol=config.model_config.rare_label_tol,
                n_categories=config.model_config.rare_label_n_categories,
                variables=config.model_config.categorical_vars,
            ),
        ),
        (
            "categorical_encoder",
            pp.SklearnTransformerWrapper(
                variables=config.model_config.categorical_vars,
                transformer=OrdinalEncoder(),
            ),
        ),
        (
            "drop_features",
            # YrSold was only needed to compute the temporal feature above
            pp.DropUnecessaryFeatures(
                variables_to_drop=config.model_config.drop_features,
            ),
        ),
        (
            "gb_model",
            GradientBoostingRegressor(
                loss=config.model_config.loss,
                random_state=config.model_config.random_state,
                n_estimators=config.model_config.n_estimators,
            ),
        ),
    ]
)
def make_prediction(*, input_data: t.Union[pd.DataFrame, dict],) -> dict:
    """Make a prediction using a saved model pipeline.

    Input is validated first; when validation fails, the error messages
    are returned and no prediction is attempted.
    """
    data = pd.DataFrame(input_data)
    validated_data, errors = validate_inputs(input_data=data)

    predictions = None
    if not errors:
        predictions = _price_pipe.predict(
            X=validated_data[config.model_config.features]
        )
        _logger.info(
            f"Making predictions with model version: {_version} "
            f"Predictions: {predictions}"
        )

    return {"predictions": predictions, "version": _version, "errors": errors}
def save_pipeline(*, pipeline_to_persist: Pipeline) -> None:
    """Persist the pipeline.

    The file name embeds the package version, and every other persisted
    pipeline is removed first — so the published package always contains
    exactly one trained model whose build provenance is unambiguous.
    """
    # Versioned artefact name, e.g. "<prefix><version>.pkl".
    file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"

    # Drop stale versions before writing the new artefact.
    remove_old_pipelines(files_to_keep=[file_name])
    joblib.dump(pipeline_to_persist, TRAINED_MODEL_DIR / file_name)
    _logger.info(f"saved pipeline: {file_name}")
class BaseError(Exception):
    """Root of the package exception hierarchy.

    Catch this to handle any error raised by package code.
    """


class InvalidModelInputError(BaseError):
    """Raised when data supplied to the model contains an error."""
class TemporalVariableEstimator(BaseEstimator, TransformerMixin):
    """Calculates the time difference between 2 temporal variables."""

    def __init__(self, variables=None, reference_variable=None):
        # Accept either a single column name or a list of them.
        self.variables = variables if isinstance(variables, list) else [variables]
        # NOTE: attribute name kept as-is (plural) to match the original
        # class exactly, even though the __init__ parameter is singular.
        self.reference_variables = reference_variable

    def fit(self, X, y=None):
        """No-op; present only to satisfy the scikit-learn pipeline API."""
        return self

    def transform(self, X):
        """Replace each variable with reference_variable minus that variable."""
        X = X.copy()
        for column in self.variables:
            X[column] = X[self.reference_variables] - X[column]

        return X
class HouseDataInputSchema(Schema):
    """Marshmallow schema for one house-price prediction record.

    ``allow_none=True`` marks fields that may legitimately arrive as
    NA/None in the raw data. The FirstFlrSF / SecondFlrSF /
    ThreeSsnPortch fields are the renamed forms of the digit-leading
    raw columns (1stFlrSF / 2ndFlrSF / 3SsnPorch); the rename is applied
    in ``validate_inputs`` before this schema loads the records.
    """

    Alley = fields.Str(allow_none=True)
    BedroomAbvGr = fields.Integer()
    BldgType = fields.Str()
    BsmtCond = fields.Str(allow_none=True)
    BsmtExposure = fields.Str(allow_none=True)
    BsmtFinSF1 = fields.Float(allow_none=True)
    BsmtFinSF2 = fields.Float(allow_none=True)
    BsmtFinType1 = fields.Str(allow_none=True)
    BsmtFinType2 = fields.Str(allow_none=True)
    BsmtFullBath = fields.Float(allow_none=True)
    BsmtHalfBath = fields.Float(allow_none=True)
    BsmtQual = fields.Str(allow_none=True)
    BsmtUnfSF = fields.Float()
    CentralAir = fields.Str()
    Condition1 = fields.Str()
    Condition2 = fields.Str()
    Electrical = fields.Str(allow_none=True)
    EnclosedPorch = fields.Integer()
    ExterCond = fields.Str()
    ExterQual = fields.Str()
    Exterior1st = fields.Str(allow_none=True)
    Exterior2nd = fields.Str(allow_none=True)
    Fence = fields.Str(allow_none=True)
    FireplaceQu = fields.Str(allow_none=True)
    Fireplaces = fields.Integer()
    Foundation = fields.Str()
    FullBath = fields.Integer()
    Functional = fields.Str(allow_none=True)
    GarageArea = fields.Float()
    GarageCars = fields.Float()
    GarageCond = fields.Str(allow_none=True)
    GarageFinish = fields.Str(allow_none=True)
    GarageQual = fields.Str(allow_none=True)
    GarageType = fields.Str(allow_none=True)
    GarageYrBlt = fields.Float(allow_none=True)
    GrLivArea = fields.Integer()
    HalfBath = fields.Integer()
    Heating = fields.Str()
    HeatingQC = fields.Str()
    HouseStyle = fields.Str()
    Id = fields.Integer()
    KitchenAbvGr = fields.Integer()
    KitchenQual = fields.Str(allow_none=True)
    LandContour = fields.Str()
    LandSlope = fields.Str()
    LotArea = fields.Integer()
    LotConfig = fields.Str()
    LotFrontage = fields.Float(allow_none=True)
    LotShape = fields.Str()
    LowQualFinSF = fields.Integer()
    MSSubClass = fields.Integer()
    MSZoning = fields.Str(allow_none=True)
    MasVnrArea = fields.Float(allow_none=True)
    MasVnrType = fields.Str(allow_none=True)
    MiscFeature = fields.Str(allow_none=True)
    MiscVal = fields.Integer()
    MoSold = fields.Integer()
    Neighborhood = fields.Str()
    OpenPorchSF = fields.Integer()
    OverallCond = fields.Integer()
    OverallQual = fields.Integer()
    PavedDrive = fields.Str()
    PoolArea = fields.Integer()
    PoolQC = fields.Str(allow_none=True)
    RoofMatl = fields.Str()
    RoofStyle = fields.Str()
    SaleCondition = fields.Str()
    SaleType = fields.Str(allow_none=True)
    ScreenPorch = fields.Integer()
    Street = fields.Str()
    TotRmsAbvGrd = fields.Integer()
    TotalBsmtSF = fields.Float()
    Utilities = fields.Str(allow_none=True)
    WoodDeckSF = fields.Integer()
    YearBuilt = fields.Integer()
    YearRemodAdd = fields.Integer()
    YrSold = fields.Integer()
    # renamed columns (see config.yml variables_to_rename)
    FirstFlrSF = fields.Integer()
    SecondFlrSF = fields.Integer()
    ThreeSsnPortch = fields.Integer()
97 | validated_data = validated_data.dropna( 98 | axis=0, subset=config.model_config.numerical_na_not_allowed 99 | ) 100 | 101 | return validated_data 102 | 103 | 104 | def validate_inputs( 105 | *, input_data: pd.DataFrame 106 | ) -> t.Tuple[pd.DataFrame, t.Optional[dict]]: 107 | """Check model inputs for unprocessable values.""" 108 | 109 | # convert syntax error field names (beginning with numbers) 110 | input_data.rename(columns=config.model_config.variables_to_rename, inplace=True) 111 | validated_data = drop_na_inputs(input_data=input_data) 112 | 113 | # set many=True to allow passing in a list 114 | schema = HouseDataInputSchema(many=True) 115 | errors = None 116 | 117 | try: 118 | # replace numpy nans so that Marshmallow can validate 119 | schema.load(validated_data.replace({np.nan: None}).to_dict(orient="records")) 120 | except ValidationError as exc: 121 | errors = exc.messages 122 | 123 | return validated_data, errors 124 | -------------------------------------------------------------------------------- /packages/gradient_boosting_model/gradient_boosting_model/train_pipeline.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split 2 | 3 | from gradient_boosting_model import pipeline 4 | from gradient_boosting_model.processing.data_management import ( 5 | load_dataset, 6 | save_pipeline, 7 | ) 8 | from gradient_boosting_model.config.core import config 9 | from gradient_boosting_model import __version__ as _version 10 | 11 | import logging 12 | 13 | 14 | _logger = logging.getLogger(__name__) 15 | 16 | 17 | def run_training() -> None: 18 | """Train the model.""" 19 | 20 | # read training data 21 | data = load_dataset(file_name=config.app_config.training_data_file) 22 | 23 | # divide train and test 24 | X_train, X_test, y_train, y_test = train_test_split( 25 | data[config.model_config.features], # predictors 26 | data[config.model_config.target], 27 | 
test_size=config.model_config.test_size, 28 | # we are setting the random seed here 29 | # for reproducibility 30 | random_state=config.model_config.random_state, 31 | ) 32 | 33 | pipeline.price_pipe.fit(X_train, y_train) 34 | 35 | _logger.warning(f"saving model version: {_version}") 36 | save_pipeline(pipeline_to_persist=pipeline.price_pipe) 37 | 38 | 39 | if __name__ == "__main__": 40 | run_training() 41 | -------------------------------------------------------------------------------- /packages/gradient_boosting_model/gradient_boosting_model/trained_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/testing-and-monitoring-ml-deployments/88d0fdac0898178f5ed9611e44a56ea5e53c9dd5/packages/gradient_boosting_model/gradient_boosting_model/trained_models/__init__.py -------------------------------------------------------------------------------- /packages/gradient_boosting_model/mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | warn_unused_ignores = True 3 | follow_imports = skip 4 | show_error_context = True 5 | warn_incomplete_stub = True 6 | ignore_missing_imports = True 7 | check_untyped_defs = True 8 | cache_dir = /dev/null 9 | warn_redundant_casts = True 10 | warn_unused_configs = True 11 | strict_optional = True 12 | -------------------------------------------------------------------------------- /packages/gradient_boosting_model/requirements.txt: -------------------------------------------------------------------------------- 1 | # ML requirements 2 | numpy>=1.20.0,<1.21.0 3 | pandas>=1.3.5,<1.4.0 4 | scikit-learn>=1.0.2,<1.1.0 5 | feature-engine>=1.0.2,<1.1.0 6 | joblib>=1.0.1,<1.1.0 7 | 8 | # config parsing 9 | strictyaml>=1.3.2,<1.4.0 10 | ruamel.yaml==0.16.12 11 | pydantic>=1.8.1,<1.9.0 12 | 13 | # validation 14 | marshmallow>=3.2.2,<4.0 15 | 16 | # packaging 17 | setuptools>=41.4.0,<42.0.0 18 | 
wheel>=0.33.6,<0.34.0 19 | -------------------------------------------------------------------------------- /packages/gradient_boosting_model/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from pathlib import Path 5 | 6 | from setuptools import find_packages, setup 7 | 8 | # Package meta-data. 9 | NAME = 'tid-gradient-boosting-model' 10 | DESCRIPTION = "Gradient boosting regression model from Train In Data." 11 | URL = "https://github.com/trainindata/testing-and-monitoring-ml-deployments" 12 | EMAIL = "christopher.samiullah@protonmail.com" 13 | AUTHOR = "ChristopherGS" 14 | REQUIRES_PYTHON = ">=3.6.0" 15 | 16 | 17 | # What packages are required for this module to be executed? 18 | def list_reqs(fname="requirements.txt"): 19 | with open(fname) as fd: 20 | return fd.read().splitlines() 21 | 22 | 23 | # The rest you shouldn't have to touch too much :) 24 | # ------------------------------------------------ 25 | # Except, perhaps the License and Trove Classifiers! 26 | # If you do change the License, remember to change the 27 | # Trove Classifier for that! 28 | long_description = DESCRIPTION 29 | 30 | # Load the package's VERSION file as a dictionary. 
31 | about = {} 32 | ROOT_DIR = Path(__file__).resolve().parent 33 | PACKAGE_DIR = ROOT_DIR / 'gradient_boosting_model' 34 | with open(PACKAGE_DIR / "VERSION") as f: 35 | _version = f.read().strip() 36 | about["__version__"] = _version 37 | 38 | 39 | # Where the magic happens: 40 | setup( 41 | name=NAME, 42 | version=about["__version__"], 43 | description=DESCRIPTION, 44 | long_description=long_description, 45 | long_description_content_type="text/markdown", 46 | author=AUTHOR, 47 | author_email=EMAIL, 48 | python_requires=REQUIRES_PYTHON, 49 | url=URL, 50 | packages=find_packages(exclude=("tests",)), 51 | package_data={"gradient_boosting_model": ["VERSION"]}, 52 | install_requires=list_reqs(), 53 | extras_require={}, 54 | include_package_data=True, 55 | license="BSD-3", 56 | classifiers=[ 57 | # Trove classifiers 58 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 59 | "License :: OSI Approved :: MIT License", 60 | "Programming Language :: Python", 61 | "Programming Language :: Python :: 3", 62 | "Programming Language :: Python :: 3.6", 63 | "Programming Language :: Python :: 3.7", 64 | "Programming Language :: Python :: 3.8", 65 | "Programming Language :: Python :: Implementation :: CPython", 66 | "Programming Language :: Python :: Implementation :: PyPy", 67 | ], 68 | ) 69 | -------------------------------------------------------------------------------- /packages/gradient_boosting_model/test_requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | 3 | # testing requirements 4 | pytest>=5.3.2,<6.0.0 5 | 6 | # old model for testing purposes 7 | # source code: https://github.com/trainindata/deploying-machine-learning-models/tree/master/packages/regression_model 8 | tid-regression-model==3.1.2 9 | 10 | # repo maintenance tooling 11 | black>=19.10b0,<20.0 12 | flake8>=3.7.9,<4.0 13 | mypy>=0.740 14 | 15 | # kaggle cli 16 | kaggle>=1.5.6,<1.6.0 17 | 
-------------------------------------------------------------------------------- /packages/gradient_boosting_model/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/testing-and-monitoring-ml-deployments/88d0fdac0898178f5ed9611e44a56ea5e53c9dd5/packages/gradient_boosting_model/tests/__init__.py -------------------------------------------------------------------------------- /packages/gradient_boosting_model/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sklearn.model_selection import train_test_split 3 | 4 | from gradient_boosting_model.config.core import config 5 | from gradient_boosting_model.processing.data_management import load_dataset 6 | 7 | 8 | @pytest.fixture(scope="session") 9 | def pipeline_inputs(): 10 | # For larger datasets, here we would use a testing sub-sample. 11 | data = load_dataset(file_name=config.app_config.training_data_file) 12 | 13 | # Divide train and test 14 | X_train, X_test, y_train, y_test = train_test_split( 15 | data[config.model_config.features], # predictors 16 | data[config.model_config.target], 17 | test_size=config.model_config.test_size, 18 | # we are setting the random seed here 19 | # for reproducibility 20 | random_state=config.model_config.random_state, 21 | ) 22 | 23 | return X_train, X_test, y_train, y_test 24 | 25 | 26 | @pytest.fixture() 27 | def raw_training_data(): 28 | # For larger datasets, here we would use a testing sub-sample. 
29 | return load_dataset(file_name=config.app_config.training_data_file) 30 | 31 | 32 | @pytest.fixture() 33 | def sample_input_data(): 34 | return load_dataset(file_name=config.app_config.test_data_file) 35 | -------------------------------------------------------------------------------- /packages/gradient_boosting_model/tests/test_config.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from gradient_boosting_model.config.core import ( 4 | create_and_validate_config, 5 | fetch_config_from_yaml, 6 | ) 7 | 8 | import pytest 9 | from pydantic import ValidationError 10 | 11 | 12 | TEST_CONFIG_TEXT = """ 13 | package_name: gradient_boosting_model 14 | training_data_file: houseprice.csv 15 | test_data_file: test.csv 16 | drop_features: YrSold 17 | pipeline_name: gb_regression 18 | pipeline_save_file: gb_regression_output_v 19 | target: SalePrice 20 | variables_to_rename: 21 | foo: bar 22 | test_size: 0.1 23 | features: 24 | - LotArea 25 | numerical_vars: 26 | - LotArea 27 | categorical_vars: 28 | - BsmtQual 29 | temporal_vars: YearRemodAdd 30 | numerical_vars_with_na: 31 | - LotFrontage 32 | numerical_na_not_allowed: 33 | - LotArea 34 | random_state: 0 35 | n_estimators: 50 36 | rare_label_tol: 0.01 37 | rare_label_n_categories: 5 38 | loss: ls 39 | allowed_loss_functions: 40 | - ls 41 | - huber 42 | """ 43 | 44 | INVALID_TEST_CONFIG_TEXT = """ 45 | package_name: gradient_boosting_model 46 | training_data_file: houseprice.csv 47 | test_data_file: test.csv 48 | drop_features: YrSold 49 | pipeline_name: gb_regression 50 | pipeline_save_file: gb_regression_output_v 51 | target: SalePrice 52 | features: 53 | - LotArea 54 | numerical_vars: 55 | - LotArea 56 | categorical_vars: 57 | - BsmtQual 58 | temporal_vars: YearRemodAdd 59 | numerical_vars_with_na: 60 | - LotFrontage 61 | numerical_na_not_allowed: 62 | - LotArea 63 | random_state: 0 64 | n_estimators: 50 65 | rare_label_tol: 0.01 66 | 
rare_label_n_categories: 5 67 | loss: ls 68 | allowed_loss_functions: 69 | - huber 70 | """ 71 | 72 | 73 | def test_fetch_config_structure(tmpdir): 74 | # Given 75 | # We make use of the pytest built-in tmpdir fixture 76 | configs_dir = Path(tmpdir) 77 | config_1 = configs_dir / "sample_config.yml" 78 | config_1.write_text(TEST_CONFIG_TEXT) 79 | parsed_config = fetch_config_from_yaml(cfg_path=config_1) 80 | 81 | # When 82 | config = create_and_validate_config(parsed_config=parsed_config) 83 | 84 | # Then 85 | assert config.model_config 86 | assert config.app_config 87 | 88 | 89 | def test_config_validation_raises_error_for_invalid_config(tmpdir): 90 | # Given 91 | # We make use of the pytest built-in tmpdir fixture 92 | configs_dir = Path(tmpdir) 93 | config_1 = configs_dir / "sample_config.yml" 94 | 95 | # invalid config attempts to set a prohibited loss 96 | # function which we validate against an allowed set of 97 | # loss function parameters. 98 | config_1.write_text(INVALID_TEST_CONFIG_TEXT) 99 | parsed_config = fetch_config_from_yaml(cfg_path=config_1) 100 | 101 | # When 102 | with pytest.raises(ValidationError) as excinfo: 103 | create_and_validate_config(parsed_config=parsed_config) 104 | 105 | # Then 106 | assert "not in the allowed set" in str(excinfo.value) 107 | 108 | 109 | def test_missing_config_field_raises_validation_error(tmpdir): 110 | # Given 111 | # We make use of the pytest built-in tmpdir fixture 112 | configs_dir = Path(tmpdir) 113 | config_1 = configs_dir / "sample_config.yml" 114 | TEST_CONFIG_TEXT = """package_name: gradient_boosting_model""" 115 | config_1.write_text(TEST_CONFIG_TEXT) 116 | parsed_config = fetch_config_from_yaml(cfg_path=config_1) 117 | 118 | # When 119 | with pytest.raises(ValidationError) as excinfo: 120 | create_and_validate_config(parsed_config=parsed_config) 121 | 122 | # Then 123 | assert "field required" in str(excinfo.value) 124 | assert "pipeline_name" in str(excinfo.value) 125 | 
-------------------------------------------------------------------------------- /packages/gradient_boosting_model/tests/test_pipeline.py: -------------------------------------------------------------------------------- 1 | from gradient_boosting_model import pipeline 2 | from gradient_boosting_model.config.core import config 3 | from gradient_boosting_model.processing.validation import validate_inputs 4 | 5 | 6 | def test_pipeline_drops_unnecessary_features(pipeline_inputs): 7 | # Given 8 | X_train, X_test, y_train, y_test = pipeline_inputs 9 | assert config.model_config.drop_features in X_train.columns 10 | pipeline.price_pipe.fit(X_train, y_train) 11 | 12 | # When 13 | # We access the transformed inputs with slicing 14 | transformed_inputs = pipeline.price_pipe[:-1].transform(X_train) 15 | 16 | # Then 17 | assert config.model_config.drop_features in X_train.columns 18 | assert config.model_config.drop_features not in transformed_inputs.columns 19 | 20 | 21 | def test_pipeline_transforms_temporal_features(pipeline_inputs): 22 | # Given 23 | X_train, X_test, y_train, y_test = pipeline_inputs 24 | 25 | # When 26 | # We access the transformed inputs with slicing 27 | transformed_inputs = pipeline.price_pipe[:-1].transform(X_train) 28 | 29 | # Then 30 | assert ( 31 | transformed_inputs.iloc[0]["YearRemodAdd"] 32 | == X_train.iloc[0]["YrSold"] - X_train.iloc[0]["YearRemodAdd"] 33 | ) 34 | 35 | 36 | def test_pipeline_predict_takes_validated_input(pipeline_inputs, sample_input_data): 37 | # Given 38 | X_train, X_test, y_train, y_test = pipeline_inputs 39 | pipeline.price_pipe.fit(X_train, y_train) 40 | 41 | # When 42 | validated_inputs, errors = validate_inputs(input_data=sample_input_data) 43 | predictions = pipeline.price_pipe.predict( 44 | validated_inputs[config.model_config.features] 45 | ) 46 | 47 | # Then 48 | assert predictions is not None 49 | assert errors is None 50 | -------------------------------------------------------------------------------- 
from gradient_boosting_model.predict import make_prediction
from gradient_boosting_model.config.core import config

from sklearn.metrics import mean_squared_error

from regression_model.predict import make_prediction as alt_make_prediction


def test_prediction_quality_against_benchmark(raw_training_data, sample_input_data):
    # Given
    predictors = raw_training_data.drop(config.model_config.target, axis=1)
    targets = raw_training_data[config.model_config.target]

    # Rough benchmark window (tune for your own model): anchor on the first
    # target value rounded to the nearest 10,000 (ndigits=-4), then allow a
    # generous +/- tolerance around it.
    tolerance = 50000
    anchor = round(targets.iloc[0], ndigits=-4)
    lower_bound = anchor - tolerance  # e.g. 210,000 - 50,000 = 160,000
    upper_bound = anchor + tolerance  # e.g. 210,000 + 50,000 = 260,000

    # When: predict on a single row
    result = make_prediction(input_data=predictors[0:1])

    # Then
    assert result is not None
    first_prediction = result.get("predictions")[0]
    assert isinstance(first_prediction, float)
    assert lower_bound < first_prediction < upper_bound


def test_prediction_quality_against_another_model(raw_training_data, sample_input_data):
    # Given
    predictors = raw_training_data.drop(config.model_config.target, axis=1)
    targets = raw_training_data[config.model_config.target]
    current = make_prediction(input_data=predictors)

    # the older model has these variable names reversed
    predictors.rename(
        columns={
            "FirstFlrSF": "1stFlrSF",
            "SecondFlrSF": "2ndFlrSF",
            "ThreeSsnPortch": "3SsnPorch",
        },
        inplace=True,
    )
    alternative = alt_make_prediction(input_data=predictors)

    # When
    current_mse = mean_squared_error(
        y_true=targets.values, y_pred=current["predictions"]
    )
    alternative_mse = mean_squared_error(
        y_true=targets.values, y_pred=alternative["predictions"]
    )

    # Then: the new model must beat the benchmark model
    assert current_mse < alternative_mse


from gradient_boosting_model.processing import preprocessors as pp


def test_drop_unnecessary_features_transformer(pipeline_inputs):
    # Given
    X_train, _, _, _ = pipeline_inputs
    assert config.model_config.drop_features in X_train.columns
    transformer = pp.DropUnecessaryFeatures(
        variables_to_drop=config.model_config.drop_features,
    )

    # When
    transformed = transformer.transform(X_train)

    # Then
    assert config.model_config.drop_features not in transformed.columns


def test_temporal_variable_estimator(pipeline_inputs):
    # Given
    X_train, _, _, _ = pipeline_inputs
    transformer = pp.TemporalVariableEstimator(
        variables=config.model_config.temporal_vars,
        reference_variable=config.model_config.drop_features,
    )

    # When
    transformed = transformer.transform(X_train)

    # Then: YearRemodAdd becomes YrSold - YearRemodAdd (elapsed years)
    assert transformed.iloc[0]["YearRemodAdd"] == (
        X_train.iloc[0]["YrSold"] - X_train.iloc[0]["YearRemodAdd"]
    )
gradient_boosting_model.processing.validation import validate_inputs 2 | 3 | 4 | def test_validate_inputs(sample_input_data): 5 | # When 6 | validated_inputs, errors = validate_inputs(input_data=sample_input_data) 7 | 8 | # Then 9 | assert not errors 10 | 11 | # we expect that 2 rows are removed due to missing vars 12 | # 1459 is the total number of rows in the test data set (test.csv) 13 | # and 1457 number returned after 2 rows are filtered out. 14 | assert len(sample_input_data) == 1459 15 | assert len(validated_inputs) == 1457 16 | 17 | 18 | def test_validate_inputs_identifies_errors(sample_input_data): 19 | # Given 20 | test_inputs = sample_input_data.copy() 21 | 22 | # introduce errors 23 | test_inputs.at[1, "BldgType"] = 50 # we expect a string 24 | 25 | # When 26 | validated_inputs, errors = validate_inputs(input_data=test_inputs) 27 | 28 | # Then 29 | assert errors 30 | assert errors[1] == {"BldgType": ["Not a valid string."]} 31 | -------------------------------------------------------------------------------- /packages/gradient_boosting_model/tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = unit_tests,typechecks,stylechecks 3 | skipsdist = True 4 | 5 | 6 | [testenv] 7 | install_command = pip install {opts} {packages} 8 | deps = 9 | -rtest_requirements.txt 10 | 11 | passenv = 12 | KAGGLE_USERNAME 13 | KAGGLE_KEY 14 | 15 | setenv = 16 | PYTHONPATH=. 
17 | 18 | commands= 19 | kaggle competitions download -c house-prices-advanced-regression-techniques -p gradient_boosting_model/datasets/ 20 | unzip -o gradient_boosting_model/datasets/house-prices-advanced-regression-techniques.zip -d gradient_boosting_model/datasets 21 | mv gradient_boosting_model/datasets/train.csv gradient_boosting_model/datasets/houseprice.csv 22 | python gradient_boosting_model/train_pipeline.py 23 | pytest \ 24 | -s \ 25 | -vv \ 26 | {posargs:tests/} 27 | 28 | 29 | [testenv:unit_tests] 30 | envdir = {toxworkdir}/unit_tests 31 | deps = 32 | {[testenv]deps} 33 | 34 | setenv = 35 | PYTHONPATH=. 36 | 37 | commands = 38 | python gradient_boosting_model/train_pipeline.py 39 | pytest \ 40 | -s \ 41 | -vv \ 42 | {posargs:tests/} 43 | 44 | [testenv:train] 45 | envdir = {toxworkdir}/train 46 | deps = 47 | {[testenv]deps} 48 | 49 | setenv = 50 | PYTHONPATH=. 51 | 52 | commands = 53 | python gradient_boosting_model/train_pipeline.py 54 | 55 | [testenv:typechecks] 56 | envdir = {toxworkdir}/unit_tests 57 | 58 | deps = 59 | {[testenv:unit_tests]deps} 60 | 61 | commands = {posargs:mypy gradient_boosting_model} 62 | 63 | 64 | [testenv:stylechecks] 65 | envdir = {toxworkdir}/unit_tests 66 | 67 | deps = 68 | {[testenv:unit_tests]deps} 69 | 70 | commands = {posargs:flake8 gradient_boosting_model tests} 71 | 72 | 73 | [flake8] 74 | exclude = .git,env 75 | max-line-length = 90 76 | -------------------------------------------------------------------------------- /packages/ml_api/.dockerignore: -------------------------------------------------------------------------------- 1 | exercise_notebooks/* 2 | *env* 3 | *venv* 4 | .circleci* 5 | packages/gradient_boosting_model 6 | *.env 7 | *.log 8 | .git 9 | .gitignore 10 | .dockerignore 11 | *.mypy_cache 12 | *.pytest_cache 13 | *.tox 14 | 15 | # alembic 16 | !alembic/env.py 17 | 18 | # Byte-compiled / optimized / DLL files 19 | *__pycache__* 20 | *.py[cod] 
-------------------------------------------------------------------------------- /packages/ml_api/Makefile: -------------------------------------------------------------------------------- 1 | # For details on Makefiles, see the section notes. 2 | NAME=ml_api 3 | VERSION=$(shell git rev-parse HEAD) 4 | REPO=UPDATEME 5 | PASSWORD=UPDATEME 6 | 7 | # Specify phony list to ensure make recipes do not conflict with real file names 8 | .PHONY: run-service-development tag-push-master tag-push-local db-migrations 9 | 10 | 11 | tag-push-local: 12 | @echo "+ $@" 13 | docker login --username $(REPO) --password $(PASSWORD) 14 | env TARGET=$(VERSION) docker-compose -f docker/docker-compose-ci-candidate.yml build 15 | docker push $(REPO)/$(NAME):$(VERSION) 16 | 17 | tag-push-master: 18 | @echo "+ $@" 19 | docker login --username $(REPO) --password $(PASSWORD) 20 | env TARGET=master docker-compose -f docker/docker-compose-ci-master.yml build 21 | docker push $(REPO)/$(NAME):master 22 | 23 | # start up Flask API service 24 | run-service-development: 25 | @echo "+ $@" 26 | python run.py 27 | 28 | run-service-wsgi: 29 | @echo "+ $@" 30 | gunicorn --bind 0.0.0.0:5000 \ 31 | --workers=1 \ 32 | --log-config gunicorn_logging.conf \ 33 | --log-level=DEBUG \ 34 | --access-logfile=- \ 35 | --error-logfile=- \ 36 | run:application 37 | 38 | db-migrations: 39 | @echo "+ $@" 40 | PYTHONPATH=. 
alembic -c alembic.ini upgrade head 41 | -------------------------------------------------------------------------------- /packages/ml_api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/testing-and-monitoring-ml-deployments/88d0fdac0898178f5ed9611e44a56ea5e53c9dd5/packages/ml_api/__init__.py -------------------------------------------------------------------------------- /packages/ml_api/alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = alembic 6 | 7 | # timezone to use when rendering the date 8 | # within the migration file as well as the filename. 9 | # string value is passed to dateutil.tz.gettz() 10 | # leave blank for localtime 11 | timezone = UTC 12 | 13 | sqlalchemy.url = VALUE_IS_SET_AT_RUNTIME 14 | 15 | 16 | # Logging configuration 17 | [loggers] 18 | keys = root,sqlalchemy,alembic 19 | 20 | [handlers] 21 | keys = console 22 | 23 | [formatters] 24 | keys = generic 25 | 26 | [logger_root] 27 | level = WARN 28 | handlers = console 29 | qualname = 30 | 31 | [logger_sqlalchemy] 32 | level = WARN 33 | handlers = 34 | qualname = sqlalchemy.engine 35 | 36 | [logger_alembic] 37 | level = INFO 38 | handlers = 39 | qualname = alembic 40 | 41 | [handler_console] 42 | class = StreamHandler 43 | args = (sys.stderr,) 44 | level = NOTSET 45 | formatter = generic 46 | 47 | [formatter_generic] 48 | format = %(levelname)-5.5s [%(name)s] %(message)s 49 | datefmt = %H:%M:%S 50 | -------------------------------------------------------------------------------- /packages/ml_api/alembic/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from alembic import context 4 | from sqlalchemy import engine_from_config, pool 5 | 6 | # Import the models so the changes in them are 
automatically reflected in the 7 | # generated migrations. 8 | from api.persistence import models # noqa 9 | from api.config import DevelopmentConfig as user_config 10 | from api.persistence.core import Base 11 | 12 | # this is the Alembic Config object, which provides 13 | # access to the values within the .ini file in use. 14 | config = context.config 15 | database_url = os.environ.get("ALEMBIC_DB_URI", user_config.SQLALCHEMY_DATABASE_URI) 16 | config.set_main_option("sqlalchemy.url", database_url) 17 | 18 | # add your model's MetaData object here 19 | # for 'autogenerate' support 20 | target_metadata = Base.metadata 21 | 22 | 23 | def run_migrations_offline(): 24 | """Run migrations in 'offline' mode. 25 | This configures the context with just a URL 26 | and not a user_ratings, though a user_ratings is acceptable 27 | here as well. By skipping the user_ratings creation 28 | we don't even need a DBAPI to be available. 29 | Calls to context.execute() here emit the given string to the 30 | script output. 31 | """ 32 | url = config.get_main_option("sqlalchemy.url") 33 | context.configure( 34 | url=url, target_metadata=target_metadata, literal_binds=True, 35 | ) 36 | 37 | with context.begin_transaction(): 38 | context.run_migrations() 39 | 40 | 41 | def run_migrations_online(): 42 | """Run migrations in 'online' mode. 43 | In this scenario we need to create a user_ratings 44 | and associate a connection with the context. 
45 | """ 46 | alembic_config = config.get_section(config.config_ini_section) 47 | connectable = engine_from_config( 48 | alembic_config, prefix="sqlalchemy.", poolclass=pool.NullPool, 49 | ) 50 | 51 | with connectable.connect() as connection: 52 | context.configure( 53 | connection=connection, target_metadata=target_metadata, 54 | ) 55 | 56 | with context.begin_transaction(): 57 | context.run_migrations() 58 | 59 | 60 | if context.is_offline_mode(): 61 | run_migrations_offline() 62 | else: 63 | run_migrations_online() 64 | -------------------------------------------------------------------------------- /packages/ml_api/alembic/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade(): 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade(): 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /packages/ml_api/alembic/versions/cf4abb13368d_create_prediction_tables.py: -------------------------------------------------------------------------------- 1 | """create prediction tables 2 | 3 | Revision ID: cf4abb13368d 4 | Revises: 5 | Create Date: 2019-12-15 14:54:07.857500+00:00 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | from sqlalchemy.dialects import postgresql 11 | 12 | # revision identifiers, used by Alembic. 
# revision identifiers, used by Alembic.
revision = "cf4abb13368d"
down_revision = None
branch_labels = None
depends_on = None

# The two prediction tables share an identical layout, so both directions
# of the migration are driven from this single ordered list of names.
PREDICTION_TABLES = (
    "gradient_boosting_model_predictions",
    "regression_model_predictions",
)


def _prediction_table_columns():
    """Column definitions shared by both prediction tables."""
    return [
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("user_id", sa.String(length=36), nullable=False),
        sa.Column(
            "datetime_captured",
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),
            nullable=True,
        ),
        sa.Column("model_version", sa.String(length=36), nullable=False),
        sa.Column("inputs", postgresql.JSONB(astext_type=sa.Text()), nullable=True),
        sa.Column("outputs", postgresql.JSONB(astext_type=sa.Text()), nullable=True),
    ]


def upgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    # create each prediction table plus an index on its capture timestamp
    for table_name in PREDICTION_TABLES:
        op.create_table(
            table_name,
            *_prediction_table_columns(),
            sa.PrimaryKeyConstraint("id"),
        )
        op.create_index(
            op.f("ix_{}_datetime_captured".format(table_name)),
            table_name,
            ["datetime_captured"],
            unique=False,
        )
    # ### end Alembic commands ###


def downgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    # drop in reverse creation order: index first, then its table
    for table_name in reversed(PREDICTION_TABLES):
        op.drop_index(
            op.f("ix_{}_datetime_captured".format(table_name)),
            table_name=table_name,
        )
        op.drop_table(table_name)
    # ### end Alembic commands ###


import logging

import connexion
from sqlalchemy.orm import scoped_session

from api.config import Config
from api.monitoring.middleware import setup_metrics
from api.persistence.core import init_database

_logger = logging.getLogger('mlapi')


def create_app(
    *, config_object: Config, db_session: scoped_session = None
) -> connexion.App:
    """Create and fully wire up a connexion application instance.

    :param config_object: configuration class applied to the Flask app.
    :param db_session: optional pre-built session (used by tests).
    :return: the configured connexion app (wraps the Flask app).
    """
    connexion_app = connexion.App(
        __name__, debug=config_object.DEBUG, specification_dir="spec/"
    )
    # connexion wraps a plain Flask application; configure that inner app
    flask_app = connexion_app.app
    flask_app.config.from_object(config_object)

    # database, prometheus monitoring, then the OpenAPI routes
    init_database(flask_app, config=config_object, db_session=db_session)
    setup_metrics(flask_app)
    connexion_app.add_api("api.yaml")

    _logger.info("Application instance created")
    return connexion_app
-------------------------------------------------------------------------------- /packages/ml_api/api/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pathlib 4 | import sys 5 | from logging.config import fileConfig 6 | 7 | import api 8 | 9 | # logging format 10 | FORMATTER = logging.Formatter( 11 | "%(asctime)s — %(name)s — %(levelname)s —" "%(funcName)s:%(lineno)d — %(message)s" 12 | ) 13 | 14 | # Project Directories 15 | ROOT = pathlib.Path(api.__file__).resolve().parent.parent 16 | 17 | APP_NAME = 'ml_api' 18 | 19 | 20 | class Config: 21 | DEBUG = False 22 | TESTING = False 23 | ENV = os.getenv("FLASK_ENV", "production") 24 | SERVER_PORT = int(os.getenv("SERVER_PORT", 5000)) 25 | SERVER_HOST = os.getenv("SERVER_HOST", "0.0.0.0") 26 | LOGGING_LEVEL = os.getenv("LOGGING_LEVEL", logging.INFO) 27 | SHADOW_MODE_ACTIVE = os.getenv('SHADOW_MODE_ACTIVE', True) 28 | SQLALCHEMY_DATABASE_URI = ( 29 | f"postgresql+psycopg2://{os.getenv('DB_USER')}:" 30 | f"{os.getenv('DB_PASSWORD')}@{os.getenv('DB_HOST')}/{os.getenv('DB_NAME')}" 31 | ) 32 | # DB config matches docker container 33 | DB_USER = os.getenv("DB_USER", "user") 34 | DB_PASSWORD = os.getenv("DB_PASSWORD", "password") 35 | DB_PORT = os.getenv("DB_PORT", 6609) 36 | DB_HOST = os.getenv("DB_HOST", "0.0.0.0") 37 | DB_NAME = os.getenv("DB_NAME", "ml_api_dev") 38 | 39 | 40 | class DevelopmentConfig(Config): 41 | DEBUG = True 42 | ENV = "development" # do not use in production! 
43 | LOGGING_LEVEL = logging.DEBUG 44 | 45 | 46 | class TestingConfig(Config): 47 | DEBUG = True 48 | TESTING = True 49 | LOGGING_LEVEL = logging.DEBUG 50 | 51 | # DB config matches test docker container 52 | DB_USER = os.getenv("DB_USER", "test_user") 53 | DB_PASSWORD = os.getenv("DB_PASSWORD", "password") 54 | DB_PORT = os.getenv("DB_PORT", 6608) 55 | DB_HOST = os.getenv("DB_HOST", "0.0.0.0") 56 | DB_NAME = "ml_api_test" 57 | SQLALCHEMY_DATABASE_URI = ( 58 | f"postgresql+psycopg2://{DB_USER}:" 59 | f"{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}" 60 | ) 61 | 62 | 63 | class ProductionConfig(Config): 64 | DB_USER = os.getenv("DB_USER", "user") 65 | DB_PASSWORD = os.getenv("DB_PASSWORD", "password") 66 | DB_PORT = os.getenv("DB_PORT", 6609) 67 | DB_HOST = os.getenv("DB_HOST", "database") 68 | DB_NAME = os.getenv("DB_NAME", "ml_api") 69 | SQLALCHEMY_DATABASE_URI = ( 70 | f"postgresql+psycopg2://{DB_USER}:" 71 | f"{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}" 72 | ) 73 | 74 | 75 | def get_console_handler(): 76 | """Setup console logging handler.""" 77 | console_handler = logging.StreamHandler(sys.stdout) 78 | console_handler.setFormatter(FORMATTER) 79 | return console_handler 80 | 81 | 82 | def setup_app_logging(config: Config) -> None: 83 | """Prepare custom logging for our application.""" 84 | _disable_irrelevant_loggers() 85 | fileConfig(ROOT / 'gunicorn_logging.conf') 86 | logger = logging.getLogger('mlapi') 87 | logger.setLevel(config.LOGGING_LEVEL) 88 | 89 | 90 | def _disable_irrelevant_loggers() -> None: 91 | """Disable loggers created by packages which create a lot of noise.""" 92 | for logger_name in ( 93 | "connexion.apis.flask_api", 94 | "connexion.apis.abstract", 95 | "connexion.decorators", 96 | "connexion.operation", 97 | "connexion.operations", 98 | "connexion.app", 99 | "openapi_spec_validator", 100 | ): 101 | logging.getLogger(logger_name).level = logging.WARNING 102 | -------------------------------------------------------------------------------- 
/packages/ml_api/api/controller.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import threading 4 | 5 | from flask import request, jsonify, Response, current_app 6 | from prometheus_client import Histogram, Gauge, Info 7 | from regression_model import __version__ as live_version 8 | 9 | from api.config import APP_NAME 10 | from api.persistence.data_access import PredictionPersistence, ModelType 11 | from gradient_boosting_model import __version__ as shadow_version 12 | from gradient_boosting_model.predict import make_prediction 13 | 14 | _logger = logging.getLogger('mlapi') 15 | 16 | 17 | PREDICTION_TRACKER = Histogram( 18 | name='house_price_prediction_dollars', 19 | documentation='ML Model Prediction on House Price', 20 | labelnames=['app_name', 'model_name', 'model_version'] 21 | ) 22 | 23 | PREDICTION_GAUGE = Gauge( 24 | name='house_price_gauge_dollars', 25 | documentation='ML Model Prediction on House Price for min max calcs', 26 | labelnames=['app_name', 'model_name', 'model_version'] 27 | ) 28 | 29 | PREDICTION_GAUGE.labels( 30 | app_name=APP_NAME, 31 | model_name=ModelType.LASSO.name, 32 | model_version=live_version) 33 | 34 | MODEL_VERSIONS = Info( 35 | 'model_version_details', 36 | 'Capture model version information', 37 | ) 38 | 39 | MODEL_VERSIONS.info({ 40 | 'live_model': ModelType.LASSO.name, 41 | 'live_version': live_version, 42 | 'shadow_model': ModelType.GRADIENT_BOOSTING.name, 43 | 'shadow_version': shadow_version}) 44 | 45 | 46 | def health(): 47 | if request.method == "GET": 48 | status = {"status": "ok"} 49 | _logger.debug(status) 50 | return jsonify(status) 51 | 52 | 53 | def predict(): 54 | if request.method == "POST": 55 | # Step 1: Extract POST data from request body as JSON 56 | json_data = request.get_json() 57 | for entry in json_data: 58 | _logger.info(entry) 59 | 60 | # Step 2a: Get and save live model predictions 61 | persistence = 
PredictionPersistence(db_session=current_app.db_session) 62 | result = persistence.make_save_predictions( 63 | db_model=ModelType.LASSO, input_data=json_data 64 | ) 65 | 66 | # Step 2b: Get and save shadow predictions asynchronously 67 | if current_app.config.get("SHADOW_MODE_ACTIVE"): 68 | _logger.debug( 69 | f"Calling shadow model asynchronously: " 70 | f"{ModelType.GRADIENT_BOOSTING.value}" 71 | ) 72 | thread = threading.Thread( 73 | target=persistence.make_save_predictions, 74 | kwargs={ 75 | "db_model": ModelType.GRADIENT_BOOSTING, 76 | "input_data": json_data, 77 | }, 78 | ) 79 | thread.start() 80 | 81 | # Step 3: Handle errors 82 | if result.errors: 83 | _logger.warning(f"errors during prediction: {result.errors}") 84 | return Response(json.dumps(result.errors), status=400) 85 | 86 | # Step 4: Monitoring 87 | for _prediction in result.predictions: 88 | PREDICTION_TRACKER.labels( 89 | app_name=APP_NAME, 90 | model_name=ModelType.LASSO.name, 91 | model_version=live_version).observe(_prediction) 92 | PREDICTION_GAUGE.labels( 93 | app_name=APP_NAME, 94 | model_name=ModelType.LASSO.name, 95 | model_version=live_version).set(_prediction) 96 | _logger.info( 97 | f'Prediction results for model: {ModelType.LASSO.name} ' 98 | f'version: {result.model_version} ' 99 | f'Output values: {result.predictions}') 100 | 101 | # Step 5: Prepare prediction response 102 | return jsonify( 103 | { 104 | "predictions": result.predictions, 105 | "version": result.model_version, 106 | "errors": result.errors, 107 | } 108 | ) 109 | 110 | 111 | def predict_previous(): 112 | if request.method == "POST": 113 | # Step 1: Extract POST data from request body as JSON 114 | json_data = request.get_json() 115 | 116 | # Step 2: Access the model prediction function (also validates data) 117 | result = make_prediction(input_data=json_data) 118 | 119 | # Step 3: Handle errors 120 | errors = result.get("errors") 121 | if errors: 122 | return Response(json.dumps(errors), status=400) 123 | 124 | # 
Step 4: Split out results 125 | # Regression model interface has changed 126 | # so no need to call tolist 127 | predictions = result.get("predictions") 128 | version = result.get("version") 129 | 130 | # Step 5: Save predictions 131 | persistence = PredictionPersistence(db_session=current_app.db_session) 132 | persistence.save_predictions( 133 | inputs=json_data, 134 | model_version=version, 135 | predictions=predictions, 136 | db_model=ModelType.GRADIENT_BOOSTING, 137 | ) 138 | 139 | # Step 6: Prepare prediction response 140 | return jsonify( 141 | {"predictions": predictions, "version": version, "errors": errors} 142 | ) 143 | -------------------------------------------------------------------------------- /packages/ml_api/api/monitoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/testing-and-monitoring-ml-deployments/88d0fdac0898178f5ed9611e44a56ea5e53c9dd5/packages/ml_api/api/monitoring/__init__.py -------------------------------------------------------------------------------- /packages/ml_api/api/monitoring/middleware.py: -------------------------------------------------------------------------------- 1 | from flask import request, Flask 2 | from flask.wrappers import Response 3 | from prometheus_client import Counter, Histogram 4 | import time 5 | 6 | from api.config import APP_NAME 7 | 8 | 9 | # Counter and Histogram are examples of default metrics 10 | # available from the prometheus Python client. 
# Total request count, labelled by method/endpoint/status for rate queries.
REQUEST_COUNT = Counter(
    name='http_request_count',
    documentation='App Request Count',
    labelnames=['app_name', 'method', 'endpoint', 'http_status']
)
# Per-endpoint request latency distribution.
REQUEST_LATENCY = Histogram(
    name='http_request_latency_seconds',
    documentation='Request latency',
    labelnames=['app_name', 'endpoint']
)


def start_timer() -> None:
    """Get start time of a request."""
    # Stash the start time on the per-request flask `request` object so
    # stop_timer can compute the elapsed time after the response is built.
    request._prometheus_metrics_request_start_time = time.time()


def stop_timer(response: Response) -> Response:
    """Get stop time of a request.."""
    request_latency = time.time() - request._prometheus_metrics_request_start_time
    REQUEST_LATENCY.labels(
        app_name=APP_NAME,
        endpoint=request.path).observe(request_latency)
    return response


def record_request_data(response: Response) -> Response:
    """Capture request data.

    Uses the flask request object to extract information such as
    the HTTP request method, endpoint and HTTP status.
    """
    REQUEST_COUNT.labels(
        app_name=APP_NAME,
        method=request.method,
        endpoint=request.path,
        http_status=response.status_code).inc()
    return response


def setup_metrics(app: Flask) -> None:
    """Setup Prometheus metrics.

    This function uses the flask before_request
    and after_request hooks to capture metrics
    with each HTTP request to the application.
    """
    app.before_request(start_timer)
    app.after_request(record_request_data)
    app.after_request(stop_timer)
--------------------------------------------------------------------------------
/packages/ml_api/api/persistence/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/trainindata/testing-and-monitoring-ml-deployments/88d0fdac0898178f5ed9611e44a56ea5e53c9dd5/packages/ml_api/api/persistence/__init__.py
--------------------------------------------------------------------------------
/packages/ml_api/api/persistence/core.py:
--------------------------------------------------------------------------------
import logging
import os

import alembic.config
from flask import Flask
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy_utils import database_exists, create_database

from api.config import Config, ROOT

_logger = logging.getLogger('mlapi')

# Base class for SQLAlchemy models
Base = declarative_base()


def create_db_engine_from_config(*, config: Config) -> Engine:
    """The Engine is the starting point for any SQLAlchemy application.

    It’s “home base” for the actual database and its DBAPI, delivered to the SQLAlchemy
    application through a connection pool and a Dialect, which describes how to talk to
    a specific kind of database / DBAPI combination.
    """

    db_url = config.SQLALCHEMY_DATABASE_URI
    # Create the database itself on first run (e.g. a fresh container).
    if not database_exists(db_url):
        create_database(db_url)
    engine = create_engine(db_url)

    # NOTE(review): this logs the full URI including credentials — consider
    # redacting the password outside local development.
    _logger.info(f"creating DB conn with URI: {db_url}")
    return engine


def create_db_session(*, engine: Engine) -> scoped_session:
    """Broadly speaking, the Session establishes all conversations with the database.

    It represents a “holding zone” for all the objects which you’ve loaded or
    associated with it during its lifespan.
    """
    return scoped_session(sessionmaker(autocommit=False, autoflush=False, bind=engine))


def init_database(app: Flask, config: Config, db_session=None) -> None:
    """Connect to the database and attach DB session to the app."""

    # Tests inject a ready-made session; otherwise build engine + session
    # from the config.
    if not db_session:
        engine = create_db_engine_from_config(config=config)
        db_session = create_db_session(engine=engine)

    app.db_session = db_session

    # Return connections to the pool at the end of each app context.
    @app.teardown_appcontext
    def shutdown_session(exception=None):
        db_session.remove()


def run_migrations():
    """Run the DB migrations prior to the tests."""

    # alembic looks for the migrations in the current
    # directory so we change to the correct directory.
65 | os.chdir(str(ROOT)) 66 | alembicArgs = ["--raiseerr", "upgrade", "head"] 67 | alembic.config.main(argv=alembicArgs) 68 | -------------------------------------------------------------------------------- /packages/ml_api/api/persistence/data_access.py: -------------------------------------------------------------------------------- 1 | import enum 2 | import json 3 | import logging 4 | import typing as t 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from regression_model.predict import make_prediction as make_live_prediction 9 | from sqlalchemy.orm.session import Session 10 | 11 | from api.persistence.models import ( 12 | LassoModelPredictions, 13 | GradientBoostingModelPredictions, 14 | ) 15 | from gradient_boosting_model.predict import make_prediction as make_shadow_prediction 16 | 17 | _logger = logging.getLogger('mlapi') 18 | 19 | 20 | SECONDARY_VARIABLES_TO_RENAME = { 21 | "FirstFlrSF": "1stFlrSF", 22 | "SecondFlrSF": "2ndFlrSF", 23 | "ThreeSsnPortch": "3SsnPorch", 24 | } 25 | 26 | 27 | class ModelType(enum.Enum): 28 | LASSO = "lasso" 29 | GRADIENT_BOOSTING = "gradient_boosting" 30 | 31 | 32 | class PredictionResult(t.NamedTuple): 33 | errors: t.Any 34 | predictions: np.array 35 | model_version: str 36 | 37 | 38 | MODEL_PREDICTION_MAP = { 39 | ModelType.GRADIENT_BOOSTING: make_shadow_prediction, 40 | ModelType.LASSO: make_live_prediction, 41 | } 42 | 43 | 44 | class PredictionPersistence: 45 | def __init__(self, *, db_session: Session, user_id: str = None) -> None: 46 | self.db_session = db_session 47 | if not user_id: 48 | # in reality, here we would use something like a UUID for anonymous users 49 | # and if we had user logins, we would record the user ID. 
50 | self.user_id = "007" 51 | 52 | def make_save_predictions( 53 | self, *, db_model: ModelType, input_data: t.List 54 | ) -> PredictionResult: 55 | """Get the prediction from a given model and persist it.""" 56 | # Access the model prediction function via mapping 57 | if db_model == ModelType.LASSO: 58 | # we have to rename a few of the columns for backwards 59 | # compatibility with the regression model package. 60 | live_frame = pd.DataFrame(input_data) 61 | input_data = live_frame.rename( 62 | columns=SECONDARY_VARIABLES_TO_RENAME 63 | ).to_dict(orient="records") 64 | 65 | result = MODEL_PREDICTION_MAP[db_model](input_data=input_data) 66 | errors = None 67 | try: 68 | errors = result["errors"] 69 | except KeyError: 70 | # regression model `make_prediction` does not include errors 71 | pass 72 | 73 | prediction_result = PredictionResult( 74 | errors=errors, 75 | predictions=result.get("predictions").tolist() if not errors else None, 76 | model_version=result.get("version"), 77 | ) 78 | 79 | if prediction_result.errors: 80 | return prediction_result 81 | 82 | self.save_predictions( 83 | inputs=input_data, prediction_result=prediction_result, db_model=db_model 84 | ) 85 | 86 | return prediction_result 87 | 88 | def save_predictions( 89 | self, 90 | *, 91 | inputs: t.List, 92 | prediction_result: PredictionResult, 93 | db_model: ModelType, 94 | ) -> None: 95 | """Persist model predictions to storage.""" 96 | if db_model == db_model.LASSO: 97 | prediction_data = LassoModelPredictions( 98 | user_id=self.user_id, 99 | model_version=prediction_result.model_version, 100 | inputs=json.dumps(inputs), 101 | outputs=json.dumps(prediction_result.predictions), 102 | ) 103 | else: 104 | prediction_data = GradientBoostingModelPredictions( 105 | user_id=self.user_id, 106 | model_version=prediction_result.model_version, 107 | inputs=json.dumps(inputs), 108 | outputs=json.dumps(prediction_result.predictions), 109 | ) 110 | 111 | self.db_session.add(prediction_data) 112 | 
self.db_session.commit() 113 | _logger.debug(f"saved data for model: {db_model}") 114 | -------------------------------------------------------------------------------- /packages/ml_api/api/persistence/models.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, String, DateTime, Integer 2 | from sqlalchemy.dialects.postgresql import JSONB 3 | from sqlalchemy.sql import func 4 | 5 | from api.persistence.core import Base 6 | 7 | 8 | class LassoModelPredictions(Base): 9 | __tablename__ = "regression_model_predictions" 10 | id = Column(Integer, primary_key=True) 11 | user_id = Column(String(36), nullable=False) 12 | datetime_captured = Column( 13 | DateTime(timezone=True), server_default=func.now(), index=True 14 | ) 15 | model_version = Column(String(36), nullable=False) 16 | inputs = Column(JSONB) 17 | outputs = Column(JSONB) 18 | 19 | 20 | class GradientBoostingModelPredictions(Base): 21 | __tablename__ = "gradient_boosting_model_predictions" 22 | id = Column(Integer, primary_key=True) 23 | user_id = Column(String(36), nullable=False) 24 | datetime_captured = Column( 25 | DateTime(timezone=True), server_default=func.now(), index=True 26 | ) 27 | model_version = Column(String(36), nullable=False) 28 | inputs = Column(JSONB) 29 | outputs = Column(JSONB) 30 | -------------------------------------------------------------------------------- /packages/ml_api/api/spec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/testing-and-monitoring-ml-deployments/88d0fdac0898178f5ed9611e44a56ea5e53c9dd5/packages/ml_api/api/spec/__init__.py -------------------------------------------------------------------------------- /packages/ml_api/api/spec/api.yaml: -------------------------------------------------------------------------------- 1 | openapi: 3.0.0 2 | 3 | info: 4 | title: Spec for House Price Prediction API 5 | version: 
'1' 6 | 7 | servers: 8 | - url: http://{base}:5000/ 9 | description: API for performing house price predictions. 10 | variables: 11 | base: 12 | default: 0.0.0.0 13 | 14 | paths: 15 | /: 16 | get: 17 | operationId: api.controller.health 18 | responses: 19 | '200': 20 | description: API Health Status 21 | 22 | /v1/predictions/regression: 23 | post: 24 | operationId: api.controller.predict 25 | requestBody: 26 | description: House details used to make price prediction 27 | required: true 28 | content: 29 | application/json: 30 | schema: 31 | type: array 32 | items: 33 | $ref: '#/components/schemas/HouseDetails' 34 | responses: 35 | '200': 36 | description: House Price Predictions 37 | '400': 38 | description: Bad request, house data validation failed 39 | '5XX': 40 | description: Unexpected error 41 | 42 | /v1/predictions/gradient: 43 | post: 44 | operationId: api.controller.predict_previous 45 | requestBody: 46 | description: House details used to make price prediction 47 | required: true 48 | content: 49 | application/json: 50 | schema: 51 | type: array 52 | items: 53 | $ref: '#/components/schemas/HouseDetails' 54 | responses: 55 | '200': 56 | description: House Price Predictions 57 | '400': 58 | description: Bad request, house data validation failed 59 | '5XX': 60 | description: Unexpected error 61 | 62 | components: 63 | schemas: 64 | HouseDetails: 65 | type: object 66 | description: "List of the houses to get predictions for." 
67 | example: 68 | Id: 1461 69 | MSSubClass: 20 70 | MSZoning: RH 71 | LotFrontage: 80.0 72 | LotArea: 11622 73 | Street: Pave 74 | Alley: null 75 | LotShape: Reg 76 | LandContour: Lvl 77 | Utilities: AllPub 78 | LotConfig: Inside 79 | LandSlope: Gtl 80 | Neighborhood: NAmes 81 | Condition1: Feedr 82 | Condition2: Norm 83 | BldgType: 1Fam 84 | HouseStyle: 1Story 85 | OverallQual: 5 86 | OverallCond: 6 87 | YearBuilt: 1961 88 | YearRemodAdd: 1961 89 | RoofStyle: Gable 90 | RoofMatl: CompShg 91 | Exterior1st: VinylSd 92 | Exterior2nd: VinylSd 93 | MasVnrType: None 94 | MasVnrArea: 0.0 95 | ExterQual: TA 96 | ExterCond: TA 97 | Foundation: CBlock 98 | BsmtQual: TA 99 | BsmtCond: TA 100 | BsmtExposure: null 101 | BsmtFinType1: Rec 102 | BsmtFinSF1: 468.0 103 | BsmtFinType2: LwQ 104 | BsmtFinSF2: 144.0 105 | BsmtUnfSF: 270.0 106 | TotalBsmtSF: 882.0 107 | Heating: GasA 108 | HeatingQC: TA 109 | CentralAir: Y 110 | Electrical: SBrkr 111 | 1stFlrSF: 896 112 | 2ndFlrSF: 0 113 | LowQualFinSF: 0 114 | GrLivArea: 896 115 | BsmtFullBath: 0.0 116 | BsmtHalfBath: 0.0 117 | FullBath: 1 118 | HalfBath: 0 119 | BedroomAbvGr: 2 120 | KitchenAbvGr: 1 121 | KitchenQual: TA 122 | TotRmsAbvGrd: 5 123 | Functional: Typ 124 | Fireplaces: 0 125 | FireplaceQu: null 126 | GarageType: Attchd 127 | GarageYrBlt: 1961.0 128 | GarageFinish: Unf 129 | GarageCars: 1.0 130 | GarageArea: 730.0 131 | GarageQual: TA 132 | GarageCond: TA 133 | PavedDrive: Y 134 | WoodDeckSF: 140 135 | OpenPorchSF: 0 136 | EnclosedPorch: 0 137 | 3SsnPorch: 0 138 | ScreenPorch: 120 139 | PoolArea: 0 140 | PoolQC: null 141 | Fence: MnPrv 142 | MiscFeature: null 143 | MiscVal: 0 144 | MoSold: 6 145 | YrSold: 2010 146 | SaleType: WD 147 | SaleCondition: Normal 148 | -------------------------------------------------------------------------------- /packages/ml_api/differential_tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/trainindata/testing-and-monitoring-ml-deployments/88d0fdac0898178f5ed9611e44a56ea5e53c9dd5/packages/ml_api/differential_tests/__init__.py -------------------------------------------------------------------------------- /packages/ml_api/differential_tests/__main__.py: -------------------------------------------------------------------------------- 1 | import json 2 | from argparse import ArgumentParser, Namespace 3 | from pathlib import Path 4 | from typing import Mapping 5 | 6 | from differential_tests.compare import compare_predictions 7 | from api.config import ROOT 8 | 9 | from termcolor import cprint 10 | from yarl import URL 11 | import requests 12 | 13 | Marginals = Mapping[str, Mapping[str, float]] 14 | 15 | 16 | def parse_args() -> Namespace: 17 | parser = ArgumentParser() 18 | 19 | subparsers = parser.add_subparsers(dest="command") 20 | 21 | compute_parser = subparsers.add_parser( 22 | "compute", help="Compute the predictions for a test set" 23 | ) 24 | compute_parser.add_argument( 25 | "--base-url", 26 | default=URL("http://0.0.0.0:5000"), 27 | type=URL, 28 | help="Base URL of the service to test", 29 | ) 30 | compute_parser.add_argument( 31 | "tests_dir", type=Path, help="Directory containing the test set to use" 32 | ) 33 | compute_parser.add_argument( 34 | "results_dir", type=Path, help="Directory to save the prediction results to" 35 | ) 36 | 37 | compare_parser = subparsers.add_parser( 38 | "compare", help="Compare the actual results with the expected results" 39 | ) 40 | compare_parser.add_argument( 41 | "--absolute-tolerance", 42 | dest="abs_tol", 43 | metavar="X", 44 | type=float, 45 | help="math.isclose(a, b, abs_tol=X)", 46 | default=1e-5, 47 | ) 48 | compare_parser.add_argument( 49 | "--relative-tolerance", 50 | dest="rel_tol", 51 | metavar="X", 52 | type=float, 53 | default=1e-5, 54 | help="math.isclose(a, b, rel_tol=X)", 55 | ) 56 | compare_parser.add_argument( 57 | "expected_results_dir", 58 | type=Path, 59 
| help="Directory containing the expected results", 60 | ) 61 | compare_parser.add_argument( 62 | "actual_results_dir", type=Path, help="Directory containing the actual results" 63 | ) 64 | 65 | return parser.parse_args() 66 | 67 | 68 | def main(args: Namespace) -> None: 69 | if args.command == "compute": 70 | compute_predictions(args) 71 | elif args.command == "compare": 72 | compare_predictions(args) 73 | 74 | 75 | def compute_predictions(args: Namespace) -> None: 76 | print("computing") 77 | 78 | diff_test_dir = ROOT / "differential_tests" 79 | results_dir = args.results_dir 80 | results_dir.mkdir(parents=True, exist_ok=True) 81 | prepared_test_dir = diff_test_dir / Path(args.tests_dir) 82 | 83 | for test_filename in sorted(prepared_test_dir.glob("*.json")): 84 | results_filename = results_dir / test_filename.name 85 | print(f"Computing {results_filename} from {test_filename} ... ", end="") 86 | 87 | with test_filename.open() as f: 88 | test = json.load(f) 89 | 90 | results = requests.post(f"{args.base_url}/v1/predictions/primary", json=test) 91 | 92 | with results_filename.open("w") as f: 93 | json.dump(results.json(), f, indent=2, sort_keys=True) 94 | 95 | cprint("OK", "green") 96 | 97 | 98 | if __name__ == "__main__": 99 | args = parse_args() 100 | main(args) 101 | -------------------------------------------------------------------------------- /packages/ml_api/differential_tests/compare.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import sys 4 | import typing as t 5 | from argparse import Namespace 6 | 7 | from termcolor import cprint 8 | 9 | from api.config import ROOT 10 | 11 | 12 | def compare_differences( 13 | *, 14 | expected_predictions: t.List, 15 | actual_predictions: t.List, 16 | rel_tol: t.Optional[float] = None, 17 | abs_tol: t.Optional[float] = None, 18 | ) -> None: 19 | """ 20 | :param rel_tol: is the relative tolerance – it is the maximum allowed difference 21 | between 
        a and b, relative to the larger absolute value of a or b.
        For example, to set a tolerance of 5%, pass rel_tol=0.05. The default
        tolerance is 1e-09, which assures that the two values are the same within
        about 9 decimal digits. rel_tol must be greater than zero.

    :param abs_tol: abs_tol is the minimum absolute tolerance – useful for comparisons
        near zero. abs_tol must be at least zero.
    """
    # A length mismatch in either direction is reported before any value
    # comparison is attempted.
    only_in_expected = len(expected_predictions) - len(actual_predictions)

    if only_in_expected:
        raise ValueError(f"Missing {only_in_expected} predictions")

    only_in_actual = len(actual_predictions) - len(expected_predictions)

    if only_in_actual:
        raise ValueError(f"Found {only_in_actual} unexpected predictions")

    # Build math.isclose kwargs from whichever tolerances were supplied.
    thresholds = {}

    if abs_tol is not None:
        thresholds["abs_tol"] = abs_tol

    if rel_tol is not None:
        thresholds["rel_tol"] = rel_tol

    for index, (actual_prediction, expected_prediction) in enumerate(
        zip(actual_predictions, expected_predictions)
    ):
        if not math.isclose(expected_prediction, actual_prediction, **thresholds):
            raise ValueError(
                f"Price prediction {index} has changed by more "
                f"than the thresholds: {thresholds}: "
                f"{expected_prediction} (expected) vs "
                f"{actual_prediction} (actual)"
            )


def compare_predictions(args: Namespace) -> None:
    """Compare each expected-results file with its actual counterpart and report."""
    expected_results_dir = ROOT / args.expected_results_dir
    actual_results_dir = ROOT / args.actual_results_dir

    expected_results_filenames = list(expected_results_dir.glob("*.json"))

    if not expected_results_filenames:
        print("No results found!")
        sys.exit(1)

    for expected_results_filename in sorted(expected_results_filenames):
        # Actual results are matched to expected results by file name.
        name = expected_results_filename.name
        actual_results_filename = actual_results_dir / name

        print(
            f"Comparing {expected_results_filename} with {actual_results_filename} ... ",
            end="",
        )

        with expected_results_filename.open() as f:
            expected_results = json.load(f)

        with actual_results_filename.open() as f:
            actual_results = json.load(f)

        try:
            compare_differences(
                expected_predictions=expected_results["predictions"],
                actual_predictions=actual_results["predictions"],
                rel_tol=args.rel_tol,
                abs_tol=args.abs_tol,
            )
        except ValueError as exc:
            # NOTE(review): failures are printed but not re-raised, so this
            # command exits 0 even when predictions differ — confirm the CI
            # wrapper checks the output.
            cprint("ERROR", "red")
            cprint(f" • {exc}", "red")
        else:
            cprint("OK", "green")
--------------------------------------------------------------------------------
/packages/ml_api/differential_tests/sample_payloads/sample_input1.json:
--------------------------------------------------------------------------------
[{
  "Id": 1461,
  "MSSubClass": 20,
  "MSZoning": "RH",
  "LotFrontage": 80.0,
  "LotArea": 11622,
  "Street": "Pave",
  "Alley": null,
  "LotShape": "Reg",
  "LandContour": "Lvl",
  "Utilities": "AllPub",
  "LotConfig": "Inside",
  "LandSlope": "Gtl",
  "Neighborhood": "NAmes",
  "Condition1": "Feedr",
  "Condition2": "Norm",
  "BldgType": "1Fam",
  "HouseStyle": "1Story",
  "OverallQual": 5,
  "OverallCond": 6,
  "YearBuilt": 1961,
  "YearRemodAdd": 1961,
  "RoofStyle": "Gable",
  "RoofMatl": "CompShg",
  "Exterior1st": "VinylSd",
  "Exterior2nd": "VinylSd",
  "MasVnrType": "None",
  "MasVnrArea": 0.0,
  "ExterQual": "TA",
  "ExterCond": "TA",
  "Foundation": "CBlock",
  "BsmtQual": "TA",
  "BsmtCond": "TA",
  "BsmtExposure": "No",
  "BsmtFinType1": "Rec",
  "BsmtFinSF1": 468.0,
  "BsmtFinType2": "LwQ",
  "BsmtFinSF2": 144.0,
  "BsmtUnfSF": 270.0,
  "TotalBsmtSF": 882.0,
  "Heating": "GasA",
  "HeatingQC": "TA",
  "CentralAir": "Y",
  "Electrical": "SBrkr",
  "1stFlrSF": 896,
  "2ndFlrSF": 0,
  "LowQualFinSF": 0,
  "GrLivArea": 896,
  "BsmtFullBath":
0.0, 50 | "BsmtHalfBath": 0.0, 51 | "FullBath": 1, 52 | "HalfBath": 0, 53 | "BedroomAbvGr": 2, 54 | "KitchenAbvGr": 1, 55 | "KitchenQual": "TA", 56 | "TotRmsAbvGrd": 5, 57 | "Functional": "Typ", 58 | "Fireplaces": 0, 59 | "FireplaceQu": null, 60 | "GarageType": "Attchd", 61 | "GarageYrBlt": 1961.0, 62 | "GarageFinish": "Unf", 63 | "GarageCars": 1.0, 64 | "GarageArea": 730.0, 65 | "GarageQual": "TA", 66 | "GarageCond": "TA", 67 | "PavedDrive": "Y", 68 | "WoodDeckSF": 140, 69 | "OpenPorchSF": 0, 70 | "EnclosedPorch": 0, 71 | "3SsnPorch": 0, 72 | "ScreenPorch": 120, 73 | "PoolArea": 0, 74 | "PoolQC": null, 75 | "Fence": "MnPrv", 76 | "MiscFeature": null, 77 | "MiscVal": 0, 78 | "MoSold": 6, 79 | "YrSold": 2010, 80 | "SaleType": "WD", 81 | "SaleCondition": "Normal" 82 | }, { 83 | "Id": 1461, 84 | "MSSubClass": 20, 85 | "MSZoning": "RH", 86 | "LotFrontage": 80.0, 87 | "LotArea": 11689, 88 | "Street": "Pave", 89 | "Alley": null, 90 | "LotShape": "Reg", 91 | "LandContour": "Lvl", 92 | "Utilities": "AllPub", 93 | "LotConfig": "Inside", 94 | "LandSlope": "Gtl", 95 | "Neighborhood": "NAmes", 96 | "Condition1": "Feedr", 97 | "Condition2": "Norm", 98 | "BldgType": "1Fam", 99 | "HouseStyle": "1Story", 100 | "OverallQual": 5, 101 | "OverallCond": 6, 102 | "YearBuilt": 1969, 103 | "YearRemodAdd": 1961, 104 | "RoofStyle": "Gable", 105 | "RoofMatl": "CompShg", 106 | "Exterior1st": "VinylSd", 107 | "Exterior2nd": "VinylSd", 108 | "MasVnrType": "None", 109 | "MasVnrArea": 0.0, 110 | "ExterQual": "TA", 111 | "ExterCond": "TA", 112 | "Foundation": "CBlock", 113 | "BsmtQual": "TA", 114 | "BsmtCond": "TA", 115 | "BsmtExposure": "No", 116 | "BsmtFinType1": "Rec", 117 | "BsmtFinSF1": 468.0, 118 | "BsmtFinType2": "LwQ", 119 | "BsmtFinSF2": 144.0, 120 | "BsmtUnfSF": 270.0, 121 | "TotalBsmtSF": 882.0, 122 | "Heating": "GasA", 123 | "HeatingQC": "TA", 124 | "CentralAir": "Y", 125 | "Electrical": "SBrkr", 126 | "1stFlrSF": 752, 127 | "2ndFlrSF": 0, 128 | "LowQualFinSF": 0, 129 | "GrLivArea": 
896, 130 | "BsmtFullBath": 0.0, 131 | "BsmtHalfBath": 0.0, 132 | "FullBath": 1, 133 | "HalfBath": 0, 134 | "BedroomAbvGr": 2, 135 | "KitchenAbvGr": 1, 136 | "KitchenQual": "TA", 137 | "TotRmsAbvGrd": 5, 138 | "Functional": "Typ", 139 | "Fireplaces": 0, 140 | "FireplaceQu": null, 141 | "GarageType": "Attchd", 142 | "GarageYrBlt": 1961.0, 143 | "GarageFinish": "Unf", 144 | "GarageCars": 1.0, 145 | "GarageArea": 730.0, 146 | "GarageQual": "TA", 147 | "GarageCond": "TA", 148 | "PavedDrive": "Y", 149 | "WoodDeckSF": 140, 150 | "OpenPorchSF": 0, 151 | "EnclosedPorch": 0, 152 | "3SsnPorch": 0, 153 | "ScreenPorch": 120, 154 | "PoolArea": 0, 155 | "PoolQC": null, 156 | "Fence": "MnPrv", 157 | "MiscFeature": null, 158 | "MiscVal": 0, 159 | "MoSold": 6, 160 | "YrSold": 2010, 161 | "SaleType": "WD", 162 | "SaleCondition": "Normal" 163 | }, 164 | { 165 | "Id": 1461, 166 | "MSSubClass": 20, 167 | "MSZoning": "RH", 168 | "LotFrontage": 80.0, 169 | "LotArea": 22689, 170 | "Street": "Pave", 171 | "Alley": null, 172 | "LotShape": "Reg", 173 | "LandContour": "Lvl", 174 | "Utilities": "AllPub", 175 | "LotConfig": "Inside", 176 | "LandSlope": "Gtl", 177 | "Neighborhood": "NAmes", 178 | "Condition1": "Feedr", 179 | "Condition2": "Norm", 180 | "BldgType": "1Fam", 181 | "HouseStyle": "1Story", 182 | "OverallQual": 5, 183 | "OverallCond": 6, 184 | "YearBuilt": 1969, 185 | "YearRemodAdd": 1961, 186 | "RoofStyle": "Gable", 187 | "RoofMatl": "CompShg", 188 | "Exterior1st": "VinylSd", 189 | "Exterior2nd": "VinylSd", 190 | "MasVnrType": "None", 191 | "MasVnrArea": 0.0, 192 | "ExterQual": "TA", 193 | "ExterCond": "TA", 194 | "Foundation": "CBlock", 195 | "BsmtQual": "TA", 196 | "BsmtCond": "TA", 197 | "BsmtExposure": "No", 198 | "BsmtFinType1": "Rec", 199 | "BsmtFinSF1": 468.0, 200 | "BsmtFinType2": "LwQ", 201 | "BsmtFinSF2": 144.0, 202 | "BsmtUnfSF": 270.0, 203 | "TotalBsmtSF": 882.0, 204 | "Heating": "GasA", 205 | "HeatingQC": "TA", 206 | "CentralAir": "Y", 207 | "Electrical": "SBrkr", 208 | 
"1stFlrSF": 752, 209 | "2ndFlrSF": 0, 210 | "LowQualFinSF": 0, 211 | "GrLivArea": 896, 212 | "BsmtFullBath": 0.0, 213 | "BsmtHalfBath": 0.0, 214 | "FullBath": 1, 215 | "HalfBath": 0, 216 | "BedroomAbvGr": 2, 217 | "KitchenAbvGr": 1, 218 | "KitchenQual": "TA", 219 | "TotRmsAbvGrd": 5, 220 | "Functional": "Typ", 221 | "Fireplaces": 0, 222 | "FireplaceQu": null, 223 | "GarageType": "Attchd", 224 | "GarageYrBlt": 1961.0, 225 | "GarageFinish": "Unf", 226 | "GarageCars": 1.0, 227 | "GarageArea": 730.0, 228 | "GarageQual": "TA", 229 | "GarageCond": "TA", 230 | "PavedDrive": "Y", 231 | "WoodDeckSF": 140, 232 | "OpenPorchSF": 0, 233 | "EnclosedPorch": 0, 234 | "3SsnPorch": 0, 235 | "ScreenPorch": 120, 236 | "PoolArea": 0, 237 | "PoolQC": null, 238 | "Fence": "MnPrv", 239 | "MiscFeature": null, 240 | "MiscVal": 0, 241 | "MoSold": 6, 242 | "YrSold": 2010, 243 | "SaleType": "WD", 244 | "SaleCondition": "Normal" 245 | },{ 246 | "Id": 1461, 247 | "MSSubClass": 20, 248 | "MSZoning": "RH", 249 | "LotFrontage": 80.0, 250 | "LotArea": 11689, 251 | "Street": "Pave", 252 | "Alley": null, 253 | "LotShape": "Reg", 254 | "LandContour": "Lvl", 255 | "Utilities": "AllPub", 256 | "LotConfig": "Inside", 257 | "LandSlope": "Gtl", 258 | "Neighborhood": "NAmes", 259 | "Condition1": "Feedr", 260 | "Condition2": "Norm", 261 | "BldgType": "1Fam", 262 | "HouseStyle": "1Story", 263 | "OverallQual": 5, 264 | "OverallCond": 6, 265 | "YearBuilt": 1969, 266 | "YearRemodAdd": 1961, 267 | "RoofStyle": "Gable", 268 | "RoofMatl": "CompShg", 269 | "Exterior1st": "VinylSd", 270 | "Exterior2nd": "VinylSd", 271 | "MasVnrType": "None", 272 | "MasVnrArea": 0.0, 273 | "ExterQual": "TA", 274 | "ExterCond": "TA", 275 | "Foundation": "CBlock", 276 | "BsmtQual": "TA", 277 | "BsmtCond": "TA", 278 | "BsmtExposure": "No", 279 | "BsmtFinType1": "Rec", 280 | "BsmtFinSF1": 468.0, 281 | "BsmtFinType2": "LwQ", 282 | "BsmtFinSF2": 144.0, 283 | "BsmtUnfSF": 270.0, 284 | "TotalBsmtSF": 882.0, 285 | "Heating": "GasA", 286 | 
"HeatingQC": "TA", 287 | "CentralAir": "Y", 288 | "Electrical": "SBrkr", 289 | "1stFlrSF": 988, 290 | "2ndFlrSF": 0, 291 | "LowQualFinSF": 0, 292 | "GrLivArea": 896, 293 | "BsmtFullBath": 0.0, 294 | "BsmtHalfBath": 0.0, 295 | "FullBath": 1, 296 | "HalfBath": 0, 297 | "BedroomAbvGr": 2, 298 | "KitchenAbvGr": 1, 299 | "KitchenQual": "TA", 300 | "TotRmsAbvGrd": 5, 301 | "Functional": "Typ", 302 | "Fireplaces": 0, 303 | "FireplaceQu": null, 304 | "GarageType": "Attchd", 305 | "GarageYrBlt": 1961.0, 306 | "GarageFinish": "Unf", 307 | "GarageCars": 1.0, 308 | "GarageArea": 730.0, 309 | "GarageQual": "TA", 310 | "GarageCond": "TA", 311 | "PavedDrive": "Y", 312 | "WoodDeckSF": 140, 313 | "OpenPorchSF": 0, 314 | "EnclosedPorch": 0, 315 | "3SsnPorch": 0, 316 | "ScreenPorch": 120, 317 | "PoolArea": 0, 318 | "PoolQC": null, 319 | "Fence": "MnPrv", 320 | "MiscFeature": null, 321 | "MiscVal": 0, 322 | "MoSold": 6, 323 | "YrSold": 2010, 324 | "SaleType": "WD", 325 | "SaleCondition": "Normal" 326 | },{ 327 | "Id": 1461, 328 | "MSSubClass": 20, 329 | "MSZoning": "RH", 330 | "LotFrontage": 80.0, 331 | "LotArea": 11689, 332 | "Street": "Pave", 333 | "Alley": null, 334 | "LotShape": "Reg", 335 | "LandContour": "Lvl", 336 | "Utilities": "AllPub", 337 | "LotConfig": "Inside", 338 | "LandSlope": "Gtl", 339 | "Neighborhood": "NAmes", 340 | "Condition1": "Feedr", 341 | "Condition2": "Norm", 342 | "BldgType": "1Fam", 343 | "HouseStyle": "1Story", 344 | "OverallQual": 5, 345 | "OverallCond": 6, 346 | "YearBuilt": 1969, 347 | "YearRemodAdd": 1961, 348 | "RoofStyle": "Gable", 349 | "RoofMatl": "CompShg", 350 | "Exterior1st": "VinylSd", 351 | "Exterior2nd": "VinylSd", 352 | "MasVnrType": "None", 353 | "MasVnrArea": 0.0, 354 | "ExterQual": "TA", 355 | "ExterCond": "TA", 356 | "Foundation": "CBlock", 357 | "BsmtQual": "TA", 358 | "BsmtCond": "TA", 359 | "BsmtExposure": "No", 360 | "BsmtFinType1": "Rec", 361 | "BsmtFinSF1": 468.0, 362 | "BsmtFinType2": "LwQ", 363 | "BsmtFinSF2": 144.0, 364 | 
"BsmtUnfSF": 270.0, 365 | "TotalBsmtSF": 882.0, 366 | "Heating": "GasA", 367 | "HeatingQC": "TA", 368 | "CentralAir": "Y", 369 | "Electrical": "SBrkr", 370 | "1stFlrSF": 752, 371 | "2ndFlrSF": 0, 372 | "LowQualFinSF": 0, 373 | "GrLivArea": 896, 374 | "BsmtFullBath": 0.0, 375 | "BsmtHalfBath": 0.0, 376 | "FullBath": 1, 377 | "HalfBath": 0, 378 | "BedroomAbvGr": 2, 379 | "KitchenAbvGr": 1, 380 | "KitchenQual": "TA", 381 | "TotRmsAbvGrd": 5, 382 | "Functional": "Typ", 383 | "Fireplaces": 0, 384 | "FireplaceQu": null, 385 | "GarageType": "Attchd", 386 | "GarageYrBlt": 1961.0, 387 | "GarageFinish": "Unf", 388 | "GarageCars": 1.0, 389 | "GarageArea": 730.0, 390 | "GarageQual": "TA", 391 | "GarageCond": "TA", 392 | "PavedDrive": "Y", 393 | "WoodDeckSF": 140, 394 | "OpenPorchSF": 0, 395 | "EnclosedPorch": 0, 396 | "3SsnPorch": 0, 397 | "ScreenPorch": 120, 398 | "PoolArea": 0, 399 | "PoolQC": null, 400 | "Fence": "MnPrv", 401 | "MiscFeature": null, 402 | "MiscVal": 0, 403 | "MoSold": 6, 404 | "YrSold": 2008, 405 | "SaleType": "WD", 406 | "SaleCondition": "Normal" 407 | },{ 408 | "Id": 1461, 409 | "MSSubClass": 20, 410 | "MSZoning": "RH", 411 | "LotFrontage": 80.0, 412 | "LotArea": 25000, 413 | "Street": "Pave", 414 | "Alley": null, 415 | "LotShape": "Reg", 416 | "LandContour": "Lvl", 417 | "Utilities": "AllPub", 418 | "LotConfig": "Inside", 419 | "LandSlope": "Gtl", 420 | "Neighborhood": "NAmes", 421 | "Condition1": "Feedr", 422 | "Condition2": "Norm", 423 | "BldgType": "1Fam", 424 | "HouseStyle": "1Story", 425 | "OverallQual": 5, 426 | "OverallCond": 6, 427 | "YearBuilt": 1969, 428 | "YearRemodAdd": 1961, 429 | "RoofStyle": "Gable", 430 | "RoofMatl": "CompShg", 431 | "Exterior1st": "VinylSd", 432 | "Exterior2nd": "VinylSd", 433 | "MasVnrType": "None", 434 | "MasVnrArea": 0.0, 435 | "ExterQual": "TA", 436 | "ExterCond": "TA", 437 | "Foundation": "CBlock", 438 | "BsmtQual": "TA", 439 | "BsmtCond": "TA", 440 | "BsmtExposure": "No", 441 | "BsmtFinType1": "Rec", 442 | 
"BsmtFinSF1": 468.0, 443 | "BsmtFinType2": "LwQ", 444 | "BsmtFinSF2": 144.0, 445 | "BsmtUnfSF": 270.0, 446 | "TotalBsmtSF": 882.0, 447 | "Heating": "GasA", 448 | "HeatingQC": "TA", 449 | "CentralAir": "Y", 450 | "Electrical": "SBrkr", 451 | "1stFlrSF": 752, 452 | "2ndFlrSF": 0, 453 | "LowQualFinSF": 0, 454 | "GrLivArea": 896, 455 | "BsmtFullBath": 0.0, 456 | "BsmtHalfBath": 0.0, 457 | "FullBath": 1, 458 | "HalfBath": 0, 459 | "BedroomAbvGr": 2, 460 | "KitchenAbvGr": 1, 461 | "KitchenQual": "TA", 462 | "TotRmsAbvGrd": 5, 463 | "Functional": "Typ", 464 | "Fireplaces": 0, 465 | "FireplaceQu": null, 466 | "GarageType": "Attchd", 467 | "GarageYrBlt": 1961.0, 468 | "GarageFinish": "Unf", 469 | "GarageCars": 1.0, 470 | "GarageArea": 730.0, 471 | "GarageQual": "TA", 472 | "GarageCond": "TA", 473 | "PavedDrive": "Y", 474 | "WoodDeckSF": 140, 475 | "OpenPorchSF": 0, 476 | "EnclosedPorch": 0, 477 | "3SsnPorch": 0, 478 | "ScreenPorch": 120, 479 | "PoolArea": 0, 480 | "PoolQC": null, 481 | "Fence": "MnPrv", 482 | "MiscFeature": null, 483 | "MiscVal": 0, 484 | "MoSold": 6, 485 | "YrSold": 2010, 486 | "SaleType": "WD", 487 | "SaleCondition": "Normal" 488 | }] -------------------------------------------------------------------------------- /packages/ml_api/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.5-slim-buster 2 | 3 | RUN mkdir -p /opt/app 4 | COPY requirements /opt/app/requirements 5 | RUN pip install --upgrade pip 6 | 7 | # ensure we can run the make commands 8 | RUN apt-get update -y && \ 9 | apt-get install -y make && \ 10 | apt-get install -y libffi-dev gcc && \ 11 | # for swagger 12 | apt-get install -y curl && \ 13 | # for postgres driver 14 | apt-get install -y libpq-dev 15 | 16 | RUN pip install -r /opt/app/requirements/requirements.txt 17 | ENV PYTHONPATH "${PYTHONPATH}:/opt/app/" 18 | 19 | ADD . 
/opt/app 20 | WORKDIR /opt/app 21 | -------------------------------------------------------------------------------- /packages/ml_api/docker/Dockerfile.test: -------------------------------------------------------------------------------- 1 | FROM python:3.9.5-slim-buster 2 | 3 | RUN mkdir -p /opt/app 4 | COPY requirements /opt/app/requirements 5 | RUN pip install --upgrade pip 6 | 7 | # ensure we can run the make commands 8 | RUN apt-get update -y && \ 9 | apt-get install -y make && \ 10 | apt-get install -y libffi-dev gcc && \ 11 | # for swagger 12 | apt-get install -y curl 13 | 14 | ENV PYTHONPATH "${PYTHONPATH}:/opt/app" 15 | RUN pip install -r /opt/app/requirements/test_requirements.txt 16 | 17 | ADD . /opt/app 18 | WORKDIR /opt/app 19 | -------------------------------------------------------------------------------- /packages/ml_api/docker/config/grafana/grafana_flask_basic_dashboard_ml_api.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "editable": true, 16 | "gnetId": null, 17 | "graphTooltip": 0, 18 | "id": 3, 19 | "links": [], 20 | "panels": [ 21 | { 22 | "aliasColors": {}, 23 | "bars": false, 24 | "dashLength": 10, 25 | "dashes": false, 26 | "datasource": "Prometheus", 27 | "fill": 1, 28 | "fillGradient": 0, 29 | "gridPos": { 30 | "h": 9, 31 | "w": 12, 32 | "x": 0, 33 | "y": 0 34 | }, 35 | "hiddenSeries": false, 36 | "id": 2, 37 | "legend": { 38 | "avg": false, 39 | "current": false, 40 | "max": false, 41 | "min": false, 42 | "show": true, 43 | "total": false, 44 | "values": false 45 | }, 46 | "lines": true, 47 | "linewidth": 1, 48 | "nullPointMode": "null", 49 | "options": { 50 | "dataLinks": [] 51 | }, 52 | "percentage": false, 53 | "pointradius": 2, 
54 | "points": false, 55 | "renderer": "flot", 56 | "seriesOverrides": [], 57 | "spaceLength": 10, 58 | "stack": false, 59 | "steppedLine": false, 60 | "targets": [ 61 | { 62 | "expr": "rate(http_request_count_total{job=\"ml_api\"}[5m])", 63 | "legendFormat": "{{app_name}} {{method}} {{endpoint}} {{http_status}}", 64 | "refId": "A" 65 | } 66 | ], 67 | "thresholds": [], 68 | "timeFrom": null, 69 | "timeRegions": [], 70 | "timeShift": null, 71 | "title": "Requests Rate", 72 | "tooltip": { 73 | "shared": true, 74 | "sort": 0, 75 | "value_type": "individual" 76 | }, 77 | "type": "graph", 78 | "xaxis": { 79 | "buckets": null, 80 | "mode": "time", 81 | "name": null, 82 | "show": true, 83 | "values": [] 84 | }, 85 | "yaxes": [ 86 | { 87 | "format": "short", 88 | "label": null, 89 | "logBase": 1, 90 | "max": null, 91 | "min": null, 92 | "show": true 93 | }, 94 | { 95 | "format": "short", 96 | "label": null, 97 | "logBase": 1, 98 | "max": null, 99 | "min": null, 100 | "show": true 101 | } 102 | ], 103 | "yaxis": { 104 | "align": false, 105 | "alignLevel": null 106 | } 107 | }, 108 | { 109 | "aliasColors": {}, 110 | "bars": false, 111 | "dashLength": 10, 112 | "dashes": false, 113 | "datasource": "Prometheus", 114 | "fill": 1, 115 | "fillGradient": 0, 116 | "gridPos": { 117 | "h": 9, 118 | "w": 12, 119 | "x": 12, 120 | "y": 0 121 | }, 122 | "hiddenSeries": false, 123 | "id": 3, 124 | "legend": { 125 | "avg": false, 126 | "current": false, 127 | "max": false, 128 | "min": false, 129 | "show": true, 130 | "total": false, 131 | "values": false 132 | }, 133 | "lines": true, 134 | "linewidth": 1, 135 | "nullPointMode": "null", 136 | "options": { 137 | "dataLinks": [] 138 | }, 139 | "percentage": false, 140 | "pointradius": 2, 141 | "points": false, 142 | "renderer": "flot", 143 | "seriesOverrides": [], 144 | "spaceLength": 10, 145 | "stack": false, 146 | "steppedLine": false, 147 | "targets": [ 148 | { 149 | "expr": "sum 
(rate(http_request_latency_seconds_sum{job=\"ml_api\"}[5m])) / sum (rate(http_request_latency_seconds_count{job=\"ml_api\"}[5m]))", 150 | "legendFormat": "Average (seconds)", 151 | "refId": "A" 152 | } 153 | ], 154 | "thresholds": [], 155 | "timeFrom": null, 156 | "timeRegions": [], 157 | "timeShift": null, 158 | "title": "Latency", 159 | "tooltip": { 160 | "shared": true, 161 | "sort": 0, 162 | "value_type": "individual" 163 | }, 164 | "type": "graph", 165 | "xaxis": { 166 | "buckets": null, 167 | "mode": "time", 168 | "name": null, 169 | "show": true, 170 | "values": [] 171 | }, 172 | "yaxes": [ 173 | { 174 | "format": "short", 175 | "label": null, 176 | "logBase": 1, 177 | "max": null, 178 | "min": null, 179 | "show": true 180 | }, 181 | { 182 | "format": "short", 183 | "label": null, 184 | "logBase": 1, 185 | "max": null, 186 | "min": null, 187 | "show": true 188 | } 189 | ], 190 | "yaxis": { 191 | "align": false, 192 | "alignLevel": null 193 | } 194 | } 195 | ], 196 | "schemaVersion": 21, 197 | "style": "dark", 198 | "tags": [], 199 | "templating": { 200 | "list": [] 201 | }, 202 | "time": { 203 | "from": "now-1h", 204 | "to": "now" 205 | }, 206 | "timepicker": { 207 | "refresh_intervals": [ 208 | "5s", 209 | "10s", 210 | "30s", 211 | "1m", 212 | "5m", 213 | "15m", 214 | "30m", 215 | "1h", 216 | "2h", 217 | "1d" 218 | ] 219 | }, 220 | "timezone": "", 221 | "title": "Really Simple Flask Dashboard", 222 | "uid": "q8vgEpLZl", 223 | "version": 3 224 | } -------------------------------------------------------------------------------- /packages/ml_api/docker/config/prometheus/prometheus.yml: -------------------------------------------------------------------------------- 1 | # my global config 2 | global: 3 | scrape_interval: 15s # By default, scrape targets every 15 seconds. 4 | evaluation_interval: 15s # By default, scrape targets every 15 seconds. 5 | # scrape_timeout is set to the global default (10s). 
6 | 7 | # Attach these labels to any time series or alerts when communicating with 8 | # external systems (federation, remote storage, Alertmanager). 9 | external_labels: 10 | monitor: 'my-project' 11 | 12 | # A scrape configuration containing exactly one endpoint to scrape: 13 | # Here it's Prometheus itself. 14 | scrape_configs: 15 | # The job name is added as a label `job=` to any timeseries scraped from this config. 16 | - job_name: 'prometheus' 17 | 18 | # Override the global default and scrape targets from this job every 5 seconds. 19 | scrape_interval: 5s 20 | 21 | # metrics_path defaults to '/metrics' 22 | # scheme defaults to 'http'. 23 | 24 | static_configs: 25 | - targets: ['prometheus:9090'] 26 | - job_name: 'ml_api' 27 | 28 | # Override the global default and scrape targets from this job every 5 seconds. 29 | scrape_interval: 5s 30 | 31 | # metrics_path defaults to '/metrics' 32 | # scheme defaults to 'http'. 33 | static_configs: 34 | - targets: ['ml_api:5000'] 35 | 36 | - job_name: 'cadvisor' 37 | 38 | # Override the global default and scrape targets from this job every 5 seconds. 
39 | scrape_interval: 5s 40 | 41 | static_configs: 42 | - targets: ['cadvisor:8080'] 43 | -------------------------------------------------------------------------------- /packages/ml_api/docker/docker-compose-ci-candidate.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | 4 | ml_api: 5 | image: christophergs/ml_api:${TARGET} 6 | environment: 7 | SERVER_PORT: ${SERVER_PORT:-5001} 8 | build: 9 | context: ../ 10 | dockerfile: docker/Dockerfile.test 11 | ports: 12 | - "5001:5001" 13 | tty: true 14 | command: bash -c "make run-service-development" 15 | 16 | differential-tests: 17 | image: christophergs/ml_api:${TARGET} 18 | command: ["true"] 19 | depends_on: 20 | - ml_api -------------------------------------------------------------------------------- /packages/ml_api/docker/docker-compose-ci-master.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | 4 | ml_api: 5 | image: christophergs/ml_api:${TARGET} 6 | environment: 7 | SERVER_PORT: ${SERVER_PORT:-5000} 8 | build: 9 | context: ../ 10 | dockerfile: docker/Dockerfile.test 11 | ports: 12 | - "5000:5000" 13 | tty: true 14 | command: bash -c "make run-service-development" 15 | 16 | differential-tests: 17 | image: christophergs/ml_api:${TARGET} 18 | command: ["true"] 19 | depends_on: 20 | - ml_api -------------------------------------------------------------------------------- /packages/ml_api/docker/docker-compose-elk.yml: -------------------------------------------------------------------------------- 1 | version: '3.2' 2 | services: 3 | ml_api: 4 | build: 5 | context: ../ 6 | dockerfile: docker/Dockerfile 7 | environment: 8 | DB_HOST: database 9 | DB_PORT: 5432 10 | DB_USER: user 11 | DB_PASSWORD: ${DB_PASSWORD:-password} 12 | DB_NAME: ml_api_dev 13 | networks: 14 | - elk 15 | depends_on: 16 | - database 17 | - logstash 18 | ports: 19 | - "5000:5000" # expose webserver to localhost 
host:container 20 | command: bash -c "make db-migrations && make run-service-wsgi" 21 | 22 | database: 23 | image: postgres:latest 24 | environment: 25 | POSTGRES_USER: user 26 | POSTGRES_PASSWORD: password 27 | POSTGRES_DB: ml_api_dev 28 | ports: 29 | # expose postgres container on different host port to default (host:container) 30 | - "6609:5432" 31 | volumes: 32 | - my_dbdata:/var/lib/postgresql/data 33 | networks: 34 | - elk 35 | 36 | elasticsearch: 37 | image: docker.elastic.co/elasticsearch/elasticsearch:${ELK_VERSION} 38 | volumes: 39 | - type: bind 40 | source: ./elasticsearch/config/elasticsearch.yml 41 | target: /usr/share/elasticsearch/config/elasticsearch.yml 42 | read_only: true 43 | - type: volume 44 | source: elasticsearch 45 | target: /usr/share/elasticsearch/data 46 | ports: 47 | - "9200:9200" 48 | - "9300:9300" 49 | environment: 50 | ES_JAVA_OPTS: "-Xmx256m -Xms256m" 51 | ELASTIC_PASSWORD: changeme 52 | # Use single node discovery in order to disable production mode and avoid bootstrap checks 53 | # see https://www.elastic.co/guide/en/elasticsearch/reference/current/bootstrap-checks.html 54 | discovery.type: single-node 55 | networks: 56 | - elk 57 | 58 | logstash: 59 | image: docker.elastic.co/logstash/logstash:${ELK_VERSION} 60 | volumes: 61 | - type: bind 62 | source: ./logstash/config/logstash.yml 63 | target: /usr/share/logstash/config/logstash.yml 64 | read_only: true 65 | - type: bind 66 | source: ./logstash/pipeline 67 | target: /usr/share/logstash/pipeline 68 | read_only: true 69 | ports: 70 | - "5001:5001" 71 | - "9600:9600" 72 | environment: 73 | LS_JAVA_OPTS: "-Xmx256m -Xms256m" 74 | networks: 75 | - elk 76 | depends_on: 77 | - elasticsearch 78 | 79 | kibana: 80 | image: docker.elastic.co/kibana/kibana:${ELK_VERSION} 81 | volumes: 82 | - type: bind 83 | source: ./kibana/config/kibana.yml 84 | target: /usr/share/kibana/config/kibana.yml 85 | read_only: true 86 | ports: 87 | - "5601:5601" 88 | networks: 89 | - elk 90 | depends_on: 91 | - 
elasticsearch 92 | 93 | networks: 94 | elk: 95 | driver: bridge 96 | 97 | volumes: 98 | my_dbdata: 99 | elasticsearch: -------------------------------------------------------------------------------- /packages/ml_api/docker/docker-compose.test.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | ml_api_test: 4 | image: christophergs/ml_api:master 5 | build: 6 | context: ../ 7 | dockerfile: docker/Dockerfile.test 8 | environment: 9 | DB_HOST: test_database 10 | DB_PORT: 5432 11 | DB_USER: test_user 12 | DB_PASSWORD: ${DB_PASSWORD:-password} 13 | DB_NAME: ml_api_test 14 | depends_on: 15 | - test_database 16 | ports: 17 | - "5000:5000" # expose webserver to localhost host:container 18 | command: bash -c "make db-migrations && make run-service-development" 19 | 20 | test_database: 21 | image: postgres:latest 22 | environment: 23 | POSTGRES_USER: test_user 24 | POSTGRES_PASSWORD: password 25 | POSTGRES_DB: ml_api_test 26 | ports: 27 | # expose postgres container on different host port to default (host:container) 28 | - "6608:5432" 29 | volumes: 30 | - my_dbdata_test:/var/lib/postgresql/test_data 31 | 32 | volumes: 33 | my_dbdata_test: 34 | -------------------------------------------------------------------------------- /packages/ml_api/docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | ml_api: 4 | build: 5 | context: ../ 6 | dockerfile: docker/Dockerfile 7 | environment: 8 | DB_HOST: database 9 | DB_PORT: 5432 10 | DB_USER: user 11 | DB_PASSWORD: ${DB_PASSWORD:-password} 12 | DB_NAME: ml_api_dev 13 | depends_on: 14 | - database 15 | - cadvisor 16 | ports: 17 | - "5000:5000" # expose webserver to localhost host:container 18 | command: bash -c "make db-migrations && make run-service-wsgi" 19 | 20 | database: 21 | image: postgres:latest 22 | environment: 23 | POSTGRES_USER: user 24 | POSTGRES_PASSWORD: password 25 | 
POSTGRES_DB: ml_api_dev 26 | ports: 27 | # expose postgres container on different host port to default (host:container) 28 | - "6609:5432" 29 | volumes: 30 | - my_dbdata:/var/lib/postgresql/data 31 | 32 | prometheus: 33 | image: prom/prometheus 34 | container_name: prometheus 35 | volumes: 36 | - ./config/prometheus/:/etc/prometheus/ 37 | - prometheus_data:/prometheus 38 | command: 39 | - '--config.file=/etc/prometheus/prometheus.yml' 40 | expose: 41 | - 9090 42 | ports: 43 | - 9090:9090 44 | depends_on: 45 | - cadvisor 46 | 47 | grafana: 48 | image: grafana/grafana 49 | depends_on: 50 | - prometheus 51 | ports: 52 | - 3000:3000 53 | volumes: 54 | - grafana_data:/var/lib/grafana 55 | environment: 56 | - GF_SECURITY_ADMIN_PASSWORD=foobar 57 | - GF_USERS_ALLOW_SIGN_UP=false 58 | 59 | cadvisor: 60 | image: google/cadvisor 61 | volumes: 62 | - /:/rootfs:ro 63 | - /var/run:/var/run:rw 64 | - /sys:/sys:ro 65 | - /var/lib/docker/:/var/lib/docker:ro 66 | ports: 67 | - 8080:8080 68 | 69 | volumes: 70 | my_dbdata: {} 71 | prometheus_data: {} 72 | grafana_data: {} 73 | -------------------------------------------------------------------------------- /packages/ml_api/docker/elasticsearch/config/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | ## Default Elasticsearch configuration from Elasticsearch base image. 
2 | ## https://github.com/elastic/elasticsearch/blob/master/distribution/docker/src/docker/config/elasticsearch.yml 3 | cluster.name: "docker-cluster" 4 | network.host: 0.0.0.0 5 | 6 | ## X-Pack settings 7 | ## see https://www.elastic.co/guide/en/elasticsearch/reference/current/setup-xpack.html 8 | xpack.license.self_generated.type: basic 9 | xpack.security.enabled: true 10 | xpack.monitoring.collection.enabled: true 11 | -------------------------------------------------------------------------------- /packages/ml_api/docker/kibana/config/kibana.yml: -------------------------------------------------------------------------------- 1 | --- 2 | ## Default Kibana configuration from Kibana base image. 3 | ## https://github.com/elastic/kibana/blob/master/src/dev/build/tasks/os_packages/docker_generator/templates/kibana_yml.template.js 4 | 5 | server.name: kibana 6 | server.host: "0" 7 | elasticsearch.hosts: [ "http://elasticsearch:9200" ] 8 | xpack.monitoring.ui.container.elasticsearch.enabled: true 9 | 10 | ## X-Pack security credentials 11 | elasticsearch.username: elastic 12 | elasticsearch.password: changeme 13 | -------------------------------------------------------------------------------- /packages/ml_api/docker/logstash/config/logstash.yml: -------------------------------------------------------------------------------- 1 | --- 2 | ## Default Logstash configuration from Logstash base image. 
3 | ## https://github.com/elastic/logstash/blob/master/docker/data/logstash/config/logstash-full.yml 4 | # 5 | http.host: "0.0.0.0" 6 | xpack.monitoring.elasticsearch.hosts: [ "http://elasticsearch:9200" ] 7 | 8 | ## X-Pack security credentials 9 | xpack.monitoring.enabled: true 10 | xpack.monitoring.elasticsearch.username: elastic 11 | xpack.monitoring.elasticsearch.password: changeme -------------------------------------------------------------------------------- /packages/ml_api/docker/logstash/pipeline/logstash.conf: -------------------------------------------------------------------------------- 1 | input { 2 | tcp { 3 | port => 5001 4 | tags => ["webapp_logs"] 5 | type => "webapp_logs" 6 | codec => json 7 | } 8 | } 9 | 10 | output { 11 | if [LotArea] { 12 | elasticsearch { 13 | hosts => "elasticsearch:9200" 14 | user => "elastic" 15 | password => "changeme" 16 | index => "input_logs-%{+YYYY.MM.dd}" 17 | } 18 | } else { 19 | elasticsearch { 20 | hosts => "elasticsearch:9200" 21 | user => "elastic" 22 | password => "changeme" 23 | index => "webapp_logs-%{+YYYY.MM.dd}" 24 | } 25 | } 26 | } -------------------------------------------------------------------------------- /packages/ml_api/docker/workaround_32_os/Dockerfile.workaround: -------------------------------------------------------------------------------- 1 | FROM python:3.9.5-slim-buster 2 | 3 | RUN mkdir -p /opt/app 4 | COPY requirements /opt/app/requirements 5 | RUN pip install --upgrade pip 6 | RUN pip install tox 7 | 8 | # ensure we can run the make commands 9 | RUN apt-get update -y && \ 10 | apt-get install -y make && \ 11 | apt-get install -y libffi-dev gcc && \ 12 | # for swagger 13 | apt-get install -y curl 14 | 15 | RUN pip install -r /opt/app/requirements/test_requirements.txt 16 | COPY tests /opt/app/tests 17 | COPY tox.ini /opt/app/tox.ini 18 | COPY api /opt/app/api 19 | COPY run.py /opt/app/run.py 20 | WORKDIR /opt/app 21 | 
-------------------------------------------------------------------------------- /packages/ml_api/docker/workaround_32_os/docker-compose-workaround.yml: -------------------------------------------------------------------------------- 1 | # This is only to be used as a workaround for students who 2 | # are unable to install the gradient_boosting_model package 3 | # because they are on a 32 bit operating system 4 | 5 | version: '3' 6 | services: 7 | ml_api: 8 | build: 9 | context: ../../ 10 | dockerfile: docker/workaround_32_os/Dockerfile.workaround 11 | ports: 12 | - "5000:5000" 13 | command: bash -c "tox -e integration_tests" 14 | -------------------------------------------------------------------------------- /packages/ml_api/gunicorn_logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root, mlapi, logstash.error, logstash.access 3 | 4 | [handlers] 5 | keys=console, logstash 6 | 7 | [formatters] 8 | keys=generic, json 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=console 13 | propagate=1 14 | 15 | [logger_mlapi] 16 | level=INFO 17 | handlers=console,logstash 18 | propagate=0 19 | qualname=mlapi 20 | 21 | [logger_logstash.error] 22 | level=INFO 23 | handlers=logstash 24 | propagate=1 25 | qualname=gunicorn.error 26 | 27 | [logger_logstash.access] 28 | level=INFO 29 | handlers=logstash 30 | propagate=0 31 | qualname=gunicorn.access 32 | 33 | [handler_console] 34 | class=StreamHandler 35 | formatter=generic 36 | args=(sys.stdout, ) 37 | 38 | [handler_logstash] 39 | class=logstash.TCPLogstashHandler 40 | formatter=json 41 | args=('logstash', 5001) 42 | 43 | [formatter_generic] 44 | format=%(asctime)s [%(process)d] [%(levelname)s] %(message)s 45 | datefmt=%Y-%m-%d %H:%M:%S 46 | class=logging.Formatter 47 | 48 | [formatter_json] 49 | class=pythonjsonlogger.jsonlogger.JsonFormatter 50 | -------------------------------------------------------------------------------- /packages/ml_api/mypy.ini: 
-------------------------------------------------------------------------------- 1 | [mypy] 2 | warn_unused_ignores = True 3 | follow_imports = skip 4 | show_error_context = True 5 | warn_incomplete_stub = True 6 | ignore_missing_imports = True 7 | check_untyped_defs = True 8 | cache_dir = /dev/null 9 | warn_redundant_casts = True 10 | warn_unused_configs = True 11 | strict_optional = True 12 | -------------------------------------------------------------------------------- /packages/ml_api/requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | # ML Model 2 | tid-gradient-boosting-model>=0.3.0,<0.4.0 3 | 4 | # Old model 5 | tid-regression-model==3.1.2 6 | 7 | # Web microframework for the API 8 | flask>=1.1.1,<1.2.0 9 | connexion[swagger-ui]>=2.5.1,<2.6.0 10 | markupsafe==2.0.1 # https://github.com/aws/aws-sam-cli/issues/3661 11 | 12 | # repo maintenance tooling 13 | black>=19.10b0,<20.0 14 | flake8>=3.7.9,<4.0 15 | mypy>=0.740 16 | 17 | # Persistence 18 | sqlalchemy>=1.3.11,<1.4.0 # ORM 19 | psycopg2>=2.8.4,<2.9.0 # DB Driver 20 | alembic>=1.3.1,<1.4.0 # DB Migrations 21 | sqlalchemy_utils>=0.36.0,<0.37.0 # DB Utils 22 | 23 | # Metrics 24 | prometheus_client>=0.7.1,<0.8.0 25 | 26 | # Logging 27 | python3-logstash>=0.4.80,<0.5.0 28 | python-json-logger>=0.1.11,<0.2.0 29 | 30 | # Deployment 31 | gunicorn>=20.0.4,<20.1.0 32 | -------------------------------------------------------------------------------- /packages/ml_api/requirements/test_requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | 3 | # testing requirements 4 | pytest>=5.3.2,<6.0.0 5 | requests>=2.22.0,<2.23.0 6 | 7 | # repo maintenance tooling 8 | black>=19.10b0,<20.0 9 | flake8>=3.7.9,<4.0 10 | mypy>=0.740 11 | 12 | # diff test tooling 13 | termcolor==1.1.0 14 | yarl==1.3.0 -------------------------------------------------------------------------------- 
/packages/ml_api/run.py: -------------------------------------------------------------------------------- 1 | import prometheus_client 2 | from werkzeug.middleware.dispatcher import DispatcherMiddleware 3 | 4 | from api.app import create_app 5 | from api.config import DevelopmentConfig, setup_app_logging 6 | 7 | _config = DevelopmentConfig() 8 | 9 | # setup logging as early as possible 10 | setup_app_logging(config=_config) 11 | main_app = create_app(config_object=_config).app 12 | application = DispatcherMiddleware( 13 | app=main_app.wsgi_app, 14 | mounts={'/metrics': prometheus_client.make_wsgi_app()} 15 | ) 16 | 17 | 18 | if __name__ == "__main__": 19 | main_app.run(port=_config.SERVER_PORT, host=_config.SERVER_HOST) 20 | -------------------------------------------------------------------------------- /packages/ml_api/scripts/differential_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euox pipefail 4 | 5 | MODEL_VERSION="master" 6 | MODEL_VARIANT="candidate" 7 | NUMBER_OF_TESTS="50" 8 | 9 | CANDIDATE_MODEL_SHA="$(git rev-parse HEAD)" 10 | 11 | # required once only (or whenever you make local changes): 12 | # comment these two lines out otherwise as they can take some time. 13 | make tag-push-local 14 | 15 | # should only be run once a model version has been finalized 16 | # best practice is to run as part of a CI pipeline on merge to master branch. 
17 | make tag-push-master 18 | 19 | ## Pull latest published image 20 | env TARGET=master docker-compose --file docker/docker-compose.yml pull 21 | 22 | # start latest (master) image and local image 23 | env TARGET=master SERVER_PORT=5000 docker-compose --project-name master --file docker/docker-compose-ci-master.yml up --no-recreate -d ml_api 24 | env TARGET=$CANDIDATE_MODEL_SHA SERVER_PORT=5001 docker-compose --project-name head --file docker/docker-compose-ci-candidate.yml up --no-recreate -d ml_api 25 | 26 | ## Start the test runner containers 27 | env TARGET=master docker-compose --project-name master --file docker/docker-compose-ci-master.yml run -d --name differential-tests-expected differential-tests sleep infinity 28 | env TARGET=$CANDIDATE_MODEL_SHA docker-compose --project-name head --file docker/docker-compose-ci-candidate.yml run -d --name differential-tests-actual differential-tests sleep infinity 29 | 30 | docker ps --all 31 | 32 | echo "===== Running $CANDIDATE_MODEL_SHA ... =====" 33 | 34 | ## Compute the actual predictions (i.e. candidate model) 35 | docker exec --user root differential-tests-actual \ 36 | python3 differential_tests compute sample_payloads differential_tests/actual_results --base-url http://head_ml_api_1:5001 37 | 38 | ## Copy the actual predictions 39 | docker cp differential-tests-actual:/opt/app/differential_tests/actual_results/. differential_tests/actual_results 40 | 41 | echo "===== Running master ... =====" 42 | ## Compute the expected marginals (i.e. existing model) 43 | docker exec --user root differential-tests-expected \ 44 | python3 differential_tests compute sample_payloads differential_tests/expected_results --base-url http://master_ml_api_1:5000 45 | 46 | ## Copy the expected marginals 47 | docker cp differential-tests-expected:/opt/app/differential_tests/expected_results/. 
differential_tests/expected_results 48 | 49 | # then copy all results into the differential-tests-actual container for comparison 50 | docker cp differential_tests/expected_results/. differential-tests-actual:/opt/app/differential_tests/expected_results 51 | 52 | echo "===== Comparing $CANDIDATE_MODEL_SHA vs. master ... =====" 53 | ## Compare the expected and actual marginals 54 | docker exec differential-tests-actual \ 55 | python3 -m differential_tests compare differential_tests/expected_results differential_tests/actual_results 56 | 57 | # clear any docker containers (will stop the script if no containers found) 58 | docker rm $(docker ps -a -q) -f 59 | -------------------------------------------------------------------------------- /packages/ml_api/scripts/populate_database.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | import typing as t 5 | from random import randint, choice 6 | 7 | import pandas as pd 8 | import requests 9 | from gradient_boosting_model.config.core import config 10 | from gradient_boosting_model.processing.data_management import load_dataset 11 | 12 | LOCAL_URL = f'http://{os.getenv("DB_HOST", "localhost")}:5000' 13 | 14 | HEADERS = {"Accept": "application/json", "Content-Type": "application/json"} 15 | 16 | LOT_AREA_MAP = {"min": 1470, "max": 56600} 17 | 18 | FIRST_FLR_SF_MAP = {"min": 407, "max": 5095} 19 | 20 | SECOND_FLR_SF_MAP = {"min": 0, "max": 1862} 21 | 22 | BSMT_QUAL_VALUES = ('Gd', 'TA', 'Ex', 'Fa') 23 | 24 | 25 | def _generate_random_int(value: int, value_ranges: t.Mapping) -> int: 26 | """Generate random integer within a min and max range.""" 27 | random_value = randint(value_ranges["min"], value_ranges["max"]) 28 | return int(random_value) 29 | 30 | 31 | def _select_random_category(value: str, value_options: t.Sequence) -> str: 32 | """Select random category given a sequence of categories.""" 33 | random_category = choice(value_options) 
34 | return random_category 35 | 36 | 37 | def _prepare_inputs(dataframe: pd.DataFrame) -> pd.DataFrame: 38 | """Prepare input data by removing key rows with NA values.""" 39 | clean_inputs_df = dataframe.dropna( 40 | subset=config.model_config.features + ["KitchenQual", "LotFrontage"] 41 | ).copy() 42 | 43 | clean_inputs_df.loc[:, "FirstFlrSF"] = clean_inputs_df["FirstFlrSF"].apply( 44 | _generate_random_int, value_ranges=FIRST_FLR_SF_MAP 45 | ) 46 | clean_inputs_df.loc[:, "SecondFlrSF"] = clean_inputs_df["SecondFlrSF"].apply( 47 | _generate_random_int, value_ranges=SECOND_FLR_SF_MAP 48 | ) 49 | clean_inputs_df.loc[:, "LotArea"] = clean_inputs_df["LotArea"].apply( 50 | _generate_random_int, value_ranges=LOT_AREA_MAP 51 | ) 52 | 53 | clean_inputs_df.loc[:, "BsmtQual"] = clean_inputs_df["BsmtQual"].apply( 54 | _select_random_category, value_options=BSMT_QUAL_VALUES 55 | ) 56 | 57 | return clean_inputs_df 58 | 59 | 60 | def populate_database(n_predictions: int = 500, anomaly: bool = False) -> None: 61 | """ 62 | Manipulate the test data to generate random 63 | predictions and save them to the database. 64 | Before running this script, ensure that the 65 | API and Database docker containers are running. 66 | """ 67 | 68 | print(f"Preparing to generate: {n_predictions} predictions.") 69 | 70 | # Load the gradient boosting test dataset which 71 | # is included in the model package 72 | test_inputs_df = load_dataset(file_name="test.csv") 73 | clean_inputs_df = _prepare_inputs(dataframe=test_inputs_df) 74 | if len(clean_inputs_df) < n_predictions: 75 | print( 76 | f"If you want {n_predictions} predictions, you need to" 77 | "extend the script to handle more predictions." 
78 | ) 79 | 80 | if anomaly: 81 | # set extremely low values to generate an outlier 82 | n_predictions = 1 83 | clean_inputs_df.loc[:, "FirstFlrSF"] = 1 84 | clean_inputs_df.loc[:, "LotArea"] = 1 85 | clean_inputs_df.loc[:, "OverallQual"] = 1 86 | clean_inputs_df.loc[:, "GrLivArea"] = 1 87 | 88 | clean_inputs_df = clean_inputs_df.where(pd.notnull(clean_inputs_df), None) 89 | for index, data in clean_inputs_df.iterrows(): 90 | if index > n_predictions: 91 | if anomaly: 92 | print('Created 1 anomaly') 93 | break 94 | 95 | response = requests.post( 96 | f"{LOCAL_URL}/v1/predictions/regression", 97 | headers=HEADERS, 98 | json=[data.to_dict()], 99 | ) 100 | response.raise_for_status() 101 | 102 | if index % 50 == 0: 103 | print(f"{index} predictions complete") 104 | 105 | # prevent overloading the server 106 | time.sleep(0.5) 107 | 108 | print("Prediction generation complete.") 109 | 110 | 111 | if __name__ == "__main__": 112 | anomaly = False 113 | parser = argparse.ArgumentParser( 114 | description='Send random requests to House Price API.') 115 | parser.add_argument('--anomaly', help="generate unusual inputs") 116 | args = parser.parse_args() 117 | if args.anomaly: 118 | print("Generating unusual inputs") 119 | anomaly = True 120 | 121 | populate_database(n_predictions=500, anomaly=anomaly) 122 | -------------------------------------------------------------------------------- /packages/ml_api/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trainindata/testing-and-monitoring-ml-deployments/88d0fdac0898178f5ed9611e44a56ea5e53c9dd5/packages/ml_api/tests/__init__.py -------------------------------------------------------------------------------- /packages/ml_api/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from unittest import mock 4 | import pytest 5 | from gradient_boosting_model.processing.data_management 
import load_dataset 6 | from sqlalchemy_utils import create_database, database_exists 7 | 8 | from api.app import create_app 9 | from api.config import TestingConfig 10 | from api.persistence import core 11 | 12 | 13 | @pytest.fixture(scope='session') 14 | def _db(): 15 | db_url = TestingConfig.SQLALCHEMY_DATABASE_URI 16 | if not database_exists(db_url): 17 | create_database(db_url) 18 | # alembic can be configured through the configuration file. For testing 19 | # purposes 'env.py' also checks the 'ALEMBIC_DB_URI' variable first. 20 | engine = core.create_db_engine_from_config(config=TestingConfig()) 21 | evars = {"ALEMBIC_DB_URI": db_url} 22 | with mock.patch.dict(os.environ, evars): 23 | core.run_migrations() 24 | 25 | yield engine 26 | 27 | 28 | @pytest.fixture(scope='session') 29 | def _db_session(_db): 30 | """ Create DB session for testing. 31 | """ 32 | session = core.create_db_session(engine=_db) 33 | yield session 34 | 35 | 36 | @pytest.fixture(scope='session') 37 | def app(_db_session): 38 | app = create_app(config_object=TestingConfig(), db_session=_db_session).app 39 | with app.app_context(): 40 | yield app 41 | 42 | 43 | @pytest.fixture 44 | def client(app): 45 | with app.test_client() as client: 46 | yield client # Has to be yielded to access session cookies 47 | 48 | 49 | @pytest.fixture 50 | def test_inputs_df(): 51 | # Load the gradient boosting test dataset which 52 | # is included in the model package 53 | test_inputs_df = load_dataset(file_name="test.csv") 54 | return test_inputs_df.copy(deep=True) 55 | -------------------------------------------------------------------------------- /packages/ml_api/tests/test_api.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | 4 | import numpy as np 5 | import pytest 6 | 7 | from api.persistence.data_access import SECONDARY_VARIABLES_TO_RENAME 8 | from api.persistence.models import ( 9 | GradientBoostingModelPredictions, 10 | 
LassoModelPredictions, 11 | ) 12 | from gradient_boosting_model.processing.data_management import load_dataset 13 | 14 | 15 | @pytest.mark.integration 16 | def test_health_endpoint(client): 17 | # When 18 | response = client.get("/") 19 | 20 | # Then 21 | assert response.status_code == 200 22 | assert json.loads(response.data) == {"status": "ok"} 23 | 24 | 25 | @pytest.mark.integration 26 | @pytest.mark.parametrize( 27 | "api_endpoint, expected_no_predictions", 28 | ( 29 | ( 30 | "v1/predictions/regression", 31 | # test csv contains 1459 rows 32 | # we expect 2 rows to be filtered 33 | 1451, 34 | ), 35 | ( 36 | "v1/predictions/gradient", 37 | # we expect 8 rows to be filtered 38 | 1457, 39 | ), 40 | ), 41 | ) 42 | def test_prediction_endpoint( 43 | api_endpoint, expected_no_predictions, client, test_inputs_df 44 | ): 45 | # Given 46 | # Load the test dataset which is included in the model package 47 | test_inputs_df = load_dataset(file_name="test.csv") # dataframe 48 | if api_endpoint == "v1/predictions/regression": 49 | # adjust column names to those expected by the secondary model 50 | test_inputs_df.rename(columns=SECONDARY_VARIABLES_TO_RENAME, inplace=True) 51 | 52 | # When 53 | response = client.post(api_endpoint, json=test_inputs_df.to_dict(orient="records")) 54 | 55 | # Then 56 | assert response.status_code == 200 57 | data = json.loads(response.data) 58 | assert data["errors"] is None 59 | assert len(data["predictions"]) == expected_no_predictions 60 | 61 | 62 | # parameterizationa allows us to try many combinations of data 63 | # within the same test, see the pytest docs for details: 64 | # https://docs.pytest.org/en/latest/parametrize.html 65 | @pytest.mark.parametrize( 66 | "field, field_value, index, expected_error", 67 | ( 68 | ( 69 | "BldgType", 70 | 1, # expected str 71 | 33, 72 | {"33": {"BldgType": ["Not a valid string."]}}, 73 | ), 74 | ( 75 | "GarageArea", # model feature 76 | "abc", # expected float 77 | 45, 78 | {"45": {"GarageArea": ["Not a 
valid number."]}}, 79 | ), 80 | ( 81 | "CentralAir", 82 | np.nan, # nan not allowed 83 | 34, 84 | {"34": {"CentralAir": ["Field may not be null."]}}, 85 | ), 86 | ("LotArea", "", 2, {"2": {"LotArea": ["Not a valid integer."]}}), 87 | ), 88 | ) 89 | @pytest.mark.integration 90 | def test_prediction_validation( 91 | field, field_value, index, expected_error, client, test_inputs_df 92 | ): 93 | # Given 94 | # Check gradient_boosting_model.processing.validation import HouseDataInputSchema 95 | # and you will see the expected values for the inputs to the house price prediction 96 | # model. In this test, inputs are changed to incorrect values to check the validation. 97 | test_inputs_df.loc[index, field] = field_value 98 | 99 | # When 100 | response = client.post( 101 | "/v1/predictions/gradient", json=test_inputs_df.to_dict(orient="records") 102 | ) 103 | 104 | # Then 105 | assert response.status_code == 400 106 | data = json.loads(response.data) 107 | assert data == expected_error 108 | 109 | 110 | @pytest.mark.integration 111 | def test_prediction_data_saved(client, app, test_inputs_df): 112 | # Given 113 | initial_gradient_count = app.db_session.query( 114 | GradientBoostingModelPredictions 115 | ).count() 116 | initial_lasso_count = app.db_session.query(LassoModelPredictions).count() 117 | 118 | # When 119 | response = client.post( 120 | "/v1/predictions/regression", json=test_inputs_df.to_dict(orient="records") 121 | ) 122 | 123 | # Then 124 | assert response.status_code == 200 125 | assert ( 126 | app.db_session.query(LassoModelPredictions).count() == initial_lasso_count + 1 127 | ) 128 | 129 | # The gradient prediction save occurs on a separate async thread which can take 130 | # time to complete. We pause the test briefly to allow the save operation to finish. 
131 | time.sleep(2) 132 | assert ( 133 | app.db_session.query(GradientBoostingModelPredictions).count() 134 | == initial_gradient_count + 1 135 | ) 136 | -------------------------------------------------------------------------------- /packages/ml_api/tests/test_back_to_back_models.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | from gradient_boosting_model.processing.data_management import load_dataset 5 | 6 | from api.persistence.data_access import SECONDARY_VARIABLES_TO_RENAME 7 | from differential_tests.compare import compare_differences 8 | 9 | 10 | @pytest.mark.differential 11 | def test_model_prediction_differentials(client): 12 | test_inputs_df = load_dataset(file_name="test.csv") 13 | old_model_inputs_df = test_inputs_df.rename( 14 | columns=SECONDARY_VARIABLES_TO_RENAME 15 | ) 16 | 17 | new_model_response = client.post( 18 | "v1/predictions/gradient", json=test_inputs_df.to_dict(orient="records") 19 | ) 20 | new_model_predictions = json.loads(new_model_response.data)["predictions"] 21 | 22 | old_model_response = client.post( 23 | "v1/predictions/regression", 24 | json=old_model_inputs_df.to_dict(orient="records"), 25 | ) 26 | old_model_predictions = json.loads(old_model_response.data)["predictions"] 27 | 28 | # We just pass in the first 10 rows as the two models' validation differs 29 | # which means they filter out a slightly different number of rows 30 | # which would cause the differential tests to fail. 31 | compare_differences( 32 | expected_predictions=new_model_predictions[:10], 33 | actual_predictions=old_model_predictions[:10], 34 | # you would adjust the rel_tol level parameter on your model. 35 | # right now this is extremely permissive of variation. 
36 | rel_tol=0.2, 37 | ) 38 | -------------------------------------------------------------------------------- /packages/ml_api/tests/test_persistence.py: -------------------------------------------------------------------------------- 1 | from unittest import mock 2 | import pytest 3 | 4 | from api.persistence.data_access import PredictionPersistence, ModelType 5 | 6 | from api.persistence.models import ( 7 | GradientBoostingModelPredictions, 8 | LassoModelPredictions, 9 | ) 10 | 11 | 12 | # parameterizationa allows us to try many combinations of data 13 | # within the same test, see the pytest docs for details: 14 | # https://docs.pytest.org/en/latest/parametrize.html 15 | @pytest.mark.parametrize( 16 | "model_type, model,", 17 | ( 18 | (ModelType.GRADIENT_BOOSTING, GradientBoostingModelPredictions), 19 | (ModelType.LASSO, LassoModelPredictions), 20 | ), 21 | ) 22 | def test_data_access(model_type, model, test_inputs_df): 23 | # Given 24 | # We mock the database session 25 | mock_session = mock.MagicMock() 26 | _persistence = PredictionPersistence(db_session=mock_session) 27 | 28 | # When 29 | _persistence.make_save_predictions( 30 | db_model=model_type, input_data=test_inputs_df.to_dict(orient="records") 31 | ) 32 | 33 | # Then 34 | assert mock_session.commit.call_count == 1 35 | assert mock_session.add.call_count == 1 36 | assert isinstance(mock_session.add.call_args[0][0], model) 37 | -------------------------------------------------------------------------------- /packages/ml_api/tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = integration_tests,unit_tests,differential_tests,typechecks,stylechecks 3 | skipsdist = True 4 | 5 | 6 | [testenv] 7 | install_command = pip install {opts} {packages} 8 | 9 | deps = 10 | -rrequirements/test_requirements.txt 11 | 12 | setenv = 13 | PYTHONPATH=. 
14 | 15 | passenv = 16 | # A list of wildcard environment variable names which shall be copied from 17 | # the tox invocation environment to the test environment when executing test commands 18 | DB_* 19 | SHADOW_MODE_ACTIVE 20 | 21 | commands= 22 | py.test 23 | 24 | 25 | [testenv:integration_tests] 26 | envdir = {toxworkdir}/integration_tests 27 | deps = 28 | {[testenv]deps} 29 | 30 | passenv = 31 | {[testenv]passenv} 32 | 33 | setenv = 34 | PYTHONPATH=. 35 | DB_USER={env:DB_USER:test_user} 36 | DB_PASSWORD={env:DB_PASSWORD:password} 37 | DB_HOST={env:DB_HOST:localhost} 38 | DB_PORT={env:DB_PORT:6608} 39 | DB_NAME={env:DB_NAME:ml_api_test} 40 | SHADOW_MODE_ACTIVE={env:SHADOW_MODE_ACTIVE:true} 41 | 42 | commands = 43 | pytest \ 44 | -s \ 45 | -vv \ 46 | -m integration \ 47 | {posargs:tests/} 48 | 49 | 50 | [testenv:unit_tests] 51 | envdir = {toxworkdir}/integration_tests 52 | deps = 53 | {[testenv]deps} 54 | 55 | passenv = 56 | {[testenv]passenv} 57 | 58 | setenv = 59 | PYTHONPATH=. 60 | 61 | commands = 62 | pytest \ 63 | -s \ 64 | -vv \ 65 | -m "not integration and not differential" \ 66 | {posargs:tests/} 67 | 68 | 69 | [testenv:differential_tests] 70 | envdir = {toxworkdir}/integration_tests 71 | deps = 72 | {[testenv]deps} 73 | 74 | passenv = 75 | {[testenv]passenv} 76 | 77 | setenv = 78 | PYTHONPATH=. 79 | DB_USER={env:DB_USER:test_user} 80 | DB_PASSWORD={env:DB_PASSWORD:password} 81 | DB_HOST={env:DB_HOST:localhost} 82 | DB_PORT={env:DB_PORT:6608} 83 | DB_NAME={env:DB_NAME:ml_api_test} 84 | SHADOW_MODE_ACTIVE={env:SHADOW_MODE_ACTIVE:true} 85 | 86 | commands = 87 | pytest \ 88 | -s \ 89 | -vv \ 90 | -m differential \ 91 | {posargs:tests/} 92 | 93 | 94 | [testenv:generate_predictions] 95 | envdir = {toxworkdir}/generate_predictions 96 | deps = 97 | {[testenv]deps} 98 | 99 | passenv = 100 | {[testenv]passenv} 101 | 102 | setenv = 103 | PYTHONPATH=. 
104 | DB_HOST=localhost 105 | 106 | commands = python scripts/populate_database.py {posargs} 107 | 108 | 109 | [testenv:typechecks] 110 | envdir = {toxworkdir}/integration_tests 111 | 112 | deps = 113 | {[testenv:integration_tests]deps} 114 | 115 | commands = {posargs:mypy api} 116 | 117 | 118 | [testenv:stylechecks] 119 | envdir = {toxworkdir}/integration_tests 120 | 121 | deps = 122 | {[testenv:integration_tests]deps} 123 | 124 | commands = {posargs:flake8 api tests} 125 | 126 | 127 | [flake8] 128 | exclude = .git,env 129 | max-line-length = 90 130 | 131 | 132 | [pytest] 133 | markers = 134 | integration: mark a test as an integration test. 135 | differential: mark a test as a differential test. 136 | 137 | filterwarnings = 138 | ignore::DeprecationWarning 139 | ignore::RuntimeWarning 140 | ignore::UserWarning 141 | ignore::FutureWarning 142 | -------------------------------------------------------------------------------- /research_phase/requirements.txt: -------------------------------------------------------------------------------- 1 | # We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release) 2 | # to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small 3 | # updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes. 4 | numpy>=1.20.0,<1.21.0 5 | pandas>=1.3.5,<1.4.0 6 | scikit-learn>=1.0.2,<1.1.0 7 | jupyter>=1.0.0,<1.1.0 8 | feature_engine>=0.3.1,<0.4.0 9 | matplotlib>=3.1.2,<4.0.0 --------------------------------------------------------------------------------