├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── course-content-report.md ├── .gitignore ├── Jenkinsfile ├── Jenkinsfile_data_pipeline ├── Jenkinsfile_model_serving ├── LICENSE ├── README.md ├── data_pipeline ├── .dockerignore ├── .gitignore ├── Makefile ├── README.md ├── dags │ ├── db_to_offline_store.py │ ├── materialize_offline_to_online.py │ ├── stream_to_stores.py │ └── utils.py ├── data_sources │ └── driver_stats.parquet ├── deployment │ ├── .env │ ├── Dockerfile │ ├── deploy.sh │ └── requirements.txt ├── dev_requirements.txt ├── examples │ ├── get_historical_features.py │ └── get_online_features.py ├── feature_repo │ ├── data_sources.py │ ├── entities.py │ ├── feature_store.yaml │ └── features.py ├── scripts │ ├── feast_helper.sh │ └── utils │ │ └── logger.py └── src │ ├── db_to_offline_store │ ├── clean.py │ ├── explore_and_validate.py │ └── ingest.py │ ├── stream_to_stores │ ├── ingest.py │ └── processor.py │ └── utils │ ├── __init__.py │ └── logger.py ├── model_serving ├── .dockerignore ├── .env ├── .gitignore ├── Makefile ├── README.md ├── artifacts │ └── .gitkeep ├── dags │ ├── batch_serving_dag.py │ └── utils.py ├── data │ └── batch_request.csv ├── deployment │ ├── .env │ ├── Dockerfile │ ├── deploy.sh │ ├── docker-compose.yml │ └── requirements.txt ├── dev_requirements.txt ├── scripts │ └── bentoml_helper.sh └── src │ ├── batch_prediction.py │ ├── bentoml_service.py │ ├── data_extraction.py │ └── utils.py ├── monitoring_service ├── .dockerignore ├── .gitignore ├── Makefile ├── README.md ├── dashboards │ ├── bentoml_dashboard.json │ ├── classification_performance.json │ └── data_drift.json ├── data │ ├── .gitkeep │ └── orig_driver_stats.parquet ├── deployment │ ├── .env │ ├── Dockerfile │ ├── deploy.sh │ ├── docker-compose.yml │ └── requirements.txt ├── dev_requirements.txt ├── nbs │ ├── prepare_datasets.ipynb │ └── test_datasets.ipynb └── src │ ├── mock_request.py │ ├── monitoring_service.py │ └── utils.py ├── stream_emitting ├── Dockerfile ├── README.md ├── data │ └── driver_stats_stream.parquet ├── deploy.sh ├── dev_requirements.txt ├── docker-compose.yaml └── stream_emitting.py └── training_pipeline ├── .dockerignore ├── .env ├── .gitignore ├── Makefile ├── README.md ├── artifacts └── .gitkeep ├── dags ├── training_dag.py └── utils.py ├── data └── driver_orders.csv ├── deployment ├── .env ├── Dockerfile ├── deploy.sh └── requirements.txt ├── dev_requirements.txt ├── nbs ├── data │ ├── exp_driver_orders.csv │ └── exp_driver_stats.parquet ├── poc-integrate-mlflow.ipynb └── poc-training-code.ipynb └── src ├── data_extraction.py ├── data_preparation.py ├── data_validation.py ├── model_evaluation.py ├── model_training.py ├── model_validation.py └── utils.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: "" 7 | --- 8 | 9 | **Bug description** 10 | 11 | - Bug name 12 | - Link to the lesson 13 | - Step where the bug occurs (screenshot of that step in the lesson) 14 | 15 | **Steps to reproduce** 16 | 17 | 1. Go to ABC 18 | 2. Click ABC 19 | 3. Scroll down to the ABC section 20 | 4. Error ABC appears 21 | 22 | **Expected behavior** 23 | 24 | Describe the expected output/behavior instead of the bug 25 | 26 | **Attachments** 27 | 28 | Attach the error as an image or as text 29 | 30 | **Specs** 31 | 32 | - Software: versions of docker, docker-compose, software tools, etc. 33 | - System: information about OS, CPU, Memory, GPU, etc.
34 | 35 | **Additional context** 36 | 37 | Anything else 38 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/course-content-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Course content report 3 | about: Give feedback on the content of a course 4 | title: "[COURSE]" 5 | labels: documentation 6 | assignees: "" 7 | --- 8 | 9 | **Course** 10 | 11 | - Course name 12 | - Lesson name within the course 13 | 14 | ## 1. Name of the issue, lesson, or section in the lesson 15 | 16 | - Screenshot of the part of the lesson that needs editing 17 | - Description of the issue, or feedback about a part of the lesson 18 | - Solution/fix (if any) 19 | 20 | ## 2. Name of the issue, lesson, or section in the lesson 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .venv 106 | env/ 107 | venv/ 108 | ENV/ 109 | env.bak/ 110 | venv.bak/ 111 | 112 | # Spyder project settings 113 | .spyderproject 114 | .spyproject 115 | 116 | # Rope project settings 117 | .ropeproject 118 | 119 | # mkdocs documentation 120 | /site 121 | 122 | # mypy 123 | .mypy_cache/ 124 | .dmypy.json 125 | dmypy.json 126 | 127 | # Pyre type checker 128 | .pyre/ 129 | 130 | # Common 131 | .DS_Store 132 | .vscode 133 | bin 134 | tmp 135 | -------------------------------------------------------------------------------- /Jenkinsfile: -------------------------------------------------------------------------------- 1 | pipeline { 2 | agent any 3 | 4 | stages { 5 | stage('Build') { 6 | steps { 7 | echo 'Building something..' 8 | } 9 | } 10 | stage('Test') { 11 | steps { 12 | echo 'Testing something..' 13 | } 14 | } 15 | stage('Deploy') { 16 | steps { 17 | echo 'Deploying something..' 18 | } 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /Jenkinsfile_data_pipeline: -------------------------------------------------------------------------------- 1 | pipeline { 2 | agent { 3 | docker { 4 | image 'python:3.9' 5 | } 6 | } 7 | 8 | stages { 9 | stage('build data pipeline') { 10 | when {changeset "data_pipeline/**" } 11 | 12 | steps { 13 | echo 'Building data pipeline..' 14 | sh 'cd data_pipeline && make build_image' 15 | } 16 | } 17 | 18 | stage('test data pipeline') { 19 | when {changeset "data_pipeline/**" } 20 | 21 | steps { 22 | echo 'Testing data pipeline..' 23 | } 24 | } 25 | 26 | stage('deploy data pipeline') { 27 | when {changeset "data_pipeline/**" } 28 | 29 | steps { 30 | sh 'cd data_pipeline && make deploy_dags' 31 | } 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /Jenkinsfile_model_serving: -------------------------------------------------------------------------------- 1 | pipeline { 2 | agent { docker { image 'python:3.9' } } 3 | 4 | stages { 5 | stage('build model serving') { 6 | when {changeset "model_serving/**" } 7 | 8 | steps { 9 | echo 'Building model serving..' 10 | sh 'cd model_serving && make build_image' 11 | } 12 | } 13 | 14 | stage('test model serving') { 15 | when {changeset "model_serving/**" } 16 | 17 | steps { 18 | echo 'Testing model serving..' 
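// NOTE: this test stage currently only echoes a message; a real pipeline would run the project's test suite here, e.g. sh 'cd model_serving && pytest' (a hypothetical command -- no test target is defined in this repo).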
19 | } 20 | } 21 | 22 | stage('deploy model serving') { 23 | parallel { 24 | stage('batch serving pipeline') { 25 | when {changeset "model_serving/**" } 26 | 27 | steps { 28 | sh 'cd model_serving && make deploy_dags' 29 | } 30 | } 31 | 32 | stage('online serving API') { 33 | when {changeset "model_serving/**" } 34 | 35 | steps { 36 | sh 'cd model_serving && make compose_up' 37 | } 38 | } 39 | } 40 | } 41 | } 42 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 MLOpsVN 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mlops-crash-course-code 2 | 3 | ## Requirements 4 | 5 | ```bash 6 | Docker version 20.10.17, build 100c701 7 | Docker Compose version v2.10.2 8 | ``` 9 | 10 | Tested on OS: 11 | 12 | - Ubuntu 20.04 13 | - MacOS using M1, M2, Intel CPU 14 | -------------------------------------------------------------------------------- /data_pipeline/.dockerignore: -------------------------------------------------------------------------------- 1 | .dockerignore 2 | __pycache__ 3 | dev_requirements.txt 4 | dags 5 | examples 6 | feature_repo 7 | -------------------------------------------------------------------------------- /data_pipeline/.gitignore: -------------------------------------------------------------------------------- 1 | feature_repo/registry 2 | __pycache__ -------------------------------------------------------------------------------- /data_pipeline/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/bash 2 | 3 | ENV_FILE="./deployment/.env" 4 | 5 | feast_apply: 6 | source ${ENV_FILE} && bash scripts/feast_helper.sh apply 7 | 8 | build_image: 9 | source ${ENV_FILE} && bash deployment/deploy.sh build 10 | 11 | build_push_image: 12 | source ${ENV_FILE} && bash deployment/deploy.sh build_push 13 | 14 | deploy_dags: 15 | source ${ENV_FILE} && bash deployment/deploy.sh dags 16 | 17 | deploy_feature_repo: 18 | source ${ENV_FILE} && bash deployment/deploy.sh feature_repo 19 | -------------------------------------------------------------------------------- /data_pipeline/README.md: 
-------------------------------------------------------------------------------- 1 | # Data pipeline 2 | 3 | ```bash 4 | # Create/update feature store 5 | make feast_apply 6 | 7 | # Build image and deploy DAGs 8 | make build_image && make deploy_dags 9 | 10 | # Go to airflow UI 11 | # Set variable MLOPS_CRASH_COURSE_CODE_DIR=path/to/mlops-crash-course-code 12 | # Run dags 13 | 14 | # Deploy feature repo to training pipeline 15 | make deploy_feature_repo 16 | ``` 17 | -------------------------------------------------------------------------------- /data_pipeline/dags/db_to_offline_store.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 5 | sys.path.append(BASE_DIR) 6 | 7 | import pendulum 8 | 9 | from airflow import DAG 10 | from airflow.providers.docker.operators.docker import DockerOperator 11 | 12 | from utils import * 13 | 14 | with DAG( 15 | dag_id="db_to_offline_store", 16 | default_args=DefaultConfig.DEFAULT_DAG_ARGS, 17 | schedule_interval="@once", 18 | start_date=pendulum.datetime(2022, 1, 1, tz="UTC"), 19 | catchup=False, 20 | tags=["data_pipeline"], 21 | ) as dag: 22 | ingest_task = DockerOperator( 23 | task_id="ingest_task", 24 | **DefaultConfig.DEFAULT_DOCKER_OPERATOR_ARGS, 25 | command="/bin/bash -c 'cd src/db_to_offline_store && python ingest.py'", 26 | ) 27 | 28 | clean_task = DockerOperator( 29 | task_id="clean_task", 30 | **DefaultConfig.DEFAULT_DOCKER_OPERATOR_ARGS, 31 | command="/bin/bash -c 'cd src/db_to_offline_store && python clean.py'", 32 | ) 33 | 34 | explore_and_validate_task = DockerOperator( 35 | task_id="explore_and_validate_task", 36 | **DefaultConfig.DEFAULT_DOCKER_OPERATOR_ARGS, 37 | command="/bin/bash -c 'cd src/db_to_offline_store && python explore_and_validate.py'", 38 | ) 39 | 40 | ingest_task >> clean_task >> explore_and_validate_task -------------------------------------------------------------------------------- /data_pipeline/dags/materialize_offline_to_online.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 5 | sys.path.append(BASE_DIR) 6 | 7 | import pendulum 8 | 9 | from airflow import DAG 10 | from airflow.providers.docker.operators.docker import DockerOperator 11 | 12 | from utils import * 13 | 14 | with DAG( 15 | dag_id="materialize_offline_to_online", 16 | default_args=DefaultConfig.DEFAULT_DAG_ARGS, 17 | schedule_interval="@once", 18 | start_date=pendulum.datetime(2022, 1, 1, tz="UTC"), 19 | catchup=False, 20 | tags=["data_pipeline"], 21 | ) as dag: 22 | materialize_task = DockerOperator( 23 | task_id="materialize_task", 24 | **DefaultConfig.DEFAULT_DOCKER_OPERATOR_ARGS, 25 | command="/bin/bash ./scripts/feast_helper.sh materialize", 26 | ) -------------------------------------------------------------------------------- /data_pipeline/dags/stream_to_stores.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 5 | sys.path.append(BASE_DIR) 6 | 7 | import pendulum 8 | 9 | from airflow import DAG 10 | from airflow.providers.docker.operators.docker import DockerOperator 11 | 12 | from utils import * 13 | 14 | with DAG( 15 | dag_id="stream_to_stores", 16 | default_args=DefaultConfig.DEFAULT_DAG_ARGS, 17 | schedule_interval="@once", 18 | start_date=pendulum.datetime(2022, 1, 1, tz="UTC"), 19 |
catchup=False, 20 | tags=["data_pipeline"], 21 | ) as dag: 22 | stream_to_online_task = DockerOperator( 23 | task_id="stream_to_online_task", 24 | command="/bin/bash -c 'cd src/stream_to_stores && python ingest.py --store online'", 25 | **DefaultConfig.DEFAULT_DOCKER_OPERATOR_ARGS, 26 | ) 27 | 28 | stream_to_offline_task = DockerOperator( 29 | task_id="stream_to_offline_task", 30 | **DefaultConfig.DEFAULT_DOCKER_OPERATOR_ARGS, 31 | command="/bin/bash -c 'cd src/stream_to_stores && python ingest.py --store offline'", 32 | ) -------------------------------------------------------------------------------- /data_pipeline/dags/utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pendulum 3 | from airflow.models import Variable 4 | from docker.types import Mount 5 | 6 | 7 | class AppConst: 8 | DOCKER_USER = Variable.get("DOCKER_USER", "mlopsvn") 9 | 10 | 11 | class AppPath: 12 | MLOPS_CRASH_COURSE_CODE_DIR = Path(Variable.get("MLOPS_CRASH_COURSE_CODE_DIR")) 13 | DATA_PIPELINE_DIR = MLOPS_CRASH_COURSE_CODE_DIR / "data_pipeline" 14 | FEATURE_REPO = DATA_PIPELINE_DIR / "feature_repo" 15 | 16 | 17 | class DefaultConfig: 18 | DEFAULT_DAG_ARGS = { 19 | "owner": "mlopsvn", 20 | "retries": 0, 21 | "retry_delay": pendulum.duration(seconds=20), 22 | } 23 | 24 | DEFAULT_DOCKER_OPERATOR_ARGS = { 25 | "image": f"{AppConst.DOCKER_USER}/mlops_crash_course/data_pipeline:latest", 26 | "api_version": "auto", 27 | "auto_remove": True, 28 | "mounts": [ 29 | # feature repo 30 | Mount( 31 | source=AppPath.FEATURE_REPO.absolute().as_posix(), 32 | target="/data_pipeline/feature_repo", 33 | type="bind", 34 | ), 35 | ], 36 | # Fix a permission denied when using DockerOperator in Airflow 37 | # Ref: https://stackoverflow.com/a/70100729 38 | # "docker_url": "tcp://docker-proxy:2375", 39 | } 40 | -------------------------------------------------------------------------------- /data_pipeline/data_sources/driver_stats.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLOpsVN/mlops-crash-course-code/8124d3c6afe344ff0df618ac99fcb1e5b1148be0/data_pipeline/data_sources/driver_stats.parquet -------------------------------------------------------------------------------- /data_pipeline/deployment/.env: -------------------------------------------------------------------------------- 1 | export DOCKER_USER="mlopsvn" 2 | export DAGS_DIR="../../mlops-crash-course-platform/airflow/run_env/dags/data_pipeline" -------------------------------------------------------------------------------- /data_pipeline/deployment/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim AS build 2 | 3 | RUN apt-get update 4 | RUN python -m venv /opt/venv 5 | ENV PATH="/opt/venv/bin:$PATH" 6 | 7 | COPY deployment/requirements.txt . 8 | RUN pip install -r requirements.txt 9 | 10 | FROM python:3.9-slim 11 | 12 | RUN apt-get update && \ 13 | apt-get install -y --no-install-recommends \ 14 | openjdk-11-jre-headless && \ 15 | apt-get autoremove -yqq --purge && \ 16 | apt-get clean && \ 17 | rm -rf /var/lib/apt/lists/* 18 | 19 | ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64 20 | 21 | COPY --from=build /opt/venv /opt/venv 22 | ENV PATH="/opt/venv/bin:$PATH" 23 | 24 | COPY . 
/data_pipeline 25 | WORKDIR /data_pipeline 26 | -------------------------------------------------------------------------------- /data_pipeline/deployment/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cmd=$1 4 | 5 | # constants 6 | DOCKER_USER="$DOCKER_USER" 7 | PROJECT="mlops_crash_course" 8 | IMAGE_NAME="data_pipeline" 9 | IMAGE_TAG=$(git describe --always) 10 | 11 | if [[ -z "$DOCKER_USER" ]]; then 12 | echo "Missing \$DOCKER_USER env var" 13 | exit 1 14 | fi 15 | 16 | usage() { 17 | echo "deploy.sh " 18 | echo "Available commands:" 19 | echo " build build image" 20 | echo " push push image" 21 | echo " build_push build and push image" 22 | echo " dags deploy airflow dags" 23 | echo " feature_repo deploy feature repo and related scripts" 24 | } 25 | 26 | if [[ -z "$cmd" ]]; then 27 | echo "Missing command" 28 | usage 29 | exit 1 30 | fi 31 | 32 | build() { 33 | docker build --tag $DOCKER_USER/$PROJECT/$IMAGE_NAME:$IMAGE_TAG -f deployment/Dockerfile . 34 | docker tag $DOCKER_USER/$PROJECT/$IMAGE_NAME:$IMAGE_TAG $DOCKER_USER/$PROJECT/$IMAGE_NAME:latest 35 | } 36 | 37 | push() { 38 | docker push $DOCKER_USER/$PROJECT/$IMAGE_NAME:$IMAGE_TAG 39 | docker push $DOCKER_USER/$PROJECT/$IMAGE_NAME:latest 40 | } 41 | 42 | deploy_dags() { 43 | if [[ -z "$DAGS_DIR" ]]; then 44 | echo "Missing DAGS_DIR env var" 45 | usage 46 | exit 1 47 | fi 48 | 49 | mkdir -p "$DAGS_DIR" 50 | cp dags/* "$DAGS_DIR" 51 | } 52 | 53 | deploy_feature_repo() { 54 | rsync -avr data_sources ../training_pipeline 55 | rsync -avr feature_repo ../training_pipeline --exclude registry 56 | 57 | rsync -avr data_sources ../model_serving 58 | rsync -avr feature_repo ../model_serving --exclude registry 59 | 60 | rsync -avr data_sources ../monitoring_service 61 | rsync -avr feature_repo ../monitoring_service --exclude registry 62 | rsync -avr scripts ../monitoring_service 63 | } 64 | 65 | shift 66 | 67 | case $cmd in 68 | build) 69 | build "$@" 70 | ;; 71 | push) 72 | push "$@" 73 | ;; 74 | build_push) 75 | build "$@" 76 | push "$@" 77 | ;; 78 | dags) 79 | deploy_dags "$@" 80 | ;; 81 | feature_repo) 82 | deploy_feature_repo "$@" 83 | ;; 84 | *) 85 | echo -n "Unknown command: $cmd" 86 | usage 87 | exit 1 88 | ;; 89 | esac -------------------------------------------------------------------------------- /data_pipeline/deployment/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.0.1 2 | feast[redis]==0.24.0 -------------------------------------------------------------------------------- /data_pipeline/dev_requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.3.4 2 | apache-airflow-providers-docker==3.1.0 3 | feast[redis]==0.24.0 4 | pandas==1.4.4 5 | pendulum==2.1.2 6 | pyspark==3.0.1 7 | -------------------------------------------------------------------------------- /data_pipeline/examples/get_historical_features.py: -------------------------------------------------------------------------------- 1 | from feast import FeatureStore 2 | import pandas as pd 3 | from datetime import datetime 4 | 5 | store = FeatureStore(repo_path="../feature_repo") 6 | 7 | entity_df = pd.DataFrame.from_dict( 8 | { 9 | "driver_id": [1001, 1002, 1003], 10 | "datetime": [ 11 | datetime(2022, 5, 11, 11, 59, 59), 12 | datetime(2022, 6, 12, 1, 15, 10), 13 | datetime.now(), 14 | ], 15 | } 16 | ) 17 | training_df = store.get_historical_features( 18 | 
entity_df=entity_df, features=["driver_stats:acc_rate", "driver_stats:conv_rate"], 19 | ).to_df() 20 | print(training_df.head()) 21 | -------------------------------------------------------------------------------- /data_pipeline/examples/get_online_features.py: -------------------------------------------------------------------------------- 1 | from feast import FeatureStore 2 | 3 | store = FeatureStore(repo_path="../feature_repo") 4 | 5 | features = store.get_online_features( 6 | features=["driver_stats:acc_rate", "driver_stats:conv_rate"], 7 | entity_rows=[{"driver_id": 1001,}], 8 | ).to_dict(include_event_timestamps=True) 9 | 10 | 11 | def print_online_features(features): 12 | for key, value in sorted(features.items()): 13 | print(key, " : ", value) 14 | 15 | 16 | print_online_features(features) 17 | -------------------------------------------------------------------------------- /data_pipeline/feature_repo/data_sources.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | from feast import FileSource, KafkaSource 4 | from feast.data_format import JsonFormat, ParquetFormat 5 | 6 | driver_stats_parquet_file = "../data_sources/driver_stats.parquet" 7 | 8 | driver_stats_batch_source = FileSource( 9 | name="driver_stats", 10 | file_format=ParquetFormat(), 11 | path=driver_stats_parquet_file, 12 | timestamp_field="datetime", 13 | created_timestamp_column="created", 14 | ) 15 | 16 | driver_stats_stream_source = KafkaSource( 17 | name="driver_stats_stream", 18 | kafka_bootstrap_servers="localhost:29092", 19 | topic="drivers", 20 | timestamp_field="datetime", 21 | batch_source=driver_stats_batch_source, 22 | message_format=JsonFormat( 23 | schema_json="driver_id integer, acc_rate double, conv_rate double, datetime timestamp, created timestamp" 24 | ), 25 | watermark_delay_threshold=timedelta(minutes=5), 26 | description="The Kafka stream containing the driver stats", 27 | ) 28 | -------------------------------------------------------------------------------- /data_pipeline/feature_repo/entities.py: -------------------------------------------------------------------------------- 1 | from feast import Entity 2 | 3 | driver = Entity( 4 | name="driver", 5 | join_keys=["driver_id"], 6 | description="driver id", 7 | tags={}, 8 | owner="mlopsvn@gmail.com", 9 | ) 10 | -------------------------------------------------------------------------------- /data_pipeline/feature_repo/feature_store.yaml: -------------------------------------------------------------------------------- 1 | project: feast_demo 2 | provider: local 3 | registry: 4 | path: registry/local_registry.db 5 | cache_ttl_seconds: 5 6 | online_store: 7 | type: redis 8 | connection_string: localhost:6378 9 | offline_store: 10 | type: file 11 | -------------------------------------------------------------------------------- /data_pipeline/feature_repo/features.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | from feast import FeatureView, Field 4 | from feast.stream_feature_view import stream_feature_view 5 | from feast.types import Float32, Int32 6 | from pyspark.sql import DataFrame 7 | 8 | from data_sources import driver_stats_batch_source, driver_stats_stream_source 9 | from entities import driver 10 | 11 | driver_stats_view = FeatureView( 12 | name="driver_stats", 13 | description="driver features", 14 | entities=[driver], 15 | ttl=timedelta(days=36500), 16 | schema=[ 17 | 
Field(name="conv_rate", dtype=Float32), 18 | Field(name="acc_rate", dtype=Float32), 19 | Field(name="avg_daily_trips", dtype=Int32), 20 | ], 21 | online=True, 22 | source=driver_stats_batch_source, 23 | tags={}, 24 | owner="mlopsvn@gmail.com", 25 | ) 26 | 27 | 28 | @stream_feature_view( 29 | entities=[driver], 30 | ttl=timedelta(days=36500), 31 | mode="spark", 32 | schema=[ 33 | Field(name="conv_rate", dtype=Float32), 34 | Field(name="acc_rate", dtype=Float32), 35 | ], 36 | timestamp_field="datetime", 37 | online=True, 38 | source=driver_stats_stream_source, 39 | tags={}, 40 | owner="stream_source_owner@gmail.com", 41 | ) 42 | def driver_stats_stream(df: DataFrame): 43 | from pyspark.sql.functions import col 44 | 45 | return ( 46 | df.withColumn("conv_percentage", col("conv_rate") * 100.0) 47 | .withColumn("acc_percentage", col("acc_rate") * 100.0) 48 | .drop("conv_rate", "acc_rate") 49 | .withColumnRenamed("conv_percentage", "conv_rate") 50 | .withColumnRenamed("acc_percentage", "acc_rate") 51 | ) 52 | -------------------------------------------------------------------------------- /data_pipeline/scripts/feast_helper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cmd=$1 4 | 5 | usage() { 6 | echo "feast_helper.sh " 7 | echo "Available commands:" 8 | echo " teardown run feast teardown" 9 | echo " apply run feast apply" 10 | echo " materialize materialize offline to online" 11 | } 12 | 13 | if [[ -z "$cmd" ]]; then 14 | echo "Missing command" 15 | usage 16 | exit 1 17 | fi 18 | 19 | teardown() { 20 | cd feature_repo 21 | feast teardown 22 | } 23 | 24 | apply() { 25 | cd feature_repo 26 | feast apply 27 | } 28 | 29 | materialize() { 30 | cd feature_repo 31 | feast materialize-incremental $(date +%Y-%m-%d) 32 | } 33 | 34 | shift 35 | 36 | case $cmd in 37 | teardown) 38 | teardown "$@" 39 | ;; 40 | apply) 41 | apply "$@" 42 | ;; 43 | materialize) 44 | materialize "$@" 45 | ;; 46 | *) 47 | echo -n "Unknown command: $cmd" 48 | usage 49 | exit 1 50 | ;; 51 | esac 52 | -------------------------------------------------------------------------------- /data_pipeline/scripts/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | # logging initialization 5 | def get_logger(): 6 | logger = logging.getLogger(__name__) 7 | formatter = logging.Formatter( 8 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 9 | ) 10 | stream_handler = logging.StreamHandler(sys.stdout) 11 | stream_handler.setFormatter(formatter) 12 | logger.addHandler(stream_handler) 13 | logger.setLevel(logging.INFO) 14 | 15 | return logger 16 | -------------------------------------------------------------------------------- /data_pipeline/src/db_to_offline_store/clean.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 5 | sys.path.append(BASE_DIR) 6 | 7 | from utils import logger 8 | 9 | my_logger = logger.get_logger() 10 | 11 | 12 | def main(): 13 | my_logger.info("Cleaning up...") 14 | 15 | 16 | if __name__ == "__main__": 17 | main() 18 | -------------------------------------------------------------------------------- /data_pipeline/src/db_to_offline_store/explore_and_validate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | BASE_DIR = 
os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 5 | sys.path.append(BASE_DIR) 6 | 7 | from utils import logger 8 | 9 | my_logger = logger.get_logger() 10 | 11 | 12 | def main(): 13 | my_logger.info("Exploring and validating...") 14 | 15 | 16 | if __name__ == "__main__": 17 | main() 18 | -------------------------------------------------------------------------------- /data_pipeline/src/db_to_offline_store/ingest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 5 | sys.path.append(BASE_DIR) 6 | 7 | from utils import logger 8 | 9 | my_logger = logger.get_logger() 10 | 11 | 12 | def main(): 13 | my_logger.info("Ingesting...") 14 | 15 | 16 | if __name__ == "__main__": 17 | main() 18 | -------------------------------------------------------------------------------- /data_pipeline/src/stream_to_stores/ingest.py: -------------------------------------------------------------------------------- 1 | from processor import processor 2 | import argparse 3 | from feast.data_source import PushMode 4 | 5 | 6 | def main(args): 7 | if args.mode == "setup": 8 | if args.store == "online": 9 | query = processor.ingest_stream_feature_view() 10 | elif args.store == "offline": 11 | query = processor.ingest_stream_feature_view(PushMode.OFFLINE) 12 | else: 13 | raise ValueError("Invalid store! Please select online or offline") 14 | elif args.mode == "teardown": 15 | query.stop() 16 | else: 17 | raise ValueError("Invalid mode! Please select setup or teardown") 18 | 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser(description="Ingest stream to stores.") 22 | parser.add_argument( 23 | "-m", 24 | "--mode", 25 | default="setup", 26 | help="mode for ingesting stream: setup or teardown", 27 | ) 28 | parser.add_argument( 29 | "-s", "--store", default="online", help="store type: online or offline" 30 | ) 31 | args = parser.parse_args() 32 | main(args) 33 | -------------------------------------------------------------------------------- /data_pipeline/src/stream_to_stores/processor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | 4 | from feast.infra.contrib.spark_kafka_processor import SparkProcessorConfig 5 | from feast.infra.contrib.stream_processor import get_stream_processor_object 6 | from feast import FeatureStore 7 | 8 | from pyspark.sql import SparkSession 9 | 10 | # See https://spark.apache.org/docs/3.1.2/structured-streaming-kafka-integration.html#deploying for notes on why we need this environment variable.
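# (The --packages option below pulls in the spark-sql-kafka-0-10 connector so Structured Streaming can read the Kafka topic; its Spark/Scala version is expected to line up with the pyspark version pinned in requirements.txt.)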
11 | os.environ[ 12 | "PYSPARK_SUBMIT_ARGS" 13 | ] = "--packages=org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0 pyspark-shell" 14 | spark = SparkSession.builder.master("local").appName("feast-spark").getOrCreate() 15 | spark.conf.set("spark.sql.shuffle.partitions", 5) 16 | 17 | store = FeatureStore(repo_path="../../feature_repo") 18 | 19 | 20 | def preprocess_fn(rows: pd.DataFrame): 21 | print(f"df columns: {rows.columns}") 22 | print(f"df size: {rows.size}") 23 | print(f"df preview:\n{rows.head()}") 24 | return rows 25 | 26 | 27 | ingestion_config = SparkProcessorConfig( 28 | mode="spark", 29 | source="kafka", 30 | spark_session=spark, 31 | processing_time="30 seconds", 32 | query_timeout=15, 33 | ) 34 | sfv = store.get_stream_feature_view("driver_stats_stream") 35 | 36 | processor = get_stream_processor_object( 37 | config=ingestion_config, fs=store, sfv=sfv, preprocess_fn=preprocess_fn, 38 | ) 39 | -------------------------------------------------------------------------------- /data_pipeline/src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLOpsVN/mlops-crash-course-code/8124d3c6afe344ff0df618ac99fcb1e5b1148be0/data_pipeline/src/utils/__init__.py -------------------------------------------------------------------------------- /data_pipeline/src/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | # logging initialization 5 | def get_logger(): 6 | logger = logging.getLogger(__name__) 7 | formatter = logging.Formatter( 8 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 9 | ) 10 | stream_handler = logging.StreamHandler(sys.stdout) 11 | stream_handler.setFormatter(formatter) 12 | logger.addHandler(stream_handler) 13 | logger.setLevel(logging.INFO) 14 | return logger 15 | -------------------------------------------------------------------------------- /model_serving/.dockerignore: -------------------------------------------------------------------------------- 1 | .dockerignore 2 | __pycache__ 3 | artifacts 4 | dev_requirements.txt 5 | dags 6 | data 7 | feature_repo 8 | -------------------------------------------------------------------------------- /model_serving/.env: -------------------------------------------------------------------------------- 1 | MLFLOW_TRACKING_URI="http://host.docker.internal:5000" 2 | BATCH_INPUT_FILE="./data/batch_request.csv" 3 | REGISTERED_MODEL_FILE="./artifacts/registered_model_version.json" 4 | MONITORING_SERVICE_API="http://host.docker.internal:8309/iterate" 5 | -------------------------------------------------------------------------------- /model_serving/.gitignore: -------------------------------------------------------------------------------- 1 | artifacts/* 2 | data_sources 3 | feature_repo -------------------------------------------------------------------------------- /model_serving/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/bash 2 | 3 | ENV_FILE="./deployment/.env" 4 | 5 | feast_apply: 6 | source ${ENV_FILE} && cd feature_repo && feast apply 7 | 8 | build_image: 9 | source ${ENV_FILE} && bash deployment/deploy.sh build 10 | 11 | build_push_image: 12 | source ${ENV_FILE} && bash deployment/deploy.sh build_push 13 | 14 | deploy_dags: 15 | source ${ENV_FILE} && bash deployment/deploy.sh dags 16 | 17 | bentoml_serve: 18 | bash scripts/bentoml_helper.sh serve 19 | 20 | compose_up: 21 | source ${ENV_FILE} && 
bash deployment/deploy.sh compose_up 22 | 23 | compose_down: 24 | source ${ENV_FILE} && bash deployment/deploy.sh compose_down 25 | -------------------------------------------------------------------------------- /model_serving/README.md: -------------------------------------------------------------------------------- 1 | # Model serving 2 | 3 | ```bash 4 | # Go to data pipeline and deploy feature repo 5 | cd ../data_pipeline 6 | make deploy_feature_repo 7 | cd ../model_serving 8 | 9 | # To test source files at local before running in Airflow 10 | cd feature_repo 11 | feast apply 12 | cd .. 13 | 14 | export MODEL_SERVING_DIR="path/to/mlops-crash-course-code/model_serving" 15 | cd src 16 | python 17 | 18 | # Build 19 | make build_image && make deploy_dags 20 | 21 | # Run batch serving 22 | # Go to airflow UI 23 | # Set variable MLOPS_CRASH_COURSE_CODE_DIR=path/to/mlops-crash-course-code 24 | # Run dags 25 | 26 | # Run online serving 27 | make compose_up 28 | # To shutdown online serving 29 | make compose_down 30 | ``` 31 | -------------------------------------------------------------------------------- /model_serving/artifacts/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLOpsVN/mlops-crash-course-code/8124d3c6afe344ff0df618ac99fcb1e5b1148be0/model_serving/artifacts/.gitkeep -------------------------------------------------------------------------------- /model_serving/dags/batch_serving_dag.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 5 | sys.path.append(BASE_DIR) 6 | 7 | import pendulum 8 | from airflow import DAG 9 | from airflow.providers.docker.operators.docker import DockerOperator 10 | 11 | from utils import * 12 | 13 | 14 | with DAG( 15 | dag_id="batch_serving_pipeline", 16 | default_args=DefaultConfig.DEFAULT_DAG_ARGS, 17 | schedule_interval="@once", 18 | start_date=pendulum.datetime(2022, 1, 1, tz="UTC"), 19 | catchup=False, 20 | tags=["model_serving"], 21 | ) as dag: 22 | feature_store_init_task = DockerOperator( 23 | task_id="feature_store_init_task", 24 | command="bash -c 'cd feature_repo && feast apply'", 25 | **DefaultConfig.DEFAULT_DOCKER_OPERATOR_ARGS, 26 | ) 27 | 28 | data_extraction_task = DockerOperator( 29 | task_id="data_extraction_task", 30 | command="bash -c 'cd src && python data_extraction.py'", 31 | **DefaultConfig.DEFAULT_DOCKER_OPERATOR_ARGS, 32 | ) 33 | 34 | batch_prediction_task = DockerOperator( 35 | task_id="batch_prediction_task", 36 | command="bash -c 'cd src && python batch_prediction.py'", 37 | **DefaultConfig.DEFAULT_DOCKER_OPERATOR_ARGS, 38 | ) 39 | 40 | (feature_store_init_task >> data_extraction_task >> batch_prediction_task) 41 | -------------------------------------------------------------------------------- /model_serving/dags/utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pendulum 4 | from airflow.models import Variable 5 | from docker.types import Mount 6 | 7 | 8 | class AppConst: 9 | DOCKER_USER = Variable.get("DOCKER_USER", "mlopsvn") 10 | 11 | 12 | class AppPath: 13 | MLOPS_CRASH_COURSE_CODE_DIR = Path(Variable.get("MLOPS_CRASH_COURSE_CODE_DIR")) 14 | MODEL_SERVING_DIR = MLOPS_CRASH_COURSE_CODE_DIR / "model_serving" 15 | FEATURE_REPO = MODEL_SERVING_DIR / "feature_repo" 16 | ARTIFACTS = MODEL_SERVING_DIR / "artifacts" 17 | DATA = MODEL_SERVING_DIR / 
"data" 18 | 19 | 20 | class DefaultConfig: 21 | DEFAULT_DAG_ARGS = { 22 | "owner": "mlopsvn", 23 | "retries": 0, 24 | "retry_delay": pendulum.duration(seconds=20), 25 | } 26 | 27 | DEFAULT_DOCKER_OPERATOR_ARGS = { 28 | "image": f"{AppConst.DOCKER_USER}/mlops_crash_course/model_serving:latest", 29 | "api_version": "auto", 30 | "auto_remove": True, 31 | "network_mode": "bridge", 32 | "docker_url": "tcp://docker-proxy:2375", 33 | "mounts": [ 34 | # feature repo 35 | Mount( 36 | source=AppPath.FEATURE_REPO.absolute().as_posix(), 37 | target="/model_serving/feature_repo", 38 | type="bind", 39 | ), 40 | # artifacts 41 | Mount( 42 | source=AppPath.ARTIFACTS.absolute().as_posix(), 43 | target="/model_serving/artifacts", 44 | type="bind", 45 | ), 46 | # data 47 | Mount( 48 | source=AppPath.DATA.absolute().as_posix(), 49 | target="/model_serving/data", 50 | type="bind", 51 | ), 52 | ], 53 | } 54 | -------------------------------------------------------------------------------- /model_serving/data/batch_request.csv: -------------------------------------------------------------------------------- 1 | event_timestamp driver_id 2 | 2021-04-16 20:29:28+00:00 1001 3 | 2021-04-17 04:29:28+00:00 1002 4 | 2021-04-17 12:29:28+00:00 1003 5 | 2021-04-17 20:29:28+00:00 1001 6 | 2021-04-18 04:29:28+00:00 1002 7 | 2021-04-18 12:29:28+00:00 1003 8 | 2021-04-18 20:29:28+00:00 1001 9 | 2021-04-19 04:29:28+00:00 1002 10 | 2021-04-19 12:29:28+00:00 1003 11 | 2021-04-19 20:29:28+00:00 1004 12 | -------------------------------------------------------------------------------- /model_serving/deployment/.env: -------------------------------------------------------------------------------- 1 | export DOCKER_USER="mlopsvn" 2 | export DAGS_DIR="../../mlops-crash-course-platform/airflow/run_env/dags/model_serving" 3 | export ONLINE_SERVING_PORT="8172" 4 | export FEAST_ONLINE_STORE_HOST="host.docker.internal" 5 | -------------------------------------------------------------------------------- /model_serving/deployment/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-buster AS build 2 | 3 | RUN apt-get update 4 | 5 | RUN python -m venv /opt/venv 6 | ENV PATH="/opt/venv/bin:$PATH" 7 | RUN pip install --upgrade pip 8 | 9 | COPY deployment/requirements.txt . 10 | RUN pip install -r requirements.txt 11 | 12 | FROM python:3.9-buster 13 | 14 | COPY --from=build /opt/venv /opt/venv 15 | ENV PATH="/opt/venv/bin:$PATH" 16 | COPY . 
/model_serving 17 | WORKDIR /model_serving 18 | -------------------------------------------------------------------------------- /model_serving/deployment/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cmd=$1 4 | 5 | # constants 6 | DOCKER_USER="$DOCKER_USER" 7 | PROJECT="mlops_crash_course" 8 | IMAGE_NAME="model_serving" 9 | IMAGE_TAG=$(git describe --always) 10 | 11 | if [[ -z "$DOCKER_USER" ]]; then 12 | echo "Missing \$DOCKER_USER env var" 13 | exit 1 14 | fi 15 | 16 | usage() { 17 | echo "deploy.sh " 18 | echo "Available commands:" 19 | echo " build build image" 20 | echo " push push image" 21 | echo " build_push build and push image" 22 | echo " compose_up up docker compose" 23 | echo " compose_down down docker compose" 24 | echo " dags deploy airflow dags" 25 | echo "Available arguments:" 26 | echo " [dags dir] airflow dags directory, for command dags only" 27 | } 28 | 29 | if [[ -z "$cmd" ]]; then 30 | echo "Missing command" 31 | usage 32 | exit 1 33 | fi 34 | 35 | build() { 36 | docker build --tag $DOCKER_USER/$PROJECT/$IMAGE_NAME:$IMAGE_TAG -f deployment/Dockerfile . 37 | docker tag $DOCKER_USER/$PROJECT/$IMAGE_NAME:$IMAGE_TAG $DOCKER_USER/$PROJECT/$IMAGE_NAME:latest 38 | } 39 | 40 | push() { 41 | docker push $DOCKER_USER/$PROJECT/$IMAGE_NAME:$IMAGE_TAG 42 | docker push $DOCKER_USER/$PROJECT/$IMAGE_NAME:latest 43 | } 44 | 45 | compose_up() { 46 | # escape slash / 47 | FEAST_ONLINE_STORE_HOST=$(echo ${FEAST_ONLINE_STORE_HOST} | sed -e "s#/#\\\/#g") 48 | # replace value 49 | sed -i '' -e s/localhost/${FEAST_ONLINE_STORE_HOST}/ ./feature_repo/feature_store.yaml 50 | 51 | docker-compose --env-file ./deployment/.env -f ./deployment/docker-compose.yml up -d 52 | } 53 | 54 | compose_down() { 55 | # escape slash / 56 | FEAST_ONLINE_STORE_HOST=$(echo ${FEAST_ONLINE_STORE_HOST} | sed -e "s#/#\\\/#g") 57 | # replace value 58 | sed -i '' -e s/${FEAST_ONLINE_STORE_HOST}/localhost/ ./feature_repo/feature_store.yaml 59 | 60 | docker-compose --env-file ./deployment/.env -f ./deployment/docker-compose.yml down 61 | } 62 | 63 | deploy_dags() { 64 | if [[ -z "$DAGS_DIR" ]]; then 65 | echo "Missing DAGS_DIR env var" 66 | usage 67 | exit 1 68 | fi 69 | 70 | mkdir -p "$DAGS_DIR" 71 | cp dags/* "$DAGS_DIR" 72 | } 73 | 74 | shift 75 | 76 | case $cmd in 77 | build) 78 | build "$@" 79 | ;; 80 | push) 81 | push "$@" 82 | ;; 83 | build_push) 84 | build "$@" 85 | push "$@" 86 | ;; 87 | compose_up) 88 | compose_up "$@" 89 | ;; 90 | compose_down) 91 | compose_down "$@" 92 | ;; 93 | dags) 94 | deploy_dags "$@" 95 | ;; 96 | *) 97 | echo -n "Unknown command: $cmd" 98 | usage 99 | exit 1 100 | ;; 101 | esac 102 | -------------------------------------------------------------------------------- /model_serving/deployment/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | services: 3 | bentoml_service: 4 | image: $DOCKER_USER/mlops_crash_course/model_serving:latest 5 | container_name: online_serving 6 | restart: on-failure 7 | volumes: 8 | - ../artifacts:/model_serving/artifacts 9 | - ../feature_repo:/model_serving/feature_repo 10 | command: /bin/bash scripts/bentoml_helper.sh serve --port=$ONLINE_SERVING_PORT 11 | ports: 12 | - $ONLINE_SERVING_PORT:$ONLINE_SERVING_PORT 13 | -------------------------------------------------------------------------------- /model_serving/deployment/requirements.txt: -------------------------------------------------------------------------------- 1 
| bentoml==1.0.5 2 | fastparquet==0.8.3 3 | feast[redis]==0.24.0 4 | mlflow==1.29.0 5 | pandas==1.4.4 6 | pyspark==3.0.1 7 | scikit-learn==1.1.2 8 | 9 | absl-py==1.3.0 10 | aiohttp==3.8.3 11 | aiosignal==1.3.1 12 | alembic==1.8.1 13 | anyio==3.6.2 14 | appdirs==1.4.4 15 | asgiref==3.5.2 16 | async-timeout==4.0.2 17 | attrs==22.1.0 18 | bowler==0.9.0 19 | build==0.9.0 20 | cachetools==5.2.0 21 | cattrs==22.2.0 22 | certifi==2022.9.24 23 | charset-normalizer==2.1.1 24 | circus==0.17.2 25 | click==8.1.3 26 | cloudpickle==2.2.0 27 | colorama==0.4.6 28 | commonmark==0.9.1 29 | contextlib2==21.6.0 30 | cramjam==2.6.2 31 | dask==2022.1.1 32 | databricks-cli==0.17.3 33 | deepmerge==1.1.0 34 | Deprecated==1.2.13 35 | dill==0.3.6 36 | docker==5.0.3 37 | entrypoints==0.4 38 | exceptiongroup==1.0.2 39 | fastapi==0.87.0 40 | fastavro==1.7.0 41 | fissix==21.11.13 42 | Flask==2.2.2 43 | frozenlist==1.3.3 44 | fs==2.4.16 45 | fsspec==2022.11.0 46 | gitdb==4.0.9 47 | GitPython==3.1.29 48 | google-api-core==2.10.2 49 | google-auth==2.14.1 50 | googleapis-common-protos==1.56.4 51 | greenlet==2.0.1 52 | grpcio==1.50.0 53 | grpcio-reflection==1.48.2 54 | gunicorn==20.1.0 55 | h11==0.14.0 56 | hiredis==2.0.0 57 | httptools==0.5.0 58 | idna==3.4 59 | importlib-metadata==4.13.0 60 | itsdangerous==2.1.2 61 | Jinja2==3.1.2 62 | joblib==1.2.0 63 | jsonschema==4.17.0 64 | locket==1.0.0 65 | Mako==1.2.3 66 | MarkupSafe==2.1.1 67 | mmh3==3.0.0 68 | moreorless==0.4.0 69 | multidict==6.0.2 70 | mypy==0.990 71 | mypy-extensions==0.4.3 72 | numpy==1.23.4 73 | oauthlib==3.2.2 74 | opentelemetry-api==1.12.0 75 | opentelemetry-instrumentation==0.33b0 76 | opentelemetry-instrumentation-aiohttp-client==0.33b0 77 | opentelemetry-instrumentation-asgi==0.33b0 78 | opentelemetry-sdk==1.12.0 79 | opentelemetry-semantic-conventions==0.33b0 80 | opentelemetry-util-http==0.33b0 81 | packaging==21.3 82 | pandavro==1.5.2 83 | partd==1.3.0 84 | pathspec==0.10.2 85 | pep517==0.13.0 86 | pip==22.3.1 87 | pip-tools==6.9.0 88 | prometheus-client==0.13.1 89 | prometheus-flask-exporter==0.21.0 90 | proto-plus==1.22.1 91 | protobuf==3.20.3 92 | psutil==5.9.4 93 | py4j==0.10.9 94 | pyarrow==8.0.0 95 | pyasn1==0.4.8 96 | pyasn1-modules==0.2.8 97 | pydantic==1.10.2 98 | Pygments==2.13.0 99 | PyJWT==2.6.0 100 | pynvml==11.4.1 101 | pyparsing==3.0.9 102 | pyrsistent==0.19.2 103 | python-dateutil==2.8.2 104 | python-dotenv==0.21.0 105 | python-multipart==0.0.5 106 | pytz==2022.6 107 | PyYAML==6.0 108 | pyzmq==24.0.1 109 | querystring-parser==1.2.4 110 | redis==4.2.2 111 | requests==2.28.1 112 | rich==12.6.0 113 | rsa==4.9 114 | schema==0.7.5 115 | scipy==1.9.3 116 | setuptools==58.1.0 117 | simple-di==0.1.5 118 | six==1.16.0 119 | smmap==5.0.0 120 | sniffio==1.3.0 121 | SQLAlchemy==1.4.44 122 | sqlalchemy2-stubs==0.0.2a29 123 | sqlparse==0.4.3 124 | starlette==0.21.0 125 | tabulate==0.9.0 126 | tenacity==8.1.0 127 | tensorflow-metadata==1.11.0 128 | threadpoolctl==3.1.0 129 | toml==0.10.2 130 | tomli==2.0.1 131 | toolz==0.12.0 132 | tornado==6.2 133 | tqdm==4.64.1 134 | typeguard==2.13.3 135 | typing_extensions==4.4.0 136 | urllib3==1.26.12 137 | uvicorn==0.19.0 138 | uvloop==0.17.0 139 | volatile==2.1.0 140 | watchfiles==0.18.1 141 | websocket-client==1.4.2 142 | websockets==10.4 143 | Werkzeug==2.2.2 144 | wheel==0.38.4 145 | wrapt==1.14.1 146 | yarl==1.8.1 147 | zipp==3.10.0 148 | -------------------------------------------------------------------------------- /model_serving/dev_requirements.txt: 
-------------------------------------------------------------------------------- 1 | apache-airflow==2.3.4 2 | apache-airflow-providers-docker==3.1.0 3 | bentoml==1.0.5 4 | black==22.6.0 5 | fastparquet==0.8.3 6 | feast[redis]==0.24.0 7 | mlflow==1.29.0 8 | pandas==1.4.4 9 | pyspark==3.0.1 10 | scikit-learn==1.1.2 11 | -------------------------------------------------------------------------------- /model_serving/scripts/bentoml_helper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cmd=$1 4 | 5 | usage() { 6 | echo "bentoml_helper.sh [options]" 7 | echo "Available commands:" 8 | echo " serve serve a bentoml service" 9 | echo "Available options:" 10 | echo " --port=x bentoml service's port" 11 | echo " --reload auto reload bentoml service" 12 | } 13 | 14 | if [[ -z "$cmd" ]]; then 15 | echo "Missing command" 16 | usage 17 | exit 1 18 | fi 19 | 20 | serve() { 21 | cd src 22 | bentoml serve bentoml_service:svc "$@" 23 | } 24 | 25 | shift 26 | 27 | case $cmd in 28 | serve) 29 | serve "$@" 30 | ;; 31 | *) 32 | echo -n "Unknown command: $cmd" 33 | usage 34 | exit 1 35 | ;; 36 | esac 37 | -------------------------------------------------------------------------------- /model_serving/src/batch_prediction.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | from mlflow.models.signature import ModelSignature 3 | 4 | from utils import * 5 | 6 | Log(AppConst.BATCH_PREDICTION) 7 | AppPath() 8 | 9 | 10 | def batch_prediction(): 11 | Log().log.info("start batch_prediction") 12 | inspect_curr_dir() 13 | 14 | config = Config() 15 | Log().log.info(f"config: {config.__dict__}") 16 | 17 | # Download model 18 | registered_model_file = AppPath.ROOT / config.registered_model_file 19 | Log().log.info(f"registered_model_file: {registered_model_file}") 20 | registered_model_dict = load_json(registered_model_file) 21 | Log().log.info(registered_model_dict) 22 | model_uri = registered_model_dict["_source"] 23 | 24 | mlflow.set_tracking_uri(config.mlflow_tracking_uri) 25 | mlflow_model = mlflow.pyfunc.load_model(model_uri=model_uri) 26 | Log().log.info(mlflow_model.__dict__) 27 | 28 | # Load data 29 | batch_df = load_df(AppPath.BATCH_INPUT_PQ) 30 | 31 | # restructure features 32 | model_signature: ModelSignature = mlflow_model.metadata.signature 33 | feature_list = [] 34 | for name in model_signature.inputs.input_names(): 35 | feature_list.append(name) 36 | Log().log.info(f"feature_list: {feature_list}") 37 | 38 | batch_df = batch_df[feature_list] 39 | Log().log.info(f"batch_df: {batch_df}") 40 | 41 | # Predict 42 | preds = mlflow_model.predict(batch_df) 43 | batch_df["pred"] = preds 44 | 45 | Log().log.info("----- Example output -----") 46 | Log().log.info(batch_df.head()) 47 | 48 | # Write preds to file 49 | to_parquet(batch_df, AppPath.BATCH_OUTPUT_PQ) 50 | inspect_dir(AppPath.BATCH_OUTPUT_PQ) 51 | 52 | 53 | if __name__ == "__main__": 54 | batch_prediction() 55 | -------------------------------------------------------------------------------- /model_serving/src/bentoml_service.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional 2 | 3 | import bentoml 4 | import feast 5 | import mlflow 6 | import numpy as np 7 | import pandas as pd 8 | import requests 9 | from bentoml.io import JSON 10 | from mlflow.models.signature import ModelSignature 11 | from pydantic import BaseModel 12 | 13 | from utils import * 14 | 15 | 
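# Module-level setup: the logger, artifact paths, and config below are created once at import time and shared by save_model() and the request handlers defined later in this file.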
Log(AppConst.BENTOML_SERVICE) 16 | AppPath() 17 | pd.set_option("display.max_columns", None) 18 | config = Config() 19 | Log().log.info(f"config: {config.__dict__}") 20 | 21 | 22 | def save_model() -> bentoml.Model: 23 | Log().log.info("start save_model") 24 | # read from .env file registered_model_version.json, get model name, model version 25 | 26 | registered_model_file = AppPath.ROOT / config.registered_model_file 27 | Log().log.info(f"registered_model_file: {registered_model_file}") 28 | registered_model_dict = load_json(registered_model_file) 29 | Log().log.info(f"registered_model_dict: {registered_model_dict}") 30 | 31 | run_id = registered_model_dict["_run_id"] 32 | model_name = registered_model_dict["_name"] 33 | model_version = registered_model_dict["_version"] 34 | model_uri = registered_model_dict["_source"] 35 | 36 | mlflow.set_tracking_uri(config.mlflow_tracking_uri) 37 | mlflow_model = mlflow.pyfunc.load_model(model_uri=model_uri) 38 | Log().log.info(mlflow_model.__dict__) 39 | model = mlflow_model._model_impl 40 | model_signature: ModelSignature = mlflow_model.metadata.signature 41 | 42 | # construct feature list 43 | feature_list = [] 44 | for name in model_signature.inputs.input_names(): 45 | feature_list.append(name) 46 | 47 | # save model using bentoml 48 | bentoml_model = bentoml.sklearn.save_model( 49 | model_name, 50 | model, 51 | # model signatures for runner inference 52 | signatures={ 53 | "predict": { 54 | "batchable": False, 55 | }, 56 | }, 57 | labels={ 58 | "owner": "mlopsvn", 59 | }, 60 | metadata={ 61 | "mlflow_run_id": run_id, 62 | "mlflow_model_name": model_name, 63 | "mlflow_model_version": model_version, 64 | }, 65 | custom_objects={ 66 | "feature_list": feature_list, 67 | }, 68 | ) 69 | Log().log.info(bentoml_model.__dict__) 70 | return bentoml_model 71 | 72 | 73 | bentoml_model = save_model() 74 | feature_list = bentoml_model.custom_objects["feature_list"] 75 | bentoml_runner = bentoml.sklearn.get(bentoml_model.tag).to_runner() 76 | svc = bentoml.Service(bentoml_model.tag.name, runners=[bentoml_runner]) 77 | fs = feast.FeatureStore(repo_path=AppPath.FEATURE_REPO) 78 | 79 | 80 | def predict(request: np.ndarray) -> np.ndarray: 81 | Log().log.info(f"start predict") 82 | result = bentoml_runner.predict.run(request) 83 | Log().log.info(f"result: {result}") 84 | return result 85 | 86 | 87 | class InferenceRequest(BaseModel): 88 | request_id: str 89 | driver_ids: List[int] 90 | 91 | 92 | class InferenceResponse(BaseModel): 93 | prediction: Optional[float] 94 | error: Optional[str] 95 | 96 | 97 | @svc.api( 98 | input=JSON(pydantic_model=InferenceRequest), 99 | output=JSON(pydantic_model=InferenceResponse), 100 | ) 101 | def inference(request: InferenceRequest, ctx: bentoml.Context) -> Dict[str, Any]: 102 | """ 103 | Example request: {"request_id": "uuid-1", "driver_ids":[1001,1002,1003,1004,1005]} 104 | """ 105 | Log().log.info(f"start inference") 106 | response = InferenceResponse() 107 | try: 108 | Log().log.info(f"request: {request}") 109 | driver_ids = request.driver_ids 110 | 111 | online_features = fs.get_online_features( 112 | entity_rows=[{"driver_id": driver_id} for driver_id in driver_ids], 113 | features=[f"driver_stats:{name}" for name in feature_list], 114 | ) 115 | df = pd.DataFrame.from_dict(online_features.to_dict()) 116 | Log().log.info(f"online features: {df}") 117 | 118 | input_features = df.drop(["driver_id"], axis=1) 119 | input_features = input_features[feature_list] 120 | Log().log.info(f"input_features: {input_features}") 121 | 122 | 
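# Score the assembled feature frame with the BentoML runner, then pick the driver with the highest prediction as the response.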
result = predict(input_features) 123 | df["prediction"] = result 124 | best_idx = df["prediction"].argmax() 125 | best_driver_id = df["driver_id"].iloc[best_idx] 126 | Log().log.info(f"best_driver_id: {best_driver_id}") 127 | Log().log.info(f"df: {df}") 128 | 129 | response.prediction = best_driver_id 130 | ctx.response.status_code = 200 131 | 132 | # monitor 133 | monitor_df = df.iloc[[best_idx]] 134 | monitor_df = monitor_df.assign(request_id=[request.request_id]) 135 | monitor_df = monitor_df.assign(best_driver_id=[best_driver_id]) 136 | Log().log.info(f"monitor_df: {monitor_df}") 137 | monitor_request(monitor_df) 138 | 139 | except Exception as e: 140 | Log().log.error(f"error: {e}") 141 | response.error = str(e) 142 | ctx.response.status_code = 500 143 | 144 | Log().log.info(f"response: {response}") 145 | return response 146 | 147 | 148 | def monitor_request(df: pd.DataFrame): 149 | Log().log.info("start monitor_request") 150 | try: 151 | data = json.dumps(df.to_dict(), cls=NumpyEncoder) 152 | 153 | Log().log.info(f"sending {data}") 154 | response = requests.post( 155 | config.monitoring_service_api, 156 | data=data, 157 | headers={"content-type": "application/json"}, 158 | ) 159 | 160 | if response.status_code == 200: 161 | Log().log.info(f"Success") 162 | else: 163 | Log().log.info( 164 | f"Got an error code {response.status_code} for the data chunk. Reason: {response.reason}, error text: {response.text}" 165 | ) 166 | 167 | except requests.exceptions.ConnectionError as error: 168 | Log().log.error( 169 | f"Cannot reach monitoring service, error: {error}, data: {data}" 170 | ) 171 | 172 | except Exception as error: 173 | Log().log.error(f"Error: {error}") 174 | -------------------------------------------------------------------------------- /model_serving/src/data_extraction.py: -------------------------------------------------------------------------------- 1 | import feast 2 | import pandas as pd 3 | 4 | from utils import * 5 | 6 | Log(AppConst.DATA_EXTRACTION) 7 | AppPath() 8 | 9 | 10 | def extract_data(): 11 | Log().log.info("start extract_data") 12 | inspect_curr_dir() 13 | config = Config() 14 | Log().log.info(f"config: {config.__dict__}") 15 | 16 | # Connect to your feature store provider 17 | inspect_dir(AppPath.DATA_SOURCES) 18 | inspect_dir(AppPath.FEATURE_REPO) 19 | fs = feast.FeatureStore(repo_path=AppPath.FEATURE_REPO) 20 | 21 | # Load driver order data 22 | batch_input_file = AppPath.ROOT / config.batch_input_file 23 | inspect_dir(batch_input_file) 24 | orders = pd.read_csv(batch_input_file, sep="\t") 25 | orders["event_timestamp"] = pd.to_datetime(orders["event_timestamp"]) 26 | 27 | # Retrieve training data 28 | batch_input_df = fs.get_historical_features( 29 | entity_df=orders, 30 | features=[ 31 | "driver_stats:conv_rate", 32 | "driver_stats:acc_rate", 33 | "driver_stats:avg_daily_trips", 34 | ], 35 | ).to_df() 36 | 37 | batch_input_df = batch_input_df.drop(["event_timestamp", "driver_id"], axis=1) 38 | 39 | Log().log.info("----- Feature schema -----") 40 | Log().log.info(batch_input_df.info()) 41 | 42 | Log().log.info("----- Example features -----") 43 | Log().log.info(batch_input_df.head()) 44 | 45 | # Write to file 46 | to_parquet(batch_input_df, AppPath.BATCH_INPUT_PQ) 47 | inspect_dir(AppPath.BATCH_INPUT_PQ) 48 | 49 | 50 | if __name__ == "__main__": 51 | extract_data() 52 | -------------------------------------------------------------------------------- /model_serving/src/utils.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import sys 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | 14 | class AppConst: 15 | LOG_LEVEL = logging.DEBUG 16 | BENTOML_MODEL_SAVING = "bentoml_model_saving" 17 | BENTOML_SERVICE = "bentoml_service" 18 | DATA_EXTRACTION = "data_extraction" 19 | BATCH_PREDICTION = "batch_prediction" 20 | 21 | 22 | class AppPath: 23 | # set MODEL_SERVING_DIR in dev environment for quickly testing the code 24 | ROOT = Path(os.environ.get("MODEL_SERVING_DIR", "/model_serving")) 25 | DATA = ROOT / "data" 26 | DATA_SOURCES = ROOT / "data_sources" 27 | FEATURE_REPO = ROOT / "feature_repo" 28 | ARTIFACTS = ROOT / "artifacts" 29 | 30 | BATCH_INPUT_PQ = ARTIFACTS / "batch_input.parquet" 31 | BATCH_OUTPUT_PQ = ARTIFACTS / "batch_output.parquet" 32 | 33 | def __init__(self) -> None: 34 | AppPath.ARTIFACTS.mkdir(parents=True, exist_ok=True) 35 | 36 | 37 | class Config: 38 | def __init__(self) -> None: 39 | import numpy as np 40 | 41 | self.feature_dict = { 42 | "conv_rate": np.float64, 43 | "acc_rate": np.float64, 44 | "avg_daily_trips": np.int64, 45 | "trip_completed": np.int64, 46 | } 47 | self.mlflow_tracking_uri = os.environ.get("MLFLOW_TRACKING_URI") 48 | self.batch_input_file = os.environ.get("BATCH_INPUT_FILE") 49 | self.registered_model_file = os.environ.get("REGISTERED_MODEL_FILE") 50 | self.monitoring_service_api = os.environ.get("MONITORING_SERVICE_API") 51 | 52 | 53 | class Log: 54 | log: logging.Logger = None 55 | 56 | def __init__(self, name="") -> None: 57 | if Log.log == None: 58 | Log.log = self._init_logger(name) 59 | 60 | def _init_logger(self, name): 61 | logger = logging.getLogger(name) 62 | formatter = logging.Formatter( 63 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 64 | ) 65 | stream_handler = logging.StreamHandler(sys.stdout) 66 | stream_handler.setFormatter(formatter) 67 | logger.addHandler(stream_handler) 68 | logger.setLevel(AppConst.LOG_LEVEL) 69 | return logger 70 | 71 | 72 | # the encoder helps to convert NumPy types in source data to JSON-compatible types 73 | class NumpyEncoder(json.JSONEncoder): 74 | def default(self, obj): 75 | if isinstance(obj, np.void): 76 | return None 77 | 78 | if isinstance(obj, (np.generic, np.bool_)): 79 | return obj.item() 80 | 81 | if isinstance(obj, np.ndarray): 82 | return obj.tolist() 83 | 84 | return obj 85 | 86 | 87 | def inspect_dir(path): 88 | Log().log.info(f"inspect_dir {path}") 89 | path = Path(path) 90 | if not path.exists(): 91 | Log().log.info(f"Path {path} doesn't exist") 92 | return 93 | elif path.is_file(): 94 | Log().log.info(f"Path {path} is file") 95 | return 96 | 97 | paths = os.listdir(path) 98 | paths = sorted(paths) 99 | for path in paths: 100 | Log().log.info(path) 101 | 102 | 103 | def inspect_curr_dir(): 104 | cwd = os.getcwd() 105 | Log().log.info(f"current dir: {cwd}") 106 | inspect_dir(cwd) 107 | 108 | 109 | def load_df(path) -> pd.DataFrame: 110 | Log().log.info(f"start load_df {path}") 111 | df = pd.read_parquet(path, engine="fastparquet") 112 | return df 113 | 114 | 115 | def to_parquet(df: pd.DataFrame, path): 116 | Log().log.info(f"start to_parquet {path}") 117 | df.to_parquet(path, engine="fastparquet") 118 | 119 | 120 | def dump_json(dict_obj: dict, path): 121 | with open(path, "w", encoding="utf-8") as f: 122 | json.dump(dict_obj, f) 123 | 124 | 125 | def load_json(path) -> 
dict: 126 | with open(path, "r", encoding="utf-8") as f: 127 | data = json.load(f) 128 | return data 129 | -------------------------------------------------------------------------------- /monitoring_service/.dockerignore: -------------------------------------------------------------------------------- 1 | .dockerignore 2 | __pycache__ 3 | dev_requirements.txt 4 | dashboards 5 | data 6 | data_sources 7 | feature_repo 8 | nbs 9 | scripts 10 | -------------------------------------------------------------------------------- /monitoring_service/.gitignore: -------------------------------------------------------------------------------- 1 | data/mock* 2 | data/train* 3 | data_sources 4 | feature_repo 5 | scripts/ 6 | -------------------------------------------------------------------------------- /monitoring_service/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/bash 2 | 3 | ENV_FILE="./deployment/.env" 4 | 5 | feast_teardown: 6 | source ${ENV_FILE} && bash scripts/feast_helper.sh teardown 7 | 8 | feast_apply: 9 | source ${ENV_FILE} && bash scripts/feast_helper.sh apply 10 | 11 | feast_materialize: 12 | source ${ENV_FILE} && bash scripts/feast_helper.sh materialize 13 | 14 | build_image: 15 | source ${ENV_FILE} && bash deployment/deploy.sh build 16 | 17 | build_push_image: 18 | source ${ENV_FILE} && bash deployment/deploy.sh build_push 19 | 20 | compose_up: 21 | source ${ENV_FILE} && bash deployment/deploy.sh compose_up 22 | 23 | compose_down: 24 | source ${ENV_FILE} && bash deployment/deploy.sh compose_down 25 | -------------------------------------------------------------------------------- /monitoring_service/README.md: -------------------------------------------------------------------------------- 1 | # Monitoring service 2 | 3 | ```bash 4 | # Go to data pipeline and deploy feature repo 5 | cd ../data_pipeline 6 | make deploy_feature_repo 7 | cd ../monitoring_service 8 | 9 | # Build 10 | make build_image 11 | 12 | # Run monitoring service 13 | make compose_up 14 | # To shutdown monitoring service 15 | make compose_down 16 | 17 | # To run mock_request.py 18 | export MONITORING_SERVICE_DIR="path/to/mlops-crash-course-code/monitoring_service" 19 | python src/mock_request.py -d normal -n 5 20 | python src/mock_request.py -d drift -n 5 21 | ``` 22 | -------------------------------------------------------------------------------- /monitoring_service/dashboards/bentoml_dashboard.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "editable": true, 16 | "gnetId": null, 17 | "graphTooltip": 0, 18 | "id": 6, 19 | "links": [], 20 | "panels": [ 21 | { 22 | "datasource": { 23 | "type": "prometheus", 24 | "uid": "PBFA97CFB590B2093" 25 | }, 26 | "fieldConfig": { 27 | "defaults": { 28 | "color": { 29 | "mode": "palette-classic" 30 | }, 31 | "custom": { 32 | "axisLabel": "", 33 | "axisPlacement": "auto", 34 | "barAlignment": 1, 35 | "drawStyle": "bars", 36 | "fillOpacity": 10, 37 | "gradientMode": "none", 38 | "hideFrom": { 39 | "graph": false, 40 | "legend": false, 41 | "tooltip": false 42 | }, 43 | "lineInterpolation": "linear", 44 | "lineWidth": 1, 45 | "pointSize": 5, 46 | "scaleDistribution": { 47 | "type": "linear" 48 | }, 49 | "showPoints": 
"auto", 50 | "spanNulls": true 51 | }, 52 | "mappings": [], 53 | "thresholds": { 54 | "mode": "absolute", 55 | "steps": [ 56 | { 57 | "color": "green", 58 | "value": null 59 | }, 60 | { 61 | "color": "red", 62 | "value": 80 63 | } 64 | ] 65 | }, 66 | "unit": "short" 67 | }, 68 | "overrides": [] 69 | }, 70 | "gridPos": { 71 | "h": 8, 72 | "w": 12, 73 | "x": 0, 74 | "y": 0 75 | }, 76 | "id": 6, 77 | "options": { 78 | "graph": {}, 79 | "legend": { 80 | "calcs": [], 81 | "displayMode": "hidden", 82 | "placement": "right" 83 | }, 84 | "tooltipOptions": { 85 | "mode": "single" 86 | } 87 | }, 88 | "pluginVersion": "7.5.7", 89 | "targets": [ 90 | { 91 | "exemplar": true, 92 | "expr": "BENTOML_sklearn:elastic:net:reg_request_in_progress{endpoint=\"/inference\"}", 93 | "interval": "", 94 | "legendFormat": "", 95 | "refId": "A" 96 | } 97 | ], 98 | "timeFrom": null, 99 | "timeShift": null, 100 | "title": "request_in_progress", 101 | "type": "timeseries" 102 | }, 103 | { 104 | "datasource": { 105 | "type": "prometheus", 106 | "uid": "PBFA97CFB590B2093" 107 | }, 108 | "fieldConfig": { 109 | "defaults": { 110 | "color": { 111 | "mode": "palette-classic" 112 | }, 113 | "custom": { 114 | "axisLabel": "", 115 | "axisPlacement": "auto", 116 | "barAlignment": 0, 117 | "drawStyle": "line", 118 | "fillOpacity": 10, 119 | "gradientMode": "none", 120 | "hideFrom": { 121 | "graph": false, 122 | "legend": false, 123 | "tooltip": false 124 | }, 125 | "lineInterpolation": "smooth", 126 | "lineWidth": 1, 127 | "pointSize": 5, 128 | "scaleDistribution": { 129 | "type": "linear" 130 | }, 131 | "showPoints": "never", 132 | "spanNulls": true 133 | }, 134 | "mappings": [], 135 | "thresholds": { 136 | "mode": "absolute", 137 | "steps": [ 138 | { 139 | "color": "green", 140 | "value": null 141 | }, 142 | { 143 | "color": "red", 144 | "value": 80 145 | } 146 | ] 147 | }, 148 | "unit": "short" 149 | }, 150 | "overrides": [] 151 | }, 152 | "gridPos": { 153 | "h": 8, 154 | "w": 12, 155 | "x": 12, 156 | "y": 10 157 | }, 158 | "id": 8, 159 | "options": { 160 | "graph": {}, 161 | "legend": { 162 | "calcs": ["mean"], 163 | "displayMode": "table", 164 | "placement": "bottom" 165 | }, 166 | "tooltipOptions": { 167 | "mode": "multi" 168 | } 169 | }, 170 | "pluginVersion": "7.5.7", 171 | "targets": [ 172 | { 173 | "exemplar": true, 174 | "expr": "rate(BENTOML_sklearn:elastic:net:reg_request_total{endpoint=\"/inference\"}[5m])", 175 | "interval": "", 176 | "legendFormat": "", 177 | "refId": "A" 178 | } 179 | ], 180 | "timeFrom": null, 181 | "timeShift": null, 182 | "title": "request_total", 183 | "type": "timeseries" 184 | } 185 | ], 186 | "refresh": "5s", 187 | "schemaVersion": 27, 188 | "style": "dark", 189 | "tags": [], 190 | "templating": { 191 | "list": [] 192 | }, 193 | "time": { 194 | "from": "now-5m", 195 | "to": "now" 196 | }, 197 | "timepicker": {}, 198 | "timezone": "", 199 | "title": "BentoML Dashboard", 200 | "uid": "Wsy8ZNgnz", 201 | "version": 1 202 | } 203 | -------------------------------------------------------------------------------- /monitoring_service/dashboards/classification_performance.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "target": { 12 | "limit": 100, 13 | "matchAny": false, 14 | "tags": [], 15 | "type": "dashboard" 16 | }, 17 | "type": 
"dashboard" 18 | } 19 | ] 20 | }, 21 | "editable": true, 22 | "gnetId": null, 23 | "graphTooltip": 0, 24 | "iteration": 1644849809944, 25 | "links": [], 26 | "panels": [ 27 | { 28 | "collapsed": false, 29 | "datasource": null, 30 | "gridPos": { 31 | "h": 1, 32 | "w": 24, 33 | "x": 0, 34 | "y": 0 35 | }, 36 | "id": 126, 37 | "panels": [], 38 | "title": "Reference dataset data", 39 | "type": "row" 40 | }, 41 | { 42 | "datasource": "Prometheus", 43 | "fieldConfig": { 44 | "defaults": { 45 | "color": { 46 | "mode": "palette-classic" 47 | }, 48 | "custom": { 49 | "axisLabel": "", 50 | "axisPlacement": "auto", 51 | "barAlignment": 0, 52 | "drawStyle": "line", 53 | "fillOpacity": 0, 54 | "gradientMode": "none", 55 | "hideFrom": { 56 | "legend": false, 57 | "tooltip": false, 58 | "viz": false 59 | }, 60 | "lineInterpolation": "linear", 61 | "lineWidth": 1, 62 | "pointSize": 5, 63 | "scaleDistribution": { 64 | "type": "linear" 65 | }, 66 | "showPoints": "never", 67 | "spanNulls": false, 68 | "stacking": { 69 | "group": "A", 70 | "mode": "none" 71 | }, 72 | "thresholdsStyle": { 73 | "mode": "off" 74 | } 75 | }, 76 | "mappings": [], 77 | "max": 1, 78 | "min": 0, 79 | "thresholds": { 80 | "mode": "absolute", 81 | "steps": [ 82 | { 83 | "color": "green", 84 | "value": null 85 | }, 86 | { 87 | "color": "red", 88 | "value": 80 89 | } 90 | ] 91 | } 92 | }, 93 | "overrides": [] 94 | }, 95 | "gridPos": { 96 | "h": 11, 97 | "w": 24, 98 | "x": 0, 99 | "y": 1 100 | }, 101 | "id": 2, 102 | "options": { 103 | "legend": { 104 | "calcs": [], 105 | "displayMode": "table", 106 | "placement": "right" 107 | }, 108 | "tooltip": { 109 | "mode": "single" 110 | } 111 | }, 112 | "targets": [ 113 | { 114 | "exemplar": true, 115 | "expr": "evidently:classification_performance:quality{dataset=\"current\"}", 116 | "interval": "", 117 | "legendFormat": "{{metric}}", 118 | "refId": "A" 119 | } 120 | ], 121 | "title": "Quality", 122 | "type": "timeseries" 123 | }, 124 | { 125 | "datasource": "Prometheus", 126 | "fieldConfig": { 127 | "defaults": { 128 | "color": { 129 | "mode": "thresholds" 130 | }, 131 | "mappings": [], 132 | "max": 1, 133 | "min": 0, 134 | "thresholds": { 135 | "mode": "absolute", 136 | "steps": [ 137 | { 138 | "color": "red", 139 | "value": null 140 | }, 141 | { 142 | "color": "green", 143 | "value": 0.5 144 | } 145 | ] 146 | } 147 | }, 148 | "overrides": [] 149 | }, 150 | "gridPos": { 151 | "h": 8, 152 | "w": 6, 153 | "x": 0, 154 | "y": 12 155 | }, 156 | "id": 4, 157 | "options": { 158 | "colorMode": "value", 159 | "graphMode": "area", 160 | "justifyMode": "auto", 161 | "orientation": "auto", 162 | "reduceOptions": { 163 | "calcs": ["lastNotNull"], 164 | "fields": "", 165 | "values": false 166 | }, 167 | "text": {}, 168 | "textMode": "auto" 169 | }, 170 | "pluginVersion": "8.1.5", 171 | "repeat": "metric", 172 | "repeatDirection": "h", 173 | "targets": [ 174 | { 175 | "exemplar": true, 176 | "expr": "evidently:classification_performance:quality{dataset=\"current\", metric=\"$metric\"}", 177 | "interval": "", 178 | "legendFormat": "", 179 | "refId": "A" 180 | } 181 | ], 182 | "title": "$metric", 183 | "type": "stat" 184 | }, 185 | { 186 | "datasource": "Prometheus", 187 | "fieldConfig": { 188 | "defaults": { 189 | "color": { 190 | "mode": "thresholds" 191 | }, 192 | "mappings": [], 193 | "thresholds": { 194 | "mode": "absolute", 195 | "steps": [ 196 | { 197 | "color": "green", 198 | "value": null 199 | } 200 | ] 201 | } 202 | }, 203 | "overrides": [] 204 | }, 205 | "gridPos": { 206 | "h": 8, 207 | "w": 12, 208 
| "x": 0, 209 | "y": 20 210 | }, 211 | "id": 171, 212 | "options": { 213 | "displayMode": "basic", 214 | "orientation": "auto", 215 | "reduceOptions": { 216 | "calcs": ["lastNotNull"], 217 | "fields": "", 218 | "values": false 219 | }, 220 | "showUnfilled": true, 221 | "text": {} 222 | }, 223 | "pluginVersion": "8.1.5", 224 | "targets": [ 225 | { 226 | "exemplar": true, 227 | "expr": "evidently:classification_performance:class_representation{dataset=\"current\", type=\"prediction\"}", 228 | "interval": "", 229 | "legendFormat": "{{class_name}}", 230 | "refId": "A" 231 | } 232 | ], 233 | "title": "Prediction class representation", 234 | "type": "bargauge" 235 | }, 236 | { 237 | "datasource": "Prometheus", 238 | "fieldConfig": { 239 | "defaults": { 240 | "color": { 241 | "mode": "thresholds" 242 | }, 243 | "mappings": [], 244 | "thresholds": { 245 | "mode": "absolute", 246 | "steps": [ 247 | { 248 | "color": "green", 249 | "value": null 250 | } 251 | ] 252 | } 253 | }, 254 | "overrides": [] 255 | }, 256 | "gridPos": { 257 | "h": 8, 258 | "w": 12, 259 | "x": 12, 260 | "y": 20 261 | }, 262 | "id": 215, 263 | "options": { 264 | "displayMode": "basic", 265 | "orientation": "auto", 266 | "reduceOptions": { 267 | "calcs": ["lastNotNull"], 268 | "fields": "", 269 | "values": false 270 | }, 271 | "showUnfilled": true, 272 | "text": {} 273 | }, 274 | "pluginVersion": "8.1.5", 275 | "targets": [ 276 | { 277 | "exemplar": true, 278 | "expr": "evidently:classification_performance:class_representation{dataset=\"current\", type=\"target\"}", 279 | "interval": "", 280 | "legendFormat": "{{class_name}}", 281 | "refId": "A" 282 | } 283 | ], 284 | "title": "Target class representation", 285 | "type": "bargauge" 286 | }, 287 | { 288 | "collapsed": false, 289 | "datasource": null, 290 | "gridPos": { 291 | "h": 1, 292 | "w": 24, 293 | "x": 0, 294 | "y": 28 295 | }, 296 | "id": 303, 297 | "panels": [], 298 | "repeat": "classes", 299 | "title": "Class $classes information ", 300 | "type": "row" 301 | }, 302 | { 303 | "datasource": "Prometheus", 304 | "description": "", 305 | "fieldConfig": { 306 | "defaults": { 307 | "color": { 308 | "mode": "thresholds" 309 | }, 310 | "mappings": [], 311 | "thresholds": { 312 | "mode": "absolute", 313 | "steps": [ 314 | { 315 | "color": "green", 316 | "value": null 317 | } 318 | ] 319 | } 320 | }, 321 | "overrides": [ 322 | { 323 | "matcher": { 324 | "id": "byName", 325 | "options": "FN" 326 | }, 327 | "properties": [ 328 | { 329 | "id": "color", 330 | "value": { 331 | "fixedColor": "red", 332 | "mode": "fixed" 333 | } 334 | } 335 | ] 336 | }, 337 | { 338 | "matcher": { 339 | "id": "byName", 340 | "options": "FP" 341 | }, 342 | "properties": [ 343 | { 344 | "id": "color", 345 | "value": { 346 | "fixedColor": "red", 347 | "mode": "fixed" 348 | } 349 | } 350 | ] 351 | } 352 | ] 353 | }, 354 | "gridPos": { 355 | "h": 6, 356 | "w": 5, 357 | "x": 0, 358 | "y": 29 359 | }, 360 | "id": 309, 361 | "maxPerRow": 6, 362 | "options": { 363 | "displayMode": "gradient", 364 | "orientation": "auto", 365 | "reduceOptions": { 366 | "calcs": ["lastNotNull"], 367 | "fields": "", 368 | "values": false 369 | }, 370 | "showUnfilled": true, 371 | "text": {} 372 | }, 373 | "pluginVersion": "8.1.5", 374 | "repeatDirection": "h", 375 | "targets": [ 376 | { 377 | "exemplar": true, 378 | "expr": "evidently:classification_performance:class_confusion{dataset=\"current\", class_name=\"${classes:raw}\"}", 379 | "format": "time_series", 380 | "instant": false, 381 | "interval": "", 382 | "legendFormat": 
"{{metric}}", 383 | "refId": "A" 384 | } 385 | ], 386 | "title": "Confusion $classes", 387 | "transformations": [], 388 | "type": "bargauge" 389 | }, 390 | { 391 | "datasource": "Prometheus", 392 | "fieldConfig": { 393 | "defaults": { 394 | "color": { 395 | "mode": "palette-classic" 396 | }, 397 | "custom": { 398 | "axisLabel": "", 399 | "axisPlacement": "auto", 400 | "barAlignment": 0, 401 | "drawStyle": "line", 402 | "fillOpacity": 0, 403 | "gradientMode": "none", 404 | "hideFrom": { 405 | "legend": false, 406 | "tooltip": false, 407 | "viz": false 408 | }, 409 | "lineInterpolation": "linear", 410 | "lineWidth": 1, 411 | "pointSize": 5, 412 | "scaleDistribution": { 413 | "type": "linear" 414 | }, 415 | "showPoints": "auto", 416 | "spanNulls": false, 417 | "stacking": { 418 | "group": "A", 419 | "mode": "none" 420 | }, 421 | "thresholdsStyle": { 422 | "mode": "off" 423 | } 424 | }, 425 | "mappings": [], 426 | "min": 0, 427 | "thresholds": { 428 | "mode": "absolute", 429 | "steps": [ 430 | { 431 | "color": "green", 432 | "value": null 433 | }, 434 | { 435 | "color": "red", 436 | "value": 80 437 | } 438 | ] 439 | } 440 | }, 441 | "overrides": [] 442 | }, 443 | "gridPos": { 444 | "h": 6, 445 | "w": 9, 446 | "x": 5, 447 | "y": 29 448 | }, 449 | "id": 413, 450 | "options": { 451 | "legend": { 452 | "calcs": [], 453 | "displayMode": "table", 454 | "placement": "right" 455 | }, 456 | "tooltip": { 457 | "mode": "single" 458 | } 459 | }, 460 | "targets": [ 461 | { 462 | "exemplar": true, 463 | "expr": "evidently:classification_performance:class_confusion{dataset=\"current\", class_name=\"${classes:raw}\"}", 464 | "interval": "", 465 | "legendFormat": "{{metric}}", 466 | "refId": "A" 467 | } 468 | ], 469 | "title": "Confusion in time", 470 | "type": "timeseries" 471 | }, 472 | { 473 | "datasource": "Prometheus", 474 | "fieldConfig": { 475 | "defaults": { 476 | "color": { 477 | "mode": "palette-classic" 478 | }, 479 | "custom": { 480 | "axisLabel": "", 481 | "axisPlacement": "auto", 482 | "barAlignment": 0, 483 | "drawStyle": "line", 484 | "fillOpacity": 0, 485 | "gradientMode": "none", 486 | "hideFrom": { 487 | "legend": false, 488 | "tooltip": false, 489 | "viz": false 490 | }, 491 | "lineInterpolation": "linear", 492 | "lineWidth": 1, 493 | "pointSize": 5, 494 | "scaleDistribution": { 495 | "type": "linear" 496 | }, 497 | "showPoints": "auto", 498 | "spanNulls": false, 499 | "stacking": { 500 | "group": "A", 501 | "mode": "none" 502 | }, 503 | "thresholdsStyle": { 504 | "mode": "off" 505 | } 506 | }, 507 | "mappings": [], 508 | "max": 1, 509 | "min": 0, 510 | "thresholds": { 511 | "mode": "absolute", 512 | "steps": [ 513 | { 514 | "color": "green", 515 | "value": null 516 | }, 517 | { 518 | "color": "red", 519 | "value": 80 520 | } 521 | ] 522 | } 523 | }, 524 | "overrides": [] 525 | }, 526 | "gridPos": { 527 | "h": 6, 528 | "w": 10, 529 | "x": 14, 530 | "y": 29 531 | }, 532 | "id": 308, 533 | "options": { 534 | "legend": { 535 | "calcs": [], 536 | "displayMode": "list", 537 | "placement": "bottom" 538 | }, 539 | "tooltip": { 540 | "mode": "single" 541 | } 542 | }, 543 | "targets": [ 544 | { 545 | "exemplar": true, 546 | "expr": "evidently:classification_performance:class_quality{dataset=\"current\", class_name=\"${classes:raw}\"}", 547 | "interval": "", 548 | "legendFormat": "{{metric}}", 549 | "refId": "A" 550 | } 551 | ], 552 | "title": "Quality", 553 | "type": "timeseries" 554 | } 555 | ], 556 | "refresh": "5s", 557 | "schemaVersion": 30, 558 | "style": "dark", 559 | "tags": [], 560 | 
"templating": { 561 | "list": [ 562 | { 563 | "allValue": null, 564 | "current": { 565 | "selected": false, 566 | "text": ["All"], 567 | "value": ["$__all"] 568 | }, 569 | "datasource": "Prometheus", 570 | "definition": "label_values(evidently:classification_performance:quality, metric)", 571 | "description": null, 572 | "error": null, 573 | "hide": 0, 574 | "includeAll": true, 575 | "label": null, 576 | "multi": true, 577 | "name": "metric", 578 | "options": [], 579 | "query": { 580 | "query": "label_values(evidently:classification_performance:quality, metric)", 581 | "refId": "StandardVariableQuery" 582 | }, 583 | "refresh": 1, 584 | "regex": "", 585 | "skipUrlSync": false, 586 | "sort": 0, 587 | "type": "query" 588 | }, 589 | { 590 | "allValue": null, 591 | "current": { 592 | "selected": false, 593 | "text": [""], 594 | "value": [""] 595 | }, 596 | "datasource": "Prometheus", 597 | "definition": "label_values(evidently:classification_performance:class_confusion, class_name)", 598 | "description": null, 599 | "error": null, 600 | "hide": 0, 601 | "includeAll": true, 602 | "label": null, 603 | "multi": true, 604 | "name": "classes", 605 | "options": [], 606 | "query": { 607 | "query": "label_values(evidently:classification_performance:class_confusion, class_name)", 608 | "refId": "StandardVariableQuery" 609 | }, 610 | "refresh": 1, 611 | "regex": "", 612 | "skipUrlSync": false, 613 | "sort": 0, 614 | "type": "query" 615 | } 616 | ] 617 | }, 618 | "time": { 619 | "from": "now-30m", 620 | "to": "now" 621 | }, 622 | "timepicker": {}, 623 | "timezone": "", 624 | "title": "Evidently Classification Performance Dashboard", 625 | "uid": "5doIRYank", 626 | "version": 1 627 | } 628 | -------------------------------------------------------------------------------- /monitoring_service/dashboards/data_drift.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "target": { 12 | "limit": 100, 13 | "matchAny": false, 14 | "tags": [], 15 | "type": "dashboard" 16 | }, 17 | "type": "dashboard" 18 | } 19 | ] 20 | }, 21 | "editable": true, 22 | "fiscalYearStartMonth": 0, 23 | "graphTooltip": 0, 24 | "iteration": 1651157034829, 25 | "links": [], 26 | "liveNow": false, 27 | "panels": [ 28 | { 29 | "gridPos": { 30 | "h": 3, 31 | "w": 24, 32 | "x": 0, 33 | "y": 0 34 | }, 35 | "id": 14, 36 | "options": { 37 | "content": "# Evidently Data Drift Monitoring\n\n Here can be your information about dashboard and its usage.\n", 38 | "mode": "markdown" 39 | }, 40 | "pluginVersion": "8.3.6", 41 | "type": "text" 42 | }, 43 | { 44 | "collapsed": false, 45 | "gridPos": { 46 | "h": 1, 47 | "w": 24, 48 | "x": 0, 49 | "y": 3 50 | }, 51 | "id": 12, 52 | "panels": [], 53 | "title": "General information", 54 | "type": "row" 55 | }, 56 | { 57 | "datasource": { 58 | "type": "prometheus", 59 | "uid": "PBFA97CFB590B2093" 60 | }, 61 | "description": "", 62 | "fieldConfig": { 63 | "defaults": { 64 | "color": { 65 | "mode": "fixed" 66 | }, 67 | "custom": { 68 | "fillOpacity": 70, 69 | "lineWidth": 0 70 | }, 71 | "mappings": [ 72 | { 73 | "options": { 74 | "0": { 75 | "color": "green", 76 | "index": 1, 77 | "text": "No Data Drift" 78 | }, 79 | "1": { 80 | "color": "red", 81 | "index": 0, 82 | "text": "Data Drift Detected" 83 | } 84 | }, 85 | "type": "value" 86 | } 87 | ], 88 | "thresholds": 
{ 89 | "mode": "absolute", 90 | "steps": [ 91 | { 92 | "color": "green", 93 | "value": null 94 | }, 95 | { 96 | "color": "red", 97 | "value": 1 98 | } 99 | ] 100 | } 101 | }, 102 | "overrides": [] 103 | }, 104 | "gridPos": { 105 | "h": 7, 106 | "w": 24, 107 | "x": 0, 108 | "y": 4 109 | }, 110 | "id": 2, 111 | "options": { 112 | "alignValue": "center", 113 | "legend": { 114 | "displayMode": "hidden", 115 | "placement": "bottom" 116 | }, 117 | "mergeValues": true, 118 | "rowHeight": 0.9, 119 | "showValue": "auto", 120 | "tooltip": { 121 | "mode": "single" 122 | } 123 | }, 124 | "targets": [ 125 | { 126 | "datasource": { 127 | "type": "prometheus", 128 | "uid": "PBFA97CFB590B2093" 129 | }, 130 | "exemplar": true, 131 | "expr": "evidently:data_drift:dataset_drift{dataset_name=\"${dataset_name:raw}\"}", 132 | "instant": false, 133 | "interval": "", 134 | "legendFormat": "dataset drift", 135 | "refId": "A" 136 | } 137 | ], 138 | "title": "Dataset Drift", 139 | "type": "state-timeline" 140 | }, 141 | { 142 | "datasource": { 143 | "type": "prometheus", 144 | "uid": "PBFA97CFB590B2093" 145 | }, 146 | "fieldConfig": { 147 | "defaults": { 148 | "color": { 149 | "mode": "palette-classic" 150 | }, 151 | "custom": { 152 | "axisLabel": "", 153 | "axisPlacement": "auto", 154 | "barAlignment": 0, 155 | "drawStyle": "line", 156 | "fillOpacity": 0, 157 | "gradientMode": "none", 158 | "hideFrom": { 159 | "legend": false, 160 | "tooltip": false, 161 | "viz": false 162 | }, 163 | "lineInterpolation": "linear", 164 | "lineWidth": 1, 165 | "pointSize": 5, 166 | "scaleDistribution": { 167 | "type": "linear" 168 | }, 169 | "showPoints": "auto", 170 | "spanNulls": false, 171 | "stacking": { 172 | "group": "A", 173 | "mode": "none" 174 | }, 175 | "thresholdsStyle": { 176 | "mode": "off" 177 | } 178 | }, 179 | "mappings": [], 180 | "max": 1, 181 | "min": 0, 182 | "thresholds": { 183 | "mode": "absolute", 184 | "steps": [ 185 | { 186 | "color": "green", 187 | "value": null 188 | }, 189 | { 190 | "color": "red", 191 | "value": 0.8 192 | } 193 | ] 194 | }, 195 | "unit": "percentunit" 196 | }, 197 | "overrides": [] 198 | }, 199 | "gridPos": { 200 | "h": 7, 201 | "w": 12, 202 | "x": 0, 203 | "y": 11 204 | }, 205 | "id": 6, 206 | "options": { 207 | "legend": { 208 | "calcs": [], 209 | "displayMode": "hidden", 210 | "placement": "bottom" 211 | }, 212 | "tooltip": { 213 | "mode": "single" 214 | } 215 | }, 216 | "targets": [ 217 | { 218 | "datasource": { 219 | "type": "prometheus", 220 | "uid": "PBFA97CFB590B2093" 221 | }, 222 | "exemplar": true, 223 | "expr": "evidently:data_drift:share_drifted_features{dataset_name=\"${dataset_name:raw}\"}", 224 | "interval": "", 225 | "legendFormat": "share", 226 | "refId": "A" 227 | } 228 | ], 229 | "title": "Share of drifted features", 230 | "type": "timeseries" 231 | }, 232 | { 233 | "datasource": { 234 | "type": "prometheus", 235 | "uid": "PBFA97CFB590B2093" 236 | }, 237 | "fieldConfig": { 238 | "defaults": { 239 | "color": { 240 | "mode": "thresholds" 241 | }, 242 | "mappings": [], 243 | "thresholds": { 244 | "mode": "absolute", 245 | "steps": [ 246 | { 247 | "color": "green", 248 | "value": null 249 | }, 250 | { 251 | "color": "red", 252 | "value": 0 253 | } 254 | ] 255 | }, 256 | "unit": "none" 257 | }, 258 | "overrides": [] 259 | }, 260 | "gridPos": { 261 | "h": 7, 262 | "w": 6, 263 | "x": 12, 264 | "y": 11 265 | }, 266 | "id": 8, 267 | "options": { 268 | "colorMode": "value", 269 | "graphMode": "area", 270 | "justifyMode": "center", 271 | "orientation": "auto", 272 | 
"reduceOptions": { 273 | "calcs": ["lastNotNull"], 274 | "fields": "/^Drifted$/", 275 | "values": false 276 | }, 277 | "text": {}, 278 | "textMode": "auto" 279 | }, 280 | "pluginVersion": "8.3.6", 281 | "targets": [ 282 | { 283 | "datasource": { 284 | "type": "prometheus", 285 | "uid": "PBFA97CFB590B2093" 286 | }, 287 | "exemplar": true, 288 | "expr": "evidently:data_drift:n_drifted_features{dataset_name=\"${dataset_name:raw}\"}", 289 | "format": "time_series", 290 | "hide": false, 291 | "instant": false, 292 | "interval": "", 293 | "legendFormat": "Drifted", 294 | "refId": "B" 295 | } 296 | ], 297 | "title": "# of drifted features", 298 | "transformations": [], 299 | "type": "stat" 300 | }, 301 | { 302 | "datasource": { 303 | "type": "prometheus", 304 | "uid": "PBFA97CFB590B2093" 305 | }, 306 | "fieldConfig": { 307 | "defaults": { 308 | "color": { 309 | "mode": "thresholds" 310 | }, 311 | "mappings": [], 312 | "thresholds": { 313 | "mode": "absolute", 314 | "steps": [ 315 | { 316 | "color": "green", 317 | "value": null 318 | }, 319 | { 320 | "color": "red", 321 | "value": 80 322 | } 323 | ] 324 | }, 325 | "unit": "none" 326 | }, 327 | "overrides": [] 328 | }, 329 | "gridPos": { 330 | "h": 7, 331 | "w": 6, 332 | "x": 18, 333 | "y": 11 334 | }, 335 | "id": 15, 336 | "options": { 337 | "colorMode": "value", 338 | "graphMode": "area", 339 | "justifyMode": "auto", 340 | "orientation": "auto", 341 | "reduceOptions": { 342 | "calcs": ["lastNotNull"], 343 | "fields": "", 344 | "values": false 345 | }, 346 | "text": {}, 347 | "textMode": "auto" 348 | }, 349 | "pluginVersion": "8.3.6", 350 | "targets": [ 351 | { 352 | "datasource": { 353 | "type": "prometheus", 354 | "uid": "PBFA97CFB590B2093" 355 | }, 356 | "exemplar": true, 357 | "expr": "evidently:data_drift:n_drifted_features{dataset_name=\"${dataset_name:raw}\"} / evidently:data_drift:share_drifted_features{dataset_name=\"${dataset_name:raw}\"}", 358 | "format": "time_series", 359 | "hide": false, 360 | "instant": false, 361 | "interval": "", 362 | "legendFormat": "Total", 363 | "refId": "A" 364 | } 365 | ], 366 | "title": "# of features", 367 | "transformations": [], 368 | "type": "stat" 369 | }, 370 | { 371 | "collapsed": false, 372 | "gridPos": { 373 | "h": 1, 374 | "w": 24, 375 | "x": 0, 376 | "y": 18 377 | }, 378 | "id": 10, 379 | "panels": [], 380 | "title": "Detailed information", 381 | "type": "row" 382 | }, 383 | { 384 | "datasource": { 385 | "type": "prometheus", 386 | "uid": "PBFA97CFB590B2093" 387 | }, 388 | "fieldConfig": { 389 | "defaults": { 390 | "color": { 391 | "mode": "palette-classic" 392 | }, 393 | "custom": { 394 | "axisLabel": "", 395 | "axisPlacement": "auto", 396 | "barAlignment": 0, 397 | "drawStyle": "line", 398 | "fillOpacity": 0, 399 | "gradientMode": "none", 400 | "hideFrom": { 401 | "legend": false, 402 | "tooltip": false, 403 | "viz": false 404 | }, 405 | "lineInterpolation": "linear", 406 | "lineStyle": { 407 | "fill": "solid" 408 | }, 409 | "lineWidth": 1, 410 | "pointSize": 5, 411 | "scaleDistribution": { 412 | "type": "linear" 413 | }, 414 | "showPoints": "never", 415 | "spanNulls": false, 416 | "stacking": { 417 | "group": "A", 418 | "mode": "none" 419 | }, 420 | "thresholdsStyle": { 421 | "mode": "off" 422 | } 423 | }, 424 | "mappings": [], 425 | "thresholds": { 426 | "mode": "absolute", 427 | "steps": [ 428 | { 429 | "color": "green", 430 | "value": null 431 | }, 432 | { 433 | "color": "red", 434 | "value": 80 435 | } 436 | ] 437 | } 438 | }, 439 | "overrides": [ 440 | { 441 | "matcher": { 442 | "id": 
"byFrameRefID", 443 | "options": "B" 444 | }, 445 | "properties": [ 446 | { 447 | "id": "unit", 448 | "value": "none" 449 | }, 450 | { 451 | "id": "min", 452 | "value": 0 453 | }, 454 | { 455 | "id": "max", 456 | "value": 1 457 | }, 458 | { 459 | "id": "custom.lineWidth", 460 | "value": 8 461 | }, 462 | { 463 | "id": "custom.axisPlacement", 464 | "value": "hidden" 465 | } 466 | ] 467 | } 468 | ] 469 | }, 470 | "gridPos": { 471 | "h": 11, 472 | "w": 24, 473 | "x": 0, 474 | "y": 19 475 | }, 476 | "id": 4, 477 | "options": { 478 | "legend": { 479 | "calcs": [], 480 | "displayMode": "table", 481 | "placement": "right" 482 | }, 483 | "tooltip": { 484 | "mode": "single" 485 | } 486 | }, 487 | "targets": [ 488 | { 489 | "datasource": { 490 | "type": "prometheus", 491 | "uid": "PBFA97CFB590B2093" 492 | }, 493 | "exemplar": true, 494 | "expr": "evidently:data_drift:p_value{dataset_name=\"${dataset_name:raw}\"}", 495 | "interval": "", 496 | "legendFormat": "{{feature}}", 497 | "refId": "A" 498 | }, 499 | { 500 | "datasource": { 501 | "type": "prometheus", 502 | "uid": "PBFA97CFB590B2093" 503 | }, 504 | "exemplar": true, 505 | "expr": "evidently:reference_dataset_hash", 506 | "hide": false, 507 | "interval": "", 508 | "intervalFactor": 1, 509 | "legendFormat": "reference_hash", 510 | "refId": "B" 511 | } 512 | ], 513 | "title": "P-value of features", 514 | "type": "timeseries" 515 | } 516 | ], 517 | "refresh": "5s", 518 | "schemaVersion": 34, 519 | "style": "dark", 520 | "tags": [], 521 | "templating": { 522 | "list": [ 523 | { 524 | "current": { 525 | "selected": true, 526 | "text": "bike_other", 527 | "value": "bike_other" 528 | }, 529 | "datasource": { 530 | "type": "prometheus", 531 | "uid": "PBFA97CFB590B2093" 532 | }, 533 | "definition": "label_values(dataset_name)", 534 | "hide": 0, 535 | "includeAll": false, 536 | "multi": false, 537 | "name": "dataset_name", 538 | "options": [], 539 | "query": { 540 | "query": "label_values(dataset_name)", 541 | "refId": "StandardVariableQuery" 542 | }, 543 | "refresh": 1, 544 | "regex": "", 545 | "skipUrlSync": false, 546 | "sort": 0, 547 | "type": "query" 548 | } 549 | ] 550 | }, 551 | "time": { 552 | "from": "now-30m", 553 | "to": "now" 554 | }, 555 | "timepicker": {}, 556 | "timezone": "", 557 | "title": "Evidently Data Drift Dashboard", 558 | "uid": "U54hsxv7k", 559 | "version": 2, 560 | "weekStart": "" 561 | } 562 | -------------------------------------------------------------------------------- /monitoring_service/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLOpsVN/mlops-crash-course-code/8124d3c6afe344ff0df618ac99fcb1e5b1148be0/monitoring_service/data/.gitkeep -------------------------------------------------------------------------------- /monitoring_service/data/orig_driver_stats.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLOpsVN/mlops-crash-course-code/8124d3c6afe344ff0df618ac99fcb1e5b1148be0/monitoring_service/data/orig_driver_stats.parquet -------------------------------------------------------------------------------- /monitoring_service/deployment/.env: -------------------------------------------------------------------------------- 1 | export DOCKER_USER="mlopsvn" 2 | export MONITORING_SERVICE_PORT="8309" 3 | -------------------------------------------------------------------------------- /monitoring_service/deployment/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM python:3.9-slim AS build 2 | 3 | RUN apt-get update 4 | 5 | RUN python -m venv /opt/venv 6 | ENV PATH="/opt/venv/bin:$PATH" 7 | 8 | COPY deployment/requirements.txt . 9 | RUN pip install -r requirements.txt 10 | 11 | FROM python:3.9-slim 12 | 13 | COPY --from=build /opt/venv /opt/venv 14 | ENV PATH="/opt/venv/bin:$PATH" 15 | COPY . /monitoring_service 16 | WORKDIR /monitoring_service 17 | -------------------------------------------------------------------------------- /monitoring_service/deployment/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cmd=$1 4 | 5 | # constants 6 | DOCKER_USER="$DOCKER_USER" 7 | PROJECT="mlops_crash_course" 8 | IMAGE_NAME="monitoring_service" 9 | IMAGE_TAG=$(git describe --always) 10 | 11 | if [[ -z "$DOCKER_USER" ]]; then 12 | echo "Missing \$DOCKER_USER env var" 13 | exit 1 14 | fi 15 | 16 | usage() { 17 | echo "deploy.sh " 18 | echo "Available commands:" 19 | echo " build build image" 20 | echo " push push image" 21 | echo " build_push build and push image" 22 | echo " compose_up up docker compose" 23 | echo " compose_down down docker compose" 24 | } 25 | 26 | if [[ -z "$cmd" ]]; then 27 | echo "Missing command" 28 | usage 29 | exit 1 30 | fi 31 | 32 | build() { 33 | docker build --tag $DOCKER_USER/$PROJECT/$IMAGE_NAME:$IMAGE_TAG -f deployment/Dockerfile . 34 | docker tag $DOCKER_USER/$PROJECT/$IMAGE_NAME:$IMAGE_TAG $DOCKER_USER/$PROJECT/$IMAGE_NAME:latest 35 | } 36 | 37 | push() { 38 | docker push $DOCKER_USER/$PROJECT/$IMAGE_NAME:$IMAGE_TAG 39 | docker push $DOCKER_USER/$PROJECT/$IMAGE_NAME:latest 40 | } 41 | 42 | compose_up() { 43 | docker-compose --env-file ./deployment/.env -f ./deployment/docker-compose.yml up -d 44 | } 45 | 46 | compose_down() { 47 | docker-compose --env-file ./deployment/.env -f ./deployment/docker-compose.yml down 48 | } 49 | 50 | shift 51 | 52 | case $cmd in 53 | build) 54 | build "$@" 55 | ;; 56 | push) 57 | push "$@" 58 | ;; 59 | build_push) 60 | build "$@" 61 | push "$@" 62 | ;; 63 | compose_up) 64 | compose_up "$@" 65 | ;; 66 | compose_down) 67 | compose_down "$@" 68 | ;; 69 | *) 70 | echo -n "Unknown command: $cmd" 71 | usage 72 | exit 1 73 | ;; 74 | esac 75 | -------------------------------------------------------------------------------- /monitoring_service/deployment/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | services: 3 | monitoring: 4 | image: $DOCKER_USER/mlops_crash_course/monitoring_service:latest 5 | container_name: monitoring_service 6 | restart: on-failure 7 | volumes: 8 | - ../data:/monitoring_service/data 9 | - ../data_sources:/monitoring_service/data_sources 10 | command: python src/monitoring_service.py --port=$MONITORING_SERVICE_PORT 11 | ports: 12 | - $MONITORING_SERVICE_PORT:$MONITORING_SERVICE_PORT 13 | -------------------------------------------------------------------------------- /monitoring_service/deployment/requirements.txt: -------------------------------------------------------------------------------- 1 | evidently==0.1.57.dev0 2 | fastparquet==0.8.3 3 | Flask==2.2.2 4 | pandas==1.4.4 5 | prometheus_client==0.14.1 6 | python-dotenv==0.21.0 7 | -------------------------------------------------------------------------------- /monitoring_service/dev_requirements.txt: -------------------------------------------------------------------------------- 1 | black==22.6.0 2 | 
evidently==0.1.57.dev0 3 | fastparquet==0.8.3 4 | feast[redis]==0.24.0 5 | Flask==2.2.2 6 | jupyterlab==3.4.5 7 | pandas==1.4.4 8 | prometheus_client==0.14.1 9 | pyspark==3.0.1 10 | python-dotenv==0.21.0 11 | -------------------------------------------------------------------------------- /monitoring_service/nbs/test_datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "/Users/tung.dao/tung/mlopsvn/code/mlops-crash-course-code/monitoring_service/nbs\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "!pwd" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import pandas as pd\n", 27 | "import fastparquet\n", 28 | "from pathlib import Path\n", 29 | "import numpy as np\n", 30 | "\n", 31 | "random_seed = 17\n", 32 | "np.random.seed(random_seed)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "092197b6-1dc5-4154-98e7-6ad74c75b88e", 38 | "metadata": {}, 39 | "source": [ 40 | "## Load data" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "OUTSIDE_DATA_DIR = Path(\"../data\")\n", 50 | "ORIG_DATA_PATH = OUTSIDE_DATA_DIR / \"mock_normal_data.parquet\"\n", 51 | "DRIFT_DATA_PATH = OUTSIDE_DATA_DIR / \"mock_drift_data.parquet\"\n", 52 | "REQUEST_DATA_PATH = OUTSIDE_DATA_DIR / \"mock_request_data.csv\"" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 4, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/html": [ 63 | "
\n", 64 | "\n", 77 | "\n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | "
datetimedriver_idconv_rateacc_rateavg_daily_tripscreated
02021-07-19 23:00:00+00:0010010.1863410.2268791072021-07-28 11:08:04.802
12021-07-18 06:00:00+00:0010020.0710320.2294902502021-07-28 11:08:04.802
22021-07-28 09:00:00+00:0010030.0500000.1928641032021-07-28 11:08:04.802
32021-07-27 10:00:00+00:0010040.1843320.050000492021-07-28 11:08:04.802
42021-07-23 05:00:00+00:0010050.2500000.2500002462021-07-28 11:08:04.802
\n", 137 | "
" 138 | ], 139 | "text/plain": [ 140 | " datetime driver_id conv_rate acc_rate avg_daily_trips \\\n", 141 | "0 2021-07-19 23:00:00+00:00 1001 0.186341 0.226879 107 \n", 142 | "1 2021-07-18 06:00:00+00:00 1002 0.071032 0.229490 250 \n", 143 | "2 2021-07-28 09:00:00+00:00 1003 0.050000 0.192864 103 \n", 144 | "3 2021-07-27 10:00:00+00:00 1004 0.184332 0.050000 49 \n", 145 | "4 2021-07-23 05:00:00+00:00 1005 0.250000 0.250000 246 \n", 146 | "\n", 147 | " created \n", 148 | "0 2021-07-28 11:08:04.802 \n", 149 | "1 2021-07-28 11:08:04.802 \n", 150 | "2 2021-07-28 11:08:04.802 \n", 151 | "3 2021-07-28 11:08:04.802 \n", 152 | "4 2021-07-28 11:08:04.802 " 153 | ] 154 | }, 155 | "execution_count": 4, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "normal_df = pd.read_parquet(ORIG_DATA_PATH, engine='fastparquet')\n", 162 | "normal_df" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 5, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "data": { 172 | "text/html": [ 173 | "
\n", 174 | "\n", 187 | "\n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | "
datetimedriver_idconv_rateacc_rateavg_daily_tripscreated
02021-07-19 23:00:00+00:0010010.8863410.9268798072021-07-28 11:08:04.802
12021-07-18 06:00:00+00:0010020.7710320.9294909502021-07-28 11:08:04.802
22021-07-28 09:00:00+00:0010030.7500000.8928648032021-07-28 11:08:04.802
32021-07-27 10:00:00+00:0010040.8843320.7500007502021-07-28 11:08:04.802
42021-07-23 05:00:00+00:0010050.9500000.9500009462021-07-28 11:08:04.802
\n", 247 | "
" 248 | ], 249 | "text/plain": [ 250 | " datetime driver_id conv_rate acc_rate avg_daily_trips \\\n", 251 | "0 2021-07-19 23:00:00+00:00 1001 0.886341 0.926879 807 \n", 252 | "1 2021-07-18 06:00:00+00:00 1002 0.771032 0.929490 950 \n", 253 | "2 2021-07-28 09:00:00+00:00 1003 0.750000 0.892864 803 \n", 254 | "3 2021-07-27 10:00:00+00:00 1004 0.884332 0.750000 750 \n", 255 | "4 2021-07-23 05:00:00+00:00 1005 0.950000 0.950000 946 \n", 256 | "\n", 257 | " created \n", 258 | "0 2021-07-28 11:08:04.802 \n", 259 | "1 2021-07-28 11:08:04.802 \n", 260 | "2 2021-07-28 11:08:04.802 \n", 261 | "3 2021-07-28 11:08:04.802 \n", 262 | "4 2021-07-28 11:08:04.802 " 263 | ] 264 | }, 265 | "execution_count": 5, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "drift_df = pd.read_parquet(DRIFT_DATA_PATH, engine='fastparquet')\n", 272 | "drift_df" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 6, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "data": { 282 | "text/html": [ 283 | "
\n", 284 | "\n", 297 | "\n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | "
request_iddriver_idstrip_completed
0uuid-0[1001]0
1uuid-1[1002]1
2uuid-2[1003]0
3uuid-3[1004]0
4uuid-4[1005]1
\n", 339 | "
" 340 | ], 341 | "text/plain": [ 342 | " request_id driver_ids trip_completed\n", 343 | "0 uuid-0 [1001] 0\n", 344 | "1 uuid-1 [1002] 1\n", 345 | "2 uuid-2 [1003] 0\n", 346 | "3 uuid-3 [1004] 0\n", 347 | "4 uuid-4 [1005] 1" 348 | ] 349 | }, 350 | "execution_count": 6, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "request_df = pd.read_csv(REQUEST_DATA_PATH)\n", 357 | "request_df" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 7, 363 | "metadata": {}, 364 | "outputs": [ 365 | { 366 | "data": { 367 | "text/html": [ 368 | "
\n", 369 | "\n", 382 | "\n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | "
driver_idconv_rateacc_rateavg_daily_trips
count5.0000005.0000005.0000005.00000
mean1003.0000000.1483410.189847151.00000
std1.5811390.0847370.08082091.47404
min1001.0000000.0500000.05000049.00000
25%1002.0000000.0710320.192864103.00000
50%1003.0000000.1843320.226879107.00000
75%1004.0000000.1863410.229490246.00000
max1005.0000000.2500000.250000250.00000
\n", 451 | "
" 452 | ], 453 | "text/plain": [ 454 | " driver_id conv_rate acc_rate avg_daily_trips\n", 455 | "count 5.000000 5.000000 5.000000 5.00000\n", 456 | "mean 1003.000000 0.148341 0.189847 151.00000\n", 457 | "std 1.581139 0.084737 0.080820 91.47404\n", 458 | "min 1001.000000 0.050000 0.050000 49.00000\n", 459 | "25% 1002.000000 0.071032 0.192864 103.00000\n", 460 | "50% 1003.000000 0.184332 0.226879 107.00000\n", 461 | "75% 1004.000000 0.186341 0.229490 246.00000\n", 462 | "max 1005.000000 0.250000 0.250000 250.00000" 463 | ] 464 | }, 465 | "execution_count": 7, 466 | "metadata": {}, 467 | "output_type": "execute_result" 468 | } 469 | ], 470 | "source": [ 471 | "normal_df.describe()" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 8, 477 | "metadata": {}, 478 | "outputs": [ 479 | { 480 | "data": { 481 | "text/html": [ 482 | "
\n", 483 | "\n", 496 | "\n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | "
driver_idconv_rateacc_rateavg_daily_trips
count5.0000005.0000005.0000005.000000
mean1003.0000000.8483410.889847851.200000
std1.5811390.0847370.08082091.195943
min1001.0000000.7500000.750000750.000000
25%1002.0000000.7710320.892864803.000000
50%1003.0000000.8843320.926879807.000000
75%1004.0000000.8863410.929490946.000000
max1005.0000000.9500000.950000950.000000
\n", 565 | "
" 566 | ], 567 | "text/plain": [ 568 | " driver_id conv_rate acc_rate avg_daily_trips\n", 569 | "count 5.000000 5.000000 5.000000 5.000000\n", 570 | "mean 1003.000000 0.848341 0.889847 851.200000\n", 571 | "std 1.581139 0.084737 0.080820 91.195943\n", 572 | "min 1001.000000 0.750000 0.750000 750.000000\n", 573 | "25% 1002.000000 0.771032 0.892864 803.000000\n", 574 | "50% 1003.000000 0.884332 0.926879 807.000000\n", 575 | "75% 1004.000000 0.886341 0.929490 946.000000\n", 576 | "max 1005.000000 0.950000 0.950000 950.000000" 577 | ] 578 | }, 579 | "execution_count": 8, 580 | "metadata": {}, 581 | "output_type": "execute_result" 582 | } 583 | ], 584 | "source": [ 585 | "drift_df.describe()" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 9, 591 | "metadata": {}, 592 | "outputs": [ 593 | { 594 | "data": { 595 | "text/html": [ 596 | "
\n", 597 | "\n", 610 | "\n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | "
trip_completed
count5.000000
mean0.400000
std0.547723
min0.000000
25%0.000000
50%0.000000
75%1.000000
max1.000000
\n", 652 | "
" 653 | ], 654 | "text/plain": [ 655 | " trip_completed\n", 656 | "count 5.000000\n", 657 | "mean 0.400000\n", 658 | "std 0.547723\n", 659 | "min 0.000000\n", 660 | "25% 0.000000\n", 661 | "50% 0.000000\n", 662 | "75% 1.000000\n", 663 | "max 1.000000" 664 | ] 665 | }, 666 | "execution_count": 9, 667 | "metadata": {}, 668 | "output_type": "execute_result" 669 | } 670 | ], 671 | "source": [ 672 | "request_df.describe()" 673 | ] 674 | }, 675 | { 676 | "cell_type": "markdown", 677 | "metadata": {}, 678 | "source": [ 679 | "## Check data quality" 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": 10, 685 | "metadata": {}, 686 | "outputs": [], 687 | "source": [ 688 | "import dataclasses\n", 689 | "from typing import Dict, List, Optional\n", 690 | "\n", 691 | "from evidently.pipeline.column_mapping import ColumnMapping\n", 692 | "from evidently.model_monitoring import ModelMonitoring\n", 693 | "from evidently.model_monitoring import ClassificationPerformanceMonitor\n", 694 | "from evidently.model_monitoring import DataDriftMonitor\n", 695 | "\n", 696 | "@dataclasses.dataclass\n", 697 | "class LoadedDataset:\n", 698 | " name: str\n", 699 | " references: pd.DataFrame\n", 700 | " monitors: List[str]\n", 701 | " column_mapping: ColumnMapping" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": 11, 707 | "metadata": {}, 708 | "outputs": [ 709 | { 710 | "data": { 711 | "text/plain": [ 712 | "ColumnMapping(target='trip_completed', prediction='prediction', datetime='datetime', id=None, numerical_features=['conv_rate', 'acc_rate', 'avg_daily_trips'], categorical_features=[], datetime_features=None, target_names=None, task=None, pos_label=1)" 713 | ] 714 | }, 715 | "execution_count": 11, 716 | "metadata": {}, 717 | "output_type": "execute_result" 718 | } 719 | ], 720 | "source": [ 721 | "column_mapping = ColumnMapping(\n", 722 | " target=\"trip_completed\",\n", 723 | " prediction=\"prediction\",\n", 724 | " numerical_features=[\"conv_rate\", \"acc_rate\", \"avg_daily_trips\"],\n", 725 | " categorical_features=[],\n", 726 | ")\n", 727 | "column_mapping\n" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 12, 733 | "metadata": {}, 734 | "outputs": [], 735 | "source": [ 736 | "features_and_target_monitor = ModelMonitoring(monitors=[DataDriftMonitor()])\n", 737 | "model_performance_monitor = ModelMonitoring(monitors=[ClassificationPerformanceMonitor()])" 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": {}, 743 | "source": [ 744 | "## Run data drift monitoring" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": 13, 750 | "metadata": {}, 751 | "outputs": [], 752 | "source": [ 753 | "def print_metrics(monitoring):\n", 754 | " for metric, value, labels in monitoring.metrics():\n", 755 | " report = f\"{metric.name} | {value} | {labels}\"\n", 756 | " print(report)" 757 | ] 758 | }, 759 | { 760 | "cell_type": "code", 761 | "execution_count": 14, 762 | "metadata": {}, 763 | "outputs": [ 764 | { 765 | "name": "stdout", 766 | "output_type": "stream", 767 | "text": [ 768 | "data_drift:share_drifted_features | 1.0 | None\n", 769 | "data_drift:n_drifted_features | 3 | None\n", 770 | "data_drift:dataset_drift | True | None\n", 771 | "data_drift:p_value | 0.007936507936507936 | {'feature': 'acc_rate', 'feature_type': 'num'}\n", 772 | "data_drift:p_value | 0.007936507936507936 | {'feature': 'avg_daily_trips', 'feature_type': 'num'}\n", 773 | "data_drift:p_value | 0.007936507936507936 | {'feature': 
'conv_rate', 'feature_type': 'num'}\n" 774 | ] 775 | } 776 | ], 777 | "source": [ 778 | "features_and_target_monitor.execute(\n", 779 | " reference_data=normal_df,\n", 780 | " current_data=drift_df,\n", 781 | " column_mapping=column_mapping,\n", 782 | ")\n", 783 | "\n", 784 | "print_metrics(features_and_target_monitor)" 785 | ] 786 | }, 787 | { 788 | "cell_type": "markdown", 789 | "metadata": {}, 790 | "source": [ 791 | "## Run model performance monitoring" 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": 15, 797 | "metadata": {}, 798 | "outputs": [ 799 | { 800 | "data": { 801 | "text/html": [ 802 | "
\n", 803 | "\n", 816 | "\n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | "
datetimedriver_idconv_rateacc_rateavg_daily_tripscreatedpredictiontrip_completed
02021-07-19 23:00:00+00:0010010.8863410.9268798072021-07-28 11:08:04.80210
12021-07-18 06:00:00+00:0010020.7710320.9294909502021-07-28 11:08:04.80211
22021-07-28 09:00:00+00:0010030.7500000.8928648032021-07-28 11:08:04.80210
32021-07-27 10:00:00+00:0010040.8843320.7500007502021-07-28 11:08:04.80210
42021-07-23 05:00:00+00:0010050.9500000.9500009462021-07-28 11:08:04.80211
\n", 888 | "
" 889 | ], 890 | "text/plain": [ 891 | " datetime driver_id conv_rate acc_rate avg_daily_trips \\\n", 892 | "0 2021-07-19 23:00:00+00:00 1001 0.886341 0.926879 807 \n", 893 | "1 2021-07-18 06:00:00+00:00 1002 0.771032 0.929490 950 \n", 894 | "2 2021-07-28 09:00:00+00:00 1003 0.750000 0.892864 803 \n", 895 | "3 2021-07-27 10:00:00+00:00 1004 0.884332 0.750000 750 \n", 896 | "4 2021-07-23 05:00:00+00:00 1005 0.950000 0.950000 946 \n", 897 | "\n", 898 | " created prediction trip_completed \n", 899 | "0 2021-07-28 11:08:04.802 1 0 \n", 900 | "1 2021-07-28 11:08:04.802 1 1 \n", 901 | "2 2021-07-28 11:08:04.802 1 0 \n", 902 | "3 2021-07-28 11:08:04.802 1 0 \n", 903 | "4 2021-07-28 11:08:04.802 1 1 " 904 | ] 905 | }, 906 | "execution_count": 15, 907 | "metadata": {}, 908 | "output_type": "execute_result" 909 | } 910 | ], 911 | "source": [ 912 | "predictions = [1] * drift_df.shape[0]\n", 913 | "drift_df = drift_df.assign(prediction=predictions)\n", 914 | "drift_df = drift_df.assign(trip_completed=request_df[\"trip_completed\"])\n", 915 | "drift_df" 916 | ] 917 | }, 918 | { 919 | "cell_type": "code", 920 | "execution_count": 16, 921 | "metadata": {}, 922 | "outputs": [ 923 | { 924 | "name": "stdout", 925 | "output_type": "stream", 926 | "text": [ 927 | "classification_performance:quality | 0.4 | {'dataset': 'reference', 'metric': 'accuracy'}\n", 928 | "classification_performance:quality | 0.2 | {'dataset': 'reference', 'metric': 'precision'}\n", 929 | "classification_performance:quality | 0.5 | {'dataset': 'reference', 'metric': 'recall'}\n", 930 | "classification_performance:quality | 0.28571428571428575 | {'dataset': 'reference', 'metric': 'f1'}\n", 931 | "classification_performance:class_quality | 0.0 | {'dataset': 'reference', 'class_name': '0', 'metric': 'precision'}\n", 932 | "classification_performance:class_quality | 0.0 | {'dataset': 'reference', 'class_name': '0', 'metric': 'recall'}\n", 933 | "classification_performance:class_quality | 0.0 | {'dataset': 'reference', 'class_name': '0', 'metric': 'f1'}\n", 934 | "classification_performance:class_quality | 0.4 | {'dataset': 'reference', 'class_name': '1', 'metric': 'precision'}\n", 935 | "classification_performance:class_quality | 1.0 | {'dataset': 'reference', 'class_name': '1', 'metric': 'recall'}\n", 936 | "classification_performance:class_quality | 0.5714285714285715 | {'dataset': 'reference', 'class_name': '1', 'metric': 'f1'}\n", 937 | "classification_performance:class_representation | 3 | {'dataset': 'reference', 'class_name': '0', 'type': 'target'}\n", 938 | "classification_performance:class_representation | 0 | {'dataset': 'reference', 'class_name': '0', 'type': 'prediction'}\n", 939 | "classification_performance:class_confusion | 0 | {'dataset': 'reference', 'class_name': '0', 'metric': 'TP'}\n", 940 | "classification_performance:class_confusion | 0 | {'dataset': 'reference', 'class_name': '0', 'metric': 'FP'}\n", 941 | "classification_performance:class_confusion | 2 | {'dataset': 'reference', 'class_name': '0', 'metric': 'TN'}\n", 942 | "classification_performance:class_confusion | 3 | {'dataset': 'reference', 'class_name': '0', 'metric': 'FN'}\n", 943 | "classification_performance:confusion | 0 | {'dataset': 'reference', 'class_x_name': '0', 'class_y_name': '0'}\n", 944 | "classification_performance:confusion | 3 | {'dataset': 'reference', 'class_x_name': '0', 'class_y_name': '1'}\n", 945 | "classification_performance:class_representation | 2 | {'dataset': 'reference', 'class_name': '1', 'type': 'target'}\n", 946 | 
"classification_performance:class_representation | 5 | {'dataset': 'reference', 'class_name': '1', 'type': 'prediction'}\n", 947 | "classification_performance:class_confusion | 2 | {'dataset': 'reference', 'class_name': '1', 'metric': 'TP'}\n", 948 | "classification_performance:class_confusion | 3 | {'dataset': 'reference', 'class_name': '1', 'metric': 'FP'}\n", 949 | "classification_performance:class_confusion | 0 | {'dataset': 'reference', 'class_name': '1', 'metric': 'TN'}\n", 950 | "classification_performance:class_confusion | 0 | {'dataset': 'reference', 'class_name': '1', 'metric': 'FN'}\n", 951 | "classification_performance:confusion | 0 | {'dataset': 'reference', 'class_x_name': '1', 'class_y_name': '0'}\n", 952 | "classification_performance:confusion | 2 | {'dataset': 'reference', 'class_x_name': '1', 'class_y_name': '1'}\n", 953 | "classification_performance:quality | 0.4 | {'dataset': 'current', 'metric': 'accuracy'}\n", 954 | "classification_performance:quality | 0.2 | {'dataset': 'current', 'metric': 'precision'}\n", 955 | "classification_performance:quality | 0.5 | {'dataset': 'current', 'metric': 'recall'}\n", 956 | "classification_performance:quality | 0.28571428571428575 | {'dataset': 'current', 'metric': 'f1'}\n", 957 | "classification_performance:class_quality | 0.0 | {'dataset': 'current', 'class_name': '0', 'metric': 'precision'}\n", 958 | "classification_performance:class_quality | 0.0 | {'dataset': 'current', 'class_name': '0', 'metric': 'recall'}\n", 959 | "classification_performance:class_quality | 0.0 | {'dataset': 'current', 'class_name': '0', 'metric': 'f1'}\n", 960 | "classification_performance:class_quality | 0.4 | {'dataset': 'current', 'class_name': '1', 'metric': 'precision'}\n", 961 | "classification_performance:class_quality | 1.0 | {'dataset': 'current', 'class_name': '1', 'metric': 'recall'}\n", 962 | "classification_performance:class_quality | 0.5714285714285715 | {'dataset': 'current', 'class_name': '1', 'metric': 'f1'}\n", 963 | "classification_performance:class_representation | 3 | {'dataset': 'current', 'class_name': '0', 'type': 'target'}\n", 964 | "classification_performance:class_representation | 0 | {'dataset': 'current', 'class_name': '0', 'type': 'prediction'}\n", 965 | "classification_performance:class_confusion | 0 | {'dataset': 'current', 'class_name': '0', 'metric': 'TP'}\n", 966 | "classification_performance:class_confusion | 0 | {'dataset': 'current', 'class_name': '0', 'metric': 'FP'}\n", 967 | "classification_performance:class_confusion | 2 | {'dataset': 'current', 'class_name': '0', 'metric': 'TN'}\n", 968 | "classification_performance:class_confusion | 3 | {'dataset': 'current', 'class_name': '0', 'metric': 'FN'}\n", 969 | "classification_performance:confusion | 0 | {'dataset': 'current', 'class_x_name': '0', 'class_y_name': '0'}\n", 970 | "classification_performance:confusion | 3 | {'dataset': 'current', 'class_x_name': '0', 'class_y_name': '1'}\n", 971 | "classification_performance:class_representation | 2 | {'dataset': 'current', 'class_name': '1', 'type': 'target'}\n", 972 | "classification_performance:class_representation | 5 | {'dataset': 'current', 'class_name': '1', 'type': 'prediction'}\n", 973 | "classification_performance:class_confusion | 2 | {'dataset': 'current', 'class_name': '1', 'metric': 'TP'}\n", 974 | "classification_performance:class_confusion | 3 | {'dataset': 'current', 'class_name': '1', 'metric': 'FP'}\n", 975 | "classification_performance:class_confusion | 0 | {'dataset': 'current', 'class_name': '1', 
'metric': 'TN'}\n", 976 | "classification_performance:class_confusion | 0 | {'dataset': 'current', 'class_name': '1', 'metric': 'FN'}\n", 977 | "classification_performance:confusion | 0 | {'dataset': 'current', 'class_x_name': '1', 'class_y_name': '0'}\n", 978 | "classification_performance:confusion | 2 | {'dataset': 'current', 'class_x_name': '1', 'class_y_name': '1'}\n" 979 | ] 980 | }, 981 | { 982 | "name": "stderr", 983 | "output_type": "stream", 984 | "text": [ 985 | "/Users/tung.dao/miniconda3/envs/mlops-monitoring/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1334: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", 986 | " _warn_prf(average, modifier, msg_start, len(result))\n", 987 | "/Users/tung.dao/miniconda3/envs/mlops-monitoring/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1334: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", 988 | " _warn_prf(average, modifier, msg_start, len(result))\n", 989 | "/Users/tung.dao/miniconda3/envs/mlops-monitoring/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1334: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", 990 | " _warn_prf(average, modifier, msg_start, len(result))\n", 991 | "/Users/tung.dao/miniconda3/envs/mlops-monitoring/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1334: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", 992 | " _warn_prf(average, modifier, msg_start, len(result))\n", 993 | "/Users/tung.dao/miniconda3/envs/mlops-monitoring/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1334: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", 994 | " _warn_prf(average, modifier, msg_start, len(result))\n", 995 | "/Users/tung.dao/miniconda3/envs/mlops-monitoring/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1334: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", 996 | " _warn_prf(average, modifier, msg_start, len(result))\n" 997 | ] 998 | } 999 | ], 1000 | "source": [ 1001 | "model_performance_monitor.execute(\n", 1002 | " reference_data=drift_df,\n", 1003 | " current_data=drift_df,\n", 1004 | " column_mapping=column_mapping,\n", 1005 | ")\n", 1006 | "\n", 1007 | "print_metrics(model_performance_monitor)" 1008 | ] 1009 | }, 1010 | { 1011 | "cell_type": "code", 1012 | "execution_count": null, 1013 | "metadata": {}, 1014 | "outputs": [], 1015 | "source": [] 1016 | } 1017 | ], 1018 | "metadata": { 1019 | "kernelspec": { 1020 | "display_name": "Python 3.9.13 ('mlops-monitoring')", 1021 | "language": "python", 1022 | "name": "python3" 1023 | }, 1024 | "language_info": { 1025 | "codemirror_mode": { 1026 | "name": "ipython", 1027 | "version": 3 1028 | }, 1029 | "file_extension": ".py", 1030 | "mimetype": "text/x-python", 1031 | "name": "python", 1032 | "nbconvert_exporter": "python", 1033 | "pygments_lexer": "ipython3", 1034 | "version": "3.9.13" 1035 | }, 1036 | "orig_nbformat": 4, 1037 | "vscode": { 1038 | "interpreter": { 1039 | "hash": "0924ada4a589b75ad84c87d4985e6df9b04f255eff6dfbed9b8422ba8cd93a76" 1040 | } 1041 | } 1042 | }, 1043 | "nbformat": 4, 1044 | "nbformat_minor": 2 1045 | } 1046 | -------------------------------------------------------------------------------- /monitoring_service/src/mock_request.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import ast 3 | import json 4 | import subprocess 5 | import time 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import requests 10 | 11 | from utils import * 12 | 13 | Log(AppConst.MOCK_REQUEST) 14 | AppPath() 15 | 16 | ONLINE_SERVING_API = "http://localhost:8172/inference" 17 | MIN_DELAY_SEC = 1 18 | MAX_DELAY_SEC = 2 19 | 20 | 21 | def construct_request(row: pd.Series) -> dict: 22 | request_id = row["request_id"] 23 | driver_ids = ast.literal_eval(row["driver_ids"]) 24 | return { 25 | "request_id": request_id, 26 | "driver_ids": driver_ids, 27 | } 28 | 29 | 30 | def send_request(request: dict) -> None: 31 | Log().log.info(f"start send_request") 32 | 33 | try: 34 | data = json.dumps(request) 35 | Log().log.info(f"sending {data}") 36 | response = requests.post( 37 | ONLINE_SERVING_API, 38 | data=data, 39 | headers={"content-type": "application/json"}, 40 | ) 41 | 42 | if response.status_code == 200: 43 | Log().log.info(f"Success.") 44 | else: 45 | Log().log.info( 46 | f"Status code: {response.status_code}. 
Reason: {response.reason}, error text: {response.text}" 47 | ) 48 | 49 | except Exception as error: 50 | Log().log.info(f"Error: {error}") 51 | 52 | 53 | def main(data_type: str, n_request: int = 1): 54 | Log().log.info(f"load data") 55 | data_path = AppPath.NORMAL_DATA 56 | if data_type == DataType.DRIFT: 57 | data_path = AppPath.DRIFT_DATA 58 | data_source = pd.read_parquet(data_path, engine="fastparquet") 59 | request_data = pd.read_csv(AppPath.REQUEST_DATA) 60 | 61 | Log().log.info(f"write data_source to {AppPath.FEAST_DATA_SOURCE}") 62 | if AppPath.FEAST_DATA_SOURCE.exists(): 63 | os.remove(AppPath.FEAST_DATA_SOURCE) 64 | data_source.to_parquet(AppPath.FEAST_DATA_SOURCE, engine="fastparquet") 65 | 66 | Log().log.info(f"run feast_teardown") 67 | result = subprocess.run(["make", "feast_teardown"]) 68 | if result.returncode != 0: 69 | raise Exception("Failed to run feast_teardown") 70 | 71 | Log().log.info(f"run feast_apply") 72 | result = subprocess.run(["make", "feast_apply"]) 73 | if result.returncode != 0: 74 | raise Exception("Failed to run feast_apply") 75 | 76 | Log().log.info(f"run feast_materialize") 77 | result = subprocess.run(["make", "feast_materialize"]) 78 | if result.returncode != 0: 79 | raise Exception("Failed to run feast_materialize") 80 | 81 | Log().log.info(f"Send request to online serving API") 82 | 83 | Log().log.info(f"Start sending {n_request} requests") 84 | total_request = request_data.shape[0] 85 | for idx in range(n_request): 86 | Log().log.info(f"Sending request {idx+1}/{n_request}") 87 | row = request_data.iloc[idx % total_request] 88 | request = construct_request(row) 89 | send_request(request) 90 | delay_sec = np.random.uniform(low=MIN_DELAY_SEC, high=MAX_DELAY_SEC) 91 | Log().log.info(f"Wait {delay_sec} seconds") 92 | time.sleep(delay_sec) 93 | 94 | 95 | if __name__ == "__main__": 96 | parser = argparse.ArgumentParser() 97 | parser.add_argument( 98 | "-d", 99 | "--data_type", 100 | type=str, 101 | default=DataType.NORMAL, 102 | help=f"values=[{DataType.NORMAL}, {DataType.DRIFT}]", 103 | ) 104 | parser.add_argument( 105 | "-n", 106 | "--n_request", 107 | type=int, 108 | default=10, 109 | ) 110 | args = parser.parse_args() 111 | Log().log.info(f"args {args}") 112 | main(args.data_type, args.n_request) 113 | -------------------------------------------------------------------------------- /monitoring_service/src/monitoring_service.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from datetime import datetime, timedelta 3 | 4 | import flask 5 | import pandas as pd 6 | import prometheus_client 7 | from evidently.model_monitoring import ( 8 | ClassificationPerformanceMonitor, 9 | DataDriftMonitor, 10 | ModelMonitoring, 11 | ) 12 | from evidently.pipeline.column_mapping import ColumnMapping 13 | from flask import Flask 14 | from werkzeug.middleware.dispatcher import DispatcherMiddleware 15 | 16 | from utils import * 17 | 18 | Log(AppConst.MONITORING_SERVICE) 19 | AppPath() 20 | app = Flask(AppConst.MONITORING_SERVICE) 21 | 22 | # Add prometheus wsgi middleware to route /metrics requests 23 | app.wsgi_app = DispatcherMiddleware( 24 | app.wsgi_app, {"/metrics": prometheus_client.make_wsgi_app()} 25 | ) 26 | pd.set_option("display.max_columns", None) 27 | 28 | 29 | def read_reference_data() -> pd.DataFrame: 30 | Log().log.info("read_reference_data") 31 | reference_data = pd.read_parquet(AppPath.REFERENCE_PQ, engine="fastparquet") 32 | return reference_data 33 | 34 | 35 | def read_label_data() -> 
pd.DataFrame: 36 | Log().log.info("read_label_data") 37 | label_file = AppPath.REQUEST_DATA 38 | if not label_file.exists(): 39 | return None 40 | 41 | label_data = pd.read_csv(label_file) 42 | return label_data 43 | 44 | 45 | def merge_request_with_label( 46 | new_rows: pd.DataFrame, label_data: pd.DataFrame 47 | ) -> pd.DataFrame: 48 | Log().log.info("_merge_request_with_label") 49 | merged_data = pd.merge( 50 | left=new_rows, right=label_data, how="inner", on="request_id" 51 | ) 52 | merged_data["prediction"] = 1 53 | return merged_data 54 | 55 | 56 | class MonitoringService: 57 | DATASET_NAME = "drivers" 58 | WINDOW_SIZE = 5 59 | RUN_PERIOD_SEC = 5 60 | DATETIME_COL = "datetime" 61 | NUMERICAL_COLS = ["conv_rate", "acc_rate", "avg_daily_trips"] 62 | CATEGORICAL_COLS = [] 63 | TARGET_COL = "trip_completed" 64 | PREDICTION_COL = "prediction" 65 | 66 | def __init__(self) -> None: 67 | self.next_run = None 68 | self.current_data = None 69 | self.metrics = {} 70 | 71 | # read reference data 72 | self.reference_data = read_reference_data() 73 | Log().log.info(f"reference_data {self.reference_data}") 74 | 75 | # init column mapping 76 | self.column_mapping = ColumnMapping( 77 | target=self.TARGET_COL, 78 | prediction=self.PREDICTION_COL, 79 | numerical_features=self.NUMERICAL_COLS, 80 | categorical_features=self.CATEGORICAL_COLS, 81 | datetime=self.DATETIME_COL, 82 | ) 83 | Log().log.info(f"column_mapping {self.column_mapping}") 84 | 85 | # init monitoring 86 | self.features_and_target_monitor = ModelMonitoring( 87 | monitors=[DataDriftMonitor()] 88 | ) 89 | self.model_performance_monitor = ModelMonitoring( 90 | monitors=[ClassificationPerformanceMonitor()] 91 | ) 92 | 93 | def _process_curr_data(self, new_rows: pd.DataFrame): 94 | Log().log.info("_process_curr_data") 95 | label_data = read_label_data() 96 | if label_data is None: 97 | return False 98 | Log().log.info(label_data.info()) 99 | 100 | merged_data = merge_request_with_label(new_rows, label_data) 101 | Log().log.info(merged_data.info()) 102 | 103 | if not self.current_data is None: 104 | curr_data: pd.DataFrame = pd.concat( 105 | [self.current_data, merged_data], ignore_index=True 106 | ) 107 | else: 108 | curr_data = merged_data 109 | Log().log.info(curr_data.info()) 110 | 111 | curr_size = curr_data.shape[0] 112 | if curr_size > self.WINDOW_SIZE: 113 | curr_data.drop( 114 | index=list(range(0, curr_size - self.WINDOW_SIZE)), inplace=True 115 | ) 116 | curr_data.reset_index(drop=True, inplace=True) 117 | Log().log.info(curr_data.info()) 118 | 119 | self.current_data = curr_data 120 | Log().log.info(f"current_data {self.current_data}") 121 | 122 | if curr_size < self.WINDOW_SIZE: 123 | Log().log.info( 124 | f"Not enough data for measurement: {curr_size}/{self.WINDOW_SIZE} rows. 
Waiting for more data" 125 | ) 126 | return False 127 | return True 128 | 129 | def _process_next_run(self): 130 | Log().log.info("_process_next_run") 131 | if not self.next_run is None and self.next_run > datetime.now(): 132 | Log().log.info(f"Next run at {self.next_run}") 133 | return False 134 | 135 | self.next_run = datetime.now() + timedelta(seconds=self.RUN_PERIOD_SEC) 136 | return True 137 | 138 | def _execute_monitoring(self): 139 | Log().log.info("_execute_monitoring") 140 | self.features_and_target_monitor.execute( 141 | self.reference_data, 142 | self.current_data, 143 | self.column_mapping, 144 | ) 145 | self.model_performance_monitor.execute( 146 | self.current_data, 147 | self.current_data, 148 | self.column_mapping, 149 | ) 150 | 151 | def _process_metrics(self, evidently_metrics): 152 | for metric, value, labels in evidently_metrics: 153 | metric_key = f"evidently:{metric.name}" 154 | 155 | if not labels: 156 | labels = {} 157 | labels["dataset_name"] = MonitoringService.DATASET_NAME 158 | 159 | if isinstance(value, str): 160 | continue 161 | 162 | found = self.metrics.get(metric_key) 163 | if found is None: 164 | found = prometheus_client.Gauge( 165 | metric_key, "", list(sorted(labels.keys())) 166 | ) 167 | self.metrics[metric_key] = found 168 | 169 | try: 170 | found.labels(**labels).set(value) 171 | Log().log.info( 172 | f"Metric {metric_key}: Set {labels} to {value} successful" 173 | ) 174 | 175 | except ValueError as error: 176 | # ignore errors sending other metrics 177 | Log().log.error( 178 | f"Value error for metric key {metric_key}, error: {error}" 179 | ) 180 | 181 | def iterate(self, new_rows: pd.DataFrame): 182 | Log().log.info("iterate") 183 | if not self._process_curr_data(new_rows): 184 | return 185 | 186 | if not self._process_next_run(): 187 | return 188 | 189 | self._execute_monitoring() 190 | 191 | Log().log.info("_process_metrics features_and_target_monitor.metrics") 192 | self._process_metrics(self.features_and_target_monitor.metrics()) 193 | 194 | Log().log.info("_process_metrics model_performance_monitor.metrics") 195 | self._process_metrics(self.model_performance_monitor.metrics()) 196 | 197 | 198 | SERVICE = MonitoringService() 199 | 200 | 201 | @app.route("/iterate", methods=["POST"]) 202 | def iterate(): 203 | item = flask.request.json 204 | Log().log.info(f"receive item {item}") 205 | 206 | df = pd.DataFrame.from_dict(item) 207 | Log().log.info(f"df {df}") 208 | 209 | SERVICE.iterate(new_rows=df) 210 | return "ok" 211 | 212 | 213 | if __name__ == "__main__": 214 | parser = argparse.ArgumentParser() 215 | parser.add_argument( 216 | "-p", 217 | "--port", 218 | type=int, 219 | default=8309, 220 | ) 221 | args = parser.parse_args() 222 | Log().log.info(f"args {args}") 223 | app.run(host="0.0.0.0", port=args.port, debug=True) 224 | -------------------------------------------------------------------------------- /monitoring_service/src/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import sys 5 | from pathlib import Path 6 | 7 | import pandas as pd 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | 13 | class AppConst: 14 | LOG_LEVEL = logging.DEBUG 15 | MONITORING_SERVICE = "monitoring_service" 16 | MOCK_REQUEST = "mock_request" 17 | 18 | 19 | class DataType: 20 | NORMAL = "normal" 21 | DRIFT = "drift" 22 | 23 | 24 | class AppPath: 25 | # set MONITORING_SERVICE_DIR in dev environment for quickly testing the code 26 | ROOT = 
Path(os.environ.get("MONITORING_SERVICE_DIR", "/monitoring_service")) 27 | DATA = ROOT / "data" 28 | DATA_SOURCES = ROOT / "data_sources" 29 | FEATURE_REPO = ROOT / "feature_repo" 30 | ARTIFACTS = ROOT / "artifacts" 31 | 32 | REFERENCE_PQ = DATA / "mock_normal_data.parquet" 33 | FEAST_DATA_SOURCE = DATA_SOURCES / "driver_stats.parquet" 34 | NORMAL_DATA = DATA / "mock_normal_data.parquet" 35 | DRIFT_DATA = DATA / "mock_drift_data.parquet" 36 | REQUEST_DATA = DATA / "mock_request_data.csv" 37 | 38 | 39 | class Log: 40 | log: logging.Logger = None 41 | 42 | def __init__(self, name="") -> None: 43 | if Log.log == None: 44 | Log.log = self._init_logger(name) 45 | 46 | def _init_logger(self, name): 47 | logger = logging.getLogger(name) 48 | formatter = logging.Formatter( 49 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 50 | ) 51 | stream_handler = logging.StreamHandler(sys.stdout) 52 | stream_handler.setFormatter(formatter) 53 | logger.addHandler(stream_handler) 54 | logger.setLevel(AppConst.LOG_LEVEL) 55 | return logger 56 | 57 | 58 | def inspect_dir(path): 59 | Log().log.info(f"inspect_dir {path}") 60 | path = Path(path) 61 | if not path.exists(): 62 | Log().log.info(f"Path {path} doesn't exist") 63 | return 64 | elif path.is_file(): 65 | Log().log.info(f"Path {path} is file") 66 | return 67 | 68 | paths = os.listdir(path) 69 | paths = sorted(paths) 70 | for path in paths: 71 | Log().log.info(path) 72 | 73 | 74 | def inspect_curr_dir(): 75 | cwd = os.getcwd() 76 | Log().log.info(f"current dir: {cwd}") 77 | inspect_dir(cwd) 78 | 79 | 80 | def load_df(path) -> pd.DataFrame: 81 | Log().log.info(f"start load_df {path}") 82 | df = pd.read_parquet(path, engine="fastparquet") 83 | return df 84 | 85 | 86 | def to_parquet(df: pd.DataFrame, path): 87 | Log().log.info(f"start to_parquet {path}") 88 | df.to_parquet(path, engine="fastparquet") 89 | 90 | 91 | def dump_json(dict_obj: dict, path): 92 | with open(path, "w", encoding="utf-8") as f: 93 | json.dump(dict_obj, f) 94 | 95 | 96 | def load_json(path) -> dict: 97 | with open(path, "r", encoding="utf-8") as f: 98 | data = json.load(f) 99 | return data 100 | -------------------------------------------------------------------------------- /stream_emitting/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | # copy code, data and dependencies 4 | COPY stream_emitting.py stream_emitting.py 5 | COPY data/driver_stats_stream.parquet data/driver_stats_stream.parquet 6 | COPY dev_requirements.txt requirements.txt 7 | 8 | # install dependencies 9 | RUN pip3 install -r requirements.txt 10 | 11 | CMD python stream_emitting.py \ 12 | --bootstrap_servers ${BOOTSTRAP_SERVERS} \ 13 | --data ${DATA} 14 | -------------------------------------------------------------------------------- /stream_emitting/README.md: -------------------------------------------------------------------------------- 1 | # Stream emitting 2 | 3 | ```bash 4 | # start stream emitting service 5 | bash deploy.sh start 6 | 7 | # stop stream emitting service 8 | bash deploy.sh stop 9 | 10 | # teardown stream emitting service 11 | bash deploy.sh teardown 12 | ``` -------------------------------------------------------------------------------- /stream_emitting/data/driver_stats_stream.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MLOpsVN/mlops-crash-course-code/8124d3c6afe344ff0df618ac99fcb1e5b1148be0/stream_emitting/data/driver_stats_stream.parquet -------------------------------------------------------------------------------- /stream_emitting/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cmd=$1 4 | 5 | # dockerhub username 6 | DOCKER_USER="mlopsvn" 7 | PROJECT="mlops_crash_course" 8 | 9 | usage() { 10 | echo "deploy.sh " 11 | echo "Available commands:" 12 | echo " start start emiting" 13 | echo " stop stop emiting" 14 | echo " teardown teardown emiting" 15 | } 16 | 17 | if [[ -z "$cmd" ]]; then 18 | echo "Missing command" 19 | usage 20 | exit 1 21 | fi 22 | 23 | start_emit() { 24 | docker-compose -f docker-compose.yaml up 25 | } 26 | 27 | stop_emit() { 28 | docker-compose -f docker-compose.yaml down 29 | } 30 | 31 | teardown_emit() { 32 | docker-compose -f docker-compose.yaml down --volumes 33 | } 34 | 35 | case $cmd in 36 | start) 37 | start_emit 38 | ;; 39 | stop) 40 | stop_emit 41 | ;; 42 | teardown) 43 | teardown_emit 44 | ;; 45 | *) 46 | echo -n "Unknown command: $cmd" 47 | ;; 48 | esac -------------------------------------------------------------------------------- /stream_emitting/dev_requirements.txt: -------------------------------------------------------------------------------- 1 | kafka-python==2.0.2 2 | pandas==1.4.4 3 | pyarrow==8.0.0 4 | fastparquet==0.8.3 -------------------------------------------------------------------------------- /stream_emitting/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | zookeeper: 4 | image: confluentinc/cp-zookeeper:7.0.1 5 | environment: 6 | ZOOKEEPER_CLIENT_PORT: 2181 7 | ZOOKEEPER_TICK_TIME: 2000 8 | ports: 9 | - 2181:2181 10 | 11 | broker: 12 | image: confluentinc/cp-kafka:7.0.1 13 | depends_on: 14 | - zookeeper 15 | ports: 16 | - 29092:29092 17 | environment: 18 | KAFKA_BROKER_ID: 1 19 | KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 20 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:9092,PLAINTEXT_HOST://localhost:29092 21 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 22 | KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT 23 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 24 | 25 | kafka_events: 26 | build: 27 | context: . 
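      # Note: the emitter image is built from the local Dockerfile; BOOTSTRAP_SERVERS and DATA
      # below are passed into its CMD. Because this service runs with network_mode: "host",
      # localhost:29092 reaches the broker through its host-exposed PLAINTEXT_HOST listener.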
28 | dockerfile: Dockerfile 29 | environment: 30 | BOOTSTRAP_SERVERS: localhost:29092 31 | DATA: data/driver_stats_stream.parquet 32 | depends_on: 33 | - broker 34 | network_mode: "host" 35 | -------------------------------------------------------------------------------- /stream_emitting/stream_emitting.py: -------------------------------------------------------------------------------- 1 | from kafka import KafkaProducer 2 | from kafka.admin import KafkaAdminClient, NewTopic 3 | import argparse 4 | import json 5 | import pandas as pd 6 | from time import sleep 7 | 8 | import logging 9 | import sys 10 | 11 | # logging initialization 12 | logger = logging.getLogger() 13 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 14 | stream_handler = logging.StreamHandler(sys.stdout) 15 | stream_handler.setFormatter(formatter) 16 | logger.addHandler(stream_handler) 17 | logger.setLevel(logging.INFO) 18 | 19 | def main(args): 20 | for _ in range(20): 21 | try: 22 | # initialize admin and producer 23 | producer = KafkaProducer( 24 | bootstrap_servers=[args.bootstrap_servers], 25 | client_id="driver_stats_producer", 26 | ) 27 | 28 | admin = KafkaAdminClient( 29 | bootstrap_servers=[args.bootstrap_servers], 30 | client_id="driver_stats_admin", 31 | ) 32 | 33 | logger.info("Starting admin and producer successfully!") 34 | break 35 | except Exception as e: 36 | logger.info(f"Failed to start admin and producer with error {e}") 37 | sleep(20) 38 | pass 39 | 40 | try: 41 | # create a Kafka topic 42 | topic = NewTopic(name=args.topic_name, num_partitions=1, replication_factor=1) 43 | admin.create_topics(new_topics=[topic]) 44 | logger.info(f"Created topic {args.topic_name}") 45 | except Exception as e: 46 | logger.info(f"Failed to create a new topic with error {e}") 47 | sleep(20) 48 | pass 49 | 50 | # publish messages to the newly created topic 51 | df = pd.read_parquet(args.data).sort_values(by="datetime") 52 | records = df[["driver_id", "datetime", "created", "conv_rate", "acc_rate"]].to_dict("records") 53 | 54 | # simulate streaming events by increase one week for every records 55 | iteration = 1 56 | while True: 57 | for record in records: 58 | record["datetime"] = ( 59 | record["datetime"] + pd.Timedelta(weeks=iteration) 60 | ).strftime("%Y-%m-%d %H:%M:%S") 61 | record["created"] = record["created"].strftime("%Y-%m-%d %H:%M:%S") 62 | producer.send(args.topic_name, json.dumps(record).encode()) 63 | # sleep 5 seconds before continuing pushing events 64 | sleep(5) 65 | logger.info(record) 66 | iteration += 1 67 | 68 | if __name__ == "__main__": 69 | parser = argparse.ArgumentParser(description='Kafka event emiting.') 70 | parser.add_argument('-b', '--bootstrap_servers', default="localhost:29092", 71 | help='Kafka bootstrap servers') 72 | parser.add_argument('-t', '--topic_name', default="drivers", 73 | help='Kafka topic name') 74 | parser.add_argument('-d', '--data', default="data/driver_stats_stream.parquet", 75 | help='data path to create streams') 76 | args = parser.parse_args() 77 | main(args) -------------------------------------------------------------------------------- /training_pipeline/.dockerignore: -------------------------------------------------------------------------------- 1 | .dockerignore 2 | __pycache__ 3 | artifacts 4 | dev_requirements.txt 5 | nbs 6 | dags 7 | feature_repo 8 | -------------------------------------------------------------------------------- /training_pipeline/.env: 
-------------------------------------------------------------------------------- 1 | RANDOM_SEED="17" 2 | TEST_SIZE="0.2" 3 | TARGET_COL="trip_completed" 4 | 5 | EXPERIMENT_NAME="elastic-net" 6 | ALPHA="0.5" 7 | L1_RATIO="0.1" 8 | MLFLOW_TRACKING_URI="http://host.docker.internal:5000" 9 | 10 | RMSE_THRESHOLD="0.2" 11 | MAE_THRESHOLD="0.2" 12 | REGISTERED_MODEL_NAME="sklearn-elastic-net-reg" 13 | -------------------------------------------------------------------------------- /training_pipeline/.gitignore: -------------------------------------------------------------------------------- 1 | artifacts/* 2 | data_sources 3 | feature_repo 4 | nbs/model 5 | -------------------------------------------------------------------------------- /training_pipeline/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/bash 2 | 3 | ENV_FILE="./deployment/.env" 4 | 5 | feast_apply: 6 | source ${ENV_FILE} && cd feature_repo && feast apply 7 | 8 | build_image: 9 | source ${ENV_FILE} && bash deployment/deploy.sh build 10 | 11 | build_push_image: 12 | source ${ENV_FILE} && bash deployment/deploy.sh build_push 13 | 14 | deploy_dags: 15 | source ${ENV_FILE} && bash deployment/deploy.sh dags 16 | 17 | deploy_registered_model_file: 18 | source ${ENV_FILE} && bash deployment/deploy.sh registered_model_file 19 | -------------------------------------------------------------------------------- /training_pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Training pipeline 2 | 3 | ```bash 4 | # Go to data pipeline and deploy feature repo 5 | cd ../data_pipeline 6 | make deploy_feature_repo 7 | cd ../training_pipeline 8 | 9 | # To test source files at local before running in Airflow 10 | export TRAINING_PIPELINE_DIR="path/to/mlops-crash-course-code/training_pipeline" 11 | cd src 12 | python 13 | 14 | # Build 15 | make build_image && make deploy_dags 16 | 17 | # Go to airflow UI 18 | # Set variable MLOPS_CRASH_COURSE_CODE_DIR=path/to/mlops-crash-course-code 19 | # Run dags 20 | ``` 21 | -------------------------------------------------------------------------------- /training_pipeline/artifacts/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLOpsVN/mlops-crash-course-code/8124d3c6afe344ff0df618ac99fcb1e5b1148be0/training_pipeline/artifacts/.gitkeep -------------------------------------------------------------------------------- /training_pipeline/dags/training_dag.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 5 | sys.path.append(BASE_DIR) 6 | 7 | import pendulum 8 | from airflow import DAG 9 | from airflow.providers.docker.operators.docker import DockerOperator 10 | 11 | from utils import * 12 | 13 | 14 | with DAG( 15 | dag_id="training_pipeline", 16 | default_args=DefaultConfig.DEFAULT_DAG_ARGS, 17 | schedule_interval="@once", 18 | start_date=pendulum.datetime(2022, 1, 1, tz="UTC"), 19 | catchup=False, 20 | tags=["training_pipeline"], 21 | ) as dag: 22 | feature_store_init_task = DockerOperator( 23 | task_id="feature_store_init_task", 24 | command="bash -c 'cd feature_repo && feast apply'", 25 | **DefaultConfig.DEFAULT_DOCKER_OPERATOR_ARGS, 26 | ) 27 | 28 | data_extraction_task = DockerOperator( 29 | task_id="data_extraction_task", 30 | command="bash -c 'cd src && python data_extraction.py'", 31 | 
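        # Each task in this DAG runs the same training_pipeline image; DEFAULT_DOCKER_OPERATOR_ARGS
        # (defined in dags/utils.py) supplies the image name and bind-mounts feature_repo/ and
        # artifacts/ into the container.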
**DefaultConfig.DEFAULT_DOCKER_OPERATOR_ARGS, 32 | ) 33 | 34 | data_validation_task = DockerOperator( 35 | task_id="data_validation_task", 36 | command="bash -c 'cd src && python data_validation.py'", 37 | **DefaultConfig.DEFAULT_DOCKER_OPERATOR_ARGS, 38 | ) 39 | 40 | data_preparation_task = DockerOperator( 41 | task_id="data_preparation_task", 42 | command="bash -c 'cd src && python data_preparation.py'", 43 | **DefaultConfig.DEFAULT_DOCKER_OPERATOR_ARGS, 44 | ) 45 | 46 | model_training_task = DockerOperator( 47 | task_id="model_training_task", 48 | command="bash -c 'cd src && python model_training.py'", 49 | **DefaultConfig.DEFAULT_DOCKER_OPERATOR_ARGS, 50 | ) 51 | 52 | model_evaluation_task = DockerOperator( 53 | task_id="model_evaluation_task", 54 | command="bash -c 'cd src && python model_evaluation.py'", 55 | **DefaultConfig.DEFAULT_DOCKER_OPERATOR_ARGS, 56 | ) 57 | 58 | model_validation_task = DockerOperator( 59 | task_id="model_validation_task", 60 | command="bash -c 'cd src && python model_validation.py'", 61 | **DefaultConfig.DEFAULT_DOCKER_OPERATOR_ARGS, 62 | ) 63 | 64 | ( 65 | feature_store_init_task 66 | >> data_extraction_task 67 | >> data_validation_task 68 | >> data_preparation_task 69 | >> model_training_task 70 | >> model_evaluation_task 71 | >> model_validation_task 72 | ) 73 | -------------------------------------------------------------------------------- /training_pipeline/dags/utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pendulum 4 | from airflow.models import Variable 5 | from docker.types import Mount 6 | 7 | 8 | class AppConst: 9 | DOCKER_USER = Variable.get("DOCKER_USER", "mlopsvn") 10 | 11 | 12 | class AppPath: 13 | MLOPS_CRASH_COURSE_CODE_DIR = Path(Variable.get("MLOPS_CRASH_COURSE_CODE_DIR")) 14 | TRAINING_PIPELINE_DIR = MLOPS_CRASH_COURSE_CODE_DIR / "training_pipeline" 15 | FEATURE_REPO = TRAINING_PIPELINE_DIR / "feature_repo" 16 | ARTIFACTS = TRAINING_PIPELINE_DIR / "artifacts" 17 | 18 | 19 | class DefaultConfig: 20 | DEFAULT_DAG_ARGS = { 21 | "owner": "mlopsvn", 22 | "retries": 0, 23 | "retry_delay": pendulum.duration(seconds=20), 24 | } 25 | 26 | DEFAULT_DOCKER_OPERATOR_ARGS = { 27 | "image": f"{AppConst.DOCKER_USER}/mlops_crash_course/training_pipeline:latest", 28 | "api_version": "auto", 29 | "auto_remove": True, 30 | "network_mode": "bridge", 31 | "docker_url": "tcp://docker-proxy:2375", 32 | "mounts": [ 33 | # feature repo 34 | Mount( 35 | source=AppPath.FEATURE_REPO.absolute().as_posix(), 36 | target="/training_pipeline/feature_repo", 37 | type="bind", 38 | ), 39 | # artifacts 40 | Mount( 41 | source=AppPath.ARTIFACTS.absolute().as_posix(), 42 | target="/training_pipeline/artifacts", 43 | type="bind", 44 | ), 45 | ], 46 | } 47 | -------------------------------------------------------------------------------- /training_pipeline/data/driver_orders.csv: -------------------------------------------------------------------------------- 1 | event_timestamp driver_id trip_completed 2 | 2021-04-16 20:29:28+00:00 1001 1 3 | 2021-04-17 04:29:28+00:00 1002 0 4 | 2021-04-17 12:29:28+00:00 1003 0 5 | 2021-04-17 20:29:28+00:00 1001 1 6 | 2021-04-18 04:29:28+00:00 1002 0 7 | 2021-04-18 12:29:28+00:00 1003 0 8 | 2021-04-18 20:29:28+00:00 1001 1 9 | 2021-04-19 04:29:28+00:00 1002 0 10 | 2021-04-19 12:29:28+00:00 1003 0 11 | 2021-04-19 20:29:28+00:00 1004 1 12 | -------------------------------------------------------------------------------- /training_pipeline/deployment/.env: 
-------------------------------------------------------------------------------- 1 | export DOCKER_USER="mlopsvn" 2 | export DAGS_DIR="../../mlops-crash-course-platform/airflow/run_env/dags/training_pipeline" 3 | -------------------------------------------------------------------------------- /training_pipeline/deployment/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim AS build 2 | 3 | RUN apt-get update 4 | 5 | RUN python -m venv /opt/venv 6 | ENV PATH="/opt/venv/bin:$PATH" 7 | 8 | COPY deployment/requirements.txt . 9 | RUN pip install -r requirements.txt 10 | 11 | FROM python:3.9-slim 12 | 13 | COPY --from=build /opt/venv /opt/venv 14 | ENV PATH="/opt/venv/bin:$PATH" 15 | COPY . /training_pipeline 16 | WORKDIR /training_pipeline 17 | -------------------------------------------------------------------------------- /training_pipeline/deployment/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cmd=$1 4 | 5 | # constants 6 | DOCKER_USER="$DOCKER_USER" 7 | PROJECT="mlops_crash_course" 8 | IMAGE_NAME="training_pipeline" 9 | IMAGE_TAG=$(git describe --always) 10 | 11 | if [[ -z "$DOCKER_USER" ]]; then 12 | echo "Missing \$DOCKER_USER env var" 13 | exit 1 14 | fi 15 | 16 | usage() { 17 | echo "deploy.sh " 18 | echo "Available commands:" 19 | echo " build build image" 20 | echo " push push image" 21 | echo " build_push build and push image" 22 | echo " dags deploy airflow dags" 23 | echo " registered_model_file deploy registered model file to model_serving" 24 | echo "Available arguments:" 25 | echo " [dags dir] airflow dags directory, for command dags only" 26 | } 27 | 28 | if [[ -z "$cmd" ]]; then 29 | echo "Missing command" 30 | usage 31 | exit 1 32 | fi 33 | 34 | build() { 35 | docker build --tag $DOCKER_USER/$PROJECT/$IMAGE_NAME:$IMAGE_TAG -f deployment/Dockerfile . 36 | docker tag $DOCKER_USER/$PROJECT/$IMAGE_NAME:$IMAGE_TAG $DOCKER_USER/$PROJECT/$IMAGE_NAME:latest 37 | } 38 | 39 | push() { 40 | docker push $DOCKER_USER/$PROJECT/$IMAGE_NAME:$IMAGE_TAG 41 | docker push $DOCKER_USER/$PROJECT/$IMAGE_NAME:latest 42 | } 43 | 44 | deploy_dags() { 45 | if [[ -z "$DAGS_DIR" ]]; then 46 | echo "Missing DAGS_DIR env var" 47 | usage 48 | exit 1 49 | fi 50 | 51 | mkdir -p "$DAGS_DIR" 52 | cp dags/* "$DAGS_DIR" 53 | } 54 | 55 | deploy_registered_model_file() { 56 | registered_model_file="./artifacts/registered_model_version.json" 57 | if [[ ! 
-f "$registered_model_file" ]]; then 58 | echo "$registered_model_file doesn't exist" 59 | exit 1 60 | fi 61 | 62 | model_serving_artifacts_dir="../model_serving/artifacts/" 63 | cp "$registered_model_file" "$model_serving_artifacts_dir" 64 | } 65 | 66 | shift 67 | 68 | case $cmd in 69 | build) 70 | build "$@" 71 | ;; 72 | push) 73 | push "$@" 74 | ;; 75 | build_push) 76 | build "$@" 77 | push "$@" 78 | ;; 79 | dags) 80 | deploy_dags "$@" 81 | ;; 82 | registered_model_file) 83 | deploy_registered_model_file "$@" 84 | ;; 85 | *) 86 | echo -n "Unknown command: $cmd" 87 | usage 88 | exit 1 89 | ;; 90 | esac 91 | -------------------------------------------------------------------------------- /training_pipeline/deployment/requirements.txt: -------------------------------------------------------------------------------- 1 | fastparquet==0.8.3 2 | feast[redis]==0.24.0 3 | mlflow==1.29.0 4 | pandas==1.4.4 5 | pyspark==3.0.1 6 | scikit-learn==1.1.2 7 | -------------------------------------------------------------------------------- /training_pipeline/dev_requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==2.3.4 2 | apache-airflow-providers-docker==3.1.0 3 | black==22.6.0 4 | fastparquet==0.8.3 5 | feast[redis]==0.24.0 6 | jupyterlab==3.4.5 7 | mlflow==1.29.0 8 | pandas==1.4.4 9 | pyspark==3.0.1 10 | scikit-learn==1.1.2 11 | -------------------------------------------------------------------------------- /training_pipeline/nbs/data/exp_driver_orders.csv: -------------------------------------------------------------------------------- 1 | event_timestamp driver_id trip_completed 2 | 2021-04-16 20:29:28+00:00 1001 1 3 | 2021-04-17 04:29:28+00:00 1002 0 4 | 2021-04-17 12:29:28+00:00 1003 0 5 | 2021-04-17 20:29:28+00:00 1001 1 6 | 2021-04-18 04:29:28+00:00 1002 0 7 | 2021-04-18 12:29:28+00:00 1003 0 8 | 2021-04-18 20:29:28+00:00 1001 1 9 | 2021-04-19 04:29:28+00:00 1002 0 10 | 2021-04-19 12:29:28+00:00 1003 0 11 | 2021-04-19 20:29:28+00:00 1004 1 12 | -------------------------------------------------------------------------------- /training_pipeline/nbs/data/exp_driver_stats.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MLOpsVN/mlops-crash-course-code/8124d3c6afe344ff0df618ac99fcb1e5b1148be0/training_pipeline/nbs/data/exp_driver_stats.parquet -------------------------------------------------------------------------------- /training_pipeline/src/data_extraction.py: -------------------------------------------------------------------------------- 1 | import feast 2 | import pandas as pd 3 | 4 | from utils import * 5 | 6 | Log(AppConst.DATA_EXTRACTION) 7 | AppPath() 8 | 9 | 10 | def extract_data(): 11 | Log().log.info("start extract_data") 12 | inspect_curr_dir() 13 | 14 | # Connect to your feature store provider 15 | inspect_dir(AppPath.DATA_SOURCES) 16 | inspect_dir(AppPath.FEATURE_REPO) 17 | fs = feast.FeatureStore(repo_path=AppPath.FEATURE_REPO) 18 | 19 | # Load driver order data 20 | inspect_dir(AppPath.DATA) 21 | orders = pd.read_csv(AppPath.DATA / "driver_orders.csv", sep="\t") 22 | orders["event_timestamp"] = pd.to_datetime(orders["event_timestamp"]) 23 | 24 | # Retrieve training data 25 | training_df = fs.get_historical_features( 26 | entity_df=orders, 27 | features=[ 28 | "driver_stats:conv_rate", 29 | "driver_stats:acc_rate", 30 | "driver_stats:avg_daily_trips", 31 | ], 32 | ).to_df() 33 | 34 | training_df = training_df.drop(["event_timestamp", 
"driver_id"], axis=1) 35 | 36 | Log().log.info("----- Feature schema -----") 37 | Log().log.info(training_df.info()) 38 | 39 | Log().log.info("----- Example features -----") 40 | Log().log.info(training_df.head()) 41 | 42 | # Write to file 43 | to_parquet(training_df, AppPath.TRAINING_PQ) 44 | inspect_dir(AppPath.TRAINING_PQ.parent) 45 | 46 | 47 | if __name__ == "__main__": 48 | extract_data() 49 | -------------------------------------------------------------------------------- /training_pipeline/src/data_preparation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.model_selection import train_test_split 3 | 4 | from utils import * 5 | 6 | Log(AppConst.DATA_PREPARATION) 7 | AppPath() 8 | 9 | 10 | def prepare_data(df: pd.DataFrame): 11 | Log().log.info("start prepare_data") 12 | inspect_curr_dir() 13 | 14 | config = Config() 15 | Log().log.info(f"config: {config.__dict__}") 16 | train, test = train_test_split( 17 | df, test_size=config.test_size, random_state=config.random_seed 18 | ) 19 | target_col = config.target_col 20 | train_x = train.drop([target_col], axis=1) 21 | train_y = train[[target_col]] 22 | test_x = test.drop([target_col], axis=1) 23 | test_y = test[[target_col]] 24 | 25 | to_parquet(train_x, AppPath.TRAIN_X_PQ) 26 | to_parquet(train_y, AppPath.TRAIN_Y_PQ) 27 | to_parquet(test_x, AppPath.TEST_X_PQ) 28 | to_parquet(test_y, AppPath.TEST_Y_PQ) 29 | inspect_dir(AppPath.TRAIN_X_PQ.parent) 30 | 31 | 32 | if __name__ == "__main__": 33 | df = load_df(AppPath.TRAINING_PQ) 34 | prepare_data(df) 35 | -------------------------------------------------------------------------------- /training_pipeline/src/data_validation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from utils import * 4 | 5 | Log(AppConst.DATA_VALIDATION) 6 | AppPath() 7 | 8 | 9 | # Check data schema skews 10 | def check_unexpected_features(df: pd.DataFrame): 11 | Log().log.info("start check_unexpected_features") 12 | config = Config() 13 | Log().log.info(f"config: {config.__dict__}") 14 | cols = set(df.columns) 15 | errors = [] 16 | for col in cols: 17 | if not col in config.feature_dict: 18 | errors.append(f"feature {col} is not expected") 19 | 20 | if len(errors) > 0: 21 | raise Exception(errors) 22 | 23 | 24 | def check_expected_features(df: pd.DataFrame): 25 | Log().log.info("start check_expected_features") 26 | config = Config() 27 | Log().log.info(f"config: {config.__dict__}") 28 | dtypes = dict(df.dtypes) 29 | errors = [] 30 | for feature in config.feature_dict: 31 | if not feature in dtypes: 32 | errors.append(f"feature {feature} not found") 33 | else: 34 | expected_type = config.feature_dict[feature] 35 | real_type = dtypes[feature] 36 | if expected_type != real_type: 37 | errors.append( 38 | f"feature {feature} expects type {expected_type}, received {real_type}" 39 | ) 40 | 41 | if len(errors) > 0: 42 | raise Exception(errors) 43 | 44 | 45 | # Check data values skews 46 | def check_data_values_skews(df: pd.DataFrame): 47 | Log().log.info("start check_data_values_skews") 48 | 49 | 50 | # combine 51 | def validate_data(): 52 | Log().log.info("start validate_data") 53 | inspect_curr_dir() 54 | 55 | df = load_df(AppPath.TRAINING_PQ) 56 | check_unexpected_features(df) 57 | check_expected_features(df) 58 | check_data_values_skews(df) 59 | 60 | 61 | if __name__ == "__main__": 62 | validate_data() 63 | 
-------------------------------------------------------------------------------- /training_pipeline/src/model_evaluation.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | import numpy as np 3 | from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score 4 | 5 | from utils import * 6 | 7 | Log(AppConst.MODEL_EVALUATION) 8 | AppPath() 9 | 10 | 11 | def eval_metrics(actual, pred): 12 | Log().log.info("start eval_metrics") 13 | rmse = np.sqrt(mean_squared_error(actual, pred)) 14 | mae = mean_absolute_error(actual, pred) 15 | return rmse, mae 16 | 17 | 18 | def evaluate_model(): 19 | Log().log.info("start evaluate_model") 20 | inspect_curr_dir() 21 | 22 | run_info = RunInfo.load(AppPath.RUN_INFO) 23 | Log().log.info(f"loaded run_info {run_info.__dict__}") 24 | 25 | config = Config() 26 | Log().log.info(f"config: {config.__dict__}") 27 | mlflow.set_tracking_uri(config.mlflow_tracking_uri) 28 | 29 | model = mlflow.pyfunc.load_model( 30 | f"runs:/{run_info.run_id}/{AppConst.MLFLOW_MODEL_PATH_PREFIX}" 31 | ) 32 | Log().log.info(f"loaded model {model.__dict__}") 33 | 34 | test_x = load_df(AppPath.TEST_X_PQ) 35 | test_y = load_df(AppPath.TEST_Y_PQ) 36 | 37 | predicted_qualities = model.predict(test_x) 38 | (rmse, mae) = eval_metrics(test_y, predicted_qualities) 39 | 40 | # Write evaluation result to file 41 | eval_result = EvaluationResult(rmse, mae) 42 | Log().log.info(f"eval result: {eval_result}") 43 | eval_result.save() 44 | inspect_dir(eval_result.path) 45 | 46 | 47 | if __name__ == "__main__": 48 | evaluate_model() 49 | -------------------------------------------------------------------------------- /training_pipeline/src/model_training.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | import mlflow 4 | from mlflow.models.signature import infer_signature 5 | from mlflow.tracking import MlflowClient 6 | from sklearn.linear_model import ElasticNet 7 | 8 | from utils import * 9 | 10 | Log(AppConst.MODEL_TRAINING) 11 | AppPath() 12 | 13 | 14 | def yield_artifacts(run_id, path=None): 15 | """Yield all artifacts in the specified run""" 16 | client = MlflowClient() 17 | for item in client.list_artifacts(run_id, path): 18 | if item.is_dir: 19 | yield from yield_artifacts(run_id, item.path) 20 | else: 21 | yield item.path 22 | 23 | 24 | def fetch_logged_data(run_id): 25 | """Fetch params, metrics, tags, and artifacts in the specified run""" 26 | client = MlflowClient() 27 | data = client.get_run(run_id).data 28 | # Exclude system tags: https://www.mlflow.org/docs/latest/tracking.html#system-tags 29 | tags = {k: v for k, v in data.tags.items() if not k.startswith("mlflow.")} 30 | artifacts = list(yield_artifacts(run_id)) 31 | return { 32 | "params": data.params, 33 | "metrics": data.metrics, 34 | "tags": tags, 35 | "artifacts": artifacts, 36 | } 37 | 38 | 39 | def train_model(): 40 | Log().log.info("start train_model") 41 | inspect_curr_dir() 42 | 43 | # Setup tracking server 44 | config = Config() 45 | Log().log.info(f"config: {config.__dict__}") 46 | mlflow.set_tracking_uri(config.mlflow_tracking_uri) 47 | mlflow.set_experiment(config.experiment_name) 48 | Log().log.info((mlflow.get_tracking_uri(), mlflow.get_artifact_uri())) 49 | mlflow.sklearn.autolog() 50 | 51 | # Load data 52 | train_x = load_df(AppPath.TRAIN_X_PQ) 53 | train_y = load_df(AppPath.TRAIN_Y_PQ) 54 | 55 | # Training 56 | model = ElasticNet( 57 | alpha=config.alpha, 58 | l1_ratio=config.l1_ratio, 59 | 
random_state=config.random_seed, 60 | ) 61 | model.fit(train_x, train_y) 62 | 63 | # Log metadata 64 | mlflow.set_tag("mlflow.runName", str(uuid.uuid1())[:8]) 65 | mlflow.log_param("alpha", config.alpha) 66 | mlflow.log_param("l1_ratio", config.l1_ratio) 67 | signature = infer_signature(train_x, model.predict(train_x)) 68 | mlflow.sklearn.log_model( 69 | sk_model=model, 70 | artifact_path=AppConst.MLFLOW_MODEL_PATH_PREFIX, 71 | signature=signature, 72 | ) 73 | mlflow.end_run() 74 | 75 | # Inspect metadata 76 | run_id = mlflow.last_active_run().info.run_id 77 | Log().log.info("Logged data and model in run {}".format(run_id)) 78 | for key, data in fetch_logged_data(run_id).items(): 79 | Log().log.info("\n---------- logged {} ----------".format(key)) 80 | Log().log.info(data) 81 | 82 | # Write latest run_id to file 83 | run_info = RunInfo(run_id) 84 | run_info.save() 85 | inspect_dir(run_info.path) 86 | 87 | 88 | if __name__ == "__main__": 89 | train_model() 90 | -------------------------------------------------------------------------------- /training_pipeline/src/model_validation.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | from utils import * 3 | 4 | Log(AppConst.MODEL_VALIDATION) 5 | AppPath() 6 | 7 | 8 | def validate_model(): 9 | Log().log.info("start validate_model") 10 | inspect_curr_dir() 11 | 12 | eval_result = EvaluationResult.load(AppPath.EVALUATION_RESULT) 13 | Log().log.info(f"loaded eval_result {eval_result.__dict__}") 14 | 15 | errors = [] 16 | config = Config() 17 | Log().log.info(f"config: {config.__dict__}") 18 | if eval_result.rmse > config.rmse_threshold: 19 | errors.append( 20 | f"rmse result {eval_result.rmse} exceeds threshold {config.rmse_threshold}" 21 | ) 22 | if eval_result.mae > config.mae_threshold: 23 | errors.append( 24 | f"mae result {eval_result.mae} exceeds threshold {config.mae_threshold}" 25 | ) 26 | 27 | if len(errors) > 0: 28 | Log().log.info(f"Model validation fails, will not register model: {errors}") 29 | return 30 | 31 | Log().log.info(f"Model validation succeeds, registering model") 32 | run_info = RunInfo.load(AppPath.RUN_INFO) 33 | Log().log.info(f"loaded run_info {run_info.__dict__}") 34 | 35 | mlflow.set_tracking_uri(config.mlflow_tracking_uri) 36 | result = mlflow.register_model( 37 | f"runs:/{run_info.run_id}/{AppConst.MLFLOW_MODEL_PATH_PREFIX}", 38 | config.registered_model_name, 39 | ) 40 | dump_json(result.__dict__, AppPath.REGISTERED_MODEL_VERSION) 41 | inspect_dir(AppPath.REGISTERED_MODEL_VERSION) 42 | 43 | 44 | if __name__ == "__main__": 45 | validate_model() 46 | -------------------------------------------------------------------------------- /training_pipeline/src/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import sys 5 | from pathlib import Path 6 | 7 | import pandas as pd 8 | from dotenv import load_dotenv 9 | 10 | load_dotenv() 11 | 12 | 13 | class AppConst: 14 | LOG_LEVEL = logging.DEBUG 15 | DATA_EXTRACTION = "data_extraction" 16 | DATA_VALIDATION = "data_validation" 17 | DATA_PREPARATION = "data_preparation" 18 | MODEL_TRAINING = "model_training" 19 | MODEL_EVALUATION = "model_evaluation" 20 | MODEL_VALIDATION = "model_validation" 21 | MLFLOW_MODEL_PATH_PREFIX = "model" 22 | 23 | 24 | class AppPath: 25 | # set TRAINING_PIPELINE_DIR in dev environment for quickly testing the code 26 | ROOT = Path(os.environ.get("TRAINING_PIPELINE_DIR", "/training_pipeline")) 27 | DATA = 
ROOT / "data" 28 | DATA_SOURCES = ROOT / "data_sources" 29 | FEATURE_REPO = ROOT / "feature_repo" 30 | ARTIFACTS = ROOT / "artifacts" 31 | 32 | TRAINING_PQ = ARTIFACTS / "training.parquet" 33 | TRAIN_X_PQ = ARTIFACTS / "train_x.parquet" 34 | TRAIN_Y_PQ = ARTIFACTS / "train_y.parquet" 35 | TEST_X_PQ = ARTIFACTS / "test_x.parquet" 36 | TEST_Y_PQ = ARTIFACTS / "test_y.parquet" 37 | RUN_INFO = ARTIFACTS / "run_info.json" 38 | EVALUATION_RESULT = ARTIFACTS / "evaluation.json" 39 | REGISTERED_MODEL_VERSION = ARTIFACTS / "registered_model_version.json" 40 | 41 | def __init__(self) -> None: 42 | AppPath.ARTIFACTS.mkdir(parents=True, exist_ok=True) 43 | 44 | 45 | class Config: 46 | def __init__(self) -> None: 47 | import numpy as np 48 | 49 | self.random_seed = int(os.environ.get("RANDOM_SEED")) 50 | self.feature_dict = { 51 | # "event_timestamp": pd.DatetimeTZDtype(tz="UTC"), 52 | # "driver_id": np.int64, 53 | "conv_rate": np.float64, 54 | "acc_rate": np.float64, 55 | "avg_daily_trips": np.int64, 56 | "trip_completed": np.int64, 57 | } 58 | self.target_col = os.environ.get("TARGET_COL") 59 | self.test_size = float(os.environ.get("TEST_SIZE")) 60 | self.experiment_name = os.environ.get("EXPERIMENT_NAME") 61 | self.mlflow_tracking_uri = os.environ.get("MLFLOW_TRACKING_URI") 62 | self.alpha = float(os.environ.get("ALPHA")) 63 | self.l1_ratio = float(os.environ.get("L1_RATIO")) 64 | self.rmse_threshold = float(os.environ.get("RMSE_THRESHOLD")) 65 | self.mae_threshold = float(os.environ.get("MAE_THRESHOLD")) 66 | self.registered_model_name = os.environ.get("REGISTERED_MODEL_NAME") 67 | 68 | 69 | class RunInfo: 70 | def __init__(self, run_id) -> None: 71 | self.path = AppPath.RUN_INFO 72 | self.run_id = run_id 73 | 74 | def save(self): 75 | run_info = { 76 | "run_id": self.run_id, 77 | } 78 | dump_json(run_info, self.path) 79 | 80 | @staticmethod 81 | def load(path): 82 | data = load_json(path) 83 | run_info = RunInfo(data["run_id"]) 84 | return run_info 85 | 86 | 87 | class EvaluationResult: 88 | def __init__(self, rmse, mae) -> None: 89 | self.path = AppPath.EVALUATION_RESULT 90 | self.rmse = rmse 91 | self.mae = mae 92 | 93 | def __str__(self) -> str: 94 | return f"RMSE: {self.rmse}, MAE: {self.mae}" 95 | 96 | def save(self): 97 | eval_result = { 98 | "rmse": self.rmse, 99 | "mae": self.mae, 100 | } 101 | dump_json(eval_result, self.path) 102 | 103 | @staticmethod 104 | def load(path): 105 | data = load_json(path) 106 | eval_result = EvaluationResult( 107 | data["rmse"], 108 | data["mae"], 109 | ) 110 | return eval_result 111 | 112 | 113 | class Log: 114 | log: logging.Logger = None 115 | 116 | def __init__(self, name="") -> None: 117 | if Log.log == None: 118 | Log.log = self._init_logger(name) 119 | 120 | def _init_logger(self, name): 121 | logger = logging.getLogger(name) 122 | formatter = logging.Formatter( 123 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 124 | ) 125 | stream_handler = logging.StreamHandler(sys.stdout) 126 | stream_handler.setFormatter(formatter) 127 | logger.addHandler(stream_handler) 128 | logger.setLevel(AppConst.LOG_LEVEL) 129 | return logger 130 | 131 | 132 | def inspect_dir(path): 133 | Log().log.info(f"inspect_dir {path}") 134 | path = Path(path) 135 | if not path.exists(): 136 | Log().log.info(f"Path {path} doesn't exist") 137 | return 138 | elif path.is_file(): 139 | Log().log.info(f"Path {path} is file") 140 | return 141 | 142 | paths = os.listdir(path) 143 | paths = sorted(paths) 144 | for path in paths: 145 | Log().log.info(path) 146 | 147 | 148 | def 
inspect_curr_dir(): 149 | cwd = os.getcwd() 150 | Log().log.info(f"current dir: {cwd}") 151 | inspect_dir(cwd) 152 | 153 | 154 | def load_df(path) -> pd.DataFrame: 155 | Log().log.info(f"start load_df {path}") 156 | df = pd.read_parquet(path, engine="fastparquet") 157 | return df 158 | 159 | 160 | def to_parquet(df: pd.DataFrame, path): 161 | Log().log.info(f"start to_parquet {path}") 162 | df.to_parquet(path, engine="fastparquet") 163 | 164 | 165 | def dump_json(dict_obj: dict, path): 166 | with open(path, "w", encoding="utf-8") as f: 167 | json.dump(dict_obj, f) 168 | 169 | 170 | def load_json(path) -> dict: 171 | with open(path, "r", encoding="utf-8") as f: 172 | data = json.load(f) 173 | return data 174 | --------------------------------------------------------------------------------
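The training DAG steps hand results to one another through small JSON artifacts under `training_pipeline/artifacts/`, a bind mount shared by every `DockerOperator` task. Below is a minimal sketch of inspecting those artifacts from `training_pipeline/src` after the training DAG has run; it is illustrative only, not a script in this repository, and assumes `TRAINING_PIPELINE_DIR` is exported as described in `training_pipeline/README.md`.

```python
# Illustrative sketch: read the artifacts written by the training pipeline steps.
from utils import AppPath, EvaluationResult, RunInfo, load_json

AppPath()  # ensures training_pipeline/artifacts/ exists

run_info = RunInfo.load(AppPath.RUN_INFO)                       # written by model_training.py
eval_result = EvaluationResult.load(AppPath.EVALUATION_RESULT)  # written by model_evaluation.py
print(f"run_id={run_info.run_id}, {eval_result}")

# model_validation.py only writes this file when the RMSE/MAE thresholds are met
if AppPath.REGISTERED_MODEL_VERSION.exists():
    print(load_json(AppPath.REGISTERED_MODEL_VERSION))
```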