├── 06-best-practices ├── code │ ├── tests │ │ ├── __init__.py │ │ ├── data.b64 │ │ └── model_test.py │ ├── .gitignore │ ├── scripts │ │ ├── publish.sh │ │ ├── test_cloud_e2e.sh │ │ └── deploy_manual.sh │ ├── infrastructure │ │ ├── modules │ │ │ ├── s3 │ │ │ │ ├── variables.tf │ │ │ │ └── main.tf │ │ │ ├── kinesis │ │ │ │ ├── main.tf │ │ │ │ └── variables.tf │ │ │ ├── ecr │ │ │ │ ├── variables.tf │ │ │ │ └── main.tf │ │ │ └── lambda │ │ │ │ ├── variables.tf │ │ │ │ ├── main.tf │ │ │ │ └── iam.tf │ │ ├── vars │ │ │ ├── stg.tfvars │ │ │ └── prod.tfvars │ │ ├── variables.tf │ │ └── main.tf │ ├── integraton-test │ │ ├── model │ │ │ ├── requirements.txt │ │ │ ├── model.pkl │ │ │ ├── python_env.yaml │ │ │ ├── conda.yaml │ │ │ └── MLmodel │ │ ├── docker-compose.yaml │ │ ├── test_docker.py │ │ ├── event.json │ │ ├── run.sh │ │ └── test_kinesis.py │ ├── .vscode │ │ └── settings.json │ ├── Dockerfile │ ├── Pipfile │ ├── plan.md │ ├── pyproject.toml │ ├── lambda_function.py │ ├── Makefile │ ├── .pre-commit-config.yaml │ ├── model.py │ └── README.md ├── homework_solution │ ├── tests │ │ ├── __init__.py │ │ └── test_batch.py │ ├── model.bin │ ├── docker-compose.yaml │ ├── Pipfile │ ├── Dockerfile │ ├── integration_test.sh │ ├── integration_test.py │ └── batch.py ├── ci_cd_zoomcamp.png ├── homework │ ├── model.bin │ ├── Pipfile │ ├── Dockerfile │ └── batch.py ├── AWS-stream-pipeline.png ├── images │ ├── thumbnail-6-1.jpg │ ├── thumbnail-6-2.jpg │ ├── thumbnail-6-3.jpg │ ├── thumbnail-6-4.jpg │ ├── thumbnail-6-5.jpg │ └── thumbnail-6-6.jpg ├── meta.json └── docs.md ├── 05-monitoring ├── evidently_service │ ├── datasets │ │ └── do_not_delete │ ├── requirements.txt │ ├── Dockerfile │ ├── config │ │ ├── grafana_datasources.yaml │ │ ├── grafana_dashboards.yaml │ │ └── prometheus.yml │ └── config.yaml ├── images │ ├── thumbnail-5-01.jpg │ ├── thumbnail-5-02.jpg │ ├── thumbnail-5-03.jpg │ ├── thumbnail-5-04.jpg │ └── thumbnail-5-05.jpg ├── prediction_service │ ├── lin_reg.bin │ ├── requirements.txt │ ├── Dockerfile │ └── app.py ├── homework │ ├── prediction_service │ │ ├── lin_reg.bin │ │ ├── lin_reg_V2.bin │ │ ├── Pipfile │ │ ├── Dockerfile │ │ └── app.py │ ├── requirements.txt │ ├── prefect-monitoring │ │ ├── clean_mongo.py │ │ ├── Pipfile │ │ ├── prepare_reference_data.py │ │ ├── monitor_profile.ipynb │ │ ├── send_data.py │ │ ├── monitor_profile_solution.ipynb │ │ ├── prefect_monitoring.py │ │ └── prefect_monitoring_solution.py │ ├── test.py │ ├── prepare.py │ ├── docker-compose-homework.yml │ ├── docker-compose-homework-solution.yml │ └── model_training.py ├── requirements.txt ├── test.py ├── prepare.py ├── send_data.py ├── meta.json ├── docker-compose.yml ├── prefect_example.py └── README.md ├── images ├── play.png ├── banner.png ├── prefect.png ├── mlops-world.png └── IMG_20230323_134059_927.png ├── 04-deployment ├── homework │ ├── Dockerfile │ ├── model.bin │ ├── Pipfile │ ├── homework.dockerfile │ ├── batch.py │ └── starter.ipynb ├── web-service │ ├── lin_reg.bin │ ├── test.py │ ├── Pipfile │ ├── Dockerfile │ ├── README.md │ └── predict.py ├── images │ ├── thumbnail-4-01.jpg │ ├── thumbnail-4-02.jpg │ ├── thumbnail-4-03.jpg │ ├── thumbnail-4-04.jpg │ ├── thumbnail-4-05.jpg │ └── thumbnail-4-06.jpg ├── web-service-mlflow │ ├── dict_vectorizer.bin │ ├── test.py │ ├── Pipfile │ ├── README.md │ └── predict.py ├── batch │ ├── README.md │ ├── Pipfile │ ├── score_deploy.py │ ├── score_backfill.py │ └── score.py ├── streaming │ ├── Pipfile │ ├── Dockerfile │ ├── test.py │ ├── test_docker.py │ └── 
lambda_function.py ├── meta.json └── README.md ├── 03-orchestration ├── .gitignore ├── requirements.txt ├── images │ ├── thumbnail-3-01.jpg │ ├── thumbnail-3-02.jpg │ ├── thumbnail-3-03.jpg │ ├── thumbnail-3-04.jpg │ ├── thumbnail-3-05.jpg │ └── thumbnail-3-06.jpg ├── work-queue.py ├── windows.md ├── meta.json ├── homework.py ├── homework_solution.py └── README.md ├── 02-experiment-tracking ├── requirements.txt ├── images │ ├── ec2_os.png │ ├── key_pair.png │ ├── s3_bucket.png │ ├── db_password.png │ ├── db_settings.png │ ├── postgresql.png │ ├── security_group.png │ ├── thumbnail-2-01.jpg │ ├── thumbnail-2-02.jpg │ ├── thumbnail-2-03.jpg │ ├── thumbnail-2-04.jpg │ ├── thumbnail-2-05.jpg │ ├── thumbnail-2-06.jpg │ ├── thumbnail-2-07.jpg │ ├── db_configuration.png │ ├── select_key_pair.png │ ├── ec2_instance_type.png │ └── postgresql_inbound_rule.png ├── homework │ ├── train.py │ ├── hpo.py │ ├── preprocess_data.py │ └── register_model.py ├── meta.json ├── README.md ├── running-mlflow-examples │ ├── scenario-1.ipynb │ ├── scenario-2.ipynb │ └── scenario-3.ipynb └── mlflow_on_aws.md ├── 01-intro ├── images │ ├── thumbnail-1-01.jpg │ ├── thumbnail-1-02.jpg │ ├── thumbnail-1-03.jpg │ ├── thumbnail-1-04.jpg │ └── thumbnail-1-05.jpg ├── meta.json └── homework.md ├── .gitignore ├── after-sign-up.md ├── asking-questions.md ├── certificate.md └── .github └── workflows ├── ci-tests.yml └── cd-deploy.yml /06-best-practices/code/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /06-best-practices/code/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | -------------------------------------------------------------------------------- /06-best-practices/homework_solution/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /05-monitoring/evidently_service/datasets/do_not_delete: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /images/play.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/images/play.png -------------------------------------------------------------------------------- /images/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/images/banner.png -------------------------------------------------------------------------------- /images/prefect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/images/prefect.png -------------------------------------------------------------------------------- /06-best-practices/code/scripts/publish.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo "publishing image ${LOCAL_IMAGE_NAME} to ECR..." 
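# --- Editorial note: the remainder of publish.sh is not shown in this dump. ---
# What follows is a hedged sketch of the tag-and-push steps such a publish script
# typically performs, not the original file's contents. ECR_REGISTRY and
# ECR_REPO_NAME are hypothetical placeholder values introduced for illustration.

ECR_REGISTRY="123456789012.dkr.ecr.eu-west-1.amazonaws.com"  # hypothetical account/region
ECR_REPO_NAME="stream_model_duration"                        # hypothetical repository name
REMOTE_IMAGE="${ECR_REGISTRY}/${ECR_REPO_NAME}:latest"

# Authenticate Docker against ECR, retag the locally built image, and push it
aws ecr get-login-password | docker login --username AWS --password-stdin "${ECR_REGISTRY}"
docker tag "${LOCAL_IMAGE_NAME}" "${REMOTE_IMAGE}"
docker push "${REMOTE_IMAGE}"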
-------------------------------------------------------------------------------- /04-deployment/homework/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.7-slim 2 | 3 | WORKDIR /app 4 | COPY [ "model2.bin", "model.bin" ] 5 | -------------------------------------------------------------------------------- /images/mlops-world.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/images/mlops-world.png -------------------------------------------------------------------------------- /03-orchestration/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | .ipynb_checkpoints 3 | models/* 4 | mlruns/* 5 | .vscode/ 6 | ./DS_Store 7 | *.db 8 | *.DS_Store -------------------------------------------------------------------------------- /06-best-practices/code/infrastructure/modules/s3/variables.tf: -------------------------------------------------------------------------------- 1 | variable "bucket_name" { 2 | description = "Name of the bucket" 3 | } 4 | -------------------------------------------------------------------------------- /02-experiment-tracking/requirements.txt: -------------------------------------------------------------------------------- 1 | mlflow 2 | jupyter 3 | scikit-learn 4 | pandas 5 | seaborn 6 | hyperopt 7 | xgboost 8 | fastparquet 9 | boto3 -------------------------------------------------------------------------------- /04-deployment/homework/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/04-deployment/homework/model.bin -------------------------------------------------------------------------------- /01-intro/images/thumbnail-1-01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/01-intro/images/thumbnail-1-01.jpg -------------------------------------------------------------------------------- /01-intro/images/thumbnail-1-02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/01-intro/images/thumbnail-1-02.jpg -------------------------------------------------------------------------------- /01-intro/images/thumbnail-1-03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/01-intro/images/thumbnail-1-03.jpg -------------------------------------------------------------------------------- /01-intro/images/thumbnail-1-04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/01-intro/images/thumbnail-1-04.jpg -------------------------------------------------------------------------------- /01-intro/images/thumbnail-1-05.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/01-intro/images/thumbnail-1-05.jpg -------------------------------------------------------------------------------- /03-orchestration/requirements.txt: 
-------------------------------------------------------------------------------- 1 | mlflow 2 | jupyter 3 | scikit-learn 4 | pandas 5 | seaborn 6 | hyperopt 7 | xgboost 8 | fastparquet 9 | prefect==2.3.1 -------------------------------------------------------------------------------- /images/IMG_20230323_134059_927.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/images/IMG_20230323_134059_927.png -------------------------------------------------------------------------------- /04-deployment/web-service/lin_reg.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/04-deployment/web-service/lin_reg.bin -------------------------------------------------------------------------------- /06-best-practices/ci_cd_zoomcamp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/06-best-practices/ci_cd_zoomcamp.png -------------------------------------------------------------------------------- /06-best-practices/homework/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/06-best-practices/homework/model.bin -------------------------------------------------------------------------------- /02-experiment-tracking/images/ec2_os.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/ec2_os.png -------------------------------------------------------------------------------- /04-deployment/images/thumbnail-4-01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/04-deployment/images/thumbnail-4-01.jpg -------------------------------------------------------------------------------- /04-deployment/images/thumbnail-4-02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/04-deployment/images/thumbnail-4-02.jpg -------------------------------------------------------------------------------- /04-deployment/images/thumbnail-4-03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/04-deployment/images/thumbnail-4-03.jpg -------------------------------------------------------------------------------- /04-deployment/images/thumbnail-4-04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/04-deployment/images/thumbnail-4-04.jpg -------------------------------------------------------------------------------- /04-deployment/images/thumbnail-4-05.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/04-deployment/images/thumbnail-4-05.jpg 
-------------------------------------------------------------------------------- /04-deployment/images/thumbnail-4-06.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/04-deployment/images/thumbnail-4-06.jpg -------------------------------------------------------------------------------- /05-monitoring/images/thumbnail-5-01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/05-monitoring/images/thumbnail-5-01.jpg -------------------------------------------------------------------------------- /05-monitoring/images/thumbnail-5-02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/05-monitoring/images/thumbnail-5-02.jpg -------------------------------------------------------------------------------- /05-monitoring/images/thumbnail-5-03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/05-monitoring/images/thumbnail-5-03.jpg -------------------------------------------------------------------------------- /05-monitoring/images/thumbnail-5-04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/05-monitoring/images/thumbnail-5-04.jpg -------------------------------------------------------------------------------- /05-monitoring/images/thumbnail-5-05.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/05-monitoring/images/thumbnail-5-05.jpg -------------------------------------------------------------------------------- /02-experiment-tracking/images/key_pair.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/key_pair.png -------------------------------------------------------------------------------- /02-experiment-tracking/images/s3_bucket.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/s3_bucket.png -------------------------------------------------------------------------------- /03-orchestration/images/thumbnail-3-01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/03-orchestration/images/thumbnail-3-01.jpg -------------------------------------------------------------------------------- /03-orchestration/images/thumbnail-3-02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/03-orchestration/images/thumbnail-3-02.jpg -------------------------------------------------------------------------------- /03-orchestration/images/thumbnail-3-03.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/03-orchestration/images/thumbnail-3-03.jpg -------------------------------------------------------------------------------- /03-orchestration/images/thumbnail-3-04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/03-orchestration/images/thumbnail-3-04.jpg -------------------------------------------------------------------------------- /03-orchestration/images/thumbnail-3-05.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/03-orchestration/images/thumbnail-3-05.jpg -------------------------------------------------------------------------------- /03-orchestration/images/thumbnail-3-06.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/03-orchestration/images/thumbnail-3-06.jpg -------------------------------------------------------------------------------- /06-best-practices/AWS-stream-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/06-best-practices/AWS-stream-pipeline.png -------------------------------------------------------------------------------- /06-best-practices/images/thumbnail-6-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/06-best-practices/images/thumbnail-6-1.jpg -------------------------------------------------------------------------------- /06-best-practices/images/thumbnail-6-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/06-best-practices/images/thumbnail-6-2.jpg -------------------------------------------------------------------------------- /06-best-practices/images/thumbnail-6-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/06-best-practices/images/thumbnail-6-3.jpg -------------------------------------------------------------------------------- /06-best-practices/images/thumbnail-6-4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/06-best-practices/images/thumbnail-6-4.jpg -------------------------------------------------------------------------------- /06-best-practices/images/thumbnail-6-5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/06-best-practices/images/thumbnail-6-5.jpg -------------------------------------------------------------------------------- /06-best-practices/images/thumbnail-6-6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/06-best-practices/images/thumbnail-6-6.jpg 
-------------------------------------------------------------------------------- /02-experiment-tracking/images/db_password.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/db_password.png -------------------------------------------------------------------------------- /02-experiment-tracking/images/db_settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/db_settings.png -------------------------------------------------------------------------------- /02-experiment-tracking/images/postgresql.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/postgresql.png -------------------------------------------------------------------------------- /05-monitoring/prediction_service/lin_reg.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/05-monitoring/prediction_service/lin_reg.bin -------------------------------------------------------------------------------- /06-best-practices/code/integraton-test/model/requirements.txt: -------------------------------------------------------------------------------- 1 | mlflow 2 | cloudpickle==2.0.0 3 | psutil==5.8.0 4 | scikit-learn==1.0.2 5 | typing-extensions==3.10.0.2 6 | -------------------------------------------------------------------------------- /06-best-practices/homework_solution/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/06-best-practices/homework_solution/model.bin -------------------------------------------------------------------------------- /02-experiment-tracking/images/security_group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/security_group.png -------------------------------------------------------------------------------- /02-experiment-tracking/images/thumbnail-2-01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/thumbnail-2-01.jpg -------------------------------------------------------------------------------- /02-experiment-tracking/images/thumbnail-2-02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/thumbnail-2-02.jpg -------------------------------------------------------------------------------- /02-experiment-tracking/images/thumbnail-2-03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/thumbnail-2-03.jpg -------------------------------------------------------------------------------- 
/02-experiment-tracking/images/thumbnail-2-04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/thumbnail-2-04.jpg -------------------------------------------------------------------------------- /02-experiment-tracking/images/thumbnail-2-05.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/thumbnail-2-05.jpg -------------------------------------------------------------------------------- /02-experiment-tracking/images/thumbnail-2-06.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/thumbnail-2-06.jpg -------------------------------------------------------------------------------- /02-experiment-tracking/images/thumbnail-2-07.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/thumbnail-2-07.jpg -------------------------------------------------------------------------------- /02-experiment-tracking/images/db_configuration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/db_configuration.png -------------------------------------------------------------------------------- /02-experiment-tracking/images/select_key_pair.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/select_key_pair.png -------------------------------------------------------------------------------- /02-experiment-tracking/images/ec2_instance_type.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/ec2_instance_type.png -------------------------------------------------------------------------------- /04-deployment/web-service-mlflow/dict_vectorizer.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/04-deployment/web-service-mlflow/dict_vectorizer.bin -------------------------------------------------------------------------------- /05-monitoring/homework/prediction_service/lin_reg.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/05-monitoring/homework/prediction_service/lin_reg.bin -------------------------------------------------------------------------------- /06-best-practices/code/integraton-test/model/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/06-best-practices/code/integraton-test/model/model.pkl -------------------------------------------------------------------------------- 
/02-experiment-tracking/images/postgresql_inbound_rule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/02-experiment-tracking/images/postgresql_inbound_rule.png -------------------------------------------------------------------------------- /05-monitoring/homework/prediction_service/lin_reg_V2.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/My-Machine-Learning-Projects-CT/mlops-zoomcamp/main/05-monitoring/homework/prediction_service/lin_reg_V2.bin -------------------------------------------------------------------------------- /06-best-practices/homework_solution/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | s3: 3 | image: localstack/localstack 4 | ports: 5 | - "4566:4566" 6 | environment: 7 | - SERVICES=s3 -------------------------------------------------------------------------------- /05-monitoring/evidently_service/requirements.txt: -------------------------------------------------------------------------------- 1 | dataclasses==0.6 2 | Flask~=2.0.1 3 | pandas~=1.1.5 4 | Werkzeug~=2.0.1 5 | requests~=2.26.0 6 | prometheus_client~=0.11.0 7 | pyyaml~=5.4.1 8 | pyarrow -------------------------------------------------------------------------------- /06-best-practices/code/integraton-test/model/python_env.yaml: -------------------------------------------------------------------------------- 1 | python: 3.9.7 2 | build_dependencies: 3 | - pip==22.1 4 | - setuptools==58.0.4 5 | - wheel==0.37.0 6 | dependencies: 7 | - -r requirements.txt 8 | -------------------------------------------------------------------------------- /04-deployment/batch/README.md: -------------------------------------------------------------------------------- 1 | ## Batch deployment 2 | 3 | * Turn the notebook for training a model into a notebook for applying the model 4 | * Turn the notebook into a script 5 | * Clean it and parametrize 6 | -------------------------------------------------------------------------------- /05-monitoring/prediction_service/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn 2 | dataclasses==0.6 3 | Flask~=2.0.1 4 | pandas~=1.1.5 5 | Werkzeug~=2.0.1 6 | requests~=2.26.0 7 | prometheus_client~=0.11.0 8 | pyyaml~=5.4.1 9 | evidently 10 | pymongo -------------------------------------------------------------------------------- /06-best-practices/code/tests/data.b64: -------------------------------------------------------------------------------- 1 | ewogICAgICAgICJyaWRlIjogewogICAgICAgICAgICAiUFVMb2NhdGlvbklEIjogMTMwLAogICAgICAgICAgICAiRE9Mb2NhdGlvbklEIjogMjA1LAogICAgICAgICAgICAidHJpcF9kaXN0YW5jZSI6IDMuNjYKICAgICAgICB9LCAKICAgICAgICAicmlkZV9pZCI6IDI1NgogICAgfQ== 2 | -------------------------------------------------------------------------------- /05-monitoring/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn 2 | dataclasses==0.6 3 | Flask~=2.0.1 4 | pandas>=1.1.5 5 | Werkzeug~=2.0.1 6 | requests~=2.26.0 7 | prometheus_client~=0.11.0 8 | pyyaml~=5.4.1 9 | tqdm 10 | pyarrow 11 | prefect>=2.0b 12 | pymongo 13 | evidently 14 | -------------------------------------------------------------------------------- /06-best-practices/code/infrastructure/modules/s3/main.tf: 
-------------------------------------------------------------------------------- 1 | resource "aws_s3_bucket" "s3_bucket" { 2 | bucket = var.bucket_name 3 | acl = "private" 4 | force_destroy = true 5 | } 6 | 7 | output "name" { 8 | value = aws_s3_bucket.s3_bucket.bucket 9 | } 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | .ipynb_checkpoints 3 | .bin 4 | *.db 5 | 6 | *.parquet 7 | *.html 8 | *.csv 9 | 10 | .venv 11 | venv 12 | .idea 13 | **/artifacts/ 14 | **/models/ 15 | 16 | __pycache__/ 17 | **.env 18 | **.terraform/ 19 | **.terraform.lock* 20 | **terraform.tfstate* 21 | -------------------------------------------------------------------------------- /04-deployment/streaming/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | boto3 = "*" 8 | mlflow = "*" 9 | scikit-learn = "==1.0.2" 10 | 11 | [dev-packages] 12 | 13 | [requires] 14 | python_version = "3.9" 15 | -------------------------------------------------------------------------------- /04-deployment/web-service/test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | ride = { 4 | "PULocationID": 10, 5 | "DOLocationID": 50, 6 | "trip_distance": 40 7 | } 8 | 9 | url = 'http://localhost:9696/predict' 10 | response = requests.post(url, json=ride) 11 | print(response.json()) 12 | -------------------------------------------------------------------------------- /04-deployment/web-service-mlflow/test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | ride = { 4 | "PULocationID": 10, 5 | "DOLocationID": 50, 6 | "trip_distance": 40 7 | } 8 | 9 | url = 'http://localhost:9696/predict' 10 | response = requests.post(url, json=ride) 11 | print(response.json()) 12 | -------------------------------------------------------------------------------- /05-monitoring/homework/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==1.0.2 2 | dataclasses==0.6 3 | Flask~=2.0.1 4 | pandas>=1.1.5 5 | Werkzeug~=2.0.1 6 | requests~=2.26.0 7 | prometheus_client~=0.11.0 8 | pyyaml~=5.4.1 9 | tqdm 10 | pyarrow 11 | prefect==2.0b8 12 | pymongo 13 | evidently 14 | pipenv 15 | -------------------------------------------------------------------------------- /04-deployment/homework/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.0.2" 8 | pandas = "*" 9 | pyarrow = "*" 10 | s3fs = "*" 11 | 12 | [dev-packages] 13 | 14 | [requires] 15 | python_version = "3.9" 16 | -------------------------------------------------------------------------------- /06-best-practices/code/integraton-test/model/conda.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - python=3.9.7 5 | - pip<=22.1 6 | - pip: 7 | - mlflow 8 | - cloudpickle==2.0.0 9 | - psutil==5.8.0 10 | - scikit-learn==1.0.2 11 | - typing-extensions==3.10.0.2 12 | name: mlflow-env 13 | -------------------------------------------------------------------------------- 
/06-best-practices/homework/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.0.2" 8 | pandas = "*" 9 | pyarrow = "*" 10 | s3fs = "*" 11 | 12 | [dev-packages] 13 | 14 | [requires] 15 | python_version = "3.9" 16 | -------------------------------------------------------------------------------- /04-deployment/web-service/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.0.2" 8 | flask = "*" 9 | gunicorn = "*" 10 | 11 | [dev-packages] 12 | requests = "*" 13 | 14 | [requires] 15 | python_version = "3.9" 16 | -------------------------------------------------------------------------------- /06-best-practices/code/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | "tests" 4 | ], 5 | "python.testing.unittestEnabled": false, 6 | "python.testing.pytestEnabled": true, 7 | "python.linting.pylintEnabled": true, 8 | "python.linting.enabled": true 9 | } 10 | -------------------------------------------------------------------------------- /06-best-practices/homework_solution/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.0.2" 8 | pandas = "*" 9 | pyarrow = "*" 10 | s3fs = "*" 11 | 12 | [dev-packages] 13 | pytest = "*" 14 | 15 | [requires] 16 | python_version = "3.9" 17 | -------------------------------------------------------------------------------- /04-deployment/streaming/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.9 2 | 3 | RUN pip install -U pip 4 | RUN pip install pipenv 5 | 6 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 7 | 8 | RUN pipenv install --system --deploy 9 | 10 | COPY [ "lambda_function.py", "./" ] 11 | 12 | CMD [ "lambda_function.lambda_handler" ] 13 | -------------------------------------------------------------------------------- /05-monitoring/homework/prefect-monitoring/clean_mongo.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | MONGO_CLIENT_ADDRESS = "mongodb://localhost:27017/" 4 | MONGO_DATABASE = "prediction_service" 5 | 6 | 7 | if __name__ == "__main__": 8 | client = MongoClient(MONGO_CLIENT_ADDRESS) 9 | client.drop_database(MONGO_DATABASE) 10 | -------------------------------------------------------------------------------- /06-best-practices/code/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.9 2 | 3 | RUN pip install -U pip 4 | RUN pip install pipenv 5 | 6 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 7 | 8 | RUN pipenv install --system --deploy 9 | 10 | COPY [ "lambda_function.py", "model.py", "./" ] 11 | 12 | CMD [ "lambda_function.lambda_handler" ] 13 | -------------------------------------------------------------------------------- /04-deployment/web-service-mlflow/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | 
verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.0.2" 8 | flask = "*" 9 | gunicorn = "*" 10 | mlflow = "*" 11 | boto3 = "*" 12 | 13 | [dev-packages] 14 | requests = "*" 15 | 16 | [requires] 17 | python_version = "3.9" 18 | -------------------------------------------------------------------------------- /04-deployment/batch/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.0.2" 8 | prefect = "==2.0b6" 9 | mlflow = "*" 10 | pandas = "*" 11 | boto3 = "*" 12 | pyarrow = "*" 13 | s3fs = "*" 14 | 15 | [dev-packages] 16 | 17 | [requires] 18 | python_version = "3.9" 19 | -------------------------------------------------------------------------------- /04-deployment/homework/homework.dockerfile: -------------------------------------------------------------------------------- 1 | FROM agrigorev/zoomcamp-model:mlops-3.9.7-slim 2 | 3 | RUN pip install -U pip 4 | RUN pip install pipenv 5 | 6 | WORKDIR /app 7 | 8 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 9 | 10 | RUN pipenv install --system --deploy 11 | 12 | COPY [ "batch.py", "batch.py" ] 13 | 14 | ENTRYPOINT [ "python", "batch.py" ] -------------------------------------------------------------------------------- /05-monitoring/test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | url = 'http://127.0.0.1:9696/predict' 4 | 5 | ride = { 6 | 'lpep_pickup_datetime': '2021-01-01 00:15:56', 7 | 'PULocationID': 43, 8 | 'DOLocationID': 151, 9 | 'passenger_count': 1.0, 10 | 'trip_distance': 1.01 11 | } 12 | 13 | response = requests.post(url, json=ride).json() 14 | print(response) 15 | -------------------------------------------------------------------------------- /06-best-practices/homework/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.7-slim 2 | 3 | RUN pip install -U pip 4 | RUN pip install pipenv 5 | 6 | WORKDIR /app 7 | 8 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 9 | 10 | RUN pipenv install --system --deploy 11 | 12 | COPY [ "batch.py", "batch.py" ] 13 | COPY [ "model.bin", "model.bin" ] 14 | 15 | ENTRYPOINT [ "python", "batch.py" ] -------------------------------------------------------------------------------- /06-best-practices/homework_solution/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.7-slim 2 | 3 | RUN pip install -U pip 4 | RUN pip install pipenv 5 | 6 | WORKDIR /app 7 | 8 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 9 | 10 | RUN pipenv install --system --deploy 11 | 12 | COPY [ "batch.py", "batch.py" ] 13 | COPY [ "model.bin", "model.bin" ] 14 | 15 | ENTRYPOINT [ "python", "batch.py" ] -------------------------------------------------------------------------------- /05-monitoring/evidently_service/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | 3 | FROM python:3.8-slim-buster 4 | 5 | WORKDIR /app 6 | 7 | COPY requirements.txt requirements.txt 8 | 9 | RUN pip3 install -r requirements.txt 10 | 11 | RUN pip3 install evidently==0.1.51.dev0 12 | 13 | COPY app.py . 
14 | 15 | CMD [ "python3", "-m" , "flask", "run", "--host=0.0.0.0", "--port=8085"] -------------------------------------------------------------------------------- /04-deployment/web-service/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.7-slim 2 | 3 | RUN pip install -U pip 4 | RUN pip install pipenv 5 | 6 | WORKDIR /app 7 | 8 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 9 | 10 | RUN pipenv install --system --deploy 11 | 12 | COPY [ "predict.py", "lin_reg.bin", "./" ] 13 | 14 | EXPOSE 9696 15 | 16 | ENTRYPOINT [ "gunicorn", "--bind=0.0.0.0:9696", "predict:app" ] -------------------------------------------------------------------------------- /06-best-practices/code/infrastructure/vars/stg.tfvars: -------------------------------------------------------------------------------- 1 | source_stream_name = "stg_ride_events" 2 | output_stream_name = "stg_ride_predictions" 3 | model_bucket = "stg-mlflow-models-code-owners" 4 | lambda_function_local_path = "../lambda_function.py" 5 | docker_image_local_path = "../Dockerfile" 6 | ecr_repo_name = "stg_stream_model_duration" 7 | lambda_function_name = "stg_prediction_lambda" 8 | -------------------------------------------------------------------------------- /05-monitoring/homework/prediction_service/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.0.2" 8 | flask = "==2.0.1" 9 | pandas = "==1.1.5" 10 | evidently = "*" 11 | pymongo = "*" 12 | gunicorn = "*" 13 | 14 | [dev-packages] 15 | pyarrow = "*" 16 | 17 | [requires] 18 | python_version = "3.8" 19 | -------------------------------------------------------------------------------- /05-monitoring/homework/prefect-monitoring/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.0.2" 8 | pyarrow = "*" 9 | prefect = "==2.0b8" 10 | pandas = "*" 11 | pymongo = "*" 12 | psutil = "==5.9.1" 13 | evidently = "*" 14 | 15 | [dev-packages] 16 | 17 | [requires] 18 | python_version = "3.8" 19 | -------------------------------------------------------------------------------- /06-best-practices/code/infrastructure/vars/prod.tfvars: -------------------------------------------------------------------------------- 1 | source_stream_name = "prod_ride_events" 2 | output_stream_name = "prod_ride_predictions" 3 | model_bucket = "prod-mlflow-models-code-owners" 4 | lambda_function_local_path = "../lambda_function.py" 5 | docker_image_local_path = "../Dockerfile" 6 | ecr_repo_name = "prod_stream_model_duration" 7 | lambda_function_name = "prod_prediction_lambda" 8 | -------------------------------------------------------------------------------- /05-monitoring/prediction_service/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | 3 | FROM python:3.8-slim-buster 4 | 5 | WORKDIR /app 6 | 7 | COPY requirements.txt requirements.txt 8 | 9 | RUN pip3 install -r requirements.txt 10 | 11 | RUN pip3 install evidently 12 | 13 | COPY app.py . 14 | COPY lin_reg.bin . 
15 | 16 | CMD [ "python3", "-m" , "flask", "run", "--host=0.0.0.0", "--port=9696"] -------------------------------------------------------------------------------- /06-best-practices/code/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | boto3 = "*" 8 | mlflow = "*" 9 | scikit-learn = "==1.0.2" 10 | 11 | [dev-packages] 12 | pytest = "*" 13 | deepdiff = "*" 14 | pylint = "==2.14.4" 15 | black = "*" 16 | isort = "*" 17 | pre-commit = "*" 18 | 19 | [requires] 20 | python_version = "3.9" 21 | -------------------------------------------------------------------------------- /06-best-practices/code/plan.md: -------------------------------------------------------------------------------- 1 | ## Plan 2 | 3 | - [x] Testing the code: unit tests with pytest 4 | - [x] Integration tests with docker-compose 5 | - [x] Testing cloud services with LocalStack 6 | - [x] Code quality: linting and formatting 7 | - [x] Git pre-commit hooks 8 | - [x] Makefiles and make 9 | - [ ] Staging and production environments 10 | - [ ] Infrastructure as Code 11 | - [ ] CI/CD and GitHub Actions 12 | -------------------------------------------------------------------------------- /05-monitoring/homework/prediction_service/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-slim-buster 2 | 3 | RUN pip install -U pip 4 | RUN pip install pipenv 5 | 6 | WORKDIR /app 7 | 8 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 9 | 10 | RUN pipenv install --system --deploy 11 | 12 | COPY [ "app.py", "lin_reg.bin", "lin_reg_V2.bin", "./" ] 13 | 14 | EXPOSE 9696 15 | 16 | ENTRYPOINT ["gunicorn", "--bind=0.0.0.0:9696", "app:app" ] 17 | -------------------------------------------------------------------------------- /04-deployment/web-service/README.md: -------------------------------------------------------------------------------- 1 | ## Deploying a model as a web-service 2 | 3 | * Creating a virtual environment with Pipenv 4 | * Creating a script for predicting 5 | * Putting the script into a Flask app 6 | * Packaging the app to Docker 7 | 8 | 9 | ```bash 10 | docker build -t ride-duration-prediction-service:v1 . 
11 | ``` 12 | 13 | ```bash 14 | docker run -it --rm -p 9696:9696 ride-duration-prediction-service:v1 15 | ``` 16 | -------------------------------------------------------------------------------- /05-monitoring/evidently_service/config/grafana_datasources.yaml: -------------------------------------------------------------------------------- 1 | # config file version 2 | apiVersion: 1 3 | 4 | # list of datasources that should be deleted from the database 5 | deleteDatasources: 6 | - name: Prometheus 7 | orgId: 1 8 | 9 | # list of datasources to insert/update depending 10 | # what's available in the database 11 | datasources: 12 | - name: Prometheus 13 | type: prometheus 14 | access: proxy 15 | url: http://prometheus.:9090 -------------------------------------------------------------------------------- /06-best-practices/code/infrastructure/modules/kinesis/main.tf: -------------------------------------------------------------------------------- 1 | # Create Kinesis Data Stream 2 | 3 | resource "aws_kinesis_stream" "stream" { 4 | name = var.stream_name 5 | shard_count = var.shard_count 6 | retention_period = var.retention_period 7 | shard_level_metrics = var.shard_level_metrics 8 | tags = { 9 | CreatedBy = var.tags 10 | } 11 | } 12 | 13 | output "stream_arn" { 14 | value = aws_kinesis_stream.stream.arn 15 | } 16 | -------------------------------------------------------------------------------- /06-best-practices/code/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pylint.messages_control] 2 | 3 | disable = [ 4 | "missing-function-docstring", 5 | "missing-final-newline", 6 | "missing-class-docstring", 7 | "missing-module-docstring", 8 | "invalid-name", 9 | "too-few-public-methods" 10 | ] 11 | 12 | [tool.black] 13 | line-length = 88 14 | target-version = ['py39'] 15 | skip-string-normalization = true 16 | 17 | [tool.isort] 18 | multi_line_output = 3 19 | length_sort = true 20 | -------------------------------------------------------------------------------- /06-best-practices/code/integraton-test/model/MLmodel: -------------------------------------------------------------------------------- 1 | artifact_path: model 2 | flavors: 3 | python_function: 4 | env: conda.yaml 5 | loader_module: mlflow.sklearn 6 | model_path: model.pkl 7 | python_version: 3.9.7 8 | sklearn: 9 | code: null 10 | pickled_model: model.pkl 11 | serialization_format: cloudpickle 12 | sklearn_version: 1.0.2 13 | mlflow_version: 1.26.1 14 | model_uuid: 78edf19ceea5463aadce7d84f3f9bc82 15 | run_id: e1efc53e9bd149078b0c12aeaa6365df 16 | utc_time_created: '2022-06-01 12:49:55.846831' 17 | -------------------------------------------------------------------------------- /04-deployment/batch/score_deploy.py: -------------------------------------------------------------------------------- 1 | from prefect.deployments import Deployment 2 | from prefect.orion.schemas.schedules import CronSchedule 3 | from score import ride_duration_prediction 4 | 5 | deployment = Deployment.build_from_flow( 6 | flow=ride_duration_prediction, 7 | name="ride_duration_prediction", 8 | parameters={ 9 | "taxi_type": "green", 10 | "run_id": "e1efc53e9bd149078b0c12aeaa6365df", 11 | }, 12 | schedule=CronSchedule(cron="0 3 2 * *"), 13 | work_queue_name="ml", 14 | ) 15 | 16 | deployment.apply() 17 | -------------------------------------------------------------------------------- /06-best-practices/code/lambda_function.py: -------------------------------------------------------------------------------- 1 | 
import os 2 | 3 | import model 4 | 5 | PREDICTIONS_STREAM_NAME = os.getenv('PREDICTIONS_STREAM_NAME', 'ride_predictions') 6 | RUN_ID = os.getenv('RUN_ID') 7 | TEST_RUN = os.getenv('TEST_RUN', 'False') == 'True' 8 | 9 | 10 | model_service = model.init( 11 | prediction_stream_name=PREDICTIONS_STREAM_NAME, 12 | run_id=RUN_ID, 13 | test_run=TEST_RUN, 14 | ) 15 | 16 | 17 | def lambda_handler(event, context): 18 | # pylint: disable=unused-argument 19 | return model_service.lambda_handler(event) 20 | -------------------------------------------------------------------------------- /05-monitoring/homework/prefect-monitoring/prepare_reference_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pyarrow.parquet as pq 3 | 4 | 5 | data_files = ["../datasets/green_tripdata_2021-03.parquet", "../datasets/green_tripdata_2021-04.parquet"] 6 | output_file = "green_tripdata_2021-03to04.parquet" 7 | 8 | df = pd.DataFrame() 9 | for file in data_files: 10 | data = pq.read_table(file).to_pandas() 11 | df = pd.concat([data, df], ignore_index=True) 12 | 13 | df.to_parquet( 14 | output_file, 15 | engine='pyarrow', 16 | compression=None, 17 | index=False 18 | ) 19 | -------------------------------------------------------------------------------- /05-monitoring/evidently_service/config.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | taxi: 3 | column_mapping: 4 | categorical_features: 5 | - 'PULocationID' 6 | - 'DOLocationID' 7 | numerical_features: 8 | - 'trip_distance' 9 | data_format: 10 | header: true 11 | separator: ',' 12 | monitors: 13 | - data_drift 14 | reference_file: ./datasets/green_tripdata_2021-01.parquet 15 | service: 16 | calculation_period_sec: 2 17 | min_reference_size: 30 18 | moving_reference: false 19 | datasets_path: datasets 20 | use_reference: true 21 | window_size: 5 22 | -------------------------------------------------------------------------------- /06-best-practices/code/Makefile: -------------------------------------------------------------------------------- 1 | LOCAL_TAG:=$(shell date +"%Y-%m-%d-%H-%M") 2 | LOCAL_IMAGE_NAME:=stream-model-duration:${LOCAL_TAG} 3 | 4 | test: 5 | pytest tests/ 6 | 7 | quality_checks: 8 | isort . 9 | black . 10 | pylint --recursive=y . 11 | 12 | build: quality_checks test 13 | docker build -t ${LOCAL_IMAGE_NAME} . 
14 | 15 | integration_test: build 16 | LOCAL_IMAGE_NAME=${LOCAL_IMAGE_NAME} bash integraton-test/run.sh 17 | 18 | publish: build integration_test 19 | LOCAL_IMAGE_NAME=${LOCAL_IMAGE_NAME} bash scripts/publish.sh 20 | 21 | setup: 22 | pipenv install --dev 23 | pre-commit install -------------------------------------------------------------------------------- /06-best-practices/code/integraton-test/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | backend: 3 | image: ${LOCAL_IMAGE_NAME} 4 | ports: 5 | - "8080:8080" 6 | environment: 7 | - PREDICTIONS_STREAM_NAME=${PREDICTIONS_STREAM_NAME} 8 | - RUN_ID=Test123 9 | - AWS_DEFAULT_REGION=eu-west-1 10 | - MODEL_LOCATION=/app/model 11 | - KINESIS_ENDPOINT_URL=http://kinesis:4566/ 12 | - AWS_ACCESS_KEY_ID=abc 13 | - AWS_SECRET_ACCESS_KEY=xyz 14 | volumes: 15 | - "./model:/app/model" 16 | kinesis: 17 | image: localstack/localstack 18 | ports: 19 | - "4566:4566" 20 | environment: 21 | - SERVICES=kinesis 22 | -------------------------------------------------------------------------------- /06-best-practices/homework_solution/integration_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | 5 | docker-compose up -d 6 | 7 | sleep 5 8 | 9 | export INPUT_FILE_PATTERN="s3://nyc-duration/in/{year:04d}-{month:02d}.parquet" 10 | export OUTPUT_FILE_PATTERN="s3://nyc-duration/out/{year:04d}-{month:02d}.parquet" 11 | export S3_ENDPOINT_URL="http://localhost:4566" 12 | 13 | 14 | aws --endpoint-url="${S3_ENDPOINT_URL}" s3 mb s3://nyc-duration 15 | 16 | pipenv run python integration_test.py 17 | 18 | ERROR_CODE=$? 19 | 20 | if [ ${ERROR_CODE} != 0 ]; then 21 | docker-compose logs 22 | docker-compose down 23 | exit ${ERROR_CODE} 24 | fi 25 | 26 | echo "yay tests work!" 
27 | 28 | docker-compose down -------------------------------------------------------------------------------- /05-monitoring/homework/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pprint 3 | from pymongo import MongoClient 4 | 5 | import requests 6 | 7 | MONGODB_ADDRESS = os.getenv("MONGODB_ADDRESS", "mongodb://127.0.0.1:27017/") 8 | FLASK_URL = "http://127.0.0.1:9696/predict-duration" 9 | 10 | 11 | mongo_client = MongoClient(MONGODB_ADDRESS) 12 | mongo_db = mongo_client['prediction_service'] 13 | mongo_collection = mongo_db['data'] 14 | ride_test_data = { 15 | "PULocationID": 10, 16 | "DOLocationID": 50, 17 | "trip_distance": 40 18 | } 19 | 20 | 21 | if __name__ == "__main__": 22 | requests.post(url=FLASK_URL, json=ride_test_data) 23 | for coll in mongo_collection.find(): 24 | pprint.pprint(coll) 25 | -------------------------------------------------------------------------------- /05-monitoring/prepare.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import requests 3 | 4 | files = [("green_tripdata_2022-01.parquet", "."), ("green_tripdata_2021-01.parquet", "./evidently_service/datasets")] 5 | 6 | print(f"Download files:") 7 | for file, path in files: 8 | url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}" 9 | resp = requests.get(url, stream=True) 10 | save_path = f"{path}/{file}" 11 | with open(save_path, "wb") as handle: 12 | for data in tqdm(resp.iter_content(), 13 | desc=f"{file}", 14 | postfix=f"save to {save_path}", 15 | total=int(resp.headers["Content-Length"])): 16 | handle.write(data) 17 | -------------------------------------------------------------------------------- /04-deployment/batch/score_backfill.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from dateutil.relativedelta import relativedelta 3 | 4 | from prefect import flow 5 | 6 | import score 7 | 8 | 9 | @flow 10 | def ride_duration_prediction_backfill(): 11 | start_date = datetime(year=2021, month=3, day=1) 12 | end_date = datetime(year=2022, month=4, day=1) 13 | 14 | d = start_date 15 | 16 | while d <= end_date: 17 | score.ride_duration_prediction( 18 | taxi_type='green', 19 | run_id='e1efc53e9bd149078b0c12aeaa6365df', 20 | run_date=d 21 | ) 22 | 23 | d = d + relativedelta(months=1) 24 | 25 | 26 | if __name__ == '__main__': 27 | ride_duration_prediction_backfill() -------------------------------------------------------------------------------- /06-best-practices/code/infrastructure/modules/ecr/variables.tf: -------------------------------------------------------------------------------- 1 | variable "ecr_repo_name" { 2 | type = string 3 | description = "ECR repo name" 4 | } 5 | 6 | variable "ecr_image_tag" { 7 | type = string 8 | description = "ECR image tag" 9 | default = "latest" 10 | } 11 | 12 | variable "lambda_function_local_path" { 13 | type = string 14 | description = "Local path to lambda function / python file" 15 | } 16 | 17 | variable "docker_image_local_path" { 18 | type = string 19 | description = "Local path to Dockerfile" 20 | } 21 | 22 | variable "region" { 23 | type = string 24 | description = "region" 25 | default = "eu-west-1" 26 | } 27 | 28 | variable "account_id" { 29 | } 30 | -------------------------------------------------------------------------------- /03-orchestration/work-queue.py: -------------------------------------------------------------------------------- 1 | from 
prefect import flow 2 | 3 | @flow 4 | def myflow(): 5 | print("hello") 6 | 7 | from prefect.deployments import Deployment 8 | from prefect.orion.schemas.schedules import IntervalSchedule 9 | from datetime import timedelta 10 | 11 | deployment_dev = Deployment.build_from_flow( 12 | flow=myflow, 13 | name="model_training-dev", 14 | schedule=IntervalSchedule(interval=timedelta(minutes=5)), 15 | work_queue_name="dev" 16 | ) 17 | 18 | deployment_dev.apply() 19 | 20 | deployment_prod = Deployment.build_from_flow( 21 | flow=myflow, 22 | name="model_training-prod", 23 | schedule=IntervalSchedule(interval=timedelta(minutes=5)), 24 | work_queue_name="prod" 25 | ) 26 | 27 | deployment_prod.apply() 28 | 29 | -------------------------------------------------------------------------------- /06-best-practices/code/infrastructure/variables.tf: -------------------------------------------------------------------------------- 1 | variable "aws_region" { 2 | description = "AWS region to create resources" 3 | default = "eu-west-1" 4 | } 5 | 6 | variable "project_id" { 7 | description = "project_id" 8 | default = "mlops-zoomcamp" 9 | } 10 | 11 | variable "source_stream_name" { 12 | description = "" 13 | } 14 | 15 | variable "output_stream_name" { 16 | description = "" 17 | } 18 | 19 | variable "model_bucket" { 20 | description = "s3_bucket" 21 | } 22 | 23 | variable "lambda_function_local_path" { 24 | description = "" 25 | } 26 | 27 | variable "docker_image_local_path" { 28 | description = "" 29 | } 30 | 31 | variable "ecr_repo_name" { 32 | description = "" 33 | } 34 | 35 | variable "lambda_function_name" { 36 | description = "" 37 | } -------------------------------------------------------------------------------- /05-monitoring/homework/prepare.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import requests 3 | 4 | files = ["green_tripdata_2021-03.parquet", "green_tripdata_2021-04.parquet", "green_tripdata_2021-05.parquet"] 5 | path = "./datasets" 6 | print(f"Download files:") 7 | for file in files: 8 | 9 | # Change the url based on what works for you whether s3 or cloudfront 10 | url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}" 11 | resp = requests.get(url, stream=True) 12 | save_path = f"{path}/{file}" 13 | with open(save_path, "wb") as handle: 14 | for data in tqdm(resp.iter_content(), 15 | desc=f"{file}", 16 | postfix=f"save to {save_path}", 17 | total=int(resp.headers["Content-Length"])): 18 | handle.write(data) 19 | -------------------------------------------------------------------------------- /06-best-practices/code/scripts/test_cloud_e2e.sh: -------------------------------------------------------------------------------- 1 | export KINESIS_STREAM_INPUT="stg_ride_events-mlops-zoomcamp" 2 | export KINESIS_STREAM_OUTPUT="stg_ride_predictions-mlops-zoomcamp" 3 | 4 | SHARD_ID=$(aws kinesis put-record \ 5 | --stream-name ${KINESIS_STREAM_INPUT} \ 6 | --partition-key 1 --cli-binary-format raw-in-base64-out \ 7 | --data '{"ride": { 8 | "PULocationID": 130, 9 | "DOLocationID": 205, 10 | "trip_distance": 3.66 11 | }, 12 | "ride_id": 156}' \ 13 | --query 'ShardId' 14 | ) 15 | 16 | #SHARD_ITERATOR=$(aws kinesis get-shard-iterator --shard-id ${SHARD_ID} --shard-iterator-type TRIM_HORIZON --stream-name ${KINESIS_STREAM_OUTPUT} --query 'ShardIterator') 17 | 18 | #aws kinesis get-records --shard-iterator $SHARD_ITERATOR 19 | -------------------------------------------------------------------------------- 
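Note on the end-to-end test script above: `test_cloud_e2e.sh` only pushes a test event to the staging input stream and leaves the read-back commands commented out. Below is a minimal sketch of how that verification step could be completed, assuming the same staging output stream, a single shard, an authenticated AWS CLI, and `jq` available; the shard id shown is illustrative rather than taken from the original script.

```bash
# Hypothetical completion of the commented-out read-back step (not part of the original script).
export KINESIS_STREAM_OUTPUT="stg_ride_predictions-mlops-zoomcamp"

# Take an iterator at the start of the (assumed single) shard of the output stream
SHARD_ITERATOR=$(aws kinesis get-shard-iterator \
    --stream-name ${KINESIS_STREAM_OUTPUT} \
    --shard-id shardId-000000000000 \
    --shard-iterator-type TRIM_HORIZON \
    --query 'ShardIterator' \
    --output text)

# Read the records and decode the base64-encoded payload of the first one
aws kinesis get-records --shard-iterator ${SHARD_ITERATOR} \
    | jq -r '.Records[0].Data' \
    | base64 --decode
```

If the Lambda has processed the event, the decoded payload should include the prediction written for `ride_id` 156.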
/04-deployment/web-service-mlflow/README.md: -------------------------------------------------------------------------------- 1 | ## Getting the model for deployment from MLflow 2 | 3 | * Take the code from the previous video 4 | * Train another model, register with MLflow 5 | * Put the model into a scikit-learn pipeline 6 | * Model deployment with tracking server 7 | * Model deployment without the tracking server 8 | 9 | Starting the MLflow server with S3: 10 | 11 | ```bash 12 | mlflow server \ 13 | --backend-store-uri=sqlite:///mlflow.db \ 14 | --default-artifact-root=s3://mlflow-models-alexey/ 15 | ``` 16 | 17 | Downloading the artifact 18 | 19 | ```bash 20 | export MLFLOW_TRACKING_URI="http://127.0.0.1:5000" 21 | export MODEL_RUN_ID="6dd459b11b4e48dc862f4e1019d166f6" 22 | 23 | mlflow artifacts download \ 24 | --run-id ${MODEL_RUN_ID} \ 25 | --artifact-path model \ 26 | --dst-path . 27 | ``` -------------------------------------------------------------------------------- /05-monitoring/homework/docker-compose-homework.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | volumes: 4 | mongo_data: {} 5 | 6 | networks: 7 | front-tier: 8 | back-tier: 9 | 10 | services: 11 | prediction_service: 12 | build: 13 | context: prediction_service 14 | dockerfile: Dockerfile 15 | depends_on: 16 | - mongo 17 | environment: 18 | MONGO_DATABASE: "prediction_service" 19 | MONGO_ADDRESS: "mongodb://mongo.:27017/" 20 | MODEL_VERSION: "1" 21 | MODEL_FILE: "lin_reg.bin" 22 | 23 | ports: 24 | - 9696:9696 25 | networks: 26 | - back-tier 27 | - front-tier 28 | 29 | mongo: 30 | image: mongo 31 | ports: 32 | - 27017:27017 33 | volumes: 34 | - mongo_data:/data/db 35 | networks: 36 | - back-tier 37 | - front-tier 38 | -------------------------------------------------------------------------------- /05-monitoring/homework/docker-compose-homework-solution.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | volumes: 4 | mongo_data: {} 5 | 6 | networks: 7 | front-tier: 8 | back-tier: 9 | 10 | services: 11 | prediction_service: 12 | build: 13 | context: prediction_service 14 | dockerfile: Dockerfile 15 | depends_on: 16 | - mongo 17 | environment: 18 | MONGO_DATABASE: "prediction_service" 19 | MONGO_ADDRESS: "mongodb://mongo.:27017/" 20 | MODEL_VERSION: "2" 21 | MODEL_FILE: "lin_reg_V2.bin" 22 | 23 | ports: 24 | - 9696:9696 25 | networks: 26 | - back-tier 27 | - front-tier 28 | 29 | mongo: 30 | image: mongo 31 | ports: 32 | - 27017:27017 33 | volumes: 34 | - mongo_data:/data/db 35 | networks: 36 | - back-tier 37 | - front-tier 38 | -------------------------------------------------------------------------------- /06-best-practices/code/infrastructure/modules/lambda/variables.tf: -------------------------------------------------------------------------------- 1 | variable "source_stream_name" { 2 | type = string 3 | description = "Source Kinesis Data Streams stream name" 4 | } 5 | 6 | variable "source_stream_arn" { 7 | type = string 8 | description = "Source Kinesis Data Streams stream name" 9 | } 10 | 11 | variable "output_stream_name" { 12 | description = "Name of output stream where all the events will be passed" 13 | } 14 | 15 | variable "output_stream_arn" { 16 | description = "ARN of output stream where all the events will be passed" 17 | } 18 | 19 | variable "model_bucket" { 20 | description = "Name of the bucket" 21 | } 22 | 23 | variable "lambda_function_name" { 24 | description = "Name of the 
lambda function" 25 | } 26 | 27 | variable "image_uri" { 28 | description = "ECR image uri" 29 | } 30 | -------------------------------------------------------------------------------- /06-best-practices/code/infrastructure/modules/kinesis/variables.tf: -------------------------------------------------------------------------------- 1 | variable "stream_name" { 2 | type = string 3 | description = "Kinesis stream name" 4 | } 5 | 6 | variable "shard_count" { 7 | type = number 8 | description = "Kinesis stream shard count" 9 | } 10 | 11 | variable "retention_period" { 12 | type = number 13 | description = "Kinesis stream retention period" 14 | } 15 | 16 | variable "shard_level_metrics" { 17 | type = list(string) 18 | description = "shard_level_metrics" 19 | default = [ 20 | "IncomingBytes", 21 | "OutgoingBytes", 22 | "OutgoingRecords", 23 | "ReadProvisionedThroughputExceeded", 24 | "WriteProvisionedThroughputExceeded", 25 | "IncomingRecords", 26 | "IteratorAgeMilliseconds", 27 | ] 28 | } 29 | 30 | variable "tags" { 31 | description = "Tags for kinesis stream" 32 | default = "mlops-zoomcamp" 33 | } 34 | -------------------------------------------------------------------------------- /04-deployment/web-service/predict.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from flask import Flask, request, jsonify 4 | 5 | with open('lin_reg.bin', 'rb') as f_in: 6 | (dv, model) = pickle.load(f_in) 7 | 8 | 9 | def prepare_features(ride): 10 | features = {} 11 | features['PU_DO'] = '%s_%s' % (ride['PULocationID'], ride['DOLocationID']) 12 | features['trip_distance'] = ride['trip_distance'] 13 | return features 14 | 15 | 16 | def predict(features): 17 | X = dv.transform(features) 18 | preds = model.predict(X) 19 | return float(preds[0]) 20 | 21 | 22 | app = Flask('duration-prediction') 23 | 24 | 25 | @app.route('/predict', methods=['POST']) 26 | def predict_endpoint(): 27 | ride = request.get_json() 28 | 29 | features = prepare_features(ride) 30 | pred = predict(features) 31 | 32 | result = { 33 | 'duration': pred 34 | } 35 | 36 | return jsonify(result) 37 | 38 | 39 | if __name__ == "__main__": 40 | app.run(debug=True, host='0.0.0.0', port=9696) -------------------------------------------------------------------------------- /01-intro/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "module": { 3 | "number": 1, 4 | "title": "Introduction" 5 | }, 6 | "units": [ 7 | { 8 | "number": 1, 9 | "title": "Introduction", 10 | "youtube": "https://www.youtube.com/watch?v=s0uaFZSzwfI" 11 | }, 12 | { 13 | "number": 2, 14 | "title": "Environment preparation", 15 | "youtube": "https://www.youtube.com/watch?v=IXSiYkP23zo" 16 | }, 17 | { 18 | "number": 3, 19 | "title": "(Optional) Training a ride duration prediction model", 20 | "youtube": "https://www.youtube.com/watch?v=iRunifGSHFc" 21 | }, 22 | { 23 | "number": 4, 24 | "title": "Course overview", 25 | "youtube": "https://www.youtube.com/watch?v=teP9KWkP6SM" 26 | }, 27 | { 28 | "number": 5, 29 | "title": "MLOps maturity model", 30 | "youtube": "https://www.youtube.com/watch?v=XwTH8BDGzYk" 31 | }, 32 | { 33 | "number": 6, 34 | "title": "Homework", 35 | "youtube": "" 36 | } 37 | ] 38 | } -------------------------------------------------------------------------------- /05-monitoring/evidently_service/config/grafana_dashboards.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | # an 
unique provider name. Required 5 | - name: 'Evidently Dashboards' 6 | # Org id. Default to 1 7 | orgId: 1 8 | # name of the dashboard folder. 9 | folder: '' 10 | # folder UID. will be automatically generated if not specified 11 | folderUid: '' 12 | # provider type. Default to 'file' 13 | type: file 14 | # disable dashboard deletion 15 | disableDeletion: false 16 | # how often Grafana will scan for changed dashboards 17 | updateIntervalSeconds: 10 18 | # allow updating provisioned dashboards from the UI 19 | allowUiUpdates: false 20 | options: 21 | # path to dashboard files on disk. Required when using the 'file' type 22 | path: /opt/grafana/dashboards 23 | # use folder names from filesystem to create folders in Grafana 24 | foldersFromFilesStructure: true 25 | -------------------------------------------------------------------------------- /06-best-practices/code/integraton-test/test_docker.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=duplicate-code 2 | 3 | import json 4 | 5 | import requests 6 | from deepdiff import DeepDiff 7 | 8 | with open('event.json', 'rt', encoding='utf-8') as f_in: 9 | event = json.load(f_in) 10 | 11 | 12 | url = 'http://localhost:8080/2015-03-31/functions/function/invocations' 13 | actual_response = requests.post(url, json=event).json() 14 | print('actual response:') 15 | 16 | print(json.dumps(actual_response, indent=2)) 17 | 18 | expected_response = { 19 | 'predictions': [ 20 | { 21 | 'model': 'ride_duration_prediction_model', 22 | 'version': 'Test123', 23 | 'prediction': { 24 | 'ride_duration': 21.3, 25 | 'ride_id': 256, 26 | }, 27 | } 28 | ] 29 | } 30 | 31 | 32 | diff = DeepDiff(actual_response, expected_response, significant_digits=1) 33 | print(f'diff={diff}') 34 | 35 | assert 'type_changes' not in diff 36 | assert 'values_changed' not in diff 37 | -------------------------------------------------------------------------------- /05-monitoring/homework/prefect-monitoring/monitor_profile.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 48, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pymongo import MongoClient\n", 10 | "import pprint" 11 | ] 12 | } 13 | ], 14 | "metadata": { 15 | "kernelspec": { 16 | "display_name": "Python 3.9.12 ('prediction_service_practice-b8Zbdkaa')", 17 | "language": "python", 18 | "name": "python3" 19 | }, 20 | "language_info": { 21 | "codemirror_mode": { 22 | "name": "ipython", 23 | "version": 3 24 | }, 25 | "file_extension": ".py", 26 | "mimetype": "text/x-python", 27 | "name": "python", 28 | "nbconvert_exporter": "python", 29 | "pygments_lexer": "ipython3", 30 | "version": "3.9.12" 31 | }, 32 | "orig_nbformat": 4, 33 | "vscode": { 34 | "interpreter": { 35 | "hash": "63df8a96dcc14a3f8fc6f13bb4daf95ac616547a440980d0dc62a5d5ed05a07e" 36 | } 37 | } 38 | }, 39 | "nbformat": 4, 40 | "nbformat_minor": 2 41 | } 42 | -------------------------------------------------------------------------------- /02-experiment-tracking/homework/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle 4 | 5 | from sklearn.ensemble import RandomForestRegressor 6 | from sklearn.metrics import mean_squared_error 7 | 8 | 9 | def load_pickle(filename: str): 10 | with open(filename, "rb") as f_in: 11 | return pickle.load(f_in) 12 | 13 | 14 | def run(data_path): 15 | 16 | X_train, y_train = 
load_pickle(os.path.join(data_path, "train.pkl")) 17 | X_valid, y_valid = load_pickle(os.path.join(data_path, "valid.pkl")) 18 | 19 | rf = RandomForestRegressor(max_depth=10, random_state=0) 20 | rf.fit(X_train, y_train) 21 | y_pred = rf.predict(X_valid) 22 | 23 | rmse = mean_squared_error(y_valid, y_pred, squared=False) 24 | 25 | 26 | if __name__ == '__main__': 27 | 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument( 30 | "--data_path", 31 | default="./output", 32 | help="the location where the processed NYC taxi trip data was saved." 33 | ) 34 | args = parser.parse_args() 35 | 36 | run(args.data_path) 37 | -------------------------------------------------------------------------------- /05-monitoring/send_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import uuid 3 | from datetime import datetime 4 | from time import sleep 5 | 6 | import pyarrow.parquet as pq 7 | import requests 8 | 9 | table = pq.read_table("green_tripdata_2022-01.parquet") 10 | data = table.to_pylist() 11 | 12 | 13 | class DateTimeEncoder(json.JSONEncoder): 14 | def default(self, o): 15 | if isinstance(o, datetime): 16 | return o.isoformat() 17 | return json.JSONEncoder.default(self, o) 18 | 19 | 20 | with open("target.csv", 'w') as f_target: 21 | for row in data: 22 | row['id'] = str(uuid.uuid4()) 23 | duration = (row['lpep_dropoff_datetime'] - row['lpep_pickup_datetime']).total_seconds() / 60 24 | if duration != 0.0: 25 | f_target.write(f"{row['id']},{duration}\n") 26 | resp = requests.post("http://127.0.0.1:9696/predict", 27 | headers={"Content-Type": "application/json"}, 28 | data=json.dumps(row, cls=DateTimeEncoder)).json() 29 | print(f"prediction: {resp['duration']}") 30 | sleep(1) 31 | -------------------------------------------------------------------------------- /after-sign-up.md: -------------------------------------------------------------------------------- 1 | ## Thank you! 2 | 3 | Thanks for signing up for the course. 4 | 5 | The process of adding you to the mailing list is not automated yet, 6 | but you will hear from us closer to the course start. 7 | 8 | To make sure you don't miss any announcements: 9 | 10 | - Register in [DataTalks.Club's Slack](https://datatalks.club/slack.html) and join the [`#course-mlops-zoomcamp`](https://app.slack.com/client/T01ATQK62F8/C02R98X7DS9) channel 11 | - Join the [course Telegram channel with announcements](https://t.me/dtc_courses) 12 | - [Tweet about the course!](https://ctt.ac/fH67W) 13 | - Subscribe to [DataTalks.Club's YouTube channel](https://www.youtube.com/c/DataTalksClub) and check 14 | [the course playlist](https://www.youtube.com/playlist?list=PL3MmuxUbc_hIUISrluw_A7wDSmfOhErJK) 15 | - [Technical FAQ](https://docs.google.com/document/d/12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0/edit) 16 | - Subscribe to our [public Google Calendar](https://calendar.google.com/calendar/?cid=M3Jzbmg0ZDA2aHVsY2M1ZjcyNDJtODNyMTRAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) (it works from Desktop only) 17 | 18 | See you in May! 
19 | -------------------------------------------------------------------------------- /06-best-practices/code/integraton-test/event.json: -------------------------------------------------------------------------------- 1 | { 2 | "Records": [ 3 | { 4 | "kinesis": { 5 | "kinesisSchemaVersion": "1.0", 6 | "partitionKey": "1", 7 | "sequenceNumber": "49630081666084879290581185630324770398608704880802529282", 8 | "data": "ewogICAgICAgICJyaWRlIjogewogICAgICAgICAgICAiUFVMb2NhdGlvbklEIjogMTMwLAogICAgICAgICAgICAiRE9Mb2NhdGlvbklEIjogMjA1LAogICAgICAgICAgICAidHJpcF9kaXN0YW5jZSI6IDMuNjYKICAgICAgICB9LCAKICAgICAgICAicmlkZV9pZCI6IDI1NgogICAgfQ==", 9 | "approximateArrivalTimestamp": 1654161514.132 10 | }, 11 | "eventSource": "aws:kinesis", 12 | "eventVersion": "1.0", 13 | "eventID": "shardId-000000000000:49630081666084879290581185630324770398608704880802529282", 14 | "eventName": "aws:kinesis:record", 15 | "invokeIdentityArn": "arn:aws:iam::387546586013:role/lambda-kinesis-role", 16 | "awsRegion": "eu-west-1", 17 | "eventSourceARN": "arn:aws:kinesis:eu-west-1:387546586013:stream/ride_events" 18 | } 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /05-monitoring/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "module": { 3 | "number": 5, 4 | "title": "Model Monitoring" 5 | }, 6 | "units": [ 7 | { 8 | "number": 1, 9 | "title": "Monitoring for ML-based services", 10 | "youtube": "https://www.youtube.com/watch?v=gMiT11Bp05A" 11 | }, 12 | { 13 | "number": 2, 14 | "title": "Setting up the environment", 15 | "youtube": "https://www.youtube.com/watch?v=VkkpVXW53bo" 16 | }, 17 | { 18 | "number": 3, 19 | "title": "Creating a prediction service and simulating traffic", 20 | "youtube": "https://www.youtube.com/watch?v=umQ3Mo5G1o8" 21 | }, 22 | { 23 | "number": 4, 24 | "title": "Realtime monitoring walktrough (Prometheus, Evidently, Grafana)", 25 | "youtube": "https://www.youtube.com/watch?v=r_m4VFEJ8yY" 26 | }, 27 | { 28 | "number": 5, 29 | "title": "Batch monitoring walktrough (Prefect, MongoDB, Evidently)", 30 | "youtube": "https://www.youtube.com/watch?v=KefdYuue_FE" 31 | }, 32 | { 33 | "number": 6, 34 | "title": "Homework", 35 | "youtube": "" 36 | } 37 | ] 38 | } -------------------------------------------------------------------------------- /04-deployment/web-service-mlflow/predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | import mlflow 5 | from flask import Flask, request, jsonify 6 | 7 | 8 | RUN_ID = os.getenv('RUN_ID') 9 | 10 | logged_model = f's3://mlflow-models-alexey/1/{RUN_ID}/artifacts/model' 11 | # logged_model = f'runs:/{RUN_ID}/model' 12 | model = mlflow.pyfunc.load_model(logged_model) 13 | 14 | 15 | def prepare_features(ride): 16 | features = {} 17 | features['PU_DO'] = '%s_%s' % (ride['PULocationID'], ride['DOLocationID']) 18 | features['trip_distance'] = ride['trip_distance'] 19 | return features 20 | 21 | 22 | def predict(features): 23 | preds = model.predict(features) 24 | return float(preds[0]) 25 | 26 | 27 | app = Flask('duration-prediction') 28 | 29 | 30 | @app.route('/predict', methods=['POST']) 31 | def predict_endpoint(): 32 | ride = request.get_json() 33 | 34 | features = prepare_features(ride) 35 | pred = predict(features) 36 | 37 | result = { 38 | 'duration': pred, 39 | 'model_version': RUN_ID 40 | } 41 | 42 | return jsonify(result) 43 | 44 | 45 | if __name__ == "__main__": 46 | app.run(debug=True, 
host='0.0.0.0', port=9696) 47 | -------------------------------------------------------------------------------- /06-best-practices/code/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: end-of-file-fixer 9 | - id: check-yaml 10 | - id: check-added-large-files 11 | - repo: https://github.com/pycqa/isort 12 | rev: 5.10.1 13 | hooks: 14 | - id: isort 15 | name: isort (python) 16 | - repo: https://github.com/psf/black 17 | rev: 22.6.0 18 | hooks: 19 | - id: black 20 | language_version: python3.9 21 | - repo: local 22 | hooks: 23 | - id: pylint 24 | name: pylint 25 | entry: pylint 26 | language: system 27 | types: [python] 28 | args: [ 29 | "-rn", # Only display messages 30 | "-sn", # Don't display the score 31 | "--recursive=y" 32 | ] 33 | - repo: local 34 | hooks: 35 | - id: pytest-check 36 | name: pytest-check 37 | entry: pytest 38 | language: system 39 | pass_filenames: false 40 | always_run: true 41 | args: [ 42 | "tests/" 43 | ] 44 | -------------------------------------------------------------------------------- /05-monitoring/homework/prefect-monitoring/send_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import uuid 3 | from datetime import datetime 4 | 5 | import pyarrow.parquet as pq 6 | import requests 7 | 8 | table = pq.read_table("../datasets/green_tripdata_2021-05.parquet")\ 9 | .to_pandas()\ 10 | .sample(n=5000, random_state=42) #5000 rows sampled 11 | data = table.copy() 12 | 13 | 14 | class DateTimeEncoder(json.JSONEncoder): 15 | def default(self, o): 16 | if isinstance(o, datetime): 17 | return o.isoformat() 18 | return json.JSONEncoder.default(self, o) 19 | 20 | 21 | with open("target.csv", 'w') as f_target: 22 | for index, row in data.iterrows(): 23 | row['id'] = str(uuid.uuid4()) 24 | duration = (row['lpep_dropoff_datetime'] - row['lpep_pickup_datetime']).total_seconds() / 60 25 | if duration >= 1 and duration <= 60: 26 | f_target.write(f"{row['id']},{duration}\n") 27 | resp = requests.post("http://127.0.0.1:9696/predict-duration", 28 | headers={"Content-Type": "application/json"}, 29 | data=row.to_json()).json() 30 | print(f"prediction: {resp['data']['duration']}") 31 | -------------------------------------------------------------------------------- /03-orchestration/windows.md: -------------------------------------------------------------------------------- 1 | ## Prefect on Windows 2 | 3 | If you use WSL, you should have no problems running Prefect Orion. 4 | 5 | But if you aren't, there is just a slight tweak to installation instructions if you are on Windows. 6 | 7 | You will need to install 2.0b7 (to be released soon). 2.0b7 will officially support Windows. Use this instead of 2.0b5 shows in the lectures. 8 | 9 | ``` 10 | pip install prefect==2.0b7 11 | ``` 12 | 13 | Note that 2.0b5 and 2.0b7 are not compatible because 2.0b7 contains breaking changes. If you run into issues, you can reset the Prefect database by doing: 14 | 15 | ``` 16 | prefect orion database reset 17 | ``` 18 | 19 | This command will clear the data held by Orion. 20 | 21 | ### Docker 22 | 23 | You can also try running Prefect in Docker. 
For example: 24 | 25 | ``` 26 | docker run -it --rm \ 27 | -p 4200:4200 \ 28 | prefecthq/prefect:2.0b5-python3.8 \ 29 | prefect orion start --host=0.0.0.0 30 | ``` 31 | 32 | and then view it from `localhost:4200`. 33 | 34 | ### Prefect Cloud 35 | 36 | You can also just use Cloud so you don't have to host Prefect Orion yourself. Instructions can be found here: 37 | 38 | https://orion-docs.prefect.io/ui/cloud-getting-started/ 39 | -------------------------------------------------------------------------------- /06-best-practices/code/integraton-test/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [[ -z "${GITHUB_ACTIONS}" ]]; then 4 | cd "$(dirname "$0")" 5 | fi 6 | 7 | if [ "${LOCAL_IMAGE_NAME}" == "" ]; then 8 | LOCAL_TAG=`date +"%Y-%m-%d-%H-%M"` 9 | export LOCAL_IMAGE_NAME="stream-model-duration:${LOCAL_TAG}" 10 | echo "LOCAL_IMAGE_NAME is not set, building a new image with tag ${LOCAL_IMAGE_NAME}" 11 | docker build -t ${LOCAL_IMAGE_NAME} .. 12 | else 13 | echo "no need to build image ${LOCAL_IMAGE_NAME}" 14 | fi 15 | 16 | export PREDICTIONS_STREAM_NAME="ride_predictions" 17 | 18 | docker-compose up -d 19 | 20 | sleep 5 21 | 22 | aws --endpoint-url=http://localhost:4566 \ 23 | kinesis create-stream \ 24 | --stream-name ${PREDICTIONS_STREAM_NAME} \ 25 | --shard-count 1 26 | 27 | pipenv run python test_docker.py 28 | 29 | ERROR_CODE=$? 30 | 31 | if [ ${ERROR_CODE} != 0 ]; then 32 | docker-compose logs 33 | docker-compose down 34 | exit ${ERROR_CODE} 35 | fi 36 | 37 | 38 | pipenv run python test_kinesis.py 39 | 40 | ERROR_CODE=$? 41 | 42 | if [ ${ERROR_CODE} != 0 ]; then 43 | docker-compose logs 44 | docker-compose down 45 | exit ${ERROR_CODE} 46 | fi 47 | 48 | 49 | docker-compose down 50 | -------------------------------------------------------------------------------- /04-deployment/streaming/test.py: -------------------------------------------------------------------------------- 1 | 2 | import lambda_function 3 | 4 | event = { 5 | "Records": [ 6 | { 7 | "kinesis": { 8 | "kinesisSchemaVersion": "1.0", 9 | "partitionKey": "1", 10 | "sequenceNumber": "49630081666084879290581185630324770398608704880802529282", 11 | "data": "ewogICAgICAgICJyaWRlIjogewogICAgICAgICAgICAiUFVMb2NhdGlvbklEIjogMTMwLAogICAgICAgICAgICAiRE9Mb2NhdGlvbklEIjogMjA1LAogICAgICAgICAgICAidHJpcF9kaXN0YW5jZSI6IDMuNjYKICAgICAgICB9LCAKICAgICAgICAicmlkZV9pZCI6IDI1NgogICAgfQ==", 12 | "approximateArrivalTimestamp": 1654161514.132 13 | }, 14 | "eventSource": "aws:kinesis", 15 | "eventVersion": "1.0", 16 | "eventID": "shardId-000000000000:49630081666084879290581185630324770398608704880802529282", 17 | "eventName": "aws:kinesis:record", 18 | "invokeIdentityArn": "arn:aws:iam::387546586013:role/lambda-kinesis-role", 19 | "awsRegion": "eu-west-1", 20 | "eventSourceARN": "arn:aws:kinesis:eu-west-1:387546586013:stream/ride_events" 21 | } 22 | ] 23 | } 24 | 25 | 26 | result = lambda_function.lambda_handler(event, None) 27 | print(result) 28 | -------------------------------------------------------------------------------- /06-best-practices/homework_solution/integration_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datetime import datetime 4 | import pandas as pd 5 | 6 | import batch 7 | 8 | def dt(hour, minute, second=0): 9 | return datetime(2021, 1, 1, hour, minute, second) 10 | 11 | 12 | S3_ENDPOINT_URL = os.getenv('S3_ENDPOINT_URL') 13 | 14 | options = { 15 | 'client_kwargs': { 16 | 
'endpoint_url': S3_ENDPOINT_URL 17 | } 18 | } 19 | 20 | data = [ 21 | (None, None, dt(1, 2), dt(1, 10)), 22 | (1, 1, dt(1, 2), dt(1, 10)), 23 | (1, 1, dt(1, 2, 0), dt(1, 2, 50)), 24 | (1, 1, dt(1, 2, 0), dt(2, 2, 1)), 25 | ] 26 | 27 | columns = ['PUlocationID', 'DOlocationID', 'pickup_datetime', 'dropOff_datetime'] 28 | df_input = pd.DataFrame(data, columns=columns) 29 | 30 | 31 | input_file = batch.get_input_path(2021, 1) 32 | output_file = batch.get_output_path(2021, 1) 33 | 34 | df_input.to_parquet( 35 | input_file, 36 | engine='pyarrow', 37 | compression=None, 38 | index=False, 39 | storage_options=options 40 | ) 41 | 42 | 43 | os.system('python batch.py 2021 1') 44 | 45 | 46 | df_actual = pd.read_parquet(output_file, storage_options=options) 47 | 48 | 49 | assert abs(df_actual['predicted_duration'].sum() - 69.28) < 0.1 -------------------------------------------------------------------------------- /06-best-practices/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "module": { 3 | "number": 6, 4 | "title": "Best Practices" 5 | }, 6 | "units": [ 7 | { 8 | "number": "1", 9 | "title": "Testing Python code with pytest", 10 | "youtube": "https://www.youtube.com/watch?v=CJp1eFQP5nk" 11 | }, 12 | { 13 | "number": "2", 14 | "title": "Integration tests with docker-compose", 15 | "youtube": "https://www.youtube.com/watch?v=lBX0Gl7Z1ck" 16 | }, 17 | { 18 | "number": "3", 19 | "title": "Testing cloud services with LocalStack", 20 | "youtube": "https://www.youtube.com/watch?v=9yMO86SYvuI" 21 | }, 22 | { 23 | "number": "4", 24 | "title": "Code quality: linting and formatting", 25 | "youtube": "https://www.youtube.com/watch?v=uImvWE-iSDQ" 26 | }, 27 | { 28 | "number": "5", 29 | "title": "Git pre-commit hooks", 30 | "youtube": "https://www.youtube.com/watch?v=lmMZ7Axk2T8" 31 | }, 32 | { 33 | "number": "6", 34 | "title": "Makefiles and make", 35 | "youtube": "https://www.youtube.com/watch?v=F6DZdvbRZQQ" 36 | }, 37 | { 38 | "number": "X", 39 | "title": "Homework", 40 | "youtube": "" 41 | } 42 | ] 43 | } -------------------------------------------------------------------------------- /03-orchestration/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "module": { 3 | "number": 3, 4 | "title": "Orchestration and ML Pipelines" 5 | }, 6 | "units": [ 7 | { 8 | "number": 1, 9 | "title": "Negative engineering and workflow orchestration", 10 | "youtube": "https://www.youtube.com/watch?v=eKzCjNXoCTc" 11 | }, 12 | { 13 | "number": 2, 14 | "title": "Introduction to Prefect 2.0", 15 | "youtube": "https://www.youtube.com/watch?v=Yb6NJwI7bXw" 16 | }, 17 | { 18 | "number": 3, 19 | "title": "First Prefect flow and basics", 20 | "youtube": "https://www.youtube.com/watch?v=MCFpURG506w" 21 | }, 22 | { 23 | "number": 4, 24 | "title": "Remote Prefect Orion deployment", 25 | "youtube": "https://www.youtube.com/watch?v=ComkSIAB0k4" 26 | }, 27 | { 28 | "number": 5, 29 | "title": "Deployment of Prefect flow", 30 | "youtube": "https://www.youtube.com/watch?v=xw9JfaWPPps" 31 | }, 32 | { 33 | "number": 6, 34 | "title": "MLOps Zoomcamp 3.6 - (Optional) Work queues and agents", 35 | "youtube": "https://www.youtube.com/watch?v=oDSf0ThKsso" 36 | }, 37 | { 38 | "number": 7, 39 | "title": "Homework", 40 | "youtube": "" 41 | } 42 | ] 43 | } -------------------------------------------------------------------------------- /06-best-practices/homework_solution/tests/test_batch.py: 
-------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import pandas as pd 4 | 5 | import batch 6 | 7 | 8 | def dt(hour, minute, second=0): 9 | return datetime(2021, 1, 1, hour, minute, second) 10 | 11 | 12 | def test_prepare_data(): 13 | data = [ 14 | (None, None, dt(1, 2), dt(1, 10)), 15 | (1, 1, dt(1, 2), dt(1, 10)), 16 | (1, 1, dt(1, 2, 0), dt(1, 2, 50)), 17 | (1, 1, dt(1, 2, 0), dt(2, 2, 1)), 18 | ] 19 | 20 | categorical = ['PUlocationID', 'DOlocationID'] 21 | columns = ['PUlocationID', 'DOlocationID', 'pickup_datetime', 'dropOff_datetime'] 22 | df = pd.DataFrame(data, columns=columns) 23 | 24 | df_actual = batch.prepare_data(df, categorical) 25 | 26 | data_expected = [ 27 | ('-1', '-1', 8.0), 28 | ( '1', '1', 8.0), 29 | ] 30 | 31 | columns_test = ['PUlocationID', 'DOlocationID', 'duration'] 32 | df_expected = pd.DataFrame(data_expected, columns=columns_test) 33 | print(df_actual) 34 | 35 | assert (df_actual['PUlocationID'] == df_expected['PUlocationID']).all() 36 | assert (df_actual['DOlocationID'] == df_expected['DOlocationID']).all() 37 | assert (df_actual['duration'] - df_expected['duration']).abs().sum() < 0.0000001 38 | 39 | 40 | -------------------------------------------------------------------------------- /04-deployment/streaming/test_docker.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | event = { 4 | "Records": [ 5 | { 6 | "kinesis": { 7 | "kinesisSchemaVersion": "1.0", 8 | "partitionKey": "1", 9 | "sequenceNumber": "49630081666084879290581185630324770398608704880802529282", 10 | "data": "ewogICAgICAgICJyaWRlIjogewogICAgICAgICAgICAiUFVMb2NhdGlvbklEIjogMTMwLAogICAgICAgICAgICAiRE9Mb2NhdGlvbklEIjogMjA1LAogICAgICAgICAgICAidHJpcF9kaXN0YW5jZSI6IDMuNjYKICAgICAgICB9LCAKICAgICAgICAicmlkZV9pZCI6IDI1NgogICAgfQ==", 11 | "approximateArrivalTimestamp": 1654161514.132 12 | }, 13 | "eventSource": "aws:kinesis", 14 | "eventVersion": "1.0", 15 | "eventID": "shardId-000000000000:49630081666084879290581185630324770398608704880802529282", 16 | "eventName": "aws:kinesis:record", 17 | "invokeIdentityArn": "arn:aws:iam::387546586013:role/lambda-kinesis-role", 18 | "awsRegion": "eu-west-1", 19 | "eventSourceARN": "arn:aws:kinesis:eu-west-1:387546586013:stream/ride_events" 20 | } 21 | ] 22 | } 23 | 24 | 25 | url = 'http://localhost:8080/2015-03-31/functions/function/invocations' 26 | response = requests.post(url, json=event) 27 | print(response.json()) 28 | -------------------------------------------------------------------------------- /asking-questions.md: -------------------------------------------------------------------------------- 1 | ## Asking questions 2 | 3 | If you have any questions, ask them 4 | in the [`#course-mlops-zoomcamp`](https://app.slack.com/client/T01ATQK62F8/C02R98X7DS9) channel in [DataTalks.Club](https://datatalks.club) slack. 5 | 6 | To keep our discussion in Slack more organized, we ask you to follow these suggestions: 7 | 8 | * Before asking a question, check [FAQ](https://docs.google.com/document/d/12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0/edit). 9 | * Use threads. When you have a problem, first describe the problem shortly 10 | and then put the actual error in the thread - so it doesn't take the entire screen. 11 | * Instead of screenshots, it's better to copy-paste the error you're getting in text. 12 | Use ` ``` ` for formatting your code. 13 | It's very difficult to read text from screenshots. 
14 | * Please don't take pictures of your code with a phone. It's even harder to read. Follow the previous suggestion, 15 | and in rare cases when you need to show what happens on your screen, take a screenshot. 16 | * You don't need to tag the instructors when you have a problem. We will see it eventually. 17 | * If somebody helped you with your problem and it's not in [FAQ](https://docs.google.com/document/d/12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0/edit), please add it there. 18 | It'll help other students. 19 | -------------------------------------------------------------------------------- /06-best-practices/code/scripts/deploy_manual.sh: -------------------------------------------------------------------------------- 1 | AWS_REGION="eu-west-1" 2 | 3 | # Dynamically generated by TF 4 | export MODEL_BUCKET_PROD="stg-mlflow-models-code-owners-mlops-zoomcamp" 5 | export PREDICTIONS_STREAM_NAME="stg_ride_predictions-mlops-zoomcamp" 6 | export LAMBDA_FUNCTION="stg_prediction_lambda_mlops-zoomcamp" 7 | 8 | # Model artifacts bucket from the previous weeks (MLflow experiments) 9 | export MODEL_BUCKET_DEV="mlflow-models-alexey" 10 | 11 | # Get latest RUN_ID from latest S3 partition. 12 | # NOT FOR PRODUCTION! 13 | # In practice, this is generally picked up from your experiment tracking tool such as MLflow or DVC 14 | export RUN_ID=$(aws s3api list-objects-v2 --bucket ${MODEL_BUCKET_DEV} \ 15 | --query 'sort_by(Contents, &LastModified)[-1].Key' --output=text | cut -f2 -d/) 16 | 17 | # NOT FOR PRODUCTION! 18 | # Just mocking the artifacts from training process in the Prod env 19 | aws s3 sync s3://${MODEL_BUCKET_DEV} s3://${MODEL_BUCKET_PROD} 20 | 21 | # Set new var RUN_ID in existing set of vars. 22 | variables="{PREDICTIONS_STREAM_NAME=${PREDICTIONS_STREAM_NAME}, MODEL_BUCKET=${MODEL_BUCKET_PROD}, RUN_ID=${RUN_ID}}" 23 | 24 | # https://docs.aws.amazon.com/lambda/latest/dg/configuration-envvars.html 25 | aws lambda update-function-configuration --function-name ${LAMBDA_FUNCTION} --environment "Variables=${variables}" 26 | -------------------------------------------------------------------------------- /04-deployment/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "module": { 3 | "number": 4, 4 | "title": "Model Deployment" 5 | }, 6 | "units": [ 7 | { 8 | "number": 1, 9 | "title": "Three ways of deploying a model", 10 | "youtube": "https://www.youtube.com/watch?v=JMGe4yIoBRA" 11 | }, 12 | { 13 | "number": 2, 14 | "title": "Web-services: Deploying models with Flask and Docker", 15 | "youtube": "https://www.youtube.com/watch?v=D7wfMAdgdF8" 16 | }, 17 | { 18 | "number": 3, 19 | "title": "Web-services: Getting the models from the model registry (MLflow)", 20 | "youtube": "https://www.youtube.com/watch?v=aewOpHSCkqI" 21 | }, 22 | { 23 | "number": 4, 24 | "title": "(Optional) Streaming: Deploying models with Kinesis and Lambda ", 25 | "youtube": "https://www.youtube.com/watch?v=TCqr9HNcrsI" 26 | }, 27 | { 28 | "number": 5, 29 | "title": "Batch: Preparing a scoring script", 30 | "youtube": "https://www.youtube.com/watch?v=18Lbaaeigek" 31 | }, 32 | { 33 | "number": 6, 34 | "title": "MLOps Zoomcamp 4.6 - Batch: Scheduling batch scoring jobs with Prefect", 35 | "youtube": "https://www.youtube.com/watch?v=ekT_JW213Tc" 36 | }, 37 | { 38 | "number": 7, 39 | "title": "Homework", 40 | "youtube": "" 41 | } 42 | ] 43 | } -------------------------------------------------------------------------------- /02-experiment-tracking/meta.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "module": { 3 | "number": 2, 4 | "title": "Experiment tracking and model management" 5 | }, 6 | "units": [ 7 | { 8 | "number": 1, 9 | "title": "Experiment tracking intro", 10 | "youtube": "https://www.youtube.com/watch?v=MiA7LQin9c8" 11 | }, 12 | { 13 | "number": 2, 14 | "title": "Getting started with MLflow", 15 | "youtube": "https://www.youtube.com/watch?v=cESCQE9J3ZE" 16 | }, 17 | { 18 | "number": 3, 19 | "title": "Experiment tracking with MLflow", 20 | "youtube": "https://www.youtube.com/watch?v=iaJz-T7VWec" 21 | }, 22 | { 23 | "number": 4, 24 | "title": "Model management", 25 | "youtube": "https://www.youtube.com/watch?v=OVUPIX88q88" 26 | }, 27 | { 28 | "number": 5, 29 | "title": "Model registry", 30 | "youtube": "https://www.youtube.com/watch?v=TKHU7HAvGH8" 31 | }, 32 | { 33 | "number": 6, 34 | "title": "MLflow in practice", 35 | "youtube": "https://www.youtube.com/watch?v=1ykg4YmbFVA" 36 | }, 37 | { 38 | "number": 7, 39 | "title": "MLflow: benefits, limitations and alternatives", 40 | "youtube": "https://www.youtube.com/watch?v=Lugy1JPsBRY" 41 | }, 42 | { 43 | "number": 8, 44 | "title": "Homework", 45 | "youtube": "" 46 | } 47 | ] 48 | } -------------------------------------------------------------------------------- /06-best-practices/code/infrastructure/modules/lambda/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_lambda_function" "kinesis_lambda" { 2 | function_name = var.lambda_function_name 3 | # This can also be any base image to bootstrap the lambda config, unrelated to your Inference service on ECR 4 | # which would be anyway updated regularly via a CI/CD pipeline 5 | image_uri = var.image_uri # required-argument 6 | package_type = "Image" 7 | role = aws_iam_role.iam_lambda.arn 8 | tracing_config { 9 | mode = "Active" 10 | } 11 | // This step is optional (environment) 12 | environment { 13 | variables = { 14 | PREDICTIONS_STREAM_NAME = var.output_stream_name 15 | MODEL_BUCKET = var.model_bucket 16 | } 17 | } 18 | timeout = 180 19 | } 20 | 21 | # Lambda Invoke & Event Source Mapping: 22 | 23 | resource "aws_lambda_function_event_invoke_config" "kinesis_lambda_event" { 24 | function_name = aws_lambda_function.kinesis_lambda.function_name 25 | maximum_event_age_in_seconds = 60 26 | maximum_retry_attempts = 0 27 | } 28 | 29 | resource "aws_lambda_event_source_mapping" "kinesis_mapping" { 30 | event_source_arn = var.source_stream_arn 31 | function_name = aws_lambda_function.kinesis_lambda.arn 32 | starting_position = "LATEST" 33 | depends_on = [ 34 | aws_iam_role_policy_attachment.kinesis_processing 35 | ] 36 | // enabled = var.lambda_event_source_mapping_enabled 37 | // batch_size = var.lambda_event_source_mapping_batch_size 38 | } 39 | -------------------------------------------------------------------------------- /06-best-practices/code/integraton-test/test_kinesis.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=duplicate-code 2 | 3 | import os 4 | import json 5 | from pprint import pprint 6 | 7 | import boto3 8 | from deepdiff import DeepDiff 9 | 10 | kinesis_endpoint = os.getenv('KINESIS_ENDPOINT_URL', "http://localhost:4566") 11 | kinesis_client = boto3.client('kinesis', endpoint_url=kinesis_endpoint) 12 | 13 | stream_name = os.getenv('PREDICTIONS_STREAM_NAME', 'ride_predictions') 14 | shard_id = 'shardId-000000000000' 15 | 16 | 17 | shard_iterator_response = 
kinesis_client.get_shard_iterator( 18 | StreamName=stream_name, 19 | ShardId=shard_id, 20 | ShardIteratorType='TRIM_HORIZON', 21 | ) 22 | 23 | shard_iterator_id = shard_iterator_response['ShardIterator'] 24 | 25 | 26 | records_response = kinesis_client.get_records( 27 | ShardIterator=shard_iterator_id, 28 | Limit=1, 29 | ) 30 | 31 | 32 | records = records_response['Records'] 33 | pprint(records) 34 | 35 | 36 | assert len(records) == 1 37 | 38 | 39 | actual_record = json.loads(records[0]['Data']) 40 | pprint(actual_record) 41 | 42 | expected_record = { 43 | 'model': 'ride_duration_prediction_model', 44 | 'version': 'Test123', 45 | 'prediction': { 46 | 'ride_duration': 21.3, 47 | 'ride_id': 256, 48 | }, 49 | } 50 | 51 | diff = DeepDiff(actual_record, expected_record, significant_digits=1) 52 | print(f'diff={diff}') 53 | 54 | assert 'values_changed' not in diff 55 | assert 'type_changes' not in diff 56 | 57 | 58 | print('all good') 59 | -------------------------------------------------------------------------------- /06-best-practices/homework/batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import sys 5 | import pickle 6 | import pandas as pd 7 | 8 | 9 | year = int(sys.argv[1]) 10 | month = int(sys.argv[2]) 11 | 12 | input_file = f'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/nyc-tlc/fhv/fhv_tripdata_{year:04d}-{month:02d}.parquet' 13 | output_file = f's3://nyc-duration-prediction-alexey/taxi_type=fhv/year={year:04d}/month={month:02d}/predictions.parquet' 14 | 15 | 16 | with open('model.bin', 'rb') as f_in: 17 | dv, lr = pickle.load(f_in) 18 | 19 | 20 | categorical = ['PUlocationID', 'DOlocationID'] 21 | 22 | def read_data(filename): 23 | df = pd.read_parquet(filename) 24 | 25 | df['duration'] = df.dropOff_datetime - df.pickup_datetime 26 | df['duration'] = df.duration.dt.total_seconds() / 60 27 | 28 | df = df[(df.duration >= 1) & (df.duration <= 60)].copy() 29 | 30 | df[categorical] = df[categorical].fillna(-1).astype('int').astype('str') 31 | 32 | return df 33 | 34 | 35 | df = read_data(input_file) 36 | df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str') 37 | 38 | 39 | dicts = df[categorical].to_dict(orient='records') 40 | X_val = dv.transform(dicts) 41 | y_pred = lr.predict(X_val) 42 | 43 | 44 | print('predicted mean duration:', y_pred.mean()) 45 | 46 | 47 | df_result = pd.DataFrame() 48 | df_result['ride_id'] = df['ride_id'] 49 | df_result['predicted_duration'] = y_pred 50 | 51 | df_result.to_parquet(output_file, engine='pyarrow', index=False) -------------------------------------------------------------------------------- /04-deployment/homework/batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import sys 5 | import pickle 6 | import pandas as pd 7 | 8 | 9 | year = int(sys.argv[1]) # 2021 10 | month = int(sys.argv[2]) #2 11 | 12 | input_file = f's3://nyc-tlc/trip data/fhv_tripdata_{year:04d}-{month:02d}.parquet' 13 | output_file = f's3://nyc-duration-prediction-alexey/taxi_type=fhv/year={year:04d}/month={month:02d}/predictions.parquet' 14 | 15 | 16 | with open('model.bin', 'rb') as f_in: 17 | dv, lr = pickle.load(f_in) 18 | 19 | 20 | categorical = ['PUlocationID', 'DOlocationID'] 21 | 22 | def read_data(filename): 23 | df = pd.read_parquet(filename) 24 | 25 | df['duration'] = df.dropOff_datetime - df.pickup_datetime 26 | df['duration'] = 
df.duration.dt.total_seconds() / 60 27 | 28 | df = df[(df.duration >= 1) & (df.duration <= 60)].copy() 29 | 30 | df[categorical] = df[categorical].fillna(-1).astype('int').astype('str') 31 | 32 | return df 33 | 34 | 35 | df = read_data(input_file) 36 | df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str') 37 | 38 | 39 | dicts = df[categorical].to_dict(orient='records') 40 | X_val = dv.transform(dicts) 41 | y_pred = lr.predict(X_val) 42 | 43 | 44 | print('predicted mean duration:', y_pred.mean()) 45 | 46 | 47 | df_result = pd.DataFrame() 48 | df_result['ride_id'] = df['ride_id'] 49 | df_result['predicted_duration'] = y_pred 50 | 51 | 52 | df_result.to_parquet( 53 | output_file, 54 | engine='pyarrow', 55 | compression=None, 56 | index=False 57 | ) -------------------------------------------------------------------------------- /05-monitoring/prediction_service/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | import requests 5 | from flask import Flask 6 | from flask import request 7 | from flask import jsonify 8 | 9 | from pymongo import MongoClient 10 | 11 | 12 | MODEL_FILE = os.getenv('MODEL_FILE', 'lin_reg.bin') 13 | 14 | EVIDENTLY_SERVICE_ADDRESS = os.getenv('EVIDENTLY_SERVICE', 'http://127.0.0.1:5000') 15 | MONGODB_ADDRESS = os.getenv("MONGODB_ADDRESS", "mongodb://127.0.0.1:27017") 16 | 17 | with open(MODEL_FILE, 'rb') as f_in: 18 | dv, model = pickle.load(f_in) 19 | 20 | 21 | app = Flask('duration') 22 | mongo_client = MongoClient(MONGODB_ADDRESS) 23 | db = mongo_client.get_database("prediction_service") 24 | collection = db.get_collection("data") 25 | 26 | 27 | @app.route('/predict', methods=['POST']) 28 | def predict(): 29 | record = request.get_json() 30 | 31 | record['PU_DO'] = '%s_%s' % (record['PULocationID'], record['DOLocationID']) 32 | 33 | X = dv.transform([record]) 34 | y_pred = model.predict(X) 35 | 36 | result = { 37 | 'duration': float(y_pred), 38 | } 39 | 40 | save_to_db(record, float(y_pred)) 41 | send_to_evidently_service(record, float(y_pred)) 42 | return jsonify(result) 43 | 44 | 45 | def save_to_db(record, prediction): 46 | rec = record.copy() 47 | rec['prediction'] = prediction 48 | collection.insert_one(rec) 49 | 50 | 51 | def send_to_evidently_service(record, prediction): 52 | rec = record.copy() 53 | rec['prediction'] = prediction 54 | requests.post(f"{EVIDENTLY_SERVICE_ADDRESS}/iterate/taxi", json=[rec]) 55 | 56 | 57 | if __name__ == "__main__": 58 | app.run(debug=True, host='0.0.0.0', port=9696) -------------------------------------------------------------------------------- /06-best-practices/code/infrastructure/modules/ecr/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_ecr_repository" "repo" { 2 | name = var.ecr_repo_name 3 | image_tag_mutability = "MUTABLE" 4 | 5 | image_scanning_configuration { 6 | scan_on_push = false 7 | } 8 | 9 | force_delete = true 10 | } 11 | 12 | # In practice, the Image build-and-push step is handled separately by the CI/CD pipeline and not the IaC script. 
13 | # But because the lambda config would fail without an existing Image URI in ECR, 14 | # we can also upload any base image to bootstrap the lambda config, unrelated to your Inference logic 15 | resource null_resource ecr_image { 16 | triggers = { 17 | python_file = md5(file(var.lambda_function_local_path)) 18 | docker_file = md5(file(var.docker_image_local_path)) 19 | } 20 | 21 | provisioner "local-exec" { 22 | command = <` to any timeseries scraped from this config. 30 | 31 | - job_name: 'prometheus' 32 | 33 | # Override the global default and scrape targets from this job every 5 seconds. 34 | scrape_interval: 5s 35 | 36 | static_configs: 37 | - targets: ['localhost:9090'] 38 | 39 | 40 | # - job_name: 'cadvisor' 41 | # 42 | # # Override the global default and scrape targets from this job every 5 seconds. 43 | # scrape_interval: 5s 44 | # 45 | # dns_sd_configs: 46 | # - names: 47 | # - 'tasks.cadvisor' 48 | # type: 'A' 49 | # port: 8080 50 | 51 | # static_configs: 52 | # - targets: ['cadvisor:8080'] 53 | 54 | # - job_name: 'node-exporter' 55 | # 56 | # # Override the global default and scrape targets from this job every 5 seconds. 57 | # scrape_interval: 5s 58 | # 59 | # dns_sd_configs: 60 | # - names: 61 | # - 'tasks.node-exporter' 62 | # type: 'A' 63 | # port: 9100 64 | 65 | # - job_name: 'pushgateway' 66 | # scrape_interval: 10s 67 | # dns_\sd_configs: 68 | # - names: 69 | # - 'tasks.pushgateway' 70 | # type: 'A' 71 | # port: 9091 72 | 73 | # static_configs: 74 | # - targets: ['node-exporter:9100'] 75 | - job_name: 'service' 76 | scrape_interval: 10s 77 | static_configs: 78 | - targets: ['evidently_service.:8085'] -------------------------------------------------------------------------------- /05-monitoring/homework/model_training.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import pandas as pd 4 | import pyarrow.parquet as pq 5 | from sklearn.feature_extraction import DictVectorizer 6 | from sklearn.linear_model import LinearRegression 7 | 8 | 9 | def read_dataframe(filename): 10 | df = pq.read_table(filename).to_pandas() 11 | 12 | df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime) 13 | df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime) 14 | 15 | df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime 16 | df.duration = df.duration.apply(lambda td: td.total_seconds() / 60) 17 | 18 | df = df[(df.duration >= 1) & (df.duration <= 60)] 19 | 20 | categorical = ['PULocationID', 'DOLocationID'] 21 | df[categorical] = df[categorical].astype(str) 22 | 23 | return df 24 | 25 | def add_features(train_data="./datasets/green_tripdata_2021-03.parquet", 26 | additional_training_data=None): 27 | df_train = read_dataframe(train_data) 28 | 29 | if additional_training_data: 30 | extra_data = read_dataframe(additional_training_data) 31 | df_train = pd.concat([df_train, extra_data], axis=0, ignore_index=True) 32 | 33 | 34 | 35 | df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID'] 36 | 37 | categorical = ['PU_DO'] 38 | numerical = ['trip_distance'] 39 | 40 | dv = DictVectorizer() 41 | 42 | train_dicts = df_train[categorical + numerical].to_dict(orient='records') 43 | X_train = dv.fit_transform(train_dicts) 44 | 45 | target = 'duration' 46 | y_train = df_train[target].values 47 | 48 | return X_train, y_train, dv 49 | 50 | 51 | 52 | 53 | if __name__ == "__main__": 54 | X_train, y_train, dv = add_features() 55 | 56 | print("Training model with one month of data") 57 | lr 
= LinearRegression() 58 | lr.fit(X_train, y_train) 59 | 60 | 61 | with open('prediction_service/lin_reg.bin', 'wb') as f_out: 62 | pickle.dump((dv, lr), f_out) 63 | 64 | X_train, y_train, dv = add_features(additional_training_data="./datasets/green_tripdata_2021-04.parquet") 65 | print("Training model with two months of data") 66 | lr = LinearRegression() 67 | lr.fit(X_train, y_train) 68 | 69 | with open('prediction_service/lin_reg_V2.bin', 'wb') as f_out: 70 | pickle.dump((dv, lr), f_out) 71 | -------------------------------------------------------------------------------- /06-best-practices/code/infrastructure/main.tf: -------------------------------------------------------------------------------- 1 | # Make sure to create state bucket beforehand 2 | terraform { 3 | required_version = ">= 1.0" 4 | backend "s3" { 5 | bucket = "tf-state-mlops-zoomcamp" 6 | key = "mlops-zoomcamp-stg.tfstate" 7 | region = "eu-west-1" 8 | encrypt = true 9 | } 10 | } 11 | 12 | provider "aws" { 13 | region = var.aws_region 14 | } 15 | 16 | data "aws_caller_identity" "current_identity" {} 17 | 18 | locals { 19 | account_id = data.aws_caller_identity.current_identity.account_id 20 | } 21 | 22 | # ride_events 23 | module "source_kinesis_stream" { 24 | source = "./modules/kinesis" 25 | retention_period = 48 26 | shard_count = 2 27 | stream_name = "${var.source_stream_name}-${var.project_id}" 28 | tags = var.project_id 29 | } 30 | 31 | # ride_predictions 32 | module "output_kinesis_stream" { 33 | source = "./modules/kinesis" 34 | retention_period = 48 35 | shard_count = 2 36 | stream_name = "${var.output_stream_name}-${var.project_id}" 37 | tags = var.project_id 38 | } 39 | 40 | # model bucket 41 | module "s3_bucket" { 42 | source = "./modules/s3" 43 | bucket_name = "${var.model_bucket}-${var.project_id}" 44 | } 45 | 46 | # image registry 47 | module "ecr_image" { 48 | source = "./modules/ecr" 49 | ecr_repo_name = "${var.ecr_repo_name}_${var.project_id}" 50 | account_id = local.account_id 51 | lambda_function_local_path = var.lambda_function_local_path 52 | docker_image_local_path = var.docker_image_local_path 53 | } 54 | 55 | module "lambda_function" { 56 | source = "./modules/lambda" 57 | image_uri = module.ecr_image.image_uri 58 | lambda_function_name = "${var.lambda_function_name}_${var.project_id}" 59 | model_bucket = module.s3_bucket.name 60 | output_stream_name = "${var.output_stream_name}-${var.project_id}" 61 | output_stream_arn = module.output_kinesis_stream.stream_arn 62 | source_stream_name = "${var.source_stream_name}-${var.project_id}" 63 | source_stream_arn = module.source_kinesis_stream.stream_arn 64 | } 65 | 66 | # For CI/CD 67 | output "lambda_function" { 68 | value = "${var.lambda_function_name}_${var.project_id}" 69 | } 70 | 71 | output "model_bucket" { 72 | value = module.s3_bucket.name 73 | } 74 | 75 | output "predictions_stream_name" { 76 | value = "${var.output_stream_name}-${var.project_id}" 77 | } 78 | 79 | output "ecr_repo" { 80 | value = "${var.ecr_repo_name}_${var.project_id}" 81 | } 82 | -------------------------------------------------------------------------------- /03-orchestration/homework.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from sklearn.feature_extraction import DictVectorizer 4 | from sklearn.linear_model import LinearRegression 5 | from sklearn.metrics import mean_squared_error 6 | 7 | def read_data(path): 8 | df = pd.read_parquet(path) 9 | return df 10 | 11 | def prepare_features(df, categorical, 
train=True): 12 | df['duration'] = df.dropOff_datetime - df.pickup_datetime 13 | df['duration'] = df.duration.dt.total_seconds() / 60 14 | df = df[(df.duration >= 1) & (df.duration <= 60)].copy() 15 | 16 | mean_duration = df.duration.mean() 17 | if train: 18 | print(f"The mean duration of training is {mean_duration}") 19 | else: 20 | print(f"The mean duration of validation is {mean_duration}") 21 | 22 | df[categorical] = df[categorical].fillna(-1).astype('int').astype('str') 23 | return df 24 | 25 | def train_model(df, categorical): 26 | 27 | train_dicts = df[categorical].to_dict(orient='records') 28 | dv = DictVectorizer() 29 | X_train = dv.fit_transform(train_dicts) 30 | y_train = df.duration.values 31 | 32 | print(f"The shape of X_train is {X_train.shape}") 33 | print(f"The DictVectorizer has {len(dv.feature_names_)} features") 34 | 35 | lr = LinearRegression() 36 | lr.fit(X_train, y_train) 37 | y_pred = lr.predict(X_train) 38 | mse = mean_squared_error(y_train, y_pred, squared=False) 39 | print(f"The MSE of training is: {mse}") 40 | return lr, dv 41 | 42 | def run_model(df, categorical, dv, lr): 43 | val_dicts = df[categorical].to_dict(orient='records') 44 | X_val = dv.transform(val_dicts) 45 | y_pred = lr.predict(X_val) 46 | y_val = df.duration.values 47 | 48 | mse = mean_squared_error(y_val, y_pred, squared=False) 49 | print(f"The MSE of validation is: {mse}") 50 | return 51 | 52 | def main(train_path: str = './data/fhv_tripdata_2021-01.parquet', 53 | val_path: str = './data/fhv_tripdata_2021-02.parquet'): 54 | 55 | categorical = ['PUlocationID', 'DOlocationID'] 56 | 57 | df_train = read_data(train_path) 58 | df_train_processed = prepare_features(df_train, categorical) 59 | 60 | df_val = read_data(val_path) 61 | df_val_processed = prepare_features(df_val, categorical, False) 62 | 63 | # train the model 64 | lr, dv = train_model(df_train_processed, categorical) 65 | run_model(df_val_processed, categorical, dv, lr) 66 | 67 | main() 68 | -------------------------------------------------------------------------------- /05-monitoring/homework/prediction_service/app.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pickle 4 | import uuid 5 | 6 | from flask import Flask, jsonify, request 7 | from pymongo import MongoClient 8 | 9 | 10 | MONGO_ADDRESS = os.getenv("MONGO_ADDRESS", "mongodb://localhost:27017/") 11 | MONGO_DATABASE = os.getenv("MONGO_DATABASE", "ride_prediction") 12 | LOGGED_MODEL = os.getenv("MODEL_FILE", "lin_reg.bin") 13 | MODEL_VERSION = os.getenv("MODEL_VERSION", "1") 14 | 15 | with open(LOGGED_MODEL, 'rb') as f_in: 16 | dv, model = pickle.load(f_in) 17 | 18 | 19 | mongo_client = MongoClient(MONGO_ADDRESS) 20 | mongo_db = mongo_client[MONGO_DATABASE] 21 | mongo_collection = mongo_db.get_collection("data") 22 | 23 | 24 | app = Flask("Ride-Prediction-Service") 25 | logging.basicConfig(level=logging.INFO) 26 | 27 | 28 | def prepare_features(ride): 29 | """Function to prepare features before making prediction""" 30 | 31 | record = ride.copy() 32 | record['PU_DO'] = '%s_%s' % (record['PULocationID'], record['DOLocationID']) 33 | 34 | features = dv.transform([record]) 35 | 36 | return features, record 37 | 38 | 39 | def save_db(record, pred_result): 40 | """Save data to mongo db collection""" 41 | 42 | rec = record.copy() 43 | rec["prediction"] = pred_result[0] 44 | mongo_collection.insert_one(rec) 45 | 46 | 47 | 48 | @app.route("/", methods=["GET"]) 49 | def get_info(): 50 | """Function to provide 
info about the app""" 51 | info = """ Ride Prediction Service 52 | 53 | Data Request Example 54 | 55 | "ride = { 56 | "PULocationID": 10, 57 | "DOLocationID": 50, 58 | "trip_distance": 40 59 | }" 60 | 61 | 62 | 
""" 63 | return info 64 | 65 | @app.route("/predict-duration", methods=["POST"]) 66 | def predict_duration(): 67 | """Function to predict duration""" 68 | 69 | ride = request.get_json() 70 | features, record = prepare_features(ride) 71 | 72 | prediction = model.predict(features) 73 | ride_id = str(uuid.uuid4()) 74 | pred_data = { 75 | "ride_id": ride_id, 76 | "PU_DO": record["PU_DO"], 77 | "trip_distance": record["trip_distance"], 78 | "status": 200, 79 | "duration": prediction[0], 80 | "model_version": MODEL_VERSION 81 | } 82 | 83 | save_db(record, prediction) 84 | 85 | result = { 86 | "statusCode": 200, 87 | "data" : pred_data 88 | } 89 | 90 | return jsonify(result) 91 | 92 | 93 | if __name__ == "__main__": 94 | app.run(debug=True, host="0.0.0.0", port=9696) 95 | -------------------------------------------------------------------------------- /06-best-practices/code/tests/model_test.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import model 4 | 5 | 6 | def read_text(file): 7 | test_directory = Path(__file__).parent 8 | 9 | with open(test_directory / file, 'rt', encoding='utf-8') as f_in: 10 | return f_in.read().strip() 11 | 12 | 13 | def test_base64_decode(): 14 | base64_input = read_text('data.b64') 15 | 16 | actual_result = model.base64_decode(base64_input) 17 | expected_result = { 18 | "ride": { 19 | "PULocationID": 130, 20 | "DOLocationID": 205, 21 | "trip_distance": 3.66, 22 | }, 23 | "ride_id": 256, 24 | } 25 | 26 | assert actual_result == expected_result 27 | 28 | 29 | def test_prepare_features(): 30 | model_service = model.ModelService(None) 31 | 32 | ride = { 33 | "PULocationID": 130, 34 | "DOLocationID": 205, 35 | "trip_distance": 3.66, 36 | } 37 | 38 | actual_features = model_service.prepare_features(ride) 39 | 40 | expected_fetures = { 41 | "PU_DO": "130_205", 42 | "trip_distance": 3.66, 43 | } 44 | 45 | assert actual_features == expected_fetures 46 | 47 | 48 | class ModelMock: 49 | def __init__(self, value): 50 | self.value = value 51 | 52 | def predict(self, X): 53 | n = len(X) 54 | return [self.value] * n 55 | 56 | 57 | def test_predict(): 58 | model_mock = ModelMock(10.0) 59 | model_service = model.ModelService(model_mock) 60 | 61 | features = { 62 | "PU_DO": "130_205", 63 | "trip_distance": 3.66, 64 | } 65 | 66 | actual_prediction = model_service.predict(features) 67 | expected_prediction = 10.0 68 | 69 | assert actual_prediction == expected_prediction 70 | 71 | 72 | def test_lambda_handler(): 73 | model_mock = ModelMock(10.0) 74 | model_version = 'Test123' 75 | model_service = model.ModelService(model_mock, model_version) 76 | 77 | base64_input = read_text('data.b64') 78 | 79 | event = { 80 | "Records": [ 81 | { 82 | "kinesis": { 83 | "data": base64_input, 84 | }, 85 | } 86 | ] 87 | } 88 | 89 | actual_predictions = model_service.lambda_handler(event) 90 | expected_predictions = { 91 | 'predictions': [ 92 | { 93 | 'model': 'ride_duration_prediction_model', 94 | 'version': model_version, 95 | 'prediction': { 96 | 'ride_duration': 10.0, 97 | 'ride_id': 256, 98 | }, 99 | } 100 | ] 101 | } 102 | 103 | assert actual_predictions == expected_predictions 104 | -------------------------------------------------------------------------------- /02-experiment-tracking/homework/preprocess_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle 4 | 5 | import pandas as pd 6 | from sklearn.feature_extraction import 
DictVectorizer 7 | 8 | 9 | def dump_pickle(obj, filename): 10 | with open(filename, "wb") as f_out: 11 | return pickle.dump(obj, f_out) 12 | 13 | 14 | def read_dataframe(filename: str): 15 | df = pd.read_parquet(filename) 16 | 17 | df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime 18 | df.duration = df.duration.apply(lambda td: td.total_seconds() / 60) 19 | df = df[(df.duration >= 1) & (df.duration <= 60)] 20 | 21 | categorical = ['PULocationID', 'DOLocationID'] 22 | df[categorical] = df[categorical].astype(str) 23 | 24 | return df 25 | 26 | 27 | def preprocess(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False): 28 | df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID'] 29 | categorical = ['PU_DO'] 30 | numerical = ['trip_distance'] 31 | dicts = df[categorical + numerical].to_dict(orient='records') 32 | if fit_dv: 33 | X = dv.fit_transform(dicts) 34 | else: 35 | X = dv.transform(dicts) 36 | return X, dv 37 | 38 | 39 | def run(raw_data_path: str, dest_path: str, dataset: str = "green"): 40 | # load parquet files 41 | df_train = read_dataframe( 42 | os.path.join(raw_data_path, f"{dataset}_tripdata_2021-01.parquet") 43 | ) 44 | df_valid = read_dataframe( 45 | os.path.join(raw_data_path, f"{dataset}_tripdata_2021-02.parquet") 46 | ) 47 | df_test = read_dataframe( 48 | os.path.join(raw_data_path, f"{dataset}_tripdata_2021-03.parquet") 49 | ) 50 | 51 | # extract the target 52 | target = 'duration' 53 | y_train = df_train[target].values 54 | y_valid = df_valid[target].values 55 | y_test = df_test[target].values 56 | 57 | # fit the dictvectorizer and preprocess data 58 | dv = DictVectorizer() 59 | X_train, dv = preprocess(df_train, dv, fit_dv=True) 60 | X_valid, _ = preprocess(df_valid, dv, fit_dv=False) 61 | X_test, _ = preprocess(df_test, dv, fit_dv=False) 62 | 63 | # create dest_path folder unless it already exists 64 | os.makedirs(dest_path, exist_ok=True) 65 | 66 | # save dictvectorizer and datasets 67 | dump_pickle(dv, os.path.join(dest_path, "dv.pkl")) 68 | dump_pickle((X_train, y_train), os.path.join(dest_path, "train.pkl")) 69 | dump_pickle((X_valid, y_valid), os.path.join(dest_path, "valid.pkl")) 70 | dump_pickle((X_test, y_test), os.path.join(dest_path, "test.pkl")) 71 | 72 | 73 | if __name__ == '__main__': 74 | 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument( 77 | "--raw_data_path", 78 | help="the location where the raw NYC taxi trip data was saved" 79 | ) 80 | parser.add_argument( 81 | "--dest_path", 82 | help="the location where the resulting files will be saved." 
83 | ) 84 | args = parser.parse_args() 85 | 86 | run(args.raw_data_path, args.dest_path) 87 | -------------------------------------------------------------------------------- /06-best-practices/homework_solution/batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import os 5 | import sys 6 | import pickle 7 | import pandas as pd 8 | 9 | 10 | def prepare_data(df, categorical): 11 | df['duration'] = df.dropOff_datetime - df.pickup_datetime 12 | df['duration'] = df.duration.dt.total_seconds() / 60 13 | 14 | df = df[(df.duration >= 1) & (df.duration <= 60)].copy() 15 | 16 | df[categorical] = df[categorical].fillna(-1).astype('int').astype('str') 17 | return df 18 | 19 | 20 | def read_data(filename, categorical): 21 | S3_ENDPOINT_URL = os.getenv('S3_ENDPOINT_URL') 22 | 23 | if S3_ENDPOINT_URL is not None: 24 | options = { 25 | 'client_kwargs': { 26 | 'endpoint_url': S3_ENDPOINT_URL 27 | } 28 | } 29 | 30 | df = pd.read_parquet(filename, storage_options=options) 31 | else: 32 | df = pd.read_parquet(filename) 33 | 34 | return prepare_data(df, categorical) 35 | 36 | 37 | def write_date(filename, df): 38 | S3_ENDPOINT_URL = os.getenv('S3_ENDPOINT_URL') 39 | 40 | if S3_ENDPOINT_URL is not None: 41 | options = { 42 | 'client_kwargs': { 43 | 'endpoint_url': S3_ENDPOINT_URL 44 | } 45 | } 46 | 47 | df.to_parquet(filename, engine='pyarrow', index=False, storage_options=options) 48 | else: 49 | df.to_parquet(filename, engine='pyarrow', index=False) 50 | 51 | 52 | def get_input_path(year, month): 53 | default_input_pattern = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/nyc-tlc/fhv/fhv_tripdata_{year:04d}-{month:02d}.parquet' 54 | input_pattern = os.getenv('INPUT_FILE_PATTERN', default_input_pattern) 55 | return input_pattern.format(year=year, month=month) 56 | 57 | 58 | def get_output_path(year, month): 59 | default_output_pattern = 's3://nyc-duration-prediction-alexey/taxi_type=fhv/year={year:04d}/month={month:02d}/predictions.parquet' 60 | output_pattern = os.getenv('OUTPUT_FILE_PATTERN', default_output_pattern) 61 | return output_pattern.format(year=year, month=month) 62 | 63 | 64 | def main(year, month): 65 | input_file = get_input_path(year, month) 66 | output_file = get_output_path(year, month) 67 | 68 | with open('model.bin', 'rb') as f_in: 69 | dv, lr = pickle.load(f_in) 70 | 71 | categorical = ['PUlocationID', 'DOlocationID'] 72 | 73 | df = read_data(input_file, categorical) 74 | df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str') 75 | 76 | dicts = df[categorical].to_dict(orient='records') 77 | X_val = dv.transform(dicts) 78 | y_pred = lr.predict(X_val) 79 | 80 | print('predicted mean duration:', y_pred.mean()) 81 | 82 | df_result = pd.DataFrame() 83 | df_result['ride_id'] = df['ride_id'] 84 | df_result['predicted_duration'] = y_pred 85 | 86 | write_date(output_file, df_result) 87 | 88 | 89 | 90 | if __name__ == '__main__': 91 | year = int(sys.argv[1]) 92 | month = int(sys.argv[2]) 93 | main(year, month) -------------------------------------------------------------------------------- /01-intro/homework.md: -------------------------------------------------------------------------------- 1 | ## 1.6 Homework 2 | 3 | The goal of this homework is to train a simple model for predicting the duration of a ride - similar to what we did in this module. 4 | 5 | 6 | ## Q1. 
Downloading the data 7 | 8 | We'll use [the same NYC taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page), 9 | but instead of "Green Taxi Trip Records", we'll use "For-Hire Vehicle Trip Records". 10 | 11 | Download the data for January and February 2021. 12 | 13 | Note that you need "For-Hire Vehicle Trip Records", not "High Volume For-Hire Vehicle Trip Records". 14 | 15 | Read the data for January. How many records are there? 16 | 17 | * 1054112 18 | * 1154112 19 | * 1254112 20 | * 1354112 21 | 22 | 23 | ## Q2. Computing duration 24 | 25 | Now let's compute the `duration` variable. It should contain the duration of a ride in minutes. 26 | 27 | What's the average trip duration in January? 28 | 29 | * 15.16 30 | * 19.16 31 | * 24.16 32 | * 29.16 33 | 34 | ## Data preparation 35 | 36 | Check the distribution of the duration variable. There are some outliers. 37 | 38 | Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive). 39 | 40 | How many records did you drop? 41 | 42 | ## Q3. Missing values 43 | 44 | The features we'll use for our model are the pickup and dropoff location IDs. 45 | 46 | But they have a lot of missing values there. Let's replace them with "-1". 47 | 48 | What's the fractions of missing values for the pickup location ID? I.e. fraction of "-1"s after you filled the NAs. 49 | 50 | * 53% 51 | * 63% 52 | * 73% 53 | * 83% 54 | 55 | ## Q4. One-hot encoding 56 | 57 | Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model. 58 | 59 | * Turn the dataframe into a list of dictionaries 60 | * Fit a dictionary vectorizer 61 | * Get a feature matrix from it 62 | 63 | What's the dimensionality of this matrix? (The number of columns). 64 | 65 | * 2 66 | * 152 67 | * 352 68 | * 525 69 | * 725 70 | 71 | ## Q5. Training a model 72 | 73 | Now let's use the feature matrix from the previous step to train a model. 74 | 75 | * Train a plain linear regression model with default parameters 76 | * Calculate the RMSE of the model on the training data 77 | 78 | What's the RMSE on train? 79 | 80 | * 5.52 81 | * 10.52 82 | * 15.52 83 | * 20.52 84 | 85 | 86 | ## Q6. Evaluating the model 87 | 88 | Now let's apply this model to the validation dataset (Feb 2021). 89 | 90 | What's the RMSE on validation? 91 | 92 | * 6.01 93 | * 11.01 94 | * 16.01 95 | * 21.01 96 | 97 | ## Submit the results 98 | 99 | Submit your results here: https://forms.gle/V8q5rv7QRoZ13Sft6 100 | 101 | It's possible that your answers won't match exactly. If it's the case, select the closest one. 102 | 103 | 104 | ## Deadline 105 | 106 | The deadline for submitting is 24 May 2022 (Tuesday) 23:00 CET. After that, the form will be closed. 
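Before jumping to the solution, here is a minimal sketch of the pipeline that Q1–Q5 walk through, assuming the January FHV file has been downloaded locally (the path and column names below follow the FHV parquet schema used elsewhere in this repo); the actual numbers are left for you to compute:

```python
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Q1/Q2: read the January data and compute the trip duration in minutes
df = pd.read_parquet('./data/fhv_tripdata_2021-01.parquet')  # assumed local path
df['duration'] = (df.dropOff_datetime - df.pickup_datetime).dt.total_seconds() / 60

# Data preparation: keep only rides between 1 and 60 minutes (inclusive)
df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

# Q3: fill the missing pickup/dropoff location IDs with -1
categorical = ['PUlocationID', 'DOlocationID']
df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')

# Q4: one-hot encode the two location IDs with a DictVectorizer
dv = DictVectorizer()
X_train = dv.fit_transform(df[categorical].to_dict(orient='records'))
y_train = df.duration.values

# Q5: train a plain linear regression and compute the RMSE on train
lr = LinearRegression()
lr.fit(X_train, y_train)
print(mean_squared_error(y_train, lr.predict(X_train), squared=False))
```

For Q6, repeat the preparation on the February file and use `dv.transform` (not `fit_transform`) before predicting.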
107 | 108 | 109 | ## Solution 110 | 111 | * [Video](https://www.youtube.com/watch?v=feH1PMLyu-Q&list=PL3MmuxUbc_hIUISrluw_A7wDSmfOhErJK&index=9) 112 | * [Notebook](homework.ipynb) 113 | -------------------------------------------------------------------------------- /02-experiment-tracking/homework/register_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pickle 4 | 5 | import mlflow 6 | from hyperopt import hp, space_eval 7 | from hyperopt.pyll import scope 8 | from mlflow.entities import ViewType 9 | from mlflow.tracking import MlflowClient 10 | from sklearn.ensemble import RandomForestRegressor 11 | from sklearn.metrics import mean_squared_error 12 | 13 | HPO_EXPERIMENT_NAME = "random-forest-hyperopt" 14 | EXPERIMENT_NAME = "random-forest-best-models" 15 | 16 | mlflow.set_tracking_uri("http://127.0.0.1:5000") 17 | mlflow.set_experiment(EXPERIMENT_NAME) 18 | mlflow.sklearn.autolog() 19 | 20 | SPACE = { 21 | 'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)), 22 | 'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)), 23 | 'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)), 24 | 'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)), 25 | 'random_state': 42 26 | } 27 | 28 | 29 | def load_pickle(filename): 30 | with open(filename, "rb") as f_in: 31 | return pickle.load(f_in) 32 | 33 | 34 | def train_and_log_model(data_path, params): 35 | X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl")) 36 | X_valid, y_valid = load_pickle(os.path.join(data_path, "valid.pkl")) 37 | X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl")) 38 | 39 | with mlflow.start_run(): 40 | params = space_eval(SPACE, params) 41 | rf = RandomForestRegressor(**params) 42 | rf.fit(X_train, y_train) 43 | 44 | # evaluate model on the validation and test sets 45 | valid_rmse = mean_squared_error(y_valid, rf.predict(X_valid), squared=False) 46 | mlflow.log_metric("valid_rmse", valid_rmse) 47 | test_rmse = mean_squared_error(y_test, rf.predict(X_test), squared=False) 48 | mlflow.log_metric("test_rmse", test_rmse) 49 | 50 | 51 | def run(data_path, log_top): 52 | 53 | client = MlflowClient() 54 | 55 | # retrieve the top_n model runs and log the models to MLflow 56 | experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME) 57 | runs = client.search_runs( 58 | experiment_ids=experiment.experiment_id, 59 | run_view_type=ViewType.ACTIVE_ONLY, 60 | max_results=log_top, 61 | order_by=["metrics.rmse ASC"] 62 | ) 63 | for run in runs: 64 | train_and_log_model(data_path=data_path, params=run.data.params) 65 | 66 | # select the model with the lowest test RMSE 67 | experiment = client.get_experiment_by_name(EXPERIMENT_NAME) 68 | # best_run = client.search_runs( ... )[0] 69 | 70 | # register the best model 71 | # mlflow.register_model( ... ) 72 | 73 | 74 | if __name__ == '__main__': 75 | 76 | parser = argparse.ArgumentParser() 77 | parser.add_argument( 78 | "--data_path", 79 | default="./output", 80 | help="the location where the processed NYC taxi trip data was saved." 81 | ) 82 | parser.add_argument( 83 | "--top_n", 84 | default=5, 85 | type=int, 86 | help="the top 'top_n' models will be evaluated to decide which model to promote." 
87 | ) 88 | args = parser.parse_args() 89 | 90 | run(args.data_path, args.top_n) 91 | -------------------------------------------------------------------------------- /04-deployment/README.md: -------------------------------------------------------------------------------- 1 | # 4. Model Deployment 2 | 3 | ## 4.1 Three ways of deploying a model 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | ## 4.2 Web-services: Deploying models with Flask and Docker 12 | 13 | 14 | 15 | 16 | 17 | 18 | [See code here](web-service/) 19 | 20 | 21 | ## 4.3 Web-services: Getting the models from the model registry (MLflow) 22 | 23 | 24 | 25 | 26 | 27 | 28 | [See code here](web-service-mlflow/) 29 | 30 | 31 | ## 4.4 (Optional) Streaming: Deploying models with Kinesis and Lambda 32 | 33 | 34 | 35 | 36 | 37 | 38 | [See code here](streaming/) 39 | 40 | 41 | ## 4.5 Batch: Preparing a scoring script 42 | 43 | 44 | 45 | 46 | 47 | 48 | [See code here](batch/) 49 | 50 | 51 | ## 4.6 MLOps Zoomcamp 4.6 - Batch: Scheduling batch scoring jobs with Prefect 52 | 53 | **Note:** There are several changes to deployment in Prefect 2.3.1 since 2.0b8: 54 | - `DeploymentSpec` in 2.0b8 now becomes `Deployment`. 55 | - `work_queue_name` is used instead of `tags` to submit the deployment to the a specific work queue. 56 | - You don't need to create a work queue before using the work queue. A work queue will be created if it doesn't exist. 57 | - `flow_location` is replaced with `flow` 58 | - `flow_runner` and `flow_storage` are no longer supported 59 | 60 | ```python 61 | from prefect.deployments import Deployment 62 | from prefect.orion.schemas.schedules import CronSchedule 63 | from score import ride_duration_prediction 64 | 65 | deployment = Deployment.build_from_flow( 66 | flow=ride_duration_prediction, 67 | name="ride_duration_prediction", 68 | parameters={ 69 | "taxi_type": "green", 70 | "run_id": "e1efc53e9bd149078b0c12aeaa6365df", 71 | }, 72 | schedule=CronSchedule(cron="0 3 2 * *"), 73 | work_queue_name="ml", 74 | ) 75 | 76 | deployment.apply() 77 | ``` 78 | 79 | 80 | 81 | 82 | 83 | ## 4.7 Choosing the right way of deployment 84 | 85 | COMING SOON 86 | 87 | 88 | ## 4.8 Homework 89 | 90 | More information here: [homework.md](homework.md) 91 | 92 | 93 | ## Notes 94 | 95 | Did you take notes? Add them here: 96 | 97 | * [Notes on model deployment (+ creating a modeling package) by Ron M.](https://particle1331.github.io/inefficient-networks/notebooks/mlops/04-deployment/notes.html) 98 | * [Notes on Model Deployment using Google Cloud Platform, by M. 
Ayoub C.](https://gist.github.com/Qfl3x/de2a9b98a370749a4b17a4c94ef46185) 99 | * [Week4: Notes on Model Deployment by Bhagabat](https://github.com/BPrasad123/MLOps_Zoomcamp/tree/main/Week4) 100 | * [Week 4: Deployment notes by Ayoub.B](https://github.com/ayoub-berdeddouch/mlops-journey/blob/main/deployment-04.md) 101 | * Send a PR, add your notes above this line 102 | -------------------------------------------------------------------------------- /05-monitoring/prefect_example.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pickle 4 | 5 | import pandas 6 | from prefect import flow, task 7 | from pymongo import MongoClient 8 | import pyarrow.parquet as pq 9 | 10 | from evidently import ColumnMapping 11 | 12 | from evidently.dashboard import Dashboard 13 | from evidently.dashboard.tabs import DataDriftTab,RegressionPerformanceTab 14 | 15 | from evidently.model_profile import Profile 16 | from evidently.model_profile.sections import DataDriftProfileSection, RegressionPerformanceProfileSection 17 | 18 | 19 | @task 20 | def upload_target(filename): 21 | client = MongoClient("mongodb://localhost:27018/") 22 | collection = client.get_database("prediction_service").get_collection("data") 23 | with open(filename) as f_target: 24 | for line in f_target.readlines(): 25 | row = line.split(",") 26 | collection.update_one({"id": row[0]}, {"$set": {"target": float(row[1])}}) 27 | client.close() 28 | 29 | 30 | @task 31 | def load_reference_data(filename): 32 | MODEL_FILE = os.getenv('MODEL_FILE', './prediction_service/lin_reg.bin') 33 | with open(MODEL_FILE, 'rb') as f_in: 34 | dv, model = pickle.load(f_in) 35 | reference_data = pq.read_table(filename).to_pandas() 36 | # Create features 37 | reference_data['PU_DO'] = reference_data['PULocationID'].astype(str) + "_" + reference_data['DOLocationID'].astype(str) 38 | 39 | # add target column 40 | reference_data['target'] = reference_data.lpep_dropoff_datetime - reference_data.lpep_pickup_datetime 41 | reference_data.target = reference_data.target.apply(lambda td: td.total_seconds() / 60) 42 | reference_data = reference_data[(reference_data.target >= 1) & (reference_data.target <= 60)] 43 | features = ['PU_DO', 'PULocationID', 'DOLocationID', 'trip_distance'] 44 | x_pred = dv.transform(reference_data[features].to_dict(orient='records')) 45 | reference_data['prediction'] = model.predict(x_pred) 46 | return reference_data 47 | 48 | 49 | @task 50 | def fetch_data(): 51 | client = MongoClient("mongodb://localhost:27018/") 52 | data = client.get_database("prediction_service").get_collection("data").find() 53 | df = pandas.DataFrame(list(data)) 54 | return df 55 | 56 | 57 | @task 58 | def run_evidently(ref_data, data): 59 | ref_data.drop('ehail_fee', axis=1, inplace=True) 60 | data.drop('ehail_fee', axis=1, inplace=True) # drop empty column (until Evidently will work with it properly) 61 | profile = Profile(sections=[DataDriftProfileSection(), RegressionPerformanceProfileSection()]) 62 | mapping = ColumnMapping(prediction="prediction", numerical_features=['trip_distance'], 63 | categorical_features=['PULocationID', 'DOLocationID'], 64 | datetime_features=[]) 65 | profile.calculate(ref_data, data, mapping) 66 | 67 | dashboard = Dashboard(tabs=[DataDriftTab(), RegressionPerformanceTab(verbose_level=0)]) 68 | dashboard.calculate(ref_data, data, mapping) 69 | return json.loads(profile.json()), dashboard 70 | 71 | 72 | @task 73 | def save_report(result): 74 | client = 
MongoClient("mongodb://localhost:27018/") 75 | client.get_database("prediction_service").get_collection("report").insert_one(result[0]) 76 | 77 | 78 | @task 79 | def save_html_report(result): 80 | result[1].save("evidently_report_example.html") 81 | 82 | 83 | @flow 84 | def batch_analyze(): 85 | upload_target("target.csv") 86 | ref_data = load_reference_data("./evidently_service/datasets/green_tripdata_2021-01.parquet") 87 | data = fetch_data() 88 | result = run_evidently(ref_data, data) 89 | save_report(result) 90 | save_html_report(result) 91 | 92 | batch_analyze() 93 | -------------------------------------------------------------------------------- /02-experiment-tracking/README.md: -------------------------------------------------------------------------------- 1 | # 2. Experiment tracking and model management 2 | 3 | 4 | * [Slides](https://drive.google.com/file/d/1YtkAtOQS3wvY7yts_nosVlXrLQBq5q37/view?usp=sharing) 5 | 6 | 7 | ## 2.1 Experiment tracking intro 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | ## 2.2 Getting started with MLflow 16 | 17 | 18 | 19 | 20 | 21 | Note: in the videos, Cristian uses Jupyter in VS code and runs everything locally 22 | 23 | But if you set up a VM in the previous module, you can keep using it 24 | and use the usual Jupyter from your browser. There's no significant 25 | difference between using Jupyter with VS code and without 26 | 27 | 28 | ## 2.3 Experiment tracking with MLflow 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | ## 2.4 Model management 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | ## 2.5 Model registry 45 | 46 | 47 | 48 | 49 | 50 | 51 | ## 2.6 MLflow in practice 52 | 53 | 54 | 55 | 56 | 57 | 58 | ## 2.7 MLflow: benefits, limitations and alternatives 59 | 60 | 61 | 62 | 63 | 64 | 65 | ## 2.7 Homework 66 | 67 | More information here: [homework.md](homework.md) 68 | 69 | 70 | ## Notes 71 | 72 | Did you take notes? Add them here: 73 | 74 | * [Notes/General Docs on MLflow by Ayoub](https://gist.github.com/Qfl3x/ccff6b0708358c040e437d52af0c2e43) 75 | * [Minimalist MLflow code reference by Anna V](https://github.com/annnvv/mlops_zoomcamp/blob/main/notes/module2_notes_MLflow.md) 76 | * [Notes from second lesson by Neimv](https://gitlab.com/neimv/mlops/-/blob/main/lessons_weeks/notes_2.md) 77 | * [2nd Week Experiment & Tracking notes by Ayoub.B](https://github.com/ayoub-berdeddouch/mlops-journey/blob/main/experiment_tracking_02.md) 78 | * [Notes on Experiment Tracking with MLflow (Jupyter Book) by particle1331](https://particle1331.github.io/inefficient-networks/notebooks/mlops/2-mlflow/2-mlflow.html) 79 | * [Week 2: Experiment & Tracking Notes by Bengsoon Chuah](https://github.com/bengsoon/mlops-zoomcamp/blob/main/02-experiment-tracking/notes/Experiment_Tracking_notes.md) 80 | * [2.4 Model Management Notes by Alvaro Pena](https://github.com/alvarofps/mlops-zoomcamp/blob/main/02-experiment-tracking/my-notes/2.4%20Model%20management.md) 81 | * [Notes by Alvaro Navas](https://github.com/ziritrion/mlopszoomcamp/blob/main/notes/2_experiment.md) 82 | * [Notebook from froukje](https://github.com/froukje/ml-ops-zoomcamp/blob/master/02-experiment-tracking/week02.ipynb) and [notes](https://medium.com/@falbrechtg/getting-started-with-mlflow-tracking-46a0089d6a73) 83 | * [Blog post on setting up MLFlow on GCP by Isaac Kargar](https://kargarisaac.github.io/blog/mlops/data%20engineering/2022/06/15/MLFlow-on-GCP.html). 
84 | * [Week2: Experiment tracking notes and notebook by Bhagabat](https://github.com/BPrasad123/MLOps_Zoomcamp/tree/main/Week2) 85 | * [Notes of ML-flow by Jaime Cabrera-Salcedo](https://github.com/jaimeh94/MLOps-Zoomcamp/tree/main/02-experiment-tracking) 86 | * Send a PR, add your notes above this line 87 | -------------------------------------------------------------------------------- /05-monitoring/README.md: -------------------------------------------------------------------------------- 1 | # 5. Model Monitoring 2 | 3 | ## 5.1 Monitoring for ML-based services 4 | 5 | 6 | 7 | 8 | 9 | 10 | [Slides](https://drive.google.com/file/d/1wcMU75ZcNNJie4ELjsKPkITIL93wHykt/view?usp=sharing) 11 | 12 | 13 | ## 5.2 Setting up the environment 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | ## 5.3 Creating a prediction service and simulating traffic 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | ## 5.4 Realtime monitoring walkthrough (Prometheus, Evidently, Grafana) 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | ## 5.5 Batch monitoring walkthrough (Prefect, MongoDB, Evidently) 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | ## 5.6 Homework 46 | 47 | More information here: [homework.md](homework.md). Please also give us feedback for this week's videos using 48 | [this form](https://forms.gle/fb7dJKjyb1oeNeAz9) 49 | 50 | 51 | ## Notes 52 | 53 | Did you take notes? Add them here: 54 | 55 | * [Week 5 notes by M. Ayoub C.](https://gist.github.com/Qfl3x/aa6b1bec35fb645ded0371c46e8aafd1) 56 | * [week 5: Monitoring notes Ayoub.B](https://github.com/ayoub-berdeddouch/mlops-journey/blob/main/monitoring-05.md) 57 | * Send a PR, add your notes above this line 58 | 59 | 60 | 61 | # Monitoring example 62 | 63 | ## Prerequisites 64 | 65 | You need the following tools installed: 66 | - `docker` 67 | - `docker-compose` (included in Docker Desktop for Mac and Docker Desktop for Windows) 68 | 69 | ## Preparation 70 | 71 | Note: all actions are expected to be executed in the repo folder. 72 | 73 | - Create a virtual environment and activate it (e.g.
`python -m venv venv && source ./venv/bin/activate`) 74 | - Install required packages `pip install -r requirements.txt` 75 | - Run `python prepare.py` for downloading datasets 76 | 77 | ## Monitoring Example 78 | 79 | ### Starting services 80 | 81 | To start all required services, execute: 82 | ```bash 83 | docker-compose up 84 | ``` 85 | 86 | It will start following services: 87 | - `prometheus` - TSDB for metrics 88 | - `grafana` - Visual tool for metrics 89 | - `mongo` - MongoDB, for storing raw data, predictions, targets and profile reports 90 | - `evidently_service` - Evindently RT-monitoring service (draft example) 91 | - `prediction_service` - main service, which makes predictions 92 | 93 | ### Sending data 94 | 95 | To start sending data to service, execute: 96 | ```bash 97 | python send_data.py 98 | ``` 99 | 100 | This script will send every second single row from dataset to prediction service along with creating file `target.csv` with actual results (so it can be loaded after) 101 | 102 | ## Batch Monitoring Example 103 | 104 | After you stop sending data to service, you can run batch monitoring pipeline (using Prefect) by running script: 105 | 106 | ```bash 107 | python prefect_example.py 108 | ``` 109 | 110 | This script will: 111 | - load `target.csv` to MongoDB 112 | - download dataset from MongoDB 113 | - Run Evidently Model Profile and Evidently Report on this data 114 | - Save Profile data back to MongoDB 115 | - Save Report to `evidently_report_example.html` 116 | 117 | You can look at Prefect steps in Prefect Orion UI 118 | (to start it execute `prefect orion start`) 119 | -------------------------------------------------------------------------------- /03-orchestration/homework_solution.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from sklearn.feature_extraction import DictVectorizer 4 | from sklearn.linear_model import LinearRegression 5 | from sklearn.metrics import mean_squared_error 6 | 7 | from prefect import task, flow, get_run_logger 8 | from datetime import datetime 9 | import pickle 10 | 11 | @task 12 | def read_data(path): 13 | df = pd.read_parquet(path) 14 | return df 15 | 16 | @task 17 | def prepare_features(df, categorical, train=True): 18 | logger = get_run_logger() 19 | df['duration'] = df.dropOff_datetime - df.pickup_datetime 20 | df['duration'] = df.duration.dt.total_seconds() / 60 21 | df = df[(df.duration >= 1) & (df.duration <= 60)].copy() 22 | 23 | mean_duration = df.duration.mean() 24 | if train: 25 | logger.info(f"The mean duration of training is {mean_duration}") 26 | else: 27 | logger.info(f"The mean duration of validation is {mean_duration}") 28 | 29 | df[categorical] = df[categorical].fillna(-1).astype('int').astype('str') 30 | return df 31 | 32 | @task 33 | def train_model(df, categorical): 34 | logger = get_run_logger() 35 | train_dicts = df[categorical].to_dict(orient='records') 36 | dv = DictVectorizer() 37 | X_train = dv.fit_transform(train_dicts) 38 | y_train = df.duration.values 39 | 40 | logger.info(f"The shape of X_train is {X_train.shape}") 41 | logger.info(f"The DictVectorizer has {len(dv.feature_names_)} features") 42 | 43 | lr = LinearRegression() 44 | lr.fit(X_train, y_train) 45 | y_pred = lr.predict(X_train) 46 | mse = mean_squared_error(y_train, y_pred, squared=False) 47 | logger.info(f"The MSE of training is: {mse}") 48 | return lr, dv 49 | 50 | @task 51 | def run_model(df, categorical, dv, lr): 52 | logger = get_run_logger() 53 | val_dicts = 
df[categorical].to_dict(orient='records') 54 | X_val = dv.transform(val_dicts) 55 | y_pred = lr.predict(X_val) 56 | y_val = df.duration.values 57 | 58 | mse = mean_squared_error(y_val, y_pred, squared=False) 59 | logger.info(f"The MSE of validation is: {mse}") 60 | return 61 | 62 | @task 63 | def get_paths(date): 64 | from dateutil.relativedelta import relativedelta 65 | if date: 66 | processed_date = datetime.strptime(date, "%Y-%m-%d") 67 | else: 68 | processed_date = datetime.today() 69 | train_date = processed_date - relativedelta(months=2) 70 | val_date = processed_date - relativedelta(months=1) 71 | train_path = f"./data/fhv_tripdata_{train_date.year}-{str(train_date.month).zfill(2)}.parquet" 72 | val_path = f"./data/fhv_tripdata_{val_date.year}-{str(val_date.month).zfill(2)}.parquet" 73 | return train_path, val_path 74 | 75 | @flow 76 | def main(date=None): 77 | train_path, val_path = get_paths(date).result() 78 | 79 | categorical = ['PUlocationID', 'DOlocationID'] 80 | 81 | df_train = read_data(train_path) 82 | df_train_processed = prepare_features(df_train, categorical) 83 | 84 | df_val = read_data(val_path) 85 | df_val_processed = prepare_features(df_val, categorical, False) 86 | 87 | # train the model 88 | lr, dv = train_model(df_train_processed, categorical).result() 89 | run_model(df_val_processed, categorical, dv, lr) 90 | 91 | if date is None: 92 | date = datetime.today.strftime("%Y-%m-%d") 93 | with open(f'./models/dv-{date}.b', 'wb') as f_out: 94 | pickle.dump(dv, f_out) 95 | 96 | # main("2021-08-15") 97 | 98 | from prefect.deployments import Deployment 99 | from prefect.orion.schemas.schedules import CronSchedule 100 | from prefect.flow_runners import SubprocessFlowRunner 101 | 102 | Deployment( 103 | flow=main, 104 | name="model_training", 105 | schedule=CronSchedule(cron="0 9 15 * *"), 106 | flow_runner=SubprocessFlowRunner(), 107 | ) 108 | -------------------------------------------------------------------------------- /06-best-practices/code/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import base64 4 | 5 | import boto3 6 | import mlflow 7 | 8 | 9 | def get_model_location(run_id): 10 | model_location = os.getenv('MODEL_LOCATION') 11 | 12 | if model_location is not None: 13 | return model_location 14 | 15 | model_bucket = os.getenv('MODEL_BUCKET', 'mlflow-models-alexey') 16 | experiment_id = os.getenv('MLFLOW_EXPERIMENT_ID', '1') 17 | 18 | model_location = f's3://{model_bucket}/{experiment_id}/{run_id}/artifacts/model' 19 | return model_location 20 | 21 | 22 | def load_model(run_id): 23 | model_path = get_model_location(run_id) 24 | model = mlflow.pyfunc.load_model(model_path) 25 | return model 26 | 27 | 28 | def base64_decode(encoded_data): 29 | decoded_data = base64.b64decode(encoded_data).decode('utf-8') 30 | ride_event = json.loads(decoded_data) 31 | return ride_event 32 | 33 | 34 | class ModelService: 35 | def __init__(self, model, model_version=None, callbacks=None): 36 | self.model = model 37 | self.model_version = model_version 38 | self.callbacks = callbacks or [] 39 | 40 | def prepare_features(self, ride): 41 | features = {} 42 | features['PU_DO'] = f"{ride['PULocationID']}_{ride['DOLocationID']}" 43 | features['trip_distance'] = ride['trip_distance'] 44 | return features 45 | 46 | def predict(self, features): 47 | pred = self.model.predict(features) 48 | return float(pred[0]) 49 | 50 | def lambda_handler(self, event): 51 | # print(json.dumps(event)) 52 | 53 | predictions_events = 
[] 54 | 55 | for record in event['Records']: 56 | encoded_data = record['kinesis']['data'] 57 | ride_event = base64_decode(encoded_data) 58 | 59 | # print(ride_event) 60 | ride = ride_event['ride'] 61 | ride_id = ride_event['ride_id'] 62 | 63 | features = self.prepare_features(ride) 64 | prediction = self.predict(features) 65 | 66 | prediction_event = { 67 | 'model': 'ride_duration_prediction_model', 68 | 'version': self.model_version, 69 | 'prediction': {'ride_duration': prediction, 'ride_id': ride_id}, 70 | } 71 | 72 | for callback in self.callbacks: 73 | callback(prediction_event) 74 | 75 | predictions_events.append(prediction_event) 76 | 77 | return {'predictions': predictions_events} 78 | 79 | 80 | class KinesisCallback: 81 | def __init__(self, kinesis_client, prediction_stream_name): 82 | self.kinesis_client = kinesis_client 83 | self.prediction_stream_name = prediction_stream_name 84 | 85 | def put_record(self, prediction_event): 86 | ride_id = prediction_event['prediction']['ride_id'] 87 | 88 | self.kinesis_client.put_record( 89 | StreamName=self.prediction_stream_name, 90 | Data=json.dumps(prediction_event), 91 | PartitionKey=str(ride_id), 92 | ) 93 | 94 | 95 | def create_kinesis_client(): 96 | endpoint_url = os.getenv('KINESIS_ENDPOINT_URL') 97 | 98 | if endpoint_url is None: 99 | return boto3.client('kinesis') 100 | 101 | return boto3.client('kinesis', endpoint_url=endpoint_url) 102 | 103 | 104 | def init(prediction_stream_name: str, run_id: str, test_run: bool): 105 | model = load_model(run_id) 106 | 107 | callbacks = [] 108 | 109 | if not test_run: 110 | kinesis_client = create_kinesis_client() 111 | kinesis_callback = KinesisCallback(kinesis_client, prediction_stream_name) 112 | callbacks.append(kinesis_callback.put_record) 113 | 114 | model_service = ModelService(model=model, model_version=run_id, callbacks=callbacks) 115 | 116 | return model_service 117 | -------------------------------------------------------------------------------- /03-orchestration/README.md: -------------------------------------------------------------------------------- 1 | # 3. Orchestration and ML Pipelines 2 | 3 | **Note:** [`orchestration.py`](orchestration.py) is a ready final version. The rest of the files were worked on together during the video tutorials. 4 | 5 | **Note** With Prefect version [`2.2.1`](https://github.com/PrefectHQ/prefect/blob/orion/RELEASE-NOTES.md#20b8) or later `DeploymentSpec`'s are now just `Deployment`'s. 6 | 7 | ## 3.1 Negative engineering and workflow orchestration 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | ## 3.2 Introduction to Prefect 2.0 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | ## 3.3 First Prefect flow and basics 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | ## 3.4 Remote Prefect Orion deployment 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | ## 3.5 Deployment of Prefect flow 40 | 41 | **Note:** There are several changes to deployment in Prefect 2.3.1 since 2.0b8: 42 | - `DeploymentSpec` in 2.0b8 now becomes `Deployment`. 43 | - `work_queue_name` is used instead of `tags` to submit the deployment to the a specific work queue. 44 | - You don't need to create a work queue before using the work queue. A work queue will be created if it doesn't exist. 
45 | 46 | ```python 47 | from prefect.deployments import Deployment 48 | from prefect.orion.schemas.schedules import IntervalSchedule 49 | from datetime import timedelta 50 | 51 | deployment = Deployment.build_from_flow( 52 | flow=main, 53 | name="model_training", 54 | schedule=IntervalSchedule(interval=timedelta(minutes=5)), 55 | work_queue_name="ml" 56 | ) 57 | 58 | deployment.apply() 59 | ``` 60 | 61 | 62 | 63 | 64 | 65 | Links: 66 | 67 | * [Instructions for Hosting Prefect Orion](https://discourse.prefect.io/t/hosting-an-orion-instance-on-a-cloud-vm/967) 68 | 69 | 70 | ## 3.6 MLOps Zoomcamp 3.6 - (Optional) Work queues and agents 71 | 72 | 73 | 74 | 75 | 76 | 77 | ## 3.7 Homework 78 | 79 | More information here: [homework.md](homework.md) 80 | 81 | 82 | ## Notes 83 | 84 | Did you take notes? Add them here: 85 | 86 | * [Week 3, Prefect Introduction and S3 Bucket configuration with Prefect by M. Ayoub C.](https://gist.github.com/Qfl3x/8dd69b8173f027b9468016c118f3b6a5) 87 | * [Notes from froukje](https://github.com/froukje/ml-ops-zoomcamp/blob/master/03-orchestration/week03_orchestration.ipynb) 88 | * [Minimalist code notes from Anna V](https://github.com/annnvv/mlops_zoomcamp/blob/main/notes/module3_notes_prefect.md) 89 | * [Getting Started on Prefect 2.0 + Deploying worfklows for MLflow Staging by Ron Medina (Jupyter Book)](https://particle1331.github.io/inefficient-networks/notebooks/mlops/3-prefect/3-prefect.html) 90 | * [Quickstart your homework by Zioalex](https://github.com/zioalex/mlops-zoomcamp/blob/week3/03-orchestration/homework_quickstart.md) 91 | * [Notes from Maxime M](https://github.com/maxmarkov/mlops-zoomcamp/blob/master/lecture-notes/WEEK-3/03-orchestration.md) 92 | * [Week3: Prefect introduction and homework notes by Bhagabat](https://github.com/BPrasad123/MLOps_Zoomcamp/tree/main/Week3) 93 | * [Week 3: Orchestration notes by Ayoub.B](https://github.com/ayoub-berdeddouch/mlops-journey/blob/main/orchestration-03.md) 94 | * Send a PR, add your notes above this line 95 | -------------------------------------------------------------------------------- /05-monitoring/homework/prefect-monitoring/prefect_monitoring.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pickle 4 | 5 | import pandas 6 | import pyarrow.parquet as pq 7 | from evidently import ColumnMapping 8 | from evidently.dashboard import Dashboard 9 | from evidently.dashboard.tabs import DataDriftTab, RegressionPerformanceTab 10 | from evidently.model_profile import Profile 11 | from evidently.model_profile.sections import ( 12 | DataDriftProfileSection, RegressionPerformanceProfileSection) 13 | from prefect import flow, task 14 | from pymongo import MongoClient 15 | 16 | MONGO_CLIENT_ADDRESS = "mongodb://localhost:27017/" 17 | MONGO_DATABASE = "prediction_service" 18 | PREDICTION_COLLECTION = "data" 19 | REPORT_COLLECTION = "report" 20 | REFERENCE_DATA_FILE = "../datasets/green_tripdata_2021-03.parquet" # Modify this for Q7 21 | TARGET_DATA_FILE = "target.csv" 22 | MODEL_FILE = os.getenv('MODEL_FILE', '../prediction_service/lin_reg.bin') # Modify this for Q7 23 | 24 | @task 25 | def upload_target(filename): 26 | client = MongoClient(MONGO_CLIENT_ADDRESS) 27 | collection = client.get_database(MONGO_DATABASE).get_collection(PREDICTION_COLLECTION) 28 | with open(filename) as f_target: 29 | for line in f_target.readlines(): 30 | row = line.split(",") 31 | collection.update_one({"id": row[0]}, 32 | {"$set": {"target": float(row[1])}} 33 | ) 34 
| 35 | 36 | 37 | @task 38 | def load_reference_data(filename): 39 | 40 | with open(MODEL_FILE, 'rb') as f_in: 41 | dv, model = pickle.load(f_in) 42 | reference_data = pq.read_table(filename).to_pandas().sample(n=5000,random_state=42) #Monitoring for 1st 5000 records 43 | # Create features 44 | reference_data['PU_DO'] = reference_data['PULocationID'].astype(str) + "_" + reference_data['DOLocationID'].astype(str) 45 | 46 | # add target column 47 | reference_data['target'] = reference_data.lpep_dropoff_datetime - reference_data.lpep_pickup_datetime 48 | reference_data.target = reference_data.target.apply(lambda td: td.total_seconds() / 60) 49 | reference_data = reference_data[(reference_data.target >= 1) & (reference_data.target <= 60)] 50 | features = ['PU_DO', 'PULocationID', 'DOLocationID', 'trip_distance'] 51 | x_pred = dv.transform(reference_data[features].to_dict(orient='records')) 52 | reference_data['prediction'] = model.predict(x_pred) 53 | return reference_data 54 | 55 | 56 | @task 57 | def fetch_data(): 58 | client = MongoClient(MONGO_CLIENT_ADDRESS) 59 | data = client.get_database(MONGO_DATABASE).get_collection(PREDICTION_COLLECTION).find() 60 | df = pandas.DataFrame(list(data)) 61 | return df 62 | 63 | @task 64 | def run_evidently(ref_data, data): 65 | 66 | ref_data.drop(['ehail_fee'], axis=1, inplace=True) 67 | data.drop('ehail_fee', axis=1, inplace=True) # drop empty column (until Evidently will work with it properly) 68 | 69 | profile = Profile(sections=[DataDriftProfileSection(), RegressionPerformanceProfileSection()]) 70 | mapping = ColumnMapping(prediction="prediction", numerical_features=['trip_distance'], 71 | categorical_features=['PULocationID', 'DOLocationID'], 72 | datetime_features=[]) 73 | profile.calculate(ref_data, data, mapping) 74 | 75 | dashboard = Dashboard(tabs=[DataDriftTab(), RegressionPerformanceTab(verbose_level=0)]) 76 | dashboard.calculate(ref_data, data, mapping) 77 | return json.loads(profile.json()), dashboard 78 | 79 | 80 | @task 81 | def save_report(result): 82 | pass 83 | 84 | @task 85 | def save_html_report(result): 86 | pass 87 | 88 | 89 | @flow 90 | def batch_analyze(): 91 | upload_target(TARGET_DATA_FILE) 92 | ref_data = load_reference_data(REFERENCE_DATA_FILE).result() 93 | data = fetch_data().result() 94 | profile, dashboard = run_evidently(ref_data, data).result() 95 | save_report(profile) 96 | save_html_report(dashboard) 97 | 98 | batch_analyze() 99 | -------------------------------------------------------------------------------- /06-best-practices/docs.md: -------------------------------------------------------------------------------- 1 | ## Extra Material 2 | 3 | ### Concepts of IaC and Terraform 4 | 5 | #### Summary 6 | 7 | **Infrastructure-as-Code (IaC)**: 8 | * Define and automate operations around you application's infrastructure. 9 | * Can use version control to track changes made to infrastructure 10 | * Easy to replicate the configuration across different environments such as development, staging, and production. 11 | 12 | 13 | #### Reference Material 14 | 15 | We have already covered Terraform concepts at a deeper level in the [Data Engineering Zoomcamp](https://github.com/DataTalksClub/data-engineering-zoomcamp), and will not be repeating some of those basic concepts again. 
You can find the content here for your reference: 16 | 17 | **Notes**: 18 | * [Terraform Overview](https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/week_1_basics_n_setup/1_terraform_gcp/1_terraform_overview.md) 19 | 20 | **Videos**: 21 | 22 | 1. For an introduction to Terraform and IaC concepts, please refer to [this video](https://www.youtube.com/watch?v=Hajwnmj0xfQ&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=11) 23 | (from the DE Zoomcamp), especially the sections in the time-codes: 24 | 25 | * 00:00 Introduction 26 | * 00:35 What is Terraform? 27 | * 01:10 What is IaC? 28 | * 01:43 Advantages of IaC 29 | * 14:48 Installing Terraform 30 | * 02:28 More on Installing Terraform 31 | 32 | 2. For a quickstart tutorial, and understanding the main components of a basic Terraform script, please refer to [this video](https://www.youtube.com/watch?v=dNkEgO-CExg&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=12) 33 | (from the DE Zoomcamp). Please note that this example uses GCP as a cloud provider, while for MLOps Zoomcamp we are using AWS. 34 | 35 | * 00:00 Introduction 36 | * 00:20 .terraform-version 37 | * 01:04 main.tf 38 | * 01:23 terraform declaration 39 | * 03:25 provider plugins 40 | * 04:00 resource example - google_storage_bucket 41 | * 05:42 provider credentials 42 | * 06:34 variables.tf 43 | * 10:54 overview of terraform commands 44 | * 13:35 running terraform commands 45 | * 18:08 recap 46 | 47 | In case you're using GCP instead of AWS, following is some setup material: 48 | * [Local Setup for Terraform and GCP](https://github.com/DataTalksClub/data-engineering-zoomcamp/tree/main/week_1_basics_n_setup/1_terraform_gcp) 49 | * [GCP Overview](https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/week_1_basics_n_setup/1_terraform_gcp/2_gcp_overview.md) 50 | * [main.tf](https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/week_1_basics_n_setup/1_terraform_gcp/terraform/main.tf) 51 | 52 | #### References 53 | * Terraform with AWS: [Getting Started](https://learn.hashicorp.com/collections/terraform/aws-get-started) and [AWS provider library](https://registry.terraform.io/providers/hashicorp/aws/latest/docs) 54 | * Terraform Modules: [Define](https://www.terraform.io/language/modules/develop) and [Call](https://www.terraform.io/language/modules/syntax) 55 | 56 | 57 | ### Concepts of CI/CD and GitHub Actions 58 | 59 | #### Summary 60 | * Using GitHub Actions to create workflows to automatically test a pull request, 61 | build and push a Docker image, and deploy the updated lambda service to production. 62 | * Creating specific YAML files in GitHub repo, to automatically kick off a series of automation steps. 63 | * Motivation on automating your further tasks with GitHub Actions: 64 | * Orchestrating a continuous training pipeline (CT) to retrain your model and generate updated model artifacts in production 65 | * Integrating the model registry (MLflow, DVC etc.) to fetch the latest model version or experiment ID 66 | * and many more... 
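As a concrete illustration of the model-registry point above, the sketch below shows how a CI/CD step could look up the latest registered model version before deploying. It is only a sketch: it assumes an MLflow tracking server reachable through `MLFLOW_TRACKING_URI` in the CI environment, and `"ride-duration-model"` is a hypothetical registered model name.

```python
from mlflow.tracking import MlflowClient

# assumes MLFLOW_TRACKING_URI is exported in the CI environment;
# "ride-duration-model" is a hypothetical registered model name
client = MlflowClient()

for version in client.get_latest_versions("ride-duration-model", stages=["Production"]):
    # the run id / source URI can then be handed to the build-push and deploy steps
    print(version.version, version.run_id, version.source)
```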
67 | 68 | 69 | #### Reference Material 70 | * [GitHub Actions & Workflows](https://docs.github.com/en/actions/using-workflows) 71 | * [Build-Push image to ECR](https://docs.github.com/en/actions/deployment/deploying-to-your-cloud-provider/deploying-to-amazon-elastic-container-service) 72 | * [Python tests](https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python) 73 | -------------------------------------------------------------------------------- /02-experiment-tracking/running-mlflow-examples/scenario-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Scenario 1: A single data scientist participating in an ML competition\n", 8 | "\n", 9 | "MLflow setup:\n", 10 | "* Tracking server: no\n", 11 | "* Backend store: local filesystem\n", 12 | "* Artifacts store: local filesystem\n", 13 | "\n", 14 | "The experiments can be explored locally by launching the MLflow UI." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import mlflow" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "print(f\"tracking URI: '{mlflow.get_tracking_uri()}'\")" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "mlflow.list_experiments()" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "### Creating an experiment and logging a new run" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "from sklearn.linear_model import LogisticRegression\n", 58 | "from sklearn.datasets import load_iris\n", 59 | "from sklearn.metrics import accuracy_score\n", 60 | "\n", 61 | "mlflow.set_experiment(\"my-experiment-1\")\n", 62 | "\n", 63 | "with mlflow.start_run():\n", 64 | "\n", 65 | " X, y = load_iris(return_X_y=True)\n", 66 | "\n", 67 | " params = {\"C\": 0.1, \"random_state\": 42}\n", 68 | " mlflow.log_params(params)\n", 69 | "\n", 70 | " lr = LogisticRegression(**params).fit(X, y)\n", 71 | " y_pred = lr.predict(X)\n", 72 | " mlflow.log_metric(\"accuracy\", accuracy_score(y, y_pred))\n", 73 | "\n", 74 | " mlflow.sklearn.log_model(lr, artifact_path=\"models\")\n", 75 | " print(f\"default artifacts URI: '{mlflow.get_artifact_uri()}'\")" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "mlflow.list_experiments()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "### Interacting with the model registry" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "from mlflow.tracking import MlflowClient\n", 108 | "\n", 109 | "\n", 110 | "client = MlflowClient()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "from mlflow.exceptions import MlflowException\n", 120 | "\n", 121 | "try:\n", 122 | " 
client.list_registered_models()\n", 123 | "except MlflowException:\n", 124 | " print(\"It's not possible to access the model registry :(\")" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [] 133 | } 134 | ], 135 | "metadata": { 136 | "interpreter": { 137 | "hash": "0848c9d6c7d415ad6c477ff7ff8e98694d1a4aa96d0deee89244642e6b630036" 138 | }, 139 | "kernelspec": { 140 | "display_name": "Python 3.9.12 ('exp-tracking-env')", 141 | "language": "python", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "codemirror_mode": { 146 | "name": "ipython", 147 | "version": 3 148 | }, 149 | "file_extension": ".py", 150 | "mimetype": "text/x-python", 151 | "name": "python", 152 | "nbconvert_exporter": "python", 153 | "pygments_lexer": "ipython3", 154 | "version": "3.9.12" 155 | }, 156 | "orig_nbformat": 4 157 | }, 158 | "nbformat": 4, 159 | "nbformat_minor": 2 160 | } 161 | -------------------------------------------------------------------------------- /04-deployment/batch/score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import os 5 | import sys 6 | 7 | import uuid 8 | import pickle 9 | 10 | from datetime import datetime 11 | 12 | import pandas as pd 13 | 14 | import mlflow 15 | 16 | from prefect import task, flow, get_run_logger 17 | from prefect.context import get_run_context 18 | 19 | from dateutil.relativedelta import relativedelta 20 | 21 | from sklearn.feature_extraction import DictVectorizer 22 | from sklearn.ensemble import RandomForestRegressor 23 | from sklearn.metrics import mean_squared_error 24 | from sklearn.pipeline import make_pipeline 25 | 26 | 27 | def generate_uuids(n): 28 | ride_ids = [] 29 | for i in range(n): 30 | ride_ids.append(str(uuid.uuid4())) 31 | return ride_ids 32 | 33 | 34 | def read_dataframe(filename: str): 35 | df = pd.read_parquet(filename) 36 | 37 | df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime 38 | df.duration = df.duration.dt.total_seconds() / 60 39 | df = df[(df.duration >= 1) & (df.duration <= 60)] 40 | 41 | df['ride_id'] = generate_uuids(len(df)) 42 | 43 | return df 44 | 45 | 46 | def prepare_dictionaries(df: pd.DataFrame): 47 | categorical = ['PULocationID', 'DOLocationID'] 48 | df[categorical] = df[categorical].astype(str) 49 | 50 | df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID'] 51 | 52 | categorical = ['PU_DO'] 53 | numerical = ['trip_distance'] 54 | dicts = df[categorical + numerical].to_dict(orient='records') 55 | return dicts 56 | 57 | 58 | def load_model(run_id): 59 | logged_model = f's3://mlflow-models-alexey/1/{run_id}/artifacts/model' 60 | model = mlflow.pyfunc.load_model(logged_model) 61 | return model 62 | 63 | 64 | def save_results(df, y_pred, run_id, output_file): 65 | df_result = pd.DataFrame() 66 | df_result['ride_id'] = df['ride_id'] 67 | df_result['lpep_pickup_datetime'] = df['lpep_pickup_datetime'] 68 | df_result['PULocationID'] = df['PULocationID'] 69 | df_result['DOLocationID'] = df['DOLocationID'] 70 | df_result['actual_duration'] = df['duration'] 71 | df_result['predicted_duration'] = y_pred 72 | df_result['diff'] = df_result['actual_duration'] - df_result['predicted_duration'] 73 | df_result['model_version'] = run_id 74 | 75 | df_result.to_parquet(output_file, index=False) 76 | 77 | 78 | @task 79 | def apply_model(input_file, run_id, output_file): 80 | logger = get_run_logger() 81 | 82 | 
logger.info(f'reading the data from {input_file}...') 83 | df = read_dataframe(input_file) 84 | dicts = prepare_dictionaries(df) 85 | 86 | logger.info(f'loading the model with RUN_ID={run_id}...') 87 | model = load_model(run_id) 88 | 89 | logger.info(f'applying the model...') 90 | y_pred = model.predict(dicts) 91 | 92 | logger.info(f'saving the result to {output_file}...') 93 | 94 | save_results(df, y_pred, run_id, output_file) 95 | return output_file 96 | 97 | 98 | def get_paths(run_date, taxi_type, run_id): 99 | prev_month = run_date - relativedelta(months=1) 100 | year = prev_month.year 101 | month = prev_month.month 102 | 103 | input_file = f's3://nyc-tlc/trip data/{taxi_type}_tripdata_{year:04d}-{month:02d}.parquet' 104 | output_file = f's3://nyc-duration-prediction-alexey/taxi_type={taxi_type}/year={year:04d}/month={month:02d}/{run_id}.parquet' 105 | 106 | return input_file, output_file 107 | 108 | 109 | @flow 110 | def ride_duration_prediction( 111 | taxi_type: str, 112 | run_id: str, 113 | run_date: datetime = None): 114 | if run_date is None: 115 | ctx = get_run_context() 116 | run_date = ctx.flow_run.expected_start_time 117 | 118 | input_file, output_file = get_paths(run_date, taxi_type, run_id) 119 | 120 | apply_model( 121 | input_file=input_file, 122 | run_id=run_id, 123 | output_file=output_file 124 | ) 125 | 126 | 127 | def run(): 128 | taxi_type = sys.argv[1] # 'green' 129 | year = int(sys.argv[2]) # 2021 130 | month = int(sys.argv[3]) # 3 131 | 132 | run_id = sys.argv[4] # 'e1efc53e9bd149078b0c12aeaa6365df' 133 | 134 | ride_duration_prediction( 135 | taxi_type=taxi_type, 136 | run_id=run_id, 137 | run_date=datetime(year=year, month=month, day=1) 138 | ) 139 | 140 | 141 | if __name__ == '__main__': 142 | run() 143 | 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /06-best-practices/code/infrastructure/modules/lambda/iam.tf: -------------------------------------------------------------------------------- 1 | resource "aws_iam_role" "iam_lambda" { 2 | name = "iam_${var.lambda_function_name}" 3 | assume_role_policy = <= 1) & (reference_data.target <= 60)] 51 | features = ['PU_DO', 'PULocationID', 'DOLocationID', 'trip_distance'] 52 | x_pred = dv.transform(reference_data[features].to_dict(orient='records')) 53 | reference_data['prediction'] = model.predict(x_pred) 54 | return reference_data 55 | 56 | 57 | @task 58 | def fetch_data(): 59 | client = MongoClient(MONGO_CLIENT_ADDRESS) 60 | data = client.get_database(MONGO_DATABASE).get_collection(PREDICTION_COLLECTION).find() 61 | df = pandas.DataFrame(list(data)) 62 | return df 63 | 64 | @task 65 | def run_evidently(ref_data, data): 66 | 67 | ref_data.drop(['ehail_fee'], axis=1, inplace=True) 68 | data.drop('ehail_fee', axis=1, inplace=True) # drop empty column (until Evidently will work with it properly) 69 | 70 | profile = Profile(sections=[DataDriftProfileSection(), RegressionPerformanceProfileSection()]) 71 | mapping = ColumnMapping(prediction="prediction", numerical_features=['trip_distance'], 72 | categorical_features=['PULocationID', 'DOLocationID'], 73 | datetime_features=[]) 74 | profile.calculate(ref_data, data, mapping) 75 | 76 | dashboard = Dashboard(tabs=[DataDriftTab(), RegressionPerformanceTab(verbose_level=0)]) 77 | dashboard.calculate(ref_data, data, mapping) 78 | return json.loads(profile.json()), dashboard 79 | 80 | 81 | @task 82 | def save_report(result): 83 | """Save evidendtly profile for ride prediction to mongo server""" 84 | 85 | client = 
MongoClient(MONGO_CLIENT_ADDRESS) 86 | collection = client.get_database(MONGO_DATABASE).get_collection(REPORT_COLLECTION) 87 | collection.insert_one(result) 88 | 89 | @task 90 | def save_html_report(result, filename_suffix=None): 91 | """Create evidently html report file for ride prediction""" 92 | 93 | if filename_suffix is None: 94 | filename_suffix = datetime.now().strftime('%Y-%m-%d-%H-%M') 95 | 96 | result.save(f"ride_prediction_drift_report_{filename_suffix}.html") 97 | 98 | 99 | @flow 100 | def batch_analyze(): 101 | upload_target(TARGET_DATA_FILE) 102 | ref_data = load_reference_data(REFERENCE_DATA_FILE).result() 103 | data = fetch_data().result() 104 | profile, dashboard = run_evidently(ref_data, data).result() 105 | save_report(profile) 106 | save_html_report(dashboard) 107 | 108 | batch_analyze() 109 | -------------------------------------------------------------------------------- /.github/workflows/cd-deploy.yml: -------------------------------------------------------------------------------- 1 | name: CD-Deploy 2 | on: 3 | push: 4 | branches: 5 | - 'develop' 6 | # paths: 7 | # - '06-best-practices/code/**' 8 | 9 | jobs: 10 | build-push-deploy: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Check out repo 14 | uses: actions/checkout@v3 15 | - name: Configure AWS Credentials 16 | uses: aws-actions/configure-aws-credentials@v1 17 | with: 18 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 19 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 20 | aws-region: "eu-west-1" 21 | - uses: hashicorp/setup-terraform@v2 22 | with: 23 | terraform_wrapper: false 24 | 25 | # Define the infrastructure 26 | - name: TF plan 27 | id: tf-plan 28 | working-directory: '06-best-practices/code/infrastructure' 29 | run: | 30 | terraform init -backend-config="key=mlops-zoomcamp-prod.tfstate" -reconfigure && terraform plan -var-file=vars/prod.tfvars 31 | 32 | - name: TF Apply 33 | id: tf-apply 34 | working-directory: '06-best-practices/code/infrastructure' 35 | if: ${{ steps.tf-plan.outcome }} == 'success' 36 | run: | 37 | terraform apply -auto-approve -var-file=vars/prod.tfvars 38 | echo "::set-output name=ecr_repo::$(terraform output ecr_repo | xargs)" 39 | echo "::set-output name=predictions_stream_name::$(terraform output predictions_stream_name | xargs)" 40 | echo "::set-output name=model_bucket::$(terraform output model_bucket | xargs)" 41 | echo "::set-output name=lambda_function::$(terraform output lambda_function | xargs)" 42 | 43 | # Build-Push 44 | - name: Login to Amazon ECR 45 | id: login-ecr 46 | uses: aws-actions/amazon-ecr-login@v1 47 | 48 | - name: Build, tag, and push image to Amazon ECR 49 | id: build-image-step 50 | working-directory: "06-best-practices/code" 51 | env: 52 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 53 | ECR_REPOSITORY: ${{ steps.tf-apply.outputs.ecr_repo }} 54 | IMAGE_TAG: "latest" # ${{ github.sha }} 55 | run: | 56 | docker build -t ${ECR_REGISTRY}/${ECR_REPOSITORY}:${IMAGE_TAG} . 57 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 58 | echo "::set-output name=image_uri::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" 59 | 60 | # Deploy 61 | - name: Get model artifacts 62 | # The steps here are not suited for production. 63 | # In practice, retrieving the latest model version or RUN_ID from a service like MLflow or DVC can also be integrated into a CI/CD pipeline. 64 | # But due to the limited scope of this workshop, we would be keeping things simple. 
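        # Below, the key of the most recently modified object in the dev model bucket is used
        # to derive RUN_ID, and the dev bucket is synced into the prod bucket created by Terraform.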
65 | # In practice, you would also have a separate training pipeline to write new model artifacts to your Model Bucket in Prod. 66 | 67 | id: get-model-artifacts 68 | working-directory: "06-best-practices/code" 69 | env: 70 | MODEL_BUCKET_DEV: "mlflow-models-alexey" 71 | MODEL_BUCKET_PROD: ${{ steps.tf-apply.outputs.model_bucket }} 72 | run: | 73 | export RUN_ID=$(aws s3api list-objects-v2 --bucket ${MODEL_BUCKET_DEV} \ 74 | --query 'sort_by(Contents, &LastModified)[-1].Key' --output=text | cut -f2 -d/) 75 | aws s3 sync s3://${MODEL_BUCKET_DEV} s3://${MODEL_BUCKET_PROD} 76 | echo "::set-output name=run_id::${RUN_ID}" 77 | 78 | - name: Update Lambda 79 | env: 80 | LAMBDA_FUNCTION: ${{ steps.tf-apply.outputs.lambda_function }} 81 | PREDICTIONS_STREAM_NAME: ${{ steps.tf-apply.outputs.predictions_stream_name }} 82 | MODEL_BUCKET: ${{ steps.tf-apply.outputs.model_bucket }} 83 | RUN_ID: ${{ steps.get-model-artifacts.outputs.run_id }} 84 | run: | 85 | variables="{ \ 86 | PREDICTIONS_STREAM_NAME=$PREDICTIONS_STREAM_NAME, MODEL_BUCKET=$MODEL_BUCKET, RUN_ID=$RUN_ID \ 87 | }" 88 | 89 | STATE=$(aws lambda get-function --function-name $LAMBDA_FUNCTION --region "eu-west-1" --query 'Configuration.LastUpdateStatus' --output text) 90 | while [[ "$STATE" == "InProgress" ]] 91 | do 92 | echo "sleep 5sec ...." 93 | sleep 5s 94 | STATE=$(aws lambda get-function --function-name $LAMBDA_FUNCTION --region "eu-west-1" --query 'Configuration.LastUpdateStatus' --output text) 95 | echo $STATE 96 | done 97 | 98 | aws lambda update-function-configuration --function-name $LAMBDA_FUNCTION \ 99 | --environment "Variables=${variables}" 100 | -------------------------------------------------------------------------------- /02-experiment-tracking/mlflow_on_aws.md: -------------------------------------------------------------------------------- 1 | # Basic AWS setup 2 | 3 | This tutorials explains how to configure a remote tracking server on AWS. We will use an RDS database as the backend store and an s3 bucket as the artifact store. 4 | 5 | 1. First, you need to [create an AWS account](https://aws.amazon.com/free). If you open a new account, AWS allows you to use some of their products for free but take into account that **you may be charged for using the AWS services**. More information [here](https://youtu.be/rkKvzCskpLE) and [here](https://aws.amazon.com/premiumsupport/knowledge-center/free-tier-charges/). 6 | 7 | 2. Launch a new EC2 instance. 8 | 9 | For this, you can select one of the instance types that are free tier eligible. For example, we will select an Amazon Linux OS (`Amazon Linux 2 AMI (HVM) - Kernel 5.10, SSD Volume Type`) and a `t2.micro` instance type, which are free tier eligible. 10 | 11 | 12 | 13 | 14 | 15 | 16 | You'll also need to create a new key pair so later you can connect to the new instance using SSH. Click on "Create new key pair" and complete the details like in the image below: 17 | 18 | 19 | 20 | Select the new key pair and then click on "Launch Instance". 21 | 22 | 23 | 24 | Finally, you have to edit the security group so the EC2 instance accepts SSH (port 22) and HTTP connections (port 5000): 25 | 26 | 27 | 28 | 3. Create an s3 bucket to be used as the artifact store. 29 | 30 | Go to s3 and click on "Create bucket". Fill in the bucket name as in the image below and let all the other configurations with their default values. 
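    If you prefer the AWS CLI over the console, the same bucket can be created with a single command (the name below is only an example; bucket names must be globally unique, and the bucket is created in your default CLI region):

    ```bash
    aws s3 mb s3://mlflow-artifacts-remote
    ```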
31 | 32 | 33 | 34 | Note: s3 bucket names must be unique across all AWS account in all the AWS Regions within a partition, that means that once a bucket is created, the name of that bucket cannot be used by another AWS account within the same region. If you get an error saying that the bucket name was already taken you can fix it easily by just changing the name to something like `mlflow-artifacts-remote-2` or another name. 35 | 36 | 4. Create a new PostgreSQL database to be used as the backend store 37 | 38 | Go to the RDS Console and click on "Create database". Make sure to select "PostgreSQL" engine type and the "Free tier" template. 39 | 40 | 41 | 42 | Select a name for your DB instance, set the master username as "mlflow" and tick the option "Auto generate a password" so Amazon RDS generate a password automatically. 43 | 44 | 45 | 46 | Finally, on the section "Additional configuration" specify a database name so RDS automatically creates an initial database for you. 47 | 48 | 49 | 50 | After clicking on "launch database" you will be able to check the newly generated password, but take into account that the automatically generated password will be shown only once! 51 | 52 | 53 | 54 | You can use the default values for all the other configurations. 55 | 56 | Take note of the following information: 57 | 58 | * master username 59 | * password 60 | * initial database name 61 | * endpoint 62 | 63 | Once the DB instance is created, go to the RDS console, select the new db and under "Connectivity & security" select the VPC security group. Modify the security group by adding a new inbound rule that allows postgreSQL connections on the port 5432 from the security group of the EC2 instance. This way, the server will be able to connect to the postgres database. 64 | 65 | 66 | 67 | 5. Connect to the EC2 instance and launch the tracking server. 68 | 69 | Go to the EC2 Console and find the instance launched on the step 2. Click on "Connect" and then follow the steps described in the tab "SSH". 70 | 71 | Run the following commands to install the dependencies, configure the environment and launch the server: 72 | * `sudo yum update` 73 | * `pip3 install mlflow boto3 psycopg2-binary` 74 | * `aws configure` # you'll need to input your AWS credentials here 75 | * `mlflow server -h 0.0.0.0 -p 5000 --backend-store-uri postgresql://DB_USER:DB_PASSWORD@DB_ENDPOINT:5432/DB_NAME --default-artifact-root s3://S3_BUCKET_NAME` 76 | 77 | Note: before launching the server, check that the instance can access the s3 bucket created in the step number 3. To do that, just run this command from the EC2 instance: `aws s3 ls`. You should see the bucket listed in the result. 78 | 79 | 6. Access the remote tracking server from your local machine. 80 | 81 | Open a new tab on your web browser and go to this address: `http://:5000` (you can find the instance's public DNS by checking the details of your instance in the EC2 Console). -------------------------------------------------------------------------------- /06-best-practices/code/README.md: -------------------------------------------------------------------------------- 1 | ## Code snippets 2 | 3 | ### Building and running Docker images 4 | 5 | ```bash 6 | docker build -t stream-model-duration:v2 . 
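# the v2 tag above is only for local runs; the CD workflow (.github/workflows/cd-deploy.yml) pushes the image to ECR tagged as "latest"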
7 | ``` 8 | 9 | ```bash 10 | docker run -it --rm \ 11 | -p 8080:8080 \ 12 | -e PREDICTIONS_STREAM_NAME="ride_predictions" \ 13 | -e RUN_ID="e1efc53e9bd149078b0c12aeaa6365df" \ 14 | -e TEST_RUN="True" \ 15 | -e AWS_DEFAULT_REGION="eu-west-1" \ 16 | stream-model-duration:v2 17 | ``` 18 | 19 | Mounting the model folder: 20 | 21 | ``` 22 | docker run -it --rm \ 23 | -p 8080:8080 \ 24 | -e PREDICTIONS_STREAM_NAME="ride_predictions" \ 25 | -e RUN_ID="Test123" \ 26 | -e MODEL_LOCATION="/app/model" \ 27 | -e TEST_RUN="True" \ 28 | -e AWS_DEFAULT_REGION="eu-west-1" \ 29 | -v $(pwd)/model:/app/model \ 30 | stream-model-duration:v2 31 | ``` 32 | 33 | ### Specifying endpoint URL 34 | 35 | ```bash 36 | aws --endpoint-url=http://localhost:4566 \ 37 | kinesis list-streams 38 | ``` 39 | 40 | ```bash 41 | aws --endpoint-url=http://localhost:4566 \ 42 | kinesis create-stream \ 43 | --stream-name ride_predictions \ 44 | --shard-count 1 45 | ``` 46 | 47 | ```bash 48 | aws --endpoint-url=http://localhost:4566 \ 49 | kinesis get-shard-iterator \ 50 | --shard-id ${SHARD} \ 51 | --shard-iterator-type TRIM_HORIZON \ 52 | --stream-name ${PREDICTIONS_STREAM_NAME} \ 53 | --query 'ShardIterator' 54 | ``` 55 | 56 | ### Unable to locate credentials 57 | 58 | If you get `'Unable to locate credentials'` error, add these 59 | env variables to the `docker-compose.yaml` file: 60 | 61 | ```yaml 62 | - AWS_ACCESS_KEY_ID=abc 63 | - AWS_SECRET_ACCESS_KEY=xyz 64 | ``` 65 | 66 | ### Make 67 | 68 | Without make: 69 | 70 | ``` 71 | isort . 72 | black . 73 | pylint --recursive=y . 74 | pytest tests/ 75 | ``` 76 | 77 | With make: 78 | 79 | ``` 80 | make quality_checks 81 | make test 82 | ``` 83 | 84 | 85 | To prepare the project, run 86 | 87 | ```bash 88 | make setup 89 | ``` 90 | 91 | 92 | ### IaC 93 | w/ Terraform 94 | 95 | #### Setup 96 | 97 | **Installation**: 98 | 99 | * [aws-cli](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) (both versions are fine) 100 | * [terraform client](https://www.terraform.io/downloads) 101 | 102 | **Configuration**: 103 | 104 | 1. If you've already created an AWS account, head to the IAM section, generate your secret-key, and download it locally. 105 | [Instructions](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-prereqs.html) 106 | 107 | 2. [Configure]((https://docs.aws.amazon.com/cli/latest/userguide/getting-started-quickstart.html)) `aws-cli` with your downloaded AWS secret keys: 108 | ```shell 109 | $ aws configure 110 | AWS Access Key ID [None]: xxx 111 | AWS Secret Access Key [None]: xxx 112 | Default region name [None]: eu-west-1 113 | Default output format [None]: 114 | ``` 115 | 116 | 3. Verify aws config: 117 | ```shell 118 | $ aws sts get-caller-identity 119 | ``` 120 | 121 | 4. (Optional) Configuring with `aws profile`: [here](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-sourcing-external.html) and [here](https://registry.terraform.io/providers/hashicorp/aws/latest/docs#using-an-external-credentials-process) 122 | 123 |
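Optionally, verify that both CLIs are installed and on your `PATH` before moving on to the execution steps:

```shell
terraform version
aws --version
```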
124 | 125 | #### Execution 126 | 127 | 128 | 1. To create infra (manually, in order to test on staging env) 129 | ```shell 130 | # Initialize state file (.tfstate) 131 | terraform init 132 | 133 | # Check changes to new infra plan 134 | terraform plan -var-file=vars/stg.tfvars 135 | ``` 136 | 137 | ```shell 138 | # Create new infra 139 | terraform apply -var-file=vars/stg.tfvars 140 | ``` 141 | 142 | 2. To prepare aws env (copy model artifacts, set env-vars for lambda etc.): 143 | ``` 144 | . ./scripts/deploy_manual.sh 145 | ``` 146 | 147 | 3. To test the pipeline end-to-end with our new cloud infra: 148 | ``` 149 | . ./scripts/test_cloud_e2e.sh 150 | ``` 151 | 152 | 4. And then check on CloudWatch logs. Or try `get-records` on the `output_kinesis_stream` (refer to `integration_test`) 153 | 154 | 5. Destroy infra after use: 155 | ```shell 156 | # Delete infra after your work, to avoid costs on any running services 157 | terraform destroy 158 | ``` 159 | 160 |
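Once `terraform apply` has finished, the resource names for the environment can be read back at any time with `terraform output` (run from the `infrastructure` directory); these are the same outputs the CD workflow captures to drive the image push and the Lambda configuration update:

```shell
terraform output ecr_repo
terraform output predictions_stream_name
terraform output model_bucket
terraform output lambda_function
```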
161 | 162 | ### CI/CD 163 | 164 | 1. Create a PR (feature branch): `.github/workflows/ci-tests.yml` 165 | * Env setup, Unit test, Integration test, Terraform plan 166 | 2. Merge PR to `develop`: `.github/workflows/cd-deploy.yml` 167 | * Terraform plan, Terraform apply, Docker build & ECR push, Update Lambda config 168 | 169 | ### Notes 170 | 171 | * Unfortunately, the `RUN_ID` (if set via the `ENV` or `ARG` in `Dockerfile`), disappears during lambda invocation. 172 | We'll set it via `aws lambda update-function-configuration` CLI command (refer to `deploy_manual.sh` or `.github/workflows/cd-deploy.yml`) 173 | 174 | -------------------------------------------------------------------------------- /02-experiment-tracking/running-mlflow-examples/scenario-3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Scenario 3: Multiple data scientists working on multiple ML models\n", 8 | "\n", 9 | "MLflow setup:\n", 10 | "* Tracking server: yes, remote server (EC2).\n", 11 | "* Backend store: postgresql database.\n", 12 | "* Artifacts store: s3 bucket.\n", 13 | "\n", 14 | "The experiments can be explored by accessing the remote server.\n", 15 | "\n", 16 | "The exampe uses AWS to host a remote server. In order to run the example you'll need an AWS account. Follow the steps described in the file `mlflow_on_aws.md` to create a new AWS account and launch the tracking server. " 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import mlflow\n", 26 | "import os\n", 27 | "\n", 28 | "os.environ[\"AWS_PROFILE\"] = \"\" # fill in with your AWS profile. 
More info: https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/setup.html#setup-credentials\n", 29 | "\n", 30 | "TRACKING_SERVER_HOST = \"\" # fill in with the public DNS of the EC2 instance\n", 31 | "mlflow.set_tracking_uri(f\"http://{TRACKING_SERVER_HOST}:5000\")" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "print(f\"tracking URI: '{mlflow.get_tracking_uri()}'\")" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "mlflow.list_experiments()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "from sklearn.linear_model import LogisticRegression\n", 59 | "from sklearn.datasets import load_iris\n", 60 | "from sklearn.metrics import accuracy_score\n", 61 | "\n", 62 | "mlflow.set_experiment(\"my-experiment-1\")\n", 63 | "\n", 64 | "with mlflow.start_run():\n", 65 | "\n", 66 | " X, y = load_iris(return_X_y=True)\n", 67 | "\n", 68 | " params = {\"C\": 0.1, \"random_state\": 42}\n", 69 | " mlflow.log_params(params)\n", 70 | "\n", 71 | " lr = LogisticRegression(**params).fit(X, y)\n", 72 | " y_pred = lr.predict(X)\n", 73 | " mlflow.log_metric(\"accuracy\", accuracy_score(y, y_pred))\n", 74 | "\n", 75 | " mlflow.sklearn.log_model(lr, artifact_path=\"models\")\n", 76 | " print(f\"default artifacts URI: '{mlflow.get_artifact_uri()}'\")" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "mlflow.list_experiments()" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### Interacting with the model registry" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "from mlflow.tracking import MlflowClient\n", 109 | "\n", 110 | "\n", 111 | "client = MlflowClient(f\"http://{TRACKING_SERVER_HOST}:5000\")" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "client.list_registered_models()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "run_id = client.list_run_infos(experiment_id='1')[0].run_id\n", 130 | "mlflow.register_model(\n", 131 | " model_uri=f\"runs:/{run_id}/models\",\n", 132 | " name='iris-classifier'\n", 133 | ")" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [] 142 | } 143 | ], 144 | "metadata": { 145 | "interpreter": { 146 | "hash": "0848c9d6c7d415ad6c477ff7ff8e98694d1a4aa96d0deee89244642e6b630036" 147 | }, 148 | "kernelspec": { 149 | "display_name": "Python 3.9.12 ('exp-tracking-env')", 150 | "language": "python", 151 | "name": "python3" 152 | }, 153 | "language_info": { 154 | "codemirror_mode": { 155 | "name": "ipython", 156 | "version": 3 157 | }, 158 | "file_extension": ".py", 159 | "mimetype": "text/x-python", 160 | "name": "python", 161 | "nbconvert_exporter": "python", 162 | "pygments_lexer": "ipython3", 163 | "version": "3.9.12" 164 | }, 165 | 
"orig_nbformat": 4 166 | }, 167 | "nbformat": 4, 168 | "nbformat_minor": 2 169 | } 170 | -------------------------------------------------------------------------------- /04-deployment/homework/starter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "2c51efaa", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "scikit-learn==1.0.2\n", 14 | "scikit-learn-intelex==2021.20210714.120553\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "!pip freeze | grep scikit-learn" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "id": "0ef880a0", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import pickle\n", 30 | "import pandas as pd" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 15, 36 | "id": "920cff32", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "year = 2021\n", 41 | "month = 2\n", 42 | "\n", 43 | "input_file = f'https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_{year:04d}-{month:02d}.parquet'\n", 44 | "output_file = f'output/fhv_tripdata_{year:04d}-{month:02d}.parquet'" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "id": "7836ccfd", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "with open('model.bin', 'rb') as f_in:\n", 55 | " dv, lr = pickle.load(f_in)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "id": "41c08294", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "categorical = ['PUlocationID', 'DOlocationID']\n", 66 | "\n", 67 | "def read_data(filename):\n", 68 | " df = pd.read_parquet(filename)\n", 69 | " \n", 70 | " df['duration'] = df.dropOff_datetime - df.pickup_datetime\n", 71 | " df['duration'] = df.duration.dt.total_seconds() / 60\n", 72 | "\n", 73 | " df = df[(df.duration >= 1) & (df.duration <= 60)].copy()\n", 74 | "\n", 75 | " df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')\n", 76 | " \n", 77 | " return df" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 10, 83 | "id": "4854399a", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "df = read_data(input_file)\n", 88 | "df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 6, 94 | "id": "669fda0a", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "dicts = df[categorical].to_dict(orient='records')\n", 99 | "X_val = dv.transform(dicts)\n", 100 | "y_pred = lr.predict(X_val)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 7, 106 | "id": "914b15a5", 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/plain": [ 112 | "16.191691679979066" 113 | ] 114 | }, 115 | "execution_count": 7, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "y_pred.mean()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 13, 127 | "id": "037e3d22", 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "df_result = pd.DataFrame()\n", 132 | "df_result['ride_id'] = df['ride_id']\n", 133 | "df_result['predicted_duration'] = y_pred" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 17, 139 | "id": "7a5753be", 140 | "metadata": {}, 141 | "outputs": [], 142 | 
"source": [ 143 | "df_result.to_parquet(\n", 144 | " output_file,\n", 145 | " engine='pyarrow',\n", 146 | " compression=None,\n", 147 | " index=False\n", 148 | ")" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 18, 154 | "id": "f0b3b58c", 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "total 19M\r\n", 162 | "-rw-rw-r-- 1 ubuntu ubuntu 19M Jun 30 08:43 fhv_tripdata_2021-02.parquet\r\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "!ls -lh output/" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "0dbe3e15", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [] 177 | } 178 | ], 179 | "metadata": { 180 | "kernelspec": { 181 | "display_name": "Python 3 (ipykernel)", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.9.7" 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 5 200 | } 201 | --------------------------------------------------------------------------------