├── .github
    └── workflows
    │   ├── cd-deploy.yml
    │   └── ci-tests.yml
├── .gitignore
├── 01-intro
    ├── README.md
    ├── duration-prediction.ipynb
    ├── images
    │   ├── thumbnail-1-01.jpg
    │   ├── thumbnail-1-02-1.jpg
    │   ├── thumbnail-1-02.jpg
    │   ├── thumbnail-1-03.jpg
    │   ├── thumbnail-1-04.jpg
    │   └── thumbnail-1-05.jpg
    └── meta.json
├── 02-experiment-tracking
    ├── README.md
    ├── duration-prediction.ipynb
    ├── images
    │   ├── db_configuration.png
    │   ├── db_password.png
    │   ├── db_settings.png
    │   ├── ec2_instance_type.png
    │   ├── ec2_os.png
    │   ├── key_pair.png
    │   ├── postgresql.png
    │   ├── postgresql_inbound_rule.png
    │   ├── s3_bucket.png
    │   ├── security_group.png
    │   ├── select_key_pair.png
    │   ├── thumbnail-2-01.jpg
    │   ├── thumbnail-2-02.jpg
    │   ├── thumbnail-2-03.jpg
    │   ├── thumbnail-2-04.jpg
    │   ├── thumbnail-2-05.jpg
    │   ├── thumbnail-2-06.jpg
    │   └── thumbnail-2-07.jpg
    ├── meta.json
    ├── mlflow_on_aws.md
    ├── model-registry.ipynb
    ├── requirements.txt
    └── running-mlflow-examples
    │   ├── scenario-1.ipynb
    │   ├── scenario-2.ipynb
    │   └── scenario-3.ipynb
├── 03-orchestration
    ├── README.md
    └── code
    │   ├── commands.md
    │   ├── duration-prediction.ipynb
    │   └── duration-prediction.py
├── 04-deployment
    ├── README.md
    ├── batch
    │   ├── Pipfile
    │   ├── Pipfile.lock
    │   ├── README.md
    │   ├── score.ipynb
    │   ├── score.py
    │   ├── score_backfill.py
    │   └── score_deploy.py
    ├── images
    │   ├── thumbnail-4-01.jpg
    │   ├── thumbnail-4-02.jpg
    │   ├── thumbnail-4-03.jpg
    │   ├── thumbnail-4-04.jpg
    │   ├── thumbnail-4-05.jpg
    │   └── thumbnail-4-06.jpg
    ├── load_model.ipynb
    ├── meta.json
    ├── streaming
    │   ├── Dockerfile
    │   ├── Pipfile
    │   ├── Pipfile.lock
    │   ├── README.md
    │   ├── lambda_function.py
    │   ├── test.py
    │   └── test_docker.py
    ├── web-service-mlflow
    │   ├── Pipfile
    │   ├── Pipfile.lock
    │   ├── README.md
    │   ├── dict_vectorizer.bin
    │   ├── predict.py
    │   ├── random-forest.ipynb
    │   └── test.py
    └── web-service
    │   ├── Dockerfile
    │   ├── Pipfile
    │   ├── Pipfile.lock
    │   ├── README.md
    │   ├── lin_reg.bin
    │   ├── predict.py
    │   └── test.py
├── 05-monitoring
    ├── README.md
    ├── baseline_model_nyc_taxi_data.ipynb
    ├── config
    │   ├── grafana_dashboards.yaml
    │   └── grafana_datasources.yaml
    ├── dashboards
    │   └── data_drift.json
    ├── data
    │   └── .gitignore
    ├── debugging_nyc_taxi_data.ipynb
    ├── docker-compose.yml
    ├── dummy_metrics_calculation.py
    ├── evidently_metrics_calculation.py
    ├── images
    │   ├── thumbnail-5-01.jpg
    │   ├── thumbnail-5-02.jpg
    │   ├── thumbnail-5-03.jpg
    │   ├── thumbnail-5-04.jpg
    │   ├── thumbnail-5-05.jpg
    │   ├── thumbnail-5-06.jpg
    │   ├── thumbnail-5-07.jpg
    │   └── thumbnail-5-08.jpg
    ├── meta.json
    ├── models
    │   └── .gitignore
    └── requirements.txt
├── 06-best-practices
    ├── AWS-stream-pipeline.png
    ├── README.md
    ├── ci_cd_zoomcamp.png
    ├── code
    │   ├── .gitignore
    │   ├── .pre-commit-config.yaml
    │   ├── .vscode
    │   │   └── settings.json
    │   ├── Dockerfile
    │   ├── Makefile
    │   ├── Pipfile
    │   ├── Pipfile.lock
    │   ├── README.md
    │   ├── infrastructure
    │   │   ├── main.tf
    │   │   ├── modules
    │   │   │   ├── ecr
    │   │   │   │   ├── main.tf
    │   │   │   │   └── variables.tf
    │   │   │   ├── kinesis
    │   │   │   │   ├── main.tf
    │   │   │   │   └── variables.tf
    │   │   │   ├── lambda
    │   │   │   │   ├── iam.tf
    │   │   │   │   ├── main.tf
    │   │   │   │   └── variables.tf
    │   │   │   └── s3
    │   │   │   │   ├── main.tf
    │   │   │   │   └── variables.tf
    │   │   ├── variables.tf
    │   │   └── vars
    │   │   │   ├── prod.tfvars
    │   │   │   └── stg.tfvars
    │   ├── integration-test
    │   │   ├── docker-compose.yaml
    │   │   ├── event.json
    │   │   ├── model
    │   │   │   ├── MLmodel
    │   │   │   ├── conda.yaml
    │   │   │   ├── model.pkl
    │   │   │   ├── python_env.yaml
    │   │   │   └── requirements.txt
    │   │   ├── run.sh
    │   │   ├── test_docker.py
    │   │   └── test_kinesis.py
    │   ├── lambda_function.py
    │   ├── model.py
    │   ├── plan.md
    │   ├── pyproject.toml
    │   ├── scripts
    │   │   ├── deploy_manual.sh
    │   │   ├── publish.sh
    │   │   └── test_cloud_e2e.sh
    │   └── tests
    │   │   ├── __init__.py
    │   │   ├── data.b64
    │   │   └── model_test.py
    ├── docs.md
    ├── images
    │   ├── thumbnail-6-1.jpg
    │   ├── thumbnail-6-2.jpg
    │   ├── thumbnail-6-3.jpg
    │   ├── thumbnail-6-4.jpg
    │   ├── thumbnail-6-5.jpg
    │   └── thumbnail-6-6.jpg
    └── meta.json
├── 07-project
    ├── README.md
    └── images
    │   └── thumbnail-7-1.jpg
├── README.md
├── after-sign-up.md
├── asking-questions.md
├── certificate.md
├── cohorts
    ├── 2022
    │   ├── 01-intro
    │   │   ├── homework.ipynb
    │   │   └── homework.md
    │   ├── 02-experiment-tracking
    │   │   ├── homework.md
    │   │   └── homework
    │   │   │   ├── hpo.py
    │   │   │   ├── preprocess_data.py
    │   │   │   ├── register_model.py
    │   │   │   └── train.py
    │   ├── 03-orchestration
    │   │   ├── README.md
    │   │   ├── code
    │   │   │   ├── model_training.py
    │   │   │   ├── orchestration.py
    │   │   │   ├── prefect_deploy.py
    │   │   │   ├── prefect_flow.py
    │   │   │   └── work-queue.py
    │   │   ├── homework.md
    │   │   ├── homework.py
    │   │   ├── homework_solution.py
    │   │   └── images
    │   │   │   ├── thumbnail-3-01.jpg
    │   │   │   ├── thumbnail-3-02.jpg
    │   │   │   ├── thumbnail-3-03.jpg
    │   │   │   ├── thumbnail-3-04.jpg
    │   │   │   ├── thumbnail-3-05.jpg
    │   │   │   └── thumbnail-3-06.jpg
    │   ├── 04-deployment
    │   │   ├── homework.md
    │   │   └── homework
    │   │   │   ├── Dockerfile
    │   │   │   ├── Pipfile
    │   │   │   ├── Pipfile.lock
    │   │   │   ├── batch.py
    │   │   │   ├── homework.dockerfile
    │   │   │   ├── model.bin
    │   │   │   └── starter.ipynb
    │   ├── 05-monitoring
    │   │   ├── README.md
    │   │   ├── homework.md
    │   │   └── homework
    │   │   │   ├── docker-compose-homework-solution.yml
    │   │   │   ├── docker-compose-homework.yml
    │   │   │   ├── model_training.py
    │   │   │   ├── prediction_service
    │   │   │       ├── Dockerfile
    │   │   │       ├── Pipfile
    │   │   │       ├── Pipfile.lock
    │   │   │       ├── app.py
    │   │   │       ├── lin_reg.bin
    │   │   │       └── lin_reg_V2.bin
    │   │   │   ├── prefect-monitoring
    │   │   │       ├── Pipfile
    │   │   │       ├── Pipfile.lock
    │   │   │       ├── clean_mongo.py
    │   │   │       ├── monitor_profile.ipynb
    │   │   │       ├── monitor_profile_solution.ipynb
    │   │   │       ├── prefect_monitoring.py
    │   │   │       ├── prefect_monitoring_solution.py
    │   │   │       ├── prepare_reference_data.py
    │   │   │       └── send_data.py
    │   │   │   ├── prepare.py
    │   │   │   ├── requirements.txt
    │   │   │   └── test.py
    │   ├── 06-best-practices
    │   │   ├── homework.md
    │   │   ├── homework
    │   │   │   ├── Dockerfile
    │   │   │   ├── Pipfile
    │   │   │   ├── Pipfile.lock
    │   │   │   ├── batch.py
    │   │   │   └── model.bin
    │   │   └── homework_solution
    │   │   │   ├── Dockerfile
    │   │   │   ├── Pipfile
    │   │   │   ├── Pipfile.lock
    │   │   │   ├── batch.py
    │   │   │   ├── docker-compose.yaml
    │   │   │   ├── integration_test.py
    │   │   │   ├── integration_test.sh
    │   │   │   ├── model.bin
    │   │   │   └── tests
    │   │   │       ├── __init__.py
    │   │   │       └── test_batch.py
    │   ├── 07-project
    │   │   └── README.md
    │   └── leaderboard.md
    ├── 2023
    │   ├── 01-intro
    │   │   └── homework.md
    │   ├── 02-experiment-tracking
    │   │   ├── homework-wandb
    │   │   │   ├── preprocess_data.py
    │   │   │   ├── sweep.py
    │   │   │   └── train.py
    │   │   ├── homework.md
    │   │   ├── homework
    │   │   │   ├── hpo.py
    │   │   │   ├── preprocess_data.py
    │   │   │   ├── register_model.py
    │   │   │   └── train.py
    │   │   ├── solution-mlflow
    │   │   │   ├── hpo.py
    │   │   │   ├── preprocess_data.py
    │   │   │   ├── register_model.py
    │   │   │   └── train.py
    │   │   └── wandb.md
    │   ├── 03-orchestration
    │   │   ├── homework.md
    │   │   └── prefect
    │   │   │   ├── .gitignore
    │   │   │   ├── 3.2
    │   │   │       ├── cat_dog_facts.py
    │   │   │       └── cat_facts.py
    │   │   │   ├── 3.3
    │   │   │       ├── duration_prediction_explore.ipynb
    │   │   │       ├── duration_prediction_original.ipynb
    │   │   │       ├── orchestrate.py
    │   │   │       └── orchestrate_pre_prefect.py
    │   │   │   ├── 3.4
    │   │   │       └── orchestrate.py
    │   │   │   ├── 3.5
    │   │   │       ├── create_s3_bucket_block.py
    │   │   │       ├── orchestrate.py
    │   │   │       └── orchestrate_s3.py
    │   │   │   ├── 3.6
    │   │   │       ├── create_s3_bucket_block.py
    │   │   │       └── orchestrate_s3.py
    │   │   │   ├── README.md
    │   │   │   ├── images
    │   │   │       ├── Activity-create-run-deployment.png
    │   │   │       ├── thumbnail-3-01.jpg
    │   │   │       ├── thumbnail-3-01.png
    │   │   │       ├── thumbnail-3-02.jpg
    │   │   │       ├── thumbnail-3-03.jpg
    │   │   │       ├── thumbnail-3-03.png
    │   │   │       ├── thumbnail-3-04.jpg
    │   │   │       ├── thumbnail-3-04.png
    │   │   │       ├── thumbnail-3-05.jpg
    │   │   │       ├── thumbnail-3-05.png
    │   │   │       ├── thumbnail-3-06.jpg
    │   │   │       └── thumbnail-3-06.png
    │   │   │   ├── meta.json
    │   │   │   └── requirements.txt
    │   ├── 04-deployment
    │   │   ├── homework.md
    │   │   └── homework
    │   │   │   ├── Dockerfile
    │   │   │   ├── model.bin
    │   │   │   └── starter.ipynb
    │   ├── 05-monitoring
    │   │   └── homework.md
    │   ├── 06-best-practices
    │   │   ├── homework.md
    │   │   ├── homework
    │   │   │   ├── Dockerfile
    │   │   │   ├── Pipfile
    │   │   │   ├── Pipfile.lock
    │   │   │   ├── batch.py
    │   │   │   └── model.bin
    │   │   └── homework_solution
    │   │   │   ├── Pipfile
    │   │   │   ├── Pipfile.lock
    │   │   │   ├── batch.py
    │   │   │   ├── docker-compose.yaml
    │   │   │   ├── integration_test.py
    │   │   │   ├── model.bin
    │   │   │   └── tests
    │   │   │       ├── __init__.py
    │   │   │       └── test_batch.py
    │   ├── 07-project
    │   │   └── README.md
    │   └── README.md
    ├── 2024
    │   ├── 01-intro
    │   │   └── homework.md
    │   ├── 02-experiment-tracking
    │   │   ├── homework.md
    │   │   ├── homework
    │   │   │   ├── hpo.py
    │   │   │   ├── preprocess_data.py
    │   │   │   ├── register_model.py
    │   │   │   └── train.py
    │   │   └── solution
    │   │   │   ├── hpo.py
    │   │   │   ├── preprocess_data.py
    │   │   │   ├── register_model.py
    │   │   │   └── train.py
    │   ├── 03-orchestration
    │   │   ├── .gitignore
    │   │   ├── 3.0
    │   │   │   └── README.md
    │   │   ├── 3.1
    │   │   │   └── README.md
    │   │   ├── 3.2
    │   │   │   └── README.md
    │   │   ├── 3.3
    │   │   │   └── README.md
    │   │   ├── 3.4
    │   │   │   └── README.md
    │   │   ├── 3.5
    │   │   │   └── README.md
    │   │   ├── README.md
    │   │   ├── homework.md
    │   │   ├── meta.json
    │   │   └── requirements.txt
    │   ├── 04-deployment
    │   │   ├── homework.md
    │   │   ├── homework
    │   │   │   ├── model.bin
    │   │   │   └── starter.ipynb
    │   │   └── homework_solution
    │   │   │   ├── Dockerfile
    │   │   │   ├── Pipfile
    │   │   │   ├── Pipfile.lock
    │   │   │   ├── batch.py
    │   │   │   ├── homework.dockerfile
    │   │   │   ├── model.bin
    │   │   │   ├── model2.bin
    │   │   │   └── solution.ipynb
    │   ├── 05-monitoring
    │   │   └── homework.md
    │   ├── 06-best-practices
    │   │   ├── homework.md
    │   │   └── homework
    │   │   │   ├── Dockerfile
    │   │   │   ├── Pipfile
    │   │   │   ├── Pipfile.lock
    │   │   │   ├── batch.py
    │   │   │   └── model.bin
    │   ├── README.md
    │   └── project.md
    └── 2025
    │   ├── 01-intro
    │       ├── homework.ipynb
    │       └── homework.md
    │   ├── 02-experiment-tracking
    │       ├── homework.md
    │       └── homework
    │       │   ├── hpo.py
    │       │   ├── preprocess_data.py
    │       │   ├── register_model.py
    │       │   └── train.py
    │   ├── 03-orchestration
    │       └── homework.md
    │   ├── 04-deployment
    │       ├── homework.md
    │       └── homework
    │       │   ├── model.bin
    │       │   └── starter.ipynb
    │   ├── 05-monitoring
    │       └── homework.md
    │   ├── 06-best-practices
    │       ├── homework.md
    │       └── homework
    │       │   ├── Dockerfile
    │       │   ├── Pipfile
    │       │   ├── Pipfile.lock
    │       │   ├── batch.py
    │       │   └── model.bin
    │   ├── README.md
    │   └── project.md
├── generate
    └── generate_pages.ipynb
├── images
    ├── IMG_20230323_134059_927.png
    ├── banner-2025.jpg
    ├── banner.png
    ├── learning-in-public-links.png
    ├── learning-in-public.png
    └── play.png
└── learning-in-public.md


/.github/workflows/ci-tests.yml:
--------------------------------------------------------------------------------
 1 | name: CI-Tests
 2 | on:
 3 |   pull_request:
 4 |     branches:
 5 |       - 'develop'
 6 |     paths:
 7 |       - '06-best-practices/code/**'
 8 | 
 9 | env:
10 |   AWS_DEFAULT_REGION: 'eu-west-1'
11 |   AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
12 |   AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
13 | 
14 | jobs:
15 |   test:
16 |     runs-on: ubuntu-latest
17 |     steps:
18 |       - uses: actions/checkout@v2
19 |       - name: Set up Python 3.9
20 |         uses: actions/setup-python@v2
21 |         with:
22 |           python-version: 3.9
23 | 
24 |       - name: Install dependencies
25 |         working-directory: "06-best-practices/code"
26 |         run: pip install pipenv && pipenv install --dev
27 | 
28 |       - name: Run Unit tests
29 |         working-directory: "06-best-practices/code"
30 |         run: pipenv run pytest tests/
31 | 
32 |       - name: Lint
33 |         working-directory: "06-best-practices/code"
34 |         run: pipenv run pylint --recursive=y .
35 | 
36 |       - name: Configure AWS Credentials
37 |         uses: aws-actions/configure-aws-credentials@v1
38 |         with:
39 |           aws-access-key-id: ${{ env.AWS_ACCESS_KEY_ID }}
40 |           aws-secret-access-key: ${{ env.AWS_SECRET_ACCESS_KEY }}
41 |           aws-region: ${{ env.AWS_DEFAULT_REGION }}
42 | 
43 |       - name: Integration Test
44 |         working-directory: '06-best-practices/code/integraton-test'
45 |         run: |
46 |           . run.sh
47 | 
48 |   tf-plan:
49 |     runs-on: ubuntu-latest
50 |     steps:
51 |       - uses: actions/checkout@v2
52 |       - name: Configure AWS Credentials
53 |         uses: aws-actions/configure-aws-credentials@v1
54 |         with:
55 |           aws-access-key-id: ${{ env.AWS_ACCESS_KEY_ID }}
56 |           aws-secret-access-key: ${{ env.AWS_SECRET_ACCESS_KEY }}
57 |           aws-region: ${{ env.AWS_DEFAULT_REGION }}
58 | 
59 |       - uses: hashicorp/setup-terraform@v2
60 | 
61 |       - name: TF plan
62 |         id: plan
63 |         working-directory: '06-best-practices/code/infrastructure'
64 |         run: |
65 |           terraform init -backend-config="key=mlops-zoomcamp-prod.tfstate" --reconfigure && terraform plan --var-file vars/prod.tfvars
66 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | data/
 2 | .ipynb_checkpoints
 3 | .bin
 4 | *.db
 5 | 
 6 | *.parquet
 7 | *.html
 8 | *.csv
 9 | 
10 | .venv
11 | venv
12 | .idea
13 | **/artifacts/
14 | **/models/
15 | 
16 | __pycache__/
17 | **.env
18 | **.terraform/
19 | **.terraform.lock*
20 | **terraform.tfstate*
21 | 
22 | 
23 | .DS_Store


--------------------------------------------------------------------------------
/01-intro/images/thumbnail-1-01.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/01-intro/images/thumbnail-1-01.jpg


--------------------------------------------------------------------------------
/01-intro/images/thumbnail-1-02-1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/01-intro/images/thumbnail-1-02-1.jpg


--------------------------------------------------------------------------------
/01-intro/images/thumbnail-1-02.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/01-intro/images/thumbnail-1-02.jpg


--------------------------------------------------------------------------------
/01-intro/images/thumbnail-1-03.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/01-intro/images/thumbnail-1-03.jpg


--------------------------------------------------------------------------------
/01-intro/images/thumbnail-1-04.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/01-intro/images/thumbnail-1-04.jpg


--------------------------------------------------------------------------------
/01-intro/images/thumbnail-1-05.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/01-intro/images/thumbnail-1-05.jpg


--------------------------------------------------------------------------------
/01-intro/meta.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "module": {
 3 |     "number": 1,
 4 |     "title": "Introduction"
 5 |   },
 6 |   "units": [
 7 |     {
 8 |       "number": 1,
 9 |       "title": "Introduction",
10 |       "youtube": "https://www.youtube.com/watch?v=s0uaFZSzwfI"
11 |     },
12 |     {
13 |       "number": 2,
14 |       "title": "Environment preparation",
15 |       "youtube": "https://www.youtube.com/watch?v=IXSiYkP23zo"
16 |     },
17 |     {
18 |       "number": 3,
19 |       "title": "(Optional) Training a ride duration prediction model",
20 |       "youtube": "https://www.youtube.com/watch?v=iRunifGSHFc"
21 |     },
22 |     {
23 |       "number": 4,
24 |       "title": "Course overview",
25 |       "youtube": "https://www.youtube.com/watch?v=teP9KWkP6SM"
26 |     },
27 |     {
28 |       "number": 5,
29 |       "title": "MLOps maturity model",
30 |       "youtube": "https://www.youtube.com/watch?v=XwTH8BDGzYk"
31 |     },
32 |     {
33 |       "number": 6,
34 |       "title": "Homework",
35 |       "youtube": ""
36 |     }
37 |   ]
38 | }


--------------------------------------------------------------------------------
/02-experiment-tracking/images/db_configuration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/db_configuration.png


--------------------------------------------------------------------------------
/02-experiment-tracking/images/db_password.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/db_password.png


--------------------------------------------------------------------------------
/02-experiment-tracking/images/db_settings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/db_settings.png


--------------------------------------------------------------------------------
/02-experiment-tracking/images/ec2_instance_type.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/ec2_instance_type.png


--------------------------------------------------------------------------------
/02-experiment-tracking/images/ec2_os.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/ec2_os.png


--------------------------------------------------------------------------------
/02-experiment-tracking/images/key_pair.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/key_pair.png


--------------------------------------------------------------------------------
/02-experiment-tracking/images/postgresql.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/postgresql.png


--------------------------------------------------------------------------------
/02-experiment-tracking/images/postgresql_inbound_rule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/postgresql_inbound_rule.png


--------------------------------------------------------------------------------
/02-experiment-tracking/images/s3_bucket.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/s3_bucket.png


--------------------------------------------------------------------------------
/02-experiment-tracking/images/security_group.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/security_group.png


--------------------------------------------------------------------------------
/02-experiment-tracking/images/select_key_pair.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/select_key_pair.png


--------------------------------------------------------------------------------
/02-experiment-tracking/images/thumbnail-2-01.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/thumbnail-2-01.jpg


--------------------------------------------------------------------------------
/02-experiment-tracking/images/thumbnail-2-02.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/thumbnail-2-02.jpg


--------------------------------------------------------------------------------
/02-experiment-tracking/images/thumbnail-2-03.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/thumbnail-2-03.jpg


--------------------------------------------------------------------------------
/02-experiment-tracking/images/thumbnail-2-04.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/thumbnail-2-04.jpg


--------------------------------------------------------------------------------
/02-experiment-tracking/images/thumbnail-2-05.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/thumbnail-2-05.jpg


--------------------------------------------------------------------------------
/02-experiment-tracking/images/thumbnail-2-06.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/thumbnail-2-06.jpg


--------------------------------------------------------------------------------
/02-experiment-tracking/images/thumbnail-2-07.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/thumbnail-2-07.jpg


--------------------------------------------------------------------------------
/02-experiment-tracking/meta.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "module": {
 3 |     "number": 2,
 4 |     "title": "Experiment tracking and model management"
 5 |   },
 6 |   "units": [
 7 |     {
 8 |       "number": 1,
 9 |       "title": "Experiment tracking intro",
10 |       "youtube": "https://www.youtube.com/watch?v=MiA7LQin9c8"
11 |     },
12 |     {
13 |       "number": 2,
14 |       "title": "Getting started with MLflow",
15 |       "youtube": "https://www.youtube.com/watch?v=cESCQE9J3ZE"
16 |     },
17 |     {
18 |       "number": 3,
19 |       "title": "Experiment tracking with MLflow",
20 |       "youtube": "https://www.youtube.com/watch?v=iaJz-T7VWec"
21 |     },
22 |     {
23 |       "number": 4,
24 |       "title": "Model management",
25 |       "youtube": "https://www.youtube.com/watch?v=OVUPIX88q88"
26 |     },
27 |     {
28 |       "number": 5,
29 |       "title": "Model registry",
30 |       "youtube": "https://www.youtube.com/watch?v=TKHU7HAvGH8"
31 |     },
32 |     {
33 |       "number": 6,
34 |       "title": "MLflow in practice",
35 |       "youtube": "https://www.youtube.com/watch?v=1ykg4YmbFVA"
36 |     },
37 |     {
38 |       "number": 7,
39 |       "title": "MLflow: benefits, limitations and alternatives",
40 |       "youtube": "https://www.youtube.com/watch?v=Lugy1JPsBRY"
41 |     },
42 |     {
43 |       "number": 8,
44 |       "title": "Homework",
45 |       "youtube": ""
46 |     }
47 |   ]
48 | }


--------------------------------------------------------------------------------
/02-experiment-tracking/requirements.txt:
--------------------------------------------------------------------------------
1 | mlflow
2 | jupyter
3 | scikit-learn
4 | pandas
5 | seaborn
6 | hyperopt
7 | xgboost
8 | fastparquet
9 | boto3


--------------------------------------------------------------------------------
/03-orchestration/code/commands.md:
--------------------------------------------------------------------------------
 1 | ```bash
 2 | pip install mlflow jupyter pandas numpy scikit-learn xgboost hyperopt 
 3 | wget https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/refs/heads/main/02-experiment-tracking/duration-prediction.ipynb
 4 | 
 5 | 
 6 | jupyter notebook
 7 | 
 8 | mlflow server \
 9 |     --backend-store-uri sqlite:///mlflow.db
10 | ```
11 | 
12 | 
13 | ```python
14 | import mlflow
15 | 
16 | mlflow.set_tracking_uri("http://localhost:5000")
17 | mlflow.set_experiment("nyc-taxi-experiment")
18 | ```
19 | 
20 | ```python
21 | URL = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet'
22 | ```


--------------------------------------------------------------------------------
/04-deployment/batch/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | scikit-learn = "==1.0.2"
 8 | prefect = "==2.0b6"
 9 | mlflow = "*"
10 | pandas = "*"
11 | boto3 = "*"
12 | pyarrow = "*"
13 | s3fs = "*"
14 | 
15 | [dev-packages]
16 | 
17 | [requires]
18 | python_version = "3.9"
19 | 


--------------------------------------------------------------------------------
/04-deployment/batch/README.md:
--------------------------------------------------------------------------------
1 | ## Batch deployment
2 | 
3 | * Turn the notebook for training a model into a notebook for applying the model
4 | * Turn the notebook into a script 
5 | * Clean it and parametrize
6 | 


--------------------------------------------------------------------------------
/04-deployment/batch/score_backfill.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | from dateutil.relativedelta import relativedelta
 3 | 
 4 | from prefect import flow
 5 | 
 6 | import score
 7 | 
 8 | 
 9 | @flow
10 | def ride_duration_prediction_backfill():
11 |     start_date = datetime(year=2021, month=3, day=1)
12 |     end_date = datetime(year=2022, month=4, day=1)
13 | 
14 |     d = start_date
15 | 
16 |     while d <= end_date:
17 |         score.ride_duration_prediction(
18 |             taxi_type='green',
19 |             run_id='e1efc53e9bd149078b0c12aeaa6365df',
20 |             run_date=d
21 |         )
22 | 
23 |         d = d + relativedelta(months=1)
24 | 
25 | 
26 | if __name__ == '__main__':
27 |     ride_duration_prediction_backfill()


--------------------------------------------------------------------------------
/04-deployment/batch/score_deploy.py:
--------------------------------------------------------------------------------
 1 | from prefect.deployments import Deployment
 2 | from prefect.orion.schemas.schedules import CronSchedule
 3 | from score import ride_duration_prediction
 4 | 
 5 | deployment = Deployment.build_from_flow(
 6 |     flow=ride_duration_prediction,
 7 |     name="ride_duration_prediction",
 8 |     parameters={
 9 |         "taxi_type": "green",
10 |         "run_id": "e1efc53e9bd149078b0c12aeaa6365df",
11 |     },
12 |     schedule=CronSchedule(cron="0 3 2 * *"),
13 |     work_queue_name="ml",
14 | )
15 | 
16 | deployment.apply()
17 | 


--------------------------------------------------------------------------------
/04-deployment/images/thumbnail-4-01.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/04-deployment/images/thumbnail-4-01.jpg


--------------------------------------------------------------------------------
/04-deployment/images/thumbnail-4-02.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/04-deployment/images/thumbnail-4-02.jpg


--------------------------------------------------------------------------------
/04-deployment/images/thumbnail-4-03.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/04-deployment/images/thumbnail-4-03.jpg


--------------------------------------------------------------------------------
/04-deployment/images/thumbnail-4-04.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/04-deployment/images/thumbnail-4-04.jpg


--------------------------------------------------------------------------------
/04-deployment/images/thumbnail-4-05.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/04-deployment/images/thumbnail-4-05.jpg


--------------------------------------------------------------------------------
/04-deployment/images/thumbnail-4-06.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/04-deployment/images/thumbnail-4-06.jpg


--------------------------------------------------------------------------------
/04-deployment/meta.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "module": {
 3 |     "number": 4,
 4 |     "title": "Model Deployment"
 5 |   },
 6 |   "units": [
 7 |     {
 8 |       "number": 1,
 9 |       "title": "Three ways of deploying a model",
10 |       "youtube": "https://www.youtube.com/watch?v=JMGe4yIoBRA"
11 |     },
12 |     {
13 |       "number": 2,
14 |       "title": "Web-services: Deploying models with Flask and Docker",
15 |       "youtube": "https://www.youtube.com/watch?v=D7wfMAdgdF8"
16 |     },
17 |     {
18 |       "number": 3,
19 |       "title": "Web-services: Getting the models from the model registry (MLflow)",
20 |       "youtube": "https://www.youtube.com/watch?v=aewOpHSCkqI"
21 |     },
22 |     {
23 |       "number": 4,
24 |       "title": "(Optional) Streaming: Deploying models with Kinesis and Lambda ",
25 |       "youtube": "https://www.youtube.com/watch?v=TCqr9HNcrsI"
26 |     },
27 |     {
28 |       "number": 5,
29 |       "title": "Batch: Preparing a scoring script",
30 |       "youtube": "https://www.youtube.com/watch?v=18Lbaaeigek"
31 |     },
32 |     {
33 |       "number": 6,
34 |       "title": "MLOps Zoomcamp 4.6 - Batch: Scheduling batch scoring jobs with Prefect",
35 |       "youtube": "https://www.youtube.com/watch?v=ekT_JW213Tc"
36 |     },
37 |     {
38 |       "number": 7,
39 |       "title": "Homework",
40 |       "youtube": ""
41 |     }
42 |   ]
43 | }


--------------------------------------------------------------------------------
/04-deployment/streaming/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM public.ecr.aws/lambda/python:3.9
 2 | 
 3 | RUN pip install -U pip
 4 | RUN pip install pipenv 
 5 | 
 6 | COPY [ "Pipfile", "Pipfile.lock", "./" ]
 7 | 
 8 | RUN pipenv install --system --deploy
 9 | 
10 | COPY [ "lambda_function.py", "./" ]
11 | 
12 | CMD [ "lambda_function.lambda_handler" ]
13 | 


--------------------------------------------------------------------------------
/04-deployment/streaming/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | boto3 = "*"
 8 | mlflow = "*"
 9 | scikit-learn = "==1.0.2"
10 | 
11 | [dev-packages]
12 | 
13 | [requires]
14 | python_version = "3.9"
15 | 


--------------------------------------------------------------------------------
/04-deployment/streaming/lambda_function.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | import boto3
 4 | import base64
 5 | 
 6 | import mlflow
 7 | 
 8 | kinesis_client = boto3.client('kinesis')
 9 | 
10 | PREDICTIONS_STREAM_NAME = os.getenv('PREDICTIONS_STREAM_NAME', 'ride_predictions')
11 | 
12 | 
13 | RUN_ID = os.getenv('RUN_ID')
14 | 
15 | logged_model = f's3://mlflow-models-alexey/1/{RUN_ID}/artifacts/model'
16 | # logged_model = f'runs:/{RUN_ID}/model'
17 | model = mlflow.pyfunc.load_model(logged_model)
18 | 
19 | 
20 | TEST_RUN = os.getenv('TEST_RUN', 'False') == 'True'
21 | 
22 | def prepare_features(ride):
23 |     features = {}
24 |     features['PU_DO'] = '%s_%s' % (ride['PULocationID'], ride['DOLocationID'])
25 |     features['trip_distance'] = ride['trip_distance']
26 |     return features
27 | 
28 | 
29 | def predict(features):
30 |     pred = model.predict(features)
31 |     return float(pred[0])
32 | 
33 | 
34 | def lambda_handler(event, context):
35 |     # print(json.dumps(event))
36 |     
37 |     predictions_events = []
38 |     
39 |     for record in event['Records']:
40 |         encoded_data = record['kinesis']['data']
41 |         decoded_data = base64.b64decode(encoded_data).decode('utf-8')
42 |         ride_event = json.loads(decoded_data)
43 | 
44 |         # print(ride_event)
45 |         ride = ride_event['ride']
46 |         ride_id = ride_event['ride_id']
47 |     
48 |         features = prepare_features(ride)
49 |         prediction = predict(features)
50 |     
51 |         prediction_event = {
52 |             'model': 'ride_duration_prediction_model',
53 |             'version': '123',
54 |             'prediction': {
55 |                 'ride_duration': prediction,
56 |                 'ride_id': ride_id   
57 |             }
58 |         }
59 | 
60 |         if not TEST_RUN:
61 |             kinesis_client.put_record(
62 |                 StreamName=PREDICTIONS_STREAM_NAME,
63 |                 Data=json.dumps(prediction_event),
64 |                 PartitionKey=str(ride_id)
65 |             )
66 |         
67 |         predictions_events.append(prediction_event)
68 | 
69 | 
70 |     return {
71 |         'predictions': predictions_events
72 |     }
73 | 


--------------------------------------------------------------------------------
/04-deployment/streaming/test.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import lambda_function
 3 | 
 4 | event = {
 5 |     "Records": [
 6 |         {
 7 |             "kinesis": {
 8 |                 "kinesisSchemaVersion": "1.0",
 9 |                 "partitionKey": "1",
10 |                 "sequenceNumber": "49630081666084879290581185630324770398608704880802529282",
11 |                 "data": "ewogICAgICAgICJyaWRlIjogewogICAgICAgICAgICAiUFVMb2NhdGlvbklEIjogMTMwLAogICAgICAgICAgICAiRE9Mb2NhdGlvbklEIjogMjA1LAogICAgICAgICAgICAidHJpcF9kaXN0YW5jZSI6IDMuNjYKICAgICAgICB9LCAKICAgICAgICAicmlkZV9pZCI6IDI1NgogICAgfQ==",
12 |                 "approximateArrivalTimestamp": 1654161514.132
13 |             },
14 |             "eventSource": "aws:kinesis",
15 |             "eventVersion": "1.0",
16 |             "eventID": "shardId-000000000000:49630081666084879290581185630324770398608704880802529282",
17 |             "eventName": "aws:kinesis:record",
18 |             "invokeIdentityArn": "arn:aws:iam::387546586013:role/lambda-kinesis-role",
19 |             "awsRegion": "eu-west-1",
20 |             "eventSourceARN": "arn:aws:kinesis:eu-west-1:387546586013:stream/ride_events"
21 |         }
22 |     ]
23 | }
24 | 
25 | 
26 | result = lambda_function.lambda_handler(event, None)
27 | print(result)
28 | 


--------------------------------------------------------------------------------
/04-deployment/streaming/test_docker.py:
--------------------------------------------------------------------------------
 1 | import requests 
 2 | 
 3 | event = {
 4 |     "Records": [
 5 |         {
 6 |             "kinesis": {
 7 |                 "kinesisSchemaVersion": "1.0",
 8 |                 "partitionKey": "1",
 9 |                 "sequenceNumber": "49630081666084879290581185630324770398608704880802529282",
10 |                 "data": "ewogICAgICAgICJyaWRlIjogewogICAgICAgICAgICAiUFVMb2NhdGlvbklEIjogMTMwLAogICAgICAgICAgICAiRE9Mb2NhdGlvbklEIjogMjA1LAogICAgICAgICAgICAidHJpcF9kaXN0YW5jZSI6IDMuNjYKICAgICAgICB9LCAKICAgICAgICAicmlkZV9pZCI6IDI1NgogICAgfQ==",
11 |                 "approximateArrivalTimestamp": 1654161514.132
12 |             },
13 |             "eventSource": "aws:kinesis",
14 |             "eventVersion": "1.0",
15 |             "eventID": "shardId-000000000000:49630081666084879290581185630324770398608704880802529282",
16 |             "eventName": "aws:kinesis:record",
17 |             "invokeIdentityArn": "arn:aws:iam::387546586013:role/lambda-kinesis-role",
18 |             "awsRegion": "eu-west-1",
19 |             "eventSourceARN": "arn:aws:kinesis:eu-west-1:387546586013:stream/ride_events"
20 |         }
21 |     ]
22 | }
23 | 
24 | 
25 | url = 'http://localhost:8080/2015-03-31/functions/function/invocations'
26 | response = requests.post(url, json=event)
27 | print(response.json())
28 | 


--------------------------------------------------------------------------------
/04-deployment/web-service-mlflow/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | scikit-learn = "==1.0.2"
 8 | flask = "*"
 9 | gunicorn = "*"
10 | mlflow = "*"
11 | boto3 = "*"
12 | 
13 | [dev-packages]
14 | requests = "*"
15 | 
16 | [requires]
17 | python_version = "3.9"
18 | 


--------------------------------------------------------------------------------
/04-deployment/web-service-mlflow/README.md:
--------------------------------------------------------------------------------
 1 | ## Getting the model for deployment from MLflow
 2 | 
 3 | * Take the code from the previous video
 4 | * Train another model, register with MLflow
 5 | * Put the model into a scikit-learn pipeline
 6 | * Model deployment with tracking server
 7 | * Model deployment without the tracking server
 8 | 
 9 | Starting the MLflow server with S3:
10 | 
11 | ```bash
12 | mlflow server \
13 |     --backend-store-uri=sqlite:///mlflow.db \
14 |     --default-artifact-root=s3://mlflow-models-alexey/
15 | ```
16 | 
17 | Downloading the artifact
18 | 
19 | ```bash
20 | export MLFLOW_TRACKING_URI="http://127.0.0.1:5000"
21 | export MODEL_RUN_ID="6dd459b11b4e48dc862f4e1019d166f6"
22 | 
23 | mlflow artifacts download \
24 |     --run-id ${MODEL_RUN_ID} \
25 |     --artifact-path model \
26 |     --dst-path .
27 | ```


--------------------------------------------------------------------------------
/04-deployment/web-service-mlflow/dict_vectorizer.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/04-deployment/web-service-mlflow/dict_vectorizer.bin


--------------------------------------------------------------------------------
/04-deployment/web-service-mlflow/predict.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | 
 4 | import mlflow
 5 | from flask import Flask, request, jsonify
 6 | 
 7 | 
 8 | RUN_ID = os.getenv('RUN_ID')
 9 | 
10 | logged_model = f's3://mlflow-models-alexey/1/{RUN_ID}/artifacts/model'
11 | # logged_model = f'runs:/{RUN_ID}/model'
12 | model = mlflow.pyfunc.load_model(logged_model)
13 | 
14 | 
15 | def prepare_features(ride):
16 |     features = {}
17 |     features['PU_DO'] = '%s_%s' % (ride['PULocationID'], ride['DOLocationID'])
18 |     features['trip_distance'] = ride['trip_distance']
19 |     return features
20 | 
21 | 
22 | def predict(features):
23 |     preds = model.predict(features)
24 |     return float(preds[0])
25 | 
26 | 
27 | app = Flask('duration-prediction')
28 | 
29 | 
30 | @app.route('/predict', methods=['POST'])
31 | def predict_endpoint():
32 |     ride = request.get_json()
33 | 
34 |     features = prepare_features(ride)
35 |     pred = predict(features)
36 | 
37 |     result = {
38 |         'duration': pred,
39 |         'model_version': RUN_ID
40 |     }
41 | 
42 |     return jsonify(result)
43 | 
44 | 
45 | if __name__ == "__main__":
46 |     app.run(debug=True, host='0.0.0.0', port=9696)
47 | 


--------------------------------------------------------------------------------
/04-deployment/web-service-mlflow/test.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | 
 3 | ride = {
 4 |     "PULocationID": 10,
 5 |     "DOLocationID": 50,
 6 |     "trip_distance": 40
 7 | }
 8 | 
 9 | url = 'http://localhost:9696/predict'
10 | response = requests.post(url, json=ride)
11 | print(response.json())
12 | 


--------------------------------------------------------------------------------
/04-deployment/web-service/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.9.7-slim
 2 | 
 3 | RUN pip install -U pip
 4 | RUN pip install pipenv 
 5 | 
 6 | WORKDIR /app
 7 | 
 8 | COPY [ "Pipfile", "Pipfile.lock", "./" ]
 9 | 
10 | RUN pipenv install --system --deploy
11 | 
12 | COPY [ "predict.py", "lin_reg.bin", "./" ]
13 | 
14 | EXPOSE 9696
15 | 
16 | ENTRYPOINT [ "gunicorn", "--bind=0.0.0.0:9696", "predict:app" ]


--------------------------------------------------------------------------------
/04-deployment/web-service/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | scikit-learn = "==1.0.2"
 8 | flask = "*"
 9 | gunicorn = "*"
10 | 
11 | [dev-packages]
12 | requests = "*"
13 | 
14 | [requires]
15 | python_version = "3.9"
16 | 


--------------------------------------------------------------------------------
/04-deployment/web-service/README.md:
--------------------------------------------------------------------------------
 1 | ## Deploying a model as a web-service
 2 | 
 3 | * Creating a virtual environment with Pipenv
 4 | * Creating a script for predictiong 
 5 | * Putting the script into a Flask app
 6 | * Packaging the app to Docker
 7 | 
 8 | 
 9 | ```bash
10 | docker build -t ride-duration-prediction-service:v1 .
11 | ```
12 | 
13 | ```bash
14 | docker run -it --rm -p 9696:9696  ride-duration-prediction-service:v1
15 | ```
16 | 


--------------------------------------------------------------------------------
/04-deployment/web-service/lin_reg.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/04-deployment/web-service/lin_reg.bin


--------------------------------------------------------------------------------
/04-deployment/web-service/predict.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | 
 3 | from flask import Flask, request, jsonify
 4 | 
 5 | with open('lin_reg.bin', 'rb') as f_in:
 6 |     (dv, model) = pickle.load(f_in)
 7 | 
 8 | 
 9 | def prepare_features(ride):
10 |     features = {}
11 |     features['PU_DO'] = '%s_%s' % (ride['PULocationID'], ride['DOLocationID'])
12 |     features['trip_distance'] = ride['trip_distance']
13 |     return features
14 | 
15 | 
16 | def predict(features):
17 |     X = dv.transform(features)
18 |     preds = model.predict(X)
19 |     return float(preds[0])
20 | 
21 | 
22 | app = Flask('duration-prediction')
23 | 
24 | 
25 | @app.route('/predict', methods=['POST'])
26 | def predict_endpoint():
27 |     ride = request.get_json()
28 | 
29 |     features = prepare_features(ride)
30 |     pred = predict(features)
31 | 
32 |     result = {
33 |         'duration': pred
34 |     }
35 | 
36 |     return jsonify(result)
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     app.run(debug=True, host='0.0.0.0', port=9696)


--------------------------------------------------------------------------------
/04-deployment/web-service/test.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | 
 3 | ride = {
 4 |     "PULocationID": 10,
 5 |     "DOLocationID": 50,
 6 |     "trip_distance": 40
 7 | }
 8 | 
 9 | url = 'http://localhost:9696/predict'
10 | response = requests.post(url, json=ride)
11 | print(response.json())
12 | 


--------------------------------------------------------------------------------
/05-monitoring/config/grafana_dashboards.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: 1
 2 | 
 3 | providers:
 4 |   # <string> an unique provider name. Required
 5 |   - name: 'Evidently Dashboards'
 6 |     # <int> Org id. Default to 1
 7 |     orgId: 1
 8 |     # <string> name of the dashboard folder.
 9 |     folder: ''
10 |     # <string> folder UID. will be automatically generated if not specified
11 |     folderUid: ''
12 |     # <string> provider type. Default to 'file'
13 |     type: file
14 |     # <bool> disable dashboard deletion
15 |     disableDeletion: false
16 |     # <int> how often Grafana will scan for changed dashboards
17 |     updateIntervalSeconds: 10
18 |     # <bool> allow updating provisioned dashboards from the UI
19 |     allowUiUpdates: false
20 |     options:
21 |       # <string, required> path to dashboard files on disk. Required when using the 'file' type
22 |       path: /opt/grafana/dashboards
23 |       # <bool> use folder names from filesystem to create folders in Grafana
24 |       foldersFromFilesStructure: true


--------------------------------------------------------------------------------
/05-monitoring/config/grafana_datasources.yaml:
--------------------------------------------------------------------------------
 1 | # config file version
 2 | apiVersion: 1
 3 | 
 4 | # list of datasources to insert/update
 5 | # available in the database
 6 | datasources:
 7 |   - name: PostgreSQL
 8 |     type: postgres
 9 |     access: proxy
10 |     url: db:5432
11 |     database: test
12 |     user: postgres
13 |     secureJsonData:
14 |       password: 'example'
15 |     jsonData:
16 |       sslmode: 'disable'
17 |       database: test
18 | 


--------------------------------------------------------------------------------
/05-monitoring/data/.gitignore:
--------------------------------------------------------------------------------
1 | !.gitignore
2 | 


--------------------------------------------------------------------------------
/05-monitoring/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '3.7'
 2 | 
 3 | volumes: 
 4 |   grafana_data: {}
 5 | 
 6 | networks:
 7 |   front-tier:
 8 |   back-tier:
 9 | 
10 | services:
11 |   db:
12 |     image: postgres
13 |     restart: always
14 |     environment:
15 |       POSTGRES_PASSWORD: example
16 |     ports:
17 |       - "5432:5432"
18 |     networks:
19 |       - back-tier
20 | 
21 |   adminer:
22 |     image: adminer
23 |     restart: always
24 |     ports:
25 |       - "8080:8080"
26 |     networks:
27 |       - back-tier
28 |       - front-tier  
29 | 
30 |   grafana:
31 |     image: grafana/grafana-enterprise
32 |     user: "472"
33 |     ports:
34 |       - "3000:3000"
35 |     volumes:
36 |       - ./config/grafana_datasources.yaml:/etc/grafana/provisioning/datasources/datasource.yaml:ro
37 |       - ./config/grafana_dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:ro
38 |       - ./dashboards:/opt/grafana/dashboards
39 |     networks:
40 |       - back-tier
41 |       - front-tier
42 |     restart: always


--------------------------------------------------------------------------------
/05-monitoring/dummy_metrics_calculation.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | import time
 3 | import random
 4 | import logging 
 5 | import uuid
 6 | import pytz
 7 | import pandas as pd
 8 | import io
 9 | import psycopg
10 | 
11 | logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]: %(message)s")
12 | 
13 | SEND_TIMEOUT = 10
14 | rand = random.Random()
15 | 
16 | create_table_statement = """
17 | drop table if exists dummy_metrics;
18 | create table dummy_metrics(
19 | 	timestamp timestamp,
20 | 	value1 integer,
21 | 	value2 varchar,
22 | 	value3 float
23 | )
24 | """
25 | 
26 | def prep_db():
27 | 	with psycopg.connect("host=localhost port=5432 user=postgres password=example", autocommit=True) as conn:
28 | 		res = conn.execute("SELECT 1 FROM pg_database WHERE datname='test'")
29 | 		if len(res.fetchall()) == 0:
30 | 			conn.execute("create database test;")
31 | 		with psycopg.connect("host=localhost port=5432 dbname=test user=postgres password=example") as conn:
32 | 			conn.execute(create_table_statement)
33 | 
34 | def calculate_dummy_metrics_postgresql(curr):
35 | 	value1 = rand.randint(0, 1000)
36 | 	value2 = str(uuid.uuid4())
37 | 	value3 = rand.random()
38 | 
39 | 	curr.execute(
40 | 		"insert into dummy_metrics(timestamp, value1, value2, value3) values (%s, %s, %s, %s)",
41 | 		(datetime.datetime.now(pytz.timezone('Europe/London')), value1, value2, value3)
42 | 	)
43 | 
44 | def main():
45 | 	prep_db()
46 | 	last_send = datetime.datetime.now() - datetime.timedelta(seconds=10)
47 | 	with psycopg.connect("host=localhost port=5432 dbname=test user=postgres password=example", autocommit=True) as conn:
48 | 		for i in range(0, 100):
49 | 			with conn.cursor() as curr:
50 | 				calculate_dummy_metrics_postgresql(curr)
51 | 
52 | 			new_send = datetime.datetime.now()
53 | 			seconds_elapsed = (new_send - last_send).total_seconds()
54 | 			if seconds_elapsed < SEND_TIMEOUT:
55 | 				time.sleep(SEND_TIMEOUT - seconds_elapsed)
56 | 			while last_send < new_send:
57 | 				last_send = last_send + datetime.timedelta(seconds=10)
58 | 			logging.info("data sent")
59 | 
60 | if __name__ == '__main__':
61 | 	main()


--------------------------------------------------------------------------------
/05-monitoring/images/thumbnail-5-01.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/05-monitoring/images/thumbnail-5-01.jpg


--------------------------------------------------------------------------------
/05-monitoring/images/thumbnail-5-02.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/05-monitoring/images/thumbnail-5-02.jpg


--------------------------------------------------------------------------------
/05-monitoring/images/thumbnail-5-03.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/05-monitoring/images/thumbnail-5-03.jpg


--------------------------------------------------------------------------------
/05-monitoring/images/thumbnail-5-04.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/05-monitoring/images/thumbnail-5-04.jpg


--------------------------------------------------------------------------------
/05-monitoring/images/thumbnail-5-05.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/05-monitoring/images/thumbnail-5-05.jpg


--------------------------------------------------------------------------------
/05-monitoring/images/thumbnail-5-06.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/05-monitoring/images/thumbnail-5-06.jpg


--------------------------------------------------------------------------------
/05-monitoring/images/thumbnail-5-07.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/05-monitoring/images/thumbnail-5-07.jpg


--------------------------------------------------------------------------------
/05-monitoring/images/thumbnail-5-08.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/05-monitoring/images/thumbnail-5-08.jpg


--------------------------------------------------------------------------------
/05-monitoring/meta.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "module": {
 3 |     "number": 5,
 4 |     "title": "ML Monitoring"
 5 |   },
 6 |   "units": [
 7 |     {
 8 |       "number": 1,
 9 |       "title": "Intro to ML monitoring",
10 |       "youtube": "https://www.youtube.com/watch?v=SQ0jBwd_3kk"
11 |     },
12 |     {
13 |       "number": 2,
14 |       "title": "Environment setup",
15 |       "youtube": "https://www.youtube.com/watch?v=yixA3C1xSxc"
16 |     },
17 |     {
18 |       "number": 3,
19 |       "title": "Prepare reference and model",
20 |       "youtube": "https://www.youtube.com/watch?v=IjNrkqMYQeQ"
21 |     },
22 |     {
23 |       "number": 4,
24 |       "title": "Evidently metrics calculation",
25 |       "youtube": "https://www.youtube.com/watch?v=kP3lzh_HfWY"
26 |     },
27 |     {
28 |       "number": 5,
29 |       "title": "Dummy monitoring",
30 |       "youtube": "https://www.youtube.com/watch?v=s3G4PMsOMOA"
31 |     },
32 |     {
33 |       "number": 6,
34 |       "title": "Data quality monitoring",
35 |       "youtube": "https://www.youtube.com/watch?v=fytrmPbcLhI"
36 |     },
37 |     {
38 |       "number": 7,
39 |       "title": "Save Grafana Dashboard",
40 |       "youtube": "https://www.youtube.com/watch?v=-c4iumyZMyw"
41 |     },
42 |     {
43 |       "number": 8,
44 |       "title": "Debugging with test suites and reports",
45 |       "youtube": "https://www.youtube.com/watch?v=sNSk3ojISh8"
46 |     }
47 |   ]
48 | }


--------------------------------------------------------------------------------
/05-monitoring/models/.gitignore:
--------------------------------------------------------------------------------
1 | !.gitignore
2 | 


--------------------------------------------------------------------------------
/05-monitoring/requirements.txt:
--------------------------------------------------------------------------------
 1 | prefect
 2 | tqdm
 3 | requests
 4 | joblib
 5 | pyarrow
 6 | psycopg
 7 | psycopg_binary
 8 | evidently==0.6.7
 9 | pandas
10 | numpy
11 | scikit-learn
12 | jupyter
13 | matplotlib
14 | 


--------------------------------------------------------------------------------
/06-best-practices/AWS-stream-pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/06-best-practices/AWS-stream-pipeline.png


--------------------------------------------------------------------------------
/06-best-practices/ci_cd_zoomcamp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/06-best-practices/ci_cd_zoomcamp.png


--------------------------------------------------------------------------------
/06-best-practices/code/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | 


--------------------------------------------------------------------------------
/06-best-practices/code/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | # See https://pre-commit.com for more information
 2 | # See https://pre-commit.com/hooks.html for more hooks
 3 | repos:
 4 | - repo: https://github.com/pre-commit/pre-commit-hooks
 5 |   rev: v3.2.0
 6 |   hooks:
 7 |     - id: trailing-whitespace
 8 |     - id: end-of-file-fixer
 9 |     - id: check-yaml
10 |     - id: check-added-large-files
11 | - repo: https://github.com/pycqa/isort
12 |   rev: 5.10.1
13 |   hooks:
14 |     - id: isort
15 |       name: isort (python)
16 | - repo: https://github.com/psf/black
17 |   rev: 22.6.0
18 |   hooks:
19 |     - id: black
20 |       language_version: python3.9
21 | - repo: local
22 |   hooks:
23 |     - id: pylint
24 |       name: pylint
25 |       entry: pylint
26 |       language: system
27 |       types: [python]
28 |       args: [
29 |         "-rn", # Only display messages
30 |         "-sn", # Don't display the score
31 |         "--recursive=y"
32 |       ]
33 | - repo: local
34 |   hooks:
35 |     - id: pytest-check
36 |       name: pytest-check
37 |       entry: pytest
38 |       language: system
39 |       pass_filenames: false
40 |       always_run: true
41 |       args: [
42 |         "tests/"
43 |       ]
44 | 


--------------------------------------------------------------------------------
/06-best-practices/code/.vscode/settings.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "python.testing.pytestArgs": [
 3 |         "tests"
 4 |     ],
 5 |     "python.testing.unittestEnabled": false,
 6 |     "python.testing.pytestEnabled": true,
 7 |     "python.linting.pylintEnabled": true,
 8 |     "python.linting.enabled": true
 9 | }
10 | 


--------------------------------------------------------------------------------
/06-best-practices/code/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM public.ecr.aws/lambda/python:3.9
 2 | 
 3 | RUN pip install -U pip
 4 | RUN pip install pipenv
 5 | 
 6 | COPY [ "Pipfile", "Pipfile.lock", "./" ]
 7 | 
 8 | RUN pipenv install --system --deploy
 9 | 
10 | COPY [ "lambda_function.py", "model.py", "./" ]
11 | 
12 | CMD [ "lambda_function.lambda_handler" ]
13 | 


--------------------------------------------------------------------------------
/06-best-practices/code/Makefile:
--------------------------------------------------------------------------------
 1 | LOCAL_TAG:=$(shell date +"%Y-%m-%d-%H-%M")
 2 | LOCAL_IMAGE_NAME:=stream-model-duration:${LOCAL_TAG}
 3 | 
 4 | test:
 5 | 	pytest tests/
 6 | 
 7 | quality_checks:
 8 | 	isort .
 9 | 	black .
10 | 	pylint --recursive=y .
11 | 
12 | build: quality_checks test
13 | 	docker build -t ${LOCAL_IMAGE_NAME} .
14 | 
15 | integration_test: build
16 | 	LOCAL_IMAGE_NAME=${LOCAL_IMAGE_NAME} bash integraton-test/run.sh
17 | 
18 | publish: build integration_test
19 | 	LOCAL_IMAGE_NAME=${LOCAL_IMAGE_NAME} bash scripts/publish.sh
20 | 
21 | setup:
22 | 	pipenv install --dev
23 | 	pre-commit install


--------------------------------------------------------------------------------
/06-best-practices/code/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | boto3 = "*"
 8 | mlflow = "*"
 9 | scikit-learn = "==1.0.2"
10 | 
11 | [dev-packages]
12 | pytest = "*"
13 | deepdiff = "*"
14 | pylint = "==2.14.4"
15 | black = "*"
16 | isort = "*"
17 | pre-commit = "*"
18 | 
19 | [requires]
20 | python_version = "3.9"
21 | 


--------------------------------------------------------------------------------
/06-best-practices/code/infrastructure/main.tf:
--------------------------------------------------------------------------------
 1 | # Make sure to create state bucket beforehand
 2 | terraform {
 3 |   required_version = ">= 1.0"
 4 |   backend "s3" {
 5 |     bucket  = "tf-state-mlops-zoomcamp"
 6 |     key     = "mlops-zoomcamp-stg.tfstate"
 7 |     region  = "eu-west-1"
 8 |     encrypt = true
 9 |   }
10 | }
11 | 
12 | provider "aws" {
13 |   region = var.aws_region
14 | }
15 | 
16 | data "aws_caller_identity" "current_identity" {}
17 | 
18 | locals {
19 |   account_id = data.aws_caller_identity.current_identity.account_id
20 | }
21 | 
22 | # ride_events
23 | module "source_kinesis_stream" {
24 |   source = "./modules/kinesis"
25 |   retention_period = 48
26 |   shard_count = 2
27 |   stream_name = "${var.source_stream_name}-${var.project_id}"
28 |   tags = var.project_id
29 | }
30 | 
31 | # ride_predictions
32 | module "output_kinesis_stream" {
33 |   source = "./modules/kinesis"
34 |   retention_period = 48
35 |   shard_count = 2
36 |   stream_name = "${var.output_stream_name}-${var.project_id}"
37 |   tags = var.project_id
38 | }
39 | 
40 | # model bucket
41 | module "s3_bucket" {
42 |   source = "./modules/s3"
43 |   bucket_name = "${var.model_bucket}-${var.project_id}"
44 | }
45 | 
46 | # image registry
47 | module "ecr_image" {
48 |    source = "./modules/ecr"
49 |    ecr_repo_name = "${var.ecr_repo_name}_${var.project_id}"
50 |    account_id = local.account_id
51 |    lambda_function_local_path = var.lambda_function_local_path
52 |    docker_image_local_path = var.docker_image_local_path
53 | }
54 | 
55 | module "lambda_function" {
56 |   source = "./modules/lambda"
57 |   image_uri = module.ecr_image.image_uri
58 |   lambda_function_name = "${var.lambda_function_name}_${var.project_id}"
59 |   model_bucket = module.s3_bucket.name
60 |   output_stream_name = "${var.output_stream_name}-${var.project_id}"
61 |   output_stream_arn = module.output_kinesis_stream.stream_arn
62 |   source_stream_name = "${var.source_stream_name}-${var.project_id}"
63 |   source_stream_arn = module.source_kinesis_stream.stream_arn
64 | }
65 | 
66 | # For CI/CD
67 | output "lambda_function" {
68 |   value     = "${var.lambda_function_name}_${var.project_id}"
69 | }
70 | 
71 | output "model_bucket" {
72 |   value = module.s3_bucket.name
73 | }
74 | 
75 | output "predictions_stream_name" {
76 |   value     = "${var.output_stream_name}-${var.project_id}"
77 | }
78 | 
79 | output "ecr_repo" {
80 |   value = "${var.ecr_repo_name}_${var.project_id}"
81 | }
82 | 


--------------------------------------------------------------------------------
/06-best-practices/code/infrastructure/modules/ecr/main.tf:
--------------------------------------------------------------------------------
 1 | resource "aws_ecr_repository" "repo" {
 2 |   name                 = var.ecr_repo_name
 3 |   image_tag_mutability = "MUTABLE"
 4 | 
 5 |   image_scanning_configuration {
 6 |     scan_on_push = false
 7 |   }
 8 | 
 9 |   force_delete = true
10 | }
11 | 
12 | # In practice, the Image build-and-push step is handled separately by the CI/CD pipeline and not the IaC script.
13 | # But because the lambda config would fail without an existing Image URI in ECR,
14 | # we can also upload any base image to bootstrap the lambda config, unrelated to your Inference logic
15 | resource null_resource ecr_image {
16 |    triggers = {
17 |      python_file = md5(file(var.lambda_function_local_path))
18 |      docker_file = md5(file(var.docker_image_local_path))
19 |    }
20 | 
21 |    provisioner "local-exec" {
22 |      command = <<EOF
23 |              aws ecr get-login-password --region ${var.region} | docker login --username AWS --password-stdin ${var.account_id}.dkr.ecr.${var.region}.amazonaws.com
24 |              cd ../
25 |              docker build -t ${aws_ecr_repository.repo.repository_url}:${var.ecr_image_tag} .
26 |              docker push ${aws_ecr_repository.repo.repository_url}:${var.ecr_image_tag}
27 |          EOF
28 |    }
29 | }
30 | 
31 | // Wait for the image to be uploaded, before lambda config runs
32 | data aws_ecr_image lambda_image {
33 |  depends_on = [
34 |    null_resource.ecr_image
35 |  ]
36 |  repository_name = var.ecr_repo_name
37 |  image_tag       = var.ecr_image_tag
38 | }
39 | 
40 | output "image_uri" {
41 |   value     = "${aws_ecr_repository.repo.repository_url}:${data.aws_ecr_image.lambda_image.image_tag}"
42 | }
43 | 


--------------------------------------------------------------------------------
/06-best-practices/code/infrastructure/modules/ecr/variables.tf:
--------------------------------------------------------------------------------
 1 | variable "ecr_repo_name" {
 2 |     type        = string
 3 |     description = "ECR repo name"
 4 | }
 5 | 
 6 | variable "ecr_image_tag" {
 7 |     type        = string
 8 |     description = "ECR repo name"
 9 |     default = "latest"
10 | }
11 | 
12 | variable "lambda_function_local_path" {
13 |     type        = string
14 |     description = "Local path to lambda function / python file"
15 | }
16 | 
17 | variable "docker_image_local_path" {
18 |     type        = string
19 |     description = "Local path to Dockerfile"
20 | }
21 | 
22 | variable "region" {
23 |     type        = string
24 |     description = "region"
25 |     default = "eu-west-1"
26 | }
27 | 
28 | variable "account_id" {
29 | }
30 | 


--------------------------------------------------------------------------------
/06-best-practices/code/infrastructure/modules/kinesis/main.tf:
--------------------------------------------------------------------------------
 1 | # Create Kinesis Data Stream
 2 | 
 3 | resource "aws_kinesis_stream" "stream" {
 4 |   name             = var.stream_name
 5 |   shard_count      = var.shard_count
 6 |   retention_period = var.retention_period
 7 |   shard_level_metrics = var.shard_level_metrics
 8 |   tags = {
 9 |     CreatedBy = var.tags
10 |   }
11 | }
12 | 
13 | output "stream_arn" {
14 |   value = aws_kinesis_stream.stream.arn
15 | }
16 | 


--------------------------------------------------------------------------------
/06-best-practices/code/infrastructure/modules/kinesis/variables.tf:
--------------------------------------------------------------------------------
 1 | variable "stream_name" {
 2 |     type        = string
 3 |     description = "Kinesis stream name"
 4 | }
 5 | 
 6 | variable "shard_count" {
 7 |     type        = number
 8 |     description = "Kinesis stream shard count"
 9 | }
10 | 
11 | variable "retention_period" {
12 |     type        = number
13 |     description = "Kinesis stream retention period"
14 | }
15 | 
16 | variable "shard_level_metrics" {
17 |     type        = list(string)
18 |     description = "shard_level_metrics"
19 |     default     = [
20 |     "IncomingBytes",
21 |     "OutgoingBytes",
22 |     "OutgoingRecords",
23 |     "ReadProvisionedThroughputExceeded",
24 |     "WriteProvisionedThroughputExceeded",
25 |     "IncomingRecords",
26 |     "IteratorAgeMilliseconds",
27 |   ]
28 | }
29 | 
30 | variable "tags" {
31 |   description = "Tags for kinesis stream"
32 |     default = "mlops-zoomcamp"
33 | }
34 | 


--------------------------------------------------------------------------------
/06-best-practices/code/infrastructure/modules/lambda/main.tf:
--------------------------------------------------------------------------------
 1 | resource "aws_lambda_function" "kinesis_lambda" {
 2 |   function_name = var.lambda_function_name
 3 |   # This can also be any base image to bootstrap the lambda config, unrelated to your Inference service on ECR
 4 |   # which would be anyway updated regularly via a CI/CD pipeline
 5 |   image_uri = var.image_uri   # required-argument
 6 |   package_type = "Image"
 7 |   role          = aws_iam_role.iam_lambda.arn
 8 |   tracing_config {
 9 |     mode = "Active"
10 |   }
11 |   // This step is optional (environment)
12 |   environment {
13 |     variables = {
14 |       PREDICTIONS_STREAM_NAME = var.output_stream_name
15 |       MODEL_BUCKET = var.model_bucket
16 |     }
17 |   }
18 |   timeout = 180
19 | }
20 | 
21 | # Lambda Invoke & Event Source Mapping:
22 | 
23 | resource "aws_lambda_function_event_invoke_config" "kinesis_lambda_event" {
24 |   function_name                = aws_lambda_function.kinesis_lambda.function_name
25 |   maximum_event_age_in_seconds = 60
26 |   maximum_retry_attempts       = 0
27 | }
28 | 
29 | resource "aws_lambda_event_source_mapping" "kinesis_mapping" {
30 |   event_source_arn  = var.source_stream_arn
31 |   function_name     = aws_lambda_function.kinesis_lambda.arn
32 |   starting_position = "LATEST"
33 |   depends_on = [
34 |     aws_iam_role_policy_attachment.kinesis_processing
35 |   ]
36 |   // enabled           = var.lambda_event_source_mapping_enabled
37 |   // batch_size        = var.lambda_event_source_mapping_batch_size
38 | }
39 | 


--------------------------------------------------------------------------------
/06-best-practices/code/infrastructure/modules/lambda/variables.tf:
--------------------------------------------------------------------------------
 1 | variable "source_stream_name" {
 2 |   type        = string
 3 |   description = "Source Kinesis Data Streams stream name"
 4 | }
 5 | 
 6 | variable "source_stream_arn" {
 7 |   type        = string
 8 |   description = "Source Kinesis Data Streams stream name"
 9 | }
10 | 
11 | variable "output_stream_name" {
12 |   description = "Name of output stream where all the events will be passed"
13 | }
14 | 
15 | variable "output_stream_arn" {
16 |   description = "ARN of output stream where all the events will be passed"
17 | }
18 | 
19 | variable "model_bucket" {
20 |   description = "Name of the bucket"
21 | }
22 | 
23 | variable "lambda_function_name" {
24 |   description = "Name of the lambda function"
25 | }
26 | 
27 | variable "image_uri" {
28 |   description = "ECR image uri"
29 | }
30 | 


--------------------------------------------------------------------------------
/06-best-practices/code/infrastructure/modules/s3/main.tf:
--------------------------------------------------------------------------------
 1 | resource "aws_s3_bucket" "s3_bucket" {
 2 |   bucket = var.bucket_name
 3 |   acl    = "private"
 4 |   force_destroy = true
 5 | }
 6 | 
 7 | output "name" {
 8 |   value = aws_s3_bucket.s3_bucket.bucket
 9 | }
10 | 


--------------------------------------------------------------------------------
/06-best-practices/code/infrastructure/modules/s3/variables.tf:
--------------------------------------------------------------------------------
1 | variable "bucket_name" {
2 |   description = "Name of the bucket"
3 | }
4 | 


--------------------------------------------------------------------------------
/06-best-practices/code/infrastructure/variables.tf:
--------------------------------------------------------------------------------
 1 | variable "aws_region" {
 2 |   description = "AWS region to create resources"
 3 |   default     = "eu-west-1"
 4 | }
 5 | 
 6 | variable "project_id" {
 7 |   description = "project_id"
 8 |   default = "mlops-zoomcamp"
 9 | }
10 | 
11 | variable "source_stream_name" {
12 |   description = ""
13 | }
14 | 
15 | variable "output_stream_name" {
16 |   description = ""
17 | }
18 | 
19 | variable "model_bucket" {
20 |   description = "s3_bucket"
21 | }
22 | 
23 | variable "lambda_function_local_path" {
24 |   description = ""
25 | }
26 | 
27 | variable "docker_image_local_path" {
28 |   description = ""
29 | }
30 | 
31 | variable "ecr_repo_name" {
32 |   description = ""
33 | }
34 | 
35 | variable "lambda_function_name" {
36 |   description = ""
37 | }


--------------------------------------------------------------------------------
/06-best-practices/code/infrastructure/vars/prod.tfvars:
--------------------------------------------------------------------------------
1 | source_stream_name = "prod_ride_events"
2 | output_stream_name = "prod_ride_predictions"
3 | model_bucket = "prod-mlflow-models-code-owners"
4 | lambda_function_local_path = "../lambda_function.py"
5 | docker_image_local_path = "../Dockerfile"
6 | ecr_repo_name = "prod_stream_model_duration"
7 | lambda_function_name = "prod_prediction_lambda"
8 | 


--------------------------------------------------------------------------------
/06-best-practices/code/infrastructure/vars/stg.tfvars:
--------------------------------------------------------------------------------
1 | source_stream_name = "stg_ride_events"
2 | output_stream_name = "stg_ride_predictions"
3 | model_bucket = "stg-mlflow-models-code-owners"
4 | lambda_function_local_path = "../lambda_function.py"
5 | docker_image_local_path = "../Dockerfile"
6 | ecr_repo_name = "stg_stream_model_duration"
7 | lambda_function_name = "stg_prediction_lambda"
8 | 


--------------------------------------------------------------------------------
/06-best-practices/code/integration-test/docker-compose.yaml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   backend:
 3 |     image: ${LOCAL_IMAGE_NAME}
 4 |     ports:
 5 |       - "8080:8080"
 6 |     environment:
 7 |       - PREDICTIONS_STREAM_NAME=${PREDICTIONS_STREAM_NAME}
 8 |       - RUN_ID=Test123
 9 |       - AWS_DEFAULT_REGION=eu-west-1
10 |       - MODEL_LOCATION=/app/model
11 |       - KINESIS_ENDPOINT_URL=http://kinesis:4566/
12 |       - AWS_ACCESS_KEY_ID=abc
13 |       - AWS_SECRET_ACCESS_KEY=xyz
14 |     volumes:
15 |       - "./model:/app/model"
16 |   kinesis:
17 |     image: localstack/localstack
18 |     ports:
19 |       - "4566:4566"
20 |     environment:
21 |       - SERVICES=kinesis
22 | 


--------------------------------------------------------------------------------
/06-best-practices/code/integration-test/event.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "Records": [
 3 |         {
 4 |             "kinesis": {
 5 |                 "kinesisSchemaVersion": "1.0",
 6 |                 "partitionKey": "1",
 7 |                 "sequenceNumber": "49630081666084879290581185630324770398608704880802529282",
 8 |                 "data": "ewogICAgICAgICJyaWRlIjogewogICAgICAgICAgICAiUFVMb2NhdGlvbklEIjogMTMwLAogICAgICAgICAgICAiRE9Mb2NhdGlvbklEIjogMjA1LAogICAgICAgICAgICAidHJpcF9kaXN0YW5jZSI6IDMuNjYKICAgICAgICB9LCAKICAgICAgICAicmlkZV9pZCI6IDI1NgogICAgfQ==",
 9 |                 "approximateArrivalTimestamp": 1654161514.132
10 |             },
11 |             "eventSource": "aws:kinesis",
12 |             "eventVersion": "1.0",
13 |             "eventID": "shardId-000000000000:49630081666084879290581185630324770398608704880802529282",
14 |             "eventName": "aws:kinesis:record",
15 |             "invokeIdentityArn": "arn:aws:iam::387546586013:role/lambda-kinesis-role",
16 |             "awsRegion": "eu-west-1",
17 |             "eventSourceARN": "arn:aws:kinesis:eu-west-1:387546586013:stream/ride_events"
18 |         }
19 |     ]
20 | }
21 | 


--------------------------------------------------------------------------------
/06-best-practices/code/integration-test/model/MLmodel:
--------------------------------------------------------------------------------
 1 | artifact_path: model
 2 | flavors:
 3 |   python_function:
 4 |     env: conda.yaml
 5 |     loader_module: mlflow.sklearn
 6 |     model_path: model.pkl
 7 |     python_version: 3.9.7
 8 |   sklearn:
 9 |     code: null
10 |     pickled_model: model.pkl
11 |     serialization_format: cloudpickle
12 |     sklearn_version: 1.0.2
13 | mlflow_version: 1.26.1
14 | model_uuid: 78edf19ceea5463aadce7d84f3f9bc82
15 | run_id: e1efc53e9bd149078b0c12aeaa6365df
16 | utc_time_created: '2022-06-01 12:49:55.846831'
17 | 


--------------------------------------------------------------------------------
/06-best-practices/code/integration-test/model/conda.yaml:
--------------------------------------------------------------------------------
 1 | channels:
 2 | - conda-forge
 3 | dependencies:
 4 | - python=3.9.7
 5 | - pip<=22.1
 6 | - pip:
 7 |   - mlflow
 8 |   - cloudpickle==2.0.0
 9 |   - psutil==5.8.0
10 |   - scikit-learn==1.0.2
11 |   - typing-extensions==3.10.0.2
12 | name: mlflow-env
13 | 


--------------------------------------------------------------------------------
/06-best-practices/code/integration-test/model/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/06-best-practices/code/integration-test/model/model.pkl


--------------------------------------------------------------------------------
/06-best-practices/code/integration-test/model/python_env.yaml:
--------------------------------------------------------------------------------
1 | python: 3.9.7
2 | build_dependencies:
3 | - pip==22.1
4 | - setuptools==58.0.4
5 | - wheel==0.37.0
6 | dependencies:
7 | - -r requirements.txt
8 | 


--------------------------------------------------------------------------------
/06-best-practices/code/integration-test/model/requirements.txt:
--------------------------------------------------------------------------------
1 | mlflow
2 | cloudpickle==2.0.0
3 | psutil==5.8.0
4 | scikit-learn==1.0.2
5 | typing-extensions==3.10.0.2
6 | 


--------------------------------------------------------------------------------
/06-best-practices/code/integration-test/run.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | if [[ -z "${GITHUB_ACTIONS}" ]]; then
 4 |   cd "$(dirname "$0")"
 5 | fi
 6 | 
 7 | if [ "${LOCAL_IMAGE_NAME}" == "" ]; then 
 8 |     LOCAL_TAG=`date +"%Y-%m-%d-%H-%M"`
 9 |     export LOCAL_IMAGE_NAME="stream-model-duration:${LOCAL_TAG}"
10 |     echo "LOCAL_IMAGE_NAME is not set, building a new image with tag ${LOCAL_IMAGE_NAME}"
11 |     docker build -t ${LOCAL_IMAGE_NAME} ..
12 | else
13 |     echo "no need to build image ${LOCAL_IMAGE_NAME}"
14 | fi
15 | 
16 | export PREDICTIONS_STREAM_NAME="ride_predictions"
17 | 
18 | docker-compose up -d
19 | 
20 | sleep 5
21 | 
22 | aws --endpoint-url=http://localhost:4566 \
23 |     kinesis create-stream \
24 |     --stream-name ${PREDICTIONS_STREAM_NAME} \
25 |     --shard-count 1
26 | 
27 | pipenv run python test_docker.py
28 | 
29 | ERROR_CODE=$?
30 | 
31 | if [ ${ERROR_CODE} != 0 ]; then
32 |     docker-compose logs
33 |     docker-compose down
34 |     exit ${ERROR_CODE}
35 | fi
36 | 
37 | 
38 | pipenv run python test_kinesis.py
39 | 
40 | ERROR_CODE=$?
41 | 
42 | if [ ${ERROR_CODE} != 0 ]; then
43 |     docker-compose logs
44 |     docker-compose down
45 |     exit ${ERROR_CODE}
46 | fi
47 | 
48 | 
49 | docker-compose down
50 | 


--------------------------------------------------------------------------------
/06-best-practices/code/integration-test/test_docker.py:
--------------------------------------------------------------------------------
 1 | # pylint: disable=duplicate-code
 2 | 
 3 | import json
 4 | 
 5 | import requests
 6 | from deepdiff import DeepDiff
 7 | 
 8 | with open('event.json', 'rt', encoding='utf-8') as f_in:
 9 |     event = json.load(f_in)
10 | 
11 | 
12 | url = 'http://localhost:8080/2015-03-31/functions/function/invocations'
13 | actual_response = requests.post(url, json=event).json()
14 | print('actual response:')
15 | 
16 | print(json.dumps(actual_response, indent=2))
17 | 
18 | expected_response = {
19 |     'predictions': [
20 |         {
21 |             'model': 'ride_duration_prediction_model',
22 |             'version': 'Test123',
23 |             'prediction': {
24 |                 'ride_duration': 21.3,
25 |                 'ride_id': 256,
26 |             },
27 |         }
28 |     ]
29 | }
30 | 
31 | 
32 | diff = DeepDiff(actual_response, expected_response, significant_digits=1)
33 | print(f'diff={diff}')
34 | 
35 | assert 'type_changes' not in diff
36 | assert 'values_changed' not in diff
37 | 


--------------------------------------------------------------------------------
/06-best-practices/code/integration-test/test_kinesis.py:
--------------------------------------------------------------------------------
 1 | # pylint: disable=duplicate-code
 2 | 
 3 | import os
 4 | import json
 5 | from pprint import pprint
 6 | 
 7 | import boto3
 8 | from deepdiff import DeepDiff
 9 | 
10 | kinesis_endpoint = os.getenv('KINESIS_ENDPOINT_URL', "http://localhost:4566")
11 | kinesis_client = boto3.client('kinesis', endpoint_url=kinesis_endpoint)
12 | 
13 | stream_name = os.getenv('PREDICTIONS_STREAM_NAME', 'ride_predictions')
14 | shard_id = 'shardId-000000000000'
15 | 
16 | 
17 | shard_iterator_response = kinesis_client.get_shard_iterator(
18 |     StreamName=stream_name,
19 |     ShardId=shard_id,
20 |     ShardIteratorType='TRIM_HORIZON',
21 | )
22 | 
23 | shard_iterator_id = shard_iterator_response['ShardIterator']
24 | 
25 | 
26 | records_response = kinesis_client.get_records(
27 |     ShardIterator=shard_iterator_id,
28 |     Limit=1,
29 | )
30 | 
31 | 
32 | records = records_response['Records']
33 | pprint(records)
34 | 
35 | 
36 | assert len(records) == 1
37 | 
38 | 
39 | actual_record = json.loads(records[0]['Data'])
40 | pprint(actual_record)
41 | 
42 | expected_record = {
43 |     'model': 'ride_duration_prediction_model',
44 |     'version': 'Test123',
45 |     'prediction': {
46 |         'ride_duration': 21.3,
47 |         'ride_id': 256,
48 |     },
49 | }
50 | 
51 | diff = DeepDiff(actual_record, expected_record, significant_digits=1)
52 | print(f'diff={diff}')
53 | 
54 | assert 'values_changed' not in diff
55 | assert 'type_changes' not in diff
56 | 
57 | 
58 | print('all good')
59 | 


--------------------------------------------------------------------------------
/06-best-practices/code/lambda_function.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import model
 4 | 
 5 | PREDICTIONS_STREAM_NAME = os.getenv('PREDICTIONS_STREAM_NAME', 'ride_predictions')
 6 | RUN_ID = os.getenv('RUN_ID')
 7 | TEST_RUN = os.getenv('TEST_RUN', 'False') == 'True'
 8 | 
 9 | 
10 | model_service = model.init(
11 |     prediction_stream_name=PREDICTIONS_STREAM_NAME,
12 |     run_id=RUN_ID,
13 |     test_run=TEST_RUN,
14 | )
15 | 
16 | 
17 | def lambda_handler(event, context):
18 |     # pylint: disable=unused-argument
19 |     return model_service.lambda_handler(event)
20 | 


--------------------------------------------------------------------------------
/06-best-practices/code/plan.md:
--------------------------------------------------------------------------------
 1 | ## Plan
 2 | 
 3 | - [x] Testing the code: unit tests with pytest
 4 | - [x] Integration tests with docker-compose
 5 | - [x] Testing cloud services with LocalStack
 6 | - [x] Code quality: linting and formatting
 7 | - [x] Git pre-commit hooks
 8 | - [x] Makefiles and make
 9 | - [ ] Staging and production environments
10 | - [ ] Infrastructure as Code
11 | - [ ] CI/CD and GitHub Actions
12 | 


--------------------------------------------------------------------------------
/06-best-practices/code/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.pylint.messages_control]
 2 | 
 3 | disable = [
 4 |     "missing-function-docstring",
 5 |     "missing-final-newline",
 6 |     "missing-class-docstring",
 7 |     "missing-module-docstring",
 8 |     "invalid-name",
 9 |     "too-few-public-methods"
10 | ]
11 | 
12 | [tool.black]
13 | line-length = 88
14 | target-version = ['py39']
15 | skip-string-normalization = true
16 | 
17 | [tool.isort]
18 | multi_line_output = 3
19 | length_sort = true
20 | 


--------------------------------------------------------------------------------
/06-best-practices/code/scripts/deploy_manual.sh:
--------------------------------------------------------------------------------
 1 | AWS_REGION="eu-west-1"
 2 | 
 3 | # Dynamically generated by TF
 4 | export MODEL_BUCKET_PROD="stg-mlflow-models-code-owners-mlops-zoomcamp"
 5 | export PREDICTIONS_STREAM_NAME="stg_ride_predictions-mlops-zoomcamp"
 6 | export LAMBDA_FUNCTION="stg_prediction_lambda_mlops-zoomcamp"
 7 | 
 8 | # Model artifacts bucket from the previous weeks (MLflow experiments)
 9 | export MODEL_BUCKET_DEV="mlflow-models-alexey"
10 | 
11 | # Get latest RUN_ID from latest S3 partition.
12 | # NOT FOR PRODUCTION!
13 | # In practice, this is generally picked up from your experiment tracking tool such as MLflow or DVC
14 | export RUN_ID=$(aws s3api list-objects-v2 --bucket ${MODEL_BUCKET_DEV} \
15 | --query 'sort_by(Contents, &LastModified)[-1].Key' --output=text | cut -f2 -d/)
16 | 
17 | # NOT FOR PRODUCTION!
18 | # Just mocking the artifacts from training process in the Prod env
19 | aws s3 sync s3://${MODEL_BUCKET_DEV} s3://${MODEL_BUCKET_PROD}
20 | 
21 | # Set new var RUN_ID in existing set of vars.
22 | variables="{PREDICTIONS_STREAM_NAME=${PREDICTIONS_STREAM_NAME}, MODEL_BUCKET=${MODEL_BUCKET_PROD}, RUN_ID=${RUN_ID}}"
23 | 
24 | # https://docs.aws.amazon.com/lambda/latest/dg/configuration-envvars.html
25 | aws lambda update-function-configuration --function-name ${LAMBDA_FUNCTION} --environment "Variables=${variables}"
26 | 


--------------------------------------------------------------------------------
/06-best-practices/code/scripts/publish.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | echo "publishing image ${LOCAL_IMAGE_NAME} to ECR..."


--------------------------------------------------------------------------------
/06-best-practices/code/scripts/test_cloud_e2e.sh:
--------------------------------------------------------------------------------
 1 | export KINESIS_STREAM_INPUT="stg_ride_events-mlops-zoomcamp"
 2 | export KINESIS_STREAM_OUTPUT="stg_ride_predictions-mlops-zoomcamp"
 3 | 
 4 | SHARD_ID=$(aws kinesis put-record  \
 5 |         --stream-name ${KINESIS_STREAM_INPUT}   \
 6 |         --partition-key 1  --cli-binary-format raw-in-base64-out  \
 7 |         --data '{"ride": {
 8 |             "PULocationID": 130,
 9 |             "DOLocationID": 205,
10 |             "trip_distance": 3.66
11 |         },
12 |         "ride_id": 156}'  \
13 |         --query 'ShardId'
14 |     )
15 | 
16 | #SHARD_ITERATOR=$(aws kinesis get-shard-iterator --shard-id ${SHARD_ID} --shard-iterator-type TRIM_HORIZON --stream-name ${KINESIS_STREAM_OUTPUT} --query 'ShardIterator')
17 | 
18 | #aws kinesis get-records --shard-iterator $SHARD_ITERATOR
19 | 


--------------------------------------------------------------------------------
/06-best-practices/code/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/06-best-practices/code/tests/__init__.py


--------------------------------------------------------------------------------
/06-best-practices/code/tests/data.b64:
--------------------------------------------------------------------------------
1 | ewogICAgICAgICJyaWRlIjogewogICAgICAgICAgICAiUFVMb2NhdGlvbklEIjogMTMwLAogICAgICAgICAgICAiRE9Mb2NhdGlvbklEIjogMjA1LAogICAgICAgICAgICAidHJpcF9kaXN0YW5jZSI6IDMuNjYKICAgICAgICB9LCAKICAgICAgICAicmlkZV9pZCI6IDI1NgogICAgfQ==
2 | 


--------------------------------------------------------------------------------
/06-best-practices/code/tests/model_test.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | 
  3 | import model
  4 | 
  5 | 
  6 | def read_text(file):
  7 |     test_directory = Path(__file__).parent
  8 | 
  9 |     with open(test_directory / file, 'rt', encoding='utf-8') as f_in:
 10 |         return f_in.read().strip()
 11 | 
 12 | 
 13 | def test_base64_decode():
 14 |     base64_input = read_text('data.b64')
 15 | 
 16 |     actual_result = model.base64_decode(base64_input)
 17 |     expected_result = {
 18 |         "ride": {
 19 |             "PULocationID": 130,
 20 |             "DOLocationID": 205,
 21 |             "trip_distance": 3.66,
 22 |         },
 23 |         "ride_id": 256,
 24 |     }
 25 | 
 26 |     assert actual_result == expected_result
 27 | 
 28 | 
 29 | def test_prepare_features():
 30 |     model_service = model.ModelService(None)
 31 | 
 32 |     ride = {
 33 |         "PULocationID": 130,
 34 |         "DOLocationID": 205,
 35 |         "trip_distance": 3.66,
 36 |     }
 37 | 
 38 |     actual_features = model_service.prepare_features(ride)
 39 | 
 40 |     expected_fetures = {
 41 |         "PU_DO": "130_205",
 42 |         "trip_distance": 3.66,
 43 |     }
 44 | 
 45 |     assert actual_features == expected_fetures
 46 | 
 47 | 
 48 | class ModelMock:
 49 |     def __init__(self, value):
 50 |         self.value = value
 51 | 
 52 |     def predict(self, X):
 53 |         n = len(X)
 54 |         return [self.value] * n
 55 | 
 56 | 
 57 | def test_predict():
 58 |     model_mock = ModelMock(10.0)
 59 |     model_service = model.ModelService(model_mock)
 60 | 
 61 |     features = {
 62 |         "PU_DO": "130_205",
 63 |         "trip_distance": 3.66,
 64 |     }
 65 | 
 66 |     actual_prediction = model_service.predict(features)
 67 |     expected_prediction = 10.0
 68 | 
 69 |     assert actual_prediction == expected_prediction
 70 | 
 71 | 
 72 | def test_lambda_handler():
 73 |     model_mock = ModelMock(10.0)
 74 |     model_version = 'Test123'
 75 |     model_service = model.ModelService(model_mock, model_version)
 76 | 
 77 |     base64_input = read_text('data.b64')
 78 | 
 79 |     event = {
 80 |         "Records": [
 81 |             {
 82 |                 "kinesis": {
 83 |                     "data": base64_input,
 84 |                 },
 85 |             }
 86 |         ]
 87 |     }
 88 | 
 89 |     actual_predictions = model_service.lambda_handler(event)
 90 |     expected_predictions = {
 91 |         'predictions': [
 92 |             {
 93 |                 'model': 'ride_duration_prediction_model',
 94 |                 'version': model_version,
 95 |                 'prediction': {
 96 |                     'ride_duration': 10.0,
 97 |                     'ride_id': 256,
 98 |                 },
 99 |             }
100 |         ]
101 |     }
102 | 
103 |     assert actual_predictions == expected_predictions
104 | 


--------------------------------------------------------------------------------
/06-best-practices/images/thumbnail-6-1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/06-best-practices/images/thumbnail-6-1.jpg


--------------------------------------------------------------------------------
/06-best-practices/images/thumbnail-6-2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/06-best-practices/images/thumbnail-6-2.jpg


--------------------------------------------------------------------------------
/06-best-practices/images/thumbnail-6-3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/06-best-practices/images/thumbnail-6-3.jpg


--------------------------------------------------------------------------------
/06-best-practices/images/thumbnail-6-4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/06-best-practices/images/thumbnail-6-4.jpg


--------------------------------------------------------------------------------
/06-best-practices/images/thumbnail-6-5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/06-best-practices/images/thumbnail-6-5.jpg


--------------------------------------------------------------------------------
/06-best-practices/images/thumbnail-6-6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/06-best-practices/images/thumbnail-6-6.jpg


--------------------------------------------------------------------------------
/06-best-practices/meta.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "module": {
 3 |     "number": 6,
 4 |     "title": "Best Practices"
 5 |   },
 6 |   "units": [
 7 |     {
 8 |       "number": "1",
 9 |       "title": "Testing Python code with pytest",
10 |       "youtube": "https://www.youtube.com/watch?v=CJp1eFQP5nk"
11 |     },
12 |     {
13 |       "number": "2",
14 |       "title": "Integration tests with docker-compose",
15 |       "youtube": "https://www.youtube.com/watch?v=lBX0Gl7Z1ck"
16 |     },
17 |     {
18 |       "number": "3",
19 |       "title": "Testing cloud services with LocalStack",
20 |       "youtube": "https://www.youtube.com/watch?v=9yMO86SYvuI"
21 |     },
22 |     {
23 |       "number": "4",
24 |       "title": "Code quality: linting and formatting",
25 |       "youtube": "https://www.youtube.com/watch?v=uImvWE-iSDQ"
26 |     },
27 |     {
28 |       "number": "5",
29 |       "title": "Git pre-commit hooks",
30 |       "youtube": "https://www.youtube.com/watch?v=lmMZ7Axk2T8"
31 |     },
32 |     {
33 |       "number": "6",
34 |       "title": "Makefiles and make",
35 |       "youtube": "https://www.youtube.com/watch?v=F6DZdvbRZQQ"
36 |     },
37 |     {
38 |       "number": "X",
39 |       "title": "Homework",
40 |       "youtube": ""
41 |     }
42 |   ]
43 | }


--------------------------------------------------------------------------------
/07-project/images/thumbnail-7-1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/07-project/images/thumbnail-7-1.jpg


--------------------------------------------------------------------------------
/after-sign-up.md:
--------------------------------------------------------------------------------
 1 | ## Thank you!
 2 | 
 3 | Thanks for signining up for the course.
 4 | 
 5 | The process of adding you to the mailing list is not automated yet, 
 6 | but you will hear from us closer to the course start. 
 7 | 
 8 | To make sure you don't miss any announcements
 9 | 
10 | - Register in [DataTalks.Club's Slack](https://datatalks.club/slack.html) and join the [`#course-mlops-zoomcamp`](https://app.slack.com/client/T01ATQK62F8/C02R98X7DS9) channel
11 | - Join the [course Telegram channel with announcements](https://t.me/dtc_courses)
12 | - [Tweet about the course!](https://ctt.ac/fH67W)
13 | - Subscribe to [DataTalks.Club's YouTube channel](https://www.youtube.com/c/DataTalksClub) and check 
14 |   [the course playlist](https://www.youtube.com/playlist?list=PL3MmuxUbc_hIUISrluw_A7wDSmfOhErJK)
15 | - Subscribe to our [public Google Calendar](https://calendar.google.com/calendar/?cid=M3Jzbmg0ZDA2aHVsY2M1ZjcyNDJtODNyMTRAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ) (it works from Desktop only)
16 | - Check our [Technical FAQ](https://docs.google.com/document/d/12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0/edit) if you have questions 
17 | 
18 | See you in May!
19 | 


--------------------------------------------------------------------------------
/asking-questions.md:
--------------------------------------------------------------------------------
 1 | ## Asking questions
 2 | 
 3 | If you have any questions, ask them 
 4 | in the [`#course-mlops-zoomcamp`](https://app.slack.com/client/T01ATQK62F8/C02R98X7DS9) channel in [DataTalks.Club](https://datatalks.club) slack.
 5 | 
 6 | To keep our discussion in Slack more organized, we ask you to follow these suggestions:
 7 | 
 8 | * Before asking a question, check [FAQ](https://docs.google.com/document/d/12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0/edit).
 9 | * Use threads. When you have a problem, first describe the problem shortly
10 |   and then put the actual error in the thread - so it doesn't take the entire screen.
11 | * Instead of screenshots, it's better to copy-paste the error you're getting in text.
12 |   Use ` ``` ` for formatting your code.
13 |   It's very difficult to read text from screenshots.
14 | * Please don't take pictures of your code with a phone. It's even harder to read. Follow the previous suggestion,
15 |   and in rare cases when you need to show what happens on your screen, take a screenshot.
16 | * You don't need to tag the instructors when you have a problem. We will see it eventually.
17 | * If somebody helped you with your problem and it's not in [FAQ](https://docs.google.com/document/d/12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0/edit), please add it there.
18 |   It'll help other students.
19 | 


--------------------------------------------------------------------------------
/certificate.md:
--------------------------------------------------------------------------------
 1 | ## Getting your certificate
 2 | 
 3 | Congratulations on finishing the course!
 4 | 
 5 | Here's how you can get your certificate.
 6 | 
 7 | First, get your certificate id using the `compute_certificate_id` function:
 8 | 
 9 | ```python
10 | from hashlib import sha1
11 | 
12 | def compute_hash(email):
13 |     return sha1(email.encode('utf-8')).hexdigest()
14 | 
15 | def compute_certificate_id(email):
16 |     email_clean = email.lower().strip()
17 |     return compute_hash(email_clean + '_')
18 | ```
19 | 
20 | Then use this hash to get the URL
21 | 
22 | ```python
23 | cohort = 2024
24 | course = 'mlops-zoomcamp'
25 | your_id = compute_certificate_id('never.give.up@gmail.com')
26 | url = f"https://certificate.datatalks.club/{course}/{cohort}/{your_id}.pdf"
27 | print(url)
28 | ```
29 | 
30 | Example: https://certificate.datatalks.club/mlops-zoomcamp/2022/fe629854d45c559e9c10b3b8458ea392fdeb68a9.pdf
31 | 
32 | 
33 | ## Adding to LinkedIn
34 | 
35 | You can add your certificate to LinkedIn:
36 | 
37 | * Log in to your LinkedIn account, then go to your profile.
38 | * On the right, in the "Add profile" section dropdown, choose "Background" and then select the drop-down triangle next to "Licenses & Certifications".
39 | * In "Name", enter "MLOps Zoomcamp".
40 | * In "Issuing Organization", enter "DataTalksClub".
41 | * (Optional) In "Issue Date", enter the time when the certificate was created.
42 | * (Optional) Select the checkbox This certification does not expire. 
43 | * Put your certificate ID.
44 | * In "Certification URL", enter the URL for your certificate.
45 | 
46 | [Adapted from here](https://support.edx.org/hc/en-us/articles/206501938-How-can-I-add-my-certificate-to-my-LinkedIn-profile-)
47 | 


--------------------------------------------------------------------------------
/cohorts/2022/02-experiment-tracking/homework/hpo.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | import pickle
 4 | 
 5 | import mlflow
 6 | import numpy as np
 7 | from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
 8 | from hyperopt.pyll import scope
 9 | from sklearn.ensemble import RandomForestRegressor
10 | from sklearn.metrics import mean_squared_error
11 | 
12 | mlflow.set_tracking_uri("http://127.0.0.1:5000")
13 | mlflow.set_experiment("random-forest-hyperopt")
14 | 
15 | 
16 | def load_pickle(filename):
17 |     with open(filename, "rb") as f_in:
18 |         return pickle.load(f_in)
19 | 
20 | 
21 | def run(data_path, num_trials):
22 | 
23 |     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
24 |     X_valid, y_valid = load_pickle(os.path.join(data_path, "valid.pkl"))
25 | 
26 |     def objective(params):
27 | 
28 |         rf = RandomForestRegressor(**params)
29 |         rf.fit(X_train, y_train)
30 |         y_pred = rf.predict(X_valid)
31 |         rmse = mean_squared_error(y_valid, y_pred, squared=False)
32 | 
33 |         return {'loss': rmse, 'status': STATUS_OK}
34 | 
35 |     search_space = {
36 |         'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
37 |         'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
38 |         'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
39 |         'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
40 |         'random_state': 42
41 |     }
42 | 
43 |     rstate = np.random.default_rng(42)  # for reproducible results
44 |     fmin(
45 |         fn=objective,
46 |         space=search_space,
47 |         algo=tpe.suggest,
48 |         max_evals=num_trials,
49 |         trials=Trials(),
50 |         rstate=rstate
51 |     )
52 | 
53 | 
54 | if __name__ == '__main__':
55 | 
56 |     parser = argparse.ArgumentParser()
57 |     parser.add_argument(
58 |         "--data_path",
59 |         default="./output",
60 |         help="the location where the processed NYC taxi trip data was saved."
61 |     )
62 |     parser.add_argument(
63 |         "--max_evals",
64 |         type=int,
65 |         default=50,
66 |         help="the number of parameter evaluations for the optimizer to explore."
67 |     )
68 |     args = parser.parse_args()
69 | 
70 |     run(args.data_path, args.max_evals)
71 | 


--------------------------------------------------------------------------------
/cohorts/2022/02-experiment-tracking/homework/train.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | import pickle
 4 | 
 5 | from sklearn.ensemble import RandomForestRegressor
 6 | from sklearn.metrics import mean_squared_error
 7 | 
 8 | 
 9 | def load_pickle(filename: str):
10 |     with open(filename, "rb") as f_in:
11 |         return pickle.load(f_in)
12 | 
13 | 
14 | def run(data_path):
15 | 
16 |     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
17 |     X_valid, y_valid = load_pickle(os.path.join(data_path, "valid.pkl"))
18 | 
19 |     rf = RandomForestRegressor(max_depth=10, random_state=0)
20 |     rf.fit(X_train, y_train)
21 |     y_pred = rf.predict(X_valid)
22 | 
23 |     rmse = mean_squared_error(y_valid, y_pred, squared=False)
24 | 
25 | 
26 | if __name__ == '__main__':
27 | 
28 |     parser = argparse.ArgumentParser()
29 |     parser.add_argument(
30 |         "--data_path",
31 |         default="./output",
32 |         help="the location where the processed NYC taxi trip data was saved."
33 |     )
34 |     args = parser.parse_args()
35 | 
36 |     run(args.data_path)
37 | 


--------------------------------------------------------------------------------
/cohorts/2022/03-orchestration/code/work-queue.py:
--------------------------------------------------------------------------------
 1 | from prefect import flow
 2 | 
 3 | @flow
 4 | def myflow():
 5 |     print("hello")
 6 | 
 7 | from prefect.deployments import Deployment
 8 | from prefect.orion.schemas.schedules import IntervalSchedule
 9 | from datetime import timedelta
10 | 
11 | deployment_dev = Deployment.build_from_flow(
12 |     flow=myflow,
13 |     name="model_training-dev",
14 |     schedule=IntervalSchedule(interval=timedelta(minutes=5)),
15 |     work_queue_name="dev"
16 | )
17 | 
18 | deployment_dev.apply()
19 | 
20 | deployment_prod = Deployment.build_from_flow(
21 |     flow=myflow,
22 |     name="model_training-prod",
23 |     schedule=IntervalSchedule(interval=timedelta(minutes=5)),
24 |     work_queue_name="prod"
25 | )
26 | 
27 | deployment_prod.apply()
28 | 
29 | 


--------------------------------------------------------------------------------
/cohorts/2022/03-orchestration/homework.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | from sklearn.feature_extraction import DictVectorizer
 4 | from sklearn.linear_model import LinearRegression
 5 | from sklearn.metrics import mean_squared_error
 6 | 
 7 | def read_data(path):
 8 |     df = pd.read_parquet(path)
 9 |     return df
10 | 
11 | def prepare_features(df, categorical, train=True):
12 |     df['duration'] = df.dropOff_datetime - df.pickup_datetime
13 |     df['duration'] = df.duration.dt.total_seconds() / 60
14 |     df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
15 | 
16 |     mean_duration = df.duration.mean()
17 |     if train:
18 |         print(f"The mean duration of training is {mean_duration}")
19 |     else:
20 |         print(f"The mean duration of validation is {mean_duration}")
21 |     
22 |     df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
23 |     return df
24 | 
25 | def train_model(df, categorical):
26 | 
27 |     train_dicts = df[categorical].to_dict(orient='records')
28 |     dv = DictVectorizer()
29 |     X_train = dv.fit_transform(train_dicts) 
30 |     y_train = df.duration.values
31 | 
32 |     print(f"The shape of X_train is {X_train.shape}")
33 |     print(f"The DictVectorizer has {len(dv.feature_names_)} features")
34 | 
35 |     lr = LinearRegression()
36 |     lr.fit(X_train, y_train)
37 |     y_pred = lr.predict(X_train)
38 |     mse = mean_squared_error(y_train, y_pred, squared=False)
39 |     print(f"The MSE of training is: {mse}")
40 |     return lr, dv
41 | 
42 | def run_model(df, categorical, dv, lr):
43 |     val_dicts = df[categorical].to_dict(orient='records')
44 |     X_val = dv.transform(val_dicts) 
45 |     y_pred = lr.predict(X_val)
46 |     y_val = df.duration.values
47 | 
48 |     mse = mean_squared_error(y_val, y_pred, squared=False)
49 |     print(f"The MSE of validation is: {mse}")
50 |     return
51 | 
52 | def main(train_path: str = './data/fhv_tripdata_2021-01.parquet', 
53 |            val_path: str = './data/fhv_tripdata_2021-02.parquet'):
54 | 
55 |     categorical = ['PUlocationID', 'DOlocationID']
56 | 
57 |     df_train = read_data(train_path)
58 |     df_train_processed = prepare_features(df_train, categorical)
59 | 
60 |     df_val = read_data(val_path)
61 |     df_val_processed = prepare_features(df_val, categorical, False)
62 | 
63 |     # train the model
64 |     lr, dv = train_model(df_train_processed, categorical)
65 |     run_model(df_val_processed, categorical, dv, lr)
66 | 
67 | main()
68 | 


--------------------------------------------------------------------------------
/cohorts/2022/03-orchestration/images/thumbnail-3-01.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/03-orchestration/images/thumbnail-3-01.jpg


--------------------------------------------------------------------------------
/cohorts/2022/03-orchestration/images/thumbnail-3-02.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/03-orchestration/images/thumbnail-3-02.jpg


--------------------------------------------------------------------------------
/cohorts/2022/03-orchestration/images/thumbnail-3-03.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/03-orchestration/images/thumbnail-3-03.jpg


--------------------------------------------------------------------------------
/cohorts/2022/03-orchestration/images/thumbnail-3-04.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/03-orchestration/images/thumbnail-3-04.jpg


--------------------------------------------------------------------------------
/cohorts/2022/03-orchestration/images/thumbnail-3-05.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/03-orchestration/images/thumbnail-3-05.jpg


--------------------------------------------------------------------------------
/cohorts/2022/03-orchestration/images/thumbnail-3-06.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/03-orchestration/images/thumbnail-3-06.jpg


--------------------------------------------------------------------------------
/cohorts/2022/04-deployment/homework/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9.7-slim
2 | 
3 | WORKDIR /app
4 | COPY [ "model2.bin", "model.bin" ]
5 |     


--------------------------------------------------------------------------------
/cohorts/2022/04-deployment/homework/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | scikit-learn = "==1.0.2"
 8 | pandas = "*"
 9 | pyarrow = "*"
10 | s3fs = "*"
11 | 
12 | [dev-packages]
13 | 
14 | [requires]
15 | python_version = "3.9"
16 | 


--------------------------------------------------------------------------------
/cohorts/2022/04-deployment/homework/batch.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding: utf-8
 3 | 
 4 | import sys
 5 | import pickle
 6 | import pandas as pd
 7 | 
 8 | 
 9 | year = int(sys.argv[1]) # 2021
10 | month = int(sys.argv[2]) #2
11 | 
12 | input_file = f's3://nyc-tlc/trip data/fhv_tripdata_{year:04d}-{month:02d}.parquet'
13 | output_file = f's3://nyc-duration-prediction-alexey/taxi_type=fhv/year={year:04d}/month={month:02d}/predictions.parquet'
14 | 
15 | 
16 | with open('model.bin', 'rb') as f_in:
17 |     dv, lr = pickle.load(f_in)
18 | 
19 | 
20 | categorical = ['PUlocationID', 'DOlocationID']
21 | 
22 | def read_data(filename):
23 |     df = pd.read_parquet(filename)
24 |     
25 |     df['duration'] = df.dropOff_datetime - df.pickup_datetime
26 |     df['duration'] = df.duration.dt.total_seconds() / 60
27 | 
28 |     df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
29 | 
30 |     df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
31 |     
32 |     return df
33 | 
34 | 
35 | df = read_data(input_file)
36 | df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')
37 | 
38 | 
39 | dicts = df[categorical].to_dict(orient='records')
40 | X_val = dv.transform(dicts)
41 | y_pred = lr.predict(X_val)
42 | 
43 | 
44 | print('predicted mean duration:', y_pred.mean())
45 | 
46 | 
47 | df_result = pd.DataFrame()
48 | df_result['ride_id'] = df['ride_id']
49 | df_result['predicted_duration'] = y_pred
50 | 
51 | 
52 | df_result.to_parquet(
53 |     output_file,
54 |     engine='pyarrow',
55 |     compression=None,
56 |     index=False
57 | )


--------------------------------------------------------------------------------
/cohorts/2022/04-deployment/homework/homework.dockerfile:
--------------------------------------------------------------------------------
 1 | FROM agrigorev/zoomcamp-model:mlops-3.9.7-slim
 2 | 
 3 | RUN pip install -U pip
 4 | RUN pip install pipenv 
 5 | 
 6 | WORKDIR /app
 7 | 
 8 | COPY [ "Pipfile", "Pipfile.lock", "./" ]
 9 | 
10 | RUN pipenv install --system --deploy
11 | 
12 | COPY [ "batch.py", "batch.py" ]
13 | 
14 | ENTRYPOINT [ "python", "batch.py" ]


--------------------------------------------------------------------------------
/cohorts/2022/04-deployment/homework/model.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/04-deployment/homework/model.bin


--------------------------------------------------------------------------------
/cohorts/2022/05-monitoring/homework/docker-compose-homework-solution.yml:
--------------------------------------------------------------------------------
 1 | version: "3.7"
 2 | 
 3 | volumes:
 4 |     mongo_data: {}
 5 | 
 6 | networks:
 7 |   front-tier:
 8 |   back-tier:
 9 | 
10 | services:
11 |   prediction_service:
12 |     build:
13 |       context: prediction_service
14 |       dockerfile: Dockerfile
15 |     depends_on:
16 |       - mongo
17 |     environment:
18 |       MONGO_DATABASE: "prediction_service"
19 |       MONGO_ADDRESS: "mongodb://mongo.:27017/"
20 |       MODEL_VERSION: "2"
21 |       MODEL_FILE: "lin_reg_V2.bin"
22 |     
23 |     ports:
24 |       - 9696:9696
25 |     networks:
26 |       - back-tier
27 |       - front-tier
28 | 
29 |   mongo:
30 |     image: mongo
31 |     ports:
32 |       - 27017:27017
33 |     volumes:
34 |       - mongo_data:/data/db
35 |     networks:
36 |       - back-tier
37 |       - front-tier
38 | 


--------------------------------------------------------------------------------
/cohorts/2022/05-monitoring/homework/docker-compose-homework.yml:
--------------------------------------------------------------------------------
 1 | version: "3.7"
 2 | 
 3 | volumes:
 4 |     mongo_data: {}
 5 | 
 6 | networks:
 7 |   front-tier:
 8 |   back-tier:
 9 | 
10 | services:
11 |   prediction_service:
12 |     build:
13 |       context: prediction_service
14 |       dockerfile: Dockerfile
15 |     depends_on:
16 |       - mongo
17 |     environment:
18 |       MONGO_DATABASE: "prediction_service"
19 |       MONGO_ADDRESS: "mongodb://mongo.:27017/"
20 |       MODEL_VERSION: "1"
21 |       MODEL_FILE: "lin_reg.bin"
22 |     
23 |     ports:
24 |       - 9696:9696
25 |     networks:
26 |       - back-tier
27 |       - front-tier
28 | 
29 |   mongo:
30 |     image: mongo
31 |     ports:
32 |       - 27017:27017
33 |     volumes:
34 |       - mongo_data:/data/db
35 |     networks:
36 |       - back-tier
37 |       - front-tier
38 | 


--------------------------------------------------------------------------------
/cohorts/2022/05-monitoring/homework/model_training.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | 
 3 | import pandas as pd
 4 | import pyarrow.parquet as pq
 5 | from sklearn.feature_extraction import DictVectorizer
 6 | from sklearn.linear_model import LinearRegression
 7 | 
 8 | 
 9 | def read_dataframe(filename):
10 |     df = pq.read_table(filename).to_pandas()
11 | 
12 |     df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
13 |     df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
14 | 
15 |     df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
16 |     df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
17 | 
18 |     df = df[(df.duration >= 1) & (df.duration <= 60)]
19 | 
20 |     categorical = ['PULocationID', 'DOLocationID']
21 |     df[categorical] = df[categorical].astype(str)
22 |     
23 |     return df
24 | 
25 | def add_features(train_data="./datasets/green_tripdata_2021-03.parquet",
26 |                  additional_training_data=None):
27 |     df_train = read_dataframe(train_data)
28 | 
29 |     if additional_training_data:
30 |         extra_data = read_dataframe(additional_training_data)
31 |         df_train = pd.concat([df_train, extra_data], axis=0, ignore_index=True)
32 | 
33 | 
34 | 
35 |     df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
36 | 
37 |     categorical = ['PU_DO'] 
38 |     numerical = ['trip_distance']
39 | 
40 |     dv = DictVectorizer()
41 | 
42 |     train_dicts = df_train[categorical + numerical].to_dict(orient='records')
43 |     X_train = dv.fit_transform(train_dicts)
44 | 
45 |     target = 'duration'
46 |     y_train = df_train[target].values
47 | 
48 |     return X_train, y_train, dv
49 | 
50 | 
51 | 
52 | 
53 | if __name__ == "__main__":
54 |     X_train, y_train, dv = add_features()
55 |     
56 |     print("Training model with one month of data")
57 |     lr = LinearRegression()
58 |     lr.fit(X_train, y_train)
59 | 
60 |     
61 |     with open('prediction_service/lin_reg.bin', 'wb') as f_out:
62 |         pickle.dump((dv, lr), f_out)
63 | 
64 |     X_train, y_train, dv = add_features(additional_training_data="./datasets/green_tripdata_2021-04.parquet")
65 |     print("Training model with two months of data")
66 |     lr = LinearRegression()
67 |     lr.fit(X_train, y_train)
68 | 
69 |     with open('prediction_service/lin_reg_V2.bin', 'wb') as f_out:
70 |         pickle.dump((dv, lr), f_out)
71 | 


--------------------------------------------------------------------------------
/cohorts/2022/05-monitoring/homework/prediction_service/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.8-slim-buster
 2 | 
 3 | RUN pip install -U pip
 4 | RUN pip install pipenv 
 5 | 
 6 | WORKDIR /app
 7 | 
 8 | COPY [ "Pipfile", "Pipfile.lock", "./" ]
 9 | 
10 | RUN pipenv install --system --deploy
11 | 
12 | COPY [ "app.py", "lin_reg.bin", "lin_reg_V2.bin", "./" ]
13 | 
14 | EXPOSE 9696
15 | 
16 | ENTRYPOINT ["gunicorn", "--bind=0.0.0.0:9696", "app:app" ]
17 | 


--------------------------------------------------------------------------------
/cohorts/2022/05-monitoring/homework/prediction_service/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | scikit-learn = "==1.0.2"
 8 | flask = "==2.0.1"
 9 | pandas = "==1.1.5"
10 | evidently = "*"
11 | pymongo = "*"
12 | gunicorn = "*"
13 | 
14 | [dev-packages]
15 | pyarrow = "*"
16 | 
17 | [requires]
18 | python_version = "3.8"
19 | 


--------------------------------------------------------------------------------
/cohorts/2022/05-monitoring/homework/prediction_service/app.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | import pickle
 4 | import uuid
 5 | 
 6 | from flask import Flask, jsonify, request
 7 | from pymongo import MongoClient
 8 | 
 9 | 
10 | MONGO_ADDRESS = os.getenv("MONGO_ADDRESS", "mongodb://localhost:27017/")
11 | MONGO_DATABASE = os.getenv("MONGO_DATABASE", "ride_prediction")
12 | LOGGED_MODEL = os.getenv("MODEL_FILE", "lin_reg.bin")
13 | MODEL_VERSION = os.getenv("MODEL_VERSION", "1")
14 | 
15 | with open(LOGGED_MODEL, 'rb') as f_in:
16 |     dv, model = pickle.load(f_in)
17 | 
18 | 
19 | mongo_client = MongoClient(MONGO_ADDRESS)
20 | mongo_db = mongo_client[MONGO_DATABASE]
21 | mongo_collection = mongo_db.get_collection("data")
22 | 
23 | 
24 | app = Flask("Ride-Prediction-Service")
25 | logging.basicConfig(level=logging.INFO)
26 | 
27 | 
28 | def prepare_features(ride):
29 |     """Function to prepare features before making prediction"""
30 | 
31 |     record = ride.copy()
32 |     record['PU_DO'] = '%s_%s' % (record['PULocationID'], record['DOLocationID'])
33 | 
34 |     features = dv.transform([record])
35 |    
36 |     return features, record
37 | 
38 | 
39 | def save_db(record, pred_result):
40 |     """Save data to mongo db collection"""
41 | 
42 |     rec = record.copy()
43 |     rec["prediction"] = pred_result[0]
44 |     mongo_collection.insert_one(rec)
45 | 
46 | 
47 | 
48 | @app.route("/", methods=["GET"])
49 | def get_info():
50 |     """Function to provide info about the app"""
51 |     info = """<H1>Ride Prediction Service</H1>
52 |               <div class="Data Request"> 
53 |                 <H3>Data Request Example</H3> 
54 |                 <div class="data">
55 |                 <p> "ride = {
56 |                     "PULocationID": 10,
57 |                     "DOLocationID": 50,
58 |                     "trip_distance": 40
59 |                     }"
60 |                 </p>
61 |                 </div>    
62 |                </div>"""
63 |     return info
64 | 
65 | @app.route("/predict-duration", methods=["POST"])
66 | def predict_duration():
67 |     """Function to predict duration"""
68 | 
69 |     ride = request.get_json()
70 |     features, record = prepare_features(ride)
71 | 
72 |     prediction = model.predict(features)
73 |     ride_id = str(uuid.uuid4())
74 |     pred_data = {
75 |             "ride_id": ride_id,
76 |             "PU_DO": record["PU_DO"],
77 |             "trip_distance": record["trip_distance"],
78 |             "status": 200,
79 |             "duration": prediction[0],
80 |             "model_version": MODEL_VERSION
81 |             }
82 | 
83 |     save_db(record, prediction)
84 | 
85 |     result = {
86 |         "statusCode": 200,
87 |         "data" : pred_data
88 |         }
89 | 
90 |     return jsonify(result)
91 | 
92 | 
93 | if __name__ == "__main__":
94 |     app.run(debug=True, host="0.0.0.0", port=9696)
95 | 


--------------------------------------------------------------------------------
/cohorts/2022/05-monitoring/homework/prediction_service/lin_reg.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/05-monitoring/homework/prediction_service/lin_reg.bin


--------------------------------------------------------------------------------
/cohorts/2022/05-monitoring/homework/prediction_service/lin_reg_V2.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/05-monitoring/homework/prediction_service/lin_reg_V2.bin


--------------------------------------------------------------------------------
/cohorts/2022/05-monitoring/homework/prefect-monitoring/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | scikit-learn = "==1.0.2"
 8 | pyarrow = "*"
 9 | prefect = "==2.0b8"
10 | pandas = "*"
11 | pymongo = "*"
12 | psutil = "==5.9.1"
13 | evidently = "*"
14 | 
15 | [dev-packages]
16 | 
17 | [requires]
18 | python_version = "3.8"
19 | 


--------------------------------------------------------------------------------
/cohorts/2022/05-monitoring/homework/prefect-monitoring/clean_mongo.py:
--------------------------------------------------------------------------------
 1 | from pymongo import MongoClient
 2 | 
 3 | MONGO_CLIENT_ADDRESS = "mongodb://localhost:27017/"
 4 | MONGO_DATABASE = "prediction_service"
 5 | 
 6 | 
 7 | if __name__ == "__main__":
 8 |     client = MongoClient(MONGO_CLIENT_ADDRESS)
 9 |     client.drop_database(MONGO_DATABASE)
10 | 


--------------------------------------------------------------------------------
/cohorts/2022/05-monitoring/homework/prefect-monitoring/monitor_profile.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 48,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "from pymongo import MongoClient\n",
10 |     "import pprint"
11 |    ]
12 |   }
13 |  ],
14 |  "metadata": {
15 |   "kernelspec": {
16 |    "display_name": "Python 3.9.12 ('prediction_service_practice-b8Zbdkaa')",
17 |    "language": "python",
18 |    "name": "python3"
19 |   },
20 |   "language_info": {
21 |    "codemirror_mode": {
22 |     "name": "ipython",
23 |     "version": 3
24 |    },
25 |    "file_extension": ".py",
26 |    "mimetype": "text/x-python",
27 |    "name": "python",
28 |    "nbconvert_exporter": "python",
29 |    "pygments_lexer": "ipython3",
30 |    "version": "3.9.12"
31 |   },
32 |   "orig_nbformat": 4,
33 |   "vscode": {
34 |    "interpreter": {
35 |     "hash": "63df8a96dcc14a3f8fc6f13bb4daf95ac616547a440980d0dc62a5d5ed05a07e"
36 |    }
37 |   }
38 |  },
39 |  "nbformat": 4,
40 |  "nbformat_minor": 2
41 | }
42 | 


--------------------------------------------------------------------------------
/cohorts/2022/05-monitoring/homework/prefect-monitoring/monitor_profile_solution.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 1,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "from pymongo import MongoClient\n",
10 |     "import pprint"
11 |    ]
12 |   },
13 |   {
14 |    "cell_type": "code",
15 |    "execution_count": 2,
16 |    "metadata": {},
17 |    "outputs": [],
18 |    "source": [
19 |     "MONGO_CLIENT_ADDRESS = \"mongodb://localhost:27017/\"\n",
20 |     "MONGO_DATABASE = \"prediction_service\"\n",
21 |     "REPORT_COLLECTION = \"report\""
22 |    ]
23 |   },
24 |   {
25 |    "cell_type": "code",
26 |    "execution_count": 3,
27 |    "metadata": {},
28 |    "outputs": [],
29 |    "source": [
30 |     "client = MongoClient()\n",
31 |     "collection = client.get_database(MONGO_DATABASE).get_collection(REPORT_COLLECTION)"
32 |    ]
33 |   },
34 |   {
35 |    "cell_type": "code",
36 |    "execution_count": 4,
37 |    "metadata": {},
38 |    "outputs": [
39 |     {
40 |      "name": "stdout",
41 |      "output_type": "stream",
42 |      "text": [
43 |       "9\n"
44 |      ]
45 |     }
46 |    ],
47 |    "source": [
48 |     "for col in collection.find():\n",
49 |     "    pprint.pprint(len(col['data_drift']['data']['metrics'].keys()))"
50 |    ]
51 |   }
52 |  ],
53 |  "metadata": {
54 |   "kernelspec": {
55 |    "display_name": "Python 3.8.2 ('prefect-monitoring-vrjQsnUO')",
56 |    "language": "python",
57 |    "name": "python3"
58 |   },
59 |   "language_info": {
60 |    "codemirror_mode": {
61 |     "name": "ipython",
62 |     "version": 3
63 |    },
64 |    "file_extension": ".py",
65 |    "mimetype": "text/x-python",
66 |    "name": "python",
67 |    "nbconvert_exporter": "python",
68 |    "pygments_lexer": "ipython3",
69 |    "version": "3.8.2"
70 |   },
71 |   "orig_nbformat": 4,
72 |   "vscode": {
73 |    "interpreter": {
74 |     "hash": "8c4128a542e647ac345fb470a121f5ad37749126bd51dd0e4b0f94b08087470c"
75 |    }
76 |   }
77 |  },
78 |  "nbformat": 4,
79 |  "nbformat_minor": 2
80 | }
81 | 


--------------------------------------------------------------------------------
/cohorts/2022/05-monitoring/homework/prefect-monitoring/prepare_reference_data.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import pyarrow.parquet as pq
 3 | 
 4 | 
 5 | data_files = ["../datasets/green_tripdata_2021-03.parquet", "../datasets/green_tripdata_2021-04.parquet"]
 6 | output_file = "green_tripdata_2021-03to04.parquet"
 7 | 
 8 | df = pd.DataFrame()
 9 | for file in data_files:
10 |     data = pq.read_table(file).to_pandas()
11 |     df = pd.concat([data, df], ignore_index=True)
12 | 
13 | df.to_parquet(
14 |     output_file,
15 |     engine='pyarrow',
16 |     compression=None,
17 |     index=False
18 | )
19 | 


--------------------------------------------------------------------------------
/cohorts/2022/05-monitoring/homework/prefect-monitoring/send_data.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import uuid
 3 | from datetime import datetime
 4 | 
 5 | import pyarrow.parquet as pq
 6 | import requests
 7 | 
 8 | table = pq.read_table("../datasets/green_tripdata_2021-05.parquet")\
 9 |           .to_pandas()\
10 |           .sample(n=5000, random_state=42) #5000 rows sampled
11 | data = table.copy()
12 | 
13 | 
14 | class DateTimeEncoder(json.JSONEncoder):
15 |     def default(self, o):
16 |         if isinstance(o, datetime):
17 |             return o.isoformat()
18 |         return json.JSONEncoder.default(self, o)
19 | 
20 | 
21 | with open("target.csv", 'w') as f_target:
22 |     for index, row in data.iterrows():
23 |         row['id'] = str(uuid.uuid4())
24 |         duration = (row['lpep_dropoff_datetime'] - row['lpep_pickup_datetime']).total_seconds() / 60
25 |         if duration >= 1 and duration <= 60:
26 |             f_target.write(f"{row['id']},{duration}\n")
27 |         resp = requests.post("http://127.0.0.1:9696/predict-duration",
28 |                              headers={"Content-Type": "application/json"},
29 |                              data=row.to_json()).json()
30 |         print(f"prediction: {resp['data']['duration']}")
31 | 


--------------------------------------------------------------------------------
/cohorts/2022/05-monitoring/homework/prepare.py:
--------------------------------------------------------------------------------
 1 | from tqdm import tqdm
 2 | import requests
 3 | 
 4 | files = ["green_tripdata_2021-03.parquet", "green_tripdata_2021-04.parquet", "green_tripdata_2021-05.parquet"]
 5 | path = "./datasets"
 6 | print(f"Download files:")
 7 | for file in files:
 8 | 
 9 |     # Change the url based on what works for you whether s3 or cloudfront
10 |     url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
11 |     resp = requests.get(url, stream=True)
12 |     save_path = f"{path}/{file}"
13 |     with open(save_path, "wb") as handle:
14 |         for data in tqdm(resp.iter_content(),
15 |                          desc=f"{file}",
16 |                          postfix=f"save to {save_path}",
17 |                          total=int(resp.headers["Content-Length"])):
18 |             handle.write(data)
19 | 


--------------------------------------------------------------------------------
/cohorts/2022/05-monitoring/homework/requirements.txt:
--------------------------------------------------------------------------------
 1 | scikit-learn==1.0.2
 2 | dataclasses==0.6
 3 | Flask~=2.0.1
 4 | pandas>=1.1.5
 5 | Werkzeug~=2.0.1
 6 | requests~=2.26.0
 7 | prometheus_client~=0.11.0
 8 | pyyaml~=5.4.1
 9 | tqdm
10 | pyarrow
11 | prefect==2.0b8
12 | pymongo
13 | evidently
14 | pipenv
15 | 


--------------------------------------------------------------------------------
/cohorts/2022/05-monitoring/homework/test.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pprint
 3 | from pymongo import MongoClient
 4 | 
 5 | import requests
 6 | 
 7 | MONGODB_ADDRESS = os.getenv("MONGODB_ADDRESS", "mongodb://127.0.0.1:27017/")
 8 | FLASK_URL = "http://127.0.0.1:9696/predict-duration"
 9 | 
10 | 
11 | mongo_client = MongoClient(MONGODB_ADDRESS)
12 | mongo_db = mongo_client['prediction_service']
13 | mongo_collection = mongo_db['data']
14 | ride_test_data = {
15 |     "PULocationID": 10, 
16 |     "DOLocationID": 50,
17 |     "trip_distance": 40
18 |     }
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     requests.post(url=FLASK_URL ,json=ride_test_data)
23 |     for coll in mongo_collection.find():
24 |         pprint.pprint(coll)
25 |     


--------------------------------------------------------------------------------
/cohorts/2022/06-best-practices/homework/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.9.7-slim
 2 | 
 3 | RUN pip install -U pip
 4 | RUN pip install pipenv 
 5 | 
 6 | WORKDIR /app
 7 | 
 8 | COPY [ "Pipfile", "Pipfile.lock", "./" ]
 9 | 
10 | RUN pipenv install --system --deploy
11 | 
12 | COPY [ "batch.py", "batch.py" ]
13 | COPY [ "model.bin", "model.bin" ]
14 | 
15 | ENTRYPOINT [ "python", "batch.py" ]


--------------------------------------------------------------------------------
/cohorts/2022/06-best-practices/homework/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | scikit-learn = "==1.0.2"
 8 | pandas = "*"
 9 | pyarrow = "*"
10 | s3fs = "*"
11 | 
12 | [dev-packages]
13 | 
14 | [requires]
15 | python_version = "3.9"
16 | 


--------------------------------------------------------------------------------
/cohorts/2022/06-best-practices/homework/batch.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding: utf-8
 3 | 
 4 | import sys
 5 | import pickle
 6 | import pandas as pd
 7 | 
 8 | 
 9 | year = int(sys.argv[1])
10 | month = int(sys.argv[2])
11 | 
12 | input_file = f'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/nyc-tlc/fhv/fhv_tripdata_{year:04d}-{month:02d}.parquet'
13 | output_file = f's3://nyc-duration-prediction-alexey/taxi_type=fhv/year={year:04d}/month={month:02d}/predictions.parquet'
14 | 
15 | 
16 | with open('model.bin', 'rb') as f_in:
17 |     dv, lr = pickle.load(f_in)
18 | 
19 | 
20 | categorical = ['PUlocationID', 'DOlocationID']
21 | 
22 | def read_data(filename):
23 |     df = pd.read_parquet(filename)
24 |     
25 |     df['duration'] = df.dropOff_datetime - df.pickup_datetime
26 |     df['duration'] = df.duration.dt.total_seconds() / 60
27 | 
28 |     df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
29 | 
30 |     df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
31 |     
32 |     return df
33 | 
34 | 
35 | df = read_data(input_file)
36 | df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')
37 | 
38 | 
39 | dicts = df[categorical].to_dict(orient='records')
40 | X_val = dv.transform(dicts)
41 | y_pred = lr.predict(X_val)
42 | 
43 | 
44 | print('predicted mean duration:', y_pred.mean())
45 | 
46 | 
47 | df_result = pd.DataFrame()
48 | df_result['ride_id'] = df['ride_id']
49 | df_result['predicted_duration'] = y_pred
50 | 
51 | df_result.to_parquet(output_file, engine='pyarrow', index=False)


--------------------------------------------------------------------------------
/cohorts/2022/06-best-practices/homework/model.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/06-best-practices/homework/model.bin


--------------------------------------------------------------------------------
/cohorts/2022/06-best-practices/homework_solution/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.9.7-slim
 2 | 
 3 | RUN pip install -U pip
 4 | RUN pip install pipenv 
 5 | 
 6 | WORKDIR /app
 7 | 
 8 | COPY [ "Pipfile", "Pipfile.lock", "./" ]
 9 | 
10 | RUN pipenv install --system --deploy
11 | 
12 | COPY [ "batch.py", "batch.py" ]
13 | COPY [ "model.bin", "model.bin" ]
14 | 
15 | ENTRYPOINT [ "python", "batch.py" ]


--------------------------------------------------------------------------------
/cohorts/2022/06-best-practices/homework_solution/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | scikit-learn = "==1.0.2"
 8 | pandas = "*"
 9 | pyarrow = "*"
10 | s3fs = "*"
11 | 
12 | [dev-packages]
13 | pytest = "*"
14 | 
15 | [requires]
16 | python_version = "3.9"
17 | 


--------------------------------------------------------------------------------
/cohorts/2022/06-best-practices/homework_solution/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | services:
2 |   s3:
3 |     image: localstack/localstack
4 |     ports:
5 |       - "4566:4566"
6 |     environment:
7 |       - SERVICES=s3


--------------------------------------------------------------------------------
/cohorts/2022/06-best-practices/homework_solution/integration_test.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from datetime import datetime
 4 | import pandas as pd
 5 | 
 6 | import batch
 7 | 
 8 | def dt(hour, minute, second=0):
 9 |     return datetime(2021, 1, 1, hour, minute, second)
10 | 
11 | 
12 | S3_ENDPOINT_URL = os.getenv('S3_ENDPOINT_URL')
13 | 
14 | options = {
15 |     'client_kwargs': {
16 |         'endpoint_url': S3_ENDPOINT_URL
17 |     }
18 | }
19 | 
20 | data = [
21 |     (None, None, dt(1, 2), dt(1, 10)),
22 |     (1, 1, dt(1, 2), dt(1, 10)),
23 |     (1, 1, dt(1, 2, 0), dt(1, 2, 50)),
24 |     (1, 1, dt(1, 2, 0), dt(2, 2, 1)),        
25 | ]
26 | 
27 | columns = ['PUlocationID', 'DOlocationID', 'pickup_datetime', 'dropOff_datetime']
28 | df_input = pd.DataFrame(data, columns=columns)
29 | 
30 | 
31 | input_file = batch.get_input_path(2021, 1)
32 | output_file = batch.get_output_path(2021, 1)
33 | 
34 | df_input.to_parquet(
35 |     input_file,
36 |     engine='pyarrow',
37 |     compression=None,
38 |     index=False,
39 |     storage_options=options
40 | )
41 | 
42 | 
43 | os.system('python batch.py 2021 1')
44 | 
45 | 
46 | df_actual = pd.read_parquet(output_file, storage_options=options)
47 | 
48 | 
49 | assert abs(df_actual['predicted_duration'].sum() - 69.28) < 0.1


--------------------------------------------------------------------------------
/cohorts/2022/06-best-practices/homework_solution/integration_test.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | 
 4 | 
 5 | docker-compose up -d
 6 | 
 7 | sleep 5
 8 | 
 9 | export INPUT_FILE_PATTERN="s3://nyc-duration/in/{year:04d}-{month:02d}.parquet"
10 | export OUTPUT_FILE_PATTERN="s3://nyc-duration/out/{year:04d}-{month:02d}.parquet"
11 | export S3_ENDPOINT_URL="http://localhost:4566"
12 | 
13 | 
14 | aws --endpoint-url="${S3_ENDPOINT_URL}" s3 mb s3://nyc-duration
15 | 
16 | pipenv run python integration_test.py
17 | 
18 | ERROR_CODE=$?
19 | 
20 | if [ ${ERROR_CODE} != 0 ]; then
21 |     docker-compose logs
22 |     docker-compose down
23 |     exit ${ERROR_CODE}
24 | fi
25 | 
26 | echo "yay tests work!"
27 | 
28 | docker-compose down


--------------------------------------------------------------------------------
/cohorts/2022/06-best-practices/homework_solution/model.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/06-best-practices/homework_solution/model.bin


--------------------------------------------------------------------------------
/cohorts/2022/06-best-practices/homework_solution/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/06-best-practices/homework_solution/tests/__init__.py


--------------------------------------------------------------------------------
/cohorts/2022/06-best-practices/homework_solution/tests/test_batch.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | 
 3 | import pandas as pd
 4 | 
 5 | import batch
 6 | 
 7 | 
 8 | def dt(hour, minute, second=0):
 9 |     return datetime(2021, 1, 1, hour, minute, second)
10 | 
11 | 
12 | def test_prepare_data():
13 |     data = [
14 |         (None, None, dt(1, 2), dt(1, 10)),
15 |         (1, 1, dt(1, 2), dt(1, 10)),
16 |         (1, 1, dt(1, 2, 0), dt(1, 2, 50)),
17 |         (1, 1, dt(1, 2, 0), dt(2, 2, 1)),        
18 |     ]
19 | 
20 |     categorical = ['PUlocationID', 'DOlocationID']
21 |     columns = ['PUlocationID', 'DOlocationID', 'pickup_datetime', 'dropOff_datetime']
22 |     df = pd.DataFrame(data, columns=columns)
23 | 
24 |     df_actual = batch.prepare_data(df, categorical)
25 | 
26 |     data_expected = [
27 |         ('-1', '-1', 8.0),
28 |         ( '1',  '1', 8.0),
29 |     ]
30 | 
31 |     columns_test = ['PUlocationID', 'DOlocationID', 'duration']
32 |     df_expected = pd.DataFrame(data_expected, columns=columns_test)
33 |     print(df_actual)
34 | 
35 |     assert (df_actual['PUlocationID'] == df_expected['PUlocationID']).all()
36 |     assert (df_actual['DOlocationID'] == df_expected['DOlocationID']).all()
37 |     assert (df_actual['duration'] - df_expected['duration']).abs().sum() < 0.0000001
38 | 
39 | 
40 | 


--------------------------------------------------------------------------------
/cohorts/2022/07-project/README.md:
--------------------------------------------------------------------------------
 1 | ## Course Project
 2 | 
 3 | The goal of this project is to apply everything we learned
 4 | in this course and build an end-to-end machine learning project.
 5 | 
 6 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, your project can't be considered compelete.  
 7 | 
 8 | 
 9 | ### Submitting 
10 | 
11 | 
12 | #### Project Cohort #2
13 | 
14 | Project:
15 | 
16 | * Form: https://forms.gle/aj8LHkY7PrWG9XzW6
17 | * Deadline: 12 September, 23:00 CEST
18 | 
19 | Peer reviewing:
20 | 
21 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vQYTps829bmaN-aaJPiBUc3UwtN3e_llI44DKv-rQDsmVRMS1No7XWQqOyNI4ZbFbIvN351Q-G6edCP/pubhtml) ("project 2" tab)
22 | * Form: https://forms.gle/BeQ2HCohrM3puKf26
23 | * Deadline: 19 September, 23:00 CEST
24 | 
25 | Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vRB5xKkhCyAUVNSNJvxaP94RwgNbYhf3dNf_ctRHhNKvvQQB94YVBn9JRdCTdQb5NGCJdYBtjXP7tP9/pubhtml) ("feedback-02" tab)
26 | 
27 | 
28 | #### Project Cohort #1
29 | 
30 | Project:
31 | 
32 | * Form: https://forms.gle/7UmQkK4BBxqdgMDp9
33 | * Deadline: 22 August, 23:00 CEST
34 | 
35 | Peer reviewing:
36 | 
37 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vQYTps829bmaN-aaJPiBUc3UwtN3e_llI44DKv-rQDsmVRMS1No7XWQqOyNI4ZbFbIvN351Q-G6edCP/pubhtml) ("project 1" tab)
38 | * Form: https://forms.gle/KaBMoYhmfeEFmiWb7
39 | * Deadline: 29 August, 23:00 CEST
40 | 
41 | Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vRB5xKkhCyAUVNSNJvxaP94RwgNbYhf3dNf_ctRHhNKvvQQB94YVBn9JRdCTdQb5NGCJdYBtjXP7tP9/pubhtml) ("feedback-02" tab)
42 | 
43 | 
44 | ### Evaluation criteria
45 | 
46 | See [here](../../../07-project/README.md)
47 | 
48 | 
49 | ### Misc
50 | 
51 | To get the hash for your project, use this function to hash your email:
52 | 
53 | ```python
54 | from hashlib import sha1
55 | 
56 | def compute_hash(email):
57 |     return sha1(email.lower().encode('utf-8')).hexdigest()
58 | ```
59 | 
60 | Or use [this website](http://www.sha1-online.com/). 


--------------------------------------------------------------------------------
/cohorts/2023/01-intro/homework.md:
--------------------------------------------------------------------------------
  1 | ## Homework
  2 | 
  3 | The goal of this homework is to train a simple model for predicting the duration of a ride - similar to what we did in this module.
  4 | 
  5 | 
  6 | ## Q1. Downloading the data
  7 | 
  8 | We'll use [the same NYC taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page),
  9 | but instead of "**Green** Taxi Trip Records", we'll use "**Yellow** Taxi Trip Records".
 10 | 
 11 | Download the data for January and February 2022.
 12 | 
 13 | Read the data for January. How many columns are there?
 14 | 
 15 | * 16
 16 | * 17
 17 | * 18
 18 | * 19
 19 | 
 20 | 
 21 | ## Q2. Computing duration
 22 | 
 23 | Now let's compute the `duration` variable. It should contain the duration of a ride in minutes. 
 24 | 
 25 | What's the standard deviation of the trips duration in January?
 26 | 
 27 | * 41.45
 28 | * 46.45
 29 | * 51.45
 30 | * 56.45
 31 | 
 32 | 
 33 | ## Q3. Dropping outliers
 34 | 
 35 | Next, we need to check the distribution of the `duration` variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).
 36 | 
 37 | What fraction of the records left after you dropped the outliers?
 38 | 
 39 | * 90%
 40 | * 92%
 41 | * 95%
 42 | * 98%
 43 | 
 44 | 
 45 | ## Q4. One-hot encoding
 46 | 
 47 | Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model. 
 48 | 
 49 | * Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will 
 50 |   label encode them)
 51 | * Fit a dictionary vectorizer 
 52 | * Get a feature matrix from it
 53 | 
 54 | What's the dimensionality of this matrix (number of columns)?
 55 | 
 56 | * 2
 57 | * 155
 58 | * 345
 59 | * 515
 60 | * 715
 61 | 
 62 | 
 63 | ## Q5. Training a model
 64 | 
 65 | Now let's use the feature matrix from the previous step to train a model. 
 66 | 
 67 | * Train a plain linear regression model with default parameters 
 68 | * Calculate the RMSE of the model on the training data
 69 | 
 70 | What's the RMSE on train?
 71 | 
 72 | * 6.99
 73 | * 11.99
 74 | * 16.99
 75 | * 21.99
 76 | 
 77 | 
 78 | ## Q6. Evaluating the model
 79 | 
 80 | Now let's apply this model to the validation dataset (February 2022). 
 81 | 
 82 | What's the RMSE on validation?
 83 | 
 84 | * 7.79
 85 | * 12.79
 86 | * 17.79
 87 | * 22.79
 88 | 
 89 | ## Submit the results
 90 | 
 91 | * Submit your results here: https://forms.gle/uYTnWrcsubi2gdGV7
 92 | * You can submit your solution multiple times. In this case, only the last submission will be used
 93 | * If your answer doesn't match options exactly, select the closest one
 94 | 
 95 | 
 96 | ## Deadline
 97 | 
 98 | The deadline for submitting is 23 May 2023 (Tuesday), 23:00 CEST (Berlin time). 
 99 | 
100 | After that, the form will be closed.


--------------------------------------------------------------------------------
/cohorts/2023/02-experiment-tracking/homework-wandb/train.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import click
 4 | 
 5 | import wandb
 6 | 
 7 | from sklearn.ensemble import RandomForestRegressor
 8 | from sklearn.metrics import mean_squared_error
 9 | 
10 | 
11 | def load_pickle(filename: str):
12 |     with open(filename, "rb") as f_in:
13 |         return pickle.load(f_in)
14 | 
15 | 
16 | @click.command()
17 | @click.option("--wandb_project", help="Name of Weights & Biases project")
18 | @click.option("--wandb_entity", help="Name of Weights & Biases entity")
19 | @click.option(
20 |     "--data_artifact",
21 |     help="Address of the Weights & Biases artifact holding the preprocessed data",
22 | )
23 | @click.option("--random_state", default=0, help="Random state")
24 | @click.option("--max_depth", default=10, help="Max tree depth")
25 | def run_train(
26 |     wandb_project: str,
27 |     wandb_entity: str,
28 |     data_artifact: str,
29 |     max_depth: int,
30 |     random_state: int,
31 | ):
32 |     # Initialize a Weights & Biases run
33 |     wandb.init(
34 |         project=wandb_project,
35 |         entity=wandb_entity,
36 |         job_type="train",
37 |         config={"max_depth": max_depth, "random_state": random_state},
38 |     )
39 | 
40 |     # Fetch the preprocessed dataset from artifacts
41 |     artifact = wandb.use_artifact(data_artifact, type="preprocessed_dataset")
42 |     data_path = artifact.download()
43 | 
44 |     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
45 |     X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
46 | 
47 |     # Define the XGBoost Regressor Mode, train the model and perform prediction
48 |     rf = RandomForestRegressor(max_depth=max_depth, random_state=random_state)
49 |     rf.fit(X_train, y_train)
50 |     y_pred = rf.predict(X_val)
51 | 
52 |     mse = mean_squared_error(y_val, y_pred, squared=False)
53 |     # TODO: Log `mse` to Weights & Biases under the key `"MSE"`
54 | 
55 |     with open("regressor.pkl", "wb") as f:
56 |         pickle.dump(rf, f)
57 | 
58 |     # TODO: Log `regressor.pkl` as an artifact of type `model`
59 | 
60 | 
61 | if __name__ == "__main__":
62 |     run_train()
63 | 


--------------------------------------------------------------------------------
/cohorts/2023/02-experiment-tracking/homework/hpo.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import click
 4 | import mlflow
 5 | import optuna
 6 | 
 7 | from optuna.samplers import TPESampler
 8 | from sklearn.ensemble import RandomForestRegressor
 9 | from sklearn.metrics import mean_squared_error
10 | 
11 | mlflow.set_tracking_uri("http://127.0.0.1:5000")
12 | mlflow.set_experiment("random-forest-hyperopt")
13 | 
14 | 
15 | def load_pickle(filename):
16 |     with open(filename, "rb") as f_in:
17 |         return pickle.load(f_in)
18 | 
19 | 
20 | @click.command()
21 | @click.option(
22 |     "--data_path",
23 |     default="./output",
24 |     help="Location where the processed NYC taxi trip data was saved"
25 | )
26 | @click.option(
27 |     "--num_trials",
28 |     default=10,
29 |     help="The number of parameter evaluations for the optimizer to explore"
30 | )
31 | def run_optimization(data_path: str, num_trials: int):
32 | 
33 |     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
34 |     X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
35 | 
36 |     def objective(trial):
37 |         params = {
38 |             'n_estimators': trial.suggest_int('n_estimators', 10, 50, 1),
39 |             'max_depth': trial.suggest_int('max_depth', 1, 20, 1),
40 |             'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, 1),
41 |             'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, 1),
42 |             'random_state': 42,
43 |             'n_jobs': -1
44 |         }
45 | 
46 |         rf = RandomForestRegressor(**params)
47 |         rf.fit(X_train, y_train)
48 |         y_pred = rf.predict(X_val)
49 |         rmse = mean_squared_error(y_val, y_pred, squared=False)
50 | 
51 |         return rmse
52 | 
53 |     sampler = TPESampler(seed=42)
54 |     study = optuna.create_study(direction="minimize", sampler=sampler)
55 |     study.optimize(objective, n_trials=num_trials)
56 | 
57 | 
58 | if __name__ == '__main__':
59 |     run_optimization()
60 | 


--------------------------------------------------------------------------------
/cohorts/2023/02-experiment-tracking/homework/preprocess_data.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import click
 4 | import pandas as pd
 5 | 
 6 | from sklearn.feature_extraction import DictVectorizer
 7 | 
 8 | 
 9 | def dump_pickle(obj, filename: str):
10 |     with open(filename, "wb") as f_out:
11 |         return pickle.dump(obj, f_out)
12 | 
13 | 
14 | def read_dataframe(filename: str):
15 |     df = pd.read_parquet(filename)
16 | 
17 |     df['duration'] = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
18 |     df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
19 |     df = df[(df.duration >= 1) & (df.duration <= 60)]
20 | 
21 |     categorical = ['PULocationID', 'DOLocationID']
22 |     df[categorical] = df[categorical].astype(str)
23 | 
24 |     return df
25 | 
26 | 
27 | def preprocess(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False):
28 |     df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
29 |     categorical = ['PU_DO']
30 |     numerical = ['trip_distance']
31 |     dicts = df[categorical + numerical].to_dict(orient='records')
32 |     if fit_dv:
33 |         X = dv.fit_transform(dicts)
34 |     else:
35 |         X = dv.transform(dicts)
36 |     return X, dv
37 | 
38 | 
39 | @click.command()
40 | @click.option(
41 |     "--raw_data_path",
42 |     help="Location where the raw NYC taxi trip data was saved"
43 | )
44 | @click.option(
45 |     "--dest_path",
46 |     help="Location where the resulting files will be saved"
47 | )
48 | def run_data_prep(raw_data_path: str, dest_path: str, dataset: str = "green"):
49 |     # Load parquet files
50 |     df_train = read_dataframe(
51 |         os.path.join(raw_data_path, f"{dataset}_tripdata_2022-01.parquet")
52 |     )
53 |     df_val = read_dataframe(
54 |         os.path.join(raw_data_path, f"{dataset}_tripdata_2022-02.parquet")
55 |     )
56 |     df_test = read_dataframe(
57 |         os.path.join(raw_data_path, f"{dataset}_tripdata_2022-03.parquet")
58 |     )
59 | 
60 |     # Extract the target
61 |     target = 'tip_amount'
62 |     y_train = df_train[target].values
63 |     y_val = df_val[target].values
64 |     y_test = df_test[target].values
65 | 
66 |     # Fit the DictVectorizer and preprocess data
67 |     dv = DictVectorizer()
68 |     X_train, dv = preprocess(df_train, dv, fit_dv=True)
69 |     X_val, _ = preprocess(df_val, dv, fit_dv=False)
70 |     X_test, _ = preprocess(df_test, dv, fit_dv=False)
71 | 
72 |     # Create dest_path folder unless it already exists
73 |     os.makedirs(dest_path, exist_ok=True)
74 | 
75 |     # Save DictVectorizer and datasets
76 |     dump_pickle(dv, os.path.join(dest_path, "dv.pkl"))
77 |     dump_pickle((X_train, y_train), os.path.join(dest_path, "train.pkl"))
78 |     dump_pickle((X_val, y_val), os.path.join(dest_path, "val.pkl"))
79 |     dump_pickle((X_test, y_test), os.path.join(dest_path, "test.pkl"))
80 | 
81 | 
82 | if __name__ == '__main__':
83 |     run_data_prep()
84 | 


--------------------------------------------------------------------------------
/cohorts/2023/02-experiment-tracking/homework/register_model.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import click
 4 | import mlflow
 5 | 
 6 | from mlflow.entities import ViewType
 7 | from mlflow.tracking import MlflowClient
 8 | from sklearn.ensemble import RandomForestRegressor
 9 | from sklearn.metrics import mean_squared_error
10 | 
11 | HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
12 | EXPERIMENT_NAME = "random-forest-best-models"
13 | RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state', 'n_jobs']
14 | 
15 | mlflow.set_tracking_uri("http://127.0.0.1:5000")
16 | mlflow.set_experiment(EXPERIMENT_NAME)
17 | mlflow.sklearn.autolog()
18 | 
19 | 
20 | def load_pickle(filename):
21 |     with open(filename, "rb") as f_in:
22 |         return pickle.load(f_in)
23 | 
24 | 
25 | def train_and_log_model(data_path, params):
26 |     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
27 |     X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
28 |     X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))
29 | 
30 |     with mlflow.start_run():
31 |         for param in RF_PARAMS:
32 |             params[param] = int(params[param])
33 | 
34 |         rf = RandomForestRegressor(**params)
35 |         rf.fit(X_train, y_train)
36 | 
37 |         # Evaluate model on the validation and test sets
38 |         val_rmse = mean_squared_error(y_val, rf.predict(X_val), squared=False)
39 |         mlflow.log_metric("val_rmse", val_rmse)
40 |         test_rmse = mean_squared_error(y_test, rf.predict(X_test), squared=False)
41 |         mlflow.log_metric("test_rmse", test_rmse)
42 | 
43 | 
44 | @click.command()
45 | @click.option(
46 |     "--data_path",
47 |     default="./output",
48 |     help="Location where the processed NYC taxi trip data was saved"
49 | )
50 | @click.option(
51 |     "--top_n",
52 |     default=5,
53 |     type=int,
54 |     help="Number of top models that need to be evaluated to decide which one to promote"
55 | )
56 | def run_register_model(data_path: str, top_n: int):
57 | 
58 |     client = MlflowClient()
59 | 
60 |     # Retrieve the top_n model runs and log the models
61 |     experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
62 |     runs = client.search_runs(
63 |         experiment_ids=experiment.experiment_id,
64 |         run_view_type=ViewType.ACTIVE_ONLY,
65 |         max_results=top_n,
66 |         order_by=["metrics.rmse ASC"]
67 |     )
68 |     for run in runs:
69 |         train_and_log_model(data_path=data_path, params=run.data.params)
70 | 
71 |     # Select the model with the lowest test RMSE
72 |     experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
73 |     # best_run = client.search_runs( ...  )[0]
74 | 
75 |     # Register the best model
76 |     # mlflow.register_model( ... )
77 | 
78 | 
79 | if __name__ == '__main__':
80 |     run_register_model()
81 | 


--------------------------------------------------------------------------------
/cohorts/2023/02-experiment-tracking/homework/train.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import click
 4 | 
 5 | from sklearn.ensemble import RandomForestRegressor
 6 | from sklearn.metrics import mean_squared_error
 7 | 
 8 | 
 9 | def load_pickle(filename: str):
10 |     with open(filename, "rb") as f_in:
11 |         return pickle.load(f_in)
12 | 
13 | 
14 | @click.command()
15 | @click.option(
16 |     "--data_path",
17 |     default="./output",
18 |     help="Location where the processed NYC taxi trip data was saved"
19 | )
20 | def run_train(data_path: str):
21 | 
22 |     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
23 |     X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
24 | 
25 |     rf = RandomForestRegressor(max_depth=10, random_state=0)
26 |     rf.fit(X_train, y_train)
27 |     y_pred = rf.predict(X_val)
28 | 
29 |     rmse = mean_squared_error(y_val, y_pred, squared=False)
30 | 
31 | 
32 | if __name__ == '__main__':
33 |     run_train()
34 | 


--------------------------------------------------------------------------------
/cohorts/2023/02-experiment-tracking/solution-mlflow/hpo.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import click
 4 | import mlflow
 5 | import optuna
 6 | 
 7 | from optuna.samplers import TPESampler
 8 | from sklearn.ensemble import RandomForestRegressor
 9 | from sklearn.metrics import mean_squared_error
10 | 
11 | mlflow.set_tracking_uri("http://127.0.0.1:5000")
12 | mlflow.set_experiment("random-forest-hyperopt")
13 | 
14 | 
15 | def load_pickle(filename):
16 |     with open(filename, "rb") as f_in:
17 |         return pickle.load(f_in)
18 | 
19 | 
20 | @click.command()
21 | @click.option(
22 |     "--data_path",
23 |     default="./output",
24 |     help="Location where the processed NYC taxi trip data was saved"
25 | )
26 | @click.option(
27 |     "--num_trials",
28 |     default=10,
29 |     help="The number of parameter evaluations for the optimizer to explore"
30 | )
31 | def run_optimization(data_path: str, num_trials: int):
32 |     mlflow.sklearn.autolog(disable=True)
33 | 
34 |     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
35 |     X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
36 | 
37 |     def objective(trial):
38 |         params = {
39 |             'n_estimators': trial.suggest_int('n_estimators', 10, 50, 1),
40 |             'max_depth': trial.suggest_int('max_depth', 1, 20, 1),
41 |             'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, 1),
42 |             'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, 1),
43 |             'random_state': 42,
44 |             'n_jobs': -1
45 |         }
46 |         with mlflow.start_run():
47 |             mlflow.log_params(params)
48 |             rf = RandomForestRegressor(**params)
49 |             rf.fit(X_train, y_train)
50 |             y_pred = rf.predict(X_val)
51 |             rmse = mean_squared_error(y_val, y_pred, squared=False)
52 |             mlflow.log_metric("rmse", rmse)
53 | 
54 |         return rmse
55 | 
56 |     sampler = TPESampler(seed=42)
57 |     study = optuna.create_study(direction="minimize", sampler=sampler)
58 |     study.optimize(objective, n_trials=num_trials)
59 | 
60 | 
61 | if __name__ == '__main__':
62 |     run_optimization()
63 | 


--------------------------------------------------------------------------------
/cohorts/2023/02-experiment-tracking/solution-mlflow/train.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import click
 4 | import mlflow
 5 | 
 6 | from sklearn.ensemble import RandomForestRegressor
 7 | from sklearn.metrics import mean_squared_error
 8 | 
 9 | 
10 | mlflow.set_tracking_uri("sqlite:///mlflow.db")
11 | mlflow.set_experiment("random-forest-train")
12 | 
13 | 
14 | def load_pickle(filename: str):
15 |     with open(filename, "rb") as f_in:
16 |         return pickle.load(f_in)
17 | 
18 | 
19 | @click.command()
20 | @click.option(
21 |     "--data_path",
22 |     default="./output",
23 |     help="Location where the processed NYC taxi trip data was saved"
24 | )
25 | def run_train(data_path: str):
26 |     mlflow.sklearn.autolog()
27 |     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
28 |     X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
29 | 
30 |     with mlflow.start_run():
31 | 
32 |         rf = RandomForestRegressor(max_depth=10, random_state=0)
33 |         rf.fit(X_train, y_train)
34 |         y_pred = rf.predict(X_val)
35 | 
36 |         rmse = mean_squared_error(y_val, y_pred, squared=False)
37 | 
38 | 
39 | if __name__ == '__main__':
40 |     run_train()
41 | 


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | .ipynb_checkpoints
3 | models/*
4 | mlruns/*
5 | .vscode/
6 | ./DS_Store
7 | *.db
8 | *.DS_Store


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/3.2/cat_dog_facts.py:
--------------------------------------------------------------------------------
 1 | import httpx
 2 | from prefect import flow
 3 | 
 4 | @flow
 5 | def fetch_cat_fact():
 6 |     '''A flow that gets a cat fact'''
 7 |     return httpx.get("https://catfact.ninja/fact?max_length=140").json()["fact"]
 8 | 
 9 | @flow
10 | def fetch_dog_fact():
11 |     '''A flow that gets a dog fact'''
12 |     return httpx.get(
13 |         "https://dogapi.dog/api/v2/facts",
14 |         headers={"accept": "application/json"},
15 |     ).json()["data"][0]["attributes"]["body"]
16 | 
17 | @flow(log_prints=True)
18 | def animal_facts():
19 |     cat_fact = fetch_cat_fact()
20 |     dog_fact = fetch_dog_fact()
21 |     print(f"🐱: {cat_fact} \n🐶: {dog_fact}")
22 | 
23 | if __name__ == "__main__":
24 |     animal_facts()


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/3.2/cat_facts.py:
--------------------------------------------------------------------------------
 1 | import httpx
 2 | from prefect import flow, task
 3 | 
 4 | 
 5 | @task(retries=4, retry_delay_seconds=0.1, log_prints=True)
 6 | def fetch_cat_fact():
 7 |     cat_fact = httpx.get("https://f3-vyx5c2hfpq-ue.a.run.app/")
 8 |     #An endpoint that is designed to fail sporadically
 9 |     if cat_fact.status_code >= 400:
10 |         raise Exception()
11 |     print(cat_fact.text)
12 | 
13 | 
14 | @flow
15 | def fetch():
16 |     fetch_cat_fact()
17 | 
18 | 
19 | if __name__ == "__main__":
20 |     fetch()


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/3.5/create_s3_bucket_block.py:
--------------------------------------------------------------------------------
 1 | from time import sleep
 2 | from prefect_aws import S3Bucket, AwsCredentials
 3 | 
 4 | 
 5 | def create_aws_creds_block():
 6 |     my_aws_creds_obj = AwsCredentials(
 7 |         aws_access_key_id="123abc", aws_secret_access_key="abc123"
 8 |     )
 9 |     my_aws_creds_obj.save(name="my-aws-creds", overwrite=True)
10 | 
11 | 
12 | def create_s3_bucket_block():
13 |     aws_creds = AwsCredentials.load("my-aws-creds")
14 |     my_s3_bucket_obj = S3Bucket(
15 |         bucket_name="my-first-bucket-abc", credentials=aws_creds
16 |     )
17 |     my_s3_bucket_obj.save(name="s3-bucket-example", overwrite=True)
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     create_aws_creds_block()
22 |     sleep(5)
23 |     create_s3_bucket_block()
24 | 


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/3.6/create_s3_bucket_block.py:
--------------------------------------------------------------------------------
 1 | from time import sleep
 2 | from prefect_aws import S3Bucket, AwsCredentials
 3 | 
 4 | 
 5 | def create_aws_creds_block():
 6 |     my_aws_creds_obj = AwsCredentials(
 7 |         aws_access_key_id="123abc", aws_secret_access_key="abc123"
 8 |     )
 9 |     my_aws_creds_obj.save(name="my-aws-creds", overwrite=True)
10 | 
11 | 
12 | def create_s3_bucket_block():
13 |     aws_creds = AwsCredentials.load("my-aws-creds")
14 |     my_s3_bucket_obj = S3Bucket(
15 |         bucket_name="my-first-bucket-abc", credentials=aws_creds
16 |     )
17 |     my_s3_bucket_obj.save(name="s3-bucket-example", overwrite=True)
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     create_aws_creds_block()
22 |     sleep(5)
23 |     create_s3_bucket_block()
24 | 


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/images/Activity-create-run-deployment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/Activity-create-run-deployment.png


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-01.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-01.jpg


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-01.png


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-02.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-02.jpg


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-03.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-03.jpg


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-03.png


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-04.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-04.jpg


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-04.png


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-05.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-05.jpg


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-05.png


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-06.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-06.jpg


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-06.png


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/meta.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "module": {
 3 |     "number": 3,
 4 |     "title": "Orchestration and ML Pipelines"
 5 |   },
 6 |   "units": [
 7 |     {
 8 |       "number": 1,
 9 |       "title": "Introdution to Workflow Orchestration",
10 |       "youtube": "https://www.youtube.com/watch?v=Cqb7wyaNF08"
11 |     },
12 |     {
13 |       "number": 2,
14 |       "title": "Introduction to Prefect",
15 |       "youtube": "https://www.youtube.com/watch?v=rTUBTvXvXvM"
16 |     },
17 |     {
18 |       "number": 3,
19 |       "title": "Prefect Workflow",
20 |       "youtube": "https://www.youtube.com/watch?v=x3bV8yMKjtc"
21 |     },
22 |     {
23 |       "number": 4,
24 |       "title": "Deploying Your Workflow",
25 |       "youtube": "https://www.youtube.com/watch?v=3YjagezFhOo"
26 |     },
27 |     {
28 |       "number": 5,
29 |       "title": "Working with Deployments",
30 |       "youtube": "https://www.youtube.com/watch?v=jVmaaqs63O8"
31 |     },
32 |     {
33 |       "number": 6,
34 |       "title": "Prefect Cloud (optional)",
35 |       "youtube": "https://www.youtube.com/watch?v=y89Ww85EUdo"
36 |     },
37 |     {
38 |       "number": 7,
39 |       "title": "Homework",
40 |       "youtube": ""
41 |     }
42 |   ]
43 | }


--------------------------------------------------------------------------------
/cohorts/2023/03-orchestration/prefect/requirements.txt:
--------------------------------------------------------------------------------
 1 | black==23.3.0
 2 | fastparquet==2023.4.0
 3 | hyperopt==0.2.7
 4 | mlflow==2.3.1
 5 | pandas==2.0.1
 6 | prefect==2.10.8
 7 | prefect-aws==0.3.1
 8 | scikit_learn==1.2.2
 9 | seaborn==0.12.2
10 | xgboost==1.7.5
11 | orjson==3.8.1


--------------------------------------------------------------------------------
/cohorts/2023/04-deployment/homework/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10.0-slim
2 | 
3 | WORKDIR /app
4 | COPY [ "model2.bin", "model.bin" ]
5 |     


--------------------------------------------------------------------------------
/cohorts/2023/04-deployment/homework/model.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/04-deployment/homework/model.bin


--------------------------------------------------------------------------------
/cohorts/2023/04-deployment/homework/starter.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "2c51efaa",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "!pip freeze | grep scikit-learn"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 1,
 16 |    "id": "0ef880a0",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import pickle\n",
 21 |     "import pandas as pd"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 2,
 27 |    "id": "7836ccfd",
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "with open('model.bin', 'rb') as f_in:\n",
 32 |     "    dv, model = pickle.load(f_in)"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": 3,
 38 |    "id": "41c08294",
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "categorical = ['PULocationID', 'DOLocationID']\n",
 43 |     "\n",
 44 |     "def read_data(filename):\n",
 45 |     "    df = pd.read_parquet(filename)\n",
 46 |     "    \n",
 47 |     "    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime\n",
 48 |     "    df['duration'] = df.duration.dt.total_seconds() / 60\n",
 49 |     "\n",
 50 |     "    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()\n",
 51 |     "\n",
 52 |     "    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')\n",
 53 |     "    \n",
 54 |     "    return df"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 4,
 60 |    "id": "4854399a",
 61 |    "metadata": {},
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_????-??.parquet')"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 5,
 70 |    "id": "669fda0a",
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "dicts = df[categorical].to_dict(orient='records')\n",
 75 |     "X_val = dv.transform(dicts)\n",
 76 |     "y_pred = model.predict(X_val)"
 77 |    ]
 78 |   }
 79 |  ],
 80 |  "metadata": {
 81 |   "kernelspec": {
 82 |    "display_name": "Python 3 (ipykernel)",
 83 |    "language": "python",
 84 |    "name": "python3"
 85 |   },
 86 |   "language_info": {
 87 |    "codemirror_mode": {
 88 |     "name": "ipython",
 89 |     "version": 3
 90 |    },
 91 |    "file_extension": ".py",
 92 |    "mimetype": "text/x-python",
 93 |    "name": "python",
 94 |    "nbconvert_exporter": "python",
 95 |    "pygments_lexer": "ipython3",
 96 |    "version": "3.10.0"
 97 |   }
 98 |  },
 99 |  "nbformat": 4,
100 |  "nbformat_minor": 5
101 | }
102 | 


--------------------------------------------------------------------------------
/cohorts/2023/06-best-practices/homework/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.10.0-slim
 2 | 
 3 | RUN pip install -U pip & pip install pipenv
 4 | 
 5 | COPY [ "Pipfile", "Pipfile.lock", "./" ]
 6 | 
 7 | RUN pipenv install --system --deploy
 8 | 
 9 | COPY [ "batch.py", "batch.py" ]
10 | COPY [ "model.bin", "model.bin" ]
11 | 
12 | ENTRYPOINT [ "python", "batch.py" ]


--------------------------------------------------------------------------------
/cohorts/2023/06-best-practices/homework/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | scikit-learn = "==1.2.2"
 8 | pandas = "*"
 9 | pyarrow = "*"
10 | s3fs = "*"
11 | 
12 | [dev-packages]
13 | 
14 | [requires]
15 | python_version = "3.10"
16 | 


--------------------------------------------------------------------------------
/cohorts/2023/06-best-practices/homework/batch.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding: utf-8
 3 | 
 4 | import sys
 5 | import pickle
 6 | import pandas as pd
 7 | 
 8 | 
 9 | year = int(sys.argv[1])
10 | month = int(sys.argv[2])
11 | 
12 | input_file = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet'
13 | output_file = f'output/yellow_tripdata_{year:04d}-{month:02d}.parquet'
14 | 
15 | 
16 | with open('model.bin', 'rb') as f_in:
17 |     dv, lr = pickle.load(f_in)
18 | 
19 | 
20 | categorical = ['PULocationID', 'DOLocationID']
21 | 
22 | def read_data(filename):
23 |     df = pd.read_parquet(filename)
24 |     
25 |     df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
26 |     df['duration'] = df.duration.dt.total_seconds() / 60
27 | 
28 |     df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
29 | 
30 |     df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
31 |     
32 |     return df
33 | 
34 | 
35 | df = read_data(input_file)
36 | df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')
37 | 
38 | 
39 | dicts = df[categorical].to_dict(orient='records')
40 | X_val = dv.transform(dicts)
41 | y_pred = lr.predict(X_val)
42 | 
43 | 
44 | print('predicted mean duration:', y_pred.mean())
45 | 
46 | 
47 | df_result = pd.DataFrame()
48 | df_result['ride_id'] = df['ride_id']
49 | df_result['predicted_duration'] = y_pred
50 | 
51 | 
52 | df_result.to_parquet(output_file, engine='pyarrow', index=False)
53 | 


--------------------------------------------------------------------------------
/cohorts/2023/06-best-practices/homework/model.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/06-best-practices/homework/model.bin


--------------------------------------------------------------------------------
/cohorts/2023/06-best-practices/homework_solution/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | scikit-learn = "==1.2.2"
 8 | pandas = "*"
 9 | pyarrow = "*"
10 | s3fs = "*"
11 | 
12 | [dev-packages]
13 | pytest = "*"
14 | 
15 | [requires]
16 | python_version = "3.10"
17 | 


--------------------------------------------------------------------------------
/cohorts/2023/06-best-practices/homework_solution/docker-compose.yaml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   localstack:
 3 |     image: localstack/localstack
 4 |     ports:
 5 |       - "4566:4566"
 6 |     environment:
 7 |       - SERVICES=s3
 8 |       - AWS_DEFAULT_REGION=eu-west-1
 9 |       - AWS_ACCESS_KEY_ID=abc
10 |       - AWS_SECRET_ACCESS_KEY=xyz


--------------------------------------------------------------------------------
/cohorts/2023/06-best-practices/homework_solution/integration_test.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pandas as pd
 3 | from datetime import datetime
 4 | 
 5 | import batch
 6 | 
 7 | 
 8 | def dt(hour, minute, second=0):
 9 |     return datetime(2022, 1, 1, hour, minute, second)
10 | 
11 | 
12 | S3_ENDPOINT_URL = os.getenv('S3_ENDPOINT_URL')
13 | 
14 | options = {
15 |     'client_kwargs': {
16 |         'endpoint_url': S3_ENDPOINT_URL
17 |     }
18 | }
19 | 
20 | data = [
21 |     (None, None, dt(1, 2), dt(1, 10)),
22 |     (1, None, dt(1, 2), dt(1, 10)),
23 |     (1, 2, dt(2, 2), dt(2, 3)),
24 |     (None, 1, dt(1, 2, 0), dt(1, 2, 50)),
25 |     (2, 3, dt(1, 2, 0), dt(1, 2, 59)),
26 |     (3, 4, dt(1, 2, 0), dt(2, 2, 1)),
27 | ]
28 | 
29 | columns = ['PULocationID', 'DOLocationID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime']
30 | df_input = pd.DataFrame(data, columns=columns)
31 | 
32 | 
33 | input_file = batch.get_input_path(2022, 1)
34 | output_file = batch.get_output_path(2022, 1)
35 | 
36 | df_input.to_parquet(
37 |     input_file,
38 |     engine='pyarrow',
39 |     compression=None,
40 |     index=False,
41 |     storage_options=options
42 | )
43 | 
44 | 
45 | os.system('python batch.py 2022 1')
46 | 
47 | 
48 | df_actual = pd.read_parquet(output_file, storage_options=options)
49 | print(df_actual['predicted_duration'].sum())
50 | 
51 | assert abs(df_actual['predicted_duration'].sum() - 31.51) < 0.1


--------------------------------------------------------------------------------
/cohorts/2023/06-best-practices/homework_solution/model.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/06-best-practices/homework_solution/model.bin


--------------------------------------------------------------------------------
/cohorts/2023/06-best-practices/homework_solution/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/06-best-practices/homework_solution/tests/__init__.py


--------------------------------------------------------------------------------
/cohorts/2023/06-best-practices/homework_solution/tests/test_batch.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from datetime import datetime
 3 | 
 4 | from batch import prepare_data
 5 | 
 6 | 
 7 | def dt(hour, minute, second=0):
 8 |     return datetime(2022, 1, 1, hour, minute, second)
 9 | 
10 | 
11 | def test_prepare_data():
12 |     data = [
13 |         (None, None, dt(1, 2), dt(1, 10)),
14 |         (1, None, dt(1, 2), dt(1, 10)),
15 |         (1, 2, dt(2, 2), dt(2, 3)),
16 |         (None, 1, dt(1, 2, 0), dt(1, 2, 50)),
17 |         (2, 3, dt(1, 2, 0), dt(1, 2, 59)),
18 |         (3, 4, dt(1, 2, 0), dt(2, 2, 1)),
19 |     ]
20 | 
21 |     categorical = ['PULocationID', 'DOLocationID']
22 |     columns = ['PULocationID', 'DOLocationID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime']
23 |     df = pd.DataFrame(data, columns=columns)
24 | 
25 |     df_actual = prepare_data(df, categorical)
26 | 
27 |     data_expected = [
28 |         ('-1', '-1', 8.0),
29 |         ('1',  '-1', 8.0),
30 |         ('1', '2', 1.0),
31 |     ]
32 | 
33 |     columns_test = ['PULocationID', 'DOLocationID', 'duration']
34 |     df_expected = pd.DataFrame(data_expected, columns=columns_test)
35 |     print(df_actual)
36 | 
37 |     assert (df_actual['PULocationID'] == df_expected['PULocationID']).all()
38 |     assert (df_actual['DOLocationID'] == df_expected['DOLocationID']).all()
39 |     assert (df_actual['duration'] - df_expected['duration']).abs().sum() < 0.0000001
40 | 
41 | 
42 | 


--------------------------------------------------------------------------------
/cohorts/2023/07-project/README.md:
--------------------------------------------------------------------------------
 1 | ## Course Project
 2 | 
 3 | The goal of this project is to apply everything we learned
 4 | in this course and build an end-to-end machine learning project.
 5 | 
 6 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, your project can't be considered compelete.  
 7 | 
 8 | 
 9 | ### Submitting 
10 | 
11 | #### Project Cohort #1
12 | 
13 | Project:
14 | 
15 | * Form: https://forms.gle/mRRoDtqDXBytvsoD9
16 | * Deadline: 31 July, 23:00 CEST
17 | 
18 | Peer reviewing:
19 | 
20 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vTAztxXsLidZV8I18gL9_qtJpxhyvyhJsEeXrP3kFyZoauGgR-S4p6b7H5yJ9kdTbUE5wAAvZgTTZ49/pubhtml?gid=0&single=true) ("project 1" tab)
21 | * Form: https://forms.gle/MRMHDuFiP6DFShaj7
22 | * Deadline: 10 August, 23:00 CEST
23 | 
24 | Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vTS8Mlu6sWyu6JinFUftUl6OB5mxXlwGT2icIyQCSbhDDmW36WWyAbv2dCFJhng6Nln0o3cwvTchjcU/pubhtml?gid=0&single=true) ("feedback-01" tab)
25 | 
26 | 
27 | 
28 | #### Project Cohort #2
29 | 
30 | Project:
31 | 
32 | * Form: https://forms.gle/o1s3NmYE4UmFSMVD7
33 | * Deadline: 21 August (Monday), 23:00 CEST
34 | 
35 | Peer reviewing:
36 | 
37 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vTAztxXsLidZV8I18gL9_qtJpxhyvyhJsEeXrP3kFyZoauGgR-S4p6b7H5yJ9kdTbUE5wAAvZgTTZ49/pubhtml?gid=1942033009&single=true) ("project 2" tab)
38 | * Form: https://forms.gle/R4Y58WSxGDWsDBFv7
39 | * Deadline: 29 August (Tuesday), 23:00 CEST
40 | 
41 | Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vTS8Mlu6sWyu6JinFUftUl6OB5mxXlwGT2icIyQCSbhDDmW36WWyAbv2dCFJhng6Nln0o3cwvTchjcU/pubhtml?gid=546664034&single=true) ("feedback-02" tab)
42 | 
43 | 
44 | 
45 | ### Evaluation criteria
46 | 
47 | See [here](../../../07-project/README.md)
48 | 
49 | 
50 | ### Misc
51 | 
52 | To get the hash for your project, use this function to hash your email:
53 | 
54 | ```python
55 | from hashlib import sha1
56 | 
57 | def compute_hash(email):
58 |     return sha1(email.lower().encode('utf-8')).hexdigest()
59 | ```
60 | 
61 | Or use [this website](http://www.sha1-online.com/). 
62 | 


--------------------------------------------------------------------------------
/cohorts/2023/README.md:
--------------------------------------------------------------------------------
 1 | ## MLOps Zoomcamp 2023 Cohort
 2 | 
 3 | * [Pre-Course Live Q&A](https://www.youtube.com/watch?v=o34Q_61iA4Y&list=PL3MmuxUbc_hKqamJqQ7Ew8HxptJYnXqQM&index=1)
 4 | * [Technical FAQ](https://docs.google.com/document/d/12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0/edit)
 5 | * [Course Playlist: Only 2023 Live videos & homeworks](https://www.youtube.com/playlist?list=PL3MmuxUbc_hKqamJqQ7Ew8HxptJYnXqQM)
 6 | * [Leaderboard](https://docs.google.com/spreadsheets/d/e/2PACX-1vTHTc2eDorvcprX3SRd_ZejSnOjd7SUBlmr7ttYs9NsbS3G9szB9wMlMfCOLVL5XWCB0p8oaDOfffaZ/pubhtml)
 7 | * [Deadline calendar](https://docs.google.com/spreadsheets/d/e/2PACX-1vRNTwA0Of1lyprYpn2YxU-l0gvNeq-up7g7ITB42nPf2gT9Qd3PTzqTmkjAZjk1s__r7D99CsJfcZEO/pubhtml?gid=0&single=true) 
 8 | 
 9 | 
10 | [**Module 1: Introduction**](01-intro)
11 | 
12 | * [Homework](01-intro/homework.md)
13 | * [Solution](01-intro/homework.ipynb)
14 | 
15 | [**Module 2: Experiment Tracking**](02-experiment-tracking/)
16 | 
17 | * [Homework](02-experiment-tracking/homework.md)
18 | * [Workshop: Weights & Biases](02-experiment-tracking/wandb.md)
19 | * [Solution MLflow](02-experiment-tracking/solution-mlflow/)
20 | 
21 | [**Module 3: Orchestration and ML pipelines**](03-orchestration/)
22 | 
23 | * [Homework](03-orchestration/homework.md)
24 | 
25 | [**Module 4: Model Deployment**](04-deployment)
26 | 
27 | * [Homework](04-deployment/homework.md)
28 | * [Solution](04-deployment/homework_solution)
29 | 
30 | [**Module 5: Model Monitoring**](05-monitoring/)
31 | 
32 | * [Homework](05-monitoring/homework.md)
33 | 
34 | [**Module 6: Best Practices**](06-best-practices)
35 | 
36 | * [Homework](06-best-practices/homework.md)
37 | * [Solution](06-best-practices/homework_solution)
38 | 
39 | 
40 | [**Projects**](07-project/)
41 | 
42 | * [More information](07-project/README.md)
43 | 


--------------------------------------------------------------------------------
/cohorts/2024/01-intro/homework.md:
--------------------------------------------------------------------------------
 1 | ## Homework
 2 | 
 3 | The goal of this homework is to train a simple model for predicting the duration of a ride - similar to what we did in this module.
 4 | 
 5 | 
 6 | ## Q1. Downloading the data
 7 | 
 8 | We'll use [the same NYC taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page),
 9 | but instead of "**Green** Taxi Trip Records", we'll use "**Yellow** Taxi Trip Records".
10 | 
11 | Download the data for January and February 2023.
12 | 
13 | Read the data for January. How many columns are there?
14 | 
15 | * 16
16 | * 17
17 | * 18
18 | * 19
19 | 
20 | 
21 | ## Q2. Computing duration
22 | 
23 | Now let's compute the `duration` variable. It should contain the duration of a ride in minutes. 
24 | 
25 | What's the standard deviation of the trips duration in January?
26 | 
27 | * 32.59
28 | * 42.59
29 | * 52.59
30 | * 62.59
31 | 
32 | 
33 | ## Q3. Dropping outliers
34 | 
35 | Next, we need to check the distribution of the `duration` variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).
36 | 
37 | What fraction of the records left after you dropped the outliers?
38 | 
39 | * 90%
40 | * 92%
41 | * 95%
42 | * 98%
43 | 
44 | 
45 | ## Q4. One-hot encoding
46 | 
47 | Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model. 
48 | 
49 | * Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will 
50 |   label encode them)
51 | * Fit a dictionary vectorizer 
52 | * Get a feature matrix from it
53 | 
54 | What's the dimensionality of this matrix (number of columns)?
55 | 
56 | * 2
57 | * 155
58 | * 345
59 | * 515
60 | * 715
61 | 
62 | 
63 | ## Q5. Training a model
64 | 
65 | Now let's use the feature matrix from the previous step to train a model. 
66 | 
67 | * Train a plain linear regression model with default parameters, where duration is the response variable
68 | * Calculate the RMSE of the model on the training data
69 | 
70 | What's the RMSE on train?
71 | 
72 | * 3.64
73 | * 7.64
74 | * 11.64
75 | * 16.64
76 | 
77 | 
78 | ## Q6. Evaluating the model
79 | 
80 | Now let's apply this model to the validation dataset (February 2023). 
81 | 
82 | What's the RMSE on validation?
83 | 
84 | * 3.81
85 | * 7.81
86 | * 11.81
87 | * 16.81
88 | 
89 | ## Submit the results
90 | 
91 | * Submit your results here: https://courses.datatalks.club/mlops-zoomcamp-2024/homework/hw1
92 | * If your answer doesn't match options exactly, select the closest one
93 | 


--------------------------------------------------------------------------------
/cohorts/2024/02-experiment-tracking/homework/hpo.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import click
 4 | import mlflow
 5 | import numpy as np
 6 | from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
 7 | from hyperopt.pyll import scope
 8 | from sklearn.ensemble import RandomForestRegressor
 9 | from sklearn.metrics import mean_squared_error
10 | 
11 | mlflow.set_tracking_uri("http://127.0.0.1:5000")
12 | mlflow.set_experiment("random-forest-hyperopt")
13 | 
14 | 
15 | def load_pickle(filename: str):
16 |     with open(filename, "rb") as f_in:
17 |         return pickle.load(f_in)
18 | 
19 | 
20 | @click.command()
21 | @click.option(
22 |     "--data_path",
23 |     default="./output",
24 |     help="Location where the processed NYC taxi trip data was saved"
25 | )
26 | @click.option(
27 |     "--num_trials",
28 |     default=15,
29 |     help="The number of parameter evaluations for the optimizer to explore"
30 | )
31 | def run_optimization(data_path: str, num_trials: int):
32 | 
33 |     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
34 |     X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
35 | 
36 |     def objective(params):
37 | 
38 |         rf = RandomForestRegressor(**params)
39 |         rf.fit(X_train, y_train)
40 |         y_pred = rf.predict(X_val)
41 |         rmse = mean_squared_error(y_val, y_pred, squared=False)
42 | 
43 |         return {'loss': rmse, 'status': STATUS_OK}
44 | 
45 |     search_space = {
46 |         'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
47 |         'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
48 |         'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
49 |         'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
50 |         'random_state': 42
51 |     }
52 | 
53 |     rstate = np.random.default_rng(42)  # for reproducible results
54 |     fmin(
55 |         fn=objective,
56 |         space=search_space,
57 |         algo=tpe.suggest,
58 |         max_evals=num_trials,
59 |         trials=Trials(),
60 |         rstate=rstate
61 |     )
62 | 
63 | 
64 | if __name__ == '__main__':
65 |     run_optimization()
66 | 


--------------------------------------------------------------------------------
/cohorts/2024/02-experiment-tracking/homework/preprocess_data.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import click
 4 | import pandas as pd
 5 | 
 6 | from sklearn.feature_extraction import DictVectorizer
 7 | 
 8 | 
 9 | def dump_pickle(obj, filename: str):
10 |     with open(filename, "wb") as f_out:
11 |         return pickle.dump(obj, f_out)
12 | 
13 | 
14 | def read_dataframe(filename: str):
15 |     df = pd.read_parquet(filename)
16 | 
17 |     df['duration'] = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
18 |     df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
19 |     df = df[(df.duration >= 1) & (df.duration <= 60)]
20 | 
21 |     categorical = ['PULocationID', 'DOLocationID']
22 |     df[categorical] = df[categorical].astype(str)
23 | 
24 |     return df
25 | 
26 | 
27 | def preprocess(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False):
28 |     df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
29 |     categorical = ['PU_DO']
30 |     numerical = ['trip_distance']
31 |     dicts = df[categorical + numerical].to_dict(orient='records')
32 |     if fit_dv:
33 |         X = dv.fit_transform(dicts)
34 |     else:
35 |         X = dv.transform(dicts)
36 |     return X, dv
37 | 
38 | 
39 | @click.command()
40 | @click.option(
41 |     "--raw_data_path",
42 |     help="Location where the raw NYC taxi trip data was saved"
43 | )
44 | @click.option(
45 |     "--dest_path",
46 |     help="Location where the resulting files will be saved"
47 | )
48 | def run_data_prep(raw_data_path: str, dest_path: str, dataset: str = "green"):
49 |     # Load parquet files
50 |     df_train = read_dataframe(
51 |         os.path.join(raw_data_path, f"{dataset}_tripdata_2023-01.parquet")
52 |     )
53 |     df_val = read_dataframe(
54 |         os.path.join(raw_data_path, f"{dataset}_tripdata_2023-02.parquet")
55 |     )
56 |     df_test = read_dataframe(
57 |         os.path.join(raw_data_path, f"{dataset}_tripdata_2023-03.parquet")
58 |     )
59 | 
60 |     # Extract the target
61 |     target = 'duration'
62 |     y_train = df_train[target].values
63 |     y_val = df_val[target].values
64 |     y_test = df_test[target].values
65 | 
66 |     # Fit the DictVectorizer and preprocess data
67 |     dv = DictVectorizer()
68 |     X_train, dv = preprocess(df_train, dv, fit_dv=True)
69 |     X_val, _ = preprocess(df_val, dv, fit_dv=False)
70 |     X_test, _ = preprocess(df_test, dv, fit_dv=False)
71 | 
72 |     # Create dest_path folder unless it already exists
73 |     os.makedirs(dest_path, exist_ok=True)
74 | 
75 |     # Save DictVectorizer and datasets
76 |     dump_pickle(dv, os.path.join(dest_path, "dv.pkl"))
77 |     dump_pickle((X_train, y_train), os.path.join(dest_path, "train.pkl"))
78 |     dump_pickle((X_val, y_val), os.path.join(dest_path, "val.pkl"))
79 |     dump_pickle((X_test, y_test), os.path.join(dest_path, "test.pkl"))
80 | 
81 | 
82 | if __name__ == '__main__':
83 |     run_data_prep()
84 | 


--------------------------------------------------------------------------------
/cohorts/2024/02-experiment-tracking/homework/register_model.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import click
 4 | import mlflow
 5 | 
 6 | from mlflow.entities import ViewType
 7 | from mlflow.tracking import MlflowClient
 8 | from sklearn.ensemble import RandomForestRegressor
 9 | from sklearn.metrics import mean_squared_error
10 | 
11 | HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
12 | EXPERIMENT_NAME = "random-forest-best-models"
13 | RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state']
14 | 
15 | mlflow.set_tracking_uri("http://127.0.0.1:5000")
16 | mlflow.set_experiment(EXPERIMENT_NAME)
17 | mlflow.sklearn.autolog()
18 | 
19 | 
20 | def load_pickle(filename):
21 |     with open(filename, "rb") as f_in:
22 |         return pickle.load(f_in)
23 | 
24 | 
25 | def train_and_log_model(data_path, params):
26 |     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
27 |     X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
28 |     X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))
29 | 
30 |     with mlflow.start_run():
31 |         new_params = {}
32 |         for param in RF_PARAMS:
33 |             new_params[param] = int(params[param])
34 | 
35 |         rf = RandomForestRegressor(**new_params)
36 |         rf.fit(X_train, y_train)
37 | 
38 |         # Evaluate model on the validation and test sets
39 |         val_rmse = mean_squared_error(y_val, rf.predict(X_val), squared=False)
40 |         mlflow.log_metric("val_rmse", val_rmse)
41 |         test_rmse = mean_squared_error(y_test, rf.predict(X_test), squared=False)
42 |         mlflow.log_metric("test_rmse", test_rmse)
43 | 
44 | 
45 | @click.command()
46 | @click.option(
47 |     "--data_path",
48 |     default="./output",
49 |     help="Location where the processed NYC taxi trip data was saved"
50 | )
51 | @click.option(
52 |     "--top_n",
53 |     default=5,
54 |     type=int,
55 |     help="Number of top models that need to be evaluated to decide which one to promote"
56 | )
57 | def run_register_model(data_path: str, top_n: int):
58 | 
59 |     client = MlflowClient()
60 | 
61 |     # Retrieve the top_n model runs and log the models
62 |     experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
63 |     runs = client.search_runs(
64 |         experiment_ids=experiment.experiment_id,
65 |         run_view_type=ViewType.ACTIVE_ONLY,
66 |         max_results=top_n,
67 |         order_by=["metrics.rmse ASC"]
68 |     )
69 |     for run in runs:
70 |         train_and_log_model(data_path=data_path, params=run.data.params)
71 | 
72 |     # Select the model with the lowest test RMSE
73 |     experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
74 |     # best_run = client.search_runs( ...  )[0]
75 | 
76 |     # Register the best model
77 |     # mlflow.register_model( ... )
78 | 
79 | 
80 | if __name__ == '__main__':
81 |     run_register_model()
82 | 


--------------------------------------------------------------------------------
/cohorts/2024/02-experiment-tracking/homework/train.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import click
 4 | 
 5 | from sklearn.ensemble import RandomForestRegressor
 6 | from sklearn.metrics import mean_squared_error
 7 | 
 8 | 
 9 | def load_pickle(filename: str):
10 |     with open(filename, "rb") as f_in:
11 |         return pickle.load(f_in)
12 | 
13 | 
14 | @click.command()
15 | @click.option(
16 |     "--data_path",
17 |     default="./output",
18 |     help="Location where the processed NYC taxi trip data was saved"
19 | )
20 | def run_train(data_path: str):
21 | 
22 |     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
23 |     X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
24 | 
25 |     rf = RandomForestRegressor(max_depth=10, random_state=0)
26 |     rf.fit(X_train, y_train)
27 |     y_pred = rf.predict(X_val)
28 | 
29 |     rmse = mean_squared_error(y_val, y_pred, squared=False)
30 | 
31 | 
32 | if __name__ == '__main__':
33 |     run_train()
34 | 


--------------------------------------------------------------------------------
/cohorts/2024/02-experiment-tracking/solution/hpo.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import click
 4 | import mlflow
 5 | import numpy as np
 6 | from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
 7 | from hyperopt.pyll import scope
 8 | from sklearn.ensemble import RandomForestRegressor
 9 | from sklearn.metrics import mean_squared_error
10 | 
11 | mlflow.set_tracking_uri("http://127.0.0.1:5000")
12 | mlflow.set_experiment("random-forest-hyperopt")
13 | 
14 | 
15 | def load_pickle(filename: str):
16 |     with open(filename, "rb") as f_in:
17 |         return pickle.load(f_in)
18 | 
19 | 
20 | @click.command()
21 | @click.option(
22 |     "--data_path",
23 |     default="./output",
24 |     help="Location where the processed NYC taxi trip data was saved"
25 | )
26 | @click.option(
27 |     "--num_trials",
28 |     default=15,
29 |     help="The number of parameter evaluations for the optimizer to explore"
30 | )
31 | def run_optimization(data_path: str, num_trials: int):
32 | 
33 |     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
34 |     X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
35 | 
36 |     def objective(params):
37 | 
38 |         with mlflow.start_run():
39 |             mlflow.log_params(params)
40 |             rf = RandomForestRegressor(**params)
41 |             rf.fit(X_train, y_train)
42 |             y_pred = rf.predict(X_val)
43 |             rmse = mean_squared_error(y_val, y_pred, squared=False)
44 |             mlflow.log_metric("rmse", rmse)
45 | 
46 |         return {'loss': rmse, 'status': STATUS_OK}
47 | 
48 |     search_space = {
49 |         'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
50 |         'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
51 |         'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
52 |         'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
53 |         'random_state': 42
54 |     }
55 | 
56 |     rstate = np.random.default_rng(42)  # for reproducible results
57 |     fmin(
58 |         fn=objective,
59 |         space=search_space,
60 |         algo=tpe.suggest,
61 |         max_evals=num_trials,
62 |         trials=Trials(),
63 |         rstate=rstate
64 |     )
65 | 
66 | 
67 | if __name__ == '__main__':
68 |     run_optimization()
69 | 


--------------------------------------------------------------------------------
/cohorts/2024/02-experiment-tracking/solution/train.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import click
 4 | import mlflow
 5 | 
 6 | from sklearn.ensemble import RandomForestRegressor
 7 | from sklearn.metrics import mean_squared_error
 8 | 
 9 | 
10 | mlflow.set_tracking_uri("sqlite:///mlflow.db")
11 | mlflow.set_experiment("random-forest-train")
12 | 
13 | 
14 | def load_pickle(filename: str):
15 |     with open(filename, "rb") as f_in:
16 |         return pickle.load(f_in)
17 | 
18 | 
19 | @click.command()
20 | @click.option(
21 |     "--data_path",
22 |     default="./output",
23 |     help="Location where the processed NYC taxi trip data was saved"
24 | )
25 | def run_train(data_path: str):
26 |     mlflow.sklearn.autolog()
27 |     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
28 |     X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
29 | 
30 |     with mlflow.start_run():
31 | 
32 |         rf = RandomForestRegressor(max_depth=10, random_state=0)
33 |         rf.fit(X_train, y_train)
34 |         y_pred = rf.predict(X_val)
35 | 
36 |         rmse = mean_squared_error(y_val, y_pred, squared=False)
37 | 
38 | 
39 | if __name__ == '__main__':
40 |     run_train()
41 | 


--------------------------------------------------------------------------------
/cohorts/2024/03-orchestration/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | .ipynb_checkpoints
3 | models/*
4 | mlruns/*
5 | .vscode/
6 | ./DS_Store
7 | *.db
8 | *.DS_Store
9 | 


--------------------------------------------------------------------------------
/cohorts/2024/03-orchestration/3.2/README.md:
--------------------------------------------------------------------------------
 1 | # 3.2 Training: sklearn models and XGBoost
 2 | 
 3 | 
 4 | ## 1. Training pipeline for sklearn models
 5 | 
 6 | ### Videos
 7 | 
 8 | 1. [GDP training set](https://youtu.be/KP68DuJnk4Q?si=tVHWYLCpZ2RpwuNh)
 9 | 1. [Sklearn training GDP](https://youtu.be/CbHaZcq_uGo)
10 | 1. [Load models](https://youtu.be/zsMHFq2C978)
11 | 1. [Utility helper functions for loading models](https://youtu.be/fZnxDhtPxYo)
12 | 1. [Hyperparameter tuning](https://youtu.be/zfBB4KoZ7TM)
13 | 1. [Train sklearn model](https://youtu.be/P7PtegUFk3k)
14 | 
15 | ### Code
16 | 
17 | -   [`utils/models/sklearn.py`](https://github.com/mage-ai/mlops/blob/master/mlops/utils/models/sklearn.py)
18 | -   [`custom/load_models.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/load_models.py): load sklearn models dynamically
19 | -   [`transformers/hyperparameter_tuning/sklearn.py`](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/transformers/hyperparameter_tuning/sklearn.py)
20 | -   [`data_exporters/sklearn.py`](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/data_exporters/sklearn.py)
21 | -   [`hyperparameters/shared.py`](https://github.com/mage-ai/mlops/blob/master/mlops/utils/hyperparameters/shared.py)
22 | 
23 | ---
24 | 
25 | ## 2. Training pipeline for XGBoost model
26 | 
27 | ### Videos
28 | 
29 | 1. [Hyperparameter tuning](https://youtu.be/K_Z2Lm1Cyu4)
30 | 1. [Train XGBoost model](https://youtu.be/Y2B-ivm7Mug)
31 | 
32 | ### Code
33 | 
34 | -   [`utils/models/xgboost.py`](https://github.com/mage-ai/mlops/blob/master/mlops/utils/models/xgboost.py)
35 | -   [`transformers/hyperparameter_tuning/xgboost.py`](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/transformers/hyperparameter_tuning/xgboost.py)
36 | -   [`data_exporters/xgboost.py`](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/data_exporters/xgboost.py)
37 | -   [`hyperparameters/shared.py`](https://github.com/mage-ai/mlops/blob/master/mlops/utils/hyperparameters/shared.py)
38 | 
39 | ---
40 | 
41 | ## Code
42 | 
43 | 1. [Complete code solution](https://github.com/mage-ai/mlops)
44 | 1. [sklearn training pipeline configuration](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/pipelines/sklearn_training/metadata.yaml)
45 | 1. [XGBoost training pipeline configuration](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/pipelines/xgboost_training/metadata.yaml)
46 | 
47 | ---
48 | 
49 | ## Resources
50 | 
51 | 1. [Accuracy, precision, recall](https://www.mage.ai/blog/definitive-guide-to-accuracy-precision-recall-for-product-developers)
52 | 
53 | 1. [Regression model performance metrics](https://www.mage.ai/blog/product-developers-guide-to-ml-regression-model-metrics)
54 | 


--------------------------------------------------------------------------------
/cohorts/2024/03-orchestration/3.4/README.md:
--------------------------------------------------------------------------------
 1 | # 3.4 Triggering: Inference and retraining
 2 | 
 3 | 
 4 | ## 1. Retraining pipeline
 5 | 
 6 | ### Videos
 7 | 
 8 | 1. [Setup pipeline](https://youtu.be/ywzNac-OzFc)
 9 | 1. [Trigger pipeline to run](https://youtu.be/6kcBWl3E8So)
10 | 
11 | ### Code
12 | 
13 | -   [`detect_new_data.py`](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/sensors/detect_new_data.py)
14 | -   [`custom/retrain/sklearn.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/retrain/sklearn.py): trigger training pipeline for sklearn models
15 | -   [`custom/retrain/xgboost.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/retrain/xgboost.py): trigger training pipeline for XGBoost model
16 | 
17 | ---
18 | 
19 | ## 2. Inference pipeline
20 | 
21 | ### Videos
22 | 
23 | 1. [Make a prediction](https://youtu.be/KZaS2oG9NDc)
24 | 1. [Build pipeline](https://youtu.be/mytcFbH_ooY)
25 | 1. [Model inference playground part 1](https://youtu.be/JI0dhR7Bnhk)
26 | 1. [Model inference playground part 2](https://youtu.be/v2ls-gBBRac)
27 | 1. [Get prediction via API](https://youtu.be/J6ckSZczk8M)
28 | 
29 | ### Code
30 | 
31 | -   [`custom/inference.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/inference.py)
32 | 
33 | ---
34 | 
35 | ## Code
36 | 
37 | 1. [Retraining pipeline `metadata.yaml`](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/pipelines/automatic_retraining/metadata.yaml)
38 | 1. [Inference pipeline `metadata.yaml`](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/pipelines/predict/metadata.yaml)
39 | 1. [Playground configuration settings](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/interactions/playground.yaml)
40 | 
41 | ---
42 | 
43 | ## Resources
44 | 
45 | 1. [No-code UI interactions](https://docs.mage.ai/interactions/overview)
46 | 
47 | 1. [Saving triggers in code](https://docs.mage.ai/orchestration/triggers/configure-triggers-in-code)
48 | 
49 | 1. [Trigger another pipeline from a block](https://docs.mage.ai/orchestration/triggers/trigger-pipeline)
50 | 
51 | 1. [Trigger pipeline via API endpoint](https://docs.mage.ai/orchestration/triggers/trigger-pipeline-api)
52 | 
53 | 1. [Run pipelines on a recurring schedule](https://docs.mage.ai/orchestration/triggers/schedule-pipelines)
54 | 
55 | 1. [Improving model performance through retraining](<https://www.mage.ai/blog/how-to-improve-the-performance-of-a-machine-learning-(ML)-model>)
56 | 


--------------------------------------------------------------------------------
/cohorts/2024/03-orchestration/3.5/README.md:
--------------------------------------------------------------------------------
 1 | # 3.5 Deploying: Running operations in production
 2 | 
 3 | 
 4 | ## 1. Permissions
 5 | 
 6 | ### Videos
 7 | 
 8 | 1. [Configure permissions on AWS](https://youtu.be/TgdFaf4mw38)
 9 | 
10 | ### Code
11 | 
12 | -   [`custom/permissions.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/permissions.py)
13 | 
14 | ---
15 | 
16 | ## 2. Deploy
17 | 
18 | ### Videos
19 | 
20 | 1. [Setup and deploy using Terraform](https://youtu.be/w9zl3n2a3Wc)
21 | 
22 | ### Code
23 | 
24 | -   [`custom/infrastructure_setup.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/infrastructure_setup.py)
25 | -   [`custom/deploy.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/deploy.py)
26 | -   [`custom/teardown_deployed_resources.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/teardown_deployed_resources.py)
27 | 
28 | ---
29 | 
30 | ## 3. Continuous deployment and integration
31 | 
32 | ### Videos
33 | 
34 | 1. [CI/CD with GitHub Actions](https://youtu.be/tPkA3WjLSHE)
35 | 1. [Mage deployed](https://youtu.be/DMV2zEM50jY)
36 | 
37 | ### Code
38 | 
39 | -   [`custom/ci_and_cd.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/ci_and_cd.py)
40 | 
41 | ## Code
42 | 
43 | 1. [Deployment pipeline `metadata.yaml`](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/pipelines/deploying_to_production/metadata.yaml)
44 | 
45 | ---
46 | 
47 | ## Resources
48 | 
49 | 1. [Repository setup](https://docs.mage.ai/production/ci-cd/local-cloud/repository-setup)
50 | 1. AWS IAM policy permissions
51 | 
52 |     1. [Terraform apply](https://docs.mage.ai/production/deploying-to-cloud/aws/terraform-apply-policy)
53 |     1. [Terraform destroy](https://docs.mage.ai/production/deploying-to-cloud/aws/terraform-destroy-policy)
54 | 
55 | 1. [Terraform setup](https://docs.mage.ai/production/deploying-to-cloud/using-terraform)
56 | 
57 | 1. [Configure Terraform for AWS](https://docs.mage.ai/production/deploying-to-cloud/aws/setup)
58 | 
59 | 1. [CI/CD overview](https://docs.mage.ai/production/ci-cd/overview)
60 | 
61 | 1. [Setup GitHub actions for CI/CD](https://docs.mage.ai/production/ci-cd/local-cloud/github-actions#github-actions-setup)
62 | 


--------------------------------------------------------------------------------
/cohorts/2024/03-orchestration/README.md:
--------------------------------------------------------------------------------
 1 | # 3. Orchestration and ML Pipelines
 2 | 
 3 | ## [3.0 Introduction: ML pipelines and Mage](3.0/README.md)
 4 | 
 5 | ## [3.1 Data preparation: ETL and feature engineering](3.1/README.md)
 6 | 
 7 | ## [3.2 Training: sklearn models and XGBoost](3.2/README.md)
 8 | 
 9 | ## [3.3 Observability: Monitoring and alerting](3.3/README.md)
10 | 
11 | ## [3.4 Triggering: Inference and retraining](3.4/README.md)
12 | 
13 | ## [3.5 Deploying: Running operations in production](3.5/README.md)
14 | 
15 | ## [3.6 Homework](../cohorts/2024/03-orchestration/homework.md).
16 | 
17 | ## Quickstart
18 | 
19 | See the [Unit 3.0](https://github.com/DataTalksClub/mlops-zoomcamp/blob/main/03-orchestration/3.0/README.md) for a Quick Start guide
20 | 
21 | ## Need help?
22 | 
23 | 1. [Developer documentation](https://docs.mage.ai/introduction/overview)
24 | 1. [AI chat bot](https://mageai.slack.com/archives/C05NYC4DADT)
25 | 1. Live chat with the [Mage team directly](https://mage.ai/chat)
26 | 
27 | 
28 | ## Notes
29 | 
30 | Did you take notes? Add them here:
31 | 
32 | * [Marcus' Notes for Ch3](https://github.com/mleiwe/mlops-zoomcamp/blob/Ch3_ML_Notes/cohorts/2024/03-orchestration/ML_Notes.md)
33 | * Send a PR, add your notes above this line
34 | 
35 | ### Notes previous editions
36 | 
37 | - [2022 Prefect notes](../cohorts/2022/03-orchestration/README.md)
38 | - [2023 Prefect notes](../cohorts/2023/03-orchestration/prefect/README.md)
39 | 


--------------------------------------------------------------------------------
/cohorts/2024/03-orchestration/meta.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "module": {
 3 |     "number": 3,
 4 |     "title": "Orchestration and ML Pipelines"
 5 |   },
 6 |   "units": [
 7 |     {
 8 |       "number": 1,
 9 |       "title": "Introdution to Workflow Orchestration",
10 |       "youtube": "https://www.youtube.com/watch?v=Cqb7wyaNF08"
11 |     },
12 |     {
13 |       "number": 2,
14 |       "title": "Introduction to Prefect",
15 |       "youtube": "https://www.youtube.com/watch?v=rTUBTvXvXvM"
16 |     },
17 |     {
18 |       "number": 3,
19 |       "title": "Prefect Workflow",
20 |       "youtube": "https://www.youtube.com/watch?v=x3bV8yMKjtc"
21 |     },
22 |     {
23 |       "number": 4,
24 |       "title": "Deploying Your Workflow",
25 |       "youtube": "https://www.youtube.com/watch?v=3YjagezFhOo"
26 |     },
27 |     {
28 |       "number": 5,
29 |       "title": "Working with Deployments",
30 |       "youtube": "https://www.youtube.com/watch?v=jVmaaqs63O8"
31 |     },
32 |     {
33 |       "number": 6,
34 |       "title": "Prefect Cloud (optional)",
35 |       "youtube": "https://www.youtube.com/watch?v=y89Ww85EUdo"
36 |     },
37 |     {
38 |       "number": 7,
39 |       "title": "Homework",
40 |       "youtube": ""
41 |     }
42 |   ]
43 | }


--------------------------------------------------------------------------------
/cohorts/2024/03-orchestration/requirements.txt:
--------------------------------------------------------------------------------
 1 | black==23.3.0
 2 | fastparquet==2023.4.0
 3 | hyperopt==0.2.7
 4 | mlflow==2.3.1
 5 | pandas==2.0.1
 6 | prefect==2.10.8
 7 | prefect-aws==0.3.1
 8 | scikit_learn==1.2.2
 9 | seaborn==0.12.2
10 | xgboost==1.7.5
11 | orjson==3.8.1


--------------------------------------------------------------------------------
/cohorts/2024/04-deployment/homework/model.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2024/04-deployment/homework/model.bin


--------------------------------------------------------------------------------
/cohorts/2024/04-deployment/homework_solution/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10.13-slim
2 | 
3 | WORKDIR /app
4 | COPY [ "model2.bin", "model.bin" ]


--------------------------------------------------------------------------------
/cohorts/2024/04-deployment/homework_solution/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | scikit-learn = "==1.5.0"
 8 | pandas = "*"
 9 | pyarrow = "*"
10 | 
11 | [dev-packages]
12 | 
13 | [requires]
14 | python_version = "3.10"
15 | 


--------------------------------------------------------------------------------
/cohorts/2024/04-deployment/homework_solution/batch.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding: utf-8
 3 | 
 4 | import sys
 5 | import os
 6 | import pickle
 7 | import pandas as pd
 8 | 
 9 | 
10 | year = int(sys.argv[1]) # 2023
11 | month = int(sys.argv[2]) # 4
12 | 
13 | input_file = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet'
14 | output_file = f'output/yellow_tripdata_{year:04d}-{month:02d}.parquet'
15 | 
16 | 
17 | MODEL_FILE = os.getenv('MODEL_FILE', 'model.bin')
18 | 
19 | with open(MODEL_FILE, 'rb') as f_in:
20 |     dv, lr = pickle.load(f_in)
21 | 
22 | 
23 | categorical = ['PULocationID', 'DOLocationID']
24 | 
25 | def read_data(filename):
26 |     df = pd.read_parquet(filename)
27 |     
28 |     df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
29 |     df['duration'] = df.duration.dt.total_seconds() / 60
30 | 
31 |     df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
32 | 
33 |     df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
34 |     
35 |     return df
36 | 
37 | 
38 | df = read_data(input_file)
39 | df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')
40 | 
41 | 
42 | dicts = df[categorical].to_dict(orient='records')
43 | X_val = dv.transform(dicts)
44 | y_pred = lr.predict(X_val)
45 | 
46 | 
47 | print('predicted mean duration:', y_pred.mean())
48 | 
49 | 
50 | df_result = pd.DataFrame()
51 | df_result['ride_id'] = df['ride_id']
52 | df_result['predicted_duration'] = y_pred
53 | 
54 | 
55 | os.makedirs('output', exist_ok=True)
56 | 
57 | df_result.to_parquet(
58 |     output_file,
59 |     engine='pyarrow',
60 |     compression=None,
61 |     index=False
62 | )
63 | 


--------------------------------------------------------------------------------
/cohorts/2024/04-deployment/homework_solution/homework.dockerfile:
--------------------------------------------------------------------------------
 1 | FROM agrigorev/zoomcamp-model:mlops-2024-3.10.13-slim
 2 | 
 3 | RUN pip install -U pip & pip install pipenv
 4 | 
 5 | COPY [ "Pipfile", "Pipfile.lock", "./" ]
 6 | 
 7 | RUN pipenv install --system --deploy
 8 | 
 9 | COPY [ "batch.py", "batch.py" ]
10 | 
11 | ENTRYPOINT [ "python", "batch.py" ]


--------------------------------------------------------------------------------
/cohorts/2024/04-deployment/homework_solution/model.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2024/04-deployment/homework_solution/model.bin


--------------------------------------------------------------------------------
/cohorts/2024/04-deployment/homework_solution/model2.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2024/04-deployment/homework_solution/model2.bin


--------------------------------------------------------------------------------
/cohorts/2024/05-monitoring/homework.md:
--------------------------------------------------------------------------------
 1 | ## Homework
 2 | 
 3 | The goal of this homework is to familiarize users with monitoring for ML batch services, using PostgreSQL database to store metrics and Grafana to visualize them.
 4 | 
 5 | 
 6 | 
 7 | ## Q1. Prepare the dataset
 8 | 
 9 | Start with `baseline_model_nyc_taxi_data.ipynb`. Download the March 2024 Green Taxi data. We will use this data to simulate a production usage of a taxi trip duration prediction service.
10 | 
11 | What is the shape of the downloaded data? How many rows are there?
12 | 
13 | * 72044
14 | * 78537 
15 | * 57457
16 | * 54396
17 | 
18 | 
19 | ## Q2. Metric
20 | 
21 | Let's expand the number of data quality metrics we’d like to monitor! Please add one metric of your choice and a quantile value for the `"fare_amount"` column (`quantile=0.5`).
22 | 
23 | Hint: explore evidently metric `ColumnQuantileMetric` (from `evidently.metrics import ColumnQuantileMetric`) 
24 | 
25 | What metric did you choose?
26 | 
27 | 
28 | ## Q3. Monitoring
29 | 
30 | Let’s start monitoring. Run expanded monitoring for a new batch of data (March 2024). 
31 | 
32 | What is the maximum value of metric `quantile = 0.5` on the `"fare_amount"` column during March 2024 (calculated daily)?
33 | 
34 | * 10
35 | * 12.5
36 | * 14.2
37 | * 14.8
38 | 
39 | 
40 | ## Q4. Dashboard
41 | 
42 | 
43 | Finally, let’s add panels with new added metrics to the dashboard. After we customize the  dashboard let's save a dashboard config, so that we can access it later. Hint: click on “Save dashboard” to access JSON configuration of the dashboard. This configuration should be saved locally.
44 | 
45 | Where to place a dashboard config file?
46 | 
47 | * `project_folder` (05-monitoring)
48 | * `project_folder/config`  (05-monitoring/config)
49 | * `project_folder/dashboards`  (05-monitoring/dashboards)
50 | * `project_folder/data`  (05-monitoring/data)
51 | 
52 | 
53 | ## Submit the results
54 | 
55 | * Submit your answers here: https://courses.datatalks.club/mlops-zoomcamp-2024/homework/hw5
56 | 


--------------------------------------------------------------------------------
/cohorts/2024/06-best-practices/homework/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.10.9-slim
 2 | 
 3 | RUN pip install -U pip & pip install pipenv
 4 | 
 5 | COPY [ "Pipfile", "Pipfile.lock", "./" ]
 6 | 
 7 | RUN pipenv install --system --deploy
 8 | 
 9 | COPY [ "batch.py", "batch.py" ]
10 | COPY [ "model.bin", "model.bin" ]
11 | 
12 | ENTRYPOINT [ "python", "batch.py" ]


--------------------------------------------------------------------------------
/cohorts/2024/06-best-practices/homework/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | scikit-learn = "==1.5.0"
 8 | pandas = "*"
 9 | pyarrow = "*"
10 | s3fs = "*"
11 | 
12 | [dev-packages]
13 | 
14 | [requires]
15 | python_version = "3.10"
16 | 


--------------------------------------------------------------------------------
/cohorts/2024/06-best-practices/homework/batch.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding: utf-8
 3 | 
 4 | import sys
 5 | import pickle
 6 | import pandas as pd
 7 | 
 8 | 
 9 | year = int(sys.argv[1])
10 | month = int(sys.argv[2])
11 | 
12 | input_file = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet'
13 | output_file = f'output/yellow_tripdata_{year:04d}-{month:02d}.parquet'
14 | 
15 | 
16 | with open('model.bin', 'rb') as f_in:
17 |     dv, lr = pickle.load(f_in)
18 | 
19 | 
20 | categorical = ['PULocationID', 'DOLocationID']
21 | 
22 | def read_data(filename):
23 |     df = pd.read_parquet(filename)
24 |     
25 |     df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
26 |     df['duration'] = df.duration.dt.total_seconds() / 60
27 | 
28 |     df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
29 | 
30 |     df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
31 |     
32 |     return df
33 | 
34 | 
35 | df = read_data(input_file)
36 | df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')
37 | 
38 | 
39 | dicts = df[categorical].to_dict(orient='records')
40 | X_val = dv.transform(dicts)
41 | y_pred = lr.predict(X_val)
42 | 
43 | 
44 | print('predicted mean duration:', y_pred.mean())
45 | 
46 | 
47 | df_result = pd.DataFrame()
48 | df_result['ride_id'] = df['ride_id']
49 | df_result['predicted_duration'] = y_pred
50 | 
51 | 
52 | df_result.to_parquet(output_file, engine='pyarrow', index=False)
53 | 


--------------------------------------------------------------------------------
/cohorts/2024/06-best-practices/homework/model.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2024/06-best-practices/homework/model.bin


--------------------------------------------------------------------------------
/cohorts/2024/README.md:
--------------------------------------------------------------------------------
 1 | ## MLOps Zoomcamp 2024 Cohort
 2 | 
 3 | * [Pre-Course Live Q&A](https://www.youtube.com/watch?v=YmllO3ld5LE)
 4 | * [Course Launch video](https://www.youtube.com/watch?v=2jM7t-NTZxs) and [Slides](https://docs.google.com/presentation/d/1Tp2VVph5_vYIazQ53VR7TYmhJjQg9wuNIKKne3wlZVU/edit?usp=sharing)
 5 | * [Technical FAQ](https://docs.google.com/document/d/12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0/edit)
 6 | * TODO: Course Playlist: Only 2024 Live videos & homeworks
 7 | * [Course management platform](https://courses.datatalks.club/mlops-zoomcamp-2024/)
 8 | 
 9 | 
10 | 
11 | [**Module 1: Introduction**](01-intro)
12 | 
13 | * [Homework](01-intro/homework.md)
14 | * [Solution](01-intro/homework.ipynb)
15 | 
16 | [**Module 2: Experiment Tracking**](02-experiment-tracking/)
17 | 
18 | * [Homework](02-experiment-tracking/homework.md)
19 | * [Solution](02-experiment-tracking/solution)
20 | 
21 | [**Module 3: Orchestration and ML pipelines**](03-orchestration/)
22 | 
23 | * [Homework](03-orchestration/homework.md)
24 | * [Solution](https://www.loom.com/share/802c8c0b843a4d3bbd9dbea240c3593a)
25 | 
26 | [**Module 4: Model Deployment**](04-deployment)
27 | 
28 | * [Homework](04-deployment/homework.md)
29 | 
30 | [**Module 5: Model Monitoring**](05-monitoring/)
31 | 
32 | * [Homework](05-monitoring/homework.md)
33 | 
34 | [**Module 6: Best Practices**](06-best-practices)
35 | 
36 | * [Homework](06-best-practices/homework.md)
37 | 
38 | 
39 | [**Project**](project.md)
40 | 
41 | * [More information](project.md)
42 | 


--------------------------------------------------------------------------------
/cohorts/2024/project.md:
--------------------------------------------------------------------------------
 1 | ## Course Project
 2 | 
 3 | The goal of this project is to apply everything we learned
 4 | in this course and build an end-to-end machine learning project.
 5 | 
 6 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, your project can't be considered compelete.
 7 | 
 8 | 
 9 | ## Submitting
10 | 
11 | ### Project Attempt #1
12 | 
13 | * Project: https://courses.datatalks.club/mlops-zoomcamp-2024/project/project1
14 | * Review: https://courses.datatalks.club/mlops-zoomcamp-2024/project/project1/eval
15 | 
16 | 
17 | ### Project Attempt #2
18 | 
19 | * Project: https://courses.datatalks.club/mlops-zoomcamp-2024/project/project2
20 | * Review: https://courses.datatalks.club/mlops-zoomcamp-2024/project/project2/eval
21 | 
22 | 
23 | > **Important**: update your "Certificate name" here: https://courses.datatalks.club/mlops-zoomcamp-2024/enrollment -
24 | this is what we will use when generating certificates for you.
25 | 
26 | 
27 | ## Evaluation criteria
28 | 
29 | See [here](../../../07-project/README.md)
30 | 


--------------------------------------------------------------------------------
/cohorts/2025/01-intro/homework.md:
--------------------------------------------------------------------------------
 1 | ## Homework
 2 | 
 3 | The goal of this homework is to train a simple model for predicting the duration of a ride - similar to what we did in this module.
 4 | 
 5 | 
 6 | ## Q1. Downloading the data
 7 | 
 8 | We'll use [the same NYC taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page),
 9 | but instead of "**Green** Taxi Trip Records", we'll use "**Yellow** Taxi Trip Records".
10 | 
11 | Download the data for January and February 2023.
12 | 
13 | Read the data for January. How many columns are there?
14 | 
15 | * 16
16 | * 17
17 | * 18
18 | * 19
19 | 
20 | 
21 | ## Q2. Computing duration
22 | 
23 | Now let's compute the `duration` variable. It should contain the duration of a ride in minutes. 
24 | 
25 | What's the standard deviation of the trips duration in January?
26 | 
27 | * 32.59
28 | * 42.59
29 | * 52.59
30 | * 62.59
31 | 
32 | 
33 | ## Q3. Dropping outliers
34 | 
35 | Next, we need to check the distribution of the `duration` variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).
36 | 
37 | What fraction of the records left after you dropped the outliers?
38 | 
39 | * 90%
40 | * 92%
41 | * 95%
42 | * 98%
43 | 
44 | 
45 | ## Q4. One-hot encoding
46 | 
47 | Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model. 
48 | 
49 | * Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will 
50 |   label encode them)
51 | * Fit a dictionary vectorizer 
52 | * Get a feature matrix from it
53 | 
54 | What's the dimensionality of this matrix (number of columns)?
55 | 
56 | * 2
57 | * 155
58 | * 345
59 | * 515
60 | * 715
61 | 
62 | 
63 | ## Q5. Training a model
64 | 
65 | Now let's use the feature matrix from the previous step to train a model. 
66 | 
67 | * Train a plain linear regression model with default parameters, where duration is the response variable
68 | * Calculate the RMSE of the model on the training data
69 | 
70 | What's the RMSE on train?
71 | 
72 | * 3.64
73 | * 7.64
74 | * 11.64
75 | * 16.64
76 | 
77 | 
78 | ## Q6. Evaluating the model
79 | 
80 | Now let's apply this model to the validation dataset (February 2023). 
81 | 
82 | What's the RMSE on validation?
83 | 
84 | * 3.81
85 | * 7.81
86 | * 11.81
87 | * 16.81
88 | 
89 | ## Submit the results
90 | 
91 | * Submit your results here: https://courses.datatalks.club/mlops-zoomcamp-2025/homework/hw1
92 | * If your answer doesn't match options exactly, select the closest one
93 | 


--------------------------------------------------------------------------------
/cohorts/2025/02-experiment-tracking/homework/hpo.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import click
 4 | import mlflow
 5 | import numpy as np
 6 | from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
 7 | from hyperopt.pyll import scope
 8 | from sklearn.ensemble import RandomForestRegressor
 9 | from sklearn.metrics import root_mean_squared_error
10 | 
11 | mlflow.set_tracking_uri("http://127.0.0.1:5000")
12 | mlflow.set_experiment("random-forest-hyperopt")
13 | 
14 | 
15 | def load_pickle(filename: str):
16 |     with open(filename, "rb") as f_in:
17 |         return pickle.load(f_in)
18 | 
19 | 
20 | @click.command()
21 | @click.option(
22 |     "--data_path",
23 |     default="./output",
24 |     help="Location where the processed NYC taxi trip data was saved"
25 | )
26 | @click.option(
27 |     "--num_trials",
28 |     default=15,
29 |     help="The number of parameter evaluations for the optimizer to explore"
30 | )
31 | def run_optimization(data_path: str, num_trials: int):
32 | 
33 |     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
34 |     X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
35 | 
36 |     def objective(params):
37 | 
38 |         rf = RandomForestRegressor(**params)
39 |         rf.fit(X_train, y_train)
40 |         y_pred = rf.predict(X_val)
41 |         rmse = root_mean_squared_error(y_val, y_pred)
42 | 
43 |         return {'loss': rmse, 'status': STATUS_OK}
44 | 
45 |     search_space = {
46 |         'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
47 |         'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
48 |         'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
49 |         'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
50 |         'random_state': 42
51 |     }
52 | 
53 |     rstate = np.random.default_rng(42)  # for reproducible results
54 |     fmin(
55 |         fn=objective,
56 |         space=search_space,
57 |         algo=tpe.suggest,
58 |         max_evals=num_trials,
59 |         trials=Trials(),
60 |         rstate=rstate
61 |     )
62 | 
63 | 
64 | if __name__ == '__main__':
65 |     run_optimization()
66 | 


--------------------------------------------------------------------------------
/cohorts/2025/02-experiment-tracking/homework/register_model.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import click
 4 | import mlflow
 5 | 
 6 | from mlflow.entities import ViewType
 7 | from mlflow.tracking import MlflowClient
 8 | from sklearn.ensemble import RandomForestRegressor
 9 | from sklearn.metrics import root_mean_squared_error
10 | 
11 | HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
12 | EXPERIMENT_NAME = "random-forest-best-models"
13 | RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state']
14 | 
15 | mlflow.set_tracking_uri("http://127.0.0.1:5000")
16 | mlflow.set_experiment(EXPERIMENT_NAME)
17 | mlflow.sklearn.autolog()
18 | 
19 | 
20 | def load_pickle(filename):
21 |     with open(filename, "rb") as f_in:
22 |         return pickle.load(f_in)
23 | 
24 | 
25 | def train_and_log_model(data_path, params):
26 |     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
27 |     X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
28 |     X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))
29 | 
30 |     with mlflow.start_run():
31 |         new_params = {}
32 |         for param in RF_PARAMS:
33 |             new_params[param] = int(params[param])
34 | 
35 |         rf = RandomForestRegressor(**new_params)
36 |         rf.fit(X_train, y_train)
37 | 
38 |         # Evaluate model on the validation and test sets
39 |         val_rmse = root_mean_squared_error(y_val, rf.predict(X_val))
40 |         mlflow.log_metric("val_rmse", val_rmse)
41 |         test_rmse = root_mean_squared_error(y_test, rf.predict(X_test))
42 |         mlflow.log_metric("test_rmse", test_rmse)
43 | 
44 | 
45 | @click.command()
46 | @click.option(
47 |     "--data_path",
48 |     default="./output",
49 |     help="Location where the processed NYC taxi trip data was saved"
50 | )
51 | @click.option(
52 |     "--top_n",
53 |     default=5,
54 |     type=int,
55 |     help="Number of top models that need to be evaluated to decide which one to promote"
56 | )
57 | def run_register_model(data_path: str, top_n: int):
58 | 
59 |     client = MlflowClient()
60 | 
61 |     # Retrieve the top_n model runs and log the models
62 |     experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
63 |     runs = client.search_runs(
64 |         experiment_ids=experiment.experiment_id,
65 |         run_view_type=ViewType.ACTIVE_ONLY,
66 |         max_results=top_n,
67 |         order_by=["metrics.rmse ASC"]
68 |     )
69 |     for run in runs:
70 |         train_and_log_model(data_path=data_path, params=run.data.params)
71 | 
72 |     # Select the model with the lowest test RMSE
73 |     experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
74 |     # best_run = client.search_runs( ...  )[0]
75 | 
76 |     # Register the best model
77 |     # mlflow.register_model( ... )
78 | 
79 | 
80 | if __name__ == '__main__':
81 |     run_register_model()
82 | 


--------------------------------------------------------------------------------
/cohorts/2025/02-experiment-tracking/homework/train.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import click
 4 | 
 5 | from sklearn.ensemble import RandomForestRegressor
 6 | from sklearn.metrics import root_mean_squared_error
 7 | 
 8 | 
 9 | def load_pickle(filename: str):
10 |     with open(filename, "rb") as f_in:
11 |         return pickle.load(f_in)
12 | 
13 | 
14 | @click.command()
15 | @click.option(
16 |     "--data_path",
17 |     default="./output",
18 |     help="Location where the processed NYC taxi trip data was saved"
19 | )
20 | def run_train(data_path: str):
21 | 
22 |     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
23 |     X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
24 | 
25 |     rf = RandomForestRegressor(max_depth=10, random_state=0)
26 |     rf.fit(X_train, y_train)
27 |     y_pred = rf.predict(X_val)
28 | 
29 |     rmse = root_mean_squared_error(y_val, y_pred)
30 | 
31 | 
32 | if __name__ == '__main__':
33 |     run_train()
34 | 


--------------------------------------------------------------------------------
/cohorts/2025/03-orchestration/homework.md:
--------------------------------------------------------------------------------
 1 | ## Homework
 2 | 
 3 | The goal of this homework is to create a simple training pipeline, use mlflow to track experiments and register best model, but use Mage for it.
 4 | 
 5 | We'll use [the same NYC taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page), the **Yellow** taxi data for March, 2023. 
 6 | 
 7 | ## Question 1. Select the Tool
 8 | 
 9 | You can use the same tool you used when completing the module,
10 | or choose a different one for your homework.
11 | 
12 | What's the name of the orchestrator you chose? 
13 | 
14 | 
15 | ## Question 2. Version
16 | 
17 | What's the version of the orchestrator? 
18 | 
19 | 
20 | ## Question 3. Creating a pipeline
21 | 
22 | Let's read the March 2023 Yellow taxi trips data.
23 | 
24 | How many records did we load? 
25 | 
26 | - 3,003,766
27 | - 3,203,766
28 | - 3,403,766
29 | - 3,603,766
30 | 
31 | (Include a print statement in your code)
32 | 
33 | ## Question 4. Data preparation
34 | 
35 | Let's continue with pipeline creation.
36 | 
37 | We will use the same logic for preparing the data we used previously. 
38 | 
39 | This is what we used (adjusted for yellow dataset):
40 | 
41 | ```python
42 | def read_dataframe(filename):
43 |     df = pd.read_parquet(filename)
44 | 
45 |     df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
46 |     df.duration = df.duration.dt.total_seconds() / 60
47 | 
48 |     df = df[(df.duration >= 1) & (df.duration <= 60)]
49 | 
50 |     categorical = ['PULocationID', 'DOLocationID']
51 |     df[categorical] = df[categorical].astype(str)
52 |     
53 |     return df
54 | ```
55 | 
56 | Let's apply to the data we loaded in question 3. 
57 | 
58 | What's the size of the result? 
59 | 
60 | - 2,903,766
61 | - 3,103,766
62 | - 3,316,216 
63 | - 3,503,766
64 | 
65 | ## Question 5. Train a model
66 | 
67 | We will now train a linear regression model using the same code as in homework 1.
68 | 
69 | * Fit a dict vectorizer.
70 | * Train a linear regression with default parameters.
71 | * Use pick up and drop off locations separately, don't create a combination feature.
72 | 
73 | Let's now use it in the pipeline. We will need to create another transformation block, and return both the dict vectorizer and the model.
74 | 
75 | What's the intercept of the model? 
76 | 
77 | Hint: print the `intercept_` field in the code block
78 | 
79 | - 21.77
80 | - 24.77
81 | - 27.77
82 | - 31.77
83 | 
84 | ## Question 6. Register the model 
85 | 
86 | The model is trained, so let's save it with MLFlow.
87 | 
88 | Find the logged model, and find MLModel file. What's the size of the model? (`model_size_bytes` field):
89 | 
90 | * 14,534
91 | * 9,534
92 | * 4,534
93 | * 1,534
94 | 
95 | 
96 | ## Submit the results
97 | 
98 | * Submit your results here: https://courses.datatalks.club/mlops-zoomcamp-2025/homework/hw3
99 | * If your answer doesn't match options exactly, select the closest one.


--------------------------------------------------------------------------------
/cohorts/2025/04-deployment/homework/model.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2025/04-deployment/homework/model.bin


--------------------------------------------------------------------------------
/cohorts/2025/05-monitoring/homework.md:
--------------------------------------------------------------------------------
 1 | ## Homework
 2 | 
 3 | The goal of this homework is to familiarize users with monitoring for ML batch services, using PostgreSQL database to store metrics and Grafana to visualize them.
 4 | 
 5 | 
 6 | 
 7 | ## Q1. Prepare the dataset
 8 | 
 9 | Start with `baseline_model_nyc_taxi_data.ipynb`. Download the March 2024 Green Taxi data. We will use this data to simulate a production usage of a taxi trip duration prediction service.
10 | 
11 | What is the shape of the downloaded data? How many rows are there?
12 | 
13 | * 72044
14 | * 78537 
15 | * 57457
16 | * 54396
17 | 
18 | 
19 | ## Q2. Metric
20 | 
21 | Let's expand the number of data quality metrics we’d like to monitor! Please add one metric of your choice and a quantile value for the `"fare_amount"` column (`quantile=0.5`).
22 | 
23 | Hint: explore evidently metric `ColumnQuantileMetric` (from `evidently.metrics import ColumnQuantileMetric`) 
24 | 
25 | What metric did you choose?
26 | 
27 | 
28 | ## Q3. Monitoring
29 | 
30 | Let’s start monitoring. Run expanded monitoring for a new batch of data (March 2024). 
31 | 
32 | What is the maximum value of metric `quantile = 0.5` on the `"fare_amount"` column during March 2024 (calculated daily)?
33 | 
34 | * 10
35 | * 12.5
36 | * 14.2
37 | * 14.8
38 | 
39 | 
40 | ## Q4. Dashboard
41 | 
42 | 
43 | Finally, let’s add panels with new added metrics to the dashboard. After we customize the  dashboard let's save a dashboard config, so that we can access it later. Hint: click on “Save dashboard” to access JSON configuration of the dashboard. This configuration should be saved locally.
44 | 
45 | Where to place a dashboard config file?
46 | 
47 | * `project_folder` (05-monitoring)
48 | * `project_folder/config`  (05-monitoring/config)
49 | * `project_folder/dashboards`  (05-monitoring/dashboards)
50 | * `project_folder/data`  (05-monitoring/data)
51 | 
52 | 
53 | ## Submit the results
54 | 
55 | * Submit your answers here: https://courses.datatalks.club/mlops-zoomcamp-2025/homework/hw5
56 | 


--------------------------------------------------------------------------------
/cohorts/2025/06-best-practices/homework/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.10.9-slim
 2 | 
 3 | RUN pip install -U pip & pip install pipenv
 4 | 
 5 | COPY [ "Pipfile", "Pipfile.lock", "./" ]
 6 | 
 7 | RUN pipenv install --system --deploy
 8 | 
 9 | COPY [ "batch.py", "batch.py" ]
10 | COPY [ "model.bin", "model.bin" ]
11 | 
12 | ENTRYPOINT [ "python", "batch.py" ]


--------------------------------------------------------------------------------
/cohorts/2025/06-best-practices/homework/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | scikit-learn = "==1.5.0"
 8 | pandas = "*"
 9 | pyarrow = "*"
10 | s3fs = "*"
11 | 
12 | [dev-packages]
13 | 
14 | [requires]
15 | python_version = "3.10"
16 | 


--------------------------------------------------------------------------------
/cohorts/2025/06-best-practices/homework/batch.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding: utf-8
 3 | 
 4 | import sys
 5 | import pickle
 6 | import pandas as pd
 7 | 
 8 | 
 9 | year = int(sys.argv[1])
10 | month = int(sys.argv[2])
11 | 
12 | input_file = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet'
13 | output_file = f'output/yellow_tripdata_{year:04d}-{month:02d}.parquet'
14 | 
15 | 
16 | with open('model.bin', 'rb') as f_in:
17 |     dv, lr = pickle.load(f_in)
18 | 
19 | 
20 | categorical = ['PULocationID', 'DOLocationID']
21 | 
22 | def read_data(filename):
23 |     df = pd.read_parquet(filename)
24 |     
25 |     df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
26 |     df['duration'] = df.duration.dt.total_seconds() / 60
27 | 
28 |     df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
29 | 
30 |     df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
31 |     
32 |     return df
33 | 
34 | 
35 | df = read_data(input_file)
36 | df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')
37 | 
38 | 
39 | dicts = df[categorical].to_dict(orient='records')
40 | X_val = dv.transform(dicts)
41 | y_pred = lr.predict(X_val)
42 | 
43 | 
44 | print('predicted mean duration:', y_pred.mean())
45 | 
46 | 
47 | df_result = pd.DataFrame()
48 | df_result['ride_id'] = df['ride_id']
49 | df_result['predicted_duration'] = y_pred
50 | 
51 | 
52 | df_result.to_parquet(output_file, engine='pyarrow', index=False)
53 | 


--------------------------------------------------------------------------------
/cohorts/2025/06-best-practices/homework/model.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2025/06-best-practices/homework/model.bin


--------------------------------------------------------------------------------
/cohorts/2025/README.md:
--------------------------------------------------------------------------------
 1 | ## MLOps Zoomcamp 2025 Cohort
 2 | 
 3 | * [Pre-Course Live Q&A](https://www.youtube.com/watch?v=rv43YJQsZIw)
 4 | * [Course Launch video](https://youtube.com/live/qqZU8nBtH90) and [Slides](https://docs.google.com/presentation/d/10dP4KoVpMA1iMGBk-XWp3YcHjukoM7AxZ2v4LuZd9wE/edit?usp=sharing)
 5 | * [Technical FAQ](https://docs.google.com/document/d/12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0/edit)
 6 | * [Course management platform](https://courses.datatalks.club/mlops-zoomcamp-2025/)
 7 | 
 8 | 
 9 | 
10 | [**Module 1: Introduction**](01-intro)
11 | 
12 | * [Homework](01-intro/homework.md)
13 | 
14 | [**Module 2: Experiment Tracking**](02-experiment-tracking/)
15 | 
16 | * [Homework](02-experiment-tracking/homework.md)
17 | 
18 | [**Module 3: Orchestration and ML pipelines**](03-orchestration/)
19 | 
20 | * [Homework](03-orchestration/homework.md)
21 | 
22 | [**Module 4: Model Deployment**](04-deployment)
23 | 
24 | * [Homework](04-deployment/homework.md)
25 | 
26 | [**Module 5: Model Monitoring**](05-monitoring/)
27 | 
28 | * [Homework](05-monitoring/homework.md)
29 | 
30 | [**Module 6: Best Practices**](06-best-practices)
31 | 
32 | * [Homework](06-best-practices/homework.md)
33 | 
34 | 
35 | [**Project**](project.md)
36 | 
37 | * [More information](project.md)
38 | 


--------------------------------------------------------------------------------
/cohorts/2025/project.md:
--------------------------------------------------------------------------------
 1 | ## Course Project
 2 | 
 3 | The goal of this project is to apply everything we learned
 4 | in this course and build an end-to-end machine learning project.
 5 | 
 6 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, your project can't be considered compelete.
 7 | 
 8 | 
 9 | ## Submitting
10 | 
11 | ### Project Attempt #1
12 | 
13 | * Project: https://courses.datatalks.club/mlops-zoomcamp-2025/project/project1
14 | * Review: https://courses.datatalks.club/mlops-zoomcamp-2025/project/project1/eval
15 | 
16 | 
17 | ### Project Attempt #2
18 | 
19 | * Project: https://courses.datatalks.club/mlops-zoomcamp-2025/project/project2
20 | * Review: https://courses.datatalks.club/mlops-zoomcamp-2025/project/project2/eval
21 | 
22 | 
23 | > **Important**: update your "Certificate name" here: https://courses.datatalks.club/mlops-zoomcamp-2025/enrollment -
24 | this is what we will use when generating certificates for you.
25 | 
26 | 
27 | ## Evaluation criteria
28 | 
29 | See [here](../../07-project/README.md)
30 | 


--------------------------------------------------------------------------------
/images/IMG_20230323_134059_927.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/images/IMG_20230323_134059_927.png


--------------------------------------------------------------------------------
/images/banner-2025.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/images/banner-2025.jpg


--------------------------------------------------------------------------------
/images/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/images/banner.png


--------------------------------------------------------------------------------
/images/learning-in-public-links.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/images/learning-in-public-links.png


--------------------------------------------------------------------------------
/images/learning-in-public.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/images/learning-in-public.png


--------------------------------------------------------------------------------
/images/play.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/images/play.png


--------------------------------------------------------------------------------
/learning-in-public.md:
--------------------------------------------------------------------------------
 1 | # Learning in public
 2 | 
 3 | Most people learn in private: they consume content but don't tell
 4 | anyone about it. There's nothing wrong with it.
 5 | 
 6 | But we want to encourage you to document your progress and
 7 | share it publicly on social media.
 8 | 
 9 | It helps you get noticed and will lead to:
10 | 
11 | * Expanding your network: meeting new people and making new friends
12 | * Being invited to meetups, conferences and podcasts
13 | * Landing a job or getting clients
14 | * Many other good things
15 | 
16 | Here's a more compresensive reading on why you want to do it: https://github.com/readme/guides/publishing-your-work
17 | 
18 | 
19 | ## Learning in Public for Zoomcamps
20 | 
21 | When you submit your homework or project, you can also submit
22 | learning in public posts:
23 | 
24 | <img src="./images/learning-in-public-links.png" />
25 | 
26 | You can watch this video to see how your learning in public posts may look like:
27 | 
28 | <a href="https://www.loom.com/share/710e3297487b409d94df0e8da1c984ce" target="_blank">
29 |     <img src="./images/learning-in-public.png" height="240" />
30 | </a>
31 | 
32 | 
33 | Send a PR if you want to suggest improvements for this document


--------------------------------------------------------------------------------