├── .github └── workflows │ ├── cd-deploy.yml │ └── ci-tests.yml ├── .gitignore ├── 01-intro ├── README.md ├── duration-prediction.ipynb ├── images │ ├── thumbnail-1-01.jpg │ ├── thumbnail-1-02-1.jpg │ ├── thumbnail-1-02.jpg │ ├── thumbnail-1-03.jpg │ ├── thumbnail-1-04.jpg │ └── thumbnail-1-05.jpg └── meta.json ├── 02-experiment-tracking ├── README.md ├── duration-prediction.ipynb ├── images │ ├── db_configuration.png │ ├── db_password.png │ ├── db_settings.png │ ├── ec2_instance_type.png │ ├── ec2_os.png │ ├── key_pair.png │ ├── postgresql.png │ ├── postgresql_inbound_rule.png │ ├── s3_bucket.png │ ├── security_group.png │ ├── select_key_pair.png │ ├── thumbnail-2-01.jpg │ ├── thumbnail-2-02.jpg │ ├── thumbnail-2-03.jpg │ ├── thumbnail-2-04.jpg │ ├── thumbnail-2-05.jpg │ ├── thumbnail-2-06.jpg │ └── thumbnail-2-07.jpg ├── meta.json ├── mlflow_on_aws.md ├── model-registry.ipynb ├── requirements.txt └── running-mlflow-examples │ ├── scenario-1.ipynb │ ├── scenario-2.ipynb │ └── scenario-3.ipynb ├── 03-orchestration ├── README.md └── code │ ├── commands.md │ ├── duration-prediction.ipynb │ └── duration-prediction.py ├── 04-deployment ├── README.md ├── batch │ ├── Pipfile │ ├── Pipfile.lock │ ├── README.md │ ├── score.ipynb │ ├── score.py │ ├── score_backfill.py │ └── score_deploy.py ├── images │ ├── thumbnail-4-01.jpg │ ├── thumbnail-4-02.jpg │ ├── thumbnail-4-03.jpg │ ├── thumbnail-4-04.jpg │ ├── thumbnail-4-05.jpg │ └── thumbnail-4-06.jpg ├── load_model.ipynb ├── meta.json ├── streaming │ ├── Dockerfile │ ├── Pipfile │ ├── Pipfile.lock │ ├── README.md │ ├── lambda_function.py │ ├── test.py │ └── test_docker.py ├── web-service-mlflow │ ├── Pipfile │ ├── Pipfile.lock │ ├── README.md │ ├── dict_vectorizer.bin │ ├── predict.py │ ├── random-forest.ipynb │ └── test.py └── web-service │ ├── Dockerfile │ ├── Pipfile │ ├── Pipfile.lock │ ├── README.md │ ├── lin_reg.bin │ ├── predict.py │ └── test.py ├── 05-monitoring ├── README.md ├── baseline_model_nyc_taxi_data.ipynb ├── config │ ├── grafana_dashboards.yaml │ └── grafana_datasources.yaml ├── dashboards │ └── data_drift.json ├── data │ └── .gitignore ├── debugging_nyc_taxi_data.ipynb ├── docker-compose.yml ├── dummy_metrics_calculation.py ├── evidently_metrics_calculation.py ├── images │ ├── thumbnail-5-01.jpg │ ├── thumbnail-5-02.jpg │ ├── thumbnail-5-03.jpg │ ├── thumbnail-5-04.jpg │ ├── thumbnail-5-05.jpg │ ├── thumbnail-5-06.jpg │ ├── thumbnail-5-07.jpg │ └── thumbnail-5-08.jpg ├── meta.json ├── models │ └── .gitignore └── requirements.txt ├── 06-best-practices ├── AWS-stream-pipeline.png ├── README.md ├── ci_cd_zoomcamp.png ├── code │ ├── .gitignore │ ├── .pre-commit-config.yaml │ ├── .vscode │ │ └── settings.json │ ├── Dockerfile │ ├── Makefile │ ├── Pipfile │ ├── Pipfile.lock │ ├── README.md │ ├── infrastructure │ │ ├── main.tf │ │ ├── modules │ │ │ ├── ecr │ │ │ │ ├── main.tf │ │ │ │ └── variables.tf │ │ │ ├── kinesis │ │ │ │ ├── main.tf │ │ │ │ └── variables.tf │ │ │ ├── lambda │ │ │ │ ├── iam.tf │ │ │ │ ├── main.tf │ │ │ │ └── variables.tf │ │ │ └── s3 │ │ │ │ ├── main.tf │ │ │ │ └── variables.tf │ │ ├── variables.tf │ │ └── vars │ │ │ ├── prod.tfvars │ │ │ └── stg.tfvars │ ├── integration-test │ │ ├── docker-compose.yaml │ │ ├── event.json │ │ ├── model │ │ │ ├── MLmodel │ │ │ ├── conda.yaml │ │ │ ├── model.pkl │ │ │ ├── python_env.yaml │ │ │ └── requirements.txt │ │ ├── run.sh │ │ ├── test_docker.py │ │ └── test_kinesis.py │ ├── lambda_function.py │ ├── model.py │ ├── plan.md │ ├── pyproject.toml │ ├── scripts │ │ ├── deploy_manual.sh 
│ │ ├── publish.sh │ │ └── test_cloud_e2e.sh │ └── tests │ │ ├── __init__.py │ │ ├── data.b64 │ │ └── model_test.py ├── docs.md ├── images │ ├── thumbnail-6-1.jpg │ ├── thumbnail-6-2.jpg │ ├── thumbnail-6-3.jpg │ ├── thumbnail-6-4.jpg │ ├── thumbnail-6-5.jpg │ └── thumbnail-6-6.jpg └── meta.json ├── 07-project ├── README.md └── images │ └── thumbnail-7-1.jpg ├── README.md ├── after-sign-up.md ├── asking-questions.md ├── certificate.md ├── cohorts ├── 2022 │ ├── 01-intro │ │ ├── homework.ipynb │ │ └── homework.md │ ├── 02-experiment-tracking │ │ ├── homework.md │ │ └── homework │ │ │ ├── hpo.py │ │ │ ├── preprocess_data.py │ │ │ ├── register_model.py │ │ │ └── train.py │ ├── 03-orchestration │ │ ├── README.md │ │ ├── code │ │ │ ├── model_training.py │ │ │ ├── orchestration.py │ │ │ ├── prefect_deploy.py │ │ │ ├── prefect_flow.py │ │ │ └── work-queue.py │ │ ├── homework.md │ │ ├── homework.py │ │ ├── homework_solution.py │ │ └── images │ │ │ ├── thumbnail-3-01.jpg │ │ │ ├── thumbnail-3-02.jpg │ │ │ ├── thumbnail-3-03.jpg │ │ │ ├── thumbnail-3-04.jpg │ │ │ ├── thumbnail-3-05.jpg │ │ │ └── thumbnail-3-06.jpg │ ├── 04-deployment │ │ ├── homework.md │ │ └── homework │ │ │ ├── Dockerfile │ │ │ ├── Pipfile │ │ │ ├── Pipfile.lock │ │ │ ├── batch.py │ │ │ ├── homework.dockerfile │ │ │ ├── model.bin │ │ │ └── starter.ipynb │ ├── 05-monitoring │ │ ├── README.md │ │ ├── homework.md │ │ └── homework │ │ │ ├── docker-compose-homework-solution.yml │ │ │ ├── docker-compose-homework.yml │ │ │ ├── model_training.py │ │ │ ├── prediction_service │ │ │ ├── Dockerfile │ │ │ ├── Pipfile │ │ │ ├── Pipfile.lock │ │ │ ├── app.py │ │ │ ├── lin_reg.bin │ │ │ └── lin_reg_V2.bin │ │ │ ├── prefect-monitoring │ │ │ ├── Pipfile │ │ │ ├── Pipfile.lock │ │ │ ├── clean_mongo.py │ │ │ ├── monitor_profile.ipynb │ │ │ ├── monitor_profile_solution.ipynb │ │ │ ├── prefect_monitoring.py │ │ │ ├── prefect_monitoring_solution.py │ │ │ ├── prepare_reference_data.py │ │ │ └── send_data.py │ │ │ ├── prepare.py │ │ │ ├── requirements.txt │ │ │ └── test.py │ ├── 06-best-practices │ │ ├── homework.md │ │ ├── homework │ │ │ ├── Dockerfile │ │ │ ├── Pipfile │ │ │ ├── Pipfile.lock │ │ │ ├── batch.py │ │ │ └── model.bin │ │ └── homework_solution │ │ │ ├── Dockerfile │ │ │ ├── Pipfile │ │ │ ├── Pipfile.lock │ │ │ ├── batch.py │ │ │ ├── docker-compose.yaml │ │ │ ├── integration_test.py │ │ │ ├── integration_test.sh │ │ │ ├── model.bin │ │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── test_batch.py │ ├── 07-project │ │ └── README.md │ └── leaderboard.md ├── 2023 │ ├── 01-intro │ │ └── homework.md │ ├── 02-experiment-tracking │ │ ├── homework-wandb │ │ │ ├── preprocess_data.py │ │ │ ├── sweep.py │ │ │ └── train.py │ │ ├── homework.md │ │ ├── homework │ │ │ ├── hpo.py │ │ │ ├── preprocess_data.py │ │ │ ├── register_model.py │ │ │ └── train.py │ │ ├── solution-mlflow │ │ │ ├── hpo.py │ │ │ ├── preprocess_data.py │ │ │ ├── register_model.py │ │ │ └── train.py │ │ └── wandb.md │ ├── 03-orchestration │ │ ├── homework.md │ │ └── prefect │ │ │ ├── .gitignore │ │ │ ├── 3.2 │ │ │ ├── cat_dog_facts.py │ │ │ └── cat_facts.py │ │ │ ├── 3.3 │ │ │ ├── duration_prediction_explore.ipynb │ │ │ ├── duration_prediction_original.ipynb │ │ │ ├── orchestrate.py │ │ │ └── orchestrate_pre_prefect.py │ │ │ ├── 3.4 │ │ │ └── orchestrate.py │ │ │ ├── 3.5 │ │ │ ├── create_s3_bucket_block.py │ │ │ ├── orchestrate.py │ │ │ └── orchestrate_s3.py │ │ │ ├── 3.6 │ │ │ ├── create_s3_bucket_block.py │ │ │ └── orchestrate_s3.py │ │ │ ├── README.md │ │ │ ├── images │ │ │ ├── 
Activity-create-run-deployment.png │ │ │ ├── thumbnail-3-01.jpg │ │ │ ├── thumbnail-3-01.png │ │ │ ├── thumbnail-3-02.jpg │ │ │ ├── thumbnail-3-03.jpg │ │ │ ├── thumbnail-3-03.png │ │ │ ├── thumbnail-3-04.jpg │ │ │ ├── thumbnail-3-04.png │ │ │ ├── thumbnail-3-05.jpg │ │ │ ├── thumbnail-3-05.png │ │ │ ├── thumbnail-3-06.jpg │ │ │ └── thumbnail-3-06.png │ │ │ ├── meta.json │ │ │ └── requirements.txt │ ├── 04-deployment │ │ ├── homework.md │ │ └── homework │ │ │ ├── Dockerfile │ │ │ ├── model.bin │ │ │ └── starter.ipynb │ ├── 05-monitoring │ │ └── homework.md │ ├── 06-best-practices │ │ ├── homework.md │ │ ├── homework │ │ │ ├── Dockerfile │ │ │ ├── Pipfile │ │ │ ├── Pipfile.lock │ │ │ ├── batch.py │ │ │ └── model.bin │ │ └── homework_solution │ │ │ ├── Pipfile │ │ │ ├── Pipfile.lock │ │ │ ├── batch.py │ │ │ ├── docker-compose.yaml │ │ │ ├── integration_test.py │ │ │ ├── model.bin │ │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── test_batch.py │ ├── 07-project │ │ └── README.md │ └── README.md ├── 2024 │ ├── 01-intro │ │ └── homework.md │ ├── 02-experiment-tracking │ │ ├── homework.md │ │ ├── homework │ │ │ ├── hpo.py │ │ │ ├── preprocess_data.py │ │ │ ├── register_model.py │ │ │ └── train.py │ │ └── solution │ │ │ ├── hpo.py │ │ │ ├── preprocess_data.py │ │ │ ├── register_model.py │ │ │ └── train.py │ ├── 03-orchestration │ │ ├── .gitignore │ │ ├── 3.0 │ │ │ └── README.md │ │ ├── 3.1 │ │ │ └── README.md │ │ ├── 3.2 │ │ │ └── README.md │ │ ├── 3.3 │ │ │ └── README.md │ │ ├── 3.4 │ │ │ └── README.md │ │ ├── 3.5 │ │ │ └── README.md │ │ ├── README.md │ │ ├── homework.md │ │ ├── meta.json │ │ └── requirements.txt │ ├── 04-deployment │ │ ├── homework.md │ │ ├── homework │ │ │ ├── model.bin │ │ │ └── starter.ipynb │ │ └── homework_solution │ │ │ ├── Dockerfile │ │ │ ├── Pipfile │ │ │ ├── Pipfile.lock │ │ │ ├── batch.py │ │ │ ├── homework.dockerfile │ │ │ ├── model.bin │ │ │ ├── model2.bin │ │ │ └── solution.ipynb │ ├── 05-monitoring │ │ └── homework.md │ ├── 06-best-practices │ │ ├── homework.md │ │ └── homework │ │ │ ├── Dockerfile │ │ │ ├── Pipfile │ │ │ ├── Pipfile.lock │ │ │ ├── batch.py │ │ │ └── model.bin │ ├── README.md │ └── project.md └── 2025 │ ├── 01-intro │ ├── homework.ipynb │ └── homework.md │ ├── 02-experiment-tracking │ ├── homework.md │ └── homework │ │ ├── hpo.py │ │ ├── preprocess_data.py │ │ ├── register_model.py │ │ └── train.py │ ├── 03-orchestration │ └── homework.md │ ├── 04-deployment │ ├── homework.md │ └── homework │ │ ├── model.bin │ │ └── starter.ipynb │ ├── 05-monitoring │ └── homework.md │ ├── 06-best-practices │ ├── homework.md │ └── homework │ │ ├── Dockerfile │ │ ├── Pipfile │ │ ├── Pipfile.lock │ │ ├── batch.py │ │ └── model.bin │ ├── README.md │ └── project.md ├── generate └── generate_pages.ipynb ├── images ├── IMG_20230323_134059_927.png ├── banner-2025.jpg ├── banner.png ├── learning-in-public-links.png ├── learning-in-public.png └── play.png └── learning-in-public.md /.github/workflows/ci-tests.yml: -------------------------------------------------------------------------------- 1 | name: CI-Tests 2 | on: 3 | pull_request: 4 | branches: 5 | - 'develop' 6 | paths: 7 | - '06-best-practices/code/**' 8 | 9 | env: 10 | AWS_DEFAULT_REGION: 'eu-west-1' 11 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 12 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 13 | 14 | jobs: 15 | test: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python 3.9 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: 
3.9 23 | 24 | - name: Install dependencies 25 | working-directory: "06-best-practices/code" 26 | run: pip install pipenv && pipenv install --dev 27 | 28 | - name: Run Unit tests 29 | working-directory: "06-best-practices/code" 30 | run: pipenv run pytest tests/ 31 | 32 | - name: Lint 33 | working-directory: "06-best-practices/code" 34 | run: pipenv run pylint --recursive=y . 35 | 36 | - name: Configure AWS Credentials 37 | uses: aws-actions/configure-aws-credentials@v1 38 | with: 39 | aws-access-key-id: ${{ env.AWS_ACCESS_KEY_ID }} 40 | aws-secret-access-key: ${{ env.AWS_SECRET_ACCESS_KEY }} 41 | aws-region: ${{ env.AWS_DEFAULT_REGION }} 42 | 43 | - name: Integration Test 44 | working-directory: '06-best-practices/code/integration-test' 45 | run: | 46 | . run.sh 47 | 48 | tf-plan: 49 | runs-on: ubuntu-latest 50 | steps: 51 | - uses: actions/checkout@v2 52 | - name: Configure AWS Credentials 53 | uses: aws-actions/configure-aws-credentials@v1 54 | with: 55 | aws-access-key-id: ${{ env.AWS_ACCESS_KEY_ID }} 56 | aws-secret-access-key: ${{ env.AWS_SECRET_ACCESS_KEY }} 57 | aws-region: ${{ env.AWS_DEFAULT_REGION }} 58 | 59 | - uses: hashicorp/setup-terraform@v2 60 | 61 | - name: TF plan 62 | id: plan 63 | working-directory: '06-best-practices/code/infrastructure' 64 | run: | 65 | terraform init -backend-config="key=mlops-zoomcamp-prod.tfstate" --reconfigure && terraform plan --var-file vars/prod.tfvars 66 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | .ipynb_checkpoints 3 | .bin 4 | *.db 5 | 6 | *.parquet 7 | *.html 8 | *.csv 9 | 10 | .venv 11 | venv 12 | .idea 13 | **/artifacts/ 14 | **/models/ 15 | 16 | __pycache__/ 17 | **.env 18 | **.terraform/ 19 | **.terraform.lock* 20 | **terraform.tfstate* 21 | 22 | 23 | .DS_Store -------------------------------------------------------------------------------- /01-intro/images/thumbnail-1-01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/01-intro/images/thumbnail-1-01.jpg -------------------------------------------------------------------------------- /01-intro/images/thumbnail-1-02-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/01-intro/images/thumbnail-1-02-1.jpg -------------------------------------------------------------------------------- /01-intro/images/thumbnail-1-02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/01-intro/images/thumbnail-1-02.jpg -------------------------------------------------------------------------------- /01-intro/images/thumbnail-1-03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/01-intro/images/thumbnail-1-03.jpg -------------------------------------------------------------------------------- /01-intro/images/thumbnail-1-04.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/01-intro/images/thumbnail-1-04.jpg -------------------------------------------------------------------------------- /01-intro/images/thumbnail-1-05.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/01-intro/images/thumbnail-1-05.jpg -------------------------------------------------------------------------------- /01-intro/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "module": { 3 | "number": 1, 4 | "title": "Introduction" 5 | }, 6 | "units": [ 7 | { 8 | "number": 1, 9 | "title": "Introduction", 10 | "youtube": "https://www.youtube.com/watch?v=s0uaFZSzwfI" 11 | }, 12 | { 13 | "number": 2, 14 | "title": "Environment preparation", 15 | "youtube": "https://www.youtube.com/watch?v=IXSiYkP23zo" 16 | }, 17 | { 18 | "number": 3, 19 | "title": "(Optional) Training a ride duration prediction model", 20 | "youtube": "https://www.youtube.com/watch?v=iRunifGSHFc" 21 | }, 22 | { 23 | "number": 4, 24 | "title": "Course overview", 25 | "youtube": "https://www.youtube.com/watch?v=teP9KWkP6SM" 26 | }, 27 | { 28 | "number": 5, 29 | "title": "MLOps maturity model", 30 | "youtube": "https://www.youtube.com/watch?v=XwTH8BDGzYk" 31 | }, 32 | { 33 | "number": 6, 34 | "title": "Homework", 35 | "youtube": "" 36 | } 37 | ] 38 | } -------------------------------------------------------------------------------- /02-experiment-tracking/images/db_configuration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/db_configuration.png -------------------------------------------------------------------------------- /02-experiment-tracking/images/db_password.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/db_password.png -------------------------------------------------------------------------------- /02-experiment-tracking/images/db_settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/db_settings.png -------------------------------------------------------------------------------- /02-experiment-tracking/images/ec2_instance_type.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/ec2_instance_type.png -------------------------------------------------------------------------------- /02-experiment-tracking/images/ec2_os.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/ec2_os.png -------------------------------------------------------------------------------- /02-experiment-tracking/images/key_pair.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/key_pair.png -------------------------------------------------------------------------------- /02-experiment-tracking/images/postgresql.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/postgresql.png -------------------------------------------------------------------------------- /02-experiment-tracking/images/postgresql_inbound_rule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/postgresql_inbound_rule.png -------------------------------------------------------------------------------- /02-experiment-tracking/images/s3_bucket.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/s3_bucket.png -------------------------------------------------------------------------------- /02-experiment-tracking/images/security_group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/security_group.png -------------------------------------------------------------------------------- /02-experiment-tracking/images/select_key_pair.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/select_key_pair.png -------------------------------------------------------------------------------- /02-experiment-tracking/images/thumbnail-2-01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/thumbnail-2-01.jpg -------------------------------------------------------------------------------- /02-experiment-tracking/images/thumbnail-2-02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/thumbnail-2-02.jpg -------------------------------------------------------------------------------- /02-experiment-tracking/images/thumbnail-2-03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/thumbnail-2-03.jpg -------------------------------------------------------------------------------- /02-experiment-tracking/images/thumbnail-2-04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/thumbnail-2-04.jpg -------------------------------------------------------------------------------- 
/02-experiment-tracking/images/thumbnail-2-05.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/thumbnail-2-05.jpg -------------------------------------------------------------------------------- /02-experiment-tracking/images/thumbnail-2-06.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/thumbnail-2-06.jpg -------------------------------------------------------------------------------- /02-experiment-tracking/images/thumbnail-2-07.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/02-experiment-tracking/images/thumbnail-2-07.jpg -------------------------------------------------------------------------------- /02-experiment-tracking/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "module": { 3 | "number": 2, 4 | "title": "Experiment tracking and model management" 5 | }, 6 | "units": [ 7 | { 8 | "number": 1, 9 | "title": "Experiment tracking intro", 10 | "youtube": "https://www.youtube.com/watch?v=MiA7LQin9c8" 11 | }, 12 | { 13 | "number": 2, 14 | "title": "Getting started with MLflow", 15 | "youtube": "https://www.youtube.com/watch?v=cESCQE9J3ZE" 16 | }, 17 | { 18 | "number": 3, 19 | "title": "Experiment tracking with MLflow", 20 | "youtube": "https://www.youtube.com/watch?v=iaJz-T7VWec" 21 | }, 22 | { 23 | "number": 4, 24 | "title": "Model management", 25 | "youtube": "https://www.youtube.com/watch?v=OVUPIX88q88" 26 | }, 27 | { 28 | "number": 5, 29 | "title": "Model registry", 30 | "youtube": "https://www.youtube.com/watch?v=TKHU7HAvGH8" 31 | }, 32 | { 33 | "number": 6, 34 | "title": "MLflow in practice", 35 | "youtube": "https://www.youtube.com/watch?v=1ykg4YmbFVA" 36 | }, 37 | { 38 | "number": 7, 39 | "title": "MLflow: benefits, limitations and alternatives", 40 | "youtube": "https://www.youtube.com/watch?v=Lugy1JPsBRY" 41 | }, 42 | { 43 | "number": 8, 44 | "title": "Homework", 45 | "youtube": "" 46 | } 47 | ] 48 | } -------------------------------------------------------------------------------- /02-experiment-tracking/requirements.txt: -------------------------------------------------------------------------------- 1 | mlflow 2 | jupyter 3 | scikit-learn 4 | pandas 5 | seaborn 6 | hyperopt 7 | xgboost 8 | fastparquet 9 | boto3 -------------------------------------------------------------------------------- /03-orchestration/code/commands.md: -------------------------------------------------------------------------------- 1 | ```bash 2 | pip install mlflow jupyter pandas numpy scikit-learn xgboost hyperopt 3 | wget https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/refs/heads/main/02-experiment-tracking/duration-prediction.ipynb 4 | 5 | 6 | jupyter notebook 7 | 8 | mlflow server \ 9 | --backend-store-uri sqlite:///mlflow.db 10 | ``` 11 | 12 | 13 | ```python 14 | import mlflow 15 | 16 | mlflow.set_tracking_uri("http://localhost:5000") 17 | mlflow.set_experiment("nyc-taxi-experiment") 18 | ``` 19 | 20 | ```python 21 | URL = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet' 22 | ``` 
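23 | 24 | A minimal sketch, assuming the standard green-taxi schema with `lpep_pickup_datetime` and `lpep_dropoff_datetime` columns, of how the notebook derives the duration target from this file: 25 | 26 | ```python 27 | import pandas as pd 28 | 29 | # read the month of green-taxi trips referenced by URL above 30 | df = pd.read_parquet(URL) 31 | 32 | # target variable: trip duration in minutes, restricted to 1-60 minutes 33 | df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime 34 | df['duration'] = df.duration.dt.total_seconds() / 60 35 | df = df[(df.duration >= 1) & (df.duration <= 60)] 36 | ```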
-------------------------------------------------------------------------------- /04-deployment/batch/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.0.2" 8 | prefect = "==2.0b6" 9 | mlflow = "*" 10 | pandas = "*" 11 | boto3 = "*" 12 | pyarrow = "*" 13 | s3fs = "*" 14 | 15 | [dev-packages] 16 | 17 | [requires] 18 | python_version = "3.9" 19 | -------------------------------------------------------------------------------- /04-deployment/batch/README.md: -------------------------------------------------------------------------------- 1 | ## Batch deployment 2 | 3 | * Turn the notebook for training a model into a notebook for applying the model 4 | * Turn the notebook into a script 5 | * Clean it and parametrize 6 | -------------------------------------------------------------------------------- /04-deployment/batch/score_backfill.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from dateutil.relativedelta import relativedelta 3 | 4 | from prefect import flow 5 | 6 | import score 7 | 8 | 9 | @flow 10 | def ride_duration_prediction_backfill(): 11 | start_date = datetime(year=2021, month=3, day=1) 12 | end_date = datetime(year=2022, month=4, day=1) 13 | 14 | d = start_date 15 | 16 | while d <= end_date: 17 | score.ride_duration_prediction( 18 | taxi_type='green', 19 | run_id='e1efc53e9bd149078b0c12aeaa6365df', 20 | run_date=d 21 | ) 22 | 23 | d = d + relativedelta(months=1) 24 | 25 | 26 | if __name__ == '__main__': 27 | ride_duration_prediction_backfill() -------------------------------------------------------------------------------- /04-deployment/batch/score_deploy.py: -------------------------------------------------------------------------------- 1 | from prefect.deployments import Deployment 2 | from prefect.orion.schemas.schedules import CronSchedule 3 | from score import ride_duration_prediction 4 | 5 | deployment = Deployment.build_from_flow( 6 | flow=ride_duration_prediction, 7 | name="ride_duration_prediction", 8 | parameters={ 9 | "taxi_type": "green", 10 | "run_id": "e1efc53e9bd149078b0c12aeaa6365df", 11 | }, 12 | schedule=CronSchedule(cron="0 3 2 * *"), 13 | work_queue_name="ml", 14 | ) 15 | 16 | deployment.apply() 17 | -------------------------------------------------------------------------------- /04-deployment/images/thumbnail-4-01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/04-deployment/images/thumbnail-4-01.jpg -------------------------------------------------------------------------------- /04-deployment/images/thumbnail-4-02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/04-deployment/images/thumbnail-4-02.jpg -------------------------------------------------------------------------------- /04-deployment/images/thumbnail-4-03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/04-deployment/images/thumbnail-4-03.jpg -------------------------------------------------------------------------------- 
/04-deployment/images/thumbnail-4-04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/04-deployment/images/thumbnail-4-04.jpg -------------------------------------------------------------------------------- /04-deployment/images/thumbnail-4-05.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/04-deployment/images/thumbnail-4-05.jpg -------------------------------------------------------------------------------- /04-deployment/images/thumbnail-4-06.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/04-deployment/images/thumbnail-4-06.jpg -------------------------------------------------------------------------------- /04-deployment/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "module": { 3 | "number": 4, 4 | "title": "Model Deployment" 5 | }, 6 | "units": [ 7 | { 8 | "number": 1, 9 | "title": "Three ways of deploying a model", 10 | "youtube": "https://www.youtube.com/watch?v=JMGe4yIoBRA" 11 | }, 12 | { 13 | "number": 2, 14 | "title": "Web-services: Deploying models with Flask and Docker", 15 | "youtube": "https://www.youtube.com/watch?v=D7wfMAdgdF8" 16 | }, 17 | { 18 | "number": 3, 19 | "title": "Web-services: Getting the models from the model registry (MLflow)", 20 | "youtube": "https://www.youtube.com/watch?v=aewOpHSCkqI" 21 | }, 22 | { 23 | "number": 4, 24 | "title": "(Optional) Streaming: Deploying models with Kinesis and Lambda ", 25 | "youtube": "https://www.youtube.com/watch?v=TCqr9HNcrsI" 26 | }, 27 | { 28 | "number": 5, 29 | "title": "Batch: Preparing a scoring script", 30 | "youtube": "https://www.youtube.com/watch?v=18Lbaaeigek" 31 | }, 32 | { 33 | "number": 6, 34 | "title": "MLOps Zoomcamp 4.6 - Batch: Scheduling batch scoring jobs with Prefect", 35 | "youtube": "https://www.youtube.com/watch?v=ekT_JW213Tc" 36 | }, 37 | { 38 | "number": 7, 39 | "title": "Homework", 40 | "youtube": "" 41 | } 42 | ] 43 | } -------------------------------------------------------------------------------- /04-deployment/streaming/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.9 2 | 3 | RUN pip install -U pip 4 | RUN pip install pipenv 5 | 6 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 7 | 8 | RUN pipenv install --system --deploy 9 | 10 | COPY [ "lambda_function.py", "./" ] 11 | 12 | CMD [ "lambda_function.lambda_handler" ] 13 | -------------------------------------------------------------------------------- /04-deployment/streaming/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | boto3 = "*" 8 | mlflow = "*" 9 | scikit-learn = "==1.0.2" 10 | 11 | [dev-packages] 12 | 13 | [requires] 14 | python_version = "3.9" 15 | -------------------------------------------------------------------------------- /04-deployment/streaming/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import boto3 4 | import base64 5 | 6 | import 
mlflow 7 | 8 | kinesis_client = boto3.client('kinesis') 9 | 10 | PREDICTIONS_STREAM_NAME = os.getenv('PREDICTIONS_STREAM_NAME', 'ride_predictions') 11 | 12 | 13 | RUN_ID = os.getenv('RUN_ID') 14 | 15 | logged_model = f's3://mlflow-models-alexey/1/{RUN_ID}/artifacts/model' 16 | # logged_model = f'runs:/{RUN_ID}/model' 17 | model = mlflow.pyfunc.load_model(logged_model) 18 | 19 | 20 | TEST_RUN = os.getenv('TEST_RUN', 'False') == 'True' 21 | 22 | def prepare_features(ride): 23 | features = {} 24 | features['PU_DO'] = '%s_%s' % (ride['PULocationID'], ride['DOLocationID']) 25 | features['trip_distance'] = ride['trip_distance'] 26 | return features 27 | 28 | 29 | def predict(features): 30 | pred = model.predict(features) 31 | return float(pred[0]) 32 | 33 | 34 | def lambda_handler(event, context): 35 | # print(json.dumps(event)) 36 | 37 | predictions_events = [] 38 | 39 | for record in event['Records']: 40 | encoded_data = record['kinesis']['data'] 41 | decoded_data = base64.b64decode(encoded_data).decode('utf-8') 42 | ride_event = json.loads(decoded_data) 43 | 44 | # print(ride_event) 45 | ride = ride_event['ride'] 46 | ride_id = ride_event['ride_id'] 47 | 48 | features = prepare_features(ride) 49 | prediction = predict(features) 50 | 51 | prediction_event = { 52 | 'model': 'ride_duration_prediction_model', 53 | 'version': '123', 54 | 'prediction': { 55 | 'ride_duration': prediction, 56 | 'ride_id': ride_id 57 | } 58 | } 59 | 60 | if not TEST_RUN: 61 | kinesis_client.put_record( 62 | StreamName=PREDICTIONS_STREAM_NAME, 63 | Data=json.dumps(prediction_event), 64 | PartitionKey=str(ride_id) 65 | ) 66 | 67 | predictions_events.append(prediction_event) 68 | 69 | 70 | return { 71 | 'predictions': predictions_events 72 | } 73 | -------------------------------------------------------------------------------- /04-deployment/streaming/test.py: -------------------------------------------------------------------------------- 1 | 2 | import lambda_function 3 | 4 | event = { 5 | "Records": [ 6 | { 7 | "kinesis": { 8 | "kinesisSchemaVersion": "1.0", 9 | "partitionKey": "1", 10 | "sequenceNumber": "49630081666084879290581185630324770398608704880802529282", 11 | "data": "ewogICAgICAgICJyaWRlIjogewogICAgICAgICAgICAiUFVMb2NhdGlvbklEIjogMTMwLAogICAgICAgICAgICAiRE9Mb2NhdGlvbklEIjogMjA1LAogICAgICAgICAgICAidHJpcF9kaXN0YW5jZSI6IDMuNjYKICAgICAgICB9LCAKICAgICAgICAicmlkZV9pZCI6IDI1NgogICAgfQ==", 12 | "approximateArrivalTimestamp": 1654161514.132 13 | }, 14 | "eventSource": "aws:kinesis", 15 | "eventVersion": "1.0", 16 | "eventID": "shardId-000000000000:49630081666084879290581185630324770398608704880802529282", 17 | "eventName": "aws:kinesis:record", 18 | "invokeIdentityArn": "arn:aws:iam::387546586013:role/lambda-kinesis-role", 19 | "awsRegion": "eu-west-1", 20 | "eventSourceARN": "arn:aws:kinesis:eu-west-1:387546586013:stream/ride_events" 21 | } 22 | ] 23 | } 24 | 25 | 26 | result = lambda_function.lambda_handler(event, None) 27 | print(result) 28 | -------------------------------------------------------------------------------- /04-deployment/streaming/test_docker.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | event = { 4 | "Records": [ 5 | { 6 | "kinesis": { 7 | "kinesisSchemaVersion": "1.0", 8 | "partitionKey": "1", 9 | "sequenceNumber": "49630081666084879290581185630324770398608704880802529282", 10 | "data": 
"ewogICAgICAgICJyaWRlIjogewogICAgICAgICAgICAiUFVMb2NhdGlvbklEIjogMTMwLAogICAgICAgICAgICAiRE9Mb2NhdGlvbklEIjogMjA1LAogICAgICAgICAgICAidHJpcF9kaXN0YW5jZSI6IDMuNjYKICAgICAgICB9LCAKICAgICAgICAicmlkZV9pZCI6IDI1NgogICAgfQ==", 11 | "approximateArrivalTimestamp": 1654161514.132 12 | }, 13 | "eventSource": "aws:kinesis", 14 | "eventVersion": "1.0", 15 | "eventID": "shardId-000000000000:49630081666084879290581185630324770398608704880802529282", 16 | "eventName": "aws:kinesis:record", 17 | "invokeIdentityArn": "arn:aws:iam::387546586013:role/lambda-kinesis-role", 18 | "awsRegion": "eu-west-1", 19 | "eventSourceARN": "arn:aws:kinesis:eu-west-1:387546586013:stream/ride_events" 20 | } 21 | ] 22 | } 23 | 24 | 25 | url = 'http://localhost:8080/2015-03-31/functions/function/invocations' 26 | response = requests.post(url, json=event) 27 | print(response.json()) 28 | -------------------------------------------------------------------------------- /04-deployment/web-service-mlflow/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.0.2" 8 | flask = "*" 9 | gunicorn = "*" 10 | mlflow = "*" 11 | boto3 = "*" 12 | 13 | [dev-packages] 14 | requests = "*" 15 | 16 | [requires] 17 | python_version = "3.9" 18 | -------------------------------------------------------------------------------- /04-deployment/web-service-mlflow/README.md: -------------------------------------------------------------------------------- 1 | ## Getting the model for deployment from MLflow 2 | 3 | * Take the code from the previous video 4 | * Train another model, register with MLflow 5 | * Put the model into a scikit-learn pipeline 6 | * Model deployment with tracking server 7 | * Model deployment without the tracking server 8 | 9 | Starting the MLflow server with S3: 10 | 11 | ```bash 12 | mlflow server \ 13 | --backend-store-uri=sqlite:///mlflow.db \ 14 | --default-artifact-root=s3://mlflow-models-alexey/ 15 | ``` 16 | 17 | Downloading the artifact 18 | 19 | ```bash 20 | export MLFLOW_TRACKING_URI="http://127.0.0.1:5000" 21 | export MODEL_RUN_ID="6dd459b11b4e48dc862f4e1019d166f6" 22 | 23 | mlflow artifacts download \ 24 | --run-id ${MODEL_RUN_ID} \ 25 | --artifact-path model \ 26 | --dst-path . 
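# this should download the run's model/ artifact directory into the current directory (MLmodel, conda.yaml, model.pkl, ...)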
27 | ``` -------------------------------------------------------------------------------- /04-deployment/web-service-mlflow/dict_vectorizer.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/04-deployment/web-service-mlflow/dict_vectorizer.bin -------------------------------------------------------------------------------- /04-deployment/web-service-mlflow/predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | import mlflow 5 | from flask import Flask, request, jsonify 6 | 7 | 8 | RUN_ID = os.getenv('RUN_ID') 9 | 10 | logged_model = f's3://mlflow-models-alexey/1/{RUN_ID}/artifacts/model' 11 | # logged_model = f'runs:/{RUN_ID}/model' 12 | model = mlflow.pyfunc.load_model(logged_model) 13 | 14 | 15 | def prepare_features(ride): 16 | features = {} 17 | features['PU_DO'] = '%s_%s' % (ride['PULocationID'], ride['DOLocationID']) 18 | features['trip_distance'] = ride['trip_distance'] 19 | return features 20 | 21 | 22 | def predict(features): 23 | preds = model.predict(features) 24 | return float(preds[0]) 25 | 26 | 27 | app = Flask('duration-prediction') 28 | 29 | 30 | @app.route('/predict', methods=['POST']) 31 | def predict_endpoint(): 32 | ride = request.get_json() 33 | 34 | features = prepare_features(ride) 35 | pred = predict(features) 36 | 37 | result = { 38 | 'duration': pred, 39 | 'model_version': RUN_ID 40 | } 41 | 42 | return jsonify(result) 43 | 44 | 45 | if __name__ == "__main__": 46 | app.run(debug=True, host='0.0.0.0', port=9696) 47 | -------------------------------------------------------------------------------- /04-deployment/web-service-mlflow/test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | ride = { 4 | "PULocationID": 10, 5 | "DOLocationID": 50, 6 | "trip_distance": 40 7 | } 8 | 9 | url = 'http://localhost:9696/predict' 10 | response = requests.post(url, json=ride) 11 | print(response.json()) 12 | -------------------------------------------------------------------------------- /04-deployment/web-service/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.7-slim 2 | 3 | RUN pip install -U pip 4 | RUN pip install pipenv 5 | 6 | WORKDIR /app 7 | 8 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 9 | 10 | RUN pipenv install --system --deploy 11 | 12 | COPY [ "predict.py", "lin_reg.bin", "./" ] 13 | 14 | EXPOSE 9696 15 | 16 | ENTRYPOINT [ "gunicorn", "--bind=0.0.0.0:9696", "predict:app" ] -------------------------------------------------------------------------------- /04-deployment/web-service/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.0.2" 8 | flask = "*" 9 | gunicorn = "*" 10 | 11 | [dev-packages] 12 | requests = "*" 13 | 14 | [requires] 15 | python_version = "3.9" 16 | -------------------------------------------------------------------------------- /04-deployment/web-service/README.md: -------------------------------------------------------------------------------- 1 | ## Deploying a model as a web-service 2 | 3 | * Creating a virtual environment with Pipenv 4 | * Creating a script for predicting 5 | * Putting the script into a Flask app 6 | * Packaging the app to
Docker 7 | 8 | 9 | ```bash 10 | docker build -t ride-duration-prediction-service:v1 . 11 | ``` 12 | 13 | ```bash 14 | docker run -it --rm -p 9696:9696 ride-duration-prediction-service:v1 15 | ``` 16 | -------------------------------------------------------------------------------- /04-deployment/web-service/lin_reg.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/04-deployment/web-service/lin_reg.bin -------------------------------------------------------------------------------- /04-deployment/web-service/predict.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from flask import Flask, request, jsonify 4 | 5 | with open('lin_reg.bin', 'rb') as f_in: 6 | (dv, model) = pickle.load(f_in) 7 | 8 | 9 | def prepare_features(ride): 10 | features = {} 11 | features['PU_DO'] = '%s_%s' % (ride['PULocationID'], ride['DOLocationID']) 12 | features['trip_distance'] = ride['trip_distance'] 13 | return features 14 | 15 | 16 | def predict(features): 17 | X = dv.transform(features) 18 | preds = model.predict(X) 19 | return float(preds[0]) 20 | 21 | 22 | app = Flask('duration-prediction') 23 | 24 | 25 | @app.route('/predict', methods=['POST']) 26 | def predict_endpoint(): 27 | ride = request.get_json() 28 | 29 | features = prepare_features(ride) 30 | pred = predict(features) 31 | 32 | result = { 33 | 'duration': pred 34 | } 35 | 36 | return jsonify(result) 37 | 38 | 39 | if __name__ == "__main__": 40 | app.run(debug=True, host='0.0.0.0', port=9696) -------------------------------------------------------------------------------- /04-deployment/web-service/test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | ride = { 4 | "PULocationID": 10, 5 | "DOLocationID": 50, 6 | "trip_distance": 40 7 | } 8 | 9 | url = 'http://localhost:9696/predict' 10 | response = requests.post(url, json=ride) 11 | print(response.json()) 12 | -------------------------------------------------------------------------------- /05-monitoring/config/grafana_dashboards.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | # a unique provider name. Required 5 | - name: 'Evidently Dashboards' 6 | # Org id. Defaults to 1 7 | orgId: 1 8 | # name of the dashboard folder. 9 | folder: '' 10 | # folder UID. Will be automatically generated if not specified 11 | folderUid: '' 12 | # provider type. Defaults to 'file' 13 | type: file 14 | # disable dashboard deletion 15 | disableDeletion: false 16 | # how often Grafana will scan for changed dashboards 17 | updateIntervalSeconds: 10 18 | # allow updating provisioned dashboards from the UI 19 | allowUiUpdates: false 20 | options: 21 | # path to dashboard files on disk.
Required when using the 'file' type 22 | path: /opt/grafana/dashboards 23 | # use folder names from filesystem to create folders in Grafana 24 | foldersFromFilesStructure: true -------------------------------------------------------------------------------- /05-monitoring/config/grafana_datasources.yaml: -------------------------------------------------------------------------------- 1 | # config file version 2 | apiVersion: 1 3 | 4 | # list of datasources to insert/update 5 | # available in the database 6 | datasources: 7 | - name: PostgreSQL 8 | type: postgres 9 | access: proxy 10 | url: db:5432 11 | database: test 12 | user: postgres 13 | secureJsonData: 14 | password: 'example' 15 | jsonData: 16 | sslmode: 'disable' 17 | database: test 18 | -------------------------------------------------------------------------------- /05-monitoring/data/.gitignore: -------------------------------------------------------------------------------- 1 | !.gitignore 2 | -------------------------------------------------------------------------------- /05-monitoring/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | volumes: 4 | grafana_data: {} 5 | 6 | networks: 7 | front-tier: 8 | back-tier: 9 | 10 | services: 11 | db: 12 | image: postgres 13 | restart: always 14 | environment: 15 | POSTGRES_PASSWORD: example 16 | ports: 17 | - "5432:5432" 18 | networks: 19 | - back-tier 20 | 21 | adminer: 22 | image: adminer 23 | restart: always 24 | ports: 25 | - "8080:8080" 26 | networks: 27 | - back-tier 28 | - front-tier 29 | 30 | grafana: 31 | image: grafana/grafana-enterprise 32 | user: "472" 33 | ports: 34 | - "3000:3000" 35 | volumes: 36 | - ./config/grafana_datasources.yaml:/etc/grafana/provisioning/datasources/datasource.yaml:ro 37 | - ./config/grafana_dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:ro 38 | - ./dashboards:/opt/grafana/dashboards 39 | networks: 40 | - back-tier 41 | - front-tier 42 | restart: always -------------------------------------------------------------------------------- /05-monitoring/dummy_metrics_calculation.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import time 3 | import random 4 | import logging 5 | import uuid 6 | import pytz 7 | import pandas as pd 8 | import io 9 | import psycopg 10 | 11 | logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]: %(message)s") 12 | 13 | SEND_TIMEOUT = 10 14 | rand = random.Random() 15 | 16 | create_table_statement = """ 17 | drop table if exists dummy_metrics; 18 | create table dummy_metrics( 19 | timestamp timestamp, 20 | value1 integer, 21 | value2 varchar, 22 | value3 float 23 | ) 24 | """ 25 | 26 | def prep_db(): 27 | with psycopg.connect("host=localhost port=5432 user=postgres password=example", autocommit=True) as conn: 28 | res = conn.execute("SELECT 1 FROM pg_database WHERE datname='test'") 29 | if len(res.fetchall()) == 0: 30 | conn.execute("create database test;") 31 | with psycopg.connect("host=localhost port=5432 dbname=test user=postgres password=example") as conn: 32 | conn.execute(create_table_statement) 33 | 34 | def calculate_dummy_metrics_postgresql(curr): 35 | value1 = rand.randint(0, 1000) 36 | value2 = str(uuid.uuid4()) 37 | value3 = rand.random() 38 | 39 | curr.execute( 40 | "insert into dummy_metrics(timestamp, value1, value2, value3) values (%s, %s, %s, %s)", 41 | (datetime.datetime.now(pytz.timezone('Europe/London')), value1, value2, value3) 
42 | ) 43 | 44 | def main(): 45 | prep_db() 46 | last_send = datetime.datetime.now() - datetime.timedelta(seconds=10) 47 | with psycopg.connect("host=localhost port=5432 dbname=test user=postgres password=example", autocommit=True) as conn: 48 | for i in range(0, 100): 49 | with conn.cursor() as curr: 50 | calculate_dummy_metrics_postgresql(curr) 51 | 52 | new_send = datetime.datetime.now() 53 | seconds_elapsed = (new_send - last_send).total_seconds() 54 | if seconds_elapsed < SEND_TIMEOUT: 55 | time.sleep(SEND_TIMEOUT - seconds_elapsed) 56 | while last_send < new_send: 57 | last_send = last_send + datetime.timedelta(seconds=10) 58 | logging.info("data sent") 59 | 60 | if __name__ == '__main__': 61 | main() -------------------------------------------------------------------------------- /05-monitoring/images/thumbnail-5-01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/05-monitoring/images/thumbnail-5-01.jpg -------------------------------------------------------------------------------- /05-monitoring/images/thumbnail-5-02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/05-monitoring/images/thumbnail-5-02.jpg -------------------------------------------------------------------------------- /05-monitoring/images/thumbnail-5-03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/05-monitoring/images/thumbnail-5-03.jpg -------------------------------------------------------------------------------- /05-monitoring/images/thumbnail-5-04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/05-monitoring/images/thumbnail-5-04.jpg -------------------------------------------------------------------------------- /05-monitoring/images/thumbnail-5-05.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/05-monitoring/images/thumbnail-5-05.jpg -------------------------------------------------------------------------------- /05-monitoring/images/thumbnail-5-06.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/05-monitoring/images/thumbnail-5-06.jpg -------------------------------------------------------------------------------- /05-monitoring/images/thumbnail-5-07.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/05-monitoring/images/thumbnail-5-07.jpg -------------------------------------------------------------------------------- /05-monitoring/images/thumbnail-5-08.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/05-monitoring/images/thumbnail-5-08.jpg 
-------------------------------------------------------------------------------- /05-monitoring/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "module": { 3 | "number": 5, 4 | "title": "ML Monitoring" 5 | }, 6 | "units": [ 7 | { 8 | "number": 1, 9 | "title": "Intro to ML monitoring", 10 | "youtube": "https://www.youtube.com/watch?v=SQ0jBwd_3kk" 11 | }, 12 | { 13 | "number": 2, 14 | "title": "Environment setup", 15 | "youtube": "https://www.youtube.com/watch?v=yixA3C1xSxc" 16 | }, 17 | { 18 | "number": 3, 19 | "title": "Prepare reference and model", 20 | "youtube": "https://www.youtube.com/watch?v=IjNrkqMYQeQ" 21 | }, 22 | { 23 | "number": 4, 24 | "title": "Evidently metrics calculation", 25 | "youtube": "https://www.youtube.com/watch?v=kP3lzh_HfWY" 26 | }, 27 | { 28 | "number": 5, 29 | "title": "Dummy monitoring", 30 | "youtube": "https://www.youtube.com/watch?v=s3G4PMsOMOA" 31 | }, 32 | { 33 | "number": 6, 34 | "title": "Data quality monitoring", 35 | "youtube": "https://www.youtube.com/watch?v=fytrmPbcLhI" 36 | }, 37 | { 38 | "number": 7, 39 | "title": "Save Grafana Dashboard", 40 | "youtube": "https://www.youtube.com/watch?v=-c4iumyZMyw" 41 | }, 42 | { 43 | "number": 8, 44 | "title": "Debugging with test suites and reports", 45 | "youtube": "https://www.youtube.com/watch?v=sNSk3ojISh8" 46 | } 47 | ] 48 | } -------------------------------------------------------------------------------- /05-monitoring/models/.gitignore: -------------------------------------------------------------------------------- 1 | !.gitignore 2 | -------------------------------------------------------------------------------- /05-monitoring/requirements.txt: -------------------------------------------------------------------------------- 1 | prefect 2 | tqdm 3 | requests 4 | joblib 5 | pyarrow 6 | psycopg 7 | psycopg_binary 8 | evidently==0.6.7 9 | pandas 10 | numpy 11 | scikit-learn 12 | jupyter 13 | matplotlib 14 | -------------------------------------------------------------------------------- /06-best-practices/AWS-stream-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/06-best-practices/AWS-stream-pipeline.png -------------------------------------------------------------------------------- /06-best-practices/ci_cd_zoomcamp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/06-best-practices/ci_cd_zoomcamp.png -------------------------------------------------------------------------------- /06-best-practices/code/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | -------------------------------------------------------------------------------- /06-best-practices/code/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: end-of-file-fixer 9 | - id: check-yaml 10 | - id: check-added-large-files 11 | - repo: https://github.com/pycqa/isort 12 | rev: 5.10.1 13 | hooks: 14 | - id: isort 15 | name: isort (python) 16 | - 
repo: https://github.com/psf/black 17 | rev: 22.6.0 18 | hooks: 19 | - id: black 20 | language_version: python3.9 21 | - repo: local 22 | hooks: 23 | - id: pylint 24 | name: pylint 25 | entry: pylint 26 | language: system 27 | types: [python] 28 | args: [ 29 | "-rn", # Only display messages 30 | "-sn", # Don't display the score 31 | "--recursive=y" 32 | ] 33 | - repo: local 34 | hooks: 35 | - id: pytest-check 36 | name: pytest-check 37 | entry: pytest 38 | language: system 39 | pass_filenames: false 40 | always_run: true 41 | args: [ 42 | "tests/" 43 | ] 44 | -------------------------------------------------------------------------------- /06-best-practices/code/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | "tests" 4 | ], 5 | "python.testing.unittestEnabled": false, 6 | "python.testing.pytestEnabled": true, 7 | "python.linting.pylintEnabled": true, 8 | "python.linting.enabled": true 9 | } 10 | -------------------------------------------------------------------------------- /06-best-practices/code/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.9 2 | 3 | RUN pip install -U pip 4 | RUN pip install pipenv 5 | 6 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 7 | 8 | RUN pipenv install --system --deploy 9 | 10 | COPY [ "lambda_function.py", "model.py", "./" ] 11 | 12 | CMD [ "lambda_function.lambda_handler" ] 13 | -------------------------------------------------------------------------------- /06-best-practices/code/Makefile: -------------------------------------------------------------------------------- 1 | LOCAL_TAG:=$(shell date +"%Y-%m-%d-%H-%M") 2 | LOCAL_IMAGE_NAME:=stream-model-duration:${LOCAL_TAG} 3 | 4 | test: 5 | pytest tests/ 6 | 7 | quality_checks: 8 | isort . 9 | black . 10 | pylint --recursive=y . 11 | 12 | build: quality_checks test 13 | docker build -t ${LOCAL_IMAGE_NAME} . 
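# note: `make publish` resolves the whole chain: quality_checks -> test -> build (docker image) -> integration_test -> scripts/publish.sh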
14 | 15 | integration_test: build 16 | LOCAL_IMAGE_NAME=${LOCAL_IMAGE_NAME} bash integration-test/run.sh 17 | 18 | publish: build integration_test 19 | LOCAL_IMAGE_NAME=${LOCAL_IMAGE_NAME} bash scripts/publish.sh 20 | 21 | setup: 22 | pipenv install --dev 23 | pre-commit install -------------------------------------------------------------------------------- /06-best-practices/code/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | boto3 = "*" 8 | mlflow = "*" 9 | scikit-learn = "==1.0.2" 10 | 11 | [dev-packages] 12 | pytest = "*" 13 | deepdiff = "*" 14 | pylint = "==2.14.4" 15 | black = "*" 16 | isort = "*" 17 | pre-commit = "*" 18 | 19 | [requires] 20 | python_version = "3.9" 21 | -------------------------------------------------------------------------------- /06-best-practices/code/infrastructure/main.tf: -------------------------------------------------------------------------------- 1 | # Make sure to create state bucket beforehand 2 | terraform { 3 | required_version = ">= 1.0" 4 | backend "s3" { 5 | bucket = "tf-state-mlops-zoomcamp" 6 | key = "mlops-zoomcamp-stg.tfstate" 7 | region = "eu-west-1" 8 | encrypt = true 9 | } 10 | } 11 | 12 | provider "aws" { 13 | region = var.aws_region 14 | } 15 | 16 | data "aws_caller_identity" "current_identity" {} 17 | 18 | locals { 19 | account_id = data.aws_caller_identity.current_identity.account_id 20 | } 21 | 22 | # ride_events 23 | module "source_kinesis_stream" { 24 | source = "./modules/kinesis" 25 | retention_period = 48 26 | shard_count = 2 27 | stream_name = "${var.source_stream_name}-${var.project_id}" 28 | tags = var.project_id 29 | } 30 | 31 | # ride_predictions 32 | module "output_kinesis_stream" { 33 | source = "./modules/kinesis" 34 | retention_period = 48 35 | shard_count = 2 36 | stream_name = "${var.output_stream_name}-${var.project_id}" 37 | tags = var.project_id 38 | } 39 | 40 | # model bucket 41 | module "s3_bucket" { 42 | source = "./modules/s3" 43 | bucket_name = "${var.model_bucket}-${var.project_id}" 44 | } 45 | 46 | # image registry 47 | module "ecr_image" { 48 | source = "./modules/ecr" 49 | ecr_repo_name = "${var.ecr_repo_name}_${var.project_id}" 50 | account_id = local.account_id 51 | lambda_function_local_path = var.lambda_function_local_path 52 | docker_image_local_path = var.docker_image_local_path 53 | } 54 | 55 | module "lambda_function" { 56 | source = "./modules/lambda" 57 | image_uri = module.ecr_image.image_uri 58 | lambda_function_name = "${var.lambda_function_name}_${var.project_id}" 59 | model_bucket = module.s3_bucket.name 60 | output_stream_name = "${var.output_stream_name}-${var.project_id}" 61 | output_stream_arn = module.output_kinesis_stream.stream_arn 62 | source_stream_name = "${var.source_stream_name}-${var.project_id}" 63 | source_stream_arn = module.source_kinesis_stream.stream_arn 64 | } 65 | 66 | # For CI/CD 67 | output "lambda_function" { 68 | value = "${var.lambda_function_name}_${var.project_id}" 69 | } 70 | 71 | output "model_bucket" { 72 | value = module.s3_bucket.name 73 | } 74 | 75 | output "predictions_stream_name" { 76 | value = "${var.output_stream_name}-${var.project_id}" 77 | } 78 | 79 | output "ecr_repo" { 80 | value = "${var.ecr_repo_name}_${var.project_id}" 81 | } 82 | -------------------------------------------------------------------------------- /06-best-practices/code/infrastructure/modules/ecr/main.tf:
-------------------------------------------------------------------------------- 1 | resource "aws_ecr_repository" "repo" { 2 | name = var.ecr_repo_name 3 | image_tag_mutability = "MUTABLE" 4 | 5 | image_scanning_configuration { 6 | scan_on_push = false 7 | } 8 | 9 | force_delete = true 10 | } 11 | 12 | # In practice, the image build-and-push step is handled by the CI/CD pipeline, not the IaC script. 13 | # But because the Lambda config would fail without an existing image URI in ECR, 14 | # we can upload any base image here to bootstrap the Lambda config; it is unrelated to your inference logic. 15 | resource "null_resource" "ecr_image" { 16 | triggers = { 17 | python_file = md5(file(var.lambda_function_local_path)) 18 | docker_file = md5(file(var.docker_image_local_path)) 19 | } 20 | 21 | provisioner "local-exec" { 22 | command = <<EOF 23 | aws ecr get-login-password --region ${var.region} | docker login --username AWS --password-stdin ${var.account_id}.dkr.ecr.${var.region}.amazonaws.com 24 | cd ../ 25 | docker build -t ${var.account_id}.dkr.ecr.${var.region}.amazonaws.com/${var.ecr_repo_name}:${var.ecr_image_tag} . 26 | docker push ${var.account_id}.dkr.ecr.${var.region}.amazonaws.com/${var.ecr_repo_name}:${var.ecr_image_tag} 27 | EOF 28 | } 29 | } 30 | 31 | # Wait for the image to be uploaded before the lambda config runs 32 | data "aws_ecr_image" "lambda_image" { 33 | depends_on = [null_resource.ecr_image] 34 | repository_name = var.ecr_repo_name 35 | image_tag = var.ecr_image_tag 36 | } 37 | 38 | output "image_uri" { 39 | value = "${var.account_id}.dkr.ecr.${var.region}.amazonaws.com/${var.ecr_repo_name}:${data.aws_ecr_image.lambda_image.image_tag}" 40 | } 41 | -------------------------------------------------------------------------------- /cohorts/2022/03-orchestration/homework.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from sklearn.feature_extraction import DictVectorizer 4 | from sklearn.linear_model import LinearRegression 5 | from sklearn.metrics import mean_squared_error 6 | 7 | def read_data(path): 8 | df = pd.read_parquet(path) 9 | return df 10 | 11 | def prepare_features(df, categorical, train=True): 12 | df['duration'] = df.dropOff_datetime - df.pickup_datetime 13 | df['duration'] = df.duration.dt.total_seconds() / 60 14 | df = df[(df.duration >= 1) & (df.duration <= 60)].copy() 15 | 16 | mean_duration = df.duration.mean() 17 | if train: 18 | print(f"The mean duration of training is {mean_duration}") 19 | else: 20 | print(f"The mean duration of validation is {mean_duration}") 21 | 22 | df[categorical] = df[categorical].fillna(-1).astype('int').astype('str') 23 | return df 24 | 25 | def train_model(df, categorical): 26 | 27 | train_dicts = df[categorical].to_dict(orient='records') 28 | dv = DictVectorizer() 29 | X_train = dv.fit_transform(train_dicts) 30 | y_train = df.duration.values 31 | 32 | print(f"The shape of X_train is {X_train.shape}") 33 | print(f"The DictVectorizer has {len(dv.feature_names_)} features") 34 | 35 | lr = LinearRegression() 36 | lr.fit(X_train, y_train) 37 | y_pred = lr.predict(X_train) 38 | rmse = mean_squared_error(y_train, y_pred, squared=False) 39 | print(f"The RMSE of training is: {rmse}") 40 | return lr, dv 41 | 42 | def run_model(df, categorical, dv, lr): 43 | val_dicts = df[categorical].to_dict(orient='records') 44 | X_val = dv.transform(val_dicts) 45 | y_pred = lr.predict(X_val) 46 | y_val = df.duration.values 47 | 48 | rmse = mean_squared_error(y_val, y_pred, squared=False) 49 | print(f"The RMSE of validation is: {rmse}") 50 | return 51 | 52 | def main(train_path: str = './data/fhv_tripdata_2021-01.parquet', 53 | val_path: str = './data/fhv_tripdata_2021-02.parquet'): 54 | 55 | categorical = ['PUlocationID', 'DOlocationID'] 56 | 57 | df_train = read_data(train_path) 58 | df_train_processed = prepare_features(df_train, categorical) 59 | 60 | df_val = read_data(val_path) 61 | df_val_processed = prepare_features(df_val, categorical, False) 62 | 63 | # train the model 64 | lr, dv = train_model(df_train_processed, categorical) 65 | run_model(df_val_processed, categorical, dv, lr) 66 | 67 | main() 68 | -------------------------------------------------------------------------------- /cohorts/2022/03-orchestration/images/thumbnail-3-01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/03-orchestration/images/thumbnail-3-01.jpg -------------------------------------------------------------------------------- /cohorts/2022/03-orchestration/images/thumbnail-3-02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/03-orchestration/images/thumbnail-3-02.jpg -------------------------------------------------------------------------------- 
/cohorts/2022/03-orchestration/images/thumbnail-3-03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/03-orchestration/images/thumbnail-3-03.jpg -------------------------------------------------------------------------------- /cohorts/2022/03-orchestration/images/thumbnail-3-04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/03-orchestration/images/thumbnail-3-04.jpg -------------------------------------------------------------------------------- /cohorts/2022/03-orchestration/images/thumbnail-3-05.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/03-orchestration/images/thumbnail-3-05.jpg -------------------------------------------------------------------------------- /cohorts/2022/03-orchestration/images/thumbnail-3-06.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/03-orchestration/images/thumbnail-3-06.jpg -------------------------------------------------------------------------------- /cohorts/2022/04-deployment/homework/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.7-slim 2 | 3 | WORKDIR /app 4 | COPY [ "model2.bin", "model.bin" ] 5 | -------------------------------------------------------------------------------- /cohorts/2022/04-deployment/homework/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.0.2" 8 | pandas = "*" 9 | pyarrow = "*" 10 | s3fs = "*" 11 | 12 | [dev-packages] 13 | 14 | [requires] 15 | python_version = "3.9" 16 | -------------------------------------------------------------------------------- /cohorts/2022/04-deployment/homework/batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import sys 5 | import pickle 6 | import pandas as pd 7 | 8 | 9 | year = int(sys.argv[1]) # 2021 10 | month = int(sys.argv[2]) #2 11 | 12 | input_file = f's3://nyc-tlc/trip data/fhv_tripdata_{year:04d}-{month:02d}.parquet' 13 | output_file = f's3://nyc-duration-prediction-alexey/taxi_type=fhv/year={year:04d}/month={month:02d}/predictions.parquet' 14 | 15 | 16 | with open('model.bin', 'rb') as f_in: 17 | dv, lr = pickle.load(f_in) 18 | 19 | 20 | categorical = ['PUlocationID', 'DOlocationID'] 21 | 22 | def read_data(filename): 23 | df = pd.read_parquet(filename) 24 | 25 | df['duration'] = df.dropOff_datetime - df.pickup_datetime 26 | df['duration'] = df.duration.dt.total_seconds() / 60 27 | 28 | df = df[(df.duration >= 1) & (df.duration <= 60)].copy() 29 | 30 | df[categorical] = df[categorical].fillna(-1).astype('int').astype('str') 31 | 32 | return df 33 | 34 | 35 | df = read_data(input_file) 36 | df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str') 37 | 38 | 39 | dicts = df[categorical].to_dict(orient='records') 40 | X_val = 
dv.transform(dicts) 41 | y_pred = lr.predict(X_val) 42 | 43 | 44 | print('predicted mean duration:', y_pred.mean()) 45 | 46 | 47 | df_result = pd.DataFrame() 48 | df_result['ride_id'] = df['ride_id'] 49 | df_result['predicted_duration'] = y_pred 50 | 51 | 52 | df_result.to_parquet( 53 | output_file, 54 | engine='pyarrow', 55 | compression=None, 56 | index=False 57 | ) -------------------------------------------------------------------------------- /cohorts/2022/04-deployment/homework/homework.dockerfile: -------------------------------------------------------------------------------- 1 | FROM agrigorev/zoomcamp-model:mlops-3.9.7-slim 2 | 3 | RUN pip install -U pip 4 | RUN pip install pipenv 5 | 6 | WORKDIR /app 7 | 8 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 9 | 10 | RUN pipenv install --system --deploy 11 | 12 | COPY [ "batch.py", "batch.py" ] 13 | 14 | ENTRYPOINT [ "python", "batch.py" ] -------------------------------------------------------------------------------- /cohorts/2022/04-deployment/homework/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/04-deployment/homework/model.bin -------------------------------------------------------------------------------- /cohorts/2022/05-monitoring/homework/docker-compose-homework-solution.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | volumes: 4 | mongo_data: {} 5 | 6 | networks: 7 | front-tier: 8 | back-tier: 9 | 10 | services: 11 | prediction_service: 12 | build: 13 | context: prediction_service 14 | dockerfile: Dockerfile 15 | depends_on: 16 | - mongo 17 | environment: 18 | MONGO_DATABASE: "prediction_service" 19 | MONGO_ADDRESS: "mongodb://mongo.:27017/" 20 | MODEL_VERSION: "2" 21 | MODEL_FILE: "lin_reg_V2.bin" 22 | 23 | ports: 24 | - 9696:9696 25 | networks: 26 | - back-tier 27 | - front-tier 28 | 29 | mongo: 30 | image: mongo 31 | ports: 32 | - 27017:27017 33 | volumes: 34 | - mongo_data:/data/db 35 | networks: 36 | - back-tier 37 | - front-tier 38 | -------------------------------------------------------------------------------- /cohorts/2022/05-monitoring/homework/docker-compose-homework.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | volumes: 4 | mongo_data: {} 5 | 6 | networks: 7 | front-tier: 8 | back-tier: 9 | 10 | services: 11 | prediction_service: 12 | build: 13 | context: prediction_service 14 | dockerfile: Dockerfile 15 | depends_on: 16 | - mongo 17 | environment: 18 | MONGO_DATABASE: "prediction_service" 19 | MONGO_ADDRESS: "mongodb://mongo.:27017/" 20 | MODEL_VERSION: "1" 21 | MODEL_FILE: "lin_reg.bin" 22 | 23 | ports: 24 | - 9696:9696 25 | networks: 26 | - back-tier 27 | - front-tier 28 | 29 | mongo: 30 | image: mongo 31 | ports: 32 | - 27017:27017 33 | volumes: 34 | - mongo_data:/data/db 35 | networks: 36 | - back-tier 37 | - front-tier 38 | -------------------------------------------------------------------------------- /cohorts/2022/05-monitoring/homework/model_training.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import pandas as pd 4 | import pyarrow.parquet as pq 5 | from sklearn.feature_extraction import DictVectorizer 6 | from sklearn.linear_model import LinearRegression 7 | 8 | 9 | def read_dataframe(filename): 10 | df = pq.read_table(filename).to_pandas() 11 | 12 | 
df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime) 13 | df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime) 14 | 15 | df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime 16 | df.duration = df.duration.apply(lambda td: td.total_seconds() / 60) 17 | 18 | df = df[(df.duration >= 1) & (df.duration <= 60)] 19 | 20 | categorical = ['PULocationID', 'DOLocationID'] 21 | df[categorical] = df[categorical].astype(str) 22 | 23 | return df 24 | 25 | def add_features(train_data="./datasets/green_tripdata_2021-03.parquet", 26 | additional_training_data=None): 27 | df_train = read_dataframe(train_data) 28 | 29 | if additional_training_data: 30 | extra_data = read_dataframe(additional_training_data) 31 | df_train = pd.concat([df_train, extra_data], axis=0, ignore_index=True) 32 | 33 | 34 | 35 | df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID'] 36 | 37 | categorical = ['PU_DO'] 38 | numerical = ['trip_distance'] 39 | 40 | dv = DictVectorizer() 41 | 42 | train_dicts = df_train[categorical + numerical].to_dict(orient='records') 43 | X_train = dv.fit_transform(train_dicts) 44 | 45 | target = 'duration' 46 | y_train = df_train[target].values 47 | 48 | return X_train, y_train, dv 49 | 50 | 51 | 52 | 53 | if __name__ == "__main__": 54 | X_train, y_train, dv = add_features() 55 | 56 | print("Training model with one month of data") 57 | lr = LinearRegression() 58 | lr.fit(X_train, y_train) 59 | 60 | 61 | with open('prediction_service/lin_reg.bin', 'wb') as f_out: 62 | pickle.dump((dv, lr), f_out) 63 | 64 | X_train, y_train, dv = add_features(additional_training_data="./datasets/green_tripdata_2021-04.parquet") 65 | print("Training model with two months of data") 66 | lr = LinearRegression() 67 | lr.fit(X_train, y_train) 68 | 69 | with open('prediction_service/lin_reg_V2.bin', 'wb') as f_out: 70 | pickle.dump((dv, lr), f_out) 71 | -------------------------------------------------------------------------------- /cohorts/2022/05-monitoring/homework/prediction_service/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-slim-buster 2 | 3 | RUN pip install -U pip 4 | RUN pip install pipenv 5 | 6 | WORKDIR /app 7 | 8 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 9 | 10 | RUN pipenv install --system --deploy 11 | 12 | COPY [ "app.py", "lin_reg.bin", "lin_reg_V2.bin", "./" ] 13 | 14 | EXPOSE 9696 15 | 16 | ENTRYPOINT ["gunicorn", "--bind=0.0.0.0:9696", "app:app" ] 17 | -------------------------------------------------------------------------------- /cohorts/2022/05-monitoring/homework/prediction_service/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.0.2" 8 | flask = "==2.0.1" 9 | pandas = "==1.1.5" 10 | evidently = "*" 11 | pymongo = "*" 12 | gunicorn = "*" 13 | 14 | [dev-packages] 15 | pyarrow = "*" 16 | 17 | [requires] 18 | python_version = "3.8" 19 | -------------------------------------------------------------------------------- /cohorts/2022/05-monitoring/homework/prediction_service/app.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pickle 4 | import uuid 5 | 6 | from flask import Flask, jsonify, request 7 | from pymongo import MongoClient 8 | 9 | 10 | MONGO_ADDRESS = os.getenv("MONGO_ADDRESS", "mongodb://localhost:27017/") 11 | 
MONGO_DATABASE = os.getenv("MONGO_DATABASE", "ride_prediction") 12 | LOGGED_MODEL = os.getenv("MODEL_FILE", "lin_reg.bin") 13 | MODEL_VERSION = os.getenv("MODEL_VERSION", "1") 14 | 15 | with open(LOGGED_MODEL, 'rb') as f_in: 16 | dv, model = pickle.load(f_in) 17 | 18 | 19 | mongo_client = MongoClient(MONGO_ADDRESS) 20 | mongo_db = mongo_client[MONGO_DATABASE] 21 | mongo_collection = mongo_db.get_collection("data") 22 | 23 | 24 | app = Flask("Ride-Prediction-Service") 25 | logging.basicConfig(level=logging.INFO) 26 | 27 | 28 | def prepare_features(ride): 29 | """Function to prepare features before making prediction""" 30 | 31 | record = ride.copy() 32 | record['PU_DO'] = '%s_%s' % (record['PULocationID'], record['DOLocationID']) 33 | 34 | features = dv.transform([record]) 35 | 36 | return features, record 37 | 38 | 39 | def save_db(record, pred_result): 40 | """Save data to mongo db collection""" 41 | 42 | rec = record.copy() 43 | rec["prediction"] = pred_result[0] 44 | mongo_collection.insert_one(rec) 45 | 46 | 47 | 48 | @app.route("/", methods=["GET"]) 49 | def get_info(): 50 | """Function to provide info about the app""" 51 | info = """
<h1>Ride Prediction Service</h1> 52 | <br> 53 | <h3>Data Request Example</h3> 54 | <br> 55 | "ride = { 56 | "PULocationID": 10, 57 | "DOLocationID": 50, 58 | "trip_distance": 40 59 | }" 60 | <br> 61 | <br> 62 | 
""" 63 | return info 64 | 65 | @app.route("/predict-duration", methods=["POST"]) 66 | def predict_duration(): 67 | """Function to predict duration""" 68 | 69 | ride = request.get_json() 70 | features, record = prepare_features(ride) 71 | 72 | prediction = model.predict(features) 73 | ride_id = str(uuid.uuid4()) 74 | pred_data = { 75 | "ride_id": ride_id, 76 | "PU_DO": record["PU_DO"], 77 | "trip_distance": record["trip_distance"], 78 | "status": 200, 79 | "duration": prediction[0], 80 | "model_version": MODEL_VERSION 81 | } 82 | 83 | save_db(record, prediction) 84 | 85 | result = { 86 | "statusCode": 200, 87 | "data" : pred_data 88 | } 89 | 90 | return jsonify(result) 91 | 92 | 93 | if __name__ == "__main__": 94 | app.run(debug=True, host="0.0.0.0", port=9696) 95 | -------------------------------------------------------------------------------- /cohorts/2022/05-monitoring/homework/prediction_service/lin_reg.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/05-monitoring/homework/prediction_service/lin_reg.bin -------------------------------------------------------------------------------- /cohorts/2022/05-monitoring/homework/prediction_service/lin_reg_V2.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/05-monitoring/homework/prediction_service/lin_reg_V2.bin -------------------------------------------------------------------------------- /cohorts/2022/05-monitoring/homework/prefect-monitoring/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.0.2" 8 | pyarrow = "*" 9 | prefect = "==2.0b8" 10 | pandas = "*" 11 | pymongo = "*" 12 | psutil = "==5.9.1" 13 | evidently = "*" 14 | 15 | [dev-packages] 16 | 17 | [requires] 18 | python_version = "3.8" 19 | -------------------------------------------------------------------------------- /cohorts/2022/05-monitoring/homework/prefect-monitoring/clean_mongo.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | MONGO_CLIENT_ADDRESS = "mongodb://localhost:27017/" 4 | MONGO_DATABASE = "prediction_service" 5 | 6 | 7 | if __name__ == "__main__": 8 | client = MongoClient(MONGO_CLIENT_ADDRESS) 9 | client.drop_database(MONGO_DATABASE) 10 | -------------------------------------------------------------------------------- /cohorts/2022/05-monitoring/homework/prefect-monitoring/monitor_profile.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 48, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pymongo import MongoClient\n", 10 | "import pprint" 11 | ] 12 | } 13 | ], 14 | "metadata": { 15 | "kernelspec": { 16 | "display_name": "Python 3.9.12 ('prediction_service_practice-b8Zbdkaa')", 17 | "language": "python", 18 | "name": "python3" 19 | }, 20 | "language_info": { 21 | "codemirror_mode": { 22 | "name": "ipython", 23 | "version": 3 24 | }, 25 | "file_extension": ".py", 26 | "mimetype": "text/x-python", 27 | "name": "python", 28 | "nbconvert_exporter": "python", 29 | "pygments_lexer": "ipython3", 
30 | "version": "3.9.12" 31 | }, 32 | "orig_nbformat": 4, 33 | "vscode": { 34 | "interpreter": { 35 | "hash": "63df8a96dcc14a3f8fc6f13bb4daf95ac616547a440980d0dc62a5d5ed05a07e" 36 | } 37 | } 38 | }, 39 | "nbformat": 4, 40 | "nbformat_minor": 2 41 | } 42 | -------------------------------------------------------------------------------- /cohorts/2022/05-monitoring/homework/prefect-monitoring/monitor_profile_solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pymongo import MongoClient\n", 10 | "import pprint" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "MONGO_CLIENT_ADDRESS = \"mongodb://localhost:27017/\"\n", 20 | "MONGO_DATABASE = \"prediction_service\"\n", 21 | "REPORT_COLLECTION = \"report\"" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "client = MongoClient()\n", 31 | "collection = client.get_database(MONGO_DATABASE).get_collection(REPORT_COLLECTION)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 4, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "9\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "for col in collection.find():\n", 49 | " pprint.pprint(len(col['data_drift']['data']['metrics'].keys()))" 50 | ] 51 | } 52 | ], 53 | "metadata": { 54 | "kernelspec": { 55 | "display_name": "Python 3.8.2 ('prefect-monitoring-vrjQsnUO')", 56 | "language": "python", 57 | "name": "python3" 58 | }, 59 | "language_info": { 60 | "codemirror_mode": { 61 | "name": "ipython", 62 | "version": 3 63 | }, 64 | "file_extension": ".py", 65 | "mimetype": "text/x-python", 66 | "name": "python", 67 | "nbconvert_exporter": "python", 68 | "pygments_lexer": "ipython3", 69 | "version": "3.8.2" 70 | }, 71 | "orig_nbformat": 4, 72 | "vscode": { 73 | "interpreter": { 74 | "hash": "8c4128a542e647ac345fb470a121f5ad37749126bd51dd0e4b0f94b08087470c" 75 | } 76 | } 77 | }, 78 | "nbformat": 4, 79 | "nbformat_minor": 2 80 | } 81 | -------------------------------------------------------------------------------- /cohorts/2022/05-monitoring/homework/prefect-monitoring/prepare_reference_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pyarrow.parquet as pq 3 | 4 | 5 | data_files = ["../datasets/green_tripdata_2021-03.parquet", "../datasets/green_tripdata_2021-04.parquet"] 6 | output_file = "green_tripdata_2021-03to04.parquet" 7 | 8 | df = pd.DataFrame() 9 | for file in data_files: 10 | data = pq.read_table(file).to_pandas() 11 | df = pd.concat([data, df], ignore_index=True) 12 | 13 | df.to_parquet( 14 | output_file, 15 | engine='pyarrow', 16 | compression=None, 17 | index=False 18 | ) 19 | -------------------------------------------------------------------------------- /cohorts/2022/05-monitoring/homework/prefect-monitoring/send_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import uuid 3 | from datetime import datetime 4 | 5 | import pyarrow.parquet as pq 6 | import requests 7 | 8 | table = pq.read_table("../datasets/green_tripdata_2021-05.parquet")\ 9 | .to_pandas()\ 10 | .sample(n=5000, random_state=42) #5000 rows sampled 11 | 
data = table.copy() 12 | 13 | 14 | class DateTimeEncoder(json.JSONEncoder): 15 | def default(self, o): 16 | if isinstance(o, datetime): 17 | return o.isoformat() 18 | return json.JSONEncoder.default(self, o) 19 | 20 | 21 | with open("target.csv", 'w') as f_target: 22 | for index, row in data.iterrows(): 23 | row['id'] = str(uuid.uuid4()) 24 | duration = (row['lpep_dropoff_datetime'] - row['lpep_pickup_datetime']).total_seconds() / 60 25 | if duration >= 1 and duration <= 60: 26 | f_target.write(f"{row['id']},{duration}\n") 27 | resp = requests.post("http://127.0.0.1:9696/predict-duration", 28 | headers={"Content-Type": "application/json"}, 29 | data=row.to_json()).json() 30 | print(f"prediction: {resp['data']['duration']}") 31 | -------------------------------------------------------------------------------- /cohorts/2022/05-monitoring/homework/prepare.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import requests 3 | 4 | files = ["green_tripdata_2021-03.parquet", "green_tripdata_2021-04.parquet", "green_tripdata_2021-05.parquet"] 5 | path = "./datasets" 6 | print("Download files:") 7 | for file in files: 8 | 9 | # Change the URL based on what works for you, whether S3 or CloudFront 10 | url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}" 11 | resp = requests.get(url, stream=True) 12 | save_path = f"{path}/{file}" 13 | with open(save_path, "wb") as handle: 14 | for data in tqdm(resp.iter_content(), 15 | desc=f"{file}", 16 | postfix=f"save to {save_path}", 17 | total=int(resp.headers["Content-Length"])): 18 | handle.write(data) 19 | -------------------------------------------------------------------------------- /cohorts/2022/05-monitoring/homework/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==1.0.2 2 | dataclasses==0.6 3 | Flask~=2.0.1 4 | pandas>=1.1.5 5 | Werkzeug~=2.0.1 6 | requests~=2.26.0 7 | prometheus_client~=0.11.0 8 | pyyaml~=5.4.1 9 | tqdm 10 | pyarrow 11 | prefect==2.0b8 12 | pymongo 13 | evidently 14 | pipenv 15 | -------------------------------------------------------------------------------- /cohorts/2022/05-monitoring/homework/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pprint 3 | from pymongo import MongoClient 4 | 5 | import requests 6 | 7 | MONGODB_ADDRESS = os.getenv("MONGODB_ADDRESS", "mongodb://127.0.0.1:27017/") 8 | FLASK_URL = "http://127.0.0.1:9696/predict-duration" 9 | 10 | 11 | mongo_client = MongoClient(MONGODB_ADDRESS) 12 | mongo_db = mongo_client['prediction_service'] 13 | mongo_collection = mongo_db['data'] 14 | ride_test_data = { 15 | "PULocationID": 10, 16 | "DOLocationID": 50, 17 | "trip_distance": 40 18 | } 19 | 20 | 21 | if __name__ == "__main__": 22 | requests.post(url=FLASK_URL, json=ride_test_data) 23 | for coll in mongo_collection.find(): 24 | pprint.pprint(coll) 25 | -------------------------------------------------------------------------------- /cohorts/2022/06-best-practices/homework/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.7-slim 2 | 3 | RUN pip install -U pip 4 | RUN pip install pipenv 5 | 6 | WORKDIR /app 7 | 8 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 9 | 10 | RUN pipenv install --system --deploy 11 | 12 | COPY [ "batch.py", "batch.py" ] 13 | COPY [ "model.bin", "model.bin" ] 14 | 15 | ENTRYPOINT [ "python", "batch.py" ]
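# Build-and-run sketch (editorial comment; the image name is illustrative, not part of the original homework):
#   docker build -t duration-batch-hw .
#   docker run duration-batch-hw 2021 3   # the two args reach batch.py as sys.argv[1] (year) and sys.argv[2] (month)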
-------------------------------------------------------------------------------- /cohorts/2022/06-best-practices/homework/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.0.2" 8 | pandas = "*" 9 | pyarrow = "*" 10 | s3fs = "*" 11 | 12 | [dev-packages] 13 | 14 | [requires] 15 | python_version = "3.9" 16 | -------------------------------------------------------------------------------- /cohorts/2022/06-best-practices/homework/batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import sys 5 | import pickle 6 | import pandas as pd 7 | 8 | 9 | year = int(sys.argv[1]) 10 | month = int(sys.argv[2]) 11 | 12 | input_file = f'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/nyc-tlc/fhv/fhv_tripdata_{year:04d}-{month:02d}.parquet' 13 | output_file = f's3://nyc-duration-prediction-alexey/taxi_type=fhv/year={year:04d}/month={month:02d}/predictions.parquet' 14 | 15 | 16 | with open('model.bin', 'rb') as f_in: 17 | dv, lr = pickle.load(f_in) 18 | 19 | 20 | categorical = ['PUlocationID', 'DOlocationID'] 21 | 22 | def read_data(filename): 23 | df = pd.read_parquet(filename) 24 | 25 | df['duration'] = df.dropOff_datetime - df.pickup_datetime 26 | df['duration'] = df.duration.dt.total_seconds() / 60 27 | 28 | df = df[(df.duration >= 1) & (df.duration <= 60)].copy() 29 | 30 | df[categorical] = df[categorical].fillna(-1).astype('int').astype('str') 31 | 32 | return df 33 | 34 | 35 | df = read_data(input_file) 36 | df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str') 37 | 38 | 39 | dicts = df[categorical].to_dict(orient='records') 40 | X_val = dv.transform(dicts) 41 | y_pred = lr.predict(X_val) 42 | 43 | 44 | print('predicted mean duration:', y_pred.mean()) 45 | 46 | 47 | df_result = pd.DataFrame() 48 | df_result['ride_id'] = df['ride_id'] 49 | df_result['predicted_duration'] = y_pred 50 | 51 | df_result.to_parquet(output_file, engine='pyarrow', index=False) -------------------------------------------------------------------------------- /cohorts/2022/06-best-practices/homework/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/06-best-practices/homework/model.bin -------------------------------------------------------------------------------- /cohorts/2022/06-best-practices/homework_solution/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.7-slim 2 | 3 | RUN pip install -U pip 4 | RUN pip install pipenv 5 | 6 | WORKDIR /app 7 | 8 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 9 | 10 | RUN pipenv install --system --deploy 11 | 12 | COPY [ "batch.py", "batch.py" ] 13 | COPY [ "model.bin", "model.bin" ] 14 | 15 | ENTRYPOINT [ "python", "batch.py" ] -------------------------------------------------------------------------------- /cohorts/2022/06-best-practices/homework_solution/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.0.2" 8 | pandas = "*" 9 | pyarrow = "*" 10 | s3fs = "*" 11 | 12 | [dev-packages] 13 | pytest = "*" 14 | 15 | 
[requires] 16 | python_version = "3.9" 17 | -------------------------------------------------------------------------------- /cohorts/2022/06-best-practices/homework_solution/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | s3: 3 | image: localstack/localstack 4 | ports: 5 | - "4566:4566" 6 | environment: 7 | - SERVICES=s3 -------------------------------------------------------------------------------- /cohorts/2022/06-best-practices/homework_solution/integration_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datetime import datetime 4 | import pandas as pd 5 | 6 | import batch 7 | 8 | def dt(hour, minute, second=0): 9 | return datetime(2021, 1, 1, hour, minute, second) 10 | 11 | 12 | S3_ENDPOINT_URL = os.getenv('S3_ENDPOINT_URL') 13 | 14 | options = { 15 | 'client_kwargs': { 16 | 'endpoint_url': S3_ENDPOINT_URL 17 | } 18 | } 19 | 20 | data = [ 21 | (None, None, dt(1, 2), dt(1, 10)), 22 | (1, 1, dt(1, 2), dt(1, 10)), 23 | (1, 1, dt(1, 2, 0), dt(1, 2, 50)), 24 | (1, 1, dt(1, 2, 0), dt(2, 2, 1)), 25 | ] 26 | 27 | columns = ['PUlocationID', 'DOlocationID', 'pickup_datetime', 'dropOff_datetime'] 28 | df_input = pd.DataFrame(data, columns=columns) 29 | 30 | 31 | input_file = batch.get_input_path(2021, 1) 32 | output_file = batch.get_output_path(2021, 1) 33 | 34 | df_input.to_parquet( 35 | input_file, 36 | engine='pyarrow', 37 | compression=None, 38 | index=False, 39 | storage_options=options 40 | ) 41 | 42 | 43 | os.system('python batch.py 2021 1') 44 | 45 | 46 | df_actual = pd.read_parquet(output_file, storage_options=options) 47 | 48 | 49 | assert abs(df_actual['predicted_duration'].sum() - 69.28) < 0.1 -------------------------------------------------------------------------------- /cohorts/2022/06-best-practices/homework_solution/integration_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | 5 | docker-compose up -d 6 | 7 | sleep 5 8 | 9 | export INPUT_FILE_PATTERN="s3://nyc-duration/in/{year:04d}-{month:02d}.parquet" 10 | export OUTPUT_FILE_PATTERN="s3://nyc-duration/out/{year:04d}-{month:02d}.parquet" 11 | export S3_ENDPOINT_URL="http://localhost:4566" 12 | 13 | 14 | aws --endpoint-url="${S3_ENDPOINT_URL}" s3 mb s3://nyc-duration 15 | 16 | pipenv run python integration_test.py 17 | 18 | ERROR_CODE=$? 19 | 20 | if [ ${ERROR_CODE} != 0 ]; then 21 | docker-compose logs 22 | docker-compose down 23 | exit ${ERROR_CODE} 24 | fi 25 | 26 | echo "yay tests work!" 
27 | 28 | docker-compose down -------------------------------------------------------------------------------- /cohorts/2022/06-best-practices/homework_solution/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/06-best-practices/homework_solution/model.bin -------------------------------------------------------------------------------- /cohorts/2022/06-best-practices/homework_solution/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2022/06-best-practices/homework_solution/tests/__init__.py -------------------------------------------------------------------------------- /cohorts/2022/06-best-practices/homework_solution/tests/test_batch.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import pandas as pd 4 | 5 | import batch 6 | 7 | 8 | def dt(hour, minute, second=0): 9 | return datetime(2021, 1, 1, hour, minute, second) 10 | 11 | 12 | def test_prepare_data(): 13 | data = [ 14 | (None, None, dt(1, 2), dt(1, 10)), 15 | (1, 1, dt(1, 2), dt(1, 10)), 16 | (1, 1, dt(1, 2, 0), dt(1, 2, 50)), 17 | (1, 1, dt(1, 2, 0), dt(2, 2, 1)), 18 | ] 19 | 20 | categorical = ['PUlocationID', 'DOlocationID'] 21 | columns = ['PUlocationID', 'DOlocationID', 'pickup_datetime', 'dropOff_datetime'] 22 | df = pd.DataFrame(data, columns=columns) 23 | 24 | df_actual = batch.prepare_data(df, categorical) 25 | 26 | data_expected = [ 27 | ('-1', '-1', 8.0), 28 | ( '1', '1', 8.0), 29 | ] 30 | 31 | columns_test = ['PUlocationID', 'DOlocationID', 'duration'] 32 | df_expected = pd.DataFrame(data_expected, columns=columns_test) 33 | print(df_actual) 34 | 35 | assert (df_actual['PUlocationID'] == df_expected['PUlocationID']).all() 36 | assert (df_actual['DOlocationID'] == df_expected['DOlocationID']).all() 37 | assert (df_actual['duration'] - df_expected['duration']).abs().sum() < 0.0000001 38 | 39 | 40 | -------------------------------------------------------------------------------- /cohorts/2022/07-project/README.md: -------------------------------------------------------------------------------- 1 | ## Course Project 2 | 3 | The goal of this project is to apply everything we learned 4 | in this course and build an end-to-end machine learning project. 5 | 6 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, your project can't be considered complete. 
7 | 8 | 9 | ### Submitting 10 | 11 | 12 | #### Project Cohort #2 13 | 14 | Project: 15 | 16 | * Form: https://forms.gle/aj8LHkY7PrWG9XzW6 17 | * Deadline: 12 September, 23:00 CEST 18 | 19 | Peer reviewing: 20 | 21 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vQYTps829bmaN-aaJPiBUc3UwtN3e_llI44DKv-rQDsmVRMS1No7XWQqOyNI4ZbFbIvN351Q-G6edCP/pubhtml) ("project 2" tab) 22 | * Form: https://forms.gle/BeQ2HCohrM3puKf26 23 | * Deadline: 19 September, 23:00 CEST 24 | 25 | Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vRB5xKkhCyAUVNSNJvxaP94RwgNbYhf3dNf_ctRHhNKvvQQB94YVBn9JRdCTdQb5NGCJdYBtjXP7tP9/pubhtml) ("feedback-02" tab) 26 | 27 | 28 | #### Project Cohort #1 29 | 30 | Project: 31 | 32 | * Form: https://forms.gle/7UmQkK4BBxqdgMDp9 33 | * Deadline: 22 August, 23:00 CEST 34 | 35 | Peer reviewing: 36 | 37 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vQYTps829bmaN-aaJPiBUc3UwtN3e_llI44DKv-rQDsmVRMS1No7XWQqOyNI4ZbFbIvN351Q-G6edCP/pubhtml) ("project 1" tab) 38 | * Form: https://forms.gle/KaBMoYhmfeEFmiWb7 39 | * Deadline: 29 August, 23:00 CEST 40 | 41 | Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vRB5xKkhCyAUVNSNJvxaP94RwgNbYhf3dNf_ctRHhNKvvQQB94YVBn9JRdCTdQb5NGCJdYBtjXP7tP9/pubhtml) ("feedback-02" tab) 42 | 43 | 44 | ### Evaluation criteria 45 | 46 | See [here](../../../07-project/README.md) 47 | 48 | 49 | ### Misc 50 | 51 | To get the hash for your project, use this function to hash your email: 52 | 53 | ```python 54 | from hashlib import sha1 55 | 56 | def compute_hash(email): 57 | return sha1(email.lower().encode('utf-8')).hexdigest() 58 | ``` 59 | 60 | Or use [this website](http://www.sha1-online.com/). -------------------------------------------------------------------------------- /cohorts/2023/01-intro/homework.md: -------------------------------------------------------------------------------- 1 | ## Homework 2 | 3 | The goal of this homework is to train a simple model for predicting the duration of a ride - similar to what we did in this module. 4 | 5 | 6 | ## Q1. Downloading the data 7 | 8 | We'll use [the same NYC taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page), 9 | but instead of "**Green** Taxi Trip Records", we'll use "**Yellow** Taxi Trip Records". 10 | 11 | Download the data for January and February 2022. 12 | 13 | Read the data for January. How many columns are there? 14 | 15 | * 16 16 | * 17 17 | * 18 18 | * 19 19 | 20 | 21 | ## Q2. Computing duration 22 | 23 | Now let's compute the `duration` variable. It should contain the duration of a ride in minutes. 24 | 25 | What's the standard deviation of the trip durations in January? 26 | 27 | * 41.45 28 | * 46.45 29 | * 51.45 30 | * 56.45 31 | 32 | 33 | ## Q3. Dropping outliers 34 | 35 | Next, we need to check the distribution of the `duration` variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive). 36 | 37 | What fraction of the records is left after you drop the outliers? 38 | 39 | * 90% 40 | * 92% 41 | * 95% 42 | * 98% 43 | 44 | 45 | ## Q4. One-hot encoding 46 | 47 | Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model. 
48 | 49 | * Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will 50 | label encode them) 51 | * Fit a dictionary vectorizer 52 | * Get a feature matrix from it 53 | 54 | What's the dimensionality of this matrix (number of columns)? 55 | 56 | * 2 57 | * 155 58 | * 345 59 | * 515 60 | * 715 61 | 62 | 63 | ## Q5. Training a model 64 | 65 | Now let's use the feature matrix from the previous step to train a model. 66 | 67 | * Train a plain linear regression model with default parameters 68 | * Calculate the RMSE of the model on the training data 69 | 70 | What's the RMSE on train? 71 | 72 | * 6.99 73 | * 11.99 74 | * 16.99 75 | * 21.99 76 | 77 | 78 | ## Q6. Evaluating the model 79 | 80 | Now let's apply this model to the validation dataset (February 2022). 81 | 82 | What's the RMSE on validation? 83 | 84 | * 7.79 85 | * 12.79 86 | * 17.79 87 | * 22.79 88 | 89 | ## Submit the results 90 | 91 | * Submit your results here: https://forms.gle/uYTnWrcsubi2gdGV7 92 | * You can submit your solution multiple times. In this case, only the last submission will be used 93 | * If your answer doesn't match options exactly, select the closest one 94 | 95 | 96 | ## Deadline 97 | 98 | The deadline for submitting is 23 May 2023 (Tuesday), 23:00 CEST (Berlin time). 99 | 100 | After that, the form will be closed. -------------------------------------------------------------------------------- /cohorts/2023/02-experiment-tracking/homework-wandb/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import click 4 | 5 | import wandb 6 | 7 | from sklearn.ensemble import RandomForestRegressor 8 | from sklearn.metrics import mean_squared_error 9 | 10 | 11 | def load_pickle(filename: str): 12 | with open(filename, "rb") as f_in: 13 | return pickle.load(f_in) 14 | 15 | 16 | @click.command() 17 | @click.option("--wandb_project", help="Name of Weights & Biases project") 18 | @click.option("--wandb_entity", help="Name of Weights & Biases entity") 19 | @click.option( 20 | "--data_artifact", 21 | help="Address of the Weights & Biases artifact holding the preprocessed data", 22 | ) 23 | @click.option("--random_state", default=0, help="Random state") 24 | @click.option("--max_depth", default=10, help="Max tree depth") 25 | def run_train( 26 | wandb_project: str, 27 | wandb_entity: str, 28 | data_artifact: str, 29 | max_depth: int, 30 | random_state: int, 31 | ): 32 | # Initialize a Weights & Biases run 33 | wandb.init( 34 | project=wandb_project, 35 | entity=wandb_entity, 36 | job_type="train", 37 | config={"max_depth": max_depth, "random_state": random_state}, 38 | ) 39 | 40 | # Fetch the preprocessed dataset from artifacts 41 | artifact = wandb.use_artifact(data_artifact, type="preprocessed_dataset") 42 | data_path = artifact.download() 43 | 44 | X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl")) 45 | X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl")) 46 | 47 | # Define the Random Forest Regressor model, train it and generate predictions 48 | rf = RandomForestRegressor(max_depth=max_depth, random_state=random_state) 49 | rf.fit(X_train, y_train) 50 | y_pred = rf.predict(X_val) 51 | 52 | mse = mean_squared_error(y_val, y_pred, squared=False) 53 | # TODO: Log `mse` to Weights & Biases under the key `"MSE"` 54 | 55 | with open("regressor.pkl", "wb") as f: 56 | pickle.dump(rf, f) 57 | 58 | # TODO: Log `regressor.pkl` as an artifact of type `model` 59 | 60 | 61 | if __name__ == 
"__main__": 62 | run_train() 63 | -------------------------------------------------------------------------------- /cohorts/2023/02-experiment-tracking/homework/hpo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import click 4 | import mlflow 5 | import optuna 6 | 7 | from optuna.samplers import TPESampler 8 | from sklearn.ensemble import RandomForestRegressor 9 | from sklearn.metrics import mean_squared_error 10 | 11 | mlflow.set_tracking_uri("http://127.0.0.1:5000") 12 | mlflow.set_experiment("random-forest-hyperopt") 13 | 14 | 15 | def load_pickle(filename): 16 | with open(filename, "rb") as f_in: 17 | return pickle.load(f_in) 18 | 19 | 20 | @click.command() 21 | @click.option( 22 | "--data_path", 23 | default="./output", 24 | help="Location where the processed NYC taxi trip data was saved" 25 | ) 26 | @click.option( 27 | "--num_trials", 28 | default=10, 29 | help="The number of parameter evaluations for the optimizer to explore" 30 | ) 31 | def run_optimization(data_path: str, num_trials: int): 32 | 33 | X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl")) 34 | X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl")) 35 | 36 | def objective(trial): 37 | params = { 38 | 'n_estimators': trial.suggest_int('n_estimators', 10, 50, 1), 39 | 'max_depth': trial.suggest_int('max_depth', 1, 20, 1), 40 | 'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, 1), 41 | 'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, 1), 42 | 'random_state': 42, 43 | 'n_jobs': -1 44 | } 45 | 46 | rf = RandomForestRegressor(**params) 47 | rf.fit(X_train, y_train) 48 | y_pred = rf.predict(X_val) 49 | rmse = mean_squared_error(y_val, y_pred, squared=False) 50 | 51 | return rmse 52 | 53 | sampler = TPESampler(seed=42) 54 | study = optuna.create_study(direction="minimize", sampler=sampler) 55 | study.optimize(objective, n_trials=num_trials) 56 | 57 | 58 | if __name__ == '__main__': 59 | run_optimization() 60 | -------------------------------------------------------------------------------- /cohorts/2023/02-experiment-tracking/homework/preprocess_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import click 4 | import pandas as pd 5 | 6 | from sklearn.feature_extraction import DictVectorizer 7 | 8 | 9 | def dump_pickle(obj, filename: str): 10 | with open(filename, "wb") as f_out: 11 | return pickle.dump(obj, f_out) 12 | 13 | 14 | def read_dataframe(filename: str): 15 | df = pd.read_parquet(filename) 16 | 17 | df['duration'] = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime'] 18 | df.duration = df.duration.apply(lambda td: td.total_seconds() / 60) 19 | df = df[(df.duration >= 1) & (df.duration <= 60)] 20 | 21 | categorical = ['PULocationID', 'DOLocationID'] 22 | df[categorical] = df[categorical].astype(str) 23 | 24 | return df 25 | 26 | 27 | def preprocess(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False): 28 | df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID'] 29 | categorical = ['PU_DO'] 30 | numerical = ['trip_distance'] 31 | dicts = df[categorical + numerical].to_dict(orient='records') 32 | if fit_dv: 33 | X = dv.fit_transform(dicts) 34 | else: 35 | X = dv.transform(dicts) 36 | return X, dv 37 | 38 | 39 | @click.command() 40 | @click.option( 41 | "--raw_data_path", 42 | help="Location where the raw NYC taxi trip data was saved" 43 | ) 44 | @click.option( 45 | "--dest_path", 46 | help="Location 
where the resulting files will be saved" 47 | ) 48 | def run_data_prep(raw_data_path: str, dest_path: str, dataset: str = "green"): 49 | # Load parquet files 50 | df_train = read_dataframe( 51 | os.path.join(raw_data_path, f"{dataset}_tripdata_2022-01.parquet") 52 | ) 53 | df_val = read_dataframe( 54 | os.path.join(raw_data_path, f"{dataset}_tripdata_2022-02.parquet") 55 | ) 56 | df_test = read_dataframe( 57 | os.path.join(raw_data_path, f"{dataset}_tripdata_2022-03.parquet") 58 | ) 59 | 60 | # Extract the target 61 | target = 'tip_amount' 62 | y_train = df_train[target].values 63 | y_val = df_val[target].values 64 | y_test = df_test[target].values 65 | 66 | # Fit the DictVectorizer and preprocess data 67 | dv = DictVectorizer() 68 | X_train, dv = preprocess(df_train, dv, fit_dv=True) 69 | X_val, _ = preprocess(df_val, dv, fit_dv=False) 70 | X_test, _ = preprocess(df_test, dv, fit_dv=False) 71 | 72 | # Create dest_path folder unless it already exists 73 | os.makedirs(dest_path, exist_ok=True) 74 | 75 | # Save DictVectorizer and datasets 76 | dump_pickle(dv, os.path.join(dest_path, "dv.pkl")) 77 | dump_pickle((X_train, y_train), os.path.join(dest_path, "train.pkl")) 78 | dump_pickle((X_val, y_val), os.path.join(dest_path, "val.pkl")) 79 | dump_pickle((X_test, y_test), os.path.join(dest_path, "test.pkl")) 80 | 81 | 82 | if __name__ == '__main__': 83 | run_data_prep() 84 | -------------------------------------------------------------------------------- /cohorts/2023/02-experiment-tracking/homework/register_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import click 4 | import mlflow 5 | 6 | from mlflow.entities import ViewType 7 | from mlflow.tracking import MlflowClient 8 | from sklearn.ensemble import RandomForestRegressor 9 | from sklearn.metrics import mean_squared_error 10 | 11 | HPO_EXPERIMENT_NAME = "random-forest-hyperopt" 12 | EXPERIMENT_NAME = "random-forest-best-models" 13 | RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state', 'n_jobs'] 14 | 15 | mlflow.set_tracking_uri("http://127.0.0.1:5000") 16 | mlflow.set_experiment(EXPERIMENT_NAME) 17 | mlflow.sklearn.autolog() 18 | 19 | 20 | def load_pickle(filename): 21 | with open(filename, "rb") as f_in: 22 | return pickle.load(f_in) 23 | 24 | 25 | def train_and_log_model(data_path, params): 26 | X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl")) 27 | X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl")) 28 | X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl")) 29 | 30 | with mlflow.start_run(): 31 | for param in RF_PARAMS: 32 | params[param] = int(params[param]) 33 | 34 | rf = RandomForestRegressor(**params) 35 | rf.fit(X_train, y_train) 36 | 37 | # Evaluate model on the validation and test sets 38 | val_rmse = mean_squared_error(y_val, rf.predict(X_val), squared=False) 39 | mlflow.log_metric("val_rmse", val_rmse) 40 | test_rmse = mean_squared_error(y_test, rf.predict(X_test), squared=False) 41 | mlflow.log_metric("test_rmse", test_rmse) 42 | 43 | 44 | @click.command() 45 | @click.option( 46 | "--data_path", 47 | default="./output", 48 | help="Location where the processed NYC taxi trip data was saved" 49 | ) 50 | @click.option( 51 | "--top_n", 52 | default=5, 53 | type=int, 54 | help="Number of top models that need to be evaluated to decide which one to promote" 55 | ) 56 | def run_register_model(data_path: str, top_n: int): 57 | 58 | client = MlflowClient() 59 | 60 | # 
Retrieve the top_n model runs and log the models 61 | experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME) 62 | runs = client.search_runs( 63 | experiment_ids=experiment.experiment_id, 64 | run_view_type=ViewType.ACTIVE_ONLY, 65 | max_results=top_n, 66 | order_by=["metrics.rmse ASC"] 67 | ) 68 | for run in runs: 69 | train_and_log_model(data_path=data_path, params=run.data.params) 70 | 71 | # Select the model with the lowest test RMSE 72 | experiment = client.get_experiment_by_name(EXPERIMENT_NAME) 73 | # best_run = client.search_runs( ... )[0] 74 | 75 | # Register the best model 76 | # mlflow.register_model( ... ) 77 | 78 | 79 | if __name__ == '__main__': 80 | run_register_model() 81 | -------------------------------------------------------------------------------- /cohorts/2023/02-experiment-tracking/homework/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import click 4 | 5 | from sklearn.ensemble import RandomForestRegressor 6 | from sklearn.metrics import mean_squared_error 7 | 8 | 9 | def load_pickle(filename: str): 10 | with open(filename, "rb") as f_in: 11 | return pickle.load(f_in) 12 | 13 | 14 | @click.command() 15 | @click.option( 16 | "--data_path", 17 | default="./output", 18 | help="Location where the processed NYC taxi trip data was saved" 19 | ) 20 | def run_train(data_path: str): 21 | 22 | X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl")) 23 | X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl")) 24 | 25 | rf = RandomForestRegressor(max_depth=10, random_state=0) 26 | rf.fit(X_train, y_train) 27 | y_pred = rf.predict(X_val) 28 | 29 | rmse = mean_squared_error(y_val, y_pred, squared=False) 30 | 31 | 32 | if __name__ == '__main__': 33 | run_train() 34 | -------------------------------------------------------------------------------- /cohorts/2023/02-experiment-tracking/solution-mlflow/hpo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import click 4 | import mlflow 5 | import optuna 6 | 7 | from optuna.samplers import TPESampler 8 | from sklearn.ensemble import RandomForestRegressor 9 | from sklearn.metrics import mean_squared_error 10 | 11 | mlflow.set_tracking_uri("http://127.0.0.1:5000") 12 | mlflow.set_experiment("random-forest-hyperopt") 13 | 14 | 15 | def load_pickle(filename): 16 | with open(filename, "rb") as f_in: 17 | return pickle.load(f_in) 18 | 19 | 20 | @click.command() 21 | @click.option( 22 | "--data_path", 23 | default="./output", 24 | help="Location where the processed NYC taxi trip data was saved" 25 | ) 26 | @click.option( 27 | "--num_trials", 28 | default=10, 29 | help="The number of parameter evaluations for the optimizer to explore" 30 | ) 31 | def run_optimization(data_path: str, num_trials: int): 32 | mlflow.sklearn.autolog(disable=True) 33 | 34 | X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl")) 35 | X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl")) 36 | 37 | def objective(trial): 38 | params = { 39 | 'n_estimators': trial.suggest_int('n_estimators', 10, 50, 1), 40 | 'max_depth': trial.suggest_int('max_depth', 1, 20, 1), 41 | 'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, 1), 42 | 'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, 1), 43 | 'random_state': 42, 44 | 'n_jobs': -1 45 | } 46 | with mlflow.start_run(): 47 | mlflow.log_params(params) 48 | rf = RandomForestRegressor(**params) 49 | 
rf.fit(X_train, y_train) 50 | y_pred = rf.predict(X_val) 51 | rmse = mean_squared_error(y_val, y_pred, squared=False) 52 | mlflow.log_metric("rmse", rmse) 53 | 54 | return rmse 55 | 56 | sampler = TPESampler(seed=42) 57 | study = optuna.create_study(direction="minimize", sampler=sampler) 58 | study.optimize(objective, n_trials=num_trials) 59 | 60 | 61 | if __name__ == '__main__': 62 | run_optimization() 63 | -------------------------------------------------------------------------------- /cohorts/2023/02-experiment-tracking/solution-mlflow/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import click 4 | import mlflow 5 | 6 | from sklearn.ensemble import RandomForestRegressor 7 | from sklearn.metrics import mean_squared_error 8 | 9 | 10 | mlflow.set_tracking_uri("sqlite:///mlflow.db") 11 | mlflow.set_experiment("random-forest-train") 12 | 13 | 14 | def load_pickle(filename: str): 15 | with open(filename, "rb") as f_in: 16 | return pickle.load(f_in) 17 | 18 | 19 | @click.command() 20 | @click.option( 21 | "--data_path", 22 | default="./output", 23 | help="Location where the processed NYC taxi trip data was saved" 24 | ) 25 | def run_train(data_path: str): 26 | mlflow.sklearn.autolog() 27 | X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl")) 28 | X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl")) 29 | 30 | with mlflow.start_run(): 31 | 32 | rf = RandomForestRegressor(max_depth=10, random_state=0) 33 | rf.fit(X_train, y_train) 34 | y_pred = rf.predict(X_val) 35 | 36 | rmse = mean_squared_error(y_val, y_pred, squared=False) 37 | 38 | 39 | if __name__ == '__main__': 40 | run_train() 41 | -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | .ipynb_checkpoints 3 | models/* 4 | mlruns/* 5 | .vscode/ 6 | .DS_Store 7 | *.db 8 | *.DS_Store -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/3.2/cat_dog_facts.py: -------------------------------------------------------------------------------- 1 | import httpx 2 | from prefect import flow 3 | 4 | @flow 5 | def fetch_cat_fact(): 6 | '''A flow that gets a cat fact''' 7 | return httpx.get("https://catfact.ninja/fact?max_length=140").json()["fact"] 8 | 9 | @flow 10 | def fetch_dog_fact(): 11 | '''A flow that gets a dog fact''' 12 | return httpx.get( 13 | "https://dogapi.dog/api/v2/facts", 14 | headers={"accept": "application/json"}, 15 | ).json()["data"][0]["attributes"]["body"] 16 | 17 | @flow(log_prints=True) 18 | def animal_facts(): 19 | cat_fact = fetch_cat_fact() 20 | dog_fact = fetch_dog_fact() 21 | print(f"🐱: {cat_fact} \n🐶: {dog_fact}") 22 | 23 | if __name__ == "__main__": 24 | animal_facts() -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/3.2/cat_facts.py: -------------------------------------------------------------------------------- 1 | import httpx 2 | from prefect import flow, task 3 | 4 | 5 | @task(retries=4, retry_delay_seconds=0.1, log_prints=True) 6 | def fetch_cat_fact(): 7 | cat_fact = httpx.get("https://f3-vyx5c2hfpq-ue.a.run.app/") 8 | # An endpoint that is designed to fail sporadically 9 | if cat_fact.status_code >= 400: 10 | raise Exception() 11 | print(cat_fact.text) 12 | 13 | 14 | @flow 15 | def fetch(): 16 | 
fetch_cat_fact() 17 | 18 | 19 | if __name__ == "__main__": 20 | fetch() -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/3.5/create_s3_bucket_block.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | from prefect_aws import S3Bucket, AwsCredentials 3 | 4 | 5 | def create_aws_creds_block(): 6 | my_aws_creds_obj = AwsCredentials( 7 | aws_access_key_id="123abc", aws_secret_access_key="abc123" 8 | ) 9 | my_aws_creds_obj.save(name="my-aws-creds", overwrite=True) 10 | 11 | 12 | def create_s3_bucket_block(): 13 | aws_creds = AwsCredentials.load("my-aws-creds") 14 | my_s3_bucket_obj = S3Bucket( 15 | bucket_name="my-first-bucket-abc", credentials=aws_creds 16 | ) 17 | my_s3_bucket_obj.save(name="s3-bucket-example", overwrite=True) 18 | 19 | 20 | if __name__ == "__main__": 21 | create_aws_creds_block() 22 | sleep(5) 23 | create_s3_bucket_block() 24 | -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/3.6/create_s3_bucket_block.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | from prefect_aws import S3Bucket, AwsCredentials 3 | 4 | 5 | def create_aws_creds_block(): 6 | my_aws_creds_obj = AwsCredentials( 7 | aws_access_key_id="123abc", aws_secret_access_key="abc123" 8 | ) 9 | my_aws_creds_obj.save(name="my-aws-creds", overwrite=True) 10 | 11 | 12 | def create_s3_bucket_block(): 13 | aws_creds = AwsCredentials.load("my-aws-creds") 14 | my_s3_bucket_obj = S3Bucket( 15 | bucket_name="my-first-bucket-abc", credentials=aws_creds 16 | ) 17 | my_s3_bucket_obj.save(name="s3-bucket-example", overwrite=True) 18 | 19 | 20 | if __name__ == "__main__": 21 | create_aws_creds_block() 22 | sleep(5) 23 | create_s3_bucket_block() 24 | -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/images/Activity-create-run-deployment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/Activity-create-run-deployment.png -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/images/thumbnail-3-01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-01.jpg -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/images/thumbnail-3-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-01.png -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/images/thumbnail-3-02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-02.jpg 
-------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/images/thumbnail-3-03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-03.jpg -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/images/thumbnail-3-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-03.png -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/images/thumbnail-3-04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-04.jpg -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/images/thumbnail-3-04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-04.png -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/images/thumbnail-3-05.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-05.jpg -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/images/thumbnail-3-05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-05.png -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/images/thumbnail-3-06.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-06.jpg -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/images/thumbnail-3-06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/03-orchestration/prefect/images/thumbnail-3-06.png -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "module": { 3 | "number": 3, 4 | "title": "Orchestration and ML Pipelines" 5 | }, 6 | "units": [ 7 | { 8 | "number": 1, 9 | "title": "Introduction to Workflow 
Orchestration", 10 | "youtube": "https://www.youtube.com/watch?v=Cqb7wyaNF08" 11 | }, 12 | { 13 | "number": 2, 14 | "title": "Introduction to Prefect", 15 | "youtube": "https://www.youtube.com/watch?v=rTUBTvXvXvM" 16 | }, 17 | { 18 | "number": 3, 19 | "title": "Prefect Workflow", 20 | "youtube": "https://www.youtube.com/watch?v=x3bV8yMKjtc" 21 | }, 22 | { 23 | "number": 4, 24 | "title": "Deploying Your Workflow", 25 | "youtube": "https://www.youtube.com/watch?v=3YjagezFhOo" 26 | }, 27 | { 28 | "number": 5, 29 | "title": "Working with Deployments", 30 | "youtube": "https://www.youtube.com/watch?v=jVmaaqs63O8" 31 | }, 32 | { 33 | "number": 6, 34 | "title": "Prefect Cloud (optional)", 35 | "youtube": "https://www.youtube.com/watch?v=y89Ww85EUdo" 36 | }, 37 | { 38 | "number": 7, 39 | "title": "Homework", 40 | "youtube": "" 41 | } 42 | ] 43 | } -------------------------------------------------------------------------------- /cohorts/2023/03-orchestration/prefect/requirements.txt: -------------------------------------------------------------------------------- 1 | black==23.3.0 2 | fastparquet==2023.4.0 3 | hyperopt==0.2.7 4 | mlflow==2.3.1 5 | pandas==2.0.1 6 | prefect==2.10.8 7 | prefect-aws==0.3.1 8 | scikit_learn==1.2.2 9 | seaborn==0.12.2 10 | xgboost==1.7.5 11 | orjson==3.8.1 -------------------------------------------------------------------------------- /cohorts/2023/04-deployment/homework/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10.0-slim 2 | 3 | WORKDIR /app 4 | COPY [ "model2.bin", "model.bin" ] 5 | -------------------------------------------------------------------------------- /cohorts/2023/04-deployment/homework/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/04-deployment/homework/model.bin -------------------------------------------------------------------------------- /cohorts/2023/04-deployment/homework/starter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "2c51efaa", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "!pip freeze | grep scikit-learn" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "0ef880a0", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pickle\n", 21 | "import pandas as pd" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "id": "7836ccfd", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "with open('model.bin', 'rb') as f_in:\n", 32 | " dv, model = pickle.load(f_in)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "id": "41c08294", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "categorical = ['PULocationID', 'DOLocationID']\n", 43 | "\n", 44 | "def read_data(filename):\n", 45 | " df = pd.read_parquet(filename)\n", 46 | " \n", 47 | " df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime\n", 48 | " df['duration'] = df.duration.dt.total_seconds() / 60\n", 49 | "\n", 50 | " df = df[(df.duration >= 1) & (df.duration <= 60)].copy()\n", 51 | "\n", 52 | " df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')\n", 53 | " \n", 54 | " return df" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 4, 
60 | "id": "4854399a", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_????-??.parquet')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "id": "669fda0a", 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "dicts = df[categorical].to_dict(orient='records')\n", 75 | "X_val = dv.transform(dicts)\n", 76 | "y_pred = model.predict(X_val)" 77 | ] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 3 (ipykernel)", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.10.0" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 5 101 | } 102 | -------------------------------------------------------------------------------- /cohorts/2023/06-best-practices/homework/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10.0-slim 2 | 3 | RUN pip install -U pip & pip install pipenv 4 | 5 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 6 | 7 | RUN pipenv install --system --deploy 8 | 9 | COPY [ "batch.py", "batch.py" ] 10 | COPY [ "model.bin", "model.bin" ] 11 | 12 | ENTRYPOINT [ "python", "batch.py" ] -------------------------------------------------------------------------------- /cohorts/2023/06-best-practices/homework/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.2.2" 8 | pandas = "*" 9 | pyarrow = "*" 10 | s3fs = "*" 11 | 12 | [dev-packages] 13 | 14 | [requires] 15 | python_version = "3.10" 16 | -------------------------------------------------------------------------------- /cohorts/2023/06-best-practices/homework/batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import sys 5 | import pickle 6 | import pandas as pd 7 | 8 | 9 | year = int(sys.argv[1]) 10 | month = int(sys.argv[2]) 11 | 12 | input_file = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet' 13 | output_file = f'output/yellow_tripdata_{year:04d}-{month:02d}.parquet' 14 | 15 | 16 | with open('model.bin', 'rb') as f_in: 17 | dv, lr = pickle.load(f_in) 18 | 19 | 20 | categorical = ['PULocationID', 'DOLocationID'] 21 | 22 | def read_data(filename): 23 | df = pd.read_parquet(filename) 24 | 25 | df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime 26 | df['duration'] = df.duration.dt.total_seconds() / 60 27 | 28 | df = df[(df.duration >= 1) & (df.duration <= 60)].copy() 29 | 30 | df[categorical] = df[categorical].fillna(-1).astype('int').astype('str') 31 | 32 | return df 33 | 34 | 35 | df = read_data(input_file) 36 | df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str') 37 | 38 | 39 | dicts = df[categorical].to_dict(orient='records') 40 | X_val = dv.transform(dicts) 41 | y_pred = lr.predict(X_val) 42 | 43 | 44 | print('predicted mean duration:', y_pred.mean()) 45 | 46 | 47 | df_result = pd.DataFrame() 48 | df_result['ride_id'] = df['ride_id'] 49 | df_result['predicted_duration'] = y_pred 50 | 51 | 
52 | df_result.to_parquet(output_file, engine='pyarrow', index=False) 53 | -------------------------------------------------------------------------------- /cohorts/2023/06-best-practices/homework/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/06-best-practices/homework/model.bin -------------------------------------------------------------------------------- /cohorts/2023/06-best-practices/homework_solution/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.2.2" 8 | pandas = "*" 9 | pyarrow = "*" 10 | s3fs = "*" 11 | 12 | [dev-packages] 13 | pytest = "*" 14 | 15 | [requires] 16 | python_version = "3.10" 17 | -------------------------------------------------------------------------------- /cohorts/2023/06-best-practices/homework_solution/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | localstack: 3 | image: localstack/localstack 4 | ports: 5 | - "4566:4566" 6 | environment: 7 | - SERVICES=s3 8 | - AWS_DEFAULT_REGION=eu-west-1 9 | - AWS_ACCESS_KEY_ID=abc 10 | - AWS_SECRET_ACCESS_KEY=xyz -------------------------------------------------------------------------------- /cohorts/2023/06-best-practices/homework_solution/integration_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from datetime import datetime 4 | 5 | import batch 6 | 7 | 8 | def dt(hour, minute, second=0): 9 | return datetime(2022, 1, 1, hour, minute, second) 10 | 11 | 12 | S3_ENDPOINT_URL = os.getenv('S3_ENDPOINT_URL') 13 | 14 | options = { 15 | 'client_kwargs': { 16 | 'endpoint_url': S3_ENDPOINT_URL 17 | } 18 | } 19 | 20 | data = [ 21 | (None, None, dt(1, 2), dt(1, 10)), 22 | (1, None, dt(1, 2), dt(1, 10)), 23 | (1, 2, dt(2, 2), dt(2, 3)), 24 | (None, 1, dt(1, 2, 0), dt(1, 2, 50)), 25 | (2, 3, dt(1, 2, 0), dt(1, 2, 59)), 26 | (3, 4, dt(1, 2, 0), dt(2, 2, 1)), 27 | ] 28 | 29 | columns = ['PULocationID', 'DOLocationID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime'] 30 | df_input = pd.DataFrame(data, columns=columns) 31 | 32 | 33 | input_file = batch.get_input_path(2022, 1) 34 | output_file = batch.get_output_path(2022, 1) 35 | 36 | df_input.to_parquet( 37 | input_file, 38 | engine='pyarrow', 39 | compression=None, 40 | index=False, 41 | storage_options=options 42 | ) 43 | 44 | 45 | os.system('python batch.py 2022 1') 46 | 47 | 48 | df_actual = pd.read_parquet(output_file, storage_options=options) 49 | print(df_actual['predicted_duration'].sum()) 50 | 51 | assert abs(df_actual['predicted_duration'].sum() - 31.51) < 0.1 -------------------------------------------------------------------------------- /cohorts/2023/06-best-practices/homework_solution/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/06-best-practices/homework_solution/model.bin -------------------------------------------------------------------------------- /cohorts/2023/06-best-practices/homework_solution/tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2023/06-best-practices/homework_solution/tests/__init__.py -------------------------------------------------------------------------------- /cohorts/2023/06-best-practices/homework_solution/tests/test_batch.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datetime import datetime 3 | 4 | from batch import prepare_data 5 | 6 | 7 | def dt(hour, minute, second=0): 8 | return datetime(2022, 1, 1, hour, minute, second) 9 | 10 | 11 | def test_prepare_data(): 12 | data = [ 13 | (None, None, dt(1, 2), dt(1, 10)), 14 | (1, None, dt(1, 2), dt(1, 10)), 15 | (1, 2, dt(2, 2), dt(2, 3)), 16 | (None, 1, dt(1, 2, 0), dt(1, 2, 50)), 17 | (2, 3, dt(1, 2, 0), dt(1, 2, 59)), 18 | (3, 4, dt(1, 2, 0), dt(2, 2, 1)), 19 | ] 20 | 21 | categorical = ['PULocationID', 'DOLocationID'] 22 | columns = ['PULocationID', 'DOLocationID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime'] 23 | df = pd.DataFrame(data, columns=columns) 24 | 25 | df_actual = prepare_data(df, categorical) 26 | 27 | data_expected = [ 28 | ('-1', '-1', 8.0), 29 | ('1', '-1', 8.0), 30 | ('1', '2', 1.0), 31 | ] 32 | 33 | columns_test = ['PULocationID', 'DOLocationID', 'duration'] 34 | df_expected = pd.DataFrame(data_expected, columns=columns_test) 35 | print(df_actual) 36 | 37 | assert (df_actual['PULocationID'] == df_expected['PULocationID']).all() 38 | assert (df_actual['DOLocationID'] == df_expected['DOLocationID']).all() 39 | assert (df_actual['duration'] - df_expected['duration']).abs().sum() < 0.0000001 40 | 41 | 42 | -------------------------------------------------------------------------------- /cohorts/2023/07-project/README.md: -------------------------------------------------------------------------------- 1 | ## Course Project 2 | 3 | The goal of this project is to apply everything we learned 4 | in this course and build an end-to-end machine learning project. 5 | 6 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, your project can't be considered complete.
7 | 8 | 9 | ### Submitting 10 | 11 | #### Project Cohort #1 12 | 13 | Project: 14 | 15 | * Form: https://forms.gle/mRRoDtqDXBytvsoD9 16 | * Deadline: 31 July, 23:00 CEST 17 | 18 | Peer reviewing: 19 | 20 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vTAztxXsLidZV8I18gL9_qtJpxhyvyhJsEeXrP3kFyZoauGgR-S4p6b7H5yJ9kdTbUE5wAAvZgTTZ49/pubhtml?gid=0&single=true) ("project 1" tab) 21 | * Form: https://forms.gle/MRMHDuFiP6DFShaj7 22 | * Deadline: 10 August, 23:00 CEST 23 | 24 | Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vTS8Mlu6sWyu6JinFUftUl6OB5mxXlwGT2icIyQCSbhDDmW36WWyAbv2dCFJhng6Nln0o3cwvTchjcU/pubhtml?gid=0&single=true) ("feedback-01" tab) 25 | 26 | 27 | 28 | #### Project Cohort #2 29 | 30 | Project: 31 | 32 | * Form: https://forms.gle/o1s3NmYE4UmFSMVD7 33 | * Deadline: 21 August (Monday), 23:00 CEST 34 | 35 | Peer reviewing: 36 | 37 | * Peer review assignments: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vTAztxXsLidZV8I18gL9_qtJpxhyvyhJsEeXrP3kFyZoauGgR-S4p6b7H5yJ9kdTbUE5wAAvZgTTZ49/pubhtml?gid=1942033009&single=true) ("project 2" tab) 38 | * Form: https://forms.gle/R4Y58WSxGDWsDBFv7 39 | * Deadline: 29 August (Tuesday), 23:00 CEST 40 | 41 | Project feedback: [link](https://docs.google.com/spreadsheets/d/e/2PACX-1vTS8Mlu6sWyu6JinFUftUl6OB5mxXlwGT2icIyQCSbhDDmW36WWyAbv2dCFJhng6Nln0o3cwvTchjcU/pubhtml?gid=546664034&single=true) ("feedback-02" tab) 42 | 43 | 44 | 45 | ### Evaluation criteria 46 | 47 | See [here](../../../07-project/README.md) 48 | 49 | 50 | ### Misc 51 | 52 | To get the hash for your project, use this function to hash your email: 53 | 54 | ```python 55 | from hashlib import sha1 56 | 57 | def compute_hash(email): 58 | return sha1(email.lower().encode('utf-8')).hexdigest() 59 | ``` 60 | 61 | Or use [this website](http://www.sha1-online.com/). 
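For example (the email below is a placeholder, substitute your own):

```python
compute_hash('jane.doe@example.com')
# returns a 40-character hexadecimal string; that string is your project hash
```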
62 | -------------------------------------------------------------------------------- /cohorts/2023/README.md: -------------------------------------------------------------------------------- 1 | ## MLOps Zoomcamp 2023 Cohort 2 | 3 | * [Pre-Course Live Q&A](https://www.youtube.com/watch?v=o34Q_61iA4Y&list=PL3MmuxUbc_hKqamJqQ7Ew8HxptJYnXqQM&index=1) 4 | * [Technical FAQ](https://docs.google.com/document/d/12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0/edit) 5 | * [Course Playlist: Only 2023 Live videos & homeworks](https://www.youtube.com/playlist?list=PL3MmuxUbc_hKqamJqQ7Ew8HxptJYnXqQM) 6 | * [Leaderboard](https://docs.google.com/spreadsheets/d/e/2PACX-1vTHTc2eDorvcprX3SRd_ZejSnOjd7SUBlmr7ttYs9NsbS3G9szB9wMlMfCOLVL5XWCB0p8oaDOfffaZ/pubhtml) 7 | * [Deadline calendar](https://docs.google.com/spreadsheets/d/e/2PACX-1vRNTwA0Of1lyprYpn2YxU-l0gvNeq-up7g7ITB42nPf2gT9Qd3PTzqTmkjAZjk1s__r7D99CsJfcZEO/pubhtml?gid=0&single=true) 8 | 9 | 10 | [**Module 1: Introduction**](01-intro) 11 | 12 | * [Homework](01-intro/homework.md) 13 | * [Solution](01-intro/homework.ipynb) 14 | 15 | [**Module 2: Experiment Tracking**](02-experiment-tracking/) 16 | 17 | * [Homework](02-experiment-tracking/homework.md) 18 | * [Workshop: Weights & Biases](02-experiment-tracking/wandb.md) 19 | * [Solution MLflow](02-experiment-tracking/solution-mlflow/) 20 | 21 | [**Module 3: Orchestration and ML pipelines**](03-orchestration/) 22 | 23 | * [Homework](03-orchestration/homework.md) 24 | 25 | [**Module 4: Model Deployment**](04-deployment) 26 | 27 | * [Homework](04-deployment/homework.md) 28 | * [Solution](04-deployment/homework_solution) 29 | 30 | [**Module 5: Model Monitoring**](05-monitoring/) 31 | 32 | * [Homework](05-monitoring/homework.md) 33 | 34 | [**Module 6: Best Practices**](06-best-practices) 35 | 36 | * [Homework](06-best-practices/homework.md) 37 | * [Solution](06-best-practices/homework_solution) 38 | 39 | 40 | [**Projects**](07-project/) 41 | 42 | * [More information](07-project/README.md) 43 | -------------------------------------------------------------------------------- /cohorts/2024/01-intro/homework.md: -------------------------------------------------------------------------------- 1 | ## Homework 2 | 3 | The goal of this homework is to train a simple model for predicting the duration of a ride - similar to what we did in this module. 4 | 5 | 6 | ## Q1. Downloading the data 7 | 8 | We'll use [the same NYC taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page), 9 | but instead of "**Green** Taxi Trip Records", we'll use "**Yellow** Taxi Trip Records". 10 | 11 | Download the data for January and February 2023. 12 | 13 | Read the data for January. How many columns are there? 14 | 15 | * 16 16 | * 17 17 | * 18 18 | * 19 19 | 20 | 21 | ## Q2. Computing duration 22 | 23 | Now let's compute the `duration` variable. It should contain the duration of a ride in minutes. 24 | 25 | What's the standard deviation of the trip durations in January? 26 | 27 | * 32.59 28 | * 42.59 29 | * 52.59 30 | * 62.59 31 | 32 | 33 | ## Q3. Dropping outliers 34 | 35 | Next, we need to check the distribution of the `duration` variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive). 36 | 37 | What fraction of the records is left after you dropped the outliers? 38 | 39 | * 90% 40 | * 92% 41 | * 95% 42 | * 98% 43 | 44 | 45 | ## Q4. One-hot encoding 46 | 47 | Let's apply one-hot encoding to the pickup and dropoff location IDs. 
We'll use only these two features for our model. 48 | 49 | * Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will 50 | label encode them) 51 | * Fit a dictionary vectorizer 52 | * Get a feature matrix from it 53 | 54 | What's the dimensionality of this matrix (number of columns)? 55 | 56 | * 2 57 | * 155 58 | * 345 59 | * 515 60 | * 715 61 | 62 | 63 | ## Q5. Training a model 64 | 65 | Now let's use the feature matrix from the previous step to train a model. 66 | 67 | * Train a plain linear regression model with default parameters, where duration is the response variable 68 | * Calculate the RMSE of the model on the training data 69 | 70 | What's the RMSE on train? 71 | 72 | * 3.64 73 | * 7.64 74 | * 11.64 75 | * 16.64 76 | 77 | 78 | ## Q6. Evaluating the model 79 | 80 | Now let's apply this model to the validation dataset (February 2023). 81 | 82 | What's the RMSE on validation? 83 | 84 | * 3.81 85 | * 7.81 86 | * 11.81 87 | * 16.81 88 | 89 | ## Submit the results 90 | 91 | * Submit your results here: https://courses.datatalks.club/mlops-zoomcamp-2024/homework/hw1 92 | * If your answer doesn't match options exactly, select the closest one 93 | -------------------------------------------------------------------------------- /cohorts/2024/02-experiment-tracking/homework/hpo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import click 4 | import mlflow 5 | import numpy as np 6 | from hyperopt import STATUS_OK, Trials, fmin, hp, tpe 7 | from hyperopt.pyll import scope 8 | from sklearn.ensemble import RandomForestRegressor 9 | from sklearn.metrics import mean_squared_error 10 | 11 | mlflow.set_tracking_uri("http://127.0.0.1:5000") 12 | mlflow.set_experiment("random-forest-hyperopt") 13 | 14 | 15 | def load_pickle(filename: str): 16 | with open(filename, "rb") as f_in: 17 | return pickle.load(f_in) 18 | 19 | 20 | @click.command() 21 | @click.option( 22 | "--data_path", 23 | default="./output", 24 | help="Location where the processed NYC taxi trip data was saved" 25 | ) 26 | @click.option( 27 | "--num_trials", 28 | default=15, 29 | help="The number of parameter evaluations for the optimizer to explore" 30 | ) 31 | def run_optimization(data_path: str, num_trials: int): 32 | 33 | X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl")) 34 | X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl")) 35 | 36 | def objective(params): 37 | 38 | rf = RandomForestRegressor(**params) 39 | rf.fit(X_train, y_train) 40 | y_pred = rf.predict(X_val) 41 | rmse = mean_squared_error(y_val, y_pred, squared=False) 42 | 43 | return {'loss': rmse, 'status': STATUS_OK} 44 | 45 | search_space = { 46 | 'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)), 47 | 'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)), 48 | 'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)), 49 | 'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)), 50 | 'random_state': 42 51 | } 52 | 53 | rstate = np.random.default_rng(42) # for reproducible results 54 | fmin( 55 | fn=objective, 56 | space=search_space, 57 | algo=tpe.suggest, 58 | max_evals=num_trials, 59 | trials=Trials(), 60 | rstate=rstate 61 | ) 62 | 63 | 64 | if __name__ == '__main__': 65 | run_optimization() 66 | -------------------------------------------------------------------------------- /cohorts/2024/02-experiment-tracking/homework/preprocess_data.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import click 4 | import pandas as pd 5 | 6 | from sklearn.feature_extraction import DictVectorizer 7 | 8 | 9 | def dump_pickle(obj, filename: str): 10 | with open(filename, "wb") as f_out: 11 | return pickle.dump(obj, f_out) 12 | 13 | 14 | def read_dataframe(filename: str): 15 | df = pd.read_parquet(filename) 16 | 17 | df['duration'] = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime'] 18 | df.duration = df.duration.apply(lambda td: td.total_seconds() / 60) 19 | df = df[(df.duration >= 1) & (df.duration <= 60)] 20 | 21 | categorical = ['PULocationID', 'DOLocationID'] 22 | df[categorical] = df[categorical].astype(str) 23 | 24 | return df 25 | 26 | 27 | def preprocess(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False): 28 | df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID'] 29 | categorical = ['PU_DO'] 30 | numerical = ['trip_distance'] 31 | dicts = df[categorical + numerical].to_dict(orient='records') 32 | if fit_dv: 33 | X = dv.fit_transform(dicts) 34 | else: 35 | X = dv.transform(dicts) 36 | return X, dv 37 | 38 | 39 | @click.command() 40 | @click.option( 41 | "--raw_data_path", 42 | help="Location where the raw NYC taxi trip data was saved" 43 | ) 44 | @click.option( 45 | "--dest_path", 46 | help="Location where the resulting files will be saved" 47 | ) 48 | def run_data_prep(raw_data_path: str, dest_path: str, dataset: str = "green"): 49 | # Load parquet files 50 | df_train = read_dataframe( 51 | os.path.join(raw_data_path, f"{dataset}_tripdata_2023-01.parquet") 52 | ) 53 | df_val = read_dataframe( 54 | os.path.join(raw_data_path, f"{dataset}_tripdata_2023-02.parquet") 55 | ) 56 | df_test = read_dataframe( 57 | os.path.join(raw_data_path, f"{dataset}_tripdata_2023-03.parquet") 58 | ) 59 | 60 | # Extract the target 61 | target = 'duration' 62 | y_train = df_train[target].values 63 | y_val = df_val[target].values 64 | y_test = df_test[target].values 65 | 66 | # Fit the DictVectorizer and preprocess data 67 | dv = DictVectorizer() 68 | X_train, dv = preprocess(df_train, dv, fit_dv=True) 69 | X_val, _ = preprocess(df_val, dv, fit_dv=False) 70 | X_test, _ = preprocess(df_test, dv, fit_dv=False) 71 | 72 | # Create dest_path folder unless it already exists 73 | os.makedirs(dest_path, exist_ok=True) 74 | 75 | # Save DictVectorizer and datasets 76 | dump_pickle(dv, os.path.join(dest_path, "dv.pkl")) 77 | dump_pickle((X_train, y_train), os.path.join(dest_path, "train.pkl")) 78 | dump_pickle((X_val, y_val), os.path.join(dest_path, "val.pkl")) 79 | dump_pickle((X_test, y_test), os.path.join(dest_path, "test.pkl")) 80 | 81 | 82 | if __name__ == '__main__': 83 | run_data_prep() 84 | -------------------------------------------------------------------------------- /cohorts/2024/02-experiment-tracking/homework/register_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import click 4 | import mlflow 5 | 6 | from mlflow.entities import ViewType 7 | from mlflow.tracking import MlflowClient 8 | from sklearn.ensemble import RandomForestRegressor 9 | from sklearn.metrics import mean_squared_error 10 | 11 | HPO_EXPERIMENT_NAME = "random-forest-hyperopt" 12 | EXPERIMENT_NAME = "random-forest-best-models" 13 | RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state'] 14 | 15 | mlflow.set_tracking_uri("http://127.0.0.1:5000") 16 | 
mlflow.set_experiment(EXPERIMENT_NAME) 17 | mlflow.sklearn.autolog() 18 | 19 | 20 | def load_pickle(filename): 21 | with open(filename, "rb") as f_in: 22 | return pickle.load(f_in) 23 | 24 | 25 | def train_and_log_model(data_path, params): 26 | X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl")) 27 | X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl")) 28 | X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl")) 29 | 30 | with mlflow.start_run(): 31 | new_params = {} 32 | for param in RF_PARAMS: 33 | new_params[param] = int(params[param]) 34 | 35 | rf = RandomForestRegressor(**new_params) 36 | rf.fit(X_train, y_train) 37 | 38 | # Evaluate model on the validation and test sets 39 | val_rmse = mean_squared_error(y_val, rf.predict(X_val), squared=False) 40 | mlflow.log_metric("val_rmse", val_rmse) 41 | test_rmse = mean_squared_error(y_test, rf.predict(X_test), squared=False) 42 | mlflow.log_metric("test_rmse", test_rmse) 43 | 44 | 45 | @click.command() 46 | @click.option( 47 | "--data_path", 48 | default="./output", 49 | help="Location where the processed NYC taxi trip data was saved" 50 | ) 51 | @click.option( 52 | "--top_n", 53 | default=5, 54 | type=int, 55 | help="Number of top models that need to be evaluated to decide which one to promote" 56 | ) 57 | def run_register_model(data_path: str, top_n: int): 58 | 59 | client = MlflowClient() 60 | 61 | # Retrieve the top_n model runs and log the models 62 | experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME) 63 | runs = client.search_runs( 64 | experiment_ids=experiment.experiment_id, 65 | run_view_type=ViewType.ACTIVE_ONLY, 66 | max_results=top_n, 67 | order_by=["metrics.rmse ASC"] 68 | ) 69 | for run in runs: 70 | train_and_log_model(data_path=data_path, params=run.data.params) 71 | 72 | # Select the model with the lowest test RMSE 73 | experiment = client.get_experiment_by_name(EXPERIMENT_NAME) 74 | # best_run = client.search_runs( ... )[0] 75 | 76 | # Register the best model 77 | # mlflow.register_model( ... 
) 78 | 79 | 80 | if __name__ == '__main__': 81 | run_register_model() 82 | -------------------------------------------------------------------------------- /cohorts/2024/02-experiment-tracking/homework/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import click 4 | 5 | from sklearn.ensemble import RandomForestRegressor 6 | from sklearn.metrics import mean_squared_error 7 | 8 | 9 | def load_pickle(filename: str): 10 | with open(filename, "rb") as f_in: 11 | return pickle.load(f_in) 12 | 13 | 14 | @click.command() 15 | @click.option( 16 | "--data_path", 17 | default="./output", 18 | help="Location where the processed NYC taxi trip data was saved" 19 | ) 20 | def run_train(data_path: str): 21 | 22 | X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl")) 23 | X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl")) 24 | 25 | rf = RandomForestRegressor(max_depth=10, random_state=0) 26 | rf.fit(X_train, y_train) 27 | y_pred = rf.predict(X_val) 28 | 29 | rmse = mean_squared_error(y_val, y_pred, squared=False) 30 | 31 | 32 | if __name__ == '__main__': 33 | run_train() 34 | -------------------------------------------------------------------------------- /cohorts/2024/02-experiment-tracking/solution/hpo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import click 4 | import mlflow 5 | import numpy as np 6 | from hyperopt import STATUS_OK, Trials, fmin, hp, tpe 7 | from hyperopt.pyll import scope 8 | from sklearn.ensemble import RandomForestRegressor 9 | from sklearn.metrics import mean_squared_error 10 | 11 | mlflow.set_tracking_uri("http://127.0.0.1:5000") 12 | mlflow.set_experiment("random-forest-hyperopt") 13 | 14 | 15 | def load_pickle(filename: str): 16 | with open(filename, "rb") as f_in: 17 | return pickle.load(f_in) 18 | 19 | 20 | @click.command() 21 | @click.option( 22 | "--data_path", 23 | default="./output", 24 | help="Location where the processed NYC taxi trip data was saved" 25 | ) 26 | @click.option( 27 | "--num_trials", 28 | default=15, 29 | help="The number of parameter evaluations for the optimizer to explore" 30 | ) 31 | def run_optimization(data_path: str, num_trials: int): 32 | 33 | X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl")) 34 | X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl")) 35 | 36 | def objective(params): 37 | 38 | with mlflow.start_run(): 39 | mlflow.log_params(params) 40 | rf = RandomForestRegressor(**params) 41 | rf.fit(X_train, y_train) 42 | y_pred = rf.predict(X_val) 43 | rmse = mean_squared_error(y_val, y_pred, squared=False) 44 | mlflow.log_metric("rmse", rmse) 45 | 46 | return {'loss': rmse, 'status': STATUS_OK} 47 | 48 | search_space = { 49 | 'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)), 50 | 'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)), 51 | 'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)), 52 | 'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)), 53 | 'random_state': 42 54 | } 55 | 56 | rstate = np.random.default_rng(42) # for reproducible results 57 | fmin( 58 | fn=objective, 59 | space=search_space, 60 | algo=tpe.suggest, 61 | max_evals=num_trials, 62 | trials=Trials(), 63 | rstate=rstate 64 | ) 65 | 66 | 67 | if __name__ == '__main__': 68 | run_optimization() 69 | -------------------------------------------------------------------------------- 
/cohorts/2024/02-experiment-tracking/solution/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import click 4 | import mlflow 5 | 6 | from sklearn.ensemble import RandomForestRegressor 7 | from sklearn.metrics import mean_squared_error 8 | 9 | 10 | mlflow.set_tracking_uri("sqlite:///mlflow.db") 11 | mlflow.set_experiment("random-forest-train") 12 | 13 | 14 | def load_pickle(filename: str): 15 | with open(filename, "rb") as f_in: 16 | return pickle.load(f_in) 17 | 18 | 19 | @click.command() 20 | @click.option( 21 | "--data_path", 22 | default="./output", 23 | help="Location where the processed NYC taxi trip data was saved" 24 | ) 25 | def run_train(data_path: str): 26 | mlflow.sklearn.autolog() 27 | X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl")) 28 | X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl")) 29 | 30 | with mlflow.start_run(): 31 | 32 | rf = RandomForestRegressor(max_depth=10, random_state=0) 33 | rf.fit(X_train, y_train) 34 | y_pred = rf.predict(X_val) 35 | 36 | rmse = mean_squared_error(y_val, y_pred, squared=False) 37 | 38 | 39 | if __name__ == '__main__': 40 | run_train() 41 | -------------------------------------------------------------------------------- /cohorts/2024/03-orchestration/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | .ipynb_checkpoints 3 | models/* 4 | mlruns/* 5 | .vscode/ 6 | .DS_Store 7 | *.db 8 | *.DS_Store 9 | -------------------------------------------------------------------------------- /cohorts/2024/03-orchestration/3.2/README.md: -------------------------------------------------------------------------------- 1 | # 3.2 Training: sklearn models and XGBoost 2 | 3 | 4 | ## 1. Training pipeline for sklearn models 5 | 6 | ### Videos 7 | 8 | 1. [GDP training set](https://youtu.be/KP68DuJnk4Q?si=tVHWYLCpZ2RpwuNh) 9 | 1. [Sklearn training GDP](https://youtu.be/CbHaZcq_uGo) 10 | 1. [Load models](https://youtu.be/zsMHFq2C978) 11 | 1. [Utility helper functions for loading models](https://youtu.be/fZnxDhtPxYo) 12 | 1. [Hyperparameter tuning](https://youtu.be/zfBB4KoZ7TM) 13 | 1. [Train sklearn model](https://youtu.be/P7PtegUFk3k) 14 | 15 | ### Code 16 | 17 | - [`utils/models/sklearn.py`](https://github.com/mage-ai/mlops/blob/master/mlops/utils/models/sklearn.py) 18 | - [`custom/load_models.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/load_models.py): load sklearn models dynamically 19 | - [`transformers/hyperparameter_tuning/sklearn.py`](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/transformers/hyperparameter_tuning/sklearn.py) 20 | - [`data_exporters/sklearn.py`](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/data_exporters/sklearn.py) 21 | - [`hyperparameters/shared.py`](https://github.com/mage-ai/mlops/blob/master/mlops/utils/hyperparameters/shared.py) 22 | 23 | --- 24 | 25 | ## 2. Training pipeline for XGBoost model 26 | 27 | ### Videos 28 | 29 | 1. [Hyperparameter tuning](https://youtu.be/K_Z2Lm1Cyu4) 30 | 1. 
[Train XGBoost model](https://youtu.be/Y2B-ivm7Mug) 31 | 32 | ### Code 33 | 34 | - [`utils/models/xgboost.py`](https://github.com/mage-ai/mlops/blob/master/mlops/utils/models/xgboost.py) 35 | - [`transformers/hyperparameter_tuning/xgboost.py`](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/transformers/hyperparameter_tuning/xgboost.py) 36 | - [`data_exporters/xgboost.py`](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/data_exporters/xgboost.py) 37 | - [`hyperparameters/shared.py`](https://github.com/mage-ai/mlops/blob/master/mlops/utils/hyperparameters/shared.py) 38 | 39 | --- 40 | 41 | ## Code 42 | 43 | 1. [Complete code solution](https://github.com/mage-ai/mlops) 44 | 1. [sklearn training pipeline configuration](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/pipelines/sklearn_training/metadata.yaml) 45 | 1. [XGBoost training pipeline configuration](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/pipelines/xgboost_training/metadata.yaml) 46 | 47 | --- 48 | 49 | ## Resources 50 | 51 | 1. [Accuracy, precision, recall](https://www.mage.ai/blog/definitive-guide-to-accuracy-precision-recall-for-product-developers) 52 | 53 | 1. [Regression model performance metrics](https://www.mage.ai/blog/product-developers-guide-to-ml-regression-model-metrics) 54 | -------------------------------------------------------------------------------- /cohorts/2024/03-orchestration/3.4/README.md: -------------------------------------------------------------------------------- 1 | # 3.4 Triggering: Inference and retraining 2 | 3 | 4 | ## 1. Retraining pipeline 5 | 6 | ### Videos 7 | 8 | 1. [Setup pipeline](https://youtu.be/ywzNac-OzFc) 9 | 1. [Trigger pipeline to run](https://youtu.be/6kcBWl3E8So) 10 | 11 | ### Code 12 | 13 | - [`detect_new_data.py`](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/sensors/detect_new_data.py) 14 | - [`custom/retrain/sklearn.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/retrain/sklearn.py): trigger training pipeline for sklearn models 15 | - [`custom/retrain/xgboost.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/retrain/xgboost.py): trigger training pipeline for XGBoost model 16 | 17 | --- 18 | 19 | ## 2. Inference pipeline 20 | 21 | ### Videos 22 | 23 | 1. [Make a prediction](https://youtu.be/KZaS2oG9NDc) 24 | 1. [Build pipeline](https://youtu.be/mytcFbH_ooY) 25 | 1. [Model inference playground part 1](https://youtu.be/JI0dhR7Bnhk) 26 | 1. [Model inference playground part 2](https://youtu.be/v2ls-gBBRac) 27 | 1. [Get prediction via API](https://youtu.be/J6ckSZczk8M) 28 | 29 | ### Code 30 | 31 | - [`custom/inference.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/inference.py) 32 | 33 | --- 34 | 35 | ## Code 36 | 37 | 1. [Retraining pipeline `metadata.yaml`](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/pipelines/automatic_retraining/metadata.yaml) 38 | 1. [Inference pipeline `metadata.yaml`](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/pipelines/predict/metadata.yaml) 39 | 1. [Playground configuration settings](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/interactions/playground.yaml) 40 | 41 | --- 42 | 43 | ## Resources 44 | 45 | 1. [No-code UI interactions](https://docs.mage.ai/interactions/overview) 46 | 47 | 1. 
[Saving triggers in code](https://docs.mage.ai/orchestration/triggers/configure-triggers-in-code) 48 | 49 | 1. [Trigger another pipeline from a block](https://docs.mage.ai/orchestration/triggers/trigger-pipeline) 50 | 51 | 1. [Trigger pipeline via API endpoint](https://docs.mage.ai/orchestration/triggers/trigger-pipeline-api) 52 | 53 | 1. [Run pipelines on a recurring schedule](https://docs.mage.ai/orchestration/triggers/schedule-pipelines) 54 | 55 | 1. [Improving model performance through retraining]() 56 | -------------------------------------------------------------------------------- /cohorts/2024/03-orchestration/3.5/README.md: -------------------------------------------------------------------------------- 1 | # 3.5 Deploying: Running operations in production 2 | 3 | 4 | ## 1. Permissions 5 | 6 | ### Videos 7 | 8 | 1. [Configure permissions on AWS](https://youtu.be/TgdFaf4mw38) 9 | 10 | ### Code 11 | 12 | - [`custom/permissions.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/permissions.py) 13 | 14 | --- 15 | 16 | ## 2. Deploy 17 | 18 | ### Videos 19 | 20 | 1. [Setup and deploy using Terraform](https://youtu.be/w9zl3n2a3Wc) 21 | 22 | ### Code 23 | 24 | - [`custom/infrastructure_setup.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/infrastructure_setup.py) 25 | - [`custom/deploy.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/deploy.py) 26 | - [`custom/teardown_deployed_resources.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/teardown_deployed_resources.py) 27 | 28 | --- 29 | 30 | ## 3. Continuous deployment and integration 31 | 32 | ### Videos 33 | 34 | 1. [CI/CD with GitHub Actions](https://youtu.be/tPkA3WjLSHE) 35 | 1. [Mage deployed](https://youtu.be/DMV2zEM50jY) 36 | 37 | ### Code 38 | 39 | - [`custom/ci_and_cd.py` block](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/custom/ci_and_cd.py) 40 | 41 | ## Code 42 | 43 | 1. [Deployment pipeline `metadata.yaml`](https://github.com/mage-ai/mlops/blob/master/mlops/unit_3_observability/pipelines/deploying_to_production/metadata.yaml) 44 | 45 | --- 46 | 47 | ## Resources 48 | 49 | 1. [Repository setup](https://docs.mage.ai/production/ci-cd/local-cloud/repository-setup) 50 | 1. AWS IAM policy permissions 51 | 52 | 1. [Terraform apply](https://docs.mage.ai/production/deploying-to-cloud/aws/terraform-apply-policy) 53 | 1. [Terraform destroy](https://docs.mage.ai/production/deploying-to-cloud/aws/terraform-destroy-policy) 54 | 55 | 1. [Terraform setup](https://docs.mage.ai/production/deploying-to-cloud/using-terraform) 56 | 57 | 1. [Configure Terraform for AWS](https://docs.mage.ai/production/deploying-to-cloud/aws/setup) 58 | 59 | 1. [CI/CD overview](https://docs.mage.ai/production/ci-cd/overview) 60 | 61 | 1. [Setup GitHub actions for CI/CD](https://docs.mage.ai/production/ci-cd/local-cloud/github-actions#github-actions-setup) 62 | -------------------------------------------------------------------------------- /cohorts/2024/03-orchestration/README.md: -------------------------------------------------------------------------------- 1 | # 3. 
Orchestration and ML Pipelines 2 | 3 | ## [3.0 Introduction: ML pipelines and Mage](3.0/README.md) 4 | 5 | ## [3.1 Data preparation: ETL and feature engineering](3.1/README.md) 6 | 7 | ## [3.2 Training: sklearn models and XGBoost](3.2/README.md) 8 | 9 | ## [3.3 Observability: Monitoring and alerting](3.3/README.md) 10 | 11 | ## [3.4 Triggering: Inference and retraining](3.4/README.md) 12 | 13 | ## [3.5 Deploying: Running operations in production](3.5/README.md) 14 | 15 | ## [3.6 Homework](homework.md) 16 | 17 | ## Quickstart 18 | 19 | See [Unit 3.0](https://github.com/DataTalksClub/mlops-zoomcamp/blob/main/03-orchestration/3.0/README.md) for a Quick Start guide 20 | 21 | ## Need help? 22 | 23 | 1. [Developer documentation](https://docs.mage.ai/introduction/overview) 24 | 1. [AI chat bot](https://mageai.slack.com/archives/C05NYC4DADT) 25 | 1. Live chat with the [Mage team directly](https://mage.ai/chat) 26 | 27 | 28 | ## Notes 29 | 30 | Did you take notes? Add them here: 31 | 32 | * [Marcus' Notes for Ch3](https://github.com/mleiwe/mlops-zoomcamp/blob/Ch3_ML_Notes/cohorts/2024/03-orchestration/ML_Notes.md) 33 | * Send a PR, add your notes above this line 34 | 35 | ### Notes previous editions 36 | 37 | - [2022 Prefect notes](../../2022/03-orchestration/README.md) 38 | - [2023 Prefect notes](../../2023/03-orchestration/prefect/README.md) 39 | -------------------------------------------------------------------------------- /cohorts/2024/03-orchestration/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "module": { 3 | "number": 3, 4 | "title": "Orchestration and ML Pipelines" 5 | }, 6 | "units": [ 7 | { 8 | "number": 1, 9 | "title": "Introduction to Workflow Orchestration", 10 | "youtube": "https://www.youtube.com/watch?v=Cqb7wyaNF08" 11 | }, 12 | { 13 | "number": 2, 14 | "title": "Introduction to Prefect", 15 | "youtube": "https://www.youtube.com/watch?v=rTUBTvXvXvM" 16 | }, 17 | { 18 | "number": 3, 19 | "title": "Prefect Workflow", 20 | "youtube": "https://www.youtube.com/watch?v=x3bV8yMKjtc" 21 | }, 22 | { 23 | "number": 4, 24 | "title": "Deploying Your Workflow", 25 | "youtube": "https://www.youtube.com/watch?v=3YjagezFhOo" 26 | }, 27 | { 28 | "number": 5, 29 | "title": "Working with Deployments", 30 | "youtube": "https://www.youtube.com/watch?v=jVmaaqs63O8" 31 | }, 32 | { 33 | "number": 6, 34 | "title": "Prefect Cloud (optional)", 35 | "youtube": "https://www.youtube.com/watch?v=y89Ww85EUdo" 36 | }, 37 | { 38 | "number": 7, 39 | "title": "Homework", 40 | "youtube": "" 41 | } 42 | ] 43 | } -------------------------------------------------------------------------------- /cohorts/2024/03-orchestration/requirements.txt: -------------------------------------------------------------------------------- 1 | black==23.3.0 2 | fastparquet==2023.4.0 3 | hyperopt==0.2.7 4 | mlflow==2.3.1 5 | pandas==2.0.1 6 | prefect==2.10.8 7 | prefect-aws==0.3.1 8 | scikit_learn==1.2.2 9 | seaborn==0.12.2 10 | xgboost==1.7.5 11 | orjson==3.8.1 -------------------------------------------------------------------------------- /cohorts/2024/04-deployment/homework/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2024/04-deployment/homework/model.bin -------------------------------------------------------------------------------- 
/cohorts/2024/04-deployment/homework_solution/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10.13-slim 2 | 3 | WORKDIR /app 4 | COPY [ "model2.bin", "model.bin" ] -------------------------------------------------------------------------------- /cohorts/2024/04-deployment/homework_solution/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.5.0" 8 | pandas = "*" 9 | pyarrow = "*" 10 | 11 | [dev-packages] 12 | 13 | [requires] 14 | python_version = "3.10" 15 | -------------------------------------------------------------------------------- /cohorts/2024/04-deployment/homework_solution/batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import sys 5 | import os 6 | import pickle 7 | import pandas as pd 8 | 9 | 10 | year = int(sys.argv[1]) # 2023 11 | month = int(sys.argv[2]) # 4 12 | 13 | input_file = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet' 14 | output_file = f'output/yellow_tripdata_{year:04d}-{month:02d}.parquet' 15 | 16 | 17 | MODEL_FILE = os.getenv('MODEL_FILE', 'model.bin') 18 | 19 | with open(MODEL_FILE, 'rb') as f_in: 20 | dv, lr = pickle.load(f_in) 21 | 22 | 23 | categorical = ['PULocationID', 'DOLocationID'] 24 | 25 | def read_data(filename): 26 | df = pd.read_parquet(filename) 27 | 28 | df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime 29 | df['duration'] = df.duration.dt.total_seconds() / 60 30 | 31 | df = df[(df.duration >= 1) & (df.duration <= 60)].copy() 32 | 33 | df[categorical] = df[categorical].fillna(-1).astype('int').astype('str') 34 | 35 | return df 36 | 37 | 38 | df = read_data(input_file) 39 | df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str') 40 | 41 | 42 | dicts = df[categorical].to_dict(orient='records') 43 | X_val = dv.transform(dicts) 44 | y_pred = lr.predict(X_val) 45 | 46 | 47 | print('predicted mean duration:', y_pred.mean()) 48 | 49 | 50 | df_result = pd.DataFrame() 51 | df_result['ride_id'] = df['ride_id'] 52 | df_result['predicted_duration'] = y_pred 53 | 54 | 55 | os.makedirs('output', exist_ok=True) 56 | 57 | df_result.to_parquet( 58 | output_file, 59 | engine='pyarrow', 60 | compression=None, 61 | index=False 62 | ) 63 | -------------------------------------------------------------------------------- /cohorts/2024/04-deployment/homework_solution/homework.dockerfile: -------------------------------------------------------------------------------- 1 | FROM agrigorev/zoomcamp-model:mlops-2024-3.10.13-slim 2 | 3 | RUN pip install -U pip && pip install pipenv 4 | 5 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 6 | 7 | RUN pipenv install --system --deploy 8 | 9 | COPY [ "batch.py", "batch.py" ] 10 | 11 | ENTRYPOINT [ "python", "batch.py" ] -------------------------------------------------------------------------------- /cohorts/2024/04-deployment/homework_solution/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2024/04-deployment/homework_solution/model.bin -------------------------------------------------------------------------------- /cohorts/2024/04-deployment/homework_solution/model2.bin: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2024/04-deployment/homework_solution/model2.bin -------------------------------------------------------------------------------- /cohorts/2024/05-monitoring/homework.md: -------------------------------------------------------------------------------- 1 | ## Homework 2 | 3 | The goal of this homework is to familiarize users with monitoring for ML batch services, using a PostgreSQL database to store metrics and Grafana to visualize them. 4 | 5 | 6 | 7 | ## Q1. Prepare the dataset 8 | 9 | Start with `baseline_model_nyc_taxi_data.ipynb`. Download the March 2024 Green Taxi data. We will use this data to simulate production usage of a taxi trip duration prediction service. 10 | 11 | What is the shape of the downloaded data? How many rows are there? 12 | 13 | * 72044 14 | * 78537 15 | * 57457 16 | * 54396 17 | 18 | 19 | ## Q2. Metric 20 | 21 | Let's expand the number of data quality metrics we’d like to monitor! Please add one metric of your choice and a quantile value for the `"fare_amount"` column (`quantile=0.5`). 22 | 23 | Hint: explore the Evidently metric `ColumnQuantileMetric` (from `evidently.metrics import ColumnQuantileMetric`) 24 | 25 | What metric did you choose? 26 | 27 | 28 | ## Q3. Monitoring 29 | 30 | Let’s start monitoring. Run the expanded monitoring for a new batch of data (March 2024). 31 | 32 | What is the maximum value of the metric `quantile = 0.5` on the `"fare_amount"` column during March 2024 (calculated daily)? 33 | 34 | * 10 35 | * 12.5 36 | * 14.2 37 | * 14.8 38 | 39 | 40 | ## Q4. Dashboard 41 | 42 | 43 | Finally, let’s add panels with the newly added metrics to the dashboard. After we customize the dashboard, let's save the dashboard config so that we can access it later. Hint: click on “Save dashboard” to access the JSON configuration of the dashboard. This configuration should be saved locally. 44 | 45 | Where should you place the dashboard config file? 
46 | 47 | * `project_folder` (05-monitoring) 48 | * `project_folder/config` (05-monitoring/config) 49 | * `project_folder/dashboards` (05-monitoring/dashboards) 50 | * `project_folder/data` (05-monitoring/data) 51 | 52 | 53 | ## Submit the results 54 | 55 | * Submit your answers here: https://courses.datatalks.club/mlops-zoomcamp-2024/homework/hw5 56 | -------------------------------------------------------------------------------- /cohorts/2024/06-best-practices/homework/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10.9-slim 2 | 3 | RUN pip install -U pip && pip install pipenv 4 | 5 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 6 | 7 | RUN pipenv install --system --deploy 8 | 9 | COPY [ "batch.py", "batch.py" ] 10 | COPY [ "model.bin", "model.bin" ] 11 | 12 | ENTRYPOINT [ "python", "batch.py" ] -------------------------------------------------------------------------------- /cohorts/2024/06-best-practices/homework/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.5.0" 8 | pandas = "*" 9 | pyarrow = "*" 10 | s3fs = "*" 11 | 12 | [dev-packages] 13 | 14 | [requires] 15 | python_version = "3.10" 16 | -------------------------------------------------------------------------------- /cohorts/2024/06-best-practices/homework/batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import sys 5 | import pickle 6 | import pandas as pd 7 | 8 | 9 | year = int(sys.argv[1]) 10 | month = int(sys.argv[2]) 11 | 12 | input_file = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet' 13 | output_file = f'output/yellow_tripdata_{year:04d}-{month:02d}.parquet' 14 | 15 | 16 | with open('model.bin', 'rb') as f_in: 17 | dv, lr = pickle.load(f_in) 18 | 19 | 20 | categorical = ['PULocationID', 'DOLocationID'] 21 | 22 | def read_data(filename): 23 | df = pd.read_parquet(filename) 24 | 25 | df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime 26 | df['duration'] = df.duration.dt.total_seconds() / 60 27 | 28 | df = df[(df.duration >= 1) & (df.duration <= 60)].copy() 29 | 30 | df[categorical] = df[categorical].fillna(-1).astype('int').astype('str') 31 | 32 | return df 33 | 34 | 35 | df = read_data(input_file) 36 | df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str') 37 | 38 | 39 | dicts = df[categorical].to_dict(orient='records') 40 | X_val = dv.transform(dicts) 41 | y_pred = lr.predict(X_val) 42 | 43 | 44 | print('predicted mean duration:', y_pred.mean()) 45 | 46 | 47 | df_result = pd.DataFrame() 48 | df_result['ride_id'] = df['ride_id'] 49 | df_result['predicted_duration'] = y_pred 50 | 51 | 52 | df_result.to_parquet(output_file, engine='pyarrow', index=False) 53 | -------------------------------------------------------------------------------- /cohorts/2024/06-best-practices/homework/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2024/06-best-practices/homework/model.bin -------------------------------------------------------------------------------- /cohorts/2024/README.md: -------------------------------------------------------------------------------- 1 | ## MLOps 
-------------------------------------------------------------------------------- /cohorts/2024/06-best-practices/homework/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2024/06-best-practices/homework/model.bin -------------------------------------------------------------------------------- /cohorts/2024/README.md: -------------------------------------------------------------------------------- 1 | ## MLOps Zoomcamp 2024 Cohort 2 | 3 | * [Pre-Course Live Q&A](https://www.youtube.com/watch?v=YmllO3ld5LE) 4 | * [Course Launch video](https://www.youtube.com/watch?v=2jM7t-NTZxs) and [Slides](https://docs.google.com/presentation/d/1Tp2VVph5_vYIazQ53VR7TYmhJjQg9wuNIKKne3wlZVU/edit?usp=sharing) 5 | * [Technical FAQ](https://docs.google.com/document/d/12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0/edit) 6 | * TODO: Course Playlist: Only 2024 Live videos & homeworks 7 | * [Course management platform](https://courses.datatalks.club/mlops-zoomcamp-2024/) 8 | 9 | 10 | 11 | [**Module 1: Introduction**](01-intro) 12 | 13 | * [Homework](01-intro/homework.md) 14 | * [Solution](01-intro/homework.ipynb) 15 | 16 | [**Module 2: Experiment Tracking**](02-experiment-tracking/) 17 | 18 | * [Homework](02-experiment-tracking/homework.md) 19 | * [Solution](02-experiment-tracking/solution) 20 | 21 | [**Module 3: Orchestration and ML pipelines**](03-orchestration/) 22 | 23 | * [Homework](03-orchestration/homework.md) 24 | * [Solution](https://www.loom.com/share/802c8c0b843a4d3bbd9dbea240c3593a) 25 | 26 | [**Module 4: Model Deployment**](04-deployment) 27 | 28 | * [Homework](04-deployment/homework.md) 29 | 30 | [**Module 5: Model Monitoring**](05-monitoring/) 31 | 32 | * [Homework](05-monitoring/homework.md) 33 | 34 | [**Module 6: Best Practices**](06-best-practices) 35 | 36 | * [Homework](06-best-practices/homework.md) 37 | 38 | 39 | [**Project**](project.md) 40 | 41 | * [More information](project.md) 42 | -------------------------------------------------------------------------------- /cohorts/2024/project.md: -------------------------------------------------------------------------------- 1 | ## Course Project 2 | 3 | The goal of this project is to apply everything we learned 4 | in this course and build an end-to-end machine learning project. 5 | 6 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, your project can't be considered complete. 7 | 8 | 9 | ## Submitting 10 | 11 | ### Project Attempt #1 12 | 13 | * Project: https://courses.datatalks.club/mlops-zoomcamp-2024/project/project1 14 | * Review: https://courses.datatalks.club/mlops-zoomcamp-2024/project/project1/eval 15 | 16 | 17 | ### Project Attempt #2 18 | 19 | * Project: https://courses.datatalks.club/mlops-zoomcamp-2024/project/project2 20 | * Review: https://courses.datatalks.club/mlops-zoomcamp-2024/project/project2/eval 21 | 22 | 23 | > **Important**: update your "Certificate name" here: https://courses.datatalks.club/mlops-zoomcamp-2024/enrollment - 24 | this is what we will use when generating certificates for you. 25 | 26 | 27 | ## Evaluation criteria 28 | 29 | See [here](../../07-project/README.md) 30 | -------------------------------------------------------------------------------- /cohorts/2025/01-intro/homework.md: -------------------------------------------------------------------------------- 1 | ## Homework 2 | 3 | The goal of this homework is to train a simple model for predicting the duration of a ride - similar to what we did in this module. 4 | 5 | 6 | ## Q1. Downloading the data 7 | 8 | We'll use [the same NYC taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page), 9 | but instead of "**Green** Taxi Trip Records", we'll use "**Yellow** Taxi Trip Records". 10 | 11 | Download the data for January and February 2023. 12 | 13 | Read the data for January. How many columns are there? 14 | 15 | * 16 16 | * 17 17 | * 18 18 | * 19 19 | 20 |
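A minimal sketch for Q1, assuming the January file has already been downloaded (the local filename is illustrative):

```python
import pandas as pd

df = pd.read_parquet('yellow_tripdata_2023-01.parquet')

print(len(df.columns))  # number of columns in the January data
```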
21 | ## Q2. Computing duration 22 | 23 | Now let's compute the `duration` variable. It should contain the duration of a ride in minutes. 24 | 25 | What's the standard deviation of the trip durations in January? 26 | 27 | * 32.59 28 | * 42.59 29 | * 52.59 30 | * 62.59 31 | 32 | 33 | ## Q3. Dropping outliers 34 | 35 | Next, we need to check the distribution of the `duration` variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive). 36 | 37 | What fraction of the records is left after you drop the outliers? 38 | 39 | * 90% 40 | * 92% 41 | * 95% 42 | * 98% 43 | 44 | 45 | ## Q4. One-hot encoding 46 | 47 | Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model. 48 | 49 | * Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise the vectorizer will 50 | label-encode them) 51 | * Fit a dictionary vectorizer 52 | * Get a feature matrix from it 53 | 54 | What's the dimensionality of this matrix (number of columns)? 55 | 56 | * 2 57 | * 155 58 | * 345 59 | * 515 60 | * 715 61 | 62 | 63 | ## Q5. Training a model 64 | 65 | Now let's use the feature matrix from the previous step to train a model. 66 | 67 | * Train a plain linear regression model with default parameters, where duration is the response variable 68 | * Calculate the RMSE of the model on the training data 69 | 70 | What's the RMSE on the training data? 71 | 72 | * 3.64 73 | * 7.64 74 | * 11.64 75 | * 16.64 76 | 77 | 78 | ## Q6. Evaluating the model 79 | 80 | Now let's apply this model to the validation dataset (February 2023). 81 | 82 | What's the RMSE on validation? 83 | 84 | * 3.81 85 | * 7.81 86 | * 11.81 87 | * 16.81 88 |
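A condensed sketch of the whole Q2-Q6 pipeline. The local file paths are illustrative, and `root_mean_squared_error` assumes scikit-learn >= 1.4 (the version used elsewhere in this repo):

```python
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

categorical = ['PULocationID', 'DOLocationID']

def prepare(filename):
    # Q2/Q3: compute duration in minutes and drop outliers
    df = pd.read_parquet(filename)
    df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
    df[categorical] = df[categorical].astype(str)  # so the vectorizer one-hot encodes
    return df

df_train = prepare('yellow_tripdata_2023-01.parquet')
df_val = prepare('yellow_tripdata_2023-02.parquet')

# Q4: one-hot encode via a dictionary vectorizer
dv = DictVectorizer()
X_train = dv.fit_transform(df_train[categorical].to_dict(orient='records'))
X_val = dv.transform(df_val[categorical].to_dict(orient='records'))

# Q5: plain linear regression with duration as the response variable
lr = LinearRegression().fit(X_train, df_train.duration.values)

print(X_train.shape[1])                                                 # Q4: dimensionality
print(root_mean_squared_error(df_train.duration, lr.predict(X_train)))  # Q5: train RMSE
print(root_mean_squared_error(df_val.duration, lr.predict(X_val)))      # Q6: validation RMSE
```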
89 | ## Submit the results 90 | 91 | * Submit your results here: https://courses.datatalks.club/mlops-zoomcamp-2025/homework/hw1 92 | * If your answer doesn't match options exactly, select the closest one 93 | -------------------------------------------------------------------------------- /cohorts/2025/02-experiment-tracking/homework/hpo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import click 4 | import mlflow 5 | import numpy as np 6 | from hyperopt import STATUS_OK, Trials, fmin, hp, tpe 7 | from hyperopt.pyll import scope 8 | from sklearn.ensemble import RandomForestRegressor 9 | from sklearn.metrics import root_mean_squared_error 10 | 11 | mlflow.set_tracking_uri("http://127.0.0.1:5000") 12 | mlflow.set_experiment("random-forest-hyperopt") 13 | 14 | 15 | def load_pickle(filename: str): 16 | with open(filename, "rb") as f_in: 17 | return pickle.load(f_in) 18 | 19 | 20 | @click.command() 21 | @click.option( 22 | "--data_path", 23 | default="./output", 24 | help="Location where the processed NYC taxi trip data was saved" 25 | ) 26 | @click.option( 27 | "--num_trials", 28 | default=15, 29 | help="The number of parameter evaluations for the optimizer to explore" 30 | ) 31 | def run_optimization(data_path: str, num_trials: int): 32 | 33 | X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl")) 34 | X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl")) 35 | 36 | def objective(params): 37 | 38 | rf = RandomForestRegressor(**params) 39 | rf.fit(X_train, y_train) 40 | y_pred = rf.predict(X_val) 41 | rmse = root_mean_squared_error(y_val, y_pred) 42 | 43 | return {'loss': rmse, 'status': STATUS_OK} 44 | 45 | search_space = { 46 | 'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)), 47 | 'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)), 48 | 'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)), 49 | 'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)), 50 | 'random_state': 42 51 | } 52 | 53 | rstate = np.random.default_rng(42) # for reproducible results 54 | fmin( 55 | fn=objective, 56 | space=search_space, 57 | algo=tpe.suggest, 58 | max_evals=num_trials, 59 | trials=Trials(), 60 | rstate=rstate 61 | ) 62 | 63 | 64 | if __name__ == '__main__': 65 | run_optimization() 66 |
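As written, `hpo.py` points at a tracking server but the `objective` function never logs anything. One way you might record each trial is sketched below; this is a suggestion rather than the official solution. It reuses the imports and the `X_train`/`y_train`/`X_val`/`y_val` variables already in scope, and would replace the nested `objective` inside `run_optimization` (keeping its indentation):

```python
def objective(params):
    # one MLflow run per hyperopt trial
    with mlflow.start_run():
        mlflow.log_params(params)

        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)
        rmse = root_mean_squared_error(y_val, rf.predict(X_val))

        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}
```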
-------------------------------------------------------------------------------- /cohorts/2025/02-experiment-tracking/homework/register_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import click 4 | import mlflow 5 | 6 | from mlflow.entities import ViewType 7 | from mlflow.tracking import MlflowClient 8 | from sklearn.ensemble import RandomForestRegressor 9 | from sklearn.metrics import root_mean_squared_error 10 | 11 | HPO_EXPERIMENT_NAME = "random-forest-hyperopt" 12 | EXPERIMENT_NAME = "random-forest-best-models" 13 | RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state'] 14 | 15 | mlflow.set_tracking_uri("http://127.0.0.1:5000") 16 | mlflow.set_experiment(EXPERIMENT_NAME) 17 | mlflow.sklearn.autolog() 18 | 19 | 20 | def load_pickle(filename): 21 | with open(filename, "rb") as f_in: 22 | return pickle.load(f_in) 23 | 24 | 25 | def train_and_log_model(data_path, params): 26 | X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl")) 27 | X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl")) 28 | X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl")) 29 | 30 | with mlflow.start_run(): 31 | new_params = {} 32 | for param in RF_PARAMS: 33 | new_params[param] = int(params[param]) 34 | 35 | rf = RandomForestRegressor(**new_params) 36 | rf.fit(X_train, y_train) 37 | 38 | # Evaluate model on the validation and test sets 39 | val_rmse = root_mean_squared_error(y_val, rf.predict(X_val)) 40 | mlflow.log_metric("val_rmse", val_rmse) 41 | test_rmse = root_mean_squared_error(y_test, rf.predict(X_test)) 42 | mlflow.log_metric("test_rmse", test_rmse) 43 | 44 | 45 | @click.command() 46 | @click.option( 47 | "--data_path", 48 | default="./output", 49 | help="Location where the processed NYC taxi trip data was saved" 50 | ) 51 | @click.option( 52 | "--top_n", 53 | default=5, 54 | type=int, 55 | help="Number of top models that need to be evaluated to decide which one to promote" 56 | ) 57 | def run_register_model(data_path: str, top_n: int): 58 | 59 | client = MlflowClient() 60 | 61 | # Retrieve the top_n model runs and log the models 62 | experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME) 63 | runs = client.search_runs( 64 | experiment_ids=experiment.experiment_id, 65 | run_view_type=ViewType.ACTIVE_ONLY, 66 | max_results=top_n, 67 | order_by=["metrics.rmse ASC"] 68 | ) 69 | for run in runs: 70 | train_and_log_model(data_path=data_path, params=run.data.params) 71 | 72 | # Select the model with the lowest test RMSE 73 | experiment = client.get_experiment_by_name(EXPERIMENT_NAME) 74 | # best_run = client.search_runs( ... )[0] 75 | 76 | # Register the best model 77 | # mlflow.register_model( ... ) 78 | 79 | 80 | if __name__ == '__main__': 81 | run_register_model() 82 | -------------------------------------------------------------------------------- /cohorts/2025/02-experiment-tracking/homework/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import click 4 | 5 | from sklearn.ensemble import RandomForestRegressor 6 | from sklearn.metrics import root_mean_squared_error 7 | 8 | 9 | def load_pickle(filename: str): 10 | with open(filename, "rb") as f_in: 11 | return pickle.load(f_in) 12 | 13 | 14 | @click.command() 15 | @click.option( 16 | "--data_path", 17 | default="./output", 18 | help="Location where the processed NYC taxi trip data was saved" 19 | ) 20 | def run_train(data_path: str): 21 | 22 | X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl")) 23 | X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl")) 24 | 25 | rf = RandomForestRegressor(max_depth=10, random_state=0) 26 | rf.fit(X_train, y_train) 27 | y_pred = rf.predict(X_val) 28 | 29 | rmse = root_mean_squared_error(y_val, y_pred) 30 | 31 | 32 | if __name__ == '__main__': 33 | run_train() 34 | -------------------------------------------------------------------------------- /cohorts/2025/03-orchestration/homework.md: -------------------------------------------------------------------------------- 1 | ## Homework 2 | 3 | The goal of this homework is to create a simple training pipeline: use MLflow to track experiments and register the best model, and use an orchestrator of your choice to run the pipeline. 4 | 5 | We'll use [the same NYC taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page), the **Yellow** taxi data for March 2023. 6 | 7 | ## Question 1. Select the Tool 8 | 9 | You can use the same tool you used when completing the module, 10 | or choose a different one for your homework. 11 | 12 | What's the name of the orchestrator you chose? 13 | 14 | 15 | ## Question 2. Version 16 | 17 | What's the version of the orchestrator? 18 | 19 | 20 | ## Question 3. Creating a pipeline 21 | 22 | Let's read the March 2023 Yellow taxi trips data. 23 | 24 | How many records did we load? 25 | 26 | - 3,003,766 27 | - 3,203,766 28 | - 3,403,766 29 | - 3,603,766 30 | 31 | (Include a print statement in your code) 32 | 33 | ## Question 4. Data preparation 34 | 35 | Let's continue with pipeline creation. 36 | 37 | We will use the same logic for preparing the data we used previously. 38 | 39 | This is what we used (adjusted for the yellow dataset): 40 | 41 | ```python 42 | def read_dataframe(filename): 43 | df = pd.read_parquet(filename) 44 | 45 | df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime 46 | df.duration = df.duration.dt.total_seconds() / 60 47 | 48 | df = df[(df.duration >= 1) & (df.duration <= 60)] 49 | 50 | categorical = ['PULocationID', 'DOLocationID'] 51 | df[categorical] = df[categorical].astype(str) 52 | 53 | return df 54 | ``` 55 | 56 | Let's apply it to the data we loaded in question 3 (a quick check is sketched after the options below). 57 | 58 | What's the size of the result? 59 | 60 | - 2,903,766 61 | - 3,103,766 62 | - 3,316,216 63 | - 3,503,766 64 |
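A quick way to produce the Q3 and Q4 counts with that helper. The URL is illustrative, and it assumes `read_dataframe` from the snippet above is in scope:

```python
import pandas as pd

url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet'

df_raw = pd.read_parquet(url)
print(len(df_raw))   # Q3: number of records loaded

df = read_dataframe(url)
print(len(df))       # Q4: size of the result after preparation
```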
65 | ## Question 5. Train a model 66 | 67 | We will now train a linear regression model using the same code as in homework 1. 68 | 69 | * Fit a dict vectorizer. 70 | * Train a linear regression with default parameters. 71 | * Use pickup and drop-off locations separately; don't create a combination feature. 72 | 73 | Let's now use it in the pipeline. We will need to create another transformation block, and return both the dict vectorizer and the model. 74 | 75 | What's the intercept of the model? 76 | 77 | Hint: print the `intercept_` field in the code block 78 | 79 | - 21.77 80 | - 24.77 81 | - 27.77 82 | - 31.77 83 | 84 | ## Question 6. Register the model 85 | 86 | The model is trained, so let's save it with MLflow (a sketch follows the options below). 87 | 88 | Find the logged model and locate its `MLmodel` file. What's the size of the model (the `model_size_bytes` field)? 89 | 90 | * 14,534 91 | * 9,534 92 | * 4,534 93 | * 1,534 94 | 95 |
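A sketch of how the trained artifacts might be logged so that the `MLmodel` file (which contains `model_size_bytes`) shows up in the run. It assumes `dv` and `lr` from Question 5 and a tracking server running locally; the URI, experiment name, and artifact paths are illustrative:

```python
import pickle
import mlflow

mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('nyc-taxi-homework-3')

with mlflow.start_run():
    # the dict vectorizer goes in as a plain artifact
    with open('dict_vectorizer.bin', 'wb') as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact('dict_vectorizer.bin')

    # MLflow writes the MLmodel metadata file next to the model weights
    mlflow.sklearn.log_model(lr, artifact_path='model')
```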
96 | ## Submit the results 97 | 98 | * Submit your results here: https://courses.datatalks.club/mlops-zoomcamp-2025/homework/hw3 99 | * If your answer doesn't match options exactly, select the closest one. -------------------------------------------------------------------------------- /cohorts/2025/04-deployment/homework/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2025/04-deployment/homework/model.bin -------------------------------------------------------------------------------- /cohorts/2025/05-monitoring/homework.md: -------------------------------------------------------------------------------- 1 | ## Homework 2 | 3 | The goal of this homework is to familiarize you with monitoring for ML batch services, using a PostgreSQL database to store metrics and Grafana to visualize them. 4 | 5 | 6 | 7 | ## Q1. Prepare the dataset 8 | 9 | Start with `baseline_model_nyc_taxi_data.ipynb`. Download the March 2024 Green Taxi data. We will use this data to simulate production usage of a taxi trip duration prediction service. 10 | 11 | What is the shape of the downloaded data? How many rows are there? 12 | 13 | * 72044 14 | * 78537 15 | * 57457 16 | * 54396 17 | 18 | 19 | ## Q2. Metric 20 | 21 | Let’s expand the number of data quality metrics we’d like to monitor! Please add one metric of your choice and a quantile value for the `"fare_amount"` column (`quantile=0.5`). 22 | 23 | Hint: explore the Evidently metric `ColumnQuantileMetric` (`from evidently.metrics import ColumnQuantileMetric`) 24 | 25 | What metric did you choose? 26 | 27 | 28 | ## Q3. Monitoring 29 | 30 | Let’s start monitoring. Run the expanded monitoring for a new batch of data (March 2024). 31 | 32 | What is the maximum value of the `quantile = 0.5` metric on the `"fare_amount"` column during March 2024 (calculated daily)? 33 | 34 | * 10 35 | * 12.5 36 | * 14.2 37 | * 14.8 38 | 39 | 40 | ## Q4. Dashboard 41 | 42 | 43 | Finally, let’s add panels with the newly added metrics to the dashboard. After customizing the dashboard, let’s save the dashboard config so that we can access it later. Hint: click on “Save dashboard” to access the JSON configuration of the dashboard. This configuration should be saved locally. 44 | 45 | Where should you place the dashboard config file? 46 | 47 | * `project_folder` (05-monitoring) 48 | * `project_folder/config` (05-monitoring/config) 49 | * `project_folder/dashboards` (05-monitoring/dashboards) 50 | * `project_folder/data` (05-monitoring/data) 51 | 52 | 53 | ## Submit the results 54 | 55 | * Submit your answers here: https://courses.datatalks.club/mlops-zoomcamp-2025/homework/hw5 56 | -------------------------------------------------------------------------------- /cohorts/2025/06-best-practices/homework/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10.9-slim 2 | 3 | RUN pip install -U pip && pip install pipenv 4 | 5 | COPY [ "Pipfile", "Pipfile.lock", "./" ] 6 | 7 | RUN pipenv install --system --deploy 8 | 9 | COPY [ "batch.py", "batch.py" ] 10 | COPY [ "model.bin", "model.bin" ] 11 | 12 | ENTRYPOINT [ "python", "batch.py" ] -------------------------------------------------------------------------------- /cohorts/2025/06-best-practices/homework/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | scikit-learn = "==1.5.0" 8 | pandas = "*" 9 | pyarrow = "*" 10 | s3fs = "*" 11 | 12 | [dev-packages] 13 | 14 | [requires] 15 | python_version = "3.10" 16 | -------------------------------------------------------------------------------- /cohorts/2025/06-best-practices/homework/batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import sys 5 | import pickle 6 | import pandas as pd 7 | 8 | 9 | year = int(sys.argv[1]) 10 | month = int(sys.argv[2]) 11 | 12 | input_file = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet' 13 | output_file = f'output/yellow_tripdata_{year:04d}-{month:02d}.parquet' 14 | 15 | 16 | with open('model.bin', 'rb') as f_in: 17 | dv, lr = pickle.load(f_in) 18 | 19 | 20 | categorical = ['PULocationID', 'DOLocationID'] 21 | 22 | def read_data(filename): 23 | df = pd.read_parquet(filename) 24 | 25 | df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime 26 | df['duration'] = df.duration.dt.total_seconds() / 60 27 | 28 | df = df[(df.duration >= 1) & (df.duration <= 60)].copy() 29 | 30 | df[categorical] = df[categorical].fillna(-1).astype('int').astype('str') 31 | 32 | return df 33 | 34 | 35 | df = read_data(input_file) 36 | df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str') 37 | 38 | 39 | dicts = df[categorical].to_dict(orient='records') 40 | X_val = dv.transform(dicts) 41 | y_pred = lr.predict(X_val) 42 | 43 | 44 | print('predicted mean duration:', y_pred.mean()) 45 | 46 | 47 | df_result = pd.DataFrame() 48 | df_result['ride_id'] = df['ride_id'] 49 | df_result['predicted_duration'] = y_pred 50 | 51 | 52 | df_result.to_parquet(output_file, engine='pyarrow', index=False) 53 | -------------------------------------------------------------------------------- /cohorts/2025/06-best-practices/homework/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/cohorts/2025/06-best-practices/homework/model.bin -------------------------------------------------------------------------------- /cohorts/2025/README.md: -------------------------------------------------------------------------------- 1 | ## MLOps
Zoomcamp 2025 Cohort 2 | 3 | * [Pre-Course Live Q&A](https://www.youtube.com/watch?v=rv43YJQsZIw) 4 | * [Course Launch video](https://youtube.com/live/qqZU8nBtH90) and [Slides](https://docs.google.com/presentation/d/10dP4KoVpMA1iMGBk-XWp3YcHjukoM7AxZ2v4LuZd9wE/edit?usp=sharing) 5 | * [Technical FAQ](https://docs.google.com/document/d/12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0/edit) 6 | * [Course management platform](https://courses.datatalks.club/mlops-zoomcamp-2025/) 7 | 8 | 9 | 10 | [**Module 1: Introduction**](01-intro) 11 | 12 | * [Homework](01-intro/homework.md) 13 | 14 | [**Module 2: Experiment Tracking**](02-experiment-tracking/) 15 | 16 | * [Homework](02-experiment-tracking/homework.md) 17 | 18 | [**Module 3: Orchestration and ML pipelines**](03-orchestration/) 19 | 20 | * [Homework](03-orchestration/homework.md) 21 | 22 | [**Module 4: Model Deployment**](04-deployment) 23 | 24 | * [Homework](04-deployment/homework.md) 25 | 26 | [**Module 5: Model Monitoring**](05-monitoring/) 27 | 28 | * [Homework](05-monitoring/homework.md) 29 | 30 | [**Module 6: Best Practices**](06-best-practices) 31 | 32 | * [Homework](06-best-practices/homework.md) 33 | 34 | 35 | [**Project**](project.md) 36 | 37 | * [More information](project.md) 38 | -------------------------------------------------------------------------------- /cohorts/2025/project.md: -------------------------------------------------------------------------------- 1 | ## Course Project 2 | 3 | The goal of this project is to apply everything we learned 4 | in this course and build an end-to-end machine learning project. 5 | 6 | Remember that to pass the project, you must evaluate 3 peers. If you don't do that, your project can't be considered complete. 7 | 8 | 9 | ## Submitting 10 | 11 | ### Project Attempt #1 12 | 13 | * Project: https://courses.datatalks.club/mlops-zoomcamp-2025/project/project1 14 | * Review: https://courses.datatalks.club/mlops-zoomcamp-2025/project/project1/eval 15 | 16 | 17 | ### Project Attempt #2 18 | 19 | * Project: https://courses.datatalks.club/mlops-zoomcamp-2025/project/project2 20 | * Review: https://courses.datatalks.club/mlops-zoomcamp-2025/project/project2/eval 21 | 22 | 23 | > **Important**: update your "Certificate name" here: https://courses.datatalks.club/mlops-zoomcamp-2025/enrollment - 24 | this is what we will use when generating certificates for you.
25 | 26 | 27 | ## Evaluation criteria 28 | 29 | See [here](../../07-project/README.md) 30 | -------------------------------------------------------------------------------- /images/IMG_20230323_134059_927.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/images/IMG_20230323_134059_927.png -------------------------------------------------------------------------------- /images/banner-2025.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/images/banner-2025.jpg -------------------------------------------------------------------------------- /images/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/images/banner.png -------------------------------------------------------------------------------- /images/learning-in-public-links.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/images/learning-in-public-links.png -------------------------------------------------------------------------------- /images/learning-in-public.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/images/learning-in-public.png -------------------------------------------------------------------------------- /images/play.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/2747b01f1a08783b38b8d2cbc8b68a17a7c6767b/images/play.png -------------------------------------------------------------------------------- /learning-in-public.md: -------------------------------------------------------------------------------- 1 | # Learning in public 2 | 3 | Most people learn in private: they consume content but don't tell 4 | anyone about it. There's nothing wrong with that. 5 | 6 | But we want to encourage you to document your progress and 7 | share it publicly on social media. 8 | 9 | It helps you get noticed and can lead to: 10 | 11 | * Expanding your network: meeting new people and making new friends 12 | * Being invited to meetups, conferences and podcasts 13 | * Landing a job or getting clients 14 | * Many other good things 15 | 16 | Here's a more comprehensive read on why you might want to do it: https://github.com/readme/guides/publishing-your-work 17 | 18 | 19 | ## Learning in Public for Zoomcamps 20 | 21 | When you submit your homework or project, you can also submit 22 | learning in public posts: 23 | 24 | 25 | 26 | You can watch this video to see what your learning in public posts may look like: 27 | 28 | 29 | 30 | 31 | 32 | 33 | Send a PR if you want to suggest improvements to this document --------------------------------------------------------------------------------