├── .cruft.json ├── .env.example ├── .gemini └── config.yaml ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── feat-request.md │ └── fix-request.md ├── PULL_REQUEST_TEMPLATE.md ├── actions │ └── setup │ │ └── action.yml ├── dependabot.yml ├── rulesets │ └── main.json └── workflows │ ├── check.yml │ └── publish.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE.txt ├── MLproject ├── README.md ├── confs ├── evaluations.yaml ├── explanations.yaml ├── inference.yaml ├── promotion.yaml ├── training.yaml └── tuning.yaml ├── constraints.txt ├── data ├── Readme.txt ├── hour.csv ├── inputs_test.parquet ├── inputs_train.parquet ├── targets_test.parquet └── targets_train.parquet ├── docker-compose.yml ├── images └── mlopsmindmap.png ├── justfile ├── mlops-python-package.code-workspace ├── notebooks ├── explain.ipynb ├── indicators.ipynb ├── processing.ipynb └── prototype.ipynb ├── outputs └── .gitkeep ├── pyproject.toml ├── python_env.yaml ├── requirements.txt ├── src └── bikes │ ├── __init__.py │ ├── __main__.py │ ├── core │ ├── __init__.py │ ├── metrics.py │ ├── models.py │ └── schemas.py │ ├── io │ ├── __init__.py │ ├── configs.py │ ├── datasets.py │ ├── registries.py │ └── services.py │ ├── jobs │ ├── __init__.py │ ├── base.py │ ├── evaluations.py │ ├── explanations.py │ ├── inference.py │ ├── promotion.py │ ├── training.py │ └── tuning.py │ ├── scripts.py │ ├── settings.py │ └── utils │ ├── __init__.py │ ├── searchers.py │ ├── signers.py │ └── splitters.py ├── tasks ├── check.just ├── clean.just ├── commit.just ├── doc.just ├── docker.just ├── format.just ├── install.just ├── mlflow.just ├── package.just └── project.just ├── tests ├── confs │ ├── invalid │ │ └── 1. invalid.yaml │ └── valid │ │ ├── 0. tuning.yaml │ │ ├── 1. training.yaml │ │ ├── 2. promotion.yaml │ │ ├── 3. inference.yaml │ │ ├── 5. evaluations.yaml │ │ └── 6. 
explanations.yaml ├── conftest.py ├── core │ ├── test_metrics.py │ ├── test_models.py │ └── test_schemas.py ├── data │ ├── inputs_sample.parquet │ ├── outputs_sample.parquet │ └── targets_sample.parquet ├── io │ ├── test_configs.py │ ├── test_datasets.py │ ├── test_registries.py │ └── test_services.py ├── jobs │ ├── test_base.py │ ├── test_evaluations.py │ ├── test_explanations.py │ ├── test_inference.py │ ├── test_promotion.py │ ├── test_training.py │ └── test_tuning.py ├── test_scripts.py └── utils │ ├── test_searchers.py │ ├── test_signers.py │ └── test_splitters.py └── uv.lock /.cruft.json: -------------------------------------------------------------------------------- 1 | { 2 | "template": "https://github.com/fmind/cookiecutter-mlops-package", 3 | "commit": "2ce51abb4333d594baee46ce590ead4e4cd76142", 4 | "checkout": null, 5 | "context": { 6 | "cookiecutter": { 7 | "user": "fmind", 8 | "name": "MLOps Python Package", 9 | "repository": "mlops-python-package", 10 | "package": "bikes", 11 | "license": "MIT", 12 | "version": "4.0.0", 13 | "description": "Predict the number of bikes available", 14 | "python_version": "3.13", 15 | "mlflow_version": "2.20.3", 16 | "_template": "https://github.com/fmind/cookiecutter-mlops-package" 17 | } 18 | }, 19 | "directory": null 20 | } 21 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/.env.example -------------------------------------------------------------------------------- /.gemini/config.yaml: -------------------------------------------------------------------------------- 1 | # https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github 2 | have_fun: false 3 | code_review: 4 | disable: false 5 | comment_severity_threshold: MEDIUM 6 | max_review_comments: -1 7 | 
pull_request_opened: 8 | help: false 9 | summary: true 10 | code_review: true 11 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | # github: ["MLOps-Courses"] 4 | custom: ["https://donate.stripe.com/4gw8xT9oVbCc98s7ss"] 5 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feat-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: A new feature. 4 | title: "[FEAT] " 5 | labels: feat 6 | assignees: fmind 7 | --- 8 | 9 | ## Description 10 | 11 | ## Motivation 12 | 13 | ## Solutions 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/fix-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Fix Request 3 | about: A bug fix 4 | title: "[FIX] " 5 | labels: fix 6 | assignees: fmind 7 | --- 8 | 9 | ## Bug Description 10 | 11 | ## Expected Behavior 12 | 13 | ## Steps to Reproduce 14 | 15 | ## Additional Context 16 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # Changes 2 | 3 | # Reasons 4 | 5 | # Testing 6 | 7 | # Impacts 8 | 9 | # Notes 10 | -------------------------------------------------------------------------------- /.github/actions/setup/action.yml: -------------------------------------------------------------------------------- 1 | name: Setup 2 | description: Setup for project workflows 3 | runs: 4 | using: composite 5 | steps: 6 | - name: Install uv 7 | uses: astral-sh/setup-uv@v5 8 | with: 9 | enable-cache: true 10 | - name: Setup Python 11 | uses: 
actions/setup-python@v5 12 | with: 13 | python-version-file: .python-version 14 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # https://docs.github.com/en/code-security/dependabot/working-with-dependabot/dependabot-options-reference 2 | version: 2 3 | updates: 4 | - package-ecosystem: "pip" 5 | directory: "/" 6 | schedule: 7 | interval: "weekly" 8 | -------------------------------------------------------------------------------- /.github/rulesets/main.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "main", 3 | "target": "branch", 4 | "enforcement": "active", 5 | "conditions": { 6 | "ref_name": { 7 | "exclude": [], 8 | "include": [ 9 | "~DEFAULT_BRANCH" 10 | ] 11 | } 12 | }, 13 | "rules": [ 14 | { 15 | "type": "deletion" 16 | }, 17 | { 18 | "type": "required_linear_history" 19 | }, 20 | { 21 | "type": "pull_request", 22 | "parameters": { 23 | "required_approving_review_count": 0, 24 | "dismiss_stale_reviews_on_push": true, 25 | "require_code_owner_review": false, 26 | "require_last_push_approval": false, 27 | "required_review_thread_resolution": false, 28 | "allowed_merge_methods": [ 29 | "squash", 30 | "rebase" 31 | ] 32 | } 33 | }, 34 | { 35 | "type": "required_status_checks", 36 | "parameters": { 37 | "strict_required_status_checks_policy": true, 38 | "do_not_enforce_on_create": false, 39 | "required_status_checks": [ 40 | { 41 | "context": "checks", 42 | "integration_id": 15368 43 | } 44 | ] 45 | } 46 | }, 47 | { 48 | "type": "non_fast_forward" 49 | } 50 | ], 51 | "bypass_actors": [ 52 | { 53 | "actor_id": 5, 54 | "actor_type": "RepositoryRole", 55 | "bypass_mode": "always" 56 | } 57 | ] 58 | } 59 | -------------------------------------------------------------------------------- /.github/workflows/check.yml: 
-------------------------------------------------------------------------------- 1 | name: Check 2 | on: 3 | pull_request: 4 | branches: 5 | - '*' 6 | concurrency: 7 | cancel-in-progress: true 8 | group: ${{ github.workflow }}-${{ github.ref }} 9 | jobs: 10 | checks: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - uses: ./.github/actions/setup 15 | - run: uv sync --group=check 16 | - run: uv run just check-code 17 | - run: uv run just check-type 18 | - run: uv run just check-format 19 | - run: uv run just check-security 20 | - run: uv run just check-coverage 21 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: 3 | release: 4 | types: 5 | - edited 6 | - published 7 | env: 8 | DOCKER_IMAGE: ghcr.io/fmind/mlops-python-package 9 | concurrency: 10 | cancel-in-progress: true 11 | group: publish-workflow 12 | jobs: 13 | pages: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v4 17 | - uses: ./.github/actions/setup 18 | - run: uv sync --group=doc 19 | - run: uv run just doc 20 | - uses: JamesIves/github-pages-deploy-action@v4 21 | with: 22 | folder: docs/ 23 | branch: gh-pages 24 | packages: 25 | permissions: 26 | packages: write 27 | runs-on: ubuntu-latest 28 | steps: 29 | - uses: actions/checkout@v4 30 | - uses: ./.github/actions/setup 31 | - run: uv sync --only-dev 32 | - run: uv run just package 33 | - uses: docker/login-action@v3 34 | with: 35 | registry: ghcr.io 36 | username: ${{ github.actor }} 37 | password: ${{ secrets.GITHUB_TOKEN }} 38 | - uses: docker/setup-buildx-action@v3 39 | - uses: docker/build-push-action@v6 40 | with: 41 | push: true 42 | context: . 
43 | cache-to: type=gha 44 | cache-from: type=gha 45 | tags: | 46 | ${{ env.DOCKER_IMAGE }}:latest 47 | ${{ env.DOCKER_IMAGE }}:${{ github.ref_name }} 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # https://git-scm.com/docs/gitignore 2 | 3 | # Build 4 | /dist/ 5 | /build/ 6 | 7 | # Cache 8 | .cache/ 9 | .coverage* 10 | .mypy_cache/ 11 | .ruff_cache/ 12 | .pytest_cache/ 13 | 14 | # Editor 15 | /.idea/ 16 | /.vscode/ 17 | .ipynb_checkpoints/ 18 | 19 | # Environs 20 | .env 21 | /.venv/ 22 | 23 | # Project 24 | /docs/* 25 | /mlruns/* 26 | /outputs/* 27 | !**/.gitkeep 28 | 29 | # Python 30 | *.py[cod] 31 | __pycache__/ 32 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # https://pre-commit.com 2 | # https://pre-commit.com/hooks.html 3 | 4 | default_language_version: 5 | python: python3.13 6 | repos: 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: 'v5.0.0' 9 | hooks: 10 | - id: check-added-large-files 11 | - id: check-case-conflict 12 | - id: check-merge-conflict 13 | - id: check-toml 14 | - id: check-yaml 15 | - id: debug-statements 16 | - id: end-of-file-fixer 17 | - id: mixed-line-ending 18 | - id: trailing-whitespace 19 | - repo: https://github.com/astral-sh/ruff-pre-commit 20 | rev: 'v0.9.9' 21 | hooks: 22 | - id: ruff 23 | - id: ruff-format 24 | - repo: https://github.com/PyCQA/bandit 25 | rev: '1.8.3' 26 | hooks: 27 | - id: bandit 28 | - repo: https://github.com/commitizen-tools/commitizen 29 | rev: 'v4.4.1' 30 | hooks: 31 | - id: commitizen 32 | - id: commitizen-branch 33 | stages: [pre-push] 34 | -------------------------------------------------------------------------------- /.python-version: 
-------------------------------------------------------------------------------- 1 | 3.13 2 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## v4.1.0 (2025-03-05) 2 | 3 | ### Feat 4 | 5 | - **gemini**: add support for gemini code assist (#51) 6 | - **dependabot**: add dependabot configuration file (#50) 7 | - **github**: add default rulesets and installation (#47) 8 | 9 | ### Fix 10 | 11 | - **workflows**: fix just in workflows 12 | 13 | ### Refactor 14 | 15 | - **cruft**: update to new template version 16 | 17 | ## v4.0.0 (2025-03-04) 18 | 19 | ### Feat 20 | 21 | - **tasks**: switch from pyinvoke to just (#42) 22 | - **workflows**: bump GitHub action versions (#41) 23 | - **versions**: bump python and package version (#40) 24 | - **mindmap**: add mindmap of the package (#32) 25 | 26 | ### Fix 27 | 28 | - **version**: ready to bump 29 | - **datasets**: fix dtype backend (#44) 30 | 31 | ### Refactor 32 | 33 | - **cruft**: update to new template version 34 | 35 | ## v2.0.0 (2024-07-28) 36 | 37 | ### Feat 38 | 39 | - **cruft**: adopt cruft and link it to cookiecutter-mlops-package 40 | 41 | ## v1.1.3 (2024-07-28) 42 | 43 | ### Fix 44 | 45 | - **mlproject**: fix calling mlflow run by adding project run in front 46 | 47 | ## v1.1.2 (2024-07-28) 48 | 49 | ### Fix 50 | 51 | - **dependencies**: add setuptools to main dependency for mlflow 52 | 53 | ## v1.1.1 (2024-07-23) 54 | 55 | ### Fix 56 | 57 | - **publish**: fix publication workflow by installing dev dependencies 58 | 59 | ## v1.0.1 (2024-06-28) 60 | 61 | ### Fix 62 | 63 | - **version**: bump 64 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and 
leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 
45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | github@fmind.dev. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. 
No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 
129 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # https://docs.docker.com/engine/reference/builder/ 2 | 3 | FROM ghcr.io/astral-sh/uv:python3.13-bookworm 4 | COPY dist/*.whl . 5 | RUN uv pip install --system *.whl 6 | CMD ["bikes", "--help"] 7 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 2 | 3 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 4 | 5 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
6 | -------------------------------------------------------------------------------- /MLproject: -------------------------------------------------------------------------------- 1 | # https://mlflow.org/docs/latest/projects.html 2 | 3 | name: bikes 4 | python_env: python_env.yaml 5 | entry_points: 6 | main: 7 | parameters: 8 | conf_file: path 9 | command: "PYTHONPATH=src python -m bikes {conf_file}" 10 | -------------------------------------------------------------------------------- /confs/evaluations.yaml: -------------------------------------------------------------------------------- 1 | job: 2 | KIND: EvaluationsJob 3 | inputs: 4 | KIND: ParquetReader 5 | path: data/inputs_train.parquet 6 | targets: 7 | KIND: ParquetReader 8 | path: data/targets_train.parquet 9 | -------------------------------------------------------------------------------- /confs/explanations.yaml: -------------------------------------------------------------------------------- 1 | job: 2 | KIND: ExplanationsJob 3 | inputs_samples: 4 | KIND: ParquetReader 5 | path: data/inputs_test.parquet 6 | limit: 100 7 | models_explanations: 8 | KIND: ParquetWriter 9 | path: outputs/models_explanations.parquet 10 | samples_explanations: 11 | KIND: ParquetWriter 12 | path: outputs/samples_explanations.parquet 13 | -------------------------------------------------------------------------------- /confs/inference.yaml: -------------------------------------------------------------------------------- 1 | job: 2 | KIND: InferenceJob 3 | inputs: 4 | KIND: ParquetReader 5 | path: data/inputs_test.parquet 6 | outputs: 7 | KIND: ParquetWriter 8 | path: outputs/predictions_test.parquet 9 | -------------------------------------------------------------------------------- /confs/promotion.yaml: -------------------------------------------------------------------------------- 1 | job: 2 | KIND: PromotionJob 3 | -------------------------------------------------------------------------------- /confs/training.yaml: 
-------------------------------------------------------------------------------- 1 | job: 2 | KIND: TrainingJob 3 | inputs: 4 | KIND: ParquetReader 5 | path: data/inputs_train.parquet 6 | targets: 7 | KIND: ParquetReader 8 | path: data/targets_train.parquet 9 | -------------------------------------------------------------------------------- /confs/tuning.yaml: -------------------------------------------------------------------------------- 1 | job: 2 | KIND: TuningJob 3 | inputs: 4 | KIND: ParquetReader 5 | path: data/inputs_train.parquet 6 | targets: 7 | KIND: ParquetReader 8 | path: data/targets_train.parquet 9 | -------------------------------------------------------------------------------- /data/Readme.txt: -------------------------------------------------------------------------------- 1 | ========================================== 2 | Bike Sharing Dataset 3 | ========================================== 4 | 5 | Hadi Fanaee-T 6 | 7 | Laboratory of Artificial Intelligence and Decision Support (LIAAD), University of Porto 8 | INESC Porto, Campus da FEUP 9 | Rua Dr. Roberto Frias, 378 10 | 4200 - 465 Porto, Portugal 11 | 12 | https://archive.ics.uci.edu/dataset/275/bike+sharing+dataset 13 | 14 | ========================================= 15 | Background 16 | ========================================= 17 | 18 | Bike sharing systems are new generation of traditional bike rentals where whole process from membership, rental and return 19 | back has become automatic. Through these systems, user is able to easily rent a bike from a particular position and return 20 | back at another position. Currently, there are about over 500 bike-sharing programs around the world which is composed of 21 | over 500 thousands bicycles. Today, there exists great interest in these systems due to their important role in traffic, 22 | environmental and health issues. 
23 | 24 | Apart from interesting real world applications of bike sharing systems, the characteristics of data being generated by 25 | these systems make them attractive for the research. Opposed to other transport services such as bus or subway, the duration 26 | of travel, departure and arrival position is explicitly recorded in these systems. This feature turns bike sharing system into 27 | a virtual sensor network that can be used for sensing mobility in the city. Hence, it is expected that most of important 28 | events in the city could be detected via monitoring these data. 29 | 30 | ========================================= 31 | Data Set 32 | ========================================= 33 | Bike-sharing rental process is highly correlated to the environmental and seasonal settings. For instance, weather conditions, 34 | precipitation, day of week, season, hour of the day, etc. can affect the rental behaviors. The core data set is related to 35 | the two-year historical log corresponding to years 2011 and 2012 from Capital Bikeshare system, Washington D.C., USA which is 36 | publicly available in http://capitalbikeshare.com/system-data. We aggregated the data on two hourly and daily basis and then 37 | extracted and added the corresponding weather and seasonal information. Weather information are extracted from http://www.freemeteo.com. 38 | 39 | ========================================= 40 | Associated tasks 41 | ========================================= 42 | 43 | - Regression: 44 | Prediction of bike rental count hourly or daily based on the environmental and seasonal settings. 45 | 46 | - Event and Anomaly Detection: 47 | Count of rented bikes are also correlated to some events in the town which easily are traceable via search engines. 48 | For instance, query like "2012-10-30 washington d.c." in Google returns related results to Hurricane Sandy. Some of the important events are 49 | identified in [1]. 
Therefore the data can be used for validation of anomaly or event detection algorithms as well. 50 | 51 | 52 | ========================================= 53 | Files 54 | ========================================= 55 | 56 | - Readme.txt 57 | - hour.csv : bike sharing counts aggregated on hourly basis. Records: 17379 hours 58 | - day.csv - bike sharing counts aggregated on daily basis. Records: 731 days 59 | 60 | 61 | ========================================= 62 | Dataset characteristics 63 | ========================================= 64 | Both hour.csv and day.csv have the following fields, except hr which is not available in day.csv 65 | 66 | - instant: record index 67 | - dteday : date 68 | - season : season (1:spring, 2:summer, 3:fall, 4:winter) 69 | - yr : year (0: 2011, 1:2012) 70 | - mnth : month ( 1 to 12) 71 | - hr : hour (0 to 23) 72 | - holiday : whether day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule) 73 | - weekday : day of the week 74 | - workingday : if day is neither weekend nor holiday is 1, otherwise is 0. 75 | + weathersit : 76 | - 1: Clear, Few clouds, Partly cloudy, Partly cloudy 77 | - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 78 | - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 79 | - 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog 80 | - temp : Normalized temperature in Celsius. The values are divided to 41 (max) 81 | - atemp: Normalized feeling temperature in Celsius. The values are divided to 50 (max) 82 | - hum: Normalized humidity. The values are divided to 100 (max) 83 | - windspeed: Normalized wind speed. 
The values are divided to 67 (max) 84 | - casual: count of casual users 85 | - registered: count of registered users 86 | - cnt: count of total rental bikes including both casual and registered 87 | 88 | ========================================= 89 | License 90 | ========================================= 91 | Use of this dataset in publications must be cited to the following publication: 92 | 93 | [1] Fanaee-T, Hadi, and Gama, Joao, "Event labeling combining ensemble detectors and background knowledge", Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg, doi:10.1007/s13748-013-0040-3. 94 | 95 | @article{ 96 | year={2013}, 97 | issn={2192-6352}, 98 | journal={Progress in Artificial Intelligence}, 99 | doi={10.1007/s13748-013-0040-3}, 100 | title={Event labeling combining ensemble detectors and background knowledge}, 101 | url={http://dx.doi.org/10.1007/s13748-013-0040-3}, 102 | publisher={Springer Berlin Heidelberg}, 103 | keywords={Event labeling; Event detection; Ensemble learning; Background knowledge}, 104 | author={Fanaee-T, Hadi and Gama, Joao}, 105 | pages={1-15} 106 | } 107 | 108 | ========================================= 109 | Contact 110 | ========================================= 111 | 112 | For further information about this dataset please contact Hadi Fanaee-T (hadi.fanaee@fe.up.pt) 113 | -------------------------------------------------------------------------------- /data/inputs_test.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/data/inputs_test.parquet -------------------------------------------------------------------------------- /data/inputs_train.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/data/inputs_train.parquet 
-------------------------------------------------------------------------------- /data/targets_test.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/data/targets_test.parquet -------------------------------------------------------------------------------- /data/targets_train.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/data/targets_train.parquet -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # https://docs.docker.com/compose/compose-file/ 2 | 3 | services: 4 | mlflow: 5 | image: ghcr.io/mlflow/mlflow:v2.20.3 6 | ports: 7 | - 5000:5000 8 | environment: 9 | - MLFLOW_HOST=0.0.0.0 10 | command: mlflow server 11 | -------------------------------------------------------------------------------- /images/mlopsmindmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/images/mlopsmindmap.png -------------------------------------------------------------------------------- /justfile: -------------------------------------------------------------------------------- 1 | # https://just.systems/man/en/ 2 | 3 | # REQUIRES 4 | 5 | docker := require("docker") 6 | find := require("find") 7 | rm := require("rm") 8 | uv := require("uv") 9 | 10 | # SETTINGS 11 | 12 | set dotenv-load := true 13 | 14 | # VARIABLES 15 | 16 | PACKAGE := "bikes" 17 | REPOSITORY := "bikes" 18 | SOURCES := "src" 19 | TESTS := "tests" 20 | 21 | # DEFAULTS 22 | 23 | # display help information 24 | default: 25 | @just --list 26 | 27 | # IMPORTS 28 | 29 | 
import 'tasks/check.just' 30 | import 'tasks/clean.just' 31 | import 'tasks/commit.just' 32 | import 'tasks/doc.just' 33 | import 'tasks/docker.just' 34 | import 'tasks/format.just' 35 | import 'tasks/install.just' 36 | import 'tasks/mlflow.just' 37 | import 'tasks/package.just' 38 | import 'tasks/project.just' 39 | -------------------------------------------------------------------------------- /mlops-python-package.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "." 5 | } 6 | ], 7 | "settings": { 8 | "editor.formatOnSave": true, 9 | "python.defaultInterpreterPath": ".venv/bin/python", 10 | "python.testing.pytestEnabled": true, 11 | "python.testing.pytestArgs": [ 12 | "tests" 13 | ], 14 | "[python]": { 15 | "editor.codeActionsOnSave": { 16 | "source.organizeImports": "explicit" 17 | }, 18 | "editor.defaultFormatter": "charliermarsh.ruff", 19 | }, 20 | }, 21 | "extensions": { 22 | "recommendations": [ 23 | "charliermarsh.ruff", 24 | "ms-python.mypy-type-checker", 25 | "ms-python.python", 26 | "ms-python.vscode-pylance", 27 | "redhat.vscode-yaml", 28 | ] 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /outputs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/outputs/.gitkeep -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # https://docs.astral.sh/uv/reference/settings/ 2 | # https://packaging.python.org/en/latest/guides/writing-pyproject-toml/ 3 | 4 | # PROJECT 5 | 6 | [project] 7 | name = "bikes" 8 | version = "4.1.0" 9 | description = "Predict the number of bikes available." 
10 | authors = [{ name = "Médéric HURIER", email = "github@fmind.dev" }] 11 | readme = "README.md" 12 | license = { file = "LICENSE.txt" } 13 | keywords = ["mlops", "python", "package"] 14 | requires-python = ">=3.13" 15 | dependencies = [ 16 | "loguru>=0.7.3", 17 | "matplotlib>=3.10.1", 18 | "mlflow>=2.20.3", 19 | "numba>=0.61.0", 20 | "numpy>=2.1.3", 21 | "omegaconf>=2.3.0", 22 | "pandas>=2.2.3", 23 | "pandera>=0.23.0", 24 | "plotly>=6.0.0", 25 | "plyer>=2.1.0", 26 | "psutil>=7.0.0", 27 | "pyarrow>=19.0.1", 28 | "pydantic-settings>=2.8.1", 29 | "pydantic>=2.10.6", 30 | "pynvml>=12.0.0", 31 | "scikit-learn>=1.6.1", 32 | "setuptools>=75.8.2", 33 | "shap>=0.46.0", 34 | "hatchling>=1.27.0", 35 | ] 36 | 37 | # LINKS 38 | 39 | [project.urls] 40 | Homepage = "https://github.com/fmind/mlops-python-package" 41 | Documentation = "https://fmind.github.io/mlops-python-package/bikes.html" 42 | Repository = "https://github.com/fmind/mlops-python-package" 43 | "Bug Tracker" = "https://github.com/fmind/mlops-python-package/issues" 44 | Changelog = "https://github.com/fmind/mlops-python-package/blob/main/CHANGELOG.md" 45 | 46 | # SCRIPTS 47 | 48 | [project.scripts] 49 | bikes = 'bikes.scripts:main' 50 | 51 | # DEPENDENCIES 52 | 53 | [dependency-groups] 54 | check = [ 55 | "bandit>=1.8.3", 56 | "coverage>=7.6.12", 57 | "mypy>=1.15.0", 58 | "pandera[mypy]>=0.23.0", 59 | "pytest>=8.3.5", 60 | "pytest-cov>=6.0.0", 61 | "pytest-mock>=3.14.0", 62 | "pytest-xdist>=3.6.1", 63 | "ruff>=0.9.9", 64 | ] 65 | commit = ["commitizen>=4.4.1", "pre-commit>=4.1.0"] 66 | dev = ["rust-just>=1.39.0"] 67 | doc = ["pdoc>=15.0.1"] 68 | notebook = ["ipykernel>=6.29.5", "nbformat>=5.10.4"] 69 | 70 | # TOOLS 71 | 72 | [tool.uv] 73 | default-groups = ["check", "commit", "dev", "doc", "notebook"] 74 | 75 | [tool.bandit] 76 | targets = ["src"] 77 | 78 | [tool.commitizen] 79 | name = "cz_conventional_commits" 80 | tag_format = "v$version" 81 | version_scheme = "pep440" 82 | version_provider = "pep621" 83 | 
changelog_start_rev = "v1.0.0" 84 | update_changelog_on_bump = true 85 | 86 | [tool.coverage.run] 87 | branch = true 88 | source = ["src"] 89 | omit = ["__main__.py"] 90 | 91 | [tool.mypy] 92 | pretty = true 93 | python_version = "3.13" 94 | check_untyped_defs = true 95 | ignore_missing_imports = true 96 | plugins = ["pandera.mypy", "pydantic.mypy"] 97 | 98 | [tool.pytest.ini_options] 99 | addopts = "--verbosity=2" 100 | pythonpath = ["src"] 101 | 102 | [tool.ruff] 103 | fix = true 104 | indent-width = 4 105 | line-length = 100 106 | target-version = "py313" 107 | 108 | [tool.ruff.format] 109 | docstring-code-format = true 110 | 111 | [tool.ruff.lint.pydocstyle] 112 | convention = "google" 113 | 114 | [tool.ruff.lint.per-file-ignores] 115 | "tests/*.py" = ["D100", "D103"] 116 | 117 | # SYSTEMS 118 | 119 | [build-system] 120 | requires = ["hatchling"] 121 | build-backend = "hatchling.build" 122 | -------------------------------------------------------------------------------- /python_env.yaml: -------------------------------------------------------------------------------- 1 | { 2 | "python": "3.13", 3 | "dependencies": [ 4 | "alembic==1.14.1", 5 | "annotated-types==0.7.0", 6 | "antlr4-python3-runtime==4.9.3", 7 | "appnope==0.1.4", 8 | "argcomplete==3.5.3", 9 | "asttokens==3.0.0", 10 | "attrs==25.1.0", 11 | "bandit==1.8.3", 12 | "blinker==1.9.0", 13 | "cachetools==5.5.2", 14 | "certifi==2025.1.31", 15 | "cffi==1.17.1", 16 | "cfgv==3.4.0", 17 | "charset-normalizer==3.4.1", 18 | "click==8.1.8", 19 | "cloudpickle==3.1.1", 20 | "colorama==0.4.6", 21 | "comm==0.2.2", 22 | "commitizen==4.4.1", 23 | "contourpy==1.3.1", 24 | "coverage==7.6.12", 25 | "cycler==0.12.1", 26 | "databricks-sdk==0.44.1", 27 | "debugpy==1.8.12", 28 | "decli==0.6.2", 29 | "decorator==5.2.1", 30 | "deprecated==1.2.18", 31 | "distlib==0.3.9", 32 | "docker==7.1.0", 33 | "execnet==2.1.1", 34 | "executing==2.2.0", 35 | "fastjsonschema==2.21.1", 36 | "filelock==3.17.0", 37 | "flask==3.1.0", 38 | 
"fonttools==4.56.0", 39 | "gitdb==4.0.12", 40 | "gitpython==3.1.44", 41 | "google-auth==2.38.0", 42 | "graphene==3.4.3", 43 | "graphql-core==3.2.6", 44 | "graphql-relay==3.2.0", 45 | "greenlet==3.1.1", 46 | "gunicorn==23.0.0", 47 | "hatchling==1.27.0", 48 | "identify==2.6.8", 49 | "idna==3.10", 50 | "importlib-metadata==8.6.1", 51 | "iniconfig==2.0.0", 52 | "ipykernel==6.29.5", 53 | "ipython==9.0.0", 54 | "ipython-pygments-lexers==1.1.1", 55 | "itsdangerous==2.2.0", 56 | "jedi==0.19.2", 57 | "jinja2==3.1.5", 58 | "joblib==1.4.2", 59 | "jsonschema==4.23.0", 60 | "jsonschema-specifications==2024.10.1", 61 | "jupyter-client==8.6.3", 62 | "jupyter-core==5.7.2", 63 | "kiwisolver==1.4.8", 64 | "llvmlite==0.44.0", 65 | "loguru==0.7.3", 66 | "mako==1.3.9", 67 | "markdown==3.7", 68 | "markdown-it-py==3.0.0", 69 | "markupsafe==3.0.2", 70 | "matplotlib==3.10.1", 71 | "matplotlib-inline==0.1.7", 72 | "mdurl==0.1.2", 73 | "mlflow==2.20.3", 74 | "mlflow-skinny==2.20.3", 75 | "mypy==1.15.0", 76 | "mypy-extensions==1.0.0", 77 | "narwhals==1.28.0", 78 | "nbformat==5.10.4", 79 | "nest-asyncio==1.6.0", 80 | "nodeenv==1.9.1", 81 | "numba==0.61.0", 82 | "numpy==2.1.3", 83 | "nvidia-ml-py==12.570.86", 84 | "omegaconf==2.3.0", 85 | "opentelemetry-api==1.16.0", 86 | "opentelemetry-sdk==1.16.0", 87 | "opentelemetry-semantic-conventions==0.37b0", 88 | "packaging==24.2", 89 | "pandas==2.2.3", 90 | "pandas-stubs==2.2.3.241126", 91 | "pandera==0.23.0", 92 | "parso==0.8.4", 93 | "pathspec==0.12.1", 94 | "pbr==6.1.1", 95 | "pdoc==15.0.1", 96 | "pexpect==4.9.0", 97 | "pillow==11.1.0", 98 | "platformdirs==4.3.6", 99 | "plotly==6.0.0", 100 | "pluggy==1.5.0", 101 | "plyer==2.1.0", 102 | "pre-commit==4.1.0", 103 | "prompt-toolkit==3.0.50", 104 | "protobuf==5.29.3", 105 | "psutil==7.0.0", 106 | "ptyprocess==0.7.0", 107 | "pure-eval==0.2.3", 108 | "pyarrow==19.0.1", 109 | "pyasn1==0.6.1", 110 | "pyasn1-modules==0.4.1", 111 | "pycparser==2.22", 112 | "pydantic==2.10.6", 113 | "pydantic-core==2.27.2", 
114 | "pydantic-settings==2.8.1", 115 | "pygments==2.19.1", 116 | "pynvml==12.0.0", 117 | "pyparsing==3.2.1", 118 | "pytest==8.3.5", 119 | "pytest-cov==6.0.0", 120 | "pytest-mock==3.14.0", 121 | "pytest-xdist==3.6.1", 122 | "python-dateutil==2.9.0.post0", 123 | "python-dotenv==1.0.1", 124 | "pytz==2025.1", 125 | "pyyaml==6.0.2", 126 | "pyzmq==26.2.1", 127 | "questionary==2.1.0", 128 | "referencing==0.36.2", 129 | "requests==2.32.3", 130 | "rich==13.9.4", 131 | "rpds-py==0.23.1", 132 | "rsa==4.9", 133 | "ruff==0.9.9", 134 | "scikit-learn==1.6.1", 135 | "scipy==1.15.2", 136 | "setuptools==75.8.2", 137 | "shap==0.46.0", 138 | "six==1.17.0", 139 | "slicer==0.0.8", 140 | "smmap==5.0.2", 141 | "sqlalchemy==2.0.38", 142 | "sqlparse==0.5.3", 143 | "stack-data==0.6.3", 144 | "stevedore==5.4.1", 145 | "termcolor==2.5.0", 146 | "threadpoolctl==3.5.0", 147 | "tomlkit==0.13.2", 148 | "tornado==6.4.2", 149 | "tqdm==4.67.1", 150 | "traitlets==5.14.3", 151 | "trove-classifiers==2025.3.3.18", 152 | "typeguard==4.4.2", 153 | "types-pytz==2025.1.0.20250204", 154 | "typing-extensions==4.12.2", 155 | "typing-inspect==0.9.0", 156 | "tzdata==2025.1", 157 | "urllib3==2.3.0", 158 | "virtualenv==20.29.2", 159 | "waitress==3.0.2", 160 | "wcwidth==0.2.13", 161 | "werkzeug==3.1.3", 162 | "win32-setctime==1.2.0", 163 | "wrapt==1.17.2", 164 | "zipp==3.21.0" 165 | ] 166 | } 167 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv export --format=requirements-txt --no-dev --no-hashes --no-editable --no-emit-project --output-file=requirements.txt 3 | alembic==1.14.1 4 | annotated-types==0.7.0 5 | antlr4-python3-runtime==4.9.3 6 | appnope==0.1.4 ; platform_system == 'Darwin' 7 | argcomplete==3.6.1 8 | asttokens==3.0.0 9 | attrs==25.1.0 10 | bandit==1.8.3 11 | blinker==1.9.0 12 | cachetools==5.5.2 
13 | certifi==2025.1.31 14 | cffi==1.17.1 ; implementation_name == 'pypy' 15 | cfgv==3.4.0 16 | charset-normalizer==3.4.1 17 | click==8.1.8 18 | cloudpickle==3.1.1 19 | colorama==0.4.6 20 | comm==0.2.2 21 | commitizen==4.4.1 22 | contourpy==1.3.1 23 | coverage==7.6.12 24 | cycler==0.12.1 25 | databricks-sdk==0.44.1 26 | debugpy==1.8.12 27 | decli==0.6.2 28 | decorator==5.2.1 29 | deprecated==1.2.18 30 | distlib==0.3.9 31 | docker==7.1.0 32 | execnet==2.1.1 33 | executing==2.2.0 34 | fastjsonschema==2.21.1 35 | filelock==3.18.0 36 | flask==3.1.0 37 | fonttools==4.56.0 38 | gitdb==4.0.12 39 | gitpython==3.1.44 40 | google-auth==2.38.0 41 | graphene==3.4.3 42 | graphql-core==3.2.6 43 | graphql-relay==3.2.0 44 | greenlet==3.1.1 ; (python_full_version == '3.13.*' and platform_machine == 'AMD64') or (python_full_version == '3.13.*' and platform_machine == 'WIN32') or (python_full_version == '3.13.*' and platform_machine == 'aarch64') or (python_full_version == '3.13.*' and platform_machine == 'amd64') or (python_full_version == '3.13.*' and platform_machine == 'ppc64le') or (python_full_version == '3.13.*' and platform_machine == 'win32') or (python_full_version == '3.13.*' and platform_machine == 'x86_64') 45 | gunicorn==23.0.0 ; platform_system != 'Windows' 46 | hatchling==1.27.0 47 | identify==2.6.9 48 | idna==3.10 49 | importlib-metadata==8.6.1 50 | iniconfig==2.0.0 51 | ipykernel==6.29.5 52 | ipython==9.0.2 53 | ipython-pygments-lexers==1.1.1 54 | itsdangerous==2.2.0 55 | jedi==0.19.2 56 | jinja2==3.1.5 57 | joblib==1.4.2 58 | jsonschema==4.23.0 59 | jsonschema-specifications==2024.10.1 60 | jupyter-client==8.6.3 61 | jupyter-core==5.7.2 62 | kiwisolver==1.4.8 63 | llvmlite==0.44.0 64 | loguru==0.7.3 65 | mako==1.3.9 66 | markdown==3.7 67 | markdown-it-py==3.0.0 68 | markupsafe==3.0.2 69 | matplotlib==3.10.1 70 | matplotlib-inline==0.1.7 71 | mdurl==0.1.2 72 | mlflow==2.20.3 73 | mlflow-skinny==2.20.3 74 | mypy==1.15.0 75 | mypy-extensions==1.0.0 76 | 
narwhals==1.28.0 77 | nbformat==5.10.4 78 | nest-asyncio==1.6.0 79 | nodeenv==1.9.1 80 | numba==0.61.0 81 | numpy==2.1.3 82 | nvidia-ml-py==12.570.86 83 | omegaconf==2.3.0 84 | opentelemetry-api==1.16.0 85 | opentelemetry-sdk==1.16.0 86 | opentelemetry-semantic-conventions==0.37b0 87 | packaging==24.2 88 | pandas==2.2.3 89 | pandas-stubs==2.2.3.250308 90 | pandera==0.23.0 91 | parso==0.8.4 92 | pathspec==0.12.1 93 | pbr==6.1.1 94 | pdoc==15.0.1 95 | pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32' 96 | pillow==11.1.0 97 | platformdirs==4.3.6 98 | plotly==6.0.0 99 | pluggy==1.5.0 100 | plyer==2.1.0 101 | pre-commit==4.1.0 102 | prompt-toolkit==3.0.50 103 | protobuf==5.29.3 104 | psutil==7.0.0 105 | ptyprocess==0.7.0 ; sys_platform != 'emscripten' and sys_platform != 'win32' 106 | pure-eval==0.2.3 107 | pyarrow==19.0.1 108 | pyasn1==0.6.1 109 | pyasn1-modules==0.4.1 110 | pycparser==2.22 ; implementation_name == 'pypy' 111 | pydantic==2.10.6 112 | pydantic-core==2.27.2 113 | pydantic-settings==2.8.1 114 | pygments==2.19.1 115 | pynvml==12.0.0 116 | pyparsing==3.2.1 117 | pytest==8.3.5 118 | pytest-cov==6.0.0 119 | pytest-mock==3.14.0 120 | pytest-xdist==3.6.1 121 | python-dateutil==2.9.0.post0 122 | python-dotenv==1.0.1 123 | pytz==2025.1 124 | pywin32==310 ; sys_platform == 'win32' 125 | pyyaml==6.0.2 126 | pyzmq==26.2.1 127 | questionary==2.1.0 128 | referencing==0.36.2 129 | requests==2.32.3 130 | rich==13.9.4 131 | rpds-py==0.24.0 132 | rsa==4.9 133 | ruff==0.11.2 134 | scikit-learn==1.6.1 135 | scipy==1.15.2 136 | setuptools==75.8.2 137 | shap==0.46.0 138 | six==1.17.0 139 | slicer==0.0.8 140 | smmap==5.0.2 141 | sqlalchemy==2.0.38 142 | sqlparse==0.5.3 143 | stack-data==0.6.3 144 | stevedore==5.4.1 145 | termcolor==2.5.0 146 | threadpoolctl==3.5.0 147 | tomlkit==0.13.2 148 | tornado==6.4.2 149 | tqdm==4.67.1 150 | traitlets==5.14.3 151 | trove-classifiers==2025.3.3.18 152 | typeguard==4.4.2 153 | types-pytz==2025.2.0.20250326 154 | 
typing-extensions==4.12.2 155 | typing-inspect==0.9.0 156 | tzdata==2025.1 157 | urllib3==2.3.0 158 | virtualenv==20.29.3 159 | waitress==3.0.2 ; platform_system == 'Windows' 160 | wcwidth==0.2.13 161 | werkzeug==3.1.3 162 | win32-setctime==1.2.0 ; sys_platform == 'win32' 163 | wrapt==1.17.2 164 | zipp==3.21.0 165 | -------------------------------------------------------------------------------- /src/bikes/__init__.py: -------------------------------------------------------------------------------- 1 | """Predict the number of bikes available.""" 2 | -------------------------------------------------------------------------------- /src/bikes/__main__.py: -------------------------------------------------------------------------------- 1 | """Entry point of the package.""" 2 | 3 | # %% IMPORTS 4 | 5 | from bikes import scripts 6 | 7 | # %% MAIN 8 | 9 | if __name__ == "__main__": 10 | scripts.main() 11 | -------------------------------------------------------------------------------- /src/bikes/core/__init__.py: -------------------------------------------------------------------------------- 1 | """Core components of the project.""" 2 | -------------------------------------------------------------------------------- /src/bikes/core/metrics.py: -------------------------------------------------------------------------------- 1 | """Evaluate model performances with metrics.""" 2 | 3 | # %% IMPORTS 4 | 5 | from __future__ import annotations 6 | 7 | import abc 8 | import typing as T 9 | 10 | import mlflow 11 | import pandas as pd 12 | import pydantic as pdt 13 | from mlflow.metrics import MetricValue 14 | from sklearn import metrics as sklearn_metrics 15 | 16 | from bikes.core import models, schemas 17 | 18 | # %% TYPINGS 19 | 20 | MlflowMetric: T.TypeAlias = MetricValue 21 | MlflowThreshold: T.TypeAlias = mlflow.models.MetricThreshold 22 | MlflowModelValidationFailedException: T.TypeAlias = ( 23 | mlflow.models.evaluation.validation.ModelValidationFailedException 24 | ) 25 
| 26 | # %% METRICS 27 | 28 | 29 | class Metric(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"): 30 | """Base class for a project metric. 31 | 32 | Use metrics to evaluate model performance. 33 | e.g., accuracy, precision, recall, MAE, F1, ... 34 | 35 | Parameters: 36 | name (str): name of the metric for the reporting. 37 | greater_is_better (bool): maximize or minimize result. 38 | """ 39 | 40 | KIND: str 41 | 42 | name: str 43 | greater_is_better: bool 44 | 45 | @abc.abstractmethod 46 | def score(self, targets: schemas.Targets, outputs: schemas.Outputs) -> float: 47 | """Score the outputs against the targets. 48 | 49 | Args: 50 | targets (schemas.Targets): expected values. 51 | outputs (schemas.Outputs): predicted values. 52 | 53 | Returns: 54 | float: single result from the metric computation. 55 | """ 56 | 57 | def scorer( 58 | self, model: models.Model, inputs: schemas.Inputs, targets: schemas.Targets 59 | ) -> float: 60 | """Score model outputs against targets. 61 | 62 | Args: 63 | model (models.Model): model to evaluate. 64 | inputs (schemas.Inputs): model inputs values. 65 | targets (schemas.Targets): model expected values. 66 | 67 | Returns: 68 | float: single result from the metric computation. 69 | """ 70 | outputs = model.predict(inputs=inputs) 71 | score = self.score(targets=targets, outputs=outputs) 72 | return score 73 | 74 | def to_mlflow(self) -> MlflowMetric: 75 | """Convert the metric to an Mlflow metric. 76 | 77 | Returns: 78 | MlflowMetric: the Mlflow metric. 79 | """ 80 | 81 | def eval_fn(predictions: pd.Series[int], targets: pd.Series[int]) -> MlflowMetric: 82 | """Evaluation function associated with the mlflow metric. 83 | 84 | Args: 85 | predictions (pd.Series): model predictions. 86 | targets (pd.Series | None): model targets. 87 | 88 | Returns: 89 | MlflowMetric: the mlflow metric. 
90 | """ 91 | score_targets = schemas.Targets( 92 | {schemas.TargetsSchema.cnt: targets}, index=targets.index 93 | ) 94 | score_outputs = schemas.Outputs( 95 | {schemas.OutputsSchema.prediction: predictions}, index=predictions.index 96 | ) 97 | sign = 1 if self.greater_is_better else -1 # reverse the effect 98 | score = self.score(targets=score_targets, outputs=score_outputs) 99 | return MlflowMetric(aggregate_results={self.name: score * sign}) 100 | 101 | return mlflow.metrics.make_metric( 102 | eval_fn=eval_fn, name=self.name, greater_is_better=self.greater_is_better 103 | ) 104 | 105 | 106 | class SklearnMetric(Metric): 107 | """Compute metrics with sklearn. 108 | 109 | Parameters: 110 | name (str): name of the sklearn metric. 111 | greater_is_better (bool): maximize or minimize. 112 | """ 113 | 114 | KIND: T.Literal["SklearnMetric"] = "SklearnMetric" 115 | 116 | name: str = "mean_squared_error" 117 | greater_is_better: bool = False 118 | 119 | @T.override 120 | def score(self, targets: schemas.Targets, outputs: schemas.Outputs) -> float: 121 | metric = getattr(sklearn_metrics, self.name) 122 | sign = 1 if self.greater_is_better else -1 123 | y_true = targets[schemas.TargetsSchema.cnt] 124 | y_pred = outputs[schemas.OutputsSchema.prediction] 125 | score = metric(y_pred=y_pred, y_true=y_true) * sign 126 | return float(score) 127 | 128 | 129 | MetricKind = SklearnMetric 130 | MetricsKind: T.TypeAlias = list[T.Annotated[MetricKind, pdt.Field(discriminator="KIND")]] 131 | 132 | # %% THRESHOLDS 133 | 134 | 135 | class Threshold(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"): 136 | """A project threshold for a metric. 137 | 138 | Use thresholds to monitor model performances. 139 | e.g., to trigger an alert when a threshold is met. 140 | 141 | Parameters: 142 | threshold (int | float): absolute threshold value. 143 | greater_is_better (bool): maximize or minimize result. 
144 | """ 145 | 146 | threshold: int | float 147 | greater_is_better: bool 148 | 149 | def to_mlflow(self) -> MlflowThreshold: 150 | """Convert the threshold to an mlflow threshold. 151 | 152 | Returns: 153 | MlflowThreshold: the mlflow threshold. 154 | """ 155 | return MlflowThreshold(threshold=self.threshold, greater_is_better=self.greater_is_better) 156 | -------------------------------------------------------------------------------- /src/bikes/core/models.py: -------------------------------------------------------------------------------- 1 | """Define trainable machine learning models.""" 2 | 3 | # %% IMPORTS 4 | 5 | import abc 6 | import typing as T 7 | 8 | import pandas as pd 9 | import pydantic as pdt 10 | import shap 11 | from sklearn import compose, ensemble, pipeline, preprocessing 12 | 13 | from bikes.core import schemas 14 | 15 | # %% TYPES 16 | 17 | # Model params 18 | ParamKey = str 19 | ParamValue = T.Any 20 | Params = dict[ParamKey, ParamValue] 21 | 22 | # %% MODELS 23 | 24 | 25 | class Model(abc.ABC, pdt.BaseModel, strict=True, frozen=False, extra="forbid"): 26 | """Base class for a project model. 27 | 28 | Use a model to adapt AI/ML frameworks. 29 | e.g., to swap easily one model with another. 30 | """ 31 | 32 | KIND: str 33 | 34 | def get_params(self, deep: bool = True) -> Params: 35 | """Get the model params. 36 | 37 | Args: 38 | deep (bool, optional): ignored. 39 | 40 | Returns: 41 | Params: internal model parameters. 42 | """ 43 | params: Params = {} 44 | for key, value in self.model_dump().items(): 45 | if not key.startswith("_") and not key.isupper(): 46 | params[key] = value 47 | return params 48 | 49 | def set_params(self, **params: ParamValue) -> T.Self: 50 | """Set the model params in place. 51 | 52 | Returns: 53 | T.Self: instance of the model. 
54 | """ 55 | for key, value in params.items(): 56 | setattr(self, key, value) 57 | return self 58 | 59 | @abc.abstractmethod 60 | def fit(self, inputs: schemas.Inputs, targets: schemas.Targets) -> T.Self: 61 | """Fit the model on the given inputs and targets. 62 | 63 | Args: 64 | inputs (schemas.Inputs): model training inputs. 65 | targets (schemas.Targets): model training targets. 66 | 67 | Returns: 68 | T.Self: instance of the model. 69 | """ 70 | 71 | @abc.abstractmethod 72 | def predict(self, inputs: schemas.Inputs) -> schemas.Outputs: 73 | """Generate outputs with the model for the given inputs. 74 | 75 | Args: 76 | inputs (schemas.Inputs): model prediction inputs. 77 | 78 | Returns: 79 | schemas.Outputs: model prediction outputs. 80 | """ 81 | 82 | def explain_model(self) -> schemas.FeatureImportances: 83 | """Explain the internal model structure. 84 | 85 | Returns: 86 | schemas.FeatureImportances: feature importances. 87 | """ 88 | raise NotImplementedError() 89 | 90 | def explain_samples(self, inputs: schemas.Inputs) -> schemas.SHAPValues: 91 | """Explain model outputs on input samples. 92 | 93 | Returns: 94 | schemas.SHAPValues: SHAP values. 95 | """ 96 | raise NotImplementedError() 97 | 98 | def get_internal_model(self) -> T.Any: 99 | """Return the internal model in the object. 100 | 101 | Raises: 102 | NotImplementedError: method not implemented. 103 | 104 | Returns: 105 | T.Any: any internal model (either empty or fitted). 106 | """ 107 | raise NotImplementedError() 108 | 109 | 110 | class BaselineSklearnModel(Model): 111 | """Simple baseline model based on scikit-learn. 112 | 113 | Parameters: 114 | max_depth (int): maximum depth of the random forest. 115 | n_estimators (int): number of estimators in the random forest. 116 | random_state (int, optional): random state of the machine learning pipeline. 
117 | """ 118 | 119 | KIND: T.Literal["BaselineSklearnModel"] = "BaselineSklearnModel" 120 | 121 | # params 122 | max_depth: int = 20 123 | n_estimators: int = 200 124 | random_state: int | None = 42 125 | # private 126 | _pipeline: pipeline.Pipeline | None = None 127 | _numericals: list[str] = [ 128 | "yr", 129 | "mnth", 130 | "hr", 131 | "holiday", 132 | "weekday", 133 | "workingday", 134 | "temp", 135 | "atemp", 136 | "hum", 137 | "windspeed", 138 | "casual", 139 | "registered", # too correlated with target 140 | ] 141 | _categoricals: list[str] = [ 142 | "season", 143 | "weathersit", 144 | ] 145 | 146 | @T.override 147 | def fit(self, inputs: schemas.Inputs, targets: schemas.Targets) -> "BaselineSklearnModel": 148 | # subcomponents 149 | categoricals_transformer = preprocessing.OneHotEncoder( 150 | sparse_output=False, handle_unknown="ignore" 151 | ) 152 | # components 153 | transformer = compose.ColumnTransformer( 154 | [ 155 | ("categoricals", categoricals_transformer, self._categoricals), 156 | ("numericals", "passthrough", self._numericals), 157 | ], 158 | remainder="drop", 159 | ) 160 | regressor = ensemble.RandomForestRegressor( 161 | max_depth=self.max_depth, 162 | n_estimators=self.n_estimators, 163 | random_state=self.random_state, 164 | ) 165 | # pipeline 166 | self._pipeline = pipeline.Pipeline( 167 | steps=[ 168 | ("transformer", transformer), 169 | ("regressor", regressor), 170 | ] 171 | ) 172 | self._pipeline.fit(X=inputs, y=targets[schemas.TargetsSchema.cnt]) 173 | return self 174 | 175 | @T.override 176 | def predict(self, inputs: schemas.Inputs) -> schemas.Outputs: 177 | model = self.get_internal_model() 178 | prediction = model.predict(inputs) 179 | outputs_ = pd.DataFrame( 180 | data={schemas.OutputsSchema.prediction: prediction}, index=inputs.index 181 | ) 182 | outputs = schemas.OutputsSchema.check(data=outputs_) 183 | return outputs 184 | 185 | @T.override 186 | def explain_model(self) -> schemas.FeatureImportances: 187 | model = 
self.get_internal_model() 188 | regressor = model.named_steps["regressor"] 189 | transformer = model.named_steps["transformer"] 190 | feature = transformer.get_feature_names_out() 191 | feature_importances_ = pd.DataFrame( 192 | data={ 193 | "feature": feature, 194 | "importance": regressor.feature_importances_, 195 | } 196 | ) 197 | feature_importances = schemas.FeatureImportancesSchema.check(data=feature_importances_) 198 | return feature_importances 199 | 200 | @T.override 201 | def explain_samples(self, inputs: schemas.Inputs) -> schemas.SHAPValues: 202 | model = self.get_internal_model() 203 | regressor = model.named_steps["regressor"] 204 | transformer = model.named_steps["transformer"] 205 | transformed = transformer.transform(X=inputs) 206 | explainer = shap.TreeExplainer(model=regressor) 207 | shap_values_ = pd.DataFrame( 208 | data=explainer.shap_values(X=transformed), 209 | columns=transformer.get_feature_names_out(), 210 | ) 211 | shap_values = schemas.SHAPValuesSchema.check(data=shap_values_) 212 | return shap_values 213 | 214 | @T.override 215 | def get_internal_model(self) -> pipeline.Pipeline: 216 | model = self._pipeline 217 | if model is None: 218 | raise ValueError("Model is not fitted yet!") 219 | return model 220 | 221 | 222 | ModelKind = BaselineSklearnModel 223 | -------------------------------------------------------------------------------- /src/bikes/core/schemas.py: -------------------------------------------------------------------------------- 1 | """Define and validate dataframe schemas.""" 2 | 3 | # %% IMPORTS 4 | 5 | import typing as T 6 | 7 | import pandas as pd 8 | import pandera as pa 9 | import pandera.typing as papd 10 | import pandera.typing.common as padt 11 | 12 | # %% TYPES 13 | 14 | # Generic type for a dataframe container 15 | TSchema = T.TypeVar("TSchema", bound="pa.DataFrameModel") 16 | 17 | # %% SCHEMAS 18 | 19 | 20 | class Schema(pa.DataFrameModel): 21 | """Base class for a dataframe schema. 
22 | 23 | Use a schema to type your dataframe object. 24 | e.g., to communicate and validate its fields. 25 | """ 26 | 27 | class Config: 28 | """Default configurations for all schemas. 29 | 30 | Parameters: 31 | coerce (bool): convert data type if possible. 32 | strict (bool): ensure the data type is correct. 33 | """ 34 | 35 | coerce: bool = True 36 | strict: bool = True 37 | 38 | @classmethod 39 | def check(cls: T.Type[TSchema], data: pd.DataFrame) -> papd.DataFrame[TSchema]: 40 | """Check the dataframe with this schema. 41 | 42 | Args: 43 | data (pd.DataFrame): dataframe to check. 44 | 45 | Returns: 46 | papd.DataFrame[TSchema]: validated dataframe. 47 | """ 48 | return T.cast(papd.DataFrame[TSchema], cls.validate(data)) 49 | 50 | 51 | class InputsSchema(Schema): 52 | """Schema for the project inputs.""" 53 | 54 | instant: papd.Index[padt.UInt32] = pa.Field(ge=0) 55 | dteday: papd.Series[padt.DateTime] = pa.Field() 56 | season: papd.Series[padt.UInt8] = pa.Field(isin=[1, 2, 3, 4]) 57 | yr: papd.Series[padt.UInt8] = pa.Field(ge=0, le=1) 58 | mnth: papd.Series[padt.UInt8] = pa.Field(ge=1, le=12) 59 | hr: papd.Series[padt.UInt8] = pa.Field(ge=0, le=23) 60 | holiday: papd.Series[padt.Bool] = pa.Field() 61 | weekday: papd.Series[padt.UInt8] = pa.Field(ge=0, le=6) 62 | workingday: papd.Series[padt.Bool] = pa.Field() 63 | weathersit: papd.Series[padt.UInt8] = pa.Field(ge=1, le=4) 64 | temp: papd.Series[padt.Float16] = pa.Field(ge=0, le=1) 65 | atemp: papd.Series[padt.Float16] = pa.Field(ge=0, le=1) 66 | hum: papd.Series[padt.Float16] = pa.Field(ge=0, le=1) 67 | windspeed: papd.Series[padt.Float16] = pa.Field(ge=0, le=1) 68 | casual: papd.Series[padt.UInt32] = pa.Field(ge=0) 69 | registered: papd.Series[padt.UInt32] = pa.Field(ge=0) 70 | 71 | 72 | Inputs = papd.DataFrame[InputsSchema] 73 | 74 | 75 | class TargetsSchema(Schema): 76 | """Schema for the project target.""" 77 | 78 | instant: papd.Index[padt.UInt32] = pa.Field(ge=0) 79 | cnt: papd.Series[padt.UInt32] = 
pa.Field(ge=0) 80 | 81 | 82 | Targets = papd.DataFrame[TargetsSchema] 83 | 84 | 85 | class OutputsSchema(Schema): 86 | """Schema for the project output.""" 87 | 88 | instant: papd.Index[padt.UInt32] = pa.Field(ge=0) 89 | prediction: papd.Series[padt.UInt32] = pa.Field(ge=0) 90 | 91 | 92 | Outputs = papd.DataFrame[OutputsSchema] 93 | 94 | 95 | class SHAPValuesSchema(Schema): 96 | """Schema for the project shap values.""" 97 | 98 | class Config: 99 | """Default configurations this schema. 100 | 101 | Parameters: 102 | dtype (str): dataframe default data type. 103 | strict (bool): ensure the data type is correct. 104 | """ 105 | 106 | dtype: str = "float32" 107 | strict: bool = False 108 | 109 | 110 | SHAPValues = papd.DataFrame[SHAPValuesSchema] 111 | 112 | 113 | class FeatureImportancesSchema(Schema): 114 | """Schema for the project feature importances.""" 115 | 116 | feature: papd.Series[padt.String] = pa.Field() 117 | importance: papd.Series[padt.Float32] = pa.Field() 118 | 119 | 120 | FeatureImportances = papd.DataFrame[FeatureImportancesSchema] 121 | -------------------------------------------------------------------------------- /src/bikes/io/__init__.py: -------------------------------------------------------------------------------- 1 | """Components related to external operations (inputs and outputs).""" 2 | -------------------------------------------------------------------------------- /src/bikes/io/configs.py: -------------------------------------------------------------------------------- 1 | """Parse, merge, and convert config objects.""" 2 | 3 | # %% IMPORTS 4 | 5 | import typing as T 6 | 7 | import omegaconf as oc 8 | 9 | # %% TYPES 10 | 11 | Config = oc.ListConfig | oc.DictConfig 12 | 13 | # %% PARSERS 14 | 15 | 16 | def parse_file(path: str) -> Config: 17 | """Parse a config file from a path. 18 | 19 | Args: 20 | path (str): path to local config. 21 | 22 | Returns: 23 | Config: representation of the config file. 
24 | """ 25 | return oc.OmegaConf.load(path) 26 | 27 | 28 | def parse_string(string: str) -> Config: 29 | """Parse the given config string. 30 | 31 | Args: 32 | string (str): content of config string. 33 | 34 | Returns: 35 | Config: representation of the config string. 36 | """ 37 | return oc.OmegaConf.create(string) 38 | 39 | 40 | # %% MERGERS 41 | 42 | 43 | def merge_configs(configs: T.Sequence[Config]) -> Config: 44 | """Merge a list of config into a single config. 45 | 46 | Args: 47 | configs (T.Sequence[Config]): list of configs. 48 | 49 | Returns: 50 | Config: representation of the merged config objects. 51 | """ 52 | return oc.OmegaConf.merge(*configs) 53 | 54 | 55 | # %% CONVERTERS 56 | 57 | 58 | def to_object(config: Config, resolve: bool = True) -> object: 59 | """Convert a config object to a python object. 60 | 61 | Args: 62 | config (Config): representation of the config. 63 | resolve (bool): resolve variables. Defaults to True. 64 | 65 | Returns: 66 | object: conversion of the config to a python object. 67 | """ 68 | return oc.OmegaConf.to_container(config, resolve=resolve) 69 | -------------------------------------------------------------------------------- /src/bikes/io/datasets.py: -------------------------------------------------------------------------------- 1 | """Read/Write datasets from/to external sources/destinations.""" 2 | 3 | # %% IMPORTS 4 | 5 | import abc 6 | import typing as T 7 | 8 | import mlflow.data.pandas_dataset as lineage 9 | import pandas as pd 10 | import pydantic as pdt 11 | 12 | # %% TYPINGS 13 | 14 | Lineage: T.TypeAlias = lineage.PandasDataset 15 | 16 | # %% READERS 17 | 18 | 19 | class Reader(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"): 20 | """Base class for a dataset reader. 21 | 22 | Use a reader to load a dataset in memory. 23 | e.g., to read file, database, cloud storage, ... 24 | 25 | Parameters: 26 | limit (int, optional): maximum number of rows to read. Defaults to None. 
class ParquetReader(Reader):
    """Read a dataframe from a parquet file.

    Parameters:
        path (str): local path to the dataset.
    """

    KIND: T.Literal["ParquetReader"] = "ParquetReader"

    path: str
    backend: T.Literal["pyarrow", "numpy_nullable"] = "pyarrow"

    @T.override
    def read(self) -> pd.DataFrame:
        # parquet offers no row-limit option at read time,
        # so truncate the frame after loading when required
        frame = pd.read_parquet(self.path, dtype_backend=self.backend)
        if self.limit is None:
            return frame
        return frame.head(self.limit)

    @T.override
    def lineage(
        self,
        name: str,
        data: pd.DataFrame,
        targets: str | None = None,
        predictions: str | None = None,
    ) -> Lineage:
        # delegate to mlflow's pandas dataset helper, using the file path as source
        return lineage.from_pandas(
            df=data,
            name=name,
            source=self.path,
            targets=targets,
            predictions=predictions,
        )
109 | """ 110 | 111 | KIND: str 112 | 113 | @abc.abstractmethod 114 | def write(self, data: pd.DataFrame) -> None: 115 | """Write a dataframe to a dataset. 116 | 117 | Args: 118 | data (pd.DataFrame): dataframe representation. 119 | """ 120 | 121 | 122 | class ParquetWriter(Writer): 123 | """Writer a dataframe to a parquet file. 124 | 125 | Parameters: 126 | path (str): local or S3 path to the dataset. 127 | """ 128 | 129 | KIND: T.Literal["ParquetWriter"] = "ParquetWriter" 130 | 131 | path: str 132 | 133 | @T.override 134 | def write(self, data: pd.DataFrame) -> None: 135 | pd.DataFrame.to_parquet(data, self.path) 136 | 137 | 138 | WriterKind = ParquetWriter 139 | -------------------------------------------------------------------------------- /src/bikes/io/registries.py: -------------------------------------------------------------------------------- 1 | """Savers, loaders, and registers for model registries.""" 2 | 3 | # %% IMPORTS 4 | 5 | import abc 6 | import typing as T 7 | 8 | import mlflow 9 | import pydantic as pdt 10 | from mlflow.pyfunc import PyFuncModel, PythonModel, PythonModelContext 11 | 12 | from bikes.core import models, schemas 13 | from bikes.utils import signers 14 | 15 | # %% TYPES 16 | 17 | # Results of model registry operations 18 | Info: T.TypeAlias = mlflow.models.model.ModelInfo 19 | Alias: T.TypeAlias = mlflow.entities.model_registry.ModelVersion 20 | Version: T.TypeAlias = mlflow.entities.model_registry.ModelVersion 21 | 22 | # %% HELPERS 23 | 24 | 25 | def uri_for_model_alias(name: str, alias: str) -> str: 26 | """Create a model URI from a model name and an alias. 27 | 28 | Args: 29 | name (str): name of the mlflow registered model. 30 | alias (str): alias of the registered model. 31 | 32 | Returns: 33 | str: model URI as "models:/name@alias". 34 | """ 35 | return f"models:/{name}@{alias}" 36 | 37 | 38 | def uri_for_model_version(name: str, version: int) -> str: 39 | """Create a model URI from a model name and a version. 
40 | 41 | Args: 42 | name (str): name of the mlflow registered model. 43 | version (int): version of the registered model. 44 | 45 | Returns: 46 | str: model URI as "models:/name/version." 47 | """ 48 | return f"models:/{name}/{version}" 49 | 50 | 51 | def uri_for_model_alias_or_version(name: str, alias_or_version: str | int) -> str: 52 | """Create a model URi from a model name and an alias or version. 53 | 54 | Args: 55 | name (str): name of the mlflow registered model. 56 | alias_or_version (str | int): alias or version of the registered model. 57 | 58 | Returns: 59 | str: model URI as "models:/name@alias" or "models:/name/version" based on input. 60 | """ 61 | if isinstance(alias_or_version, int): 62 | return uri_for_model_version(name=name, version=alias_or_version) 63 | else: 64 | return uri_for_model_alias(name=name, alias=alias_or_version) 65 | 66 | 67 | # %% SAVERS 68 | 69 | 70 | class Saver(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"): 71 | """Base class for saving models in registry. 72 | 73 | Separate model definition from serialization. 74 | e.g., to switch between serialization flavors. 75 | 76 | Parameters: 77 | path (str): model path inside the Mlflow store. 78 | """ 79 | 80 | KIND: str 81 | 82 | path: str = "model" 83 | 84 | @abc.abstractmethod 85 | def save( 86 | self, 87 | model: models.Model, 88 | signature: signers.Signature, 89 | input_example: schemas.Inputs, 90 | ) -> Info: 91 | """Save a model in the model registry. 92 | 93 | Args: 94 | model (models.Model): project model to save. 95 | signature (signers.Signature): model signature. 96 | input_example (schemas.Inputs): sample of inputs. 97 | 98 | Returns: 99 | Info: model saving information. 100 | """ 101 | 102 | 103 | class CustomSaver(Saver): 104 | """Saver for project models using the Mlflow PyFunc module. 
105 | 106 | https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html 107 | """ 108 | 109 | KIND: T.Literal["CustomSaver"] = "CustomSaver" 110 | 111 | class Adapter(PythonModel): # type: ignore[misc] 112 | """Adapt a custom model to the Mlflow PyFunc flavor for saving operations. 113 | 114 | https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html?#mlflow.pyfunc.PythonModel 115 | """ 116 | 117 | def __init__(self, model: models.Model): 118 | """Initialize the custom saver adapter. 119 | 120 | Args: 121 | model (models.Model): project model. 122 | """ 123 | self.model = model 124 | 125 | def predict( 126 | self, 127 | context: PythonModelContext, 128 | model_input: schemas.Inputs, 129 | params: dict[str, T.Any] | None = None, 130 | ) -> schemas.Outputs: 131 | """Generate predictions with a custom model for the given inputs. 132 | 133 | Args: 134 | context (mlflow.PythonModelContext): mlflow context. 135 | model_input (schemas.Inputs): inputs for the mlflow model. 136 | params (dict[str, T.Any] | None): additional parameters. 137 | 138 | Returns: 139 | schemas.Outputs: validated outputs of the project model. 140 | """ 141 | return self.model.predict(inputs=model_input) 142 | 143 | @T.override 144 | def save( 145 | self, 146 | model: models.Model, 147 | signature: signers.Signature, 148 | input_example: schemas.Inputs, 149 | ) -> Info: 150 | adapter = CustomSaver.Adapter(model=model) 151 | return mlflow.pyfunc.log_model( 152 | python_model=adapter, 153 | signature=signature, 154 | artifact_path=self.path, 155 | input_example=input_example, 156 | ) 157 | 158 | 159 | class BuiltinSaver(Saver): 160 | """Saver for built-in models using an Mlflow flavor module. 161 | 162 | https://mlflow.org/docs/latest/models.html#built-in-model-flavors 163 | 164 | Parameters: 165 | flavor (str): Mlflow flavor module to use for the serialization. 
166 | """ 167 | 168 | KIND: T.Literal["BuiltinSaver"] = "BuiltinSaver" 169 | 170 | flavor: str 171 | 172 | @T.override 173 | def save( 174 | self, 175 | model: models.Model, 176 | signature: signers.Signature, 177 | input_example: schemas.Inputs, 178 | ) -> Info: 179 | builtin_model = model.get_internal_model() 180 | module = getattr(mlflow, self.flavor) 181 | return module.log_model( 182 | builtin_model, 183 | artifact_path=self.path, 184 | signature=signature, 185 | input_example=input_example, 186 | ) 187 | 188 | 189 | SaverKind = CustomSaver | BuiltinSaver 190 | 191 | # %% LOADERS 192 | 193 | 194 | class Loader(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"): 195 | """Base class for loading models from registry. 196 | 197 | Separate model definition from deserialization. 198 | e.g., to switch between deserialization flavors. 199 | """ 200 | 201 | KIND: str 202 | 203 | class Adapter(abc.ABC): 204 | """Adapt any model for the project inference.""" 205 | 206 | @abc.abstractmethod 207 | def predict(self, inputs: schemas.Inputs) -> schemas.Outputs: 208 | """Generate predictions with the internal model for the given inputs. 209 | 210 | Args: 211 | inputs (schemas.Inputs): validated inputs for the project model. 212 | 213 | Returns: 214 | schemas.Outputs: validated outputs of the project model. 215 | """ 216 | 217 | @abc.abstractmethod 218 | def load(self, uri: str) -> "Loader.Adapter": 219 | """Load a model from the model registry. 220 | 221 | Args: 222 | uri (str): URI of a model to load. 223 | 224 | Returns: 225 | Loader.Adapter: model loaded. 226 | """ 227 | 228 | 229 | class CustomLoader(Loader): 230 | """Loader for custom models using the Mlflow PyFunc module. 
class BuiltinLoader(Loader):
    """Loader for built-in models using the Mlflow PyFunc module.

    Note: use Mlflow PyFunc instead of flavors to use standard API.

    https://mlflow.org/docs/latest/models.html#built-in-model-flavors
    """

    KIND: T.Literal["BuiltinLoader"] = "BuiltinLoader"

    class Adapter(Loader.Adapter):
        """Wrap an mlflow pyfunc model behind the project inference interface."""

        def __init__(self, model: PyFuncModel) -> None:
            """Store the mlflow pyfunc model to adapt.

            Args:
                model (PyFuncModel): mlflow pyfunc model.
            """
            self.model = model

        @T.override
        def predict(self, inputs: schemas.Inputs) -> schemas.Outputs:
            raw_outputs = self.model.predict(data=inputs)  # unchecked data!
            # rebuild a validated frame: schema columns + the inputs' index
            output_columns = list(schemas.OutputsSchema.to_schema().columns)
            return schemas.Outputs(raw_outputs, columns=output_columns, index=inputs.index)

    @T.override
    def load(self, uri: str) -> "BuiltinLoader.Adapter":
        pyfunc_model = mlflow.pyfunc.load_model(model_uri=uri)
        return BuiltinLoader.Adapter(model=pyfunc_model)
329 | 330 | https://mlflow.org/docs/latest/model-registry.html 331 | """ 332 | 333 | KIND: T.Literal["MlflowRegister"] = "MlflowRegister" 334 | 335 | @T.override 336 | def register(self, name: str, model_uri: str) -> Version: 337 | return mlflow.register_model(name=name, model_uri=model_uri, tags=self.tags) 338 | 339 | 340 | RegisterKind = MlflowRegister 341 | -------------------------------------------------------------------------------- /src/bikes/io/services.py: -------------------------------------------------------------------------------- 1 | """Manage global context during execution.""" 2 | 3 | # %% IMPORTS 4 | 5 | from __future__ import annotations 6 | 7 | import abc 8 | import contextlib as ctx 9 | import sys 10 | import typing as T 11 | 12 | import loguru 13 | import mlflow 14 | import mlflow.tracking as mt 15 | import pydantic as pdt 16 | from plyer import notification 17 | 18 | # %% SERVICES 19 | 20 | 21 | class Service(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"): 22 | """Base class for a global service. 23 | 24 | Use services to manage global contexts. 25 | e.g., logger object, mlflow client, spark context, ... 26 | """ 27 | 28 | @abc.abstractmethod 29 | def start(self) -> None: 30 | """Start the service.""" 31 | 32 | def stop(self) -> None: 33 | """Stop the service.""" 34 | # does nothing by default 35 | 36 | 37 | class LoggerService(Service): 38 | """Service for logging messages. 39 | 40 | https://loguru.readthedocs.io/en/stable/api/logger.html 41 | 42 | Parameters: 43 | sink (str): logging output. 44 | level (str): logging level. 45 | format (str): logging format. 46 | colorize (bool): colorize output. 47 | serialize (bool): convert to JSON. 48 | backtrace (bool): enable exception trace. 49 | diagnose (bool): enable variable display. 50 | catch (bool): catch errors during log handling. 
class AlertsService(Service):
    """Service for sending notifications.

    Require libnotify-bin on Linux systems.

    In production, use with Slack, Discord, or emails.

    https://plyer.readthedocs.io/en/latest/api.html#plyer.facades.Notification

    Parameters:
        enable (bool): use notifications or print.
        app_name (str): name of the application.
        timeout (int | None): timeout in secs.
    """

    enable: bool = True
    app_name: str = "Bikes"
    timeout: int | None = None

    @T.override
    def start(self) -> None:
        # nothing to initialize for the notification backend
        pass

    def notify(self, title: str, message: str) -> None:
        """Send a notification to the system.

        Args:
            title (str): title of the notification.
            message (str): message of the notification.
        """
        if not self.enable:
            self._print(title=title, message=message)
            return
        try:
            notification.notify(
                title=title,
                message=message,
                app_name=self.app_name,
                timeout=self.timeout,
            )
        except NotImplementedError:
            # desktop notifications unavailable: fall back to stdout
            print("Notifications are not supported on this system.")
            self._print(title=title, message=message)

    def _print(self, title: str, message: str) -> None:
        """Print a notification to the system.

        Args:
            title (str): title of the notification.
            message (str): message of the notification.
        """
        text = f"[{self.app_name}] {title}: {message}"
        print(text)
163 | tags (dict[str, T.Any] | None): tags for the run. 164 | log_system_metrics (bool | None): enable system metrics logging. 165 | """ 166 | 167 | name: str 168 | description: str | None = None 169 | tags: dict[str, T.Any] | None = None 170 | log_system_metrics: bool | None = True 171 | 172 | # server uri 173 | tracking_uri: str = "./mlruns" 174 | registry_uri: str = "./mlruns" 175 | # experiment 176 | experiment_name: str = "bikes" 177 | # registry 178 | registry_name: str = "bikes" 179 | # autolog 180 | autolog_disable: bool = False 181 | autolog_disable_for_unsupported_versions: bool = False 182 | autolog_exclusive: bool = False 183 | autolog_log_input_examples: bool = True 184 | autolog_log_model_signatures: bool = True 185 | autolog_log_models: bool = False 186 | autolog_log_datasets: bool = False 187 | autolog_silent: bool = False 188 | 189 | @T.override 190 | def start(self) -> None: 191 | # server uri 192 | mlflow.set_tracking_uri(uri=self.tracking_uri) 193 | mlflow.set_registry_uri(uri=self.registry_uri) 194 | # experiment 195 | mlflow.set_experiment(experiment_name=self.experiment_name) 196 | # autolog 197 | mlflow.autolog( 198 | disable=self.autolog_disable, 199 | disable_for_unsupported_versions=self.autolog_disable_for_unsupported_versions, 200 | exclusive=self.autolog_exclusive, 201 | log_input_examples=self.autolog_log_input_examples, 202 | log_model_signatures=self.autolog_log_model_signatures, 203 | log_datasets=self.autolog_log_datasets, 204 | silent=self.autolog_silent, 205 | ) 206 | 207 | @ctx.contextmanager 208 | def run_context(self, run_config: RunConfig) -> T.Generator[mlflow.ActiveRun, None, None]: 209 | """Yield an active Mlflow run and exit it afterwards. 210 | 211 | Args: 212 | run (str): run parameters. 213 | 214 | Yields: 215 | T.Generator[mlflow.ActiveRun, None, None]: active run context. Will be closed at the end of context. 
216 | """ 217 | with mlflow.start_run( 218 | run_name=run_config.name, 219 | tags=run_config.tags, 220 | description=run_config.description, 221 | log_system_metrics=run_config.log_system_metrics, 222 | ) as run: 223 | yield run 224 | 225 | def client(self) -> mt.MlflowClient: 226 | """Return a new Mlflow client. 227 | 228 | Returns: 229 | MlflowClient: the mlflow client. 230 | """ 231 | return mt.MlflowClient(tracking_uri=self.tracking_uri, registry_uri=self.registry_uri) 232 | -------------------------------------------------------------------------------- /src/bikes/jobs/__init__.py: -------------------------------------------------------------------------------- 1 | """High-level jobs of the project.""" 2 | 3 | # %% IMPORTS 4 | 5 | from bikes.jobs.evaluations import EvaluationsJob 6 | from bikes.jobs.explanations import ExplanationsJob 7 | from bikes.jobs.inference import InferenceJob 8 | from bikes.jobs.promotion import PromotionJob 9 | from bikes.jobs.training import TrainingJob 10 | from bikes.jobs.tuning import TuningJob 11 | 12 | # %% TYPES 13 | 14 | JobKind = TuningJob | TrainingJob | PromotionJob | InferenceJob | EvaluationsJob | ExplanationsJob 15 | 16 | # %% EXPORTS 17 | 18 | __all__ = [ 19 | "TuningJob", 20 | "TrainingJob", 21 | "PromotionJob", 22 | "InferenceJob", 23 | "EvaluationsJob", 24 | "ExplanationsJob", 25 | "JobKind", 26 | ] 27 | -------------------------------------------------------------------------------- /src/bikes/jobs/base.py: -------------------------------------------------------------------------------- 1 | """Base for high-level project jobs.""" 2 | 3 | # %% IMPORTS 4 | 5 | import abc 6 | import types as TS 7 | import typing as T 8 | 9 | import pydantic as pdt 10 | 11 | from bikes.io import services 12 | 13 | # %% TYPES 14 | 15 | # Local job variables 16 | Locals = T.Dict[str, T.Any] 17 | 18 | # %% JOBS 19 | 20 | 21 | class Job(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"): 22 | """Base class for a job. 
    def __enter__(self) -> T.Self:
        """Enter the job context.

        Starts the global services in order: logger first (so the other
        services can be logged), then alerts, then mlflow.

        Returns:
            T.Self: return the current object.
        """
        # the logger service must start before anything can be logged
        self.logger_service.start()
        logger = self.logger_service.logger()
        logger.debug("[START] Logger service: {}", self.logger_service)
        logger.debug("[START] Alerts service: {}", self.alerts_service)
        self.alerts_service.start()
        logger.debug("[START] Mlflow service: {}", self.mlflow_service)
        self.mlflow_service.start()
        return self
69 | """ 70 | logger = self.logger_service.logger() 71 | logger.debug("[STOP] Mlflow service: {}", self.mlflow_service) 72 | self.mlflow_service.stop() 73 | logger.debug("[STOP] Alerts service: {}", self.alerts_service) 74 | self.alerts_service.stop() 75 | logger.debug("[STOP] Logger service: {}", self.logger_service) 76 | self.logger_service.stop() 77 | return False # re-raise 78 | 79 | @abc.abstractmethod 80 | def run(self) -> Locals: 81 | """Run the job in context. 82 | 83 | Returns: 84 | Locals: local job variables. 85 | """ 86 | -------------------------------------------------------------------------------- /src/bikes/jobs/evaluations.py: -------------------------------------------------------------------------------- 1 | """Define a job for evaluating registered models with data.""" 2 | 3 | # %% IMPORTS 4 | 5 | import typing as T 6 | 7 | import mlflow 8 | import pandas as pd 9 | import pydantic as pdt 10 | 11 | from bikes.core import metrics as metrics_ 12 | from bikes.core import schemas 13 | from bikes.io import datasets, registries, services 14 | from bikes.jobs import base 15 | 16 | # %% JOBS 17 | 18 | 19 | class EvaluationsJob(base.Job): 20 | """Generate evaluations from a registered model and a dataset. 21 | 22 | Parameters: 23 | run_config (services.MlflowService.RunConfig): mlflow run config. 24 | inputs (datasets.ReaderKind): reader for the inputs data. 25 | targets (datasets.ReaderKind): reader for the targets data. 26 | model_type (str): model type (e.g. "regressor", "classifier"). 27 | alias_or_version (str | int): alias or version for the model. 28 | metrics (metrics_.MetricsKind): metric list to compute. 29 | evaluators (list[str]): list of evaluators to use. 30 | thresholds (dict[str, metrics_.Threshold] | None): metric thresholds. 
    @T.override
    def run(self) -> base.Locals:
        """Evaluate the registered model against the configured dataset.

        Reads and validates inputs/targets, logs their lineage, loads the
        model from the registry, predicts, and runs mlflow.evaluate with the
        configured metrics and thresholds inside an mlflow run context.

        Returns:
            base.Locals: local job variables.
        """
        # services
        # - logger
        logger = self.logger_service.logger()
        logger.info("With logger: {}", logger)
        # - mlflow
        client = self.mlflow_service.client()
        logger.info("With client: {}", client.tracking_uri)
        with self.mlflow_service.run_context(run_config=self.run_config) as run:
            logger.info("With run context: {}", run.info)
            # data
            # - inputs
            logger.info("Read inputs: {}", self.inputs)
            inputs_ = self.inputs.read()  # unchecked!
            inputs = schemas.InputsSchema.check(inputs_)
            logger.debug("- Inputs shape: {}", inputs.shape)
            # - targets
            logger.info("Read targets: {}", self.targets)
            targets_ = self.targets.read()  # unchecked!
            targets = schemas.TargetsSchema.check(targets_)
            logger.debug("- Targets shape: {}", targets.shape)
            # lineage: record where the data came from in the mlflow run
            # - inputs
            logger.info("Log lineage: inputs")
            inputs_lineage = self.inputs.lineage(data=inputs, name="inputs")
            mlflow.log_input(dataset=inputs_lineage, context=self.run_config.name)
            logger.debug("- Inputs lineage: {}", inputs_lineage.to_dict())
            # - targets
            logger.info("Log lineage: targets")
            targets_lineage = self.targets.lineage(
                data=targets, name="targets", targets=schemas.TargetsSchema.cnt
            )
            mlflow.log_input(dataset=targets_lineage, context=self.run_config.name)
            logger.debug("- Targets lineage: {}", targets_lineage.to_dict())
            # model
            logger.info("With model: {}", self.mlflow_service.registry_name)
            model_uri = registries.uri_for_model_alias_or_version(
                name=self.mlflow_service.registry_name,
                alias_or_version=self.alias_or_version,
            )
            logger.debug("- Model URI: {}", model_uri)
            # loader
            logger.info("Load model: {}", self.loader)
            model = self.loader.load(uri=model_uri)
            logger.debug("- Model: {}", model)
            # outputs
            logger.info("Predict outputs: {}", len(inputs))
            outputs = model.predict(inputs=inputs)  # checked
            logger.debug("- Outputs shape: {}", outputs.shape)
            # dataset: single frame combining features, targets, and predictions
            logger.info("Create dataset: inputs & targets & outputs")
            dataset_ = pd.concat([inputs, targets, outputs], axis="columns")
            dataset = mlflow.data.from_pandas(  # type: ignore[attr-defined]
                df=dataset_,
                name="evaluation",
                targets=schemas.TargetsSchema.cnt,
                predictions=schemas.OutputsSchema.prediction,
            )
            logger.debug("- Dataset: {}", dataset.to_dict())
            # metrics: convert project metrics to mlflow extra metrics
            logger.debug("Convert metrics: {}", self.metrics)
            extra_metrics = [metric.to_mlflow() for metric in self.metrics]
            logger.debug("- Extra metrics: {}", extra_metrics)
            # thresholds: convert project thresholds to mlflow validation thresholds
            logger.info("Convert thresholds: {}", self.thresholds)
            validation_thresholds = {
                name: threshold.to_mlflow() for name, threshold in self.thresholds.items()
            }
            logger.debug("- Validation thresholds: {}", validation_thresholds)
            # evaluations
            logger.info("Compute evaluations: {}", self.model_type)
            evaluations = mlflow.evaluate(
                data=dataset,
                model_type=self.model_type,
                evaluators=self.evaluators,
                extra_metrics=extra_metrics,
                validation_thresholds=validation_thresholds,
            )
            logger.debug("- Evaluations metrics: {}", evaluations.metrics)
            # notify
            self.alerts_service.notify(
                title="Evaluations Job Finished",
                message=f"Evaluation metrics: {evaluations.metrics}",
            )
            return locals()
    @T.override
    def run(self) -> base.Locals:
        """Generate and persist model-level and sample-level explanations.

        Returns:
            base.Locals: local job variables.
        """
        # services
        logger = self.logger_service.logger()
        logger.info("With logger: {}", logger)
        # inputs
        logger.info("Read samples: {}", self.inputs_samples)
        inputs_samples = self.inputs_samples.read()  # unchecked!
        inputs_samples = schemas.InputsSchema.check(inputs_samples)
        logger.debug("- Inputs samples shape: {}", inputs_samples.shape)
        # model
        logger.info("With model: {}", self.mlflow_service.registry_name)
        model_uri = registries.uri_for_model_alias_or_version(
            name=self.mlflow_service.registry_name,
            alias_or_version=self.alias_or_version,
        )
        logger.debug("- Model URI: {}", model_uri)
        # loader
        logger.info("Load model: {}", self.loader)
        # unwrap the pyfunc adapter to reach the underlying project model,
        # which is the object exposing explain_model/explain_samples
        model = self.loader.load(uri=model_uri).model.unwrap_python_model().model
        logger.debug("- Model: {}", model)
        # explanations
        # - models
        logger.info("Explain model: {}", model)
        models_explanations = model.explain_model()
        logger.debug("- Models explanations shape: {}", models_explanations.shape)
        # - samples
        logger.info("Explain samples: {}", len(inputs_samples))
        samples_explanations = model.explain_samples(inputs=inputs_samples)
        logger.debug("- Samples explanations shape: {}", samples_explanations.shape)
        # write
        # - model
        logger.info("Write models explanations: {}", self.models_explanations)
        self.models_explanations.write(data=models_explanations)
        # - samples
        logger.info("Write samples explanations: {}", self.samples_explanations)
        self.samples_explanations.write(data=samples_explanations)
        # notify
        self.alerts_service.notify(
            title="Explanations Job Finished",
            message=f"Features Count: {len(models_explanations)}",
        )
        return locals()
45 | inputs = schemas.InputsSchema.check(inputs_) 46 | logger.debug("- Inputs shape: {}", inputs.shape) 47 | # model 48 | logger.info("With model: {}", self.mlflow_service.registry_name) 49 | model_uri = registries.uri_for_model_alias_or_version( 50 | name=self.mlflow_service.registry_name, 51 | alias_or_version=self.alias_or_version, 52 | ) 53 | logger.debug("- Model URI: {}", model_uri) 54 | # loader 55 | logger.info("Load model: {}", self.loader) 56 | model = self.loader.load(uri=model_uri) 57 | logger.debug("- Model: {}", model) 58 | # outputs 59 | logger.info("Predict outputs: {}", len(inputs)) 60 | outputs = model.predict(inputs=inputs) # checked 61 | logger.debug("- Outputs shape: {}", outputs.shape) 62 | # write 63 | logger.info("Write outputs: {}", self.outputs) 64 | self.outputs.write(data=outputs) 65 | # notify 66 | self.alerts_service.notify( 67 | title="Inference Job Finished", message=f"Outputs Shape: {outputs.shape}" 68 | ) 69 | return locals() 70 | -------------------------------------------------------------------------------- /src/bikes/jobs/promotion.py: -------------------------------------------------------------------------------- 1 | """Define a job for promoting a registered model version with an alias.""" 2 | 3 | # %% IMPORTS 4 | 5 | import typing as T 6 | 7 | from bikes.jobs import base 8 | 9 | # %% JOBS 10 | 11 | 12 | class PromotionJob(base.Job): 13 | """Define a job for promoting a registered model version with an alias. 14 | 15 | https://mlflow.org/docs/latest/model-registry.html#concepts 16 | 17 | Parameters: 18 | alias (str): the mlflow alias to transition the registered model version. 19 | version (int | None): the model version to transition (use None for latest). 
20 | """ 21 | 22 | KIND: T.Literal["PromotionJob"] = "PromotionJob" 23 | 24 | alias: str = "Champion" 25 | version: int | None = None 26 | 27 | @T.override 28 | def run(self) -> base.Locals: 29 | # services 30 | # - logger 31 | logger = self.logger_service.logger() 32 | logger.info("With logger: {}", logger) 33 | # - mlflow 34 | client = self.mlflow_service.client() 35 | logger.info("With client: {}", client) 36 | name = self.mlflow_service.registry_name 37 | # version 38 | if self.version is None: # use the latest model version 39 | version = client.search_model_versions( 40 | f"name='{name}'", max_results=1, order_by=["version_number DESC"] 41 | )[0].version 42 | else: 43 | version = self.version 44 | logger.info("From version: {}", version) 45 | # alias 46 | logger.info("To alias: {}", self.alias) 47 | # promote 48 | logger.info("Promote model: {}", name) 49 | client.set_registered_model_alias(name=name, alias=self.alias, version=version) 50 | model_version = client.get_model_version_by_alias(name=name, alias=self.alias) 51 | logger.debug("- Model version: {}", model_version) 52 | # notify 53 | self.alerts_service.notify( 54 | title="Promotion Job Finished", 55 | message=f"Version: {model_version.version} @ {self.alias}", 56 | ) 57 | return locals() 58 | -------------------------------------------------------------------------------- /src/bikes/jobs/training.py: -------------------------------------------------------------------------------- 1 | """Define a job for training and registring a single AI/ML model.""" 2 | 3 | # %% IMPORTS 4 | 5 | import typing as T 6 | 7 | import mlflow 8 | import pydantic as pdt 9 | 10 | from bikes.core import metrics as metrics_ 11 | from bikes.core import models, schemas 12 | from bikes.io import datasets, registries, services 13 | from bikes.jobs import base 14 | from bikes.utils import signers, splitters 15 | 16 | # %% JOBS 17 | 18 | 19 | class TrainingJob(base.Job): 20 | """Train and register a single AI/ML model. 
21 | 22 | Parameters: 23 | run_config (services.MlflowService.RunConfig): mlflow run config. 24 | inputs (datasets.ReaderKind): reader for the inputs data. 25 | targets (datasets.ReaderKind): reader for the targets data. 26 | model (models.ModelKind): machine learning model to train. 27 | metrics (metrics_.MetricsKind): metric list to compute. 28 | splitter (splitters.SplitterKind): data sets splitter. 29 | saver (registries.SaverKind): model saver. 30 | signer (signers.SignerKind): model signer. 31 | registry (registries.RegisterKind): model register. 32 | """ 33 | 34 | KIND: T.Literal["TrainingJob"] = "TrainingJob" 35 | 36 | # Run 37 | run_config: services.MlflowService.RunConfig = services.MlflowService.RunConfig(name="Training") 38 | # Data 39 | inputs: datasets.ReaderKind = pdt.Field(..., discriminator="KIND") 40 | targets: datasets.ReaderKind = pdt.Field(..., discriminator="KIND") 41 | # Model 42 | model: models.ModelKind = pdt.Field(models.BaselineSklearnModel(), discriminator="KIND") 43 | # Metrics 44 | metrics: metrics_.MetricsKind = [metrics_.SklearnMetric()] 45 | # Splitter 46 | splitter: splitters.SplitterKind = pdt.Field( 47 | splitters.TrainTestSplitter(), discriminator="KIND" 48 | ) 49 | # Saver 50 | saver: registries.SaverKind = pdt.Field(registries.CustomSaver(), discriminator="KIND") 51 | # Signer 52 | signer: signers.SignerKind = pdt.Field(signers.InferSigner(), discriminator="KIND") 53 | # Registrer 54 | # - avoid shadowing pydantic `register` pydantic function 55 | registry: registries.RegisterKind = pdt.Field(registries.MlflowRegister(), discriminator="KIND") 56 | 57 | @T.override 58 | def run(self) -> base.Locals: 59 | # services 60 | # - logger 61 | logger = self.logger_service.logger() 62 | logger.info("With logger: {}", logger) 63 | # - mlflow 64 | client = self.mlflow_service.client() 65 | logger.info("With client: {}", client.tracking_uri) 66 | with self.mlflow_service.run_context(run_config=self.run_config) as run: 67 | 
logger.info("With run context: {}", run.info) 68 | # data 69 | # - inputs 70 | logger.info("Read inputs: {}", self.inputs) 71 | inputs_ = self.inputs.read() # unchecked! 72 | inputs = schemas.InputsSchema.check(inputs_) 73 | logger.debug("- Inputs shape: {}", inputs.shape) 74 | # - targets 75 | logger.info("Read targets: {}", self.targets) 76 | targets_ = self.targets.read() # unchecked! 77 | targets = schemas.TargetsSchema.check(targets_) 78 | logger.debug("- Targets shape: {}", targets.shape) 79 | # lineage 80 | # - inputs 81 | logger.info("Log lineage: inputs") 82 | inputs_lineage = self.inputs.lineage(data=inputs, name="inputs") 83 | mlflow.log_input(dataset=inputs_lineage, context=self.run_config.name) 84 | logger.debug("- Inputs lineage: {}", inputs_lineage.to_dict()) 85 | # - targets 86 | logger.info("Log lineage: targets") 87 | targets_lineage = self.targets.lineage( 88 | data=targets, name="targets", targets=schemas.TargetsSchema.cnt 89 | ) 90 | mlflow.log_input(dataset=targets_lineage, context=self.run_config.name) 91 | logger.debug("- Targets lineage: {}", targets_lineage.to_dict()) 92 | # splitter 93 | logger.info("With splitter: {}", self.splitter) 94 | # - index 95 | train_index, test_index = next(self.splitter.split(inputs=inputs, targets=targets)) 96 | # - inputs 97 | inputs_train = T.cast(schemas.Inputs, inputs.iloc[train_index]) 98 | inputs_test = T.cast(schemas.Inputs, inputs.iloc[test_index]) 99 | logger.debug("- Inputs train shape: {}", inputs_train.shape) 100 | logger.debug("- Inputs test shape: {}", inputs_test.shape) 101 | # - targets 102 | targets_train = T.cast(schemas.Targets, targets.iloc[train_index]) 103 | targets_test = T.cast(schemas.Targets, targets.iloc[test_index]) 104 | logger.debug("- Targets train shape: {}", targets_train.shape) 105 | logger.debug("- Targets test shape: {}", targets_test.shape) 106 | # model 107 | logger.info("Fit model: {}", self.model) 108 | self.model.fit(inputs=inputs_train, targets=targets_train) 109 | # 
outputs 110 | logger.info("Predict outputs: {}", len(inputs_test)) 111 | outputs_test = self.model.predict(inputs=inputs_test) 112 | logger.debug("- Outputs test shape: {}", outputs_test.shape) 113 | # metrics 114 | for i, metric in enumerate(self.metrics, start=1): 115 | logger.info("{}. Compute metric: {}", i, metric) 116 | score = metric.score(targets=targets_test, outputs=outputs_test) 117 | client.log_metric(run_id=run.info.run_id, key=metric.name, value=score) 118 | logger.debug("- Metric score: {}", score) 119 | # signer 120 | logger.info("Sign model: {}", self.signer) 121 | model_signature = self.signer.sign(inputs=inputs, outputs=outputs_test) 122 | logger.debug("- Model signature: {}", model_signature.to_dict()) 123 | # saver 124 | logger.info("Save model: {}", self.saver) 125 | model_info = self.saver.save( 126 | model=self.model, signature=model_signature, input_example=inputs 127 | ) 128 | logger.debug("- Model URI: {}", model_info.model_uri) 129 | # register 130 | logger.info("Register model: {}", self.registry) 131 | model_version = self.registry.register( 132 | name=self.mlflow_service.registry_name, model_uri=model_info.model_uri 133 | ) 134 | logger.debug("- Model version: {}", model_version) 135 | # notify 136 | self.alerts_service.notify( 137 | title="Training Job Finished", 138 | message=f"Model version: {model_version.version}", 139 | ) 140 | return locals() 141 | -------------------------------------------------------------------------------- /src/bikes/jobs/tuning.py: -------------------------------------------------------------------------------- 1 | """Define a job for finding the best hyperparameters for a model.""" 2 | 3 | # %% IMPORTS 4 | 5 | import typing as T 6 | 7 | import mlflow 8 | import pydantic as pdt 9 | 10 | from bikes.core import metrics, models, schemas 11 | from bikes.io import datasets, services 12 | from bikes.jobs import base 13 | from bikes.utils import searchers, splitters 14 | 15 | # %% JOBS 16 | 17 | 18 | class 
TuningJob(base.Job): 19 | """Find the best hyperparameters for a model. 20 | 21 | Parameters: 22 | run_config (services.MlflowService.RunConfig): mlflow run config. 23 | inputs (datasets.ReaderKind): reader for the inputs data. 24 | targets (datasets.ReaderKind): reader for the targets data. 25 | model (models.ModelKind): machine learning model to tune. 26 | metric (metrics.MetricKind): tuning metric to optimize. 27 | splitter (splitters.SplitterKind): data sets splitter. 28 | searcher: (searchers.SearcherKind): hparams searcher. 29 | """ 30 | 31 | KIND: T.Literal["TuningJob"] = "TuningJob" 32 | 33 | # Run 34 | run_config: services.MlflowService.RunConfig = services.MlflowService.RunConfig(name="Tuning") 35 | # Data 36 | inputs: datasets.ReaderKind = pdt.Field(..., discriminator="KIND") 37 | targets: datasets.ReaderKind = pdt.Field(..., discriminator="KIND") 38 | # Model 39 | model: models.ModelKind = pdt.Field(models.BaselineSklearnModel(), discriminator="KIND") 40 | # Metric 41 | metric: metrics.MetricKind = pdt.Field(metrics.SklearnMetric(), discriminator="KIND") 42 | # splitter 43 | splitter: splitters.SplitterKind = pdt.Field( 44 | splitters.TimeSeriesSplitter(), discriminator="KIND" 45 | ) 46 | # Searcher 47 | searcher: searchers.SearcherKind = pdt.Field( 48 | searchers.GridCVSearcher( 49 | param_grid={ 50 | "max_depth": [3, 5, 7], 51 | } 52 | ), 53 | discriminator="KIND", 54 | ) 55 | 56 | @T.override 57 | def run(self) -> base.Locals: 58 | """Run the tuning job in context.""" 59 | # services 60 | # - logger 61 | logger = self.logger_service.logger() 62 | logger.info("With logger: {}", logger) 63 | with self.mlflow_service.run_context(run_config=self.run_config) as run: 64 | logger.info("With run context: {}", run.info) 65 | # data 66 | # - inputs 67 | logger.info("Read inputs: {}", self.inputs) 68 | inputs_ = self.inputs.read() # unchecked! 
69 | inputs = schemas.InputsSchema.check(inputs_) 70 | logger.debug("- Inputs shape: {}", inputs.shape) 71 | # - targets 72 | logger.info("Read targets: {}", self.targets) 73 | targets_ = self.targets.read() # unchecked! 74 | targets = schemas.TargetsSchema.check(targets_) 75 | logger.debug("- Targets shape: {}", targets.shape) 76 | # lineage 77 | # - inputs 78 | logger.info("Log lineage: inputs") 79 | inputs_lineage = self.inputs.lineage(data=inputs, name="inputs") 80 | mlflow.log_input(dataset=inputs_lineage, context=self.run_config.name) 81 | logger.debug("- Inputs lineage: {}", inputs_lineage.to_dict()) 82 | # - targets 83 | logger.info("Log lineage: targets") 84 | targets_lineage = self.targets.lineage( 85 | data=targets, name="targets", targets=schemas.TargetsSchema.cnt 86 | ) 87 | mlflow.log_input(dataset=targets_lineage, context=self.run_config.name) 88 | logger.debug("- Targets lineage: {}", targets_lineage.to_dict()) 89 | # model 90 | logger.info("With model: {}", self.model) 91 | # metric 92 | logger.info("With metric: {}", self.metric) 93 | # splitter 94 | logger.info("With splitter: {}", self.splitter) 95 | # searcher 96 | logger.info("Run searcher: {}", self.searcher) 97 | results, best_score, best_params = self.searcher.search( 98 | model=self.model, 99 | metric=self.metric, 100 | inputs=inputs, 101 | targets=targets, 102 | cv=self.splitter, 103 | ) 104 | logger.debug("- Results: {}", results.shape) 105 | logger.debug("- Best Score: {}", best_score) 106 | logger.debug("- Best Params: {}", best_params) 107 | # notify 108 | self.alerts_service.notify( 109 | title="Tuning Job Finished", message=f"Best score: {best_score}" 110 | ) 111 | return locals() 112 | -------------------------------------------------------------------------------- /src/bikes/scripts.py: -------------------------------------------------------------------------------- 1 | """Scripts for the CLI application.""" 2 | 3 | # ruff: noqa: E402 4 | 5 | # %% WARNINGS 6 | 7 | import warnings 
8 | 9 | # disable annoying mlflow warnings 10 | warnings.filterwarnings(action="ignore", category=UserWarning) 11 | 12 | # %% IMPORTS 13 | 14 | import argparse 15 | import json 16 | import sys 17 | 18 | from bikes import settings 19 | from bikes.io import configs 20 | 21 | # %% PARSERS 22 | 23 | parser = argparse.ArgumentParser(description="Run an AI/ML job from YAML/JSON configs.") 24 | parser.add_argument("files", nargs="*", help="Config files for the job (local path only).") 25 | parser.add_argument("-e", "--extras", nargs="*", default=[], help="Config strings for the job.") 26 | parser.add_argument("-s", "--schema", action="store_true", help="Print settings schema and exit.") 27 | 28 | # %% SCRIPTS 29 | 30 | 31 | def main(argv: list[str] | None = None) -> int: 32 | """Main script for the application.""" 33 | args = parser.parse_args(argv) 34 | if args.schema: 35 | schema = settings.MainSettings.model_json_schema() 36 | json.dump(schema, sys.stdout, indent=4) 37 | return 0 38 | files = [configs.parse_file(file) for file in args.files] 39 | strings = [configs.parse_string(string) for string in args.extras] 40 | if len(files) == 0 and len(strings) == 0: 41 | raise RuntimeError("No configs provided.") 42 | config = configs.merge_configs([*files, *strings]) 43 | object_ = configs.to_object(config) # python object 44 | setting = settings.MainSettings.model_validate(object_) 45 | with setting.job as runner: 46 | runner.run() 47 | return 0 48 | -------------------------------------------------------------------------------- /src/bikes/settings.py: -------------------------------------------------------------------------------- 1 | """Define settings for the application.""" 2 | 3 | # %% IMPORTS 4 | 5 | import pydantic as pdt 6 | import pydantic_settings as pdts 7 | 8 | from bikes import jobs 9 | 10 | # %% SETTINGS 11 | 12 | 13 | class Settings(pdts.BaseSettings, strict=True, frozen=True, extra="forbid"): 14 | """Base class for application settings. 
15 | 16 | Use settings to provide high-level preferences. 17 | i.e., to separate settings from provider (e.g., CLI). 18 | """ 19 | 20 | 21 | class MainSettings(Settings): 22 | """Main settings of the application. 23 | 24 | Parameters: 25 | job (jobs.JobKind): job to run. 26 | """ 27 | 28 | job: jobs.JobKind = pdt.Field(..., discriminator="KIND") 29 | -------------------------------------------------------------------------------- /src/bikes/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Helper components of the project.""" 2 | -------------------------------------------------------------------------------- /src/bikes/utils/searchers.py: -------------------------------------------------------------------------------- 1 | """Find the best hyperparameters for a model.""" 2 | 3 | # %% IMPORTS 4 | 5 | import abc 6 | import typing as T 7 | 8 | import pandas as pd 9 | import pydantic as pdt 10 | from sklearn import model_selection 11 | 12 | from bikes.core import metrics, models, schemas 13 | from bikes.utils import splitters 14 | 15 | # %% TYPES 16 | 17 | # Grid of model params 18 | Grid = dict[models.ParamKey, list[models.ParamValue]] 19 | 20 | # Results of a model search 21 | Results = tuple[ 22 | T.Annotated[pd.DataFrame, "details"], 23 | T.Annotated[float, "best score"], 24 | T.Annotated[models.Params, "best params"], 25 | ] 26 | 27 | # Cross-validation options for searchers 28 | CrossValidation = int | splitters.TrainTestSplits | splitters.Splitter 29 | 30 | # %% SEARCHERS 31 | 32 | 33 | class Searcher(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"): 34 | """Base class for a searcher. 35 | 36 | Use searcher to fine-tune models. 37 | i.e., to find the best model params. 38 | 39 | Parameters: 40 | param_grid (Grid): mapping of param key -> values. 
41 | """ 42 | 43 | KIND: str 44 | 45 | param_grid: Grid 46 | 47 | @abc.abstractmethod 48 | def search( 49 | self, 50 | model: models.Model, 51 | metric: metrics.Metric, 52 | inputs: schemas.Inputs, 53 | targets: schemas.Targets, 54 | cv: CrossValidation, 55 | ) -> Results: 56 | """Search the best model for the given inputs and targets. 57 | 58 | Args: 59 | model (models.Model): AI/ML model to fine-tune. 60 | metric (metrics.Metric): main metric to optimize. 61 | inputs (schemas.Inputs): model inputs for tuning. 62 | targets (schemas.Targets): model targets for tuning. 63 | cv (CrossValidation): choice for cross-fold validation. 64 | 65 | Returns: 66 | Results: all the results of the searcher execution process. 67 | """ 68 | 69 | 70 | class GridCVSearcher(Searcher): 71 | """Grid searcher with cross-fold validation. 72 | 73 | Convention: metric returns higher values for better models. 74 | 75 | Parameters: 76 | n_jobs (int, optional): number of jobs to run in parallel. 77 | refit (bool): refit the model after the tuning. 78 | verbose (int): set the searcher verbosity level. 79 | error_score (str | float): strategy or value on error. 80 | return_train_score (bool): include train scores if True. 
81 | """ 82 | 83 | KIND: T.Literal["GridCVSearcher"] = "GridCVSearcher" 84 | 85 | n_jobs: int | None = None 86 | refit: bool = True 87 | verbose: int = 3 88 | error_score: str | float = "raise" 89 | return_train_score: bool = False 90 | 91 | @T.override 92 | def search( 93 | self, 94 | model: models.Model, 95 | metric: metrics.Metric, 96 | inputs: schemas.Inputs, 97 | targets: schemas.Targets, 98 | cv: CrossValidation, 99 | ) -> Results: 100 | searcher = model_selection.GridSearchCV( 101 | estimator=model, 102 | scoring=metric.scorer, 103 | cv=cv, 104 | param_grid=self.param_grid, 105 | n_jobs=self.n_jobs, 106 | refit=self.refit, 107 | verbose=self.verbose, 108 | error_score=self.error_score, 109 | return_train_score=self.return_train_score, 110 | ) 111 | searcher.fit(inputs, targets) 112 | results = pd.DataFrame(searcher.cv_results_) 113 | return results, searcher.best_score_, searcher.best_params_ 114 | 115 | 116 | SearcherKind = GridCVSearcher 117 | -------------------------------------------------------------------------------- /src/bikes/utils/signers.py: -------------------------------------------------------------------------------- 1 | """Generate signatures for AI/ML models.""" 2 | 3 | # %% IMPORTS 4 | 5 | import abc 6 | import typing as T 7 | 8 | import mlflow 9 | import pydantic as pdt 10 | from mlflow.models import signature as ms 11 | 12 | from bikes.core import schemas 13 | 14 | # %% TYPES 15 | 16 | Signature: T.TypeAlias = ms.ModelSignature 17 | 18 | # %% SIGNERS 19 | 20 | 21 | class Signer(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"): 22 | """Base class for generating model signatures. 23 | 24 | Allow switching between model signing strategies. 25 | e.g., automatic inference, manual model signature, ... 
26 | 27 | https://mlflow.org/docs/latest/models.html#model-signature-and-input-example 28 | """ 29 | 30 | KIND: str 31 | 32 | @abc.abstractmethod 33 | def sign(self, inputs: schemas.Inputs, outputs: schemas.Outputs) -> Signature: 34 | """Generate a model signature from its inputs/outputs. 35 | 36 | Args: 37 | inputs (schemas.Inputs): inputs data. 38 | outputs (schemas.Outputs): outputs data. 39 | 40 | Returns: 41 | Signature: signature of the model. 42 | """ 43 | 44 | 45 | class InferSigner(Signer): 46 | """Generate model signatures from inputs/outputs data.""" 47 | 48 | KIND: T.Literal["InferSigner"] = "InferSigner" 49 | 50 | @T.override 51 | def sign(self, inputs: schemas.Inputs, outputs: schemas.Outputs) -> Signature: 52 | return mlflow.models.infer_signature(model_input=inputs, model_output=outputs) 53 | 54 | 55 | SignerKind = InferSigner 56 | -------------------------------------------------------------------------------- /src/bikes/utils/splitters.py: -------------------------------------------------------------------------------- 1 | """Split dataframes into subsets (e.g., train/valid/test).""" 2 | 3 | # %% IMPORTS 4 | 5 | import abc 6 | import typing as T 7 | 8 | import numpy as np 9 | import numpy.typing as npt 10 | import pydantic as pdt 11 | from sklearn import model_selection 12 | 13 | from bikes.core import schemas 14 | 15 | # %% TYPES 16 | 17 | Index = npt.NDArray[np.int64] 18 | TrainTestIndex = tuple[Index, Index] 19 | TrainTestSplits = T.Iterator[TrainTestIndex] 20 | 21 | # %% SPLITTERS 22 | 23 | 24 | class Splitter(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"): 25 | """Base class for a splitter. 26 | 27 | Use splitters to split data in sets. 28 | e.g., split between a train/test subsets. 
29 | 30 | # https://scikit-learn.org/stable/glossary.html#term-CV-splitter 31 | """ 32 | 33 | KIND: str 34 | 35 | @abc.abstractmethod 36 | def split( 37 | self, 38 | inputs: schemas.Inputs, 39 | targets: schemas.Targets, 40 | groups: Index | None = None, 41 | ) -> TrainTestSplits: 42 | """Split a dataframe into subsets. 43 | 44 | Args: 45 | inputs (schemas.Inputs): model inputs. 46 | targets (schemas.Targets): model targets. 47 | groups (Index | None, optional): group labels. 48 | 49 | Returns: 50 | TrainTestSplits: iterator over the dataframe train/test splits. 51 | """ 52 | 53 | @abc.abstractmethod 54 | def get_n_splits( 55 | self, 56 | inputs: schemas.Inputs, 57 | targets: schemas.Targets, 58 | groups: Index | None = None, 59 | ) -> int: 60 | """Get the number of splits generated. 61 | 62 | Args: 63 | inputs (schemas.Inputs): models inputs. 64 | targets (schemas.Targets): model targets. 65 | groups (Index | None, optional): group labels. 66 | 67 | Returns: 68 | int: number of splits generated. 69 | """ 70 | 71 | 72 | class TrainTestSplitter(Splitter): 73 | """Split a dataframe into a train and test set. 74 | 75 | Parameters: 76 | shuffle (bool): shuffle the dataset. Default is False. 77 | test_size (int | float): number/ratio for the test set. 78 | random_state (int): random state for the splitter object. 
79 | """ 80 | 81 | KIND: T.Literal["TrainTestSplitter"] = "TrainTestSplitter" 82 | 83 | shuffle: bool = False # required (time sensitive) 84 | test_size: int | float = 24 * 30 * 2 # 2 months 85 | random_state: int = 42 86 | 87 | @T.override 88 | def split( 89 | self, 90 | inputs: schemas.Inputs, 91 | targets: schemas.Targets, 92 | groups: Index | None = None, 93 | ) -> TrainTestSplits: 94 | index = np.arange(len(inputs)) # return integer position 95 | train_index, test_index = model_selection.train_test_split( 96 | index, 97 | shuffle=self.shuffle, 98 | test_size=self.test_size, 99 | random_state=self.random_state, 100 | ) 101 | yield train_index, test_index 102 | 103 | @T.override 104 | def get_n_splits( 105 | self, 106 | inputs: schemas.Inputs, 107 | targets: schemas.Targets, 108 | groups: Index | None = None, 109 | ) -> int: 110 | return 1 111 | 112 | 113 | class TimeSeriesSplitter(Splitter): 114 | """Split a dataframe into fixed time series subsets. 115 | 116 | Parameters: 117 | gap (int): gap between splits. 118 | n_splits (int): number of split to generate. 119 | test_size (int | float): number or ratio for the test dataset. 
120 | """ 121 | 122 | KIND: T.Literal["TimeSeriesSplitter"] = "TimeSeriesSplitter" 123 | 124 | gap: int = 0 125 | n_splits: int = 4 126 | test_size: int | float = 24 * 30 * 2 # 2 months 127 | 128 | @T.override 129 | def split( 130 | self, 131 | inputs: schemas.Inputs, 132 | targets: schemas.Targets, 133 | groups: Index | None = None, 134 | ) -> TrainTestSplits: 135 | splitter = model_selection.TimeSeriesSplit( 136 | n_splits=self.n_splits, test_size=self.test_size, gap=self.gap 137 | ) 138 | yield from splitter.split(inputs) 139 | 140 | @T.override 141 | def get_n_splits( 142 | self, 143 | inputs: schemas.Inputs, 144 | targets: schemas.Targets, 145 | groups: Index | None = None, 146 | ) -> int: 147 | return self.n_splits 148 | 149 | 150 | SplitterKind = TrainTestSplitter | TimeSeriesSplitter 151 | -------------------------------------------------------------------------------- /tasks/check.just: -------------------------------------------------------------------------------- 1 | # run check tasks 2 | [group('check')] 3 | check: check-code check-type check-format check-security check-coverage 4 | 5 | # check code quality 6 | [group('check')] 7 | check-code: 8 | uv run ruff check {{SOURCES}} {{TESTS}} 9 | 10 | # check code coverage 11 | [group('check')] 12 | check-coverage numprocesses="auto" cov_fail_under="80": 13 | uv run pytest --numprocesses={{numprocesses}} --cov={{SOURCES}} --cov-fail-under={{cov_fail_under}} {{TESTS}} 14 | 15 | # check code format 16 | [group('check')] 17 | check-format: 18 | uv run ruff format --check {{SOURCES}} {{TESTS}} 19 | 20 | # check code security 21 | [group('check')] 22 | check-security: 23 | uv run bandit --recursive --configfile=pyproject.toml {{SOURCES}} 24 | 25 | # check unit tests 26 | [group('check')] 27 | check-test numprocesses="auto": 28 | uv run pytest --numprocesses={{numprocesses}} {{TESTS}} 29 | 30 | # check code typing 31 | [group('check')] 32 | check-type: 33 | uv run mypy {{SOURCES}} {{TESTS}} 34 | 
-------------------------------------------------------------------------------- /tasks/clean.just: -------------------------------------------------------------------------------- 1 | # run clean tasks 2 | [group('clean')] 3 | clean: clean-build clean-cache clean-constraints clean-coverage clean-docs clean-environment clean-mlruns clean-mypy clean-outputs clean-pytest clean-python clean-requirements clean-ruff 4 | 5 | # clean build folders 6 | [group('clean')] 7 | clean-build: 8 | rm -rf dist/ 9 | rm -rf build/ 10 | 11 | # clean cache folder 12 | [group('clean')] 13 | clean-cache: 14 | rm -rf .cache/ 15 | 16 | # clean constraints file 17 | [group('clean')] 18 | clean-constraints: 19 | rm -rf constraints.txt 20 | 21 | # clean coverage files 22 | [group('clean')] 23 | clean-coverage: 24 | rm -rf .coverage* 25 | 26 | # clean docs folder 27 | [group('clean')] 28 | clean-docs: 29 | rm -rf docs/ 30 | 31 | # clean environment file 32 | [group('clean')] 33 | clean-environment: 34 | rm -f python_env.yaml 35 | 36 | # clean mlruns folder 37 | [group('clean')] 38 | clean-mlruns: 39 | rm -rf mlruns/* 40 | 41 | # clean mypy folders 42 | [group('clean')] 43 | clean-mypy: 44 | rm -rf .mypy_cache/ 45 | 46 | # clean outputs folder 47 | [group('clean')] 48 | clean-outputs: 49 | rm -rf outputs/* 50 | 51 | # clean pytest cache 52 | [group('clean')] 53 | clean-pytest: 54 | rm -rf .pytest_cache/ 55 | 56 | # clean python caches 57 | [group('clean')] 58 | clean-python: 59 | find . -type f -name '*.py[co]' -delete 60 | find . 
-type d -name __pycache__ -exec rm -r {} \+ 61 | 62 | # clean requirements file 63 | [group('clean')] 64 | clean-requirements: 65 | rm -f requirements.txt 66 | 67 | # clean ruff cache 68 | [group('clean')] 69 | clean-ruff: 70 | rm -rf .ruff_cache/ 71 | 72 | # clean venv folder 73 | [confirm] 74 | [group('clean')] 75 | clean-venv: 76 | rm -rf .venv/ 77 | -------------------------------------------------------------------------------- /tasks/commit.just: -------------------------------------------------------------------------------- 1 | # bump package 2 | [group('commit')] 3 | commit-bump: 4 | uv run cz bump 5 | 6 | # commit package 7 | [group('commit')] 8 | commit-files: 9 | uv run cz commit 10 | 11 | # get commit info 12 | [group('commit')] 13 | commit-info: 14 | uv run cz info 15 | -------------------------------------------------------------------------------- /tasks/doc.just: -------------------------------------------------------------------------------- 1 | # run doc tasks 2 | [group('doc')] 3 | doc: doc-build 4 | 5 | # build documentation 6 | [group('doc')] 7 | doc-build format="google" output="docs": clean-docs 8 | uv run pdoc --docformat={{format}} --output-directory={{output}} {{SOURCES}}/{{PACKAGE}} 9 | 10 | # serve documentation 11 | [group('doc')] 12 | doc-serve format="google" port="8088": 13 | uv run pdoc --docformat={{format}} --port={{port}} {{SOURCES}}/{{PACKAGE}} 14 | -------------------------------------------------------------------------------- /tasks/docker.just: -------------------------------------------------------------------------------- 1 | # run docker tasks 2 | [group('docker')] 3 | docker: docker-build docker-run 4 | 5 | # build docker image 6 | [group('docker')] 7 | docker-build tag="latest": package-build 8 | docker build --tag={{REPOSITORY}}:{{tag}} . 
9 | 10 | # start docker compose 11 | [group('docker')] 12 | docker-compose: 13 | docker compose up 14 | 15 | # run latest docker image 16 | [group('docker')] 17 | docker-run tag="latest": 18 | docker run --rm {{REPOSITORY}}:{{tag}} 19 | -------------------------------------------------------------------------------- /tasks/format.just: -------------------------------------------------------------------------------- 1 | # run format tasks 2 | [group('format')] 3 | format: format-import format-source 4 | 5 | # format code import 6 | [group('format')] 7 | format-import: 8 | uv run ruff check --select=I --fix {{SOURCES}} {{TESTS}} 9 | 10 | # format code source 11 | [group('format')] 12 | format-source: 13 | uv run ruff format {{SOURCES}} {{TESTS}} 14 | -------------------------------------------------------------------------------- /tasks/install.just: -------------------------------------------------------------------------------- 1 | # run install tasks 2 | [group('install')] 3 | install: install-project install-hooks 4 | 5 | # install git hooks 6 | [group('install')] 7 | install-hooks: 8 | uv run pre-commit install --hook-type=pre-push 9 | uv run pre-commit install --hook-type=commit-msg 10 | 11 | # install the project 12 | [group('install')] 13 | install-project: 14 | uv sync --all-groups 15 | 16 | # install github rulesets 17 | [group('install')] 18 | install-rulesets: 19 | #!/usr/bin/env bash 20 | set -euo pipefail 21 | repo=$(gh repo view --json=name --jq=.name) 22 | owner=$(gh repo view --json=owner --jq=.owner.login) 23 | gh api --method POST -H "Accept: application/vnd.github+json" \ 24 | "/repos/$owner/$repo/rulesets" --input=".github/rulesets/main.json" 25 | -------------------------------------------------------------------------------- /tasks/mlflow.just: -------------------------------------------------------------------------------- 1 | # run mlflow tasks 2 | [group('mlflow')] 3 | mlflow: mlflow-doctor mlflow-serve 4 | 5 | # run mlflow doctor 6 | 
[group('mlflow')] 7 | mlflow-doctor: 8 | uv run mlflow doctor 9 | 10 | # start mlflow server 11 | [group('mlflow')] 12 | mlflow-serve host="127.0.0.1" port="5000" uri="./mlruns": 13 | uv run mlflow server --host={{host}} --port={{port}} --backend-store-uri={{uri}} 14 | -------------------------------------------------------------------------------- /tasks/package.just: -------------------------------------------------------------------------------- 1 | # run package tasks 2 | [group('package')] 3 | package: package-build 4 | 5 | # build package constraints 6 | [group('package')] 7 | package-constraints constraints="constraints.txt": 8 | uv pip compile pyproject.toml --generate-hashes --output-file={{constraints}} 9 | 10 | # build python package 11 | [group('package')] 12 | package-build constraints="constraints.txt": clean-build package-constraints 13 | uv build --build-constraint={{constraints}} --require-hashes --wheel 14 | -------------------------------------------------------------------------------- /tasks/project.just: -------------------------------------------------------------------------------- 1 | # run project tasks 2 | [group('project')] 3 | project: project-environment (project-run "tuning") (project-run "training") (project-run "promotion") (project-run "inference") (project-run "evaluations") (project-run "explanations") 4 | 5 | # export environment file 6 | [group('project')] 7 | project-environment: project-requirements 8 | #!/usr/bin/env python3 9 | import json 10 | with open(".python-version", "r") as reader: 11 | python = reader.read().strip() # version 12 | configuration = {"python": python} 13 | with open("requirements.txt", "r") as reader: 14 | dependencies = [] 15 | for line in reader.readlines(): 16 | dependency = line.split(" ")[0].strip() 17 | if "pywin32" in dependency or "#" in dependency: 18 | continue 19 | dependencies.append(dependency) 20 | configuration["dependencies"] = dependencies 21 | with open("python_env.yaml", "w") as 
writer: 22 | json.dump(configuration, writer, indent=4) 23 | writer.write("\n") # add new line at the end 24 | 25 | # export requirements file 26 | [group('project')] 27 | project-requirements: 28 | uv export --format=requirements-txt --no-dev --no-hashes \ 29 | --no-editable --no-emit-project --output-file=requirements.txt 30 | 31 | # run project job using mlflow 32 | [group('project')] 33 | project-run job: 34 | uv run mlflow run --experiment-name={{REPOSITORY}} --run-name={{capitalize(job)}} -P conf_file=confs/{{job}}.yaml . 35 | -------------------------------------------------------------------------------- /tests/confs/invalid/1. invalid.yaml: -------------------------------------------------------------------------------- 1 | job: 2 | KIND: UnknownJob 3 | -------------------------------------------------------------------------------- /tests/confs/valid/0. tuning.yaml: -------------------------------------------------------------------------------- 1 | job: 2 | KIND: TuningJob 3 | inputs: 4 | KIND: ParquetReader 5 | path: "${tests_path:}/data/inputs_sample.parquet" 6 | limit: 1500 7 | targets: 8 | KIND: ParquetReader 9 | path: "${tests_path:}/data/targets_sample.parquet" 10 | limit: 1500 11 | splitter: 12 | KIND: TimeSeriesSplitter 13 | n_splits: 3 14 | test_size: 167 # 1 week 15 | -------------------------------------------------------------------------------- /tests/confs/valid/1. training.yaml: -------------------------------------------------------------------------------- 1 | job: 2 | KIND: TrainingJob 3 | inputs: 4 | KIND: ParquetReader 5 | path: "${tests_path:}/data/inputs_sample.parquet" 6 | limit: 1500 7 | targets: 8 | KIND: ParquetReader 9 | path: "${tests_path:}/data/targets_sample.parquet" 10 | limit: 1500 11 | -------------------------------------------------------------------------------- /tests/confs/valid/2. 
promotion.yaml: -------------------------------------------------------------------------------- 1 | job: 2 | KIND: PromotionJob 3 | -------------------------------------------------------------------------------- /tests/confs/valid/3. inference.yaml: -------------------------------------------------------------------------------- 1 | job: 2 | KIND: InferenceJob 3 | inputs: 4 | KIND: ParquetReader 5 | path: "${tests_path:}/data/inputs_sample.parquet" 6 | limit: 1500 7 | outputs: 8 | KIND: ParquetWriter 9 | path: "${tmp_path:}/outputs_sample.parquet" 10 | -------------------------------------------------------------------------------- /tests/confs/valid/5. evaluations.yaml: -------------------------------------------------------------------------------- 1 | job: 2 | KIND: EvaluationsJob 3 | inputs: 4 | KIND: ParquetReader 5 | path: "${tests_path:}/data/inputs_sample.parquet" 6 | limit: 1500 7 | targets: 8 | KIND: ParquetReader 9 | path: "${tests_path:}/data/targets_sample.parquet" 10 | limit: 1500 11 | -------------------------------------------------------------------------------- /tests/confs/valid/6. 
explanations.yaml: -------------------------------------------------------------------------------- 1 | job: 2 | KIND: ExplanationsJob 3 | inputs_samples: 4 | KIND: ParquetReader 5 | path: "${tests_path:}/data/inputs_sample.parquet" 6 | limit: 100 7 | models_explanations: 8 | KIND: ParquetWriter 9 | path: "${tmp_path:}/models_explanations.parquet" 10 | samples_explanations: 11 | KIND: ParquetWriter 12 | path: "${tmp_path:}/samples_explanations.parquet" 13 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Configuration for the tests.""" 2 | 3 | # %% IMPORTS 4 | 5 | import os 6 | import typing as T 7 | 8 | import omegaconf 9 | import pytest 10 | from _pytest import logging as pl 11 | 12 | from bikes.core import metrics, models, schemas 13 | from bikes.io import datasets, registries, services 14 | from bikes.utils import searchers, signers, splitters 15 | 16 | # %% CONFIGS 17 | 18 | LIMIT = 1500 19 | N_SPLITS = 3 20 | TEST_SIZE = 24 * 7 # 1 week 21 | 22 | # %% FIXTURES 23 | 24 | # %% - Paths 25 | 26 | 27 | @pytest.fixture(scope="session") 28 | def tests_path() -> str: 29 | """Return the path of the tests folder.""" 30 | file = os.path.abspath(__file__) 31 | parent = os.path.dirname(file) 32 | return parent 33 | 34 | 35 | @pytest.fixture(scope="session") 36 | def data_path(tests_path: str) -> str: 37 | """Return the path of the data folder.""" 38 | return os.path.join(tests_path, "data") 39 | 40 | 41 | @pytest.fixture(scope="session") 42 | def confs_path(tests_path: str) -> str: 43 | """Return the path of the confs folder.""" 44 | return os.path.join(tests_path, "confs") 45 | 46 | 47 | @pytest.fixture(scope="session") 48 | def inputs_path(data_path: str) -> str: 49 | """Return the path of the inputs dataset.""" 50 | return os.path.join(data_path, "inputs_sample.parquet") 51 | 52 | 53 | @pytest.fixture(scope="session") 54 | def 
targets_path(data_path: str) -> str: 55 | """Return the path of the targets dataset.""" 56 | return os.path.join(data_path, "targets_sample.parquet") 57 | 58 | 59 | @pytest.fixture(scope="session") 60 | def outputs_path(data_path: str) -> str: 61 | """Return the path of the outputs dataset.""" 62 | return os.path.join(data_path, "outputs_sample.parquet") 63 | 64 | 65 | @pytest.fixture(scope="function") 66 | def tmp_outputs_path(tmp_path: str) -> str: 67 | """Return a tmp path for the outputs dataset.""" 68 | return os.path.join(tmp_path, "outputs.parquet") 69 | 70 | 71 | @pytest.fixture(scope="function") 72 | def tmp_models_explanations_path(tmp_path: str) -> str: 73 | """Return a tmp path for the model explanations dataset.""" 74 | return os.path.join(tmp_path, "models_explanations.parquet") 75 | 76 | 77 | @pytest.fixture(scope="function") 78 | def tmp_samples_explanations_path(tmp_path: str) -> str: 79 | """Return a tmp path for the samples explanations dataset.""" 80 | return os.path.join(tmp_path, "samples_explanations.parquet") 81 | 82 | 83 | # %% - Configs 84 | 85 | 86 | @pytest.fixture(scope="session") 87 | def extra_config() -> str: 88 | """Extra config for scripts.""" 89 | # use OmegaConf resolver: ${tmp_path:} 90 | config = """ 91 | { 92 | "job": { 93 | "alerts_service": { 94 | "enable": false, 95 | }, 96 | "mlflow_service": { 97 | "tracking_uri": "${tmp_path:}/tracking/", 98 | "registry_uri": "${tmp_path:}/registry/", 99 | } 100 | } 101 | } 102 | """ 103 | return config 104 | 105 | 106 | # %% - Datasets 107 | 108 | 109 | @pytest.fixture(scope="session") 110 | def inputs_reader(inputs_path: str) -> datasets.ParquetReader: 111 | """Return a reader for the inputs dataset.""" 112 | return datasets.ParquetReader(path=inputs_path, limit=LIMIT) 113 | 114 | 115 | @pytest.fixture(scope="session") 116 | def inputs_samples_reader(inputs_path: str) -> datasets.ParquetReader: 117 | """Return a reader for the inputs samples dataset.""" 118 | return 
datasets.ParquetReader(path=inputs_path, limit=100) 119 | 120 | 121 | @pytest.fixture(scope="session") 122 | def targets_reader(targets_path: str) -> datasets.ParquetReader: 123 | """Return a reader for the targets dataset.""" 124 | return datasets.ParquetReader(path=targets_path, limit=LIMIT) 125 | 126 | 127 | @pytest.fixture(scope="session") 128 | def outputs_reader( 129 | outputs_path: str, 130 | inputs_reader: datasets.ParquetReader, 131 | targets_reader: datasets.ParquetReader, 132 | ) -> datasets.ParquetReader: 133 | """Return a reader for the outputs dataset.""" 134 | # generate outputs if it is missing 135 | if not os.path.exists(outputs_path): 136 | inputs = schemas.InputsSchema.check(inputs_reader.read()) 137 | targets = schemas.TargetsSchema.check(targets_reader.read()) 138 | model = models.BaselineSklearnModel().fit(inputs=inputs, targets=targets) 139 | outputs = schemas.OutputsSchema.check(model.predict(inputs=inputs)) 140 | outputs_writer = datasets.ParquetWriter(path=outputs_path) 141 | outputs_writer.write(data=outputs) 142 | return datasets.ParquetReader(path=outputs_path, limit=LIMIT) 143 | 144 | 145 | @pytest.fixture(scope="function") 146 | def tmp_outputs_writer(tmp_outputs_path: str) -> datasets.ParquetWriter: 147 | """Return a writer for the tmp outputs dataset.""" 148 | return datasets.ParquetWriter(path=tmp_outputs_path) 149 | 150 | 151 | @pytest.fixture(scope="function") 152 | def tmp_models_explanations_writer( 153 | tmp_models_explanations_path: str, 154 | ) -> datasets.ParquetWriter: 155 | """Return a writer for the tmp model explanations dataset.""" 156 | return datasets.ParquetWriter(path=tmp_models_explanations_path) 157 | 158 | 159 | @pytest.fixture(scope="function") 160 | def tmp_samples_explanations_writer( 161 | tmp_samples_explanations_path: str, 162 | ) -> datasets.ParquetWriter: 163 | """Return a writer for the tmp samples explanations dataset.""" 164 | return datasets.ParquetWriter(path=tmp_samples_explanations_path) 165 | 166 
| 167 | # %% - Dataframes 168 | 169 | 170 | @pytest.fixture(scope="session") 171 | def inputs(inputs_reader: datasets.ParquetReader) -> schemas.Inputs: 172 | """Return the inputs data.""" 173 | data = inputs_reader.read() 174 | return schemas.InputsSchema.check(data) 175 | 176 | 177 | @pytest.fixture(scope="session") 178 | def inputs_samples(inputs_samples_reader: datasets.ParquetReader) -> schemas.Inputs: 179 | """Return the inputs samples data.""" 180 | data = inputs_samples_reader.read() 181 | return schemas.InputsSchema.check(data) 182 | 183 | 184 | @pytest.fixture(scope="session") 185 | def targets(targets_reader: datasets.ParquetReader) -> schemas.Targets: 186 | """Return the targets data.""" 187 | data = targets_reader.read() 188 | return schemas.TargetsSchema.check(data) 189 | 190 | 191 | @pytest.fixture(scope="session") 192 | def outputs(outputs_reader: datasets.ParquetReader) -> schemas.Outputs: 193 | """Return the outputs data.""" 194 | data = outputs_reader.read() 195 | return schemas.OutputsSchema.check(data) 196 | 197 | 198 | # %% - Splitters 199 | 200 | 201 | @pytest.fixture(scope="session") 202 | def train_test_splitter() -> splitters.TrainTestSplitter: 203 | """Return the default train test splitter.""" 204 | return splitters.TrainTestSplitter(test_size=TEST_SIZE) 205 | 206 | 207 | @pytest.fixture(scope="session") 208 | def time_series_splitter() -> splitters.TimeSeriesSplitter: 209 | """Return the default time series splitter.""" 210 | return splitters.TimeSeriesSplitter(n_splits=N_SPLITS, test_size=TEST_SIZE) 211 | 212 | 213 | # %% - Searchers 214 | 215 | 216 | @pytest.fixture(scope="session") 217 | def searcher() -> searchers.GridCVSearcher: 218 | """Return the default searcher object.""" 219 | param_grid = {"max_depth": [1, 2], "n_estimators": [3]} 220 | return searchers.GridCVSearcher(param_grid=param_grid) 221 | 222 | 223 | # %% - Subsets 224 | 225 | 226 | @pytest.fixture(scope="session") 227 | def train_test_sets( 228 | train_test_splitter: 
splitters.TrainTestSplitter, 229 | inputs: schemas.Inputs, 230 | targets: schemas.Targets, 231 | ) -> tuple[schemas.Inputs, schemas.Targets, schemas.Inputs, schemas.Targets]: 232 | """Return the inputs and targets train and test sets from the splitter.""" 233 | train_index, test_index = next(train_test_splitter.split(inputs=inputs, targets=targets)) 234 | inputs_train, inputs_test = inputs.iloc[train_index], inputs.iloc[test_index] 235 | targets_train, targets_test = targets.iloc[train_index], targets.iloc[test_index] 236 | return ( 237 | T.cast(schemas.Inputs, inputs_train), 238 | T.cast(schemas.Targets, targets_train), 239 | T.cast(schemas.Inputs, inputs_test), 240 | T.cast(schemas.Targets, targets_test), 241 | ) 242 | 243 | 244 | # %% - Models 245 | 246 | 247 | @pytest.fixture(scope="session") 248 | def model( 249 | train_test_sets: tuple[schemas.Inputs, schemas.Targets, schemas.Inputs, schemas.Targets], 250 | ) -> models.BaselineSklearnModel: 251 | """Return a train model for testing.""" 252 | model = models.BaselineSklearnModel() 253 | inputs_train, targets_train, _, _ = train_test_sets 254 | model.fit(inputs=inputs_train, targets=targets_train) 255 | return model 256 | 257 | 258 | # %% - Metrics 259 | 260 | 261 | @pytest.fixture(scope="session") 262 | def metric() -> metrics.SklearnMetric: 263 | """Return the default metric.""" 264 | return metrics.SklearnMetric() 265 | 266 | 267 | # %% - Signers 268 | 269 | 270 | @pytest.fixture(scope="session") 271 | def signer() -> signers.InferSigner: 272 | """Return a model signer.""" 273 | return signers.InferSigner() 274 | 275 | 276 | # %% - Services 277 | 278 | 279 | @pytest.fixture(scope="session", autouse=True) 280 | def logger_service() -> T.Generator[services.LoggerService, None, None]: 281 | """Return and start the logger service.""" 282 | service = services.LoggerService(colorize=False, diagnose=True) 283 | service.start() 284 | yield service 285 | service.stop() 286 | 287 | 288 | @pytest.fixture 289 | def 
logger_caplog( 290 | caplog: pl.LogCaptureFixture, logger_service: services.LoggerService 291 | ) -> T.Generator[pl.LogCaptureFixture, None, None]: 292 | """Extend pytest caplog fixture with the logger service (loguru).""" 293 | # https://loguru.readthedocs.io/en/stable/resources/migration.html#replacing-caplog-fixture-from-pytest-library 294 | logger = logger_service.logger() 295 | handler_id = logger.add( 296 | caplog.handler, 297 | level=0, 298 | format="{message}", 299 | filter=lambda record: record["level"].no >= caplog.handler.level, 300 | enqueue=False, # Set to 'True' if your test is spawning child processes. 301 | ) 302 | yield caplog 303 | logger.remove(handler_id) 304 | 305 | 306 | @pytest.fixture(scope="session", autouse=True) 307 | def alerts_service() -> T.Generator[services.AlertsService, None, None]: 308 | """Return and start the alerter service.""" 309 | service = services.AlertsService(enable=False) 310 | service.start() 311 | yield service 312 | service.stop() 313 | 314 | 315 | @pytest.fixture(scope="function", autouse=True) 316 | def mlflow_service(tmp_path: str) -> T.Generator[services.MlflowService, None, None]: 317 | """Return and start the mlflow service.""" 318 | service = services.MlflowService( 319 | tracking_uri=f"{tmp_path}/tracking/", 320 | registry_uri=f"{tmp_path}/registry/", 321 | experiment_name="Experiment-Testing", 322 | registry_name="Registry-Testing", 323 | ) 324 | service.start() 325 | yield service 326 | service.stop() 327 | 328 | 329 | # %% - Resolvers 330 | 331 | 332 | @pytest.fixture(scope="session", autouse=True) 333 | def tests_path_resolver(tests_path: str) -> str: 334 | """Register the tests path resolver with OmegaConf.""" 335 | 336 | def resolver() -> str: 337 | """Get tests path.""" 338 | return tests_path 339 | 340 | omegaconf.OmegaConf.register_new_resolver("tests_path", resolver, use_cache=True, replace=False) 341 | return tests_path 342 | 343 | 344 | @pytest.fixture(scope="function", autouse=True) 345 | def 
tmp_path_resolver(tmp_path: str) -> str: 346 | """Register the tmp path resolver with OmegaConf.""" 347 | 348 | def resolver() -> str: 349 | """Get tmp data path.""" 350 | return tmp_path 351 | 352 | omegaconf.OmegaConf.register_new_resolver("tmp_path", resolver, use_cache=False, replace=True) 353 | return tmp_path 354 | 355 | 356 | # %% - Signatures 357 | 358 | 359 | @pytest.fixture(scope="session") 360 | def signature( 361 | signer: signers.Signer, inputs: schemas.Inputs, outputs: schemas.Outputs 362 | ) -> signers.Signature: 363 | """Return the signature for the testing model.""" 364 | return signer.sign(inputs=inputs, outputs=outputs) 365 | 366 | 367 | # %% - Registries 368 | 369 | 370 | @pytest.fixture(scope="session") 371 | def saver() -> registries.CustomSaver: 372 | """Return the default model saver.""" 373 | return registries.CustomSaver(path="custom-model") 374 | 375 | 376 | @pytest.fixture(scope="session") 377 | def loader() -> registries.CustomLoader: 378 | """Return the default model loader.""" 379 | return registries.CustomLoader() 380 | 381 | 382 | @pytest.fixture(scope="session") 383 | def register() -> registries.MlflowRegister: 384 | """Return the default model register.""" 385 | tags = {"context": "test", "role": "fixture"} 386 | return registries.MlflowRegister(tags=tags) 387 | 388 | 389 | @pytest.fixture(scope="function") 390 | def model_version( 391 | model: models.Model, 392 | inputs: schemas.Inputs, 393 | signature: signers.Signature, 394 | saver: registries.Saver, 395 | register: registries.Register, 396 | mlflow_service: services.MlflowService, 397 | ) -> registries.Version: 398 | """Save and register the default model version.""" 399 | run_config = mlflow_service.RunConfig(name="Custom-Run") 400 | with mlflow_service.run_context(run_config=run_config): 401 | info = saver.save(model=model, signature=signature, input_example=inputs) 402 | version = register.register(name=mlflow_service.registry_name, model_uri=info.model_uri) 403 | return 
version 404 | 405 | 406 | @pytest.fixture(scope="function") 407 | def model_alias( 408 | model_version: registries.Version, 409 | mlflow_service: services.MlflowService, 410 | ) -> registries.Alias: 411 | """Promote the default model version with an alias.""" 412 | alias = "Promotion" 413 | client = mlflow_service.client() 414 | client.set_registered_model_alias( 415 | name=mlflow_service.registry_name, alias=alias, version=model_version.version 416 | ) 417 | model_alias = client.get_model_version_by_alias(name=mlflow_service.registry_name, alias=alias) 418 | return model_alias 419 | -------------------------------------------------------------------------------- /tests/core/test_metrics.py: -------------------------------------------------------------------------------- 1 | # %% IMPORTS 2 | 3 | import mlflow 4 | import pandas as pd 5 | import pytest 6 | 7 | from bikes.core import metrics, models, schemas 8 | 9 | # %% METRICS 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "name, interval, greater_is_better", 14 | [ 15 | ("mean_squared_error", [0, float("inf")], True), 16 | ("mean_absolute_error", [float("-inf"), 0], False), 17 | ], 18 | ) 19 | def test_sklearn_metric( 20 | name: str, 21 | interval: tuple[int, int], 22 | greater_is_better: bool, 23 | model: models.Model, 24 | inputs: schemas.Inputs, 25 | targets: schemas.Targets, 26 | outputs: schemas.Outputs, 27 | ) -> None: 28 | # given 29 | low, high = interval 30 | data = pd.concat([targets, outputs], axis="columns") 31 | metric = metrics.SklearnMetric(name=name, greater_is_better=greater_is_better) 32 | # when 33 | score = metric.score(targets=targets, outputs=outputs) 34 | scorer = metric.scorer(model=model, inputs=inputs, targets=targets) 35 | mlflow_metric = metric.to_mlflow() 36 | mlflow_results = mlflow.evaluate( 37 | data=data, 38 | predictions=schemas.OutputsSchema.prediction, 39 | targets=schemas.TargetsSchema.cnt, 40 | extra_metrics=[mlflow_metric], 41 | ) 42 | # then 43 | # - score 44 | assert low <= 
score <= high, "Score should be in the expected interval!" 45 | # - scorer 46 | assert low <= scorer <= high, "Scorer should be in the expected interval!" 47 | # - mlflow metric 48 | assert mlflow_metric.name == metric.name, "Mlflow metric name should be the same!" # type: ignore[attr-defined] 49 | assert ( 50 | mlflow_metric.greater_is_better == metric.greater_is_better # type: ignore[attr-defined] 51 | ), "Mlflow metric greater is better should be the same!" 52 | # - mlflow results 53 | assert mlflow_results.metrics == {metric.name: score * (1 if greater_is_better else -1)}, ( 54 | "Mlflow results metrics should have the same name and score!" 55 | ) 56 | 57 | 58 | # %% THRESHOLDS 59 | 60 | 61 | def test_threshold() -> None: 62 | # given 63 | threshold = metrics.Threshold(threshold=10, greater_is_better=True) 64 | # when 65 | mlflow_threshold = threshold.to_mlflow() 66 | # then 67 | assert mlflow_threshold.threshold == threshold.threshold, "Threshold should be the same!" 68 | assert mlflow_threshold.greater_is_better == threshold.greater_is_better, ( 69 | "Greater is better should be the same!" 
70 | ) 71 | -------------------------------------------------------------------------------- /tests/core/test_models.py: -------------------------------------------------------------------------------- 1 | # %% IMPORTS 2 | 3 | import typing as T 4 | 5 | import pytest 6 | 7 | from bikes.core import models, schemas 8 | 9 | # %% MODELS 10 | 11 | 12 | def test_model(inputs_samples: schemas.Inputs) -> None: 13 | # given 14 | class MyModel(models.Model): 15 | KIND: T.Literal["MyModel"] = "MyModel" 16 | 17 | # public 18 | a: int = 1 19 | b: int = 2 20 | # private 21 | _c: int = 3 22 | 23 | def fit(self, inputs: schemas.Inputs, targets: schemas.Targets) -> T.Self: 24 | return self 25 | 26 | def predict(self, inputs: schemas.Inputs) -> schemas.Outputs: 27 | return schemas.Outputs() 28 | 29 | # when 30 | model = MyModel(a=10) 31 | params_init = model.get_params() 32 | params_set_params = model.set_params(b=20).get_params() 33 | with pytest.raises(NotImplementedError) as explain_model_error: 34 | model.explain_model() 35 | with pytest.raises(NotImplementedError) as explain_samples_error: 36 | model.explain_samples(inputs=inputs_samples) 37 | with pytest.raises(NotImplementedError) as get_internal_model_error: 38 | model.get_internal_model() 39 | # then 40 | assert params_init == { 41 | "a": 10, 42 | "b": 2, 43 | }, "Model should have the given params after init!" 44 | assert params_set_params == { 45 | "a": 10, 46 | "b": 20, 47 | }, "Model should have the given params after set_params!" 48 | assert isinstance(explain_model_error.value, NotImplementedError), ( 49 | "Model should raise NotImplementedError for explain_model_error()!" 50 | ) 51 | assert isinstance(explain_samples_error.value, NotImplementedError), ( 52 | "Model should raise NotImplementedError for explain_samples_error()!" 53 | ) 54 | assert isinstance(get_internal_model_error.value, NotImplementedError), ( 55 | "Model should raise NotImplementedError for get_internal_model_error()!" 
56 | ) 57 | 58 | 59 | def test_baseline_sklearn_model( 60 | train_test_sets: tuple[schemas.Inputs, schemas.Targets, schemas.Inputs, schemas.Targets], 61 | ) -> None: 62 | # given 63 | params = {"max_depth": 3, "n_estimators": 5, "random_state": 0} 64 | inputs_train, targets_train, inputs_test, _ = train_test_sets 65 | model = models.BaselineSklearnModel().set_params(**params) 66 | # when 67 | with pytest.raises(ValueError) as not_fitted_error: 68 | model.get_internal_model() 69 | model.fit(inputs=inputs_train, targets=targets_train) 70 | outputs = model.predict(inputs=inputs_test) 71 | shap_values = model.explain_samples(inputs=inputs_test) 72 | feature_importances = model.explain_model() 73 | # then 74 | assert not_fitted_error.match("Model is not fitted yet!"), ( 75 | "Model should raise an error when not fitted!" 76 | ) 77 | # - model 78 | assert model.get_params() == params, "Model should have the given params!" 79 | assert model.get_internal_model() is not None, "Internal model should be fitted!" 80 | # - outputs 81 | assert outputs.ndim == 2, "Outputs should be a dataframe!" 82 | # - shap values 83 | assert len(shap_values.index) == len(inputs_test.index), ( 84 | "SHAP values should be the same length as inputs!" 85 | ) 86 | assert len(shap_values.columns) >= len(inputs_test.columns), ( 87 | "SHAP values should have more features than inputs!" 88 | ) 89 | # - feature importances 90 | assert feature_importances["importance"].sum() == 1.0, ( 91 | "Feature importances should add up to 1.0!" 92 | ) 93 | assert len(feature_importances["feature"]) >= len(inputs_train.columns), ( 94 | "Feature importances should have more features than inputs!" 
95 | ) 96 | -------------------------------------------------------------------------------- /tests/core/test_schemas.py: -------------------------------------------------------------------------------- 1 | # %% IMPORTS 2 | 3 | from bikes.core import models, schemas 4 | from bikes.io import datasets 5 | 6 | # %% SCHEMAS 7 | 8 | 9 | def test_inputs_schema(inputs_reader: datasets.Reader) -> None: 10 | # given 11 | schema = schemas.InputsSchema 12 | # when 13 | data = inputs_reader.read() 14 | # then 15 | assert schema.check(data) is not None, "Inputs data should be valid!" 16 | 17 | 18 | def test_targets_schema(targets_reader: datasets.Reader) -> None: 19 | # given 20 | schema = schemas.TargetsSchema 21 | # when 22 | data = targets_reader.read() 23 | # then 24 | assert schema.check(data) is not None, "Targets data should be valid!" 25 | 26 | 27 | def test_outputs_schema(outputs_reader: datasets.Reader) -> None: 28 | # given 29 | schema = schemas.OutputsSchema 30 | # when 31 | data = outputs_reader.read() 32 | # then 33 | assert schema.check(data) is not None, "Outputs data should be valid!" 34 | 35 | 36 | def test_shap_values_schema( 37 | model: models.Model, 38 | train_test_sets: tuple[schemas.Inputs, schemas.Targets, schemas.Inputs, schemas.Targets], 39 | ) -> None: 40 | # given 41 | schema = schemas.SHAPValuesSchema 42 | _, _, inputs_test, _ = train_test_sets 43 | # when 44 | data = model.explain_samples(inputs=inputs_test) 45 | # then 46 | assert schema.check(data) is not None, "SHAP values data should be valid!" 47 | 48 | 49 | def test_feature_importances_schema(model: models.Model) -> None: 50 | # given 51 | schema = schemas.FeatureImportancesSchema 52 | # when 53 | data = model.explain_model() 54 | # then 55 | assert schema.check(data) is not None, "Feature importance data should be valid!" 
56 | -------------------------------------------------------------------------------- /tests/data/inputs_sample.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/tests/data/inputs_sample.parquet -------------------------------------------------------------------------------- /tests/data/outputs_sample.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/tests/data/outputs_sample.parquet -------------------------------------------------------------------------------- /tests/data/targets_sample.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/tests/data/targets_sample.parquet -------------------------------------------------------------------------------- /tests/io/test_configs.py: -------------------------------------------------------------------------------- 1 | # %% IMPORTS 2 | 3 | import os 4 | 5 | import omegaconf as oc 6 | 7 | from bikes.io import configs 8 | 9 | # %% PARSERS 10 | 11 | 12 | def test_parse_file(tmp_path: str) -> None: 13 | # given 14 | text = """ 15 | a: 1 16 | b: True 17 | c: [3, 4] 18 | """ 19 | path = os.path.join(tmp_path, "config.yml") 20 | with open(path, "w", encoding="utf-8") as writer: 21 | writer.write(text) 22 | # when 23 | config = configs.parse_file(path) 24 | # then 25 | assert config == { 26 | "a": 1, 27 | "b": True, 28 | "c": [3, 4], 29 | }, "File config should be parsed correctly!" 
30 | 31 | 32 | def test_parse_string() -> None: 33 | # given 34 | text = """{"a": 1, "b": 2, "data": [3, 4]}""" 35 | # when 36 | config = configs.parse_string(text) 37 | # then 38 | assert config == { 39 | "a": 1, 40 | "b": 2, 41 | "data": [3, 4], 42 | }, "String config should be parsed correctly!" 43 | 44 | 45 | # %% MERGERS 46 | 47 | 48 | def test_merge_configs() -> None: 49 | # given 50 | confs = [oc.OmegaConf.create({"x": i, i: i}) for i in range(3)] 51 | # when 52 | config = configs.merge_configs(confs) 53 | # then 54 | assert config == { 55 | 0: 0, 56 | 1: 1, 57 | 2: 2, 58 | "x": 2, 59 | }, "Configs should be merged correctly!" 60 | 61 | 62 | # %% CONVERTERS 63 | 64 | 65 | def test_to_object() -> None: 66 | # given 67 | values = { 68 | "a": 1, 69 | "b": True, 70 | "c": [3, 4], 71 | } 72 | config = oc.OmegaConf.create(values) 73 | # when 74 | object_ = configs.to_object(config) 75 | # then 76 | assert object_ == values, "Object should be the same!" 77 | assert isinstance(object_, dict), "Object should be a dict!" 78 | -------------------------------------------------------------------------------- /tests/io/test_datasets.py: -------------------------------------------------------------------------------- 1 | # %% IMPORTS 2 | 3 | import os 4 | 5 | import pytest 6 | 7 | from bikes.core import schemas 8 | from bikes.io import datasets 9 | 10 | # %% READERS 11 | 12 | 13 | @pytest.mark.parametrize("limit", [None, 50]) 14 | def test_parquet_reader(limit: int | None, inputs_path: str) -> None: 15 | # given 16 | reader = datasets.ParquetReader(path=inputs_path, limit=limit) 17 | # when 18 | data = reader.read() 19 | lineage = reader.lineage(name="inputs", data=data) 20 | # then 21 | # - data 22 | assert data.ndim == 2, "Data should be a dataframe!" 23 | if limit is not None: 24 | assert len(data) == limit, "Data should have the limit size!" 25 | # - lineage 26 | assert lineage.name == "inputs", "Lineage name should be inputs!" 
27 | assert lineage.source.uri == inputs_path, "Lineage source uri should be the inputs path!" # type: ignore[attr-defined] 28 | assert lineage.schema is not None and set(lineage.schema.input_names()) == set(data.columns), ( 29 | "Lineage schema names should be the data columns!" 30 | ) 31 | assert lineage.profile["num_rows"] == len( # type: ignore[index] 32 | data 33 | ), "Lineage profile should contain the data row count!" 34 | 35 | 36 | # %% WRITERS 37 | 38 | 39 | def test_parquet_writer(targets: schemas.Targets, tmp_outputs_path: str) -> None: 40 | # given 41 | writer = datasets.ParquetWriter(path=tmp_outputs_path) 42 | # when 43 | writer.write(data=targets) 44 | # then 45 | assert os.path.exists(tmp_outputs_path), "Data should be written!" 46 | -------------------------------------------------------------------------------- /tests/io/test_registries.py: -------------------------------------------------------------------------------- 1 | # %% IMPORTS 2 | 3 | from bikes.core import models, schemas 4 | from bikes.io import registries, services 5 | from bikes.utils import signers 6 | 7 | # %% HELPERS 8 | 9 | 10 | def test_uri_for_model_alias() -> None: 11 | # given 12 | name = "testing" 13 | alias = "Champion" 14 | # when 15 | uri = registries.uri_for_model_alias(name=name, alias=alias) 16 | # then 17 | assert uri == f"models:/{name}@{alias}", "The model URI should be valid!" 18 | 19 | 20 | def test_uri_for_model_version() -> None: 21 | # given 22 | name = "testing" 23 | version = 1 24 | # when 25 | uri = registries.uri_for_model_version(name=name, version=version) 26 | # then 27 | assert uri == f"models:/{name}/{version}", "The model URI should be valid!" 
def test_uri_for_model_alias_or_version() -> None:
    """The dispatcher should route strings to alias URIs and ints to version URIs."""
    # given
    name = "testing"
    alias = "Champion"
    version = 1
    # when
    alias_uri = registries.uri_for_model_alias_or_version(name=name, alias_or_version=alias)
    version_uri = registries.uri_for_model_alias_or_version(name=name, alias_or_version=version)
    # then
    assert alias_uri == registries.uri_for_model_alias(name=name, alias=alias), (
        "The alias URI should be valid!"
    )
    assert version_uri == registries.uri_for_model_version(name=name, version=version), (
        "The version URI should be valid!"
    )


# %% SAVERS/LOADERS/REGISTERS


def test_custom_pipeline(
    model: models.Model,
    inputs: schemas.Inputs,
    signature: signers.Signature,
    mlflow_service: services.MlflowService,
) -> None:
    """Save, register, load, and predict with the custom (pyfunc) flavor."""
    # given
    path = "custom"
    name = "Custom"
    tags = {"registry": "mlflow"}
    saver = registries.CustomSaver(path=path)
    loader = registries.CustomLoader()
    register = registries.MlflowRegister(tags=tags)
    run_config = mlflow_service.RunConfig(name="Custom-Run")
    # when
    with mlflow_service.run_context(run_config=run_config) as run:
        info = saver.save(model=model, signature=signature, input_example=inputs)
        version = register.register(name=name, model_uri=info.model_uri)
        model_uri = registries.uri_for_model_version(name=name, version=version.version)
        adapter = loader.load(uri=model_uri)
        outputs = adapter.predict(inputs=inputs)
    # then
    # - uri
    assert model_uri == f"models:/{name}/{version.version}", "The model URI should be valid!"
    # - info
    assert info.run_id == run.info.run_id, "The run id should be the same!"
    assert info.artifact_path == path, "The artifact path should be the same!"
    assert info.signature == signature, "The model signature should be the same!"
    assert info.flavors.get("python_function"), "The model should have a pyfunc flavor!"
    # - version
    assert version.name == name, "The model version name should be the same!"
    assert version.tags == tags, "The model version tags should be the same!"
    assert version.aliases == [], "The model version aliases should be empty!"
    assert version.run_id == run.info.run_id, "The model version run id should be the same!"
    # - adapter
    assert adapter.model.metadata.run_id == version.run_id, (
        "The adapter model run id should be the same!"
    )
    assert adapter.model.metadata.signature == signature, (
        "The adapter model signature should be the same!"
    )
    assert adapter.model.metadata.flavors.get("python_function") is not None, (
        "The adapter model should have a python_function flavor!"
    )
    # - output
    assert schemas.OutputsSchema.check(outputs) is not None, "Outputs should be valid!"


def test_builtin_pipeline(
    model: models.Model,
    inputs: schemas.Inputs,
    signature: signers.Signature,
    mlflow_service: services.MlflowService,
) -> None:
    """Save, register, load, and predict with a built-in (sklearn) flavor."""
    # given
    path = "builtin"
    name = "Builtin"
    flavor = "sklearn"
    tags = {"registry": "mlflow"}
    saver = registries.BuiltinSaver(path=path, flavor=flavor)
    loader = registries.BuiltinLoader()
    register = registries.MlflowRegister(tags=tags)
    run_config = mlflow_service.RunConfig(name="Builtin-Run")
    # when
    with mlflow_service.run_context(run_config=run_config) as run:
        info = saver.save(model=model, signature=signature, input_example=inputs)
        version = register.register(name=name, model_uri=info.model_uri)
        model_uri = registries.uri_for_model_version(name=name, version=version.version)
        adapter = loader.load(uri=model_uri)
        outputs = adapter.predict(inputs=inputs)
    # then
    # - uri
    assert model_uri == f"models:/{name}/{version.version}", "The model URI should be valid!"
    # - info
    assert info.run_id == run.info.run_id, "The run id should be the same!"
    assert info.artifact_path == path, "The artifact path should be the same!"
    assert info.signature == signature, "The model signature should be the same!"
    assert info.flavors.get("python_function"), "The model should have a pyfunc flavor!"
    assert info.flavors.get(flavor), f"The model should have a built-in model flavor: {flavor}!"
    # - version
    assert version.name == name, "The model version name should be the same!"
    assert version.tags == tags, "The model version tags should be the same!"
    assert version.aliases == [], "The model version aliases should be empty!"
    assert version.run_id == run.info.run_id, "The model version run id should be the same!"
    # - adapter
    assert adapter.model.metadata.run_id == version.run_id, (
        "The adapter model run id should be the same!"
    )
    assert adapter.model.metadata.signature == signature, (
        "The adapter model signature should be the same!"
    )
    assert adapter.model.metadata.flavors.get("python_function") is not None, (
        "The adapter model should have a python_function flavor!"
    )
    assert adapter.model.metadata.flavors.get(flavor), (
        f"The model should have a built-in model flavor: {flavor}!"
    )
    # - output
    assert schemas.OutputsSchema.check(outputs) is not None, "Outputs should be valid!"
# ---- tests/io/test_services.py ----

# %% IMPORTS

import _pytest.capture as pc
import _pytest.logging as pl
import mlflow
import pytest
import pytest_mock as pm

from bikes.io import services

# %% SERVICES


def test_logger_service(
    logger_service: services.LoggerService, logger_caplog: pl.LogCaptureFixture
) -> None:
    """Messages at every level should reach the captured log records."""
    # given
    service = logger_service
    logger = service.logger()
    # when
    logger.debug("DEBUG")
    logger.error("ERROR")
    # then
    assert "DEBUG" in logger_caplog.messages, "Debug message should be logged!"
    assert "ERROR" in logger_caplog.messages, "Error message should be logged!"


@pytest.mark.parametrize("enable", [True, False])
def test_alerts_service(
    enable: bool, mocker: pm.MockerFixture, capsys: pc.CaptureFixture[str]
) -> None:
    """Alerts should go to the notification backend when enabled, stdout otherwise."""
    # given
    service = services.AlertsService(enable=enable)
    # keep the mock handle returned by patch instead of reaching through plyer
    notify_mock = mocker.patch(target="plyer.notification.notify")
    # when
    service.notify(title="test", message="hello")
    # then
    if enable:
        # FIX: these were tuple literals `(mock.assert_...(), "message")` — the
        # string was dead code; call the mock assertions directly.
        notify_mock.assert_called_once()  # notification method should be called
        assert capsys.readouterr().out == "", "Notification should not be printed to stdout!"
    else:
        notify_mock.assert_not_called()  # notification method should not be called
        assert capsys.readouterr().out == "[Bikes] test: hello\n", (
            "Notification should be printed to stdout!"
        )


def test_alerts_service__not_supported(
    mocker: pm.MockerFixture, capsys: pc.CaptureFixture[str]
) -> None:
    """Unsupported notification backends should degrade to a stdout message."""

    # given
    def notify_not_supported(*args: object, **kwargs: object) -> None:
        raise NotImplementedError()

    service = services.AlertsService(enable=True)
    mocker.patch(target="plyer.notification.notify", new=notify_not_supported)
    # when
    service.notify(title="test", message="hello")
    # then
    assert "Notifications are not supported on this system." in capsys.readouterr().out


def test_mlflow_service(mlflow_service: services.MlflowService) -> None:
    """The mlflow service should configure tracking, registry, and run contexts."""
    # given
    service = mlflow_service
    run_config = mlflow_service.RunConfig(
        name="testing",
        tags={"service": "mlflow"},
        description="a test run.",
        log_system_metrics=True,
    )
    # when
    client = service.client()
    with service.run_context(run_config=run_config) as context:
        pass
    finished = client.get_run(run_id=context.info.run_id)
    # then
    # - run
    assert run_config.tags is not None, "Run config tags should be set!"
    # - mlflow
    assert service.tracking_uri == mlflow.get_tracking_uri(), "Tracking URI should be the same!"
    assert service.registry_uri == mlflow.get_registry_uri(), "Registry URI should be the same!"
    assert mlflow.get_experiment_by_name(service.experiment_name), "Experiment should be setup!"
    # - client
    assert service.tracking_uri == client.tracking_uri, "Tracking URI should be the same!"
    assert service.registry_uri == client._registry_uri, "Registry URI should be the same!"
    assert client.get_experiment_by_name(service.experiment_name), "Experiment should be setup!"
    # - context
    assert context.info.run_name == run_config.name, "Context name should be the same!"
    assert run_config.description in context.data.tags.values(), (
        "Context desc. should be in tags values!"
    )
    # FIX: message was inverted — `>` asserts the context tags are a strict
    # superset of the config tags (i.e. config tags are a subset).
    assert context.data.tags.items() > run_config.tags.items(), (
        "Run config tags should be a subset of the context tags!"
    )
    assert context.info.status == "RUNNING", "Context should be running!"
    # - finished
    assert finished.info.status == "FINISHED", "Finished should be finished!"


# ---- tests/jobs/test_base.py ----

# %% IMPORTS

from bikes.io import services
from bikes.jobs import base

# %% JOBS


def test_job(
    logger_service: services.LoggerService,
    alerts_service: services.AlertsService,
    mlflow_service: services.MlflowService,
) -> None:
    """A job subclass should expose its services and return its run locals."""

    # given
    class MyJob(base.Job):
        KIND: str = "MyJob"

        def run(self) -> base.Locals:
            a, b = 1, "test"
            return locals()

    job = MyJob(
        logger_service=logger_service,
        alerts_service=alerts_service,
        mlflow_service=mlflow_service,
    )
    # when
    with job as runner:
        out = runner.run()
    # then
    # - inputs (FIX: message grammar: "an Logger"/"a alerter")
    assert hasattr(job, "logger_service"), "Job should have a logger service!"
    assert hasattr(job, "alerts_service"), "Job should have an alerts service!"
    assert hasattr(job, "mlflow_service"), "Job should have an mlflow service!"
    # - outputs
    assert set(out) == {"self", "a", "b"}, "Run should return local variables!"
# ---- tests/jobs/test_evaluations.py ----

# %% IMPORTS

import _pytest.capture as pc
import pytest

from bikes import jobs
from bikes.core import metrics, schemas
from bikes.io import datasets, registries, services

# %% JOBS


@pytest.mark.parametrize(
    "alias_or_version, thresholds",
    [
        (
            1,
            {
                "mean_squared_error": metrics.Threshold(
                    threshold=float("inf"), greater_is_better=False
                )
            },
        ),
        (
            "Promotion",
            {"r2_score": metrics.Threshold(threshold=-1, greater_is_better=True)},
        ),
        pytest.param(
            "Promotion",
            {"r2_score": metrics.Threshold(threshold=100, greater_is_better=True)},
            marks=pytest.mark.xfail(
                reason="Invalid threshold for metric.",
                raises=metrics.MlflowModelValidationFailedException,
            ),
        ),
    ],
)
def test_evaluations_job(
    alias_or_version: str | int,
    thresholds: dict[str, metrics.Threshold],
    mlflow_service: services.MlflowService,
    alerts_service: services.AlertsService,
    logger_service: services.LoggerService,
    inputs_reader: datasets.ParquetReader,
    targets_reader: datasets.ParquetReader,
    model_alias: registries.Version,
    metric: metrics.SklearnMetric,
    capsys: pc.CaptureFixture[str],
) -> None:
    """The evaluations job should score a registered model against thresholds."""
    # given
    if isinstance(alias_or_version, int):
        assert alias_or_version == model_alias.version, "Model version should be the same!"
    else:
        assert alias_or_version == model_alias.aliases[0], "Model alias should be the same!"
    run_config = mlflow_service.RunConfig(
        name="EvaluationsTest",
        tags={"context": "evaluations"},
        description="Evaluations job.",
    )
    # when
    job = jobs.EvaluationsJob(
        logger_service=logger_service,
        alerts_service=alerts_service,
        mlflow_service=mlflow_service,
        run_config=run_config,
        inputs=inputs_reader,
        targets=targets_reader,
        alias_or_version=alias_or_version,
        metrics=[metric],
        thresholds=thresholds,
    )
    with job as runner:
        out = runner.run()
    # then
    # - vars
    assert set(out) == {
        "self",
        "logger",
        "client",
        "run",
        "inputs",
        "inputs_",
        "inputs_lineage",
        "targets",
        "targets_",
        "targets_lineage",
        "outputs",
        "model",
        "model_uri",
        "dataset",
        "dataset_",
        "extra_metrics",
        "validation_thresholds",
        "evaluations",
    }
    # - run
    assert run_config.tags is not None, "Run config tags should be set!"
    assert out["run"].info.run_name == run_config.name, "Run name should be the same!"
    assert run_config.description in out["run"].data.tags.values(), "Run desc. should be tags!"
    # FIX: message was inverted — `>` asserts run tags are a superset of config tags
    assert out["run"].data.tags.items() > run_config.tags.items(), (
        "Run config tags should be a subset of the run tags!"
    )
    # - data
    assert out["inputs"].ndim == out["inputs_"].ndim == 2, "Inputs should be a dataframe!"
    assert out["targets"].ndim == out["targets_"].ndim == 2, "Targets should be a dataframe!"
    # - lineage
    assert out["inputs_lineage"].name == "inputs", "Inputs lineage name should be inputs!"
    assert out["inputs_lineage"].source.uri == inputs_reader.path, (
        "Inputs lineage source should be the inputs reader path!"
    )
    assert out["targets_lineage"].name == "targets", "Targets lineage name should be targets!"
    assert out["targets_lineage"].source.uri == targets_reader.path, (
        "Targets lineage source should be the targets reader path!"
    )
    assert out["targets_lineage"].targets == schemas.TargetsSchema.cnt, (
        "Targets lineage target should be cnt!"
    )
    # - outputs
    assert out["outputs"].ndim == 2, "Outputs should be a dataframe!"
    # - model uri
    assert str(alias_or_version) in out["model_uri"], "Model URI should contain the model alias!"
    assert mlflow_service.registry_name in out["model_uri"], (
        "Model URI should contain the registry name!"
    )
    # - model
    assert out["model"].model.metadata.run_id == model_alias.run_id, (
        "Model run id should be the same!"
    )
    assert out["model"].model.metadata.signature is not None, "Model should have a signature!"
    assert out["model"].model.metadata.flavors.get("python_function"), (
        "Model should have a pyfunc flavor!"
    )
    # - dataset
    assert out["dataset"].name == "evaluation", "Dataset name should be evaluation!"
    assert out["dataset"].targets == schemas.TargetsSchema.cnt, (
        "Dataset targets should be the target column!"
    )
    assert out["dataset"].predictions == schemas.OutputsSchema.prediction, (
        "Dataset predictions should be the prediction column!"
    )
    assert out["dataset"].source.to_dict().keys() == {"tags"}, "Dataset source should have tags!"
    # - extra metrics
    assert len(out["extra_metrics"]) == len(job.metrics), (
        "Extra metrics should have the same length as metrics!"
    )
    assert out["extra_metrics"][0].name == job.metrics[0].name, (
        "Extra metrics name should be the same!"
    )
    # FIX: typo in message: "greatter" -> "greater"
    assert out["extra_metrics"][0].greater_is_better == job.metrics[0].greater_is_better, (
        "Extra metrics greater is better should be the same!"
    )
    # - validation thresholds
    assert out["validation_thresholds"].keys() == thresholds.keys(), (
        "Validation thresholds should have the same keys as thresholds!"
    )
    # - evaluations
    assert out["evaluations"].metrics["example_count"] == inputs_reader.limit, (
        "Evaluations should have the same number of examples as the inputs!"
    )
    assert job.metrics[0].name in out["evaluations"].metrics, "Metric should be logged in Mlflow!"
    # - mlflow tracking
    experiment = mlflow_service.client().get_experiment_by_name(name=mlflow_service.experiment_name)
    assert experiment is not None, "Mlflow Experiment should exist!"
    assert experiment.name == mlflow_service.experiment_name, (
        "Mlflow Experiment name should be the same!"
    )
    runs = mlflow_service.client().search_runs(experiment_ids=experiment.experiment_id)
    assert len(runs) == 2, "There should be a two Mlflow run for training and evaluations!"
    assert metric.name in runs[0].data.metrics, "Metric should be logged in Mlflow!"
    assert runs[0].info.status == "FINISHED", "Mlflow run status should be set as FINISHED!"
    # - alerting service
    assert "Evaluations" in capsys.readouterr().out, "Alerting service should be called!"
# ---- tests/jobs/test_explanations.py ----

# %% IMPORTS

import _pytest.capture as pc
import pytest

from bikes import jobs
from bikes.core import models
from bikes.io import datasets, registries, services

# %% JOBS


@pytest.mark.parametrize("alias_or_version", [1, "Promotion"])
def test_explanations_job(
    alias_or_version: str | int,
    mlflow_service: services.MlflowService,
    alerts_service: services.AlertsService,
    logger_service: services.LoggerService,
    inputs_samples_reader: datasets.ParquetReader,
    tmp_models_explanations_writer: datasets.ParquetWriter,
    tmp_samples_explanations_writer: datasets.ParquetWriter,
    model_alias: registries.Version,
    loader: registries.CustomLoader,
    capsys: pc.CaptureFixture[str],
) -> None:
    """The explanations job should produce model and sample explanations."""
    # given
    if isinstance(alias_or_version, int):
        assert alias_or_version == model_alias.version, "Model version should be the same!"
    else:
        assert alias_or_version == model_alias.aliases[0], "Model alias should be the same!"
    # when
    job = jobs.ExplanationsJob(
        logger_service=logger_service,
        alerts_service=alerts_service,
        mlflow_service=mlflow_service,
        inputs_samples=inputs_samples_reader,
        models_explanations=tmp_models_explanations_writer,
        samples_explanations=tmp_samples_explanations_writer,
        alias_or_version=alias_or_version,
        loader=loader,
    )
    with job as runner:
        out = runner.run()
    # then
    # - vars
    assert set(out) == {
        "self",
        "logger",
        "inputs_samples",
        "model_uri",
        "model",
        "models_explanations",
        "samples_explanations",
    }
    # - inputs
    assert out["inputs_samples"].ndim == 2, "Inputs samples should be a dataframe!"
    # - model uri
    assert str(alias_or_version) in out["model_uri"], "Model URI should contain the model alias!"
    assert mlflow_service.registry_name in out["model_uri"], (
        "Model URI should contain the registry name!"
    )
    # - model
    assert isinstance(out["model"], models.Model), "Model should be an instance of a project Model!"
    # - model explanations
    assert len(out["models_explanations"].index) >= len(out["inputs_samples"].columns), (
        "Model explanations should have at least as many columns as inputs samples!"
    )
    # - samples explanations
    assert len(out["samples_explanations"].index) == len(out["inputs_samples"].index), (
        "Samples explanations should have the same number of rows as inputs samples!"
    )
    assert len(out["samples_explanations"].columns) >= len(out["inputs_samples"].columns), (
        "Samples explanations should have at least as many columns as inputs samples!"
    )
    # - alerting service
    assert "Explanations Job Finished" in capsys.readouterr().out, (
        "Alerting service should be called!"
    )


# ---- tests/jobs/test_inference.py ----

# %% IMPORTS

import _pytest.capture as pc
import pytest

from bikes import jobs
from bikes.io import datasets, registries, services

# %% JOBS


@pytest.mark.parametrize("alias_or_version", [1, "Promotion"])
def test_inference_job(
    alias_or_version: str | int,
    mlflow_service: services.MlflowService,
    alerts_service: services.AlertsService,
    logger_service: services.LoggerService,
    inputs_reader: datasets.ParquetReader,
    tmp_outputs_writer: datasets.ParquetWriter,
    model_alias: registries.Version,
    loader: registries.CustomLoader,
    capsys: pc.CaptureFixture[str],
) -> None:
    """The inference job should load a registered model and write predictions."""
    # given
    if isinstance(alias_or_version, int):
        assert alias_or_version == model_alias.version, "Model version should be the same!"
    else:
        assert alias_or_version == model_alias.aliases[0], "Model alias should be the same!"
    # when
    job = jobs.InferenceJob(
        logger_service=logger_service,
        alerts_service=alerts_service,
        mlflow_service=mlflow_service,
        inputs=inputs_reader,
        outputs=tmp_outputs_writer,
        alias_or_version=alias_or_version,
        loader=loader,
    )
    with job as runner:
        out = runner.run()
    # then
    # - vars
    assert set(out) == {
        "self",
        "logger",
        "inputs",
        "inputs_",
        "model_uri",
        "model",
        "outputs",
    }
    # - inputs
    assert out["inputs"].ndim == out["inputs_"].ndim == 2, "Inputs should be a dataframe!"
    # - model uri
    assert str(alias_or_version) in out["model_uri"], "Model URI should contain the model alias!"
    assert mlflow_service.registry_name in out["model_uri"], (
        "Model URI should contain the registry name!"
    )
    # - model
    assert out["model"].model.metadata.run_id == model_alias.run_id, (
        "Model run id should be the same!"
    )
    assert out["model"].model.metadata.signature is not None, "Model should have a signature!"
    assert out["model"].model.metadata.flavors.get("python_function"), (
        "Model should have a pyfunc flavor!"
    )
    # - outputs
    assert out["outputs"].ndim == 2, "Outputs should be a dataframe!"
    # - alerting service
    assert "Inference Job Finished" in capsys.readouterr().out, "Alerting service should be called!"


# ---- tests/jobs/test_promotion.py ----

# %% IMPORTS

import _pytest.capture as pc
import mlflow
import pytest

from bikes import jobs
from bikes.io import registries, services

# %% JOBS


@pytest.mark.parametrize(
    "version",
    [
        None,  # latest version
        1,  # specific version
        pytest.param(
            2,
            marks=pytest.mark.xfail(
                reason="Version does not exist.",
                raises=mlflow.exceptions.MlflowException,
            ),
        ),
    ],
)
def test_promotion_job(
    version: int | None,
    mlflow_service: services.MlflowService,
    alerts_service: services.AlertsService,
    logger_service: services.LoggerService,
    model_version: registries.Version,
    capsys: pc.CaptureFixture[str],
) -> None:
    """The promotion job should alias an existing registered model version."""
    # given
    alias = "Testing"
    # when
    job = jobs.PromotionJob(
        logger_service=logger_service,
        alerts_service=alerts_service,
        mlflow_service=mlflow_service,
        version=version,
        alias=alias,
    )
    with job as runner:
        out = runner.run()
    # then
    # - vars
    assert set(out) == {
        "self",
        "logger",
        "client",
        "name",
        "version",
        "model_version",
    }
    # - name
    assert out["name"] == mlflow_service.registry_name, "Model name should be the same!"
    # - version
    assert out["version"] == model_version.version, "Version number should be the same!"
    # - model version
    assert out["model_version"].name == out["name"], "Model version name should be the same!"
    assert out["model_version"].version == out["version"], (
        "Model version number should be the same!"
    )
    assert out["model_version"].run_id == model_version.run_id, (
        "Model version run id should be the same!"
    )
    assert out["model_version"].aliases == [alias], (
        "Model version aliases should contain the given alias!"
    )
    # - alerting service
    assert "Promotion Job Finished" in capsys.readouterr().out, "Alerting service should be called!"


# ---- tests/jobs/test_training.py ----

# %% IMPORTS

import _pytest.capture as pc

from bikes import jobs
from bikes.core import metrics, models, schemas
from bikes.io import datasets, registries, services
from bikes.utils import signers, splitters

# %% JOBS


def test_training_job(
    mlflow_service: services.MlflowService,
    alerts_service: services.AlertsService,
    logger_service: services.LoggerService,
    inputs_reader: datasets.ParquetReader,
    targets_reader: datasets.ParquetReader,
    model: models.BaselineSklearnModel,
    metric: metrics.SklearnMetric,
    train_test_splitter: splitters.TrainTestSplitter,
    saver: registries.CustomSaver,
    signer: signers.InferSigner,
    register: registries.MlflowRegister,
    capsys: pc.CaptureFixture[str],
) -> None:
    """The training job should fit, score, sign, save, and register a model."""
    # given
    run_config = mlflow_service.RunConfig(
        name="TrainingTest", tags={"context": "training"}, description="Training job."
    )
    splitter = train_test_splitter
    client = mlflow_service.client()
    # when
    job = jobs.TrainingJob(
        logger_service=logger_service,
        alerts_service=alerts_service,
        mlflow_service=mlflow_service,
        run_config=run_config,
        inputs=inputs_reader,
        targets=targets_reader,
        model=model,
        metrics=[metric],
        splitter=splitter,
        saver=saver,
        signer=signer,
        registry=register,
    )
    with job as runner:
        out = runner.run()
    # then
    # - vars (FIX: "inputs_test" was listed twice in this set literal)
    assert set(out) == {
        "self",
        "logger",
        "client",
        "run",
        "inputs",
        "inputs_",
        "inputs_lineage",
        "targets",
        "targets_",
        "targets_lineage",
        "train_index",
        "test_index",
        "inputs_train",
        "inputs_test",
        "targets_train",
        "targets_test",
        "outputs_test",
        "i",
        "metric",
        "score",
        "model_signature",
        "model_info",
        "model_version",
    }
    # - run
    assert run_config.tags is not None, "Run config tags should be set!"
    assert out["run"].info.run_name == run_config.name, "Run name should be the same!"
    assert run_config.description in out["run"].data.tags.values(), "Run desc. should be tags!"
    # FIX: message was inverted — `>` asserts run tags are a superset of config tags
    assert out["run"].data.tags.items() > run_config.tags.items(), (
        "Run config tags should be a subset of the run tags!"
    )
    # - data
    assert out["inputs"].ndim == out["inputs_"].ndim == 2, "Inputs should be a dataframe!"
    assert out["targets"].ndim == out["targets_"].ndim == 2, "Targets should be a dataframe!"
    # - lineage
    assert out["inputs_lineage"].name == "inputs", "Inputs lineage name should be inputs!"
    assert out["inputs_lineage"].source.uri == inputs_reader.path, (
        "Inputs lineage source should be the inputs reader path!"
    )
    assert out["targets_lineage"].name == "targets", "Targets lineage name should be targets!"
    assert out["targets_lineage"].source.uri == targets_reader.path, (
        "Targets lineage source should be the targets reader path!"
    )
    assert out["targets_lineage"].targets == schemas.TargetsSchema.cnt, (
        "Targets lineage target should be cnt!"
    )
    # - splitter
    assert len(out["inputs_train"]) + len(out["inputs_test"]) == len(out["inputs"]), (
        "Train and test inputs should have the same length as inputs!"
    )
    assert len(out["targets_train"]) + len(out["targets_test"]) == len(out["targets"]), (
        "Train and test targets should have the same length as targets!"
    )
    assert len(out["train_index"]) == len(out["inputs_train"]) == len(out["targets_train"]), (
        "Train inputs and targets should have the same length!"
    )
    assert len(out["test_index"]) == len(out["inputs_test"]) == len(out["targets_test"]), (
        "Test inputs and targets should have the same length!"
    )
    # - outputs
    assert out["outputs_test"].shape == out["targets_test"].shape, (
        "Outputs should have the same shape as targets!"
    )
    assert len(out["test_index"]) == len(out["outputs_test"]) == len(out["inputs_test"]), (
        "Outputs should have the same length as inputs!"
    )
    # - i and score
    assert out["i"] == len(job.metrics), "i should be the number of metrics computed!"
    # FIX: message said "between 0 and 1" but the check is for a finite number
    assert float("-inf") < out["score"] < float("+inf"), "Score should be a finite number!"
    # - model signature
    assert out["model_signature"].inputs is not None, "Model signature inputs should not be None!"
    assert out["model_signature"].outputs is not None, "Model signature outputs should not be None!"
    # - model info
    assert out["model_info"].run_id == out["run"].info.run_id, (
        "Model info run id should be the same!"
    )
    assert out["model_info"].signature == out["model_signature"], (
        "Model info signature should be the same!"
    )
    assert out["model_info"].artifact_path == saver.path, "Model info path should be the same!"
    # - model version
    assert out["model_version"].version == 1, "Model version number should be 1!"
    assert out["model_version"].aliases == [], "Model version aliases should be empty!"
    assert out["model_version"].tags == register.tags, "Model version tags should be the same!"
    assert out["model_version"].name == mlflow_service.registry_name, (
        "Model name should be the same!"
    )
    assert out["model_version"].run_id == out["run"].info.run_id, (
        "Model version run id should be the same!"
    )
    # - mlflow tracking
    experiment = client.get_experiment_by_name(name=mlflow_service.experiment_name)
    assert experiment is not None, "Mlflow Experiment should exist!"
    assert experiment.name == mlflow_service.experiment_name, (
        "Mlflow Experiment name should be the same!"
    )
    runs = client.search_runs(experiment_ids=experiment.experiment_id)
    assert len(runs) == 1, "There should be a single Mlflow run for training!"
    assert metric.name in runs[0].data.metrics, "Metric should be logged in Mlflow!"
    assert runs[0].info.status == "FINISHED", "Mlflow run status should be set as FINISHED!"
    # - mlflow registry
    model_version = client.get_model_version(
        name=mlflow_service.registry_name, version=out["model_version"].version
    )
    assert model_version.run_id == out["run"].info.run_id, (
        "MLFlow model version run id should be the same!"
    )
    # - alerting service
    assert "Training Job Finished" in capsys.readouterr().out, "Alerting service should be called!"
# %% IMPORTS

import _pytest.capture as pc

from bikes import jobs
from bikes.core import metrics, models, schemas
from bikes.io import datasets, services
from bikes.utils import searchers, splitters

# %% JOBS


def test_tuning_job(
    mlflow_service: services.MlflowService,
    alerts_service: services.AlertsService,
    logger_service: services.LoggerService,
    inputs_reader: datasets.ParquetReader,
    targets_reader: datasets.ParquetReader,
    model: models.BaselineSklearnModel,
    metric: metrics.SklearnMetric,
    time_series_splitter: splitters.TimeSeriesSplitter,
    searcher: searchers.GridCVSearcher,
    capsys: pc.CaptureFixture[str],
) -> None:
    """Run the tuning job end-to-end and verify its outputs.

    Checks, in order: the local variables exposed by the job runner, the Mlflow
    run metadata, the loaded data and its lineage, the search results, the
    Mlflow tracking backend (one child run per candidate plus the parent run),
    and the final alerting notification captured on stdout.
    """
    # given
    run_config = mlflow_service.RunConfig(
        name="TuningTest", tags={"context": "tuning"}, description="Tuning job."
    )
    splitter = time_series_splitter
    client = mlflow_service.client()
    # when
    job = jobs.TuningJob(
        logger_service=logger_service,
        alerts_service=alerts_service,
        mlflow_service=mlflow_service,
        run_config=run_config,
        inputs=inputs_reader,
        targets=targets_reader,
        model=model,
        metric=metric,
        splitter=splitter,
        searcher=searcher,
    )
    with job as runner:
        out = runner.run()
    # then
    # - vars: the runner should expose exactly these locals for inspection
    assert set(out) == {
        "self",
        "logger",
        "run",
        "inputs",
        "inputs_",
        "inputs_lineage",
        "targets",
        "targets_",
        "targets_lineage",
        "results",
        "best_params",
        "best_score",
    }
    # - run
    assert run_config.tags is not None, "Run config tags should be set!"
    assert out["run"].info.run_name == run_config.name, "Run name should be the same!"
    assert run_config.description in out["run"].data.tags.values(), "Run desc. should be tags!"
    # dict items views support superset comparison: the run carries the config
    # tags plus the tags added by Mlflow itself, hence the strict ">"
    assert out["run"].data.tags.items() > run_config.tags.items(), (
        "Run config tags should be a subset of the run tags!"
    )
    # - data
    assert out["inputs"].ndim == out["inputs_"].ndim == 2, "Inputs should be a dataframe!"
    # FIX: the original re-checked out["inputs_"] here (copy-paste error), so
    # the validated targets dataframe was never actually verified
    assert out["targets"].ndim == out["targets_"].ndim == 2, "Targets should be a dataframe!"
    # - lineage
    assert out["inputs_lineage"].name == "inputs", "Inputs lineage name should be inputs!"
    assert out["inputs_lineage"].source.uri == inputs_reader.path, (
        "Inputs lineage source should be the inputs reader path!"
    )
    assert out["targets_lineage"].name == "targets", "Targets lineage name should be targets!"
    assert out["targets_lineage"].source.uri == targets_reader.path, (
        "Targets lineage source should be the targets reader path!"
    )
    assert out["targets_lineage"].targets == schemas.TargetsSchema.cnt, (
        "Targets lineage target should be cnt!"
    )
    # - results
    assert out["results"].ndim == 2, "Results should be a dataframe!"
    # - best score: any finite float is acceptable; the metric scale is unspecified
    assert float("-inf") < out["best_score"] < float("inf"), (
        "Best score should be between -inf and +inf!"
    )
    # - best params: the search must report a value for every tuned hyperparameter
    assert out["best_params"].keys() == searcher.param_grid.keys(), (
        "Best params should have the same keys!"
    )
    # - mlflow tracking
    experiment = client.get_experiment_by_name(name=mlflow_service.experiment_name)
    assert experiment is not None, "Mlflow experiment should exist!"
    assert experiment.name == mlflow_service.experiment_name, (
        "Mlflow experiment name should be the same!"
    )
    runs = client.search_runs(experiment_ids=experiment.experiment_id)
    assert len(runs) == len(out["results"]) + 1, "Mlflow should have 1 run per result + parent!"
    # - alerting service
    assert "Tuning Job Finished" in capsys.readouterr().out, "Alerting service should be called!"
# %% IMPORTS

from bikes.core import metrics, models, schemas
from bikes.utils import searchers, splitters

# %% SEARCHERS


def test_grid_cv_searcher(
    model: models.Model,
    metric: metrics.Metric,
    inputs: schemas.Inputs,
    targets: schemas.Targets,
    train_test_splitter: splitters.Splitter,
) -> None:
    """Run a grid search over a small parameter grid and validate its outputs."""
    # given
    candidate_grid = {"max_depth": [3, 5, 7]}
    grid_searcher = searchers.GridCVSearcher(param_grid=candidate_grid)
    # when
    search_results, best_score, best_params = grid_searcher.search(
        model=model,
        metric=metric,
        inputs=inputs,
        targets=targets,
        cv=train_test_splitter,
    )
    # then
    # one result row is expected per grid candidate
    expected_rows = sum(map(len, candidate_grid.values()))
    assert set(best_params) == set(candidate_grid), "Best params should have the same keys as grid!"
    assert float("-inf") < best_score < float("+inf"), "Best score should be a floating number!"
    assert len(search_results) == expected_rows, (
        "Results should have one row per candidate!"
    )
# %% IMPORTS

from bikes.core import schemas
from bikes.utils import splitters

# %% SPLITTERS


def test_train_test_splitter(inputs: schemas.Inputs, targets: schemas.Targets) -> None:
    """Check that the train/test splitter yields a single, correctly sized split."""
    # given
    shuffle = False  # keep the original record order
    test_size = 50
    random_state = 0
    splitter = splitters.TrainTestSplitter(
        shuffle=shuffle, test_size=test_size, random_state=random_state
    )
    # when
    n_splits = splitter.get_n_splits(inputs=inputs, targets=targets)
    splits = list(splitter.split(inputs=inputs, targets=targets))
    train_index, test_index = splits[0]  # train/test indexes
    # then
    assert n_splits == len(splits) == 1, "Splitter should return 1 split!"
    assert len(test_index) == test_size, "Test index should have the given size!"
    assert len(train_index) == len(targets) - test_size, (
        "Train index should have the remaining size!"
    )
    assert not inputs.iloc[test_index].empty, "Test index should be a subset of the inputs!"
    assert not targets.iloc[train_index].empty, "Train index should be a subset of the targets!"


def test_time_series_splitter(inputs: schemas.Inputs, targets: schemas.Targets) -> None:
    """Check that the time series splitter yields ordered, correctly sized splits."""
    # given
    gap = 0
    n_splits = 3
    test_size = 50
    splitter = splitters.TimeSeriesSplitter(gap=gap, n_splits=n_splits, test_size=test_size)
    # when
    # FIX: the original rebound `n_splits` here, so the assertion below only
    # compared the splitter's report against itself and could not detect a
    # splitter that ignored the configured number of splits
    reported_n_splits = splitter.get_n_splits(inputs=inputs, targets=targets)
    splits = list(splitter.split(inputs=inputs, targets=targets))
    # then
    assert reported_n_splits == n_splits == len(splits), (
        "Splitter should return the given n splits!"
    )
    for i, (train_index, test_index) in enumerate(splits):
        assert len(test_index) == test_size, "Test index should have the given test size!"
        # each successive split frees one more test-sized slice for training
        assert len(train_index) == (len(inputs) - test_size * (n_splits - i)), (
            "Train index should have the cumulative remaining size!"
        )
        assert train_index.max() < test_index.min(), (
            "Train index should always be lower than test index!"
        )
        assert not inputs.iloc[train_index].empty, "Train index should be a subset of the inputs!"
        assert not inputs.iloc[test_index].empty, "Test index should be a subset of the inputs!"