├── .cruft.json
├── .env.example
├── .gemini
└── config.yaml
├── .github
├── FUNDING.yml
├── ISSUE_TEMPLATE
│ ├── feat-request.md
│ └── fix-request.md
├── PULL_REQUEST_TEMPLATE.md
├── actions
│ └── setup
│ │ └── action.yml
├── dependabot.yml
├── rulesets
│ └── main.json
└── workflows
│ ├── check.yml
│ └── publish.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .python-version
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── Dockerfile
├── LICENSE.txt
├── MLproject
├── README.md
├── confs
├── evaluations.yaml
├── explanations.yaml
├── inference.yaml
├── promotion.yaml
├── training.yaml
└── tuning.yaml
├── constraints.txt
├── data
├── Readme.txt
├── hour.csv
├── inputs_test.parquet
├── inputs_train.parquet
├── targets_test.parquet
└── targets_train.parquet
├── docker-compose.yml
├── images
└── mlopsmindmap.png
├── justfile
├── mlops-python-package.code-workspace
├── notebooks
├── explain.ipynb
├── indicators.ipynb
├── processing.ipynb
└── prototype.ipynb
├── outputs
└── .gitkeep
├── pyproject.toml
├── python_env.yaml
├── requirements.txt
├── src
└── bikes
│ ├── __init__.py
│ ├── __main__.py
│ ├── core
│ ├── __init__.py
│ ├── metrics.py
│ ├── models.py
│ └── schemas.py
│ ├── io
│ ├── __init__.py
│ ├── configs.py
│ ├── datasets.py
│ ├── registries.py
│ └── services.py
│ ├── jobs
│ ├── __init__.py
│ ├── base.py
│ ├── evaluations.py
│ ├── explanations.py
│ ├── inference.py
│ ├── promotion.py
│ ├── training.py
│ └── tuning.py
│ ├── scripts.py
│ ├── settings.py
│ └── utils
│ ├── __init__.py
│ ├── searchers.py
│ ├── signers.py
│ └── splitters.py
├── tasks
├── check.just
├── clean.just
├── commit.just
├── doc.just
├── docker.just
├── format.just
├── install.just
├── mlflow.just
├── package.just
└── project.just
├── tests
├── confs
│ ├── invalid
│ │ └── 1. invalid.yaml
│ └── valid
│ │ ├── 0. tuning.yaml
│ │ ├── 1. training.yaml
│ │ ├── 2. promotion.yaml
│ │ ├── 3. inference.yaml
│ │ ├── 5. evaluations.yaml
│ │ └── 6. explanations.yaml
├── conftest.py
├── core
│ ├── test_metrics.py
│ ├── test_models.py
│ └── test_schemas.py
├── data
│ ├── inputs_sample.parquet
│ ├── outputs_sample.parquet
│ └── targets_sample.parquet
├── io
│ ├── test_configs.py
│ ├── test_datasets.py
│ ├── test_registries.py
│ └── test_services.py
├── jobs
│ ├── test_base.py
│ ├── test_evaluations.py
│ ├── test_explanations.py
│ ├── test_inference.py
│ ├── test_promotion.py
│ ├── test_training.py
│ └── test_tuning.py
├── test_scripts.py
└── utils
│ ├── test_searchers.py
│ ├── test_signers.py
│ └── test_splitters.py
└── uv.lock
/.cruft.json:
--------------------------------------------------------------------------------
1 | {
2 | "template": "https://github.com/fmind/cookiecutter-mlops-package",
3 | "commit": "2ce51abb4333d594baee46ce590ead4e4cd76142",
4 | "checkout": null,
5 | "context": {
6 | "cookiecutter": {
7 | "user": "fmind",
8 | "name": "MLOps Python Package",
9 | "repository": "mlops-python-package",
10 | "package": "bikes",
11 | "license": "MIT",
12 | "version": "4.0.0",
13 | "description": "Predict the number of bikes available",
14 | "python_version": "3.13",
15 | "mlflow_version": "2.20.3",
16 | "_template": "https://github.com/fmind/cookiecutter-mlops-package"
17 | }
18 | },
19 | "directory": null
20 | }
21 |
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/.env.example
--------------------------------------------------------------------------------
/.gemini/config.yaml:
--------------------------------------------------------------------------------
1 | # https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
2 | have_fun: false
3 | code_review:
4 | disable: false
5 | comment_severity_threshold: MEDIUM
6 | max_review_comments: -1
7 | pull_request_opened:
8 | help: false
9 | summary: true
10 | code_review: true
11 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | # github: ["MLOps-Courses"]
4 | custom: ["https://donate.stripe.com/4gw8xT9oVbCc98s7ss"]
5 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feat-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature Request
3 | about: A new feature.
4 | title: "[FEAT] "
5 | labels: feat
6 | assignees: fmind
7 | ---
8 |
9 | ## Description
10 |
11 | ## Motivation
12 |
13 | ## Solutions
14 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/fix-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Fix Request
3 | about: A bug fix
4 | title: "[FIX] "
5 | labels: fix
6 | assignees: fmind
7 | ---
8 |
9 | ## Bug Description
10 |
11 | ## Expected Behavior
12 |
13 | ## Steps to Reproduce
14 |
15 | ## Additional Context
16 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | # Changes
2 |
3 | # Reasons
4 |
5 | # Testing
6 |
7 | # Impacts
8 |
9 | # Notes
10 |
--------------------------------------------------------------------------------
/.github/actions/setup/action.yml:
--------------------------------------------------------------------------------
1 | name: Setup
2 | description: Setup for project workflows
3 | runs:
4 | using: composite
5 | steps:
6 | - name: Install uv
7 | uses: astral-sh/setup-uv@v5
8 | with:
9 | enable-cache: true
10 | - name: Setup Python
11 | uses: actions/setup-python@v5
12 | with:
13 | python-version-file: .python-version
14 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # https://docs.github.com/en/code-security/dependabot/working-with-dependabot/dependabot-options-reference
2 | version: 2
3 | updates:
4 | - package-ecosystem: "pip"
5 | directory: "/"
6 | schedule:
7 | interval: "weekly"
8 |
--------------------------------------------------------------------------------
/.github/rulesets/main.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "main",
3 | "target": "branch",
4 | "enforcement": "active",
5 | "conditions": {
6 | "ref_name": {
7 | "exclude": [],
8 | "include": [
9 | "~DEFAULT_BRANCH"
10 | ]
11 | }
12 | },
13 | "rules": [
14 | {
15 | "type": "deletion"
16 | },
17 | {
18 | "type": "required_linear_history"
19 | },
20 | {
21 | "type": "pull_request",
22 | "parameters": {
23 | "required_approving_review_count": 0,
24 | "dismiss_stale_reviews_on_push": true,
25 | "require_code_owner_review": false,
26 | "require_last_push_approval": false,
27 | "required_review_thread_resolution": false,
28 | "allowed_merge_methods": [
29 | "squash",
30 | "rebase"
31 | ]
32 | }
33 | },
34 | {
35 | "type": "required_status_checks",
36 | "parameters": {
37 | "strict_required_status_checks_policy": true,
38 | "do_not_enforce_on_create": false,
39 | "required_status_checks": [
40 | {
41 | "context": "checks",
42 | "integration_id": 15368
43 | }
44 | ]
45 | }
46 | },
47 | {
48 | "type": "non_fast_forward"
49 | }
50 | ],
51 | "bypass_actors": [
52 | {
53 | "actor_id": 5,
54 | "actor_type": "RepositoryRole",
55 | "bypass_mode": "always"
56 | }
57 | ]
58 | }
59 |
--------------------------------------------------------------------------------
/.github/workflows/check.yml:
--------------------------------------------------------------------------------
1 | name: Check
2 | on:
3 | pull_request:
4 | branches:
5 | - '*'
6 | concurrency:
7 | cancel-in-progress: true
8 | group: ${{ github.workflow }}-${{ github.ref }}
9 | jobs:
10 | checks:
11 | runs-on: ubuntu-latest
12 | steps:
13 | - uses: actions/checkout@v4
14 | - uses: ./.github/actions/setup
15 | - run: uv sync --group=check
16 | - run: uv run just check-code
17 | - run: uv run just check-type
18 | - run: uv run just check-format
19 | - run: uv run just check-security
20 | - run: uv run just check-coverage
21 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish
2 | on:
3 | release:
4 | types:
5 | - edited
6 | - published
7 | env:
8 | DOCKER_IMAGE: ghcr.io/fmind/mlops-python-package
9 | concurrency:
10 | cancel-in-progress: true
11 | group: publish-workflow
12 | jobs:
13 | pages:
14 | runs-on: ubuntu-latest
15 | steps:
16 | - uses: actions/checkout@v4
17 | - uses: ./.github/actions/setup
18 | - run: uv sync --group=doc
19 | - run: uv run just doc
20 | - uses: JamesIves/github-pages-deploy-action@v4
21 | with:
22 | folder: docs/
23 | branch: gh-pages
24 | packages:
25 | permissions:
26 | packages: write
27 | runs-on: ubuntu-latest
28 | steps:
29 | - uses: actions/checkout@v4
30 | - uses: ./.github/actions/setup
31 | - run: uv sync --only-dev
32 | - run: uv run just package
33 | - uses: docker/login-action@v3
34 | with:
35 | registry: ghcr.io
36 | username: ${{ github.actor }}
37 | password: ${{ secrets.GITHUB_TOKEN }}
38 | - uses: docker/setup-buildx-action@v3
39 | - uses: docker/build-push-action@v6
40 | with:
41 | push: true
42 | context: .
43 | cache-to: type=gha
44 | cache-from: type=gha
45 | tags: |
46 | ${{ env.DOCKER_IMAGE }}:latest
47 | ${{ env.DOCKER_IMAGE }}:${{ github.ref_name }}
48 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # https://git-scm.com/docs/gitignore
2 |
3 | # Build
4 | /dist/
5 | /build/
6 |
7 | # Cache
8 | .cache/
9 | .coverage*
10 | .mypy_cache/
11 | .ruff_cache/
12 | .pytest_cache/
13 |
14 | # Editor
15 | /.idea/
16 | /.vscode/
17 | .ipynb_checkpoints/
18 |
19 | # Environs
20 | .env
21 | /.venv/
22 |
23 | # Project
24 | /docs/*
25 | /mlruns/*
26 | /outputs/*
27 | !**/.gitkeep
28 |
29 | # Python
30 | *.py[cod]
31 | __pycache__/
32 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # https://pre-commit.com
2 | # https://pre-commit.com/hooks.html
3 |
4 | default_language_version:
5 | python: python3.13
6 | repos:
7 | - repo: https://github.com/pre-commit/pre-commit-hooks
8 | rev: 'v5.0.0'
9 | hooks:
10 | - id: check-added-large-files
11 | - id: check-case-conflict
12 | - id: check-merge-conflict
13 | - id: check-toml
14 | - id: check-yaml
15 | - id: debug-statements
16 | - id: end-of-file-fixer
17 | - id: mixed-line-ending
18 | - id: trailing-whitespace
19 | - repo: https://github.com/astral-sh/ruff-pre-commit
20 | rev: 'v0.9.9'
21 | hooks:
22 | - id: ruff
23 | - id: ruff-format
24 | - repo: https://github.com/PyCQA/bandit
25 | rev: '1.8.3'
26 | hooks:
27 | - id: bandit
28 | - repo: https://github.com/commitizen-tools/commitizen
29 | rev: 'v4.4.1'
30 | hooks:
31 | - id: commitizen
32 | - id: commitizen-branch
33 | stages: [pre-push]
34 |
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.13
2 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | ## v4.1.0 (2025-03-05)
2 |
3 | ### Feat
4 |
5 | - **gemini**: add support for gemini code assist (#51)
6 | - **dependabot**: add dependabot configuration file (#50)
7 | - **github**: add default rulesets and installation (#47)
8 |
9 | ### Fix
10 |
11 | - **workflows**: fix just in workflows
12 |
13 | ### Refactor
14 |
15 | - **cruft**: update to new template version
16 |
17 | ## v4.0.0 (2025-03-04)
18 |
19 | ### Feat
20 |
21 | - **tasks**: switch from pyinvoke to just (#42)
22 | - **workflows**: bump GitHub action versions (#41)
23 | - **versions**: bump python and package version (#40)
24 | - **mindmap**: add mindmap of the package (#32)
25 |
26 | ### Fix
27 |
28 | - **version**: ready to bump
29 | - **datasets**: fix dtype backend (#44)
30 |
31 | ### Refactor
32 |
33 | - **cruft**: update to new template version
34 |
35 | ## v2.0.0 (2024-07-28)
36 |
37 | ### Feat
38 |
39 | - **cruft**: adopt cruft and link it to cookiecutter-mlops-package
40 |
41 | ## v1.1.3 (2024-07-28)
42 |
43 | ### Fix
44 |
45 | - **mlproject**: fix calling mlflow run by adding project run in front
46 |
47 | ## v1.1.2 (2024-07-28)
48 |
49 | ### Fix
50 |
51 | - **dependencies**: add setuptools to main dependency for mlflow
52 |
53 | ## v1.1.1 (2024-07-23)
54 |
55 | ### Fix
56 |
57 | - **publish**: fix publication workflow by installing dev dependencies
58 |
59 | ## v1.0.1 (2024-06-28)
60 |
61 | ### Fix
62 |
63 | - **version**: bump
64 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | We as members, contributors, and leaders pledge to make participation in our
6 | community a harassment-free experience for everyone, regardless of age, body
7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
8 | identity and expression, level of experience, education, socio-economic status,
9 | nationality, personal appearance, race, religion, or sexual identity
10 | and orientation.
11 |
12 | We pledge to act and interact in ways that contribute to an open, welcoming,
13 | diverse, inclusive, and healthy community.
14 |
15 | ## Our Standards
16 |
17 | Examples of behavior that contributes to a positive environment for our
18 | community include:
19 |
20 | * Demonstrating empathy and kindness toward other people
21 | * Being respectful of differing opinions, viewpoints, and experiences
22 | * Giving and gracefully accepting constructive feedback
23 | * Accepting responsibility and apologizing to those affected by our mistakes,
24 | and learning from the experience
25 | * Focusing on what is best not just for us as individuals, but for the
26 | overall community
27 |
28 | Examples of unacceptable behavior include:
29 |
30 | * The use of sexualized language or imagery, and sexual attention or
31 | advances of any kind
32 | * Trolling, insulting or derogatory comments, and personal or political attacks
33 | * Public or private harassment
34 | * Publishing others' private information, such as a physical or email
35 | address, without their explicit permission
36 | * Other conduct which could reasonably be considered inappropriate in a
37 | professional setting
38 |
39 | ## Enforcement Responsibilities
40 |
41 | Community leaders are responsible for clarifying and enforcing our standards of
42 | acceptable behavior and will take appropriate and fair corrective action in
43 | response to any behavior that they deem inappropriate, threatening, offensive,
44 | or harmful.
45 |
46 | Community leaders have the right and responsibility to remove, edit, or reject
47 | comments, commits, code, wiki edits, issues, and other contributions that are
48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
49 | decisions when appropriate.
50 |
51 | ## Scope
52 |
53 | This Code of Conduct applies within all community spaces, and also applies when
54 | an individual is officially representing the community in public spaces.
55 | Examples of representing our community include using an official e-mail address,
56 | posting via an official social media account, or acting as an appointed
57 | representative at an online or offline event.
58 |
59 | ## Enforcement
60 |
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported to the community leaders responsible for enforcement at
63 | github@fmind.dev.
64 | All complaints will be reviewed and investigated promptly and fairly.
65 |
66 | All community leaders are obligated to respect the privacy and security of the
67 | reporter of any incident.
68 |
69 | ## Enforcement Guidelines
70 |
71 | Community leaders will follow these Community Impact Guidelines in determining
72 | the consequences for any action they deem in violation of this Code of Conduct:
73 |
74 | ### 1. Correction
75 |
76 | **Community Impact**: Use of inappropriate language or other behavior deemed
77 | unprofessional or unwelcome in the community.
78 |
79 | **Consequence**: A private, written warning from community leaders, providing
80 | clarity around the nature of the violation and an explanation of why the
81 | behavior was inappropriate. A public apology may be requested.
82 |
83 | ### 2. Warning
84 |
85 | **Community Impact**: A violation through a single incident or series
86 | of actions.
87 |
88 | **Consequence**: A warning with consequences for continued behavior. No
89 | interaction with the people involved, including unsolicited interaction with
90 | those enforcing the Code of Conduct, for a specified period of time. This
91 | includes avoiding interactions in community spaces as well as external channels
92 | like social media. Violating these terms may lead to a temporary or
93 | permanent ban.
94 |
95 | ### 3. Temporary Ban
96 |
97 | **Community Impact**: A serious violation of community standards, including
98 | sustained inappropriate behavior.
99 |
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 |
106 | ### 4. Permanent Ban
107 |
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior, harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 |
112 | **Consequence**: A permanent ban from any sort of public interaction within
113 | the community.
114 |
115 | ## Attribution
116 |
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.0, available at
119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120 |
121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
122 | enforcement ladder](https://github.com/mozilla/diversity).
123 |
124 | [homepage]: https://www.contributor-covenant.org
125 |
126 | For answers to common questions about this code of conduct, see the FAQ at
127 | https://www.contributor-covenant.org/faq. Translations are available at
128 | https://www.contributor-covenant.org/translations.
129 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # https://docs.docker.com/engine/reference/builder/
2 |
3 | FROM ghcr.io/astral-sh/uv:python3.13-bookworm
4 | COPY dist/*.whl .
5 | RUN uv pip install --system *.whl
6 | CMD ["bikes", "--help"]
7 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
2 |
3 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
4 |
5 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
6 |
--------------------------------------------------------------------------------
/MLproject:
--------------------------------------------------------------------------------
1 | # https://mlflow.org/docs/latest/projects.html
2 |
3 | name: bikes
4 | python_env: python_env.yaml
5 | entry_points:
6 | main:
7 | parameters:
8 | conf_file: path
9 | command: "PYTHONPATH=src python -m bikes {conf_file}"
10 |
--------------------------------------------------------------------------------
/confs/evaluations.yaml:
--------------------------------------------------------------------------------
1 | job:
2 | KIND: EvaluationsJob
3 | inputs:
4 | KIND: ParquetReader
5 | path: data/inputs_train.parquet
6 | targets:
7 | KIND: ParquetReader
8 | path: data/targets_train.parquet
9 |
--------------------------------------------------------------------------------
/confs/explanations.yaml:
--------------------------------------------------------------------------------
1 | job:
2 | KIND: ExplanationsJob
3 | inputs_samples:
4 | KIND: ParquetReader
5 | path: data/inputs_test.parquet
6 | limit: 100
7 | models_explanations:
8 | KIND: ParquetWriter
9 | path: outputs/models_explanations.parquet
10 | samples_explanations:
11 | KIND: ParquetWriter
12 | path: outputs/samples_explanations.parquet
13 |
--------------------------------------------------------------------------------
/confs/inference.yaml:
--------------------------------------------------------------------------------
1 | job:
2 | KIND: InferenceJob
3 | inputs:
4 | KIND: ParquetReader
5 | path: data/inputs_test.parquet
6 | outputs:
7 | KIND: ParquetWriter
8 | path: outputs/predictions_test.parquet
9 |
--------------------------------------------------------------------------------
/confs/promotion.yaml:
--------------------------------------------------------------------------------
1 | job:
2 | KIND: PromotionJob
3 |
--------------------------------------------------------------------------------
/confs/training.yaml:
--------------------------------------------------------------------------------
1 | job:
2 | KIND: TrainingJob
3 | inputs:
4 | KIND: ParquetReader
5 | path: data/inputs_train.parquet
6 | targets:
7 | KIND: ParquetReader
8 | path: data/targets_train.parquet
9 |
--------------------------------------------------------------------------------
/confs/tuning.yaml:
--------------------------------------------------------------------------------
1 | job:
2 | KIND: TuningJob
3 | inputs:
4 | KIND: ParquetReader
5 | path: data/inputs_train.parquet
6 | targets:
7 | KIND: ParquetReader
8 | path: data/targets_train.parquet
9 |
--------------------------------------------------------------------------------
/data/Readme.txt:
--------------------------------------------------------------------------------
1 | ==========================================
2 | Bike Sharing Dataset
3 | ==========================================
4 |
5 | Hadi Fanaee-T
6 |
7 | Laboratory of Artificial Intelligence and Decision Support (LIAAD), University of Porto
8 | INESC Porto, Campus da FEUP
9 | Rua Dr. Roberto Frias, 378
10 | 4200 - 465 Porto, Portugal
11 |
12 | https://archive.ics.uci.edu/dataset/275/bike+sharing+dataset
13 |
14 | =========================================
15 | Background
16 | =========================================
17 |
 18 | Bike sharing systems are a new generation of traditional bike rentals where the whole process from membership, rental and return
 19 | has become automatic. Through these systems, a user is able to easily rent a bike from a particular position and return
 20 | it at another position. Currently, there are over 500 bike-sharing programs around the world, composed of
 21 | over 500 thousand bicycles. Today, there exists great interest in these systems due to their important role in traffic,
22 | environmental and health issues.
23 |
24 | Apart from interesting real world applications of bike sharing systems, the characteristics of data being generated by
25 | these systems make them attractive for the research. Opposed to other transport services such as bus or subway, the duration
26 | of travel, departure and arrival position is explicitly recorded in these systems. This feature turns bike sharing system into
27 | a virtual sensor network that can be used for sensing mobility in the city. Hence, it is expected that most of important
28 | events in the city could be detected via monitoring these data.
29 |
30 | =========================================
31 | Data Set
32 | =========================================
33 | Bike-sharing rental process is highly correlated to the environmental and seasonal settings. For instance, weather conditions,
34 | precipitation, day of week, season, hour of the day, etc. can affect the rental behaviors. The core data set is related to
35 | the two-year historical log corresponding to years 2011 and 2012 from Capital Bikeshare system, Washington D.C., USA which is
36 | publicly available in http://capitalbikeshare.com/system-data. We aggregated the data on two hourly and daily basis and then
37 | extracted and added the corresponding weather and seasonal information. Weather information are extracted from http://www.freemeteo.com.
38 |
39 | =========================================
40 | Associated tasks
41 | =========================================
42 |
43 | - Regression:
 44 | Prediction of bike rental count hourly or daily based on the environmental and seasonal settings.
45 |
46 | - Event and Anomaly Detection:
 47 | The count of rented bikes is also correlated to some events in the town which easily are traceable via search engines.
48 | For instance, query like "2012-10-30 washington d.c." in Google returns related results to Hurricane Sandy. Some of the important events are
49 | identified in [1]. Therefore the data can be used for validation of anomaly or event detection algorithms as well.
50 |
51 |
52 | =========================================
53 | Files
54 | =========================================
55 |
56 | - Readme.txt
57 | - hour.csv : bike sharing counts aggregated on hourly basis. Records: 17379 hours
58 | - day.csv - bike sharing counts aggregated on daily basis. Records: 731 days
59 |
60 |
61 | =========================================
62 | Dataset characteristics
63 | =========================================
64 | Both hour.csv and day.csv have the following fields, except hr which is not available in day.csv
65 |
66 | - instant: record index
67 | - dteday : date
 68 | - season : season (1:spring, 2:summer, 3:fall, 4:winter)
69 | - yr : year (0: 2011, 1:2012)
70 | - mnth : month ( 1 to 12)
71 | - hr : hour (0 to 23)
 72 | - holiday : whether day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
73 | - weekday : day of the week
74 | - workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
75 | + weathersit :
76 | - 1: Clear, Few clouds, Partly cloudy, Partly cloudy
77 | - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
78 | - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
79 | - 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
80 | - temp : Normalized temperature in Celsius. The values are divided to 41 (max)
81 | - atemp: Normalized feeling temperature in Celsius. The values are divided to 50 (max)
82 | - hum: Normalized humidity. The values are divided to 100 (max)
83 | - windspeed: Normalized wind speed. The values are divided to 67 (max)
84 | - casual: count of casual users
85 | - registered: count of registered users
86 | - cnt: count of total rental bikes including both casual and registered
87 |
88 | =========================================
89 | License
90 | =========================================
91 | Use of this dataset in publications must be cited to the following publication:
92 |
93 | [1] Fanaee-T, Hadi, and Gama, Joao, "Event labeling combining ensemble detectors and background knowledge", Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg, doi:10.1007/s13748-013-0040-3.
94 |
95 | @article{
96 | year={2013},
97 | issn={2192-6352},
98 | journal={Progress in Artificial Intelligence},
99 | doi={10.1007/s13748-013-0040-3},
100 | title={Event labeling combining ensemble detectors and background knowledge},
101 | url={http://dx.doi.org/10.1007/s13748-013-0040-3},
102 | publisher={Springer Berlin Heidelberg},
103 | keywords={Event labeling; Event detection; Ensemble learning; Background knowledge},
104 | author={Fanaee-T, Hadi and Gama, Joao},
105 | pages={1-15}
106 | }
107 |
108 | =========================================
109 | Contact
110 | =========================================
111 |
112 | For further information about this dataset please contact Hadi Fanaee-T (hadi.fanaee@fe.up.pt)
113 |
--------------------------------------------------------------------------------
/data/inputs_test.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/data/inputs_test.parquet
--------------------------------------------------------------------------------
/data/inputs_train.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/data/inputs_train.parquet
--------------------------------------------------------------------------------
/data/targets_test.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/data/targets_test.parquet
--------------------------------------------------------------------------------
/data/targets_train.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/data/targets_train.parquet
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | # https://docs.docker.com/compose/compose-file/
2 |
3 | services:
4 | mlflow:
5 | image: ghcr.io/mlflow/mlflow:v2.20.3
6 | ports:
7 | - 5000:5000
8 | environment:
9 | - MLFLOW_HOST=0.0.0.0
10 | command: mlflow server
11 |
--------------------------------------------------------------------------------
/images/mlopsmindmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/images/mlopsmindmap.png
--------------------------------------------------------------------------------
/justfile:
--------------------------------------------------------------------------------
1 | # https://just.systems/man/en/
2 |
3 | # REQUIRES
4 |
5 | docker := require("docker")
6 | find := require("find")
7 | rm := require("rm")
8 | uv := require("uv")
9 |
10 | # SETTINGS
11 |
12 | set dotenv-load := true
13 |
14 | # VARIABLES
15 |
16 | PACKAGE := "bikes"
17 | REPOSITORY := "bikes"
18 | SOURCES := "src"
19 | TESTS := "tests"
20 |
21 | # DEFAULTS
22 |
23 | # display help information
24 | default:
25 | @just --list
26 |
27 | # IMPORTS
28 |
29 | import 'tasks/check.just'
30 | import 'tasks/clean.just'
31 | import 'tasks/commit.just'
32 | import 'tasks/doc.just'
33 | import 'tasks/docker.just'
34 | import 'tasks/format.just'
35 | import 'tasks/install.just'
36 | import 'tasks/mlflow.just'
37 | import 'tasks/package.just'
38 | import 'tasks/project.just'
39 |
--------------------------------------------------------------------------------
/mlops-python-package.code-workspace:
--------------------------------------------------------------------------------
1 | {
2 | "folders": [
3 | {
4 | "path": "."
5 | }
6 | ],
7 | "settings": {
8 | "editor.formatOnSave": true,
9 | "python.defaultInterpreterPath": ".venv/bin/python",
10 | "python.testing.pytestEnabled": true,
11 | "python.testing.pytestArgs": [
12 | "tests"
13 | ],
14 | "[python]": {
15 | "editor.codeActionsOnSave": {
16 | "source.organizeImports": "explicit"
17 | },
18 | "editor.defaultFormatter": "charliermarsh.ruff",
19 | },
20 | },
21 | "extensions": {
22 | "recommendations": [
23 | "charliermarsh.ruff",
24 | "ms-python.mypy-type-checker",
25 | "ms-python.python",
26 | "ms-python.vscode-pylance",
27 | "redhat.vscode-yaml",
28 | ]
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/outputs/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/outputs/.gitkeep
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | # https://docs.astral.sh/uv/reference/settings/
2 | # https://packaging.python.org/en/latest/guides/writing-pyproject-toml/
3 |
4 | # PROJECT
5 |
6 | [project]
7 | name = "bikes"
8 | version = "4.1.0"
9 | description = "Predict the number of bikes available."
10 | authors = [{ name = "Médéric HURIER", email = "github@fmind.dev" }]
11 | readme = "README.md"
12 | license = { file = "LICENSE.txt" }
13 | keywords = ["mlops", "python", "package"]
14 | requires-python = ">=3.13"
15 | dependencies = [
16 | "loguru>=0.7.3",
17 | "matplotlib>=3.10.1",
18 | "mlflow>=2.20.3",
19 | "numba>=0.61.0",
20 | "numpy>=2.1.3",
21 | "omegaconf>=2.3.0",
22 | "pandas>=2.2.3",
23 | "pandera>=0.23.0",
24 | "plotly>=6.0.0",
25 | "plyer>=2.1.0",
26 | "psutil>=7.0.0",
27 | "pyarrow>=19.0.1",
28 | "pydantic-settings>=2.8.1",
29 | "pydantic>=2.10.6",
30 | "pynvml>=12.0.0",
31 | "scikit-learn>=1.6.1",
32 | "setuptools>=75.8.2",
33 | "shap>=0.46.0",
34 | "hatchling>=1.27.0",
35 | ]
36 |
37 | # LINKS
38 |
39 | [project.urls]
40 | Homepage = "https://github.com/fmind/mlops-python-package"
41 | Documentation = "https://fmind.github.io/mlops-python-package/bikes.html"
42 | Repository = "https://github.com/fmind/mlops-python-package"
43 | "Bug Tracker" = "https://github.com/fmind/mlops-python-package/issues"
44 | Changelog = "https://github.com/fmind/mlops-python-package/blob/main/CHANGELOG.md"
45 |
46 | # SCRIPTS
47 |
48 | [project.scripts]
49 | bikes = 'bikes.scripts:main'
50 |
51 | # DEPENDENCIES
52 |
53 | [dependency-groups]
54 | check = [
55 | "bandit>=1.8.3",
56 | "coverage>=7.6.12",
57 | "mypy>=1.15.0",
58 | "pandera[mypy]>=0.23.0",
59 | "pytest>=8.3.5",
60 | "pytest-cov>=6.0.0",
61 | "pytest-mock>=3.14.0",
62 | "pytest-xdist>=3.6.1",
63 | "ruff>=0.9.9",
64 | ]
65 | commit = ["commitizen>=4.4.1", "pre-commit>=4.1.0"]
66 | dev = ["rust-just>=1.39.0"]
67 | doc = ["pdoc>=15.0.1"]
68 | notebook = ["ipykernel>=6.29.5", "nbformat>=5.10.4"]
69 |
70 | # TOOLS
71 |
72 | [tool.uv]
73 | default-groups = ["check", "commit", "dev", "doc", "notebook"]
74 |
75 | [tool.bandit]
76 | targets = ["src"]
77 |
78 | [tool.commitizen]
79 | name = "cz_conventional_commits"
80 | tag_format = "v$version"
81 | version_scheme = "pep440"
82 | version_provider = "pep621"
83 | changelog_start_rev = "v1.0.0"
84 | update_changelog_on_bump = true
85 |
86 | [tool.coverage.run]
87 | branch = true
88 | source = ["src"]
89 | omit = ["__main__.py"]
90 |
91 | [tool.mypy]
92 | pretty = true
93 | python_version = "3.13"
94 | check_untyped_defs = true
95 | ignore_missing_imports = true
96 | plugins = ["pandera.mypy", "pydantic.mypy"]
97 |
98 | [tool.pytest.ini_options]
99 | addopts = "--verbosity=2"
100 | pythonpath = ["src"]
101 |
102 | [tool.ruff]
103 | fix = true
104 | indent-width = 4
105 | line-length = 100
106 | target-version = "py313"
107 |
108 | [tool.ruff.format]
109 | docstring-code-format = true
110 |
111 | [tool.ruff.lint.pydocstyle]
112 | convention = "google"
113 |
114 | [tool.ruff.lint.per-file-ignores]
115 | "tests/*.py" = ["D100", "D103"]
116 |
117 | # SYSTEMS
118 |
119 | [build-system]
120 | requires = ["hatchling"]
121 | build-backend = "hatchling.build"
122 |
--------------------------------------------------------------------------------
/python_env.yaml:
--------------------------------------------------------------------------------
1 | {
2 | "python": "3.13",
3 | "dependencies": [
4 | "alembic==1.14.1",
5 | "annotated-types==0.7.0",
6 | "antlr4-python3-runtime==4.9.3",
7 | "appnope==0.1.4",
8 | "argcomplete==3.5.3",
9 | "asttokens==3.0.0",
10 | "attrs==25.1.0",
11 | "bandit==1.8.3",
12 | "blinker==1.9.0",
13 | "cachetools==5.5.2",
14 | "certifi==2025.1.31",
15 | "cffi==1.17.1",
16 | "cfgv==3.4.0",
17 | "charset-normalizer==3.4.1",
18 | "click==8.1.8",
19 | "cloudpickle==3.1.1",
20 | "colorama==0.4.6",
21 | "comm==0.2.2",
22 | "commitizen==4.4.1",
23 | "contourpy==1.3.1",
24 | "coverage==7.6.12",
25 | "cycler==0.12.1",
26 | "databricks-sdk==0.44.1",
27 | "debugpy==1.8.12",
28 | "decli==0.6.2",
29 | "decorator==5.2.1",
30 | "deprecated==1.2.18",
31 | "distlib==0.3.9",
32 | "docker==7.1.0",
33 | "execnet==2.1.1",
34 | "executing==2.2.0",
35 | "fastjsonschema==2.21.1",
36 | "filelock==3.17.0",
37 | "flask==3.1.0",
38 | "fonttools==4.56.0",
39 | "gitdb==4.0.12",
40 | "gitpython==3.1.44",
41 | "google-auth==2.38.0",
42 | "graphene==3.4.3",
43 | "graphql-core==3.2.6",
44 | "graphql-relay==3.2.0",
45 | "greenlet==3.1.1",
46 | "gunicorn==23.0.0",
47 | "hatchling==1.27.0",
48 | "identify==2.6.8",
49 | "idna==3.10",
50 | "importlib-metadata==8.6.1",
51 | "iniconfig==2.0.0",
52 | "ipykernel==6.29.5",
53 | "ipython==9.0.0",
54 | "ipython-pygments-lexers==1.1.1",
55 | "itsdangerous==2.2.0",
56 | "jedi==0.19.2",
57 | "jinja2==3.1.5",
58 | "joblib==1.4.2",
59 | "jsonschema==4.23.0",
60 | "jsonschema-specifications==2024.10.1",
61 | "jupyter-client==8.6.3",
62 | "jupyter-core==5.7.2",
63 | "kiwisolver==1.4.8",
64 | "llvmlite==0.44.0",
65 | "loguru==0.7.3",
66 | "mako==1.3.9",
67 | "markdown==3.7",
68 | "markdown-it-py==3.0.0",
69 | "markupsafe==3.0.2",
70 | "matplotlib==3.10.1",
71 | "matplotlib-inline==0.1.7",
72 | "mdurl==0.1.2",
73 | "mlflow==2.20.3",
74 | "mlflow-skinny==2.20.3",
75 | "mypy==1.15.0",
76 | "mypy-extensions==1.0.0",
77 | "narwhals==1.28.0",
78 | "nbformat==5.10.4",
79 | "nest-asyncio==1.6.0",
80 | "nodeenv==1.9.1",
81 | "numba==0.61.0",
82 | "numpy==2.1.3",
83 | "nvidia-ml-py==12.570.86",
84 | "omegaconf==2.3.0",
85 | "opentelemetry-api==1.16.0",
86 | "opentelemetry-sdk==1.16.0",
87 | "opentelemetry-semantic-conventions==0.37b0",
88 | "packaging==24.2",
89 | "pandas==2.2.3",
90 | "pandas-stubs==2.2.3.241126",
91 | "pandera==0.23.0",
92 | "parso==0.8.4",
93 | "pathspec==0.12.1",
94 | "pbr==6.1.1",
95 | "pdoc==15.0.1",
96 | "pexpect==4.9.0",
97 | "pillow==11.1.0",
98 | "platformdirs==4.3.6",
99 | "plotly==6.0.0",
100 | "pluggy==1.5.0",
101 | "plyer==2.1.0",
102 | "pre-commit==4.1.0",
103 | "prompt-toolkit==3.0.50",
104 | "protobuf==5.29.3",
105 | "psutil==7.0.0",
106 | "ptyprocess==0.7.0",
107 | "pure-eval==0.2.3",
108 | "pyarrow==19.0.1",
109 | "pyasn1==0.6.1",
110 | "pyasn1-modules==0.4.1",
111 | "pycparser==2.22",
112 | "pydantic==2.10.6",
113 | "pydantic-core==2.27.2",
114 | "pydantic-settings==2.8.1",
115 | "pygments==2.19.1",
116 | "pynvml==12.0.0",
117 | "pyparsing==3.2.1",
118 | "pytest==8.3.5",
119 | "pytest-cov==6.0.0",
120 | "pytest-mock==3.14.0",
121 | "pytest-xdist==3.6.1",
122 | "python-dateutil==2.9.0.post0",
123 | "python-dotenv==1.0.1",
124 | "pytz==2025.1",
125 | "pyyaml==6.0.2",
126 | "pyzmq==26.2.1",
127 | "questionary==2.1.0",
128 | "referencing==0.36.2",
129 | "requests==2.32.3",
130 | "rich==13.9.4",
131 | "rpds-py==0.23.1",
132 | "rsa==4.9",
133 | "ruff==0.9.9",
134 | "scikit-learn==1.6.1",
135 | "scipy==1.15.2",
136 | "setuptools==75.8.2",
137 | "shap==0.46.0",
138 | "six==1.17.0",
139 | "slicer==0.0.8",
140 | "smmap==5.0.2",
141 | "sqlalchemy==2.0.38",
142 | "sqlparse==0.5.3",
143 | "stack-data==0.6.3",
144 | "stevedore==5.4.1",
145 | "termcolor==2.5.0",
146 | "threadpoolctl==3.5.0",
147 | "tomlkit==0.13.2",
148 | "tornado==6.4.2",
149 | "tqdm==4.67.1",
150 | "traitlets==5.14.3",
151 | "trove-classifiers==2025.3.3.18",
152 | "typeguard==4.4.2",
153 | "types-pytz==2025.1.0.20250204",
154 | "typing-extensions==4.12.2",
155 | "typing-inspect==0.9.0",
156 | "tzdata==2025.1",
157 | "urllib3==2.3.0",
158 | "virtualenv==20.29.2",
159 | "waitress==3.0.2",
160 | "wcwidth==0.2.13",
161 | "werkzeug==3.1.3",
162 | "win32-setctime==1.2.0",
163 | "wrapt==1.17.2",
164 | "zipp==3.21.0"
165 | ]
166 | }
167 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # This file was autogenerated by uv via the following command:
2 | # uv export --format=requirements-txt --no-dev --no-hashes --no-editable --no-emit-project --output-file=requirements.txt
3 | alembic==1.14.1
4 | annotated-types==0.7.0
5 | antlr4-python3-runtime==4.9.3
6 | appnope==0.1.4 ; platform_system == 'Darwin'
7 | argcomplete==3.6.1
8 | asttokens==3.0.0
9 | attrs==25.1.0
10 | bandit==1.8.3
11 | blinker==1.9.0
12 | cachetools==5.5.2
13 | certifi==2025.1.31
14 | cffi==1.17.1 ; implementation_name == 'pypy'
15 | cfgv==3.4.0
16 | charset-normalizer==3.4.1
17 | click==8.1.8
18 | cloudpickle==3.1.1
19 | colorama==0.4.6
20 | comm==0.2.2
21 | commitizen==4.4.1
22 | contourpy==1.3.1
23 | coverage==7.6.12
24 | cycler==0.12.1
25 | databricks-sdk==0.44.1
26 | debugpy==1.8.12
27 | decli==0.6.2
28 | decorator==5.2.1
29 | deprecated==1.2.18
30 | distlib==0.3.9
31 | docker==7.1.0
32 | execnet==2.1.1
33 | executing==2.2.0
34 | fastjsonschema==2.21.1
35 | filelock==3.18.0
36 | flask==3.1.0
37 | fonttools==4.56.0
38 | gitdb==4.0.12
39 | gitpython==3.1.44
40 | google-auth==2.38.0
41 | graphene==3.4.3
42 | graphql-core==3.2.6
43 | graphql-relay==3.2.0
44 | greenlet==3.1.1 ; (python_full_version == '3.13.*' and platform_machine == 'AMD64') or (python_full_version == '3.13.*' and platform_machine == 'WIN32') or (python_full_version == '3.13.*' and platform_machine == 'aarch64') or (python_full_version == '3.13.*' and platform_machine == 'amd64') or (python_full_version == '3.13.*' and platform_machine == 'ppc64le') or (python_full_version == '3.13.*' and platform_machine == 'win32') or (python_full_version == '3.13.*' and platform_machine == 'x86_64')
45 | gunicorn==23.0.0 ; platform_system != 'Windows'
46 | hatchling==1.27.0
47 | identify==2.6.9
48 | idna==3.10
49 | importlib-metadata==8.6.1
50 | iniconfig==2.0.0
51 | ipykernel==6.29.5
52 | ipython==9.0.2
53 | ipython-pygments-lexers==1.1.1
54 | itsdangerous==2.2.0
55 | jedi==0.19.2
56 | jinja2==3.1.5
57 | joblib==1.4.2
58 | jsonschema==4.23.0
59 | jsonschema-specifications==2024.10.1
60 | jupyter-client==8.6.3
61 | jupyter-core==5.7.2
62 | kiwisolver==1.4.8
63 | llvmlite==0.44.0
64 | loguru==0.7.3
65 | mako==1.3.9
66 | markdown==3.7
67 | markdown-it-py==3.0.0
68 | markupsafe==3.0.2
69 | matplotlib==3.10.1
70 | matplotlib-inline==0.1.7
71 | mdurl==0.1.2
72 | mlflow==2.20.3
73 | mlflow-skinny==2.20.3
74 | mypy==1.15.0
75 | mypy-extensions==1.0.0
76 | narwhals==1.28.0
77 | nbformat==5.10.4
78 | nest-asyncio==1.6.0
79 | nodeenv==1.9.1
80 | numba==0.61.0
81 | numpy==2.1.3
82 | nvidia-ml-py==12.570.86
83 | omegaconf==2.3.0
84 | opentelemetry-api==1.16.0
85 | opentelemetry-sdk==1.16.0
86 | opentelemetry-semantic-conventions==0.37b0
87 | packaging==24.2
88 | pandas==2.2.3
89 | pandas-stubs==2.2.3.250308
90 | pandera==0.23.0
91 | parso==0.8.4
92 | pathspec==0.12.1
93 | pbr==6.1.1
94 | pdoc==15.0.1
95 | pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32'
96 | pillow==11.1.0
97 | platformdirs==4.3.6
98 | plotly==6.0.0
99 | pluggy==1.5.0
100 | plyer==2.1.0
101 | pre-commit==4.1.0
102 | prompt-toolkit==3.0.50
103 | protobuf==5.29.3
104 | psutil==7.0.0
105 | ptyprocess==0.7.0 ; sys_platform != 'emscripten' and sys_platform != 'win32'
106 | pure-eval==0.2.3
107 | pyarrow==19.0.1
108 | pyasn1==0.6.1
109 | pyasn1-modules==0.4.1
110 | pycparser==2.22 ; implementation_name == 'pypy'
111 | pydantic==2.10.6
112 | pydantic-core==2.27.2
113 | pydantic-settings==2.8.1
114 | pygments==2.19.1
115 | pynvml==12.0.0
116 | pyparsing==3.2.1
117 | pytest==8.3.5
118 | pytest-cov==6.0.0
119 | pytest-mock==3.14.0
120 | pytest-xdist==3.6.1
121 | python-dateutil==2.9.0.post0
122 | python-dotenv==1.0.1
123 | pytz==2025.1
124 | pywin32==310 ; sys_platform == 'win32'
125 | pyyaml==6.0.2
126 | pyzmq==26.2.1
127 | questionary==2.1.0
128 | referencing==0.36.2
129 | requests==2.32.3
130 | rich==13.9.4
131 | rpds-py==0.24.0
132 | rsa==4.9
133 | ruff==0.11.2
134 | scikit-learn==1.6.1
135 | scipy==1.15.2
136 | setuptools==75.8.2
137 | shap==0.46.0
138 | six==1.17.0
139 | slicer==0.0.8
140 | smmap==5.0.2
141 | sqlalchemy==2.0.38
142 | sqlparse==0.5.3
143 | stack-data==0.6.3
144 | stevedore==5.4.1
145 | termcolor==2.5.0
146 | threadpoolctl==3.5.0
147 | tomlkit==0.13.2
148 | tornado==6.4.2
149 | tqdm==4.67.1
150 | traitlets==5.14.3
151 | trove-classifiers==2025.3.3.18
152 | typeguard==4.4.2
153 | types-pytz==2025.2.0.20250326
154 | typing-extensions==4.12.2
155 | typing-inspect==0.9.0
156 | tzdata==2025.1
157 | urllib3==2.3.0
158 | virtualenv==20.29.3
159 | waitress==3.0.2 ; platform_system == 'Windows'
160 | wcwidth==0.2.13
161 | werkzeug==3.1.3
162 | win32-setctime==1.2.0 ; sys_platform == 'win32'
163 | wrapt==1.17.2
164 | zipp==3.21.0
165 |
--------------------------------------------------------------------------------
/src/bikes/__init__.py:
--------------------------------------------------------------------------------
1 | """Predict the number of bikes available."""
2 |
--------------------------------------------------------------------------------
/src/bikes/__main__.py:
--------------------------------------------------------------------------------
1 | """Entry point of the package."""
2 |
3 | # %% IMPORTS
4 |
5 | from bikes import scripts
6 |
7 | # %% MAIN
8 |
if __name__ == "__main__":
    # Delegate execution to the package CLI entry point (bikes.scripts.main).
    scripts.main()
11 |
--------------------------------------------------------------------------------
/src/bikes/core/__init__.py:
--------------------------------------------------------------------------------
1 | """Core components of the project."""
2 |
--------------------------------------------------------------------------------
/src/bikes/core/metrics.py:
--------------------------------------------------------------------------------
1 | """Evaluate model performances with metrics."""
2 |
3 | # %% IMPORTS
4 |
5 | from __future__ import annotations
6 |
7 | import abc
8 | import typing as T
9 |
10 | import mlflow
11 | import pandas as pd
12 | import pydantic as pdt
13 | from mlflow.metrics import MetricValue
14 | from sklearn import metrics as sklearn_metrics
15 |
16 | from bikes.core import models, schemas
17 |
18 | # %% TYPINGS
19 |
20 | MlflowMetric: T.TypeAlias = MetricValue
21 | MlflowThreshold: T.TypeAlias = mlflow.models.MetricThreshold
22 | MlflowModelValidationFailedException: T.TypeAlias = (
23 | mlflow.models.evaluation.validation.ModelValidationFailedException
24 | )
25 |
26 | # %% METRICS
27 |
28 |
class Metric(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"):
    """Base class for a project metric.

    Use metrics to evaluate model performance.
    e.g., accuracy, precision, recall, MAE, F1, ...

    Parameters:
        name (str): name of the metric for the reporting.
        greater_is_better (bool): maximize or minimize result.
    """

    # Discriminator field: lets pydantic resolve the concrete metric class.
    KIND: str

    name: str
    greater_is_better: bool

    @abc.abstractmethod
    def score(self, targets: schemas.Targets, outputs: schemas.Outputs) -> float:
        """Score the outputs against the targets.

        Args:
            targets (schemas.Targets): expected values.
            outputs (schemas.Outputs): predicted values.

        Returns:
            float: single result from the metric computation.
        """

    def scorer(
        self, model: models.Model, inputs: schemas.Inputs, targets: schemas.Targets
    ) -> float:
        """Score model outputs against targets.

        Args:
            model (models.Model): model to evaluate.
            inputs (schemas.Inputs): model inputs values.
            targets (schemas.Targets): model expected values.

        Returns:
            float: single result from the metric computation.
        """
        outputs = model.predict(inputs=inputs)
        score = self.score(targets=targets, outputs=outputs)
        return score

    def to_mlflow(self) -> MlflowMetric:
        """Convert the metric to an Mlflow metric.

        Returns:
            MlflowMetric: the Mlflow metric.
        """

        def eval_fn(predictions: pd.Series[int], targets: pd.Series[int]) -> MlflowMetric:
            """Evaluation function associated with the mlflow metric.

            Args:
                predictions (pd.Series): model predictions.
                targets (pd.Series): model targets.

            Returns:
                MlflowMetric: the mlflow metric.
            """
            # Rebuild typed dataframes so self.score can be reused unchanged.
            score_targets = schemas.Targets(
                {schemas.TargetsSchema.cnt: targets}, index=targets.index
            )
            score_outputs = schemas.Outputs(
                {schemas.OutputsSchema.prediction: predictions}, index=predictions.index
            )
            # NOTE: self.score may already flip the sign for losses (see SklearnMetric);
            # this second flip restores the raw orientation for mlflow reporting.
            sign = 1 if self.greater_is_better else -1  # reverse the effect
            score = self.score(targets=score_targets, outputs=score_outputs)
            return MlflowMetric(aggregate_results={self.name: score * sign})

        return mlflow.metrics.make_metric(
            eval_fn=eval_fn, name=self.name, greater_is_better=self.greater_is_better
        )
104 |
105 |
class SklearnMetric(Metric):
    """Metric backed by a scikit-learn scoring function.

    Parameters:
        name (str): name of the sklearn metric.
        greater_is_better (bool): maximize or minimize.
    """

    KIND: T.Literal["SklearnMetric"] = "SklearnMetric"

    name: str = "mean_squared_error"
    greater_is_better: bool = False

    @T.override
    def score(self, targets: schemas.Targets, outputs: schemas.Outputs) -> float:
        # Resolve the scoring function dynamically from the sklearn.metrics namespace.
        metric_fn = getattr(sklearn_metrics, self.name)
        raw = metric_fn(
            y_true=targets[schemas.TargetsSchema.cnt],
            y_pred=outputs[schemas.OutputsSchema.prediction],
        )
        # Flip the sign of losses so that a higher score is always better.
        return float(raw) if self.greater_is_better else -float(raw)
127 |
128 |
# Union of metric implementations; the KIND field discriminates the concrete class.
MetricKind = SklearnMetric
MetricsKind: T.TypeAlias = list[T.Annotated[MetricKind, pdt.Field(discriminator="KIND")]]
131 |
132 | # %% THRESHOLDS
133 |
134 |
class Threshold(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"):
    """A project threshold for a metric.

    Thresholds let you monitor model performances,
    e.g., to trigger an alert when a threshold is met.

    Parameters:
        threshold (int | float): absolute threshold value.
        greater_is_better (bool): maximize or minimize result.
    """

    threshold: int | float
    greater_is_better: bool

    def to_mlflow(self) -> MlflowThreshold:
        """Convert the threshold to an mlflow threshold.

        Returns:
            MlflowThreshold: the mlflow threshold.
        """
        kwargs = {"threshold": self.threshold, "greater_is_better": self.greater_is_better}
        return MlflowThreshold(**kwargs)
156 |
--------------------------------------------------------------------------------
/src/bikes/core/models.py:
--------------------------------------------------------------------------------
1 | """Define trainable machine learning models."""
2 |
3 | # %% IMPORTS
4 |
5 | import abc
6 | import typing as T
7 |
8 | import pandas as pd
9 | import pydantic as pdt
10 | import shap
11 | from sklearn import compose, ensemble, pipeline, preprocessing
12 |
13 | from bikes.core import schemas
14 |
15 | # %% TYPES
16 |
17 | # Model params
18 | ParamKey = str
19 | ParamValue = T.Any
20 | Params = dict[ParamKey, ParamValue]
21 |
22 | # %% MODELS
23 |
24 |
class Model(abc.ABC, pdt.BaseModel, strict=True, frozen=False, extra="forbid"):
    """Base class for a project model.

    A model adapts an AI/ML framework behind a common interface,
    e.g., so one implementation can be swapped for another.
    """

    # Discriminator field: lets pydantic resolve the concrete model class.
    KIND: str

    def get_params(self, deep: bool = True) -> Params:
        """Return the model params.

        Args:
            deep (bool, optional): ignored by this implementation.

        Returns:
            Params: internal model parameters.
        """
        # Private attributes and uppercase constant-like fields are not params.
        return {
            key: value
            for key, value in self.model_dump().items()
            if not key.startswith("_") and not key.isupper()
        }

    def set_params(self, **params: ParamValue) -> T.Self:
        """Assign the given params on the model instance (in place).

        Returns:
            T.Self: instance of the model.
        """
        for name, new_value in params.items():
            setattr(self, name, new_value)
        return self

    @abc.abstractmethod
    def fit(self, inputs: schemas.Inputs, targets: schemas.Targets) -> T.Self:
        """Fit the model on the given inputs and targets.

        Args:
            inputs (schemas.Inputs): model training inputs.
            targets (schemas.Targets): model training targets.

        Returns:
            T.Self: instance of the model.
        """

    @abc.abstractmethod
    def predict(self, inputs: schemas.Inputs) -> schemas.Outputs:
        """Generate outputs with the model for the given inputs.

        Args:
            inputs (schemas.Inputs): model prediction inputs.

        Returns:
            schemas.Outputs: model prediction outputs.
        """

    def explain_model(self) -> schemas.FeatureImportances:
        """Explain the internal model structure.

        Raises:
            NotImplementedError: not supported by this model.

        Returns:
            schemas.FeatureImportances: feature importances.
        """
        raise NotImplementedError()

    def explain_samples(self, inputs: schemas.Inputs) -> schemas.SHAPValues:
        """Explain model outputs on input samples.

        Args:
            inputs (schemas.Inputs): samples to explain.

        Raises:
            NotImplementedError: not supported by this model.

        Returns:
            schemas.SHAPValues: SHAP values.
        """
        raise NotImplementedError()

    def get_internal_model(self) -> T.Any:
        """Return the internal model held by the object.

        Raises:
            NotImplementedError: method not implemented.

        Returns:
            T.Any: any internal model (either empty or fitted).
        """
        raise NotImplementedError()
108 |
109 |
class BaselineSklearnModel(Model):
    """Simple baseline model based on scikit-learn.

    Parameters:
        max_depth (int): maximum depth of the random forest.
        n_estimators (int): number of estimators in the random forest.
        random_state (int, optional): random state of the machine learning pipeline.
    """

    KIND: T.Literal["BaselineSklearnModel"] = "BaselineSklearnModel"

    # params
    max_depth: int = 20
    n_estimators: int = 200
    random_state: int | None = 42
    # private
    _pipeline: pipeline.Pipeline | None = None
    _numericals: list[str] = [
        "yr",
        "mnth",
        "hr",
        "holiday",
        "weekday",
        "workingday",
        "temp",
        "atemp",
        "hum",
        "windspeed",
        "casual",
        "registered",  # NOTE(review): highly correlated with the target — kept as-is, confirm intent
    ]
    _categoricals: list[str] = [
        "season",
        "weathersit",
    ]

    @T.override
    def fit(self, inputs: schemas.Inputs, targets: schemas.Targets) -> "BaselineSklearnModel":
        """Fit a transform + random-forest pipeline on the training data."""
        # One-hot encode categoricals; unseen categories are ignored at predict time.
        encoder = preprocessing.OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        preprocessor = compose.ColumnTransformer(
            transformers=[
                ("categoricals", encoder, self._categoricals),
                ("numericals", "passthrough", self._numericals),
            ],
            remainder="drop",
        )
        forest = ensemble.RandomForestRegressor(
            max_depth=self.max_depth,
            n_estimators=self.n_estimators,
            random_state=self.random_state,
        )
        # Step names "transformer"/"regressor" are relied upon by the explain methods below.
        self._pipeline = pipeline.Pipeline(
            steps=[("transformer", preprocessor), ("regressor", forest)]
        )
        self._pipeline.fit(X=inputs, y=targets[schemas.TargetsSchema.cnt])
        return self

    @T.override
    def predict(self, inputs: schemas.Inputs) -> schemas.Outputs:
        """Predict with the fitted pipeline and validate the output frame."""
        fitted = self.get_internal_model()
        raw = fitted.predict(inputs)
        frame = pd.DataFrame(
            data={schemas.OutputsSchema.prediction: raw}, index=inputs.index
        )
        return schemas.OutputsSchema.check(data=frame)

    @T.override
    def explain_model(self) -> schemas.FeatureImportances:
        """Report the random forest feature importances per transformed feature."""
        fitted = self.get_internal_model()
        names = fitted.named_steps["transformer"].get_feature_names_out()
        frame = pd.DataFrame(
            data={
                "feature": names,
                "importance": fitted.named_steps["regressor"].feature_importances_,
            }
        )
        return schemas.FeatureImportancesSchema.check(data=frame)

    @T.override
    def explain_samples(self, inputs: schemas.Inputs) -> schemas.SHAPValues:
        """Compute SHAP values for the given samples with a tree explainer."""
        fitted = self.get_internal_model()
        transformer = fitted.named_steps["transformer"]
        features = transformer.transform(X=inputs)
        tree_explainer = shap.TreeExplainer(model=fitted.named_steps["regressor"])
        frame = pd.DataFrame(
            data=tree_explainer.shap_values(X=features),
            columns=transformer.get_feature_names_out(),
        )
        return schemas.SHAPValuesSchema.check(data=frame)

    @T.override
    def get_internal_model(self) -> pipeline.Pipeline:
        """Return the fitted pipeline.

        Raises:
            ValueError: if the model has not been fitted yet.
        """
        if self._pipeline is None:
            raise ValueError("Model is not fitted yet!")
        return self._pipeline
220 |
221 |
222 | ModelKind = BaselineSklearnModel
223 |
--------------------------------------------------------------------------------
/src/bikes/core/schemas.py:
--------------------------------------------------------------------------------
1 | """Define and validate dataframe schemas."""
2 |
3 | # %% IMPORTS
4 |
5 | import typing as T
6 |
7 | import pandas as pd
8 | import pandera as pa
9 | import pandera.typing as papd
10 | import pandera.typing.common as padt
11 |
12 | # %% TYPES
13 |
14 | # Generic type for a dataframe container
15 | TSchema = T.TypeVar("TSchema", bound="pa.DataFrameModel")
16 |
17 | # %% SCHEMAS
18 |
19 |
class Schema(pa.DataFrameModel):
    """Base class for a dataframe schema.

    A schema types a dataframe object,
    e.g., to communicate and validate its fields.
    """

    class Config:
        """Default configurations for all schemas.

        Parameters:
            coerce (bool): convert data type if possible.
            strict (bool): ensure the data type is correct.
        """

        coerce: bool = True
        strict: bool = True

    @classmethod
    def check(cls: T.Type[TSchema], data: pd.DataFrame) -> papd.DataFrame[TSchema]:
        """Check the dataframe with this schema.

        Args:
            data (pd.DataFrame): dataframe to check.

        Returns:
            papd.DataFrame[TSchema]: validated dataframe.
        """
        validated = cls.validate(data)
        # Narrow the pandera result to the typed dataframe alias for static checkers.
        return T.cast(papd.DataFrame[TSchema], validated)
49 |
50 |
class InputsSchema(Schema):
    """Schema for the project inputs."""

    # Record identifier used as the dataframe index (non-negative).
    instant: papd.Index[padt.UInt32] = pa.Field(ge=0)
    # Date of the record.
    dteday: papd.Series[padt.DateTime] = pa.Field()
    # Season code, one of 1-4.
    season: papd.Series[padt.UInt8] = pa.Field(isin=[1, 2, 3, 4])
    # Year flag, restricted to 0 or 1 (presumably a dataset year offset — confirm in data docs).
    yr: papd.Series[padt.UInt8] = pa.Field(ge=0, le=1)
    # Month number: 1-12.
    mnth: papd.Series[padt.UInt8] = pa.Field(ge=1, le=12)
    # Hour of day: 0-23.
    hr: papd.Series[padt.UInt8] = pa.Field(ge=0, le=23)
    # Holiday indicator.
    holiday: papd.Series[padt.Bool] = pa.Field()
    # Day of week: 0-6.
    weekday: papd.Series[padt.UInt8] = pa.Field(ge=0, le=6)
    # Working day indicator.
    workingday: papd.Series[padt.Bool] = pa.Field()
    # Weather situation code: 1-4.
    weathersit: papd.Series[padt.UInt8] = pa.Field(ge=1, le=4)
    # Temperature, bounded to [0, 1].
    temp: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
    # Feeling temperature, bounded to [0, 1].
    atemp: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
    # Humidity, bounded to [0, 1].
    hum: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
    # Wind speed, bounded to [0, 1].
    windspeed: papd.Series[padt.Float16] = pa.Field(ge=0, le=1)
    # Casual user count (non-negative).
    casual: papd.Series[padt.UInt32] = pa.Field(ge=0)
    # Registered user count (non-negative).
    registered: papd.Series[padt.UInt32] = pa.Field(ge=0)


# Typed alias for a dataframe validated against InputsSchema.
Inputs = papd.DataFrame[InputsSchema]
73 |
74 |
class TargetsSchema(Schema):
    """Schema for the project target."""

    # Record identifier used as the dataframe index (non-negative).
    instant: papd.Index[padt.UInt32] = pa.Field(ge=0)
    # Target count value (non-negative).
    cnt: papd.Series[padt.UInt32] = pa.Field(ge=0)


# Typed alias for a dataframe validated against TargetsSchema.
Targets = papd.DataFrame[TargetsSchema]
83 |
84 |
class OutputsSchema(Schema):
    """Schema for the project output."""

    # Record identifier used as the dataframe index (non-negative).
    instant: papd.Index[padt.UInt32] = pa.Field(ge=0)
    # Predicted count value (non-negative).
    prediction: papd.Series[padt.UInt32] = pa.Field(ge=0)


# Typed alias for a dataframe validated against OutputsSchema.
Outputs = papd.DataFrame[OutputsSchema]
93 |
94 |
class SHAPValuesSchema(Schema):
    """Schema for the project shap values."""

    class Config:
        """Default configurations for this schema.

        Parameters:
            dtype (str): dataframe default data type.
            strict (bool): ensure the data type is correct.
        """

        # Columns are model features and not known upfront:
        # relax strictness and coerce every column to float32.
        dtype: str = "float32"
        strict: bool = False


# Typed alias for a dataframe validated against SHAPValuesSchema.
SHAPValues = papd.DataFrame[SHAPValuesSchema]
111 |
112 |
class FeatureImportancesSchema(Schema):
    """Schema for the project feature importances."""

    # Feature name (as produced by the model's transformer in explain_model).
    feature: papd.Series[padt.String] = pa.Field()
    # Importance weight associated with the feature.
    importance: papd.Series[padt.Float32] = pa.Field()


# Typed alias for a dataframe validated against FeatureImportancesSchema.
FeatureImportances = papd.DataFrame[FeatureImportancesSchema]
121 |
--------------------------------------------------------------------------------
/src/bikes/io/__init__.py:
--------------------------------------------------------------------------------
1 | """Components related to external operations (inputs and outputs)."""
2 |
--------------------------------------------------------------------------------
/src/bikes/io/configs.py:
--------------------------------------------------------------------------------
1 | """Parse, merge, and convert config objects."""
2 |
3 | # %% IMPORTS
4 |
5 | import typing as T
6 |
7 | import omegaconf as oc
8 |
9 | # %% TYPES
10 |
11 | Config = oc.ListConfig | oc.DictConfig
12 |
13 | # %% PARSERS
14 |
15 |
def parse_file(path: str) -> Config:
    """Load a config object from a file on disk.

    Args:
        path (str): path to local config.

    Returns:
        Config: representation of the config file.
    """
    loaded = oc.OmegaConf.load(path)
    return loaded
26 |
27 |
def parse_string(string: str) -> Config:
    """Build a config object from a raw string.

    Args:
        string (str): content of config string.

    Returns:
        Config: representation of the config string.
    """
    created = oc.OmegaConf.create(string)
    return created
38 |
39 |
40 | # %% MERGERS
41 |
42 |
def merge_configs(configs: T.Sequence[Config]) -> Config:
    """Combine a sequence of configs into a single config.

    Args:
        configs (T.Sequence[Config]): list of configs.

    Returns:
        Config: representation of the merged config objects.
    """
    merged = oc.OmegaConf.merge(*configs)
    return merged
53 |
54 |
55 | # %% CONVERTERS
56 |
57 |
def to_object(config: Config, resolve: bool = True) -> object:
    """Convert a config object to a plain python container.

    Args:
        config (Config): representation of the config.
        resolve (bool): resolve variables. Defaults to True.

    Returns:
        object: conversion of the config to a python object.
    """
    container = oc.OmegaConf.to_container(config, resolve=resolve)
    return container
69 |
--------------------------------------------------------------------------------
/src/bikes/io/datasets.py:
--------------------------------------------------------------------------------
1 | """Read/Write datasets from/to external sources/destinations."""
2 |
3 | # %% IMPORTS
4 |
5 | import abc
6 | import typing as T
7 |
8 | import mlflow.data.pandas_dataset as lineage
9 | import pandas as pd
10 | import pydantic as pdt
11 |
12 | # %% TYPINGS
13 |
14 | Lineage: T.TypeAlias = lineage.PandasDataset
15 |
16 | # %% READERS
17 |
18 |
class Reader(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"):
    """Base class for a dataset reader.

    Use a reader to load a dataset in memory.
    e.g., to read file, database, cloud storage, ...

    Parameters:
        limit (int, optional): maximum number of rows to read. Defaults to None.
    """

    # discriminator field: each concrete reader sets its own literal value
    KIND: str

    # None means read the full dataset
    limit: int | None = None

    @abc.abstractmethod
    def read(self) -> pd.DataFrame:
        """Read a dataframe from a dataset.

        Returns:
            pd.DataFrame: dataframe representation.
        """

    @abc.abstractmethod
    def lineage(
        self,
        name: str,
        data: pd.DataFrame,
        targets: str | None = None,
        predictions: str | None = None,
    ) -> Lineage:
        """Generate lineage information.

        Args:
            name (str): dataset name.
            data (pd.DataFrame): reader dataframe.
            targets (str | None): name of the target column.
            predictions (str | None): name of the prediction column.

        Returns:
            Lineage: lineage information.
        """
60 |
61 |
class ParquetReader(Reader):
    """Reader for datasets stored as parquet files.

    Parameters:
        path (str): local path to the dataset.
        backend (str): pandas dtype backend used when reading.
    """

    KIND: T.Literal["ParquetReader"] = "ParquetReader"

    path: str
    backend: T.Literal["pyarrow", "numpy_nullable"] = "pyarrow"

    @T.override
    def read(self) -> pd.DataFrame:
        # parquet is read in full: the row limit is applied afterwards
        frame = pd.read_parquet(self.path, dtype_backend=self.backend)
        return frame if self.limit is None else frame.head(self.limit)

    @T.override
    def lineage(
        self,
        name: str,
        data: pd.DataFrame,
        targets: str | None = None,
        predictions: str | None = None,
    ) -> Lineage:
        # record the file path as the dataset source for mlflow tracking
        return lineage.from_pandas(
            df=data,
            name=name,
            source=self.path,
            targets=targets,
            predictions=predictions,
        )
97 |
98 |
99 | ReaderKind = ParquetReader
100 |
101 | # %% WRITERS
102 |
103 |
class Writer(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"):
    """Base class for a dataset writer.

    Use a writer to save a dataset from memory.
    e.g., to write file, database, cloud storage, ...
    """

    # discriminator field: each concrete writer sets its own literal value
    KIND: str

    @abc.abstractmethod
    def write(self, data: pd.DataFrame) -> None:
        """Write a dataframe to a dataset.

        Args:
            data (pd.DataFrame): dataframe representation.
        """
120 |
121 |
class ParquetWriter(Writer):
    """Write a dataframe to a parquet file.

    Parameters:
        path (str): local or S3 path to the dataset.
    """

    KIND: T.Literal["ParquetWriter"] = "ParquetWriter"

    path: str

    @T.override
    def write(self, data: pd.DataFrame) -> None:
        # call the bound method instead of the unbound class function
        data.to_parquet(self.path)
136 |
137 |
138 | WriterKind = ParquetWriter
139 |
--------------------------------------------------------------------------------
/src/bikes/io/registries.py:
--------------------------------------------------------------------------------
1 | """Savers, loaders, and registers for model registries."""
2 |
3 | # %% IMPORTS
4 |
5 | import abc
6 | import typing as T
7 |
8 | import mlflow
9 | import pydantic as pdt
10 | from mlflow.pyfunc import PyFuncModel, PythonModel, PythonModelContext
11 |
12 | from bikes.core import models, schemas
13 | from bikes.utils import signers
14 |
15 | # %% TYPES
16 |
17 | # Results of model registry operations
18 | Info: T.TypeAlias = mlflow.models.model.ModelInfo
19 | Alias: T.TypeAlias = mlflow.entities.model_registry.ModelVersion
20 | Version: T.TypeAlias = mlflow.entities.model_registry.ModelVersion
21 |
22 | # %% HELPERS
23 |
24 |
def uri_for_model_alias(name: str, alias: str) -> str:
    """Build the registry URI pointing at a model alias.

    Args:
        name (str): name of the mlflow registered model.
        alias (str): alias of the registered model.

    Returns:
        str: model URI as "models:/name@alias".
    """
    return "models:/" + name + "@" + alias
36 |
37 |
def uri_for_model_version(name: str, version: int) -> str:
    """Build the registry URI pointing at a model version.

    Args:
        name (str): name of the mlflow registered model.
        version (int): version of the registered model.

    Returns:
        str: model URI as "models:/name/version".
    """
    return "models:/{}/{}".format(name, version)
49 |
50 |
def uri_for_model_alias_or_version(name: str, alias_or_version: str | int) -> str:
    """Create a model URI from a model name and an alias or version.

    Args:
        name (str): name of the mlflow registered model.
        alias_or_version (str | int): alias or version of the registered model.

    Returns:
        str: model URI as "models:/name@alias" or "models:/name/version" based on input.
    """
    # integers identify versions; any string is treated as an alias
    if isinstance(alias_or_version, int):
        return uri_for_model_version(name=name, version=alias_or_version)
    return uri_for_model_alias(name=name, alias=alias_or_version)
65 |
66 |
67 | # %% SAVERS
68 |
69 |
class Saver(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"):
    """Base class for saving models in registry.

    Separate model definition from serialization.
    e.g., to switch between serialization flavors.

    Parameters:
        path (str): model path inside the Mlflow store.
    """

    # discriminator field: each concrete saver sets its own literal value
    KIND: str

    # artifact path under the active mlflow run
    path: str = "model"

    @abc.abstractmethod
    def save(
        self,
        model: models.Model,
        signature: signers.Signature,
        input_example: schemas.Inputs,
    ) -> Info:
        """Save a model in the model registry.

        Args:
            model (models.Model): project model to save.
            signature (signers.Signature): model signature.
            input_example (schemas.Inputs): sample of inputs.

        Returns:
            Info: model saving information.
        """
101 |
102 |
class CustomSaver(Saver):
    """Saver for project models using the Mlflow PyFunc module.

    https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html
    """

    KIND: T.Literal["CustomSaver"] = "CustomSaver"

    class Adapter(PythonModel):  # type: ignore[misc]
        """Adapt a custom model to the Mlflow PyFunc flavor for saving operations.

        https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html?#mlflow.pyfunc.PythonModel
        """

        def __init__(self, model: models.Model):
            """Store the project model exposed through the PyFunc interface.

            Args:
                model (models.Model): project model.
            """
            self.model = model

        def predict(
            self,
            context: PythonModelContext,
            model_input: schemas.Inputs,
            params: dict[str, T.Any] | None = None,
        ) -> schemas.Outputs:
            """Generate predictions with the wrapped project model.

            Args:
                context (mlflow.PythonModelContext): mlflow context.
                model_input (schemas.Inputs): inputs for the mlflow model.
                params (dict[str, T.Any] | None): additional parameters.

            Returns:
                schemas.Outputs: validated outputs of the project model.
            """
            # delegate prediction (and validation) to the project model
            outputs = self.model.predict(inputs=model_input)
            return outputs

    @T.override
    def save(
        self,
        model: models.Model,
        signature: signers.Signature,
        input_example: schemas.Inputs,
    ) -> Info:
        # wrap the project model so mlflow can serialize it as a pyfunc
        wrapped = CustomSaver.Adapter(model=model)
        info = mlflow.pyfunc.log_model(
            python_model=wrapped,
            signature=signature,
            artifact_path=self.path,
            input_example=input_example,
        )
        return info
157 |
158 |
class BuiltinSaver(Saver):
    """Saver for built-in models using an Mlflow flavor module.

    https://mlflow.org/docs/latest/models.html#built-in-model-flavors

    Parameters:
        flavor (str): Mlflow flavor module to use for the serialization.
    """

    KIND: T.Literal["BuiltinSaver"] = "BuiltinSaver"

    flavor: str

    @T.override
    def save(
        self,
        model: models.Model,
        signature: signers.Signature,
        input_example: schemas.Inputs,
    ) -> Info:
        # resolve the mlflow flavor module dynamically (e.g., mlflow.sklearn)
        flavor_module = getattr(mlflow, self.flavor)
        internal_model = model.get_internal_model()
        return flavor_module.log_model(
            internal_model,
            artifact_path=self.path,
            signature=signature,
            input_example=input_example,
        )
187 |
188 |
189 | SaverKind = CustomSaver | BuiltinSaver
190 |
191 | # %% LOADERS
192 |
193 |
class Loader(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"):
    """Base class for loading models from registry.

    Separate model definition from deserialization.
    e.g., to switch between deserialization flavors.
    """

    # discriminator field: each concrete loader sets its own literal value
    KIND: str

    class Adapter(abc.ABC):
        """Adapt any model for the project inference."""

        @abc.abstractmethod
        def predict(self, inputs: schemas.Inputs) -> schemas.Outputs:
            """Generate predictions with the internal model for the given inputs.

            Args:
                inputs (schemas.Inputs): validated inputs for the project model.

            Returns:
                schemas.Outputs: validated outputs of the project model.
            """

    @abc.abstractmethod
    def load(self, uri: str) -> "Loader.Adapter":
        """Load a model from the model registry.

        Args:
            uri (str): URI of a model to load.

        Returns:
            Loader.Adapter: model loaded.
        """
227 |
228 |
class CustomLoader(Loader):
    """Loader for custom models using the Mlflow PyFunc module.

    https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html
    """

    KIND: T.Literal["CustomLoader"] = "CustomLoader"

    class Adapter(Loader.Adapter):
        """Adapt a custom model for the project inference."""

        def __init__(self, model: PyFuncModel) -> None:
            """Keep a reference to the loaded pyfunc model.

            Args:
                model (PyFuncModel): mlflow pyfunc model.
            """
            self.model = model

        @T.override
        def predict(self, inputs: schemas.Inputs) -> schemas.Outputs:
            # the custom pyfunc model validates its outputs internally
            return T.cast(schemas.Outputs, self.model.predict(data=inputs))

    @T.override
    def load(self, uri: str) -> "CustomLoader.Adapter":
        # fetch the pyfunc model from the registry and wrap it for inference
        pyfunc_model = mlflow.pyfunc.load_model(model_uri=uri)
        return CustomLoader.Adapter(model=pyfunc_model)
259 |
260 |
class BuiltinLoader(Loader):
    """Loader for built-in models using the Mlflow PyFunc module.

    Note: use Mlflow PyFunc instead of flavors to use standard API.

    https://mlflow.org/docs/latest/models.html#built-in-model-flavors
    """

    KIND: T.Literal["BuiltinLoader"] = "BuiltinLoader"

    class Adapter(Loader.Adapter):
        """Adapt a builtin model for the project inference."""

        def __init__(self, model: PyFuncModel) -> None:
            """Keep a reference to the loaded pyfunc model.

            Args:
                model (PyFuncModel): mlflow pyfunc model.
            """
            self.model = model

        @T.override
        def predict(self, inputs: schemas.Inputs) -> schemas.Outputs:
            # builtin flavors return raw predictions: rebuild a validated frame
            predictions = self.model.predict(data=inputs)  # unchecked data!
            schema_columns = list(schemas.OutputsSchema.to_schema().columns)
            return schemas.Outputs(predictions, columns=schema_columns, index=inputs.index)

    @T.override
    def load(self, uri: str) -> "BuiltinLoader.Adapter":
        # fetch the pyfunc model from the registry and wrap it for inference
        pyfunc_model = mlflow.pyfunc.load_model(model_uri=uri)
        return BuiltinLoader.Adapter(model=pyfunc_model)
293 |
294 |
295 | LoaderKind = CustomLoader | BuiltinLoader
296 |
297 | # %% REGISTERS
298 |
299 |
class Register(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"):
    """Base class for registring models to a location.

    Separate model definition from its registration.
    e.g., to change the model registry backend.

    Parameters:
        tags (dict[str, T.Any]): tags for the model.
    """

    # discriminator field: each concrete register sets its own literal value
    KIND: str

    # NOTE(review): mutable default is safe here because pydantic copies
    # field defaults per instance (unlike plain class attributes).
    tags: dict[str, T.Any] = {}

    @abc.abstractmethod
    def register(self, name: str, model_uri: str) -> Version:
        """Register a model given its name and URI.

        Args:
            name (str): name of the model to register.
            model_uri (str): URI of a model to register.

        Returns:
            Version: information about the registered model.
        """
325 |
326 |
class MlflowRegister(Register):
    """Register for models in the Mlflow Model Registry.

    https://mlflow.org/docs/latest/model-registry.html
    """

    KIND: T.Literal["MlflowRegister"] = "MlflowRegister"

    @T.override
    def register(self, name: str, model_uri: str) -> Version:
        # attach the configured tags at registration time
        version = mlflow.register_model(name=name, model_uri=model_uri, tags=self.tags)
        return version
338 |
339 |
340 | RegisterKind = MlflowRegister
341 |
--------------------------------------------------------------------------------
/src/bikes/io/services.py:
--------------------------------------------------------------------------------
1 | """Manage global context during execution."""
2 |
3 | # %% IMPORTS
4 |
5 | from __future__ import annotations
6 |
7 | import abc
8 | import contextlib as ctx
9 | import sys
10 | import typing as T
11 |
12 | import loguru
13 | import mlflow
14 | import mlflow.tracking as mt
15 | import pydantic as pdt
16 | from plyer import notification
17 |
18 | # %% SERVICES
19 |
20 |
class Service(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"):
    """Base class for a global service.

    Use services to manage global contexts.
    e.g., logger object, mlflow client, spark context, ...
    """

    @abc.abstractmethod
    def start(self) -> None:
        """Start the service."""

    def stop(self) -> None:
        """Stop the service (no-op by default; override when teardown is needed)."""
        # does nothing by default
35 |
36 |
class LoggerService(Service):
    """Service for logging messages.

    https://loguru.readthedocs.io/en/stable/api/logger.html

    Parameters:
        sink (str): logging output.
        level (str): logging level.
        format (str): logging format.
        colorize (bool): colorize output.
        serialize (bool): convert to JSON.
        backtrace (bool): enable exception trace.
        diagnose (bool): enable variable display.
        catch (bool): catch errors during log handling.
    """

    sink: str = "stderr"
    level: str = "DEBUG"
    format: str = (
        "[{time:YYYY-MM-DD HH:mm:ss.SSS}]"
        "[{level}]"
        "[{name}:{function}:{line}]"
        " {message}"
    )
    colorize: bool = True
    serialize: bool = False
    backtrace: bool = True
    diagnose: bool = False
    catch: bool = True

    @T.override
    def start(self) -> None:
        # drop default handlers before installing the configured sink
        loguru.logger.remove()
        settings = self.model_dump()
        # map standard sink names to their streams; keep others as-is (e.g., file path)
        standard_sinks = {"stderr": sys.stderr, "stdout": sys.stdout}
        settings["sink"] = standard_sinks.get(settings["sink"], settings["sink"])
        loguru.logger.add(**settings)

    def logger(self) -> loguru.Logger:
        """Return the main logger.

        Returns:
            loguru.Logger: the main logger.
        """
        return loguru.logger
83 |
84 |
class AlertsService(Service):
    """Service for sending notifications.

    Require libnotify-bin on Linux systems.

    In production, use with Slack, Discord, or emails.

    https://plyer.readthedocs.io/en/latest/api.html#plyer.facades.Notification

    Parameters:
        enable (bool): use notifications or print.
        app_name (str): name of the application.
        timeout (int | None): timeout in secs.
    """

    enable: bool = True
    app_name: str = "Bikes"
    timeout: int | None = None

    @T.override
    def start(self) -> None:
        # nothing to initialize for this service
        pass

    def notify(self, title: str, message: str) -> None:
        """Send a notification to the system.

        Args:
            title (str): title of the notification.
            message (str): message of the notification.
        """
        # fall back to printing when notifications are disabled
        if not self.enable:
            self._print(title=title, message=message)
            return
        try:
            notification.notify(
                title=title,
                message=message,
                app_name=self.app_name,
                timeout=self.timeout,
            )
        except NotImplementedError:
            # some systems have no notification backend: degrade to printing
            print("Notifications are not supported on this system.")
            self._print(title=title, message=message)

    def _print(self, title: str, message: str) -> None:
        """Print a notification to the system.

        Args:
            title (str): title of the notification.
            message (str): message of the notification.
        """
        print(f"[{self.app_name}] {title}: {message}")
137 |
138 |
class MlflowService(Service):
    """Service for Mlflow tracking and registry.

    Parameters:
        tracking_uri (str): the URI for the Mlflow tracking server.
        registry_uri (str): the URI for the Mlflow model registry.
        experiment_name (str): the name of tracking experiment.
        registry_name (str): the name of model registry.
        autolog_disable (bool): disable autologging.
        autolog_disable_for_unsupported_versions (bool): disable autologging for unsupported versions.
        autolog_exclusive (bool): If True, enables exclusive autologging.
        autolog_log_input_examples (bool): If True, logs input examples during autologging.
        autolog_log_model_signatures (bool): If True, logs model signatures during autologging.
        autolog_log_models (bool): If True, enables logging of models during autologging.
        autolog_log_datasets (bool): If True, logs datasets used during autologging.
        autolog_silent (bool): If True, suppresses all Mlflow warnings during autologging.
    """

    class RunConfig(pdt.BaseModel, strict=True, frozen=True, extra="forbid"):
        """Run configuration for Mlflow tracking.

        Parameters:
            name (str): name of the run.
            description (str | None): description of the run.
            tags (dict[str, T.Any] | None): tags for the run.
            log_system_metrics (bool | None): enable system metrics logging.
        """

        name: str
        description: str | None = None
        tags: dict[str, T.Any] | None = None
        log_system_metrics: bool | None = True

    # server uri
    tracking_uri: str = "./mlruns"
    registry_uri: str = "./mlruns"
    # experiment
    experiment_name: str = "bikes"
    # registry
    registry_name: str = "bikes"
    # autolog
    autolog_disable: bool = False
    autolog_disable_for_unsupported_versions: bool = False
    autolog_exclusive: bool = False
    autolog_log_input_examples: bool = True
    autolog_log_model_signatures: bool = True
    autolog_log_models: bool = False
    autolog_log_datasets: bool = False
    autolog_silent: bool = False

    @T.override
    def start(self) -> None:
        # server uri
        mlflow.set_tracking_uri(uri=self.tracking_uri)
        mlflow.set_registry_uri(uri=self.registry_uri)
        # experiment
        mlflow.set_experiment(experiment_name=self.experiment_name)
        # autolog
        mlflow.autolog(
            disable=self.autolog_disable,
            disable_for_unsupported_versions=self.autolog_disable_for_unsupported_versions,
            exclusive=self.autolog_exclusive,
            log_input_examples=self.autolog_log_input_examples,
            log_model_signatures=self.autolog_log_model_signatures,
            # fix: this flag was declared and documented but never forwarded,
            # so mlflow always used its own default for model autologging
            log_models=self.autolog_log_models,
            log_datasets=self.autolog_log_datasets,
            silent=self.autolog_silent,
        )

    @ctx.contextmanager
    def run_context(self, run_config: RunConfig) -> T.Generator[mlflow.ActiveRun, None, None]:
        """Yield an active Mlflow run and exit it afterwards.

        Args:
            run_config (RunConfig): run parameters.

        Yields:
            T.Generator[mlflow.ActiveRun, None, None]: active run context. Will be closed at the end of context.
        """
        with mlflow.start_run(
            run_name=run_config.name,
            tags=run_config.tags,
            description=run_config.description,
            log_system_metrics=run_config.log_system_metrics,
        ) as run:
            yield run

    def client(self) -> mt.MlflowClient:
        """Return a new Mlflow client.

        Returns:
            mt.MlflowClient: the mlflow client.
        """
        return mt.MlflowClient(tracking_uri=self.tracking_uri, registry_uri=self.registry_uri)
232 |
--------------------------------------------------------------------------------
/src/bikes/jobs/__init__.py:
--------------------------------------------------------------------------------
1 | """High-level jobs of the project."""
2 |
3 | # %% IMPORTS
4 |
5 | from bikes.jobs.evaluations import EvaluationsJob
6 | from bikes.jobs.explanations import ExplanationsJob
7 | from bikes.jobs.inference import InferenceJob
8 | from bikes.jobs.promotion import PromotionJob
9 | from bikes.jobs.training import TrainingJob
10 | from bikes.jobs.tuning import TuningJob
11 |
# %% TYPES

# Discriminated union of all job kinds (used for config-driven job selection).
JobKind = TuningJob | TrainingJob | PromotionJob | InferenceJob | EvaluationsJob | ExplanationsJob

# %% EXPORTS

# Public API of the jobs package.
__all__ = [
    "TuningJob",
    "TrainingJob",
    "PromotionJob",
    "InferenceJob",
    "EvaluationsJob",
    "ExplanationsJob",
    "JobKind",
]
27 |
--------------------------------------------------------------------------------
/src/bikes/jobs/base.py:
--------------------------------------------------------------------------------
1 | """Base for high-level project jobs."""
2 |
3 | # %% IMPORTS
4 |
5 | import abc
6 | import types as TS
7 | import typing as T
8 |
9 | import pydantic as pdt
10 |
11 | from bikes.io import services
12 |
# %% TYPES

# Local job variables returned by Job.run (name -> value mapping).
# Use the builtin generic: typing.Dict is deprecated since Python 3.9.
Locals = dict[str, T.Any]
17 |
18 | # %% JOBS
19 |
20 |
class Job(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"):
    """Base class for a job.

    Use a job to execute runs in context.
    e.g., to define common services like logger

    Parameters:
        logger_service (services.LoggerService): manage the logger system.
        alerts_service (services.AlertsService): manage the alerts system.
        mlflow_service (services.MlflowService): manage the mlflow system.
    """

    # discriminator field: each concrete job sets its own literal value
    KIND: str

    logger_service: services.LoggerService = services.LoggerService()
    alerts_service: services.AlertsService = services.AlertsService()
    mlflow_service: services.MlflowService = services.MlflowService()

    def __enter__(self) -> T.Self:
        """Enter the job context.

        Returns:
            T.Self: return the current object.
        """
        # the logger starts first so the other service starts can be logged
        self.logger_service.start()
        logger = self.logger_service.logger()
        logger.debug("[START] Logger service: {}", self.logger_service)
        logger.debug("[START] Alerts service: {}", self.alerts_service)
        self.alerts_service.start()
        logger.debug("[START] Mlflow service: {}", self.mlflow_service)
        self.mlflow_service.start()
        return self

    def __exit__(
        self,
        exc_type: T.Type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TS.TracebackType | None,
    ) -> T.Literal[False]:
        """Exit the job context.

        Args:
            exc_type (T.Type[BaseException] | None): ignored.
            exc_value (BaseException | None): ignored.
            exc_traceback (TS.TracebackType | None): ignored.

        Returns:
            T.Literal[False]: always propagate exceptions.
        """
        # services stop in reverse start order; the logger stops last
        logger = self.logger_service.logger()
        logger.debug("[STOP] Mlflow service: {}", self.mlflow_service)
        self.mlflow_service.stop()
        logger.debug("[STOP] Alerts service: {}", self.alerts_service)
        self.alerts_service.stop()
        logger.debug("[STOP] Logger service: {}", self.logger_service)
        self.logger_service.stop()
        return False  # re-raise

    @abc.abstractmethod
    def run(self) -> Locals:
        """Run the job in context.

        Returns:
            Locals: local job variables.
        """
86 |
--------------------------------------------------------------------------------
/src/bikes/jobs/evaluations.py:
--------------------------------------------------------------------------------
1 | """Define a job for evaluating registered models with data."""
2 |
3 | # %% IMPORTS
4 |
5 | import typing as T
6 |
7 | import mlflow
8 | import pandas as pd
9 | import pydantic as pdt
10 |
11 | from bikes.core import metrics as metrics_
12 | from bikes.core import schemas
13 | from bikes.io import datasets, registries, services
14 | from bikes.jobs import base
15 |
16 | # %% JOBS
17 |
18 |
class EvaluationsJob(base.Job):
    """Generate evaluations from a registered model and a dataset.

    Parameters:
        run_config (services.MlflowService.RunConfig): mlflow run config.
        inputs (datasets.ReaderKind): reader for the inputs data.
        targets (datasets.ReaderKind): reader for the targets data.
        model_type (str): model type (e.g. "regressor", "classifier").
        alias_or_version (str | int): alias or version for the model.
        loader (registries.LoaderKind): registry loader for the model.
        metrics (metrics_.MetricsKind): metric list to compute.
        evaluators (list[str]): list of evaluators to use.
        thresholds (dict[str, metrics_.Threshold] | None): metric thresholds.
    """

    KIND: T.Literal["EvaluationsJob"] = "EvaluationsJob"

    # Run
    run_config: services.MlflowService.RunConfig = services.MlflowService.RunConfig(
        name="Evaluations"
    )
    # Data
    inputs: datasets.ReaderKind = pdt.Field(..., discriminator="KIND")
    targets: datasets.ReaderKind = pdt.Field(..., discriminator="KIND")
    # Model
    model_type: str = "regressor"
    alias_or_version: str | int = "Champion"
    # Loader
    loader: registries.LoaderKind = pdt.Field(registries.CustomLoader(), discriminator="KIND")
    # Metrics
    metrics: metrics_.MetricsKind = [metrics_.SklearnMetric()]
    # Evaluators
    evaluators: list[str] = ["default"]
    # Thresholds
    thresholds: dict[str, metrics_.Threshold] = {
        "r2_score": metrics_.Threshold(threshold=0.5, greater_is_better=True)
    }

    @T.override
    def run(self) -> base.Locals:
        # services
        # - logger
        logger = self.logger_service.logger()
        logger.info("With logger: {}", logger)
        # - mlflow
        client = self.mlflow_service.client()
        logger.info("With client: {}", client.tracking_uri)
        with self.mlflow_service.run_context(run_config=self.run_config) as run:
            logger.info("With run context: {}", run.info)
            # data
            # - inputs
            logger.info("Read inputs: {}", self.inputs)
            inputs_ = self.inputs.read()  # unchecked!
            inputs = schemas.InputsSchema.check(inputs_)
            logger.debug("- Inputs shape: {}", inputs.shape)
            # - targets
            logger.info("Read targets: {}", self.targets)
            targets_ = self.targets.read()  # unchecked!
            targets = schemas.TargetsSchema.check(targets_)
            logger.debug("- Targets shape: {}", targets.shape)
            # lineage
            # - inputs
            logger.info("Log lineage: inputs")
            inputs_lineage = self.inputs.lineage(data=inputs, name="inputs")
            mlflow.log_input(dataset=inputs_lineage, context=self.run_config.name)
            logger.debug("- Inputs lineage: {}", inputs_lineage.to_dict())
            # - targets
            logger.info("Log lineage: targets")
            targets_lineage = self.targets.lineage(
                data=targets, name="targets", targets=schemas.TargetsSchema.cnt
            )
            mlflow.log_input(dataset=targets_lineage, context=self.run_config.name)
            logger.debug("- Targets lineage: {}", targets_lineage.to_dict())
            # model
            logger.info("With model: {}", self.mlflow_service.registry_name)
            model_uri = registries.uri_for_model_alias_or_version(
                name=self.mlflow_service.registry_name,
                alias_or_version=self.alias_or_version,
            )
            logger.debug("- Model URI: {}", model_uri)
            # loader
            logger.info("Load model: {}", self.loader)
            model = self.loader.load(uri=model_uri)
            logger.debug("- Model: {}", model)
            # outputs
            logger.info("Predict outputs: {}", len(inputs))
            outputs = model.predict(inputs=inputs)  # checked
            logger.debug("- Outputs shape: {}", outputs.shape)
            # dataset
            # combine inputs, targets, and predictions into a single frame
            # so mlflow.evaluate can score it as a static dataset
            logger.info("Create dataset: inputs & targets & outputs")
            dataset_ = pd.concat([inputs, targets, outputs], axis="columns")
            dataset = mlflow.data.from_pandas(  # type: ignore[attr-defined]
                df=dataset_,
                name="evaluation",
                targets=schemas.TargetsSchema.cnt,
                predictions=schemas.OutputsSchema.prediction,
            )
            logger.debug("- Dataset: {}", dataset.to_dict())
            # metrics
            logger.debug("Convert metrics: {}", self.metrics)
            extra_metrics = [metric.to_mlflow() for metric in self.metrics]
            logger.debug("- Extra metrics: {}", extra_metrics)
            # thresholds
            logger.info("Convert thresholds: {}", self.thresholds)
            validation_thresholds = {
                name: threshold.to_mlflow() for name, threshold in self.thresholds.items()
            }
            logger.debug("- Validation thresholds: {}", validation_thresholds)
            # evaluations
            logger.info("Compute evaluations: {}", self.model_type)
            evaluations = mlflow.evaluate(
                data=dataset,
                model_type=self.model_type,
                evaluators=self.evaluators,
                extra_metrics=extra_metrics,
                validation_thresholds=validation_thresholds,
            )
            logger.debug("- Evaluations metrics: {}", evaluations.metrics)
        # notify
        self.alerts_service.notify(
            title="Evaluations Job Finished",
            message=f"Evaluation metrics: {evaluations.metrics}",
        )
        return locals()
142 |
--------------------------------------------------------------------------------
/src/bikes/jobs/explanations.py:
--------------------------------------------------------------------------------
1 | """Define a job for explaining the model structure and decisions."""
2 |
3 | # %% IMPORTS
4 |
5 | import typing as T
6 |
7 | import pydantic as pdt
8 |
9 | from bikes.core import schemas
10 | from bikes.io import datasets, registries
11 | from bikes.jobs import base
12 |
13 | # %% JOBS
14 |
15 |
class ExplanationsJob(base.Job):
    """Generate explanations from the model and a data sample.

    Parameters:
        inputs_samples (datasets.ReaderKind): reader for the samples data.
        models_explanations (datasets.WriterKind): writer for models explanation.
        samples_explanations (datasets.WriterKind): writer for samples explanation.
        alias_or_version (str | int): alias or version for the model.
        loader (registries.LoaderKind): registry loader for the model.
    """

    KIND: T.Literal["ExplanationsJob"] = "ExplanationsJob"

    # Samples
    inputs_samples: datasets.ReaderKind = pdt.Field(..., discriminator="KIND")
    # Explanations
    models_explanations: datasets.WriterKind = pdt.Field(..., discriminator="KIND")
    samples_explanations: datasets.WriterKind = pdt.Field(..., discriminator="KIND")
    # Model
    alias_or_version: str | int = "Champion"
    # Loader
    loader: registries.LoaderKind = pdt.Field(registries.CustomLoader(), discriminator="KIND")

    @T.override
    def run(self) -> base.Locals:
        """Load the model, compute model/samples explanations, and write them out.

        Returns:
            base.Locals: the method's local variables (via `locals()`).
        """
        # services
        logger = self.logger_service.logger()
        logger.info("With logger: {}", logger)
        # inputs
        logger.info("Read samples: {}", self.inputs_samples)
        inputs_samples = self.inputs_samples.read()  # unchecked!
        inputs_samples = schemas.InputsSchema.check(inputs_samples)  # validate schema
        logger.debug("- Inputs samples shape: {}", inputs_samples.shape)
        # model
        logger.info("With model: {}", self.mlflow_service.registry_name)
        model_uri = registries.uri_for_model_alias_or_version(
            name=self.mlflow_service.registry_name,
            alias_or_version=self.alias_or_version,
        )
        logger.debug("- Model URI: {}", model_uri)
        # loader
        logger.info("Load model: {}", self.loader)
        # unwrap the pyfunc wrapper to reach the internal project model
        model = self.loader.load(uri=model_uri).model.unwrap_python_model().model
        logger.debug("- Model: {}", model)
        # explanations
        # - models
        logger.info("Explain model: {}", model)
        models_explanations = model.explain_model()
        logger.debug("- Models explanations shape: {}", models_explanations.shape)
        # - samples
        logger.info("Explain samples: {}", len(inputs_samples))
        samples_explanations = model.explain_samples(inputs=inputs_samples)
        logger.debug("- Samples explanations shape: {}", samples_explanations.shape)
        # write
        # - model
        logger.info("Write models explanations: {}", self.models_explanations)
        self.models_explanations.write(data=models_explanations)
        # - samples
        logger.info("Write samples explanations: {}", self.samples_explanations)
        self.samples_explanations.write(data=samples_explanations)
        # notify
        self.alerts_service.notify(
            title="Explanations Job Finished",
            message=f"Features Count: {len(models_explanations)}",
        )
        return locals()
82 |
--------------------------------------------------------------------------------
/src/bikes/jobs/inference.py:
--------------------------------------------------------------------------------
1 | """Define a job for generating batch predictions from a registered model."""
2 |
3 | # %% IMPORTS
4 |
5 | import typing as T
6 |
7 | import pydantic as pdt
8 |
9 | from bikes.core import schemas
10 | from bikes.io import datasets, registries
11 | from bikes.jobs import base
12 |
13 | # %% JOBS
14 |
15 |
class InferenceJob(base.Job):
    """Generate batch predictions from a registered model.

    Parameters:
        inputs (datasets.ReaderKind): reader for the inputs data.
        outputs (datasets.WriterKind): writer for the outputs data.
        alias_or_version (str | int): alias or version for the model.
        loader (registries.LoaderKind): registry loader for the model.
    """

    KIND: T.Literal["InferenceJob"] = "InferenceJob"

    # Inputs
    inputs: datasets.ReaderKind = pdt.Field(..., discriminator="KIND")
    # Outputs
    outputs: datasets.WriterKind = pdt.Field(..., discriminator="KIND")
    # Model
    alias_or_version: str | int = "Champion"
    # Loader
    loader: registries.LoaderKind = pdt.Field(registries.CustomLoader(), discriminator="KIND")

    @T.override
    def run(self) -> base.Locals:
        """Read inputs, load the registered model, predict, and write outputs.

        Returns:
            base.Locals: the method's local variables (via `locals()`).
        """
        # services
        logger = self.logger_service.logger()
        logger.info("With logger: {}", logger)
        # inputs
        logger.info("Read inputs: {}", self.inputs)
        inputs_ = self.inputs.read()  # unchecked!
        inputs = schemas.InputsSchema.check(inputs_)  # validate schema
        logger.debug("- Inputs shape: {}", inputs.shape)
        # model
        logger.info("With model: {}", self.mlflow_service.registry_name)
        model_uri = registries.uri_for_model_alias_or_version(
            name=self.mlflow_service.registry_name,
            alias_or_version=self.alias_or_version,
        )
        logger.debug("- Model URI: {}", model_uri)
        # loader
        logger.info("Load model: {}", self.loader)
        model = self.loader.load(uri=model_uri)
        logger.debug("- Model: {}", model)
        # outputs
        logger.info("Predict outputs: {}", len(inputs))
        outputs = model.predict(inputs=inputs)  # checked
        logger.debug("- Outputs shape: {}", outputs.shape)
        # write
        logger.info("Write outputs: {}", self.outputs)
        self.outputs.write(data=outputs)
        # notify
        self.alerts_service.notify(
            title="Inference Job Finished", message=f"Outputs Shape: {outputs.shape}"
        )
        return locals()
70 |
--------------------------------------------------------------------------------
/src/bikes/jobs/promotion.py:
--------------------------------------------------------------------------------
1 | """Define a job for promoting a registered model version with an alias."""
2 |
3 | # %% IMPORTS
4 |
5 | import typing as T
6 |
7 | from bikes.jobs import base
8 |
9 | # %% JOBS
10 |
11 |
class PromotionJob(base.Job):
    """Define a job for promoting a registered model version with an alias.

    https://mlflow.org/docs/latest/model-registry.html#concepts

    Parameters:
        alias (str): the mlflow alias to transition the registered model version.
        version (int | None): the model version to transition (use None for latest).
    """

    KIND: T.Literal["PromotionJob"] = "PromotionJob"

    alias: str = "Champion"
    version: int | None = None

    @T.override
    def run(self) -> base.Locals:
        """Assign the alias to the target model version in the mlflow registry.

        Returns:
            base.Locals: the method's local variables (via `locals()`).
        """
        # services
        # - logger
        logger = self.logger_service.logger()
        logger.info("With logger: {}", logger)
        # - mlflow
        client = self.mlflow_service.client()
        logger.info("With client: {}", client)
        name = self.mlflow_service.registry_name
        # version
        if self.version is None:  # use the latest model version
            # latest = single result ordered by version number, descending
            version = client.search_model_versions(
                f"name='{name}'", max_results=1, order_by=["version_number DESC"]
            )[0].version
        else:
            version = self.version
        logger.info("From version: {}", version)
        # alias
        logger.info("To alias: {}", self.alias)
        # promote
        logger.info("Promote model: {}", name)
        client.set_registered_model_alias(name=name, alias=self.alias, version=version)
        # read back through the alias to confirm the assignment
        model_version = client.get_model_version_by_alias(name=name, alias=self.alias)
        logger.debug("- Model version: {}", model_version)
        # notify
        self.alerts_service.notify(
            title="Promotion Job Finished",
            message=f"Version: {model_version.version} @ {self.alias}",
        )
        return locals()
58 |
--------------------------------------------------------------------------------
/src/bikes/jobs/training.py:
--------------------------------------------------------------------------------
1 | """Define a job for training and registring a single AI/ML model."""
2 |
3 | # %% IMPORTS
4 |
5 | import typing as T
6 |
7 | import mlflow
8 | import pydantic as pdt
9 |
10 | from bikes.core import metrics as metrics_
11 | from bikes.core import models, schemas
12 | from bikes.io import datasets, registries, services
13 | from bikes.jobs import base
14 | from bikes.utils import signers, splitters
15 |
16 | # %% JOBS
17 |
18 |
class TrainingJob(base.Job):
    """Train and register a single AI/ML model.

    Parameters:
        run_config (services.MlflowService.RunConfig): mlflow run config.
        inputs (datasets.ReaderKind): reader for the inputs data.
        targets (datasets.ReaderKind): reader for the targets data.
        model (models.ModelKind): machine learning model to train.
        metrics (metrics_.MetricsKind): metric list to compute.
        splitter (splitters.SplitterKind): data sets splitter.
        saver (registries.SaverKind): model saver.
        signer (signers.SignerKind): model signer.
        registry (registries.RegisterKind): model register.
    """

    KIND: T.Literal["TrainingJob"] = "TrainingJob"

    # Run
    run_config: services.MlflowService.RunConfig = services.MlflowService.RunConfig(name="Training")
    # Data
    inputs: datasets.ReaderKind = pdt.Field(..., discriminator="KIND")
    targets: datasets.ReaderKind = pdt.Field(..., discriminator="KIND")
    # Model
    model: models.ModelKind = pdt.Field(models.BaselineSklearnModel(), discriminator="KIND")
    # Metrics
    metrics: metrics_.MetricsKind = [metrics_.SklearnMetric()]
    # Splitter
    splitter: splitters.SplitterKind = pdt.Field(
        splitters.TrainTestSplitter(), discriminator="KIND"
    )
    # Saver
    saver: registries.SaverKind = pdt.Field(registries.CustomSaver(), discriminator="KIND")
    # Signer
    signer: signers.SignerKind = pdt.Field(signers.InferSigner(), discriminator="KIND")
    # Register
    # - field is named `registry` to avoid shadowing pydantic's `register` function
    registry: registries.RegisterKind = pdt.Field(registries.MlflowRegister(), discriminator="KIND")

    @T.override
    def run(self) -> base.Locals:
        """Read data, split, fit, evaluate, sign, save, and register the model.

        Returns:
            base.Locals: the method's local variables (via `locals()`).
        """
        # services
        # - logger
        logger = self.logger_service.logger()
        logger.info("With logger: {}", logger)
        # - mlflow
        client = self.mlflow_service.client()
        logger.info("With client: {}", client.tracking_uri)
        with self.mlflow_service.run_context(run_config=self.run_config) as run:
            logger.info("With run context: {}", run.info)
            # data
            # - inputs
            logger.info("Read inputs: {}", self.inputs)
            inputs_ = self.inputs.read()  # unchecked!
            inputs = schemas.InputsSchema.check(inputs_)  # validate schema
            logger.debug("- Inputs shape: {}", inputs.shape)
            # - targets
            logger.info("Read targets: {}", self.targets)
            targets_ = self.targets.read()  # unchecked!
            targets = schemas.TargetsSchema.check(targets_)  # validate schema
            logger.debug("- Targets shape: {}", targets.shape)
            # lineage
            # - inputs
            logger.info("Log lineage: inputs")
            inputs_lineage = self.inputs.lineage(data=inputs, name="inputs")
            mlflow.log_input(dataset=inputs_lineage, context=self.run_config.name)
            logger.debug("- Inputs lineage: {}", inputs_lineage.to_dict())
            # - targets
            logger.info("Log lineage: targets")
            targets_lineage = self.targets.lineage(
                data=targets, name="targets", targets=schemas.TargetsSchema.cnt
            )
            mlflow.log_input(dataset=targets_lineage, context=self.run_config.name)
            logger.debug("- Targets lineage: {}", targets_lineage.to_dict())
            # splitter
            logger.info("With splitter: {}", self.splitter)
            # - index
            # only the first split is consumed for the train/test sets
            train_index, test_index = next(self.splitter.split(inputs=inputs, targets=targets))
            # - inputs
            inputs_train = T.cast(schemas.Inputs, inputs.iloc[train_index])
            inputs_test = T.cast(schemas.Inputs, inputs.iloc[test_index])
            logger.debug("- Inputs train shape: {}", inputs_train.shape)
            logger.debug("- Inputs test shape: {}", inputs_test.shape)
            # - targets
            targets_train = T.cast(schemas.Targets, targets.iloc[train_index])
            targets_test = T.cast(schemas.Targets, targets.iloc[test_index])
            logger.debug("- Targets train shape: {}", targets_train.shape)
            logger.debug("- Targets test shape: {}", targets_test.shape)
            # model
            logger.info("Fit model: {}", self.model)
            self.model.fit(inputs=inputs_train, targets=targets_train)
            # outputs
            logger.info("Predict outputs: {}", len(inputs_test))
            outputs_test = self.model.predict(inputs=inputs_test)
            logger.debug("- Outputs test shape: {}", outputs_test.shape)
            # metrics
            # - each score is logged to the active mlflow run
            for i, metric in enumerate(self.metrics, start=1):
                logger.info("{}. Compute metric: {}", i, metric)
                score = metric.score(targets=targets_test, outputs=outputs_test)
                client.log_metric(run_id=run.info.run_id, key=metric.name, value=score)
                logger.debug("- Metric score: {}", score)
            # signer
            # - signature inferred from the full inputs and the test outputs
            logger.info("Sign model: {}", self.signer)
            model_signature = self.signer.sign(inputs=inputs, outputs=outputs_test)
            logger.debug("- Model signature: {}", model_signature.to_dict())
            # saver
            logger.info("Save model: {}", self.saver)
            model_info = self.saver.save(
                model=self.model, signature=model_signature, input_example=inputs
            )
            logger.debug("- Model URI: {}", model_info.model_uri)
            # register
            logger.info("Register model: {}", self.registry)
            model_version = self.registry.register(
                name=self.mlflow_service.registry_name, model_uri=model_info.model_uri
            )
            logger.debug("- Model version: {}", model_version)
            # notify
            self.alerts_service.notify(
                title="Training Job Finished",
                message=f"Model version: {model_version.version}",
            )
        return locals()
141 |
--------------------------------------------------------------------------------
/src/bikes/jobs/tuning.py:
--------------------------------------------------------------------------------
1 | """Define a job for finding the best hyperparameters for a model."""
2 |
3 | # %% IMPORTS
4 |
5 | import typing as T
6 |
7 | import mlflow
8 | import pydantic as pdt
9 |
10 | from bikes.core import metrics, models, schemas
11 | from bikes.io import datasets, services
12 | from bikes.jobs import base
13 | from bikes.utils import searchers, splitters
14 |
15 | # %% JOBS
16 |
17 |
class TuningJob(base.Job):
    """Find the best hyperparameters for a model.

    Parameters:
        run_config (services.MlflowService.RunConfig): mlflow run config.
        inputs (datasets.ReaderKind): reader for the inputs data.
        targets (datasets.ReaderKind): reader for the targets data.
        model (models.ModelKind): machine learning model to tune.
        metric (metrics.MetricKind): tuning metric to optimize.
        splitter (splitters.SplitterKind): data sets splitter.
        searcher (searchers.SearcherKind): hparams searcher.
    """

    KIND: T.Literal["TuningJob"] = "TuningJob"

    # Run
    run_config: services.MlflowService.RunConfig = services.MlflowService.RunConfig(name="Tuning")
    # Data
    inputs: datasets.ReaderKind = pdt.Field(..., discriminator="KIND")
    targets: datasets.ReaderKind = pdt.Field(..., discriminator="KIND")
    # Model
    model: models.ModelKind = pdt.Field(models.BaselineSklearnModel(), discriminator="KIND")
    # Metric
    metric: metrics.MetricKind = pdt.Field(metrics.SklearnMetric(), discriminator="KIND")
    # Splitter
    splitter: splitters.SplitterKind = pdt.Field(
        splitters.TimeSeriesSplitter(), discriminator="KIND"
    )
    # Searcher
    searcher: searchers.SearcherKind = pdt.Field(
        searchers.GridCVSearcher(
            param_grid={
                "max_depth": [3, 5, 7],
            }
        ),
        discriminator="KIND",
    )

    @T.override
    def run(self) -> base.Locals:
        """Run the tuning job in context.

        Returns:
            base.Locals: the method's local variables (via `locals()`).
        """
        # services
        # - logger
        logger = self.logger_service.logger()
        logger.info("With logger: {}", logger)
        with self.mlflow_service.run_context(run_config=self.run_config) as run:
            logger.info("With run context: {}", run.info)
            # data
            # - inputs
            logger.info("Read inputs: {}", self.inputs)
            inputs_ = self.inputs.read()  # unchecked!
            inputs = schemas.InputsSchema.check(inputs_)  # validate schema
            logger.debug("- Inputs shape: {}", inputs.shape)
            # - targets
            logger.info("Read targets: {}", self.targets)
            targets_ = self.targets.read()  # unchecked!
            targets = schemas.TargetsSchema.check(targets_)  # validate schema
            logger.debug("- Targets shape: {}", targets.shape)
            # lineage
            # - inputs
            logger.info("Log lineage: inputs")
            inputs_lineage = self.inputs.lineage(data=inputs, name="inputs")
            mlflow.log_input(dataset=inputs_lineage, context=self.run_config.name)
            logger.debug("- Inputs lineage: {}", inputs_lineage.to_dict())
            # - targets
            logger.info("Log lineage: targets")
            targets_lineage = self.targets.lineage(
                data=targets, name="targets", targets=schemas.TargetsSchema.cnt
            )
            mlflow.log_input(dataset=targets_lineage, context=self.run_config.name)
            logger.debug("- Targets lineage: {}", targets_lineage.to_dict())
            # model
            logger.info("With model: {}", self.model)
            # metric
            logger.info("With metric: {}", self.metric)
            # splitter
            logger.info("With splitter: {}", self.splitter)
            # searcher
            # - the splitter doubles as the cross-validation strategy
            logger.info("Run searcher: {}", self.searcher)
            results, best_score, best_params = self.searcher.search(
                model=self.model,
                metric=self.metric,
                inputs=inputs,
                targets=targets,
                cv=self.splitter,
            )
            logger.debug("- Results: {}", results.shape)
            logger.debug("- Best Score: {}", best_score)
            logger.debug("- Best Params: {}", best_params)
            # notify
            self.alerts_service.notify(
                title="Tuning Job Finished", message=f"Best score: {best_score}"
            )
        return locals()
112 |
--------------------------------------------------------------------------------
/src/bikes/scripts.py:
--------------------------------------------------------------------------------
1 | """Scripts for the CLI application."""
2 |
3 | # ruff: noqa: E402
4 |
5 | # %% WARNINGS
6 |
7 | import warnings
8 |
9 | # disable annoying mlflow warnings
10 | warnings.filterwarnings(action="ignore", category=UserWarning)
11 |
12 | # %% IMPORTS
13 |
14 | import argparse
15 | import json
16 | import sys
17 |
18 | from bikes import settings
19 | from bikes.io import configs
20 |
21 | # %% PARSERS
22 |
# CLI parser: positional config files, plus optional config strings and a schema dump flag
parser = argparse.ArgumentParser(description="Run an AI/ML job from YAML/JSON configs.")
parser.add_argument("files", nargs="*", help="Config files for the job (local path only).")
parser.add_argument("-e", "--extras", nargs="*", default=[], help="Config strings for the job.")
parser.add_argument("-s", "--schema", action="store_true", help="Print settings schema and exit.")
27 |
28 | # %% SCRIPTS
29 |
30 |
def main(argv: list[str] | None = None) -> int:
    """Entry point: parse CLI args, build the job settings, and execute the job.

    Args:
        argv (list[str] | None): argument vector (defaults to sys.argv via argparse).

    Returns:
        int: exit code (0 on success).

    Raises:
        RuntimeError: if no config file or string is provided.
    """
    args = parser.parse_args(argv)
    if args.schema:
        # dump the settings schema to stdout and exit early
        schema = settings.MainSettings.model_json_schema()
        json.dump(schema, sys.stdout, indent=4)
        return 0
    # collect configs from files first, then from extra strings
    parsed = [configs.parse_file(path) for path in args.files]
    parsed += [configs.parse_string(extra) for extra in args.extras]
    if not parsed:
        raise RuntimeError("No configs provided.")
    merged = configs.merge_configs(parsed)
    object_ = configs.to_object(merged)  # python object
    setting = settings.MainSettings.model_validate(object_)
    with setting.job as runner:
        runner.run()
    return 0
48 |
--------------------------------------------------------------------------------
/src/bikes/settings.py:
--------------------------------------------------------------------------------
1 | """Define settings for the application."""
2 |
3 | # %% IMPORTS
4 |
5 | import pydantic as pdt
6 | import pydantic_settings as pdts
7 |
8 | from bikes import jobs
9 |
10 | # %% SETTINGS
11 |
12 |
class Settings(pdts.BaseSettings, strict=True, frozen=True, extra="forbid"):
    """Base class for application settings.

    Use settings to provide high-level preferences.
    i.e., to separate settings from provider (e.g., CLI).

    The model is strict (no type coercion), frozen (immutable),
    and forbids extra fields to catch configuration typos early.
    """
19 |
20 |
class MainSettings(Settings):
    """Main settings of the application.

    Parameters:
        job (jobs.JobKind): job to run.
    """

    # the KIND field selects which job subclass to instantiate
    job: jobs.JobKind = pdt.Field(..., discriminator="KIND")
29 |
--------------------------------------------------------------------------------
/src/bikes/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """Helper components of the project."""
2 |
--------------------------------------------------------------------------------
/src/bikes/utils/searchers.py:
--------------------------------------------------------------------------------
1 | """Find the best hyperparameters for a model."""
2 |
3 | # %% IMPORTS
4 |
5 | import abc
6 | import typing as T
7 |
8 | import pandas as pd
9 | import pydantic as pdt
10 | from sklearn import model_selection
11 |
12 | from bikes.core import metrics, models, schemas
13 | from bikes.utils import splitters
14 |
15 | # %% TYPES
16 |
17 | # Grid of model params
18 | Grid = dict[models.ParamKey, list[models.ParamValue]]
19 |
20 | # Results of a model search
21 | Results = tuple[
22 | T.Annotated[pd.DataFrame, "details"],
23 | T.Annotated[float, "best score"],
24 | T.Annotated[models.Params, "best params"],
25 | ]
26 |
27 | # Cross-validation options for searchers
28 | CrossValidation = int | splitters.TrainTestSplits | splitters.Splitter
29 |
30 | # %% SEARCHERS
31 |
32 |
class Searcher(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"):
    """Base class for a searcher.

    Use searcher to fine-tune models.
    i.e., to find the best model params.

    Parameters:
        param_grid (Grid): mapping of param key -> values.
    """

    # discriminator value for (de)serializing the concrete searcher
    KIND: str

    param_grid: Grid

    @abc.abstractmethod
    def search(
        self,
        model: models.Model,
        metric: metrics.Metric,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        cv: CrossValidation,
    ) -> Results:
        """Search the best model for the given inputs and targets.

        Args:
            model (models.Model): AI/ML model to fine-tune.
            metric (metrics.Metric): main metric to optimize.
            inputs (schemas.Inputs): model inputs for tuning.
            targets (schemas.Targets): model targets for tuning.
            cv (CrossValidation): choice for cross-fold validation.

        Returns:
            Results: all the results of the searcher execution process.
        """
68 |
69 |
class GridCVSearcher(Searcher):
    """Exhaustive grid search backed by cross-fold validation.

    Convention: metric returns higher values for better models.

    Parameters:
        n_jobs (int, optional): number of jobs to run in parallel.
        refit (bool): refit the model after the tuning.
        verbose (int): set the searcher verbosity level.
        error_score (str | float): strategy or value on error.
        return_train_score (bool): include train scores if True.
    """

    KIND: T.Literal["GridCVSearcher"] = "GridCVSearcher"

    n_jobs: int | None = None
    refit: bool = True
    verbose: int = 3
    error_score: str | float = "raise"
    return_train_score: bool = False

    @T.override
    def search(
        self,
        model: models.Model,
        metric: metrics.Metric,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        cv: CrossValidation,
    ) -> Results:
        """Delegate the exhaustive parameter exploration to scikit-learn."""
        grid_search = model_selection.GridSearchCV(
            estimator=model,
            scoring=metric.scorer,
            cv=cv,
            param_grid=self.param_grid,
            n_jobs=self.n_jobs,
            refit=self.refit,
            verbose=self.verbose,
            error_score=self.error_score,
            return_train_score=self.return_train_score,
        )
        grid_search.fit(inputs, targets)
        # expose the raw cv results as a dataframe for downstream analysis
        details = pd.DataFrame(grid_search.cv_results_)
        return details, grid_search.best_score_, grid_search.best_params_
114 |
115 |
116 | SearcherKind = GridCVSearcher
117 |
--------------------------------------------------------------------------------
/src/bikes/utils/signers.py:
--------------------------------------------------------------------------------
1 | """Generate signatures for AI/ML models."""
2 |
3 | # %% IMPORTS
4 |
5 | import abc
6 | import typing as T
7 |
8 | import mlflow
9 | import pydantic as pdt
10 | from mlflow.models import signature as ms
11 |
12 | from bikes.core import schemas
13 |
14 | # %% TYPES
15 |
16 | Signature: T.TypeAlias = ms.ModelSignature
17 |
18 | # %% SIGNERS
19 |
20 |
class Signer(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"):
    """Base class for generating model signatures.

    Allow switching between model signing strategies.
    e.g., automatic inference, manual model signature, ...

    https://mlflow.org/docs/latest/models.html#model-signature-and-input-example
    """

    # discriminator value for (de)serializing the concrete signer
    KIND: str

    @abc.abstractmethod
    def sign(self, inputs: schemas.Inputs, outputs: schemas.Outputs) -> Signature:
        """Generate a model signature from its inputs/outputs.

        Args:
            inputs (schemas.Inputs): inputs data.
            outputs (schemas.Outputs): outputs data.

        Returns:
            Signature: signature of the model.
        """
43 |
44 |
class InferSigner(Signer):
    """Infer the model signature automatically from inputs/outputs data."""

    KIND: T.Literal["InferSigner"] = "InferSigner"

    @T.override
    def sign(self, inputs: schemas.Inputs, outputs: schemas.Outputs) -> Signature:
        """Let mlflow derive the signature from concrete input/output examples."""
        signature = mlflow.models.infer_signature(model_input=inputs, model_output=outputs)
        return signature
53 |
54 |
55 | SignerKind = InferSigner
56 |
--------------------------------------------------------------------------------
/src/bikes/utils/splitters.py:
--------------------------------------------------------------------------------
1 | """Split dataframes into subsets (e.g., train/valid/test)."""
2 |
3 | # %% IMPORTS
4 |
5 | import abc
6 | import typing as T
7 |
8 | import numpy as np
9 | import numpy.typing as npt
10 | import pydantic as pdt
11 | from sklearn import model_selection
12 |
13 | from bikes.core import schemas
14 |
15 | # %% TYPES
16 |
17 | Index = npt.NDArray[np.int64]
18 | TrainTestIndex = tuple[Index, Index]
19 | TrainTestSplits = T.Iterator[TrainTestIndex]
20 |
21 | # %% SPLITTERS
22 |
23 |
class Splitter(abc.ABC, pdt.BaseModel, strict=True, frozen=True, extra="forbid"):
    """Base class for a splitter.

    Use splitters to split data in sets.
    e.g., split between a train/test subsets.

    # https://scikit-learn.org/stable/glossary.html#term-CV-splitter
    """

    # discriminator value for (de)serializing the concrete splitter
    KIND: str

    @abc.abstractmethod
    def split(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> TrainTestSplits:
        """Split a dataframe into subsets.

        Args:
            inputs (schemas.Inputs): model inputs.
            targets (schemas.Targets): model targets.
            groups (Index | None, optional): group labels.

        Returns:
            TrainTestSplits: iterator over the dataframe train/test splits.
        """

    @abc.abstractmethod
    def get_n_splits(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> int:
        """Get the number of splits generated.

        Args:
            inputs (schemas.Inputs): models inputs.
            targets (schemas.Targets): model targets.
            groups (Index | None, optional): group labels.

        Returns:
            int: number of splits generated.
        """
70 |
71 |
class TrainTestSplitter(Splitter):
    """Split a dataframe into a train and test set.

    Parameters:
        shuffle (bool): shuffle the dataset. Default is False.
        test_size (int | float): number/ratio for the test set.
        random_state (int): random state for the splitter object.
    """

    KIND: T.Literal["TrainTestSplitter"] = "TrainTestSplitter"

    shuffle: bool = False  # required (time sensitive)
    test_size: int | float = 24 * 30 * 2  # 2 months
    random_state: int = 42

    @T.override
    def split(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> TrainTestSplits:
        """Yield a single train/test split over integer positions."""
        # split positional indexes rather than dataframe labels
        positions = np.arange(len(inputs))
        parts = model_selection.train_test_split(
            positions,
            shuffle=self.shuffle,
            test_size=self.test_size,
            random_state=self.random_state,
        )
        yield parts[0], parts[1]

    @T.override
    def get_n_splits(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> int:
        """This splitter always produces exactly one split."""
        return 1
111 |
112 |
class TimeSeriesSplitter(Splitter):
    """Split a dataframe into fixed time series subsets.

    Parameters:
        gap (int): gap between splits.
        n_splits (int): number of split to generate.
        test_size (int | float): number or ratio for the test dataset.
    """

    KIND: T.Literal["TimeSeriesSplitter"] = "TimeSeriesSplitter"

    gap: int = 0
    n_splits: int = 4
    test_size: int | float = 24 * 30 * 2  # 2 months

    @T.override
    def split(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> TrainTestSplits:
        """Delegate the time-ordered splitting to scikit-learn."""
        cross_validator = model_selection.TimeSeriesSplit(
            n_splits=self.n_splits, test_size=self.test_size, gap=self.gap
        )
        for train_index, test_index in cross_validator.split(inputs):
            yield train_index, test_index

    @T.override
    def get_n_splits(
        self,
        inputs: schemas.Inputs,
        targets: schemas.Targets,
        groups: Index | None = None,
    ) -> int:
        """Return the configured number of splits."""
        return self.n_splits
148 |
149 |
150 | SplitterKind = TrainTestSplitter | TimeSeriesSplitter
151 |
--------------------------------------------------------------------------------
/tasks/check.just:
--------------------------------------------------------------------------------
# run all check tasks (lint, types, format, security, coverage)
[group('check')]
check: check-code check-type check-format check-security check-coverage

# check code quality (ruff lint rules)
[group('check')]
check-code:
    uv run ruff check {{SOURCES}} {{TESTS}}

# check code coverage (fails under the given threshold)
[group('check')]
check-coverage numprocesses="auto" cov_fail_under="80":
    uv run pytest --numprocesses={{numprocesses}} --cov={{SOURCES}} --cov-fail-under={{cov_fail_under}} {{TESTS}}

# check code format (report only, no files modified)
[group('check')]
check-format:
    uv run ruff format --check {{SOURCES}} {{TESTS}}

# check code security (bandit static analysis)
[group('check')]
check-security:
    uv run bandit --recursive --configfile=pyproject.toml {{SOURCES}}

# check unit tests (without coverage)
[group('check')]
check-test numprocesses="auto":
    uv run pytest --numprocesses={{numprocesses}} {{TESTS}}

# check code typing (mypy static checks)
[group('check')]
check-type:
    uv run mypy {{SOURCES}} {{TESTS}}
34 |
--------------------------------------------------------------------------------
/tasks/clean.just:
--------------------------------------------------------------------------------
# run all clean tasks (note: clean-venv is excluded, run it explicitly)
[group('clean')]
clean: clean-build clean-cache clean-constraints clean-coverage clean-docs clean-environment clean-mlruns clean-mypy clean-outputs clean-pytest clean-python clean-requirements clean-ruff

# clean build folders
[group('clean')]
clean-build:
    rm -rf dist/
    rm -rf build/

# clean cache folder
[group('clean')]
clean-cache:
    rm -rf .cache/

# clean constraints file
[group('clean')]
clean-constraints:
    rm -rf constraints.txt

# clean coverage files
[group('clean')]
clean-coverage:
    rm -rf .coverage*

# clean docs folder
[group('clean')]
clean-docs:
    rm -rf docs/

# clean environment file
[group('clean')]
clean-environment:
    rm -f python_env.yaml

# clean mlruns folder (keeps the folder, removes its content)
[group('clean')]
clean-mlruns:
    rm -rf mlruns/*

# clean mypy folders
[group('clean')]
clean-mypy:
    rm -rf .mypy_cache/

# clean outputs folder (keeps the folder, removes its content)
[group('clean')]
clean-outputs:
    rm -rf outputs/*

# clean pytest cache
[group('clean')]
clean-pytest:
    rm -rf .pytest_cache/

# clean python caches (bytecode files and __pycache__ folders)
[group('clean')]
clean-python:
    find . -type f -name '*.py[co]' -delete
    find . -type d -name __pycache__ -exec rm -r {} \+

# clean requirements file
[group('clean')]
clean-requirements:
    rm -f requirements.txt

# clean ruff cache
[group('clean')]
clean-ruff:
    rm -rf .ruff_cache/

# clean venv folder (asks for confirmation before deleting)
[confirm]
[group('clean')]
clean-venv:
    rm -rf .venv/
77 |
--------------------------------------------------------------------------------
/tasks/commit.just:
--------------------------------------------------------------------------------
1 | # bump package
2 | [group('commit')]
3 | commit-bump:
4 | uv run cz bump
5 |
# commit files
7 | [group('commit')]
8 | commit-files:
9 | uv run cz commit
10 |
11 | # get commit info
12 | [group('commit')]
13 | commit-info:
14 | uv run cz info
15 |
--------------------------------------------------------------------------------
/tasks/doc.just:
--------------------------------------------------------------------------------
1 | # run doc tasks
2 | [group('doc')]
3 | doc: doc-build
4 |
5 | # build documentation
6 | [group('doc')]
7 | doc-build format="google" output="docs": clean-docs
8 | uv run pdoc --docformat={{format}} --output-directory={{output}} {{SOURCES}}/{{PACKAGE}}
9 |
10 | # serve documentation
11 | [group('doc')]
12 | doc-serve format="google" port="8088":
13 | uv run pdoc --docformat={{format}} --port={{port}} {{SOURCES}}/{{PACKAGE}}
14 |
--------------------------------------------------------------------------------
/tasks/docker.just:
--------------------------------------------------------------------------------
1 | # run docker tasks
2 | [group('docker')]
3 | docker: docker-build docker-run
4 |
5 | # build docker image
6 | [group('docker')]
7 | docker-build tag="latest": package-build
8 | docker build --tag={{REPOSITORY}}:{{tag}} .
9 |
10 | # start docker compose
11 | [group('docker')]
12 | docker-compose:
13 | docker compose up
14 |
15 | # run latest docker image
16 | [group('docker')]
17 | docker-run tag="latest":
18 | docker run --rm {{REPOSITORY}}:{{tag}}
19 |
--------------------------------------------------------------------------------
/tasks/format.just:
--------------------------------------------------------------------------------
1 | # run format tasks
2 | [group('format')]
3 | format: format-import format-source
4 |
5 | # format code import
6 | [group('format')]
7 | format-import:
8 | uv run ruff check --select=I --fix {{SOURCES}} {{TESTS}}
9 |
10 | # format code source
11 | [group('format')]
12 | format-source:
13 | uv run ruff format {{SOURCES}} {{TESTS}}
14 |
--------------------------------------------------------------------------------
/tasks/install.just:
--------------------------------------------------------------------------------
1 | # run install tasks
2 | [group('install')]
3 | install: install-project install-hooks
4 |
5 | # install git hooks
6 | [group('install')]
7 | install-hooks:
8 | uv run pre-commit install --hook-type=pre-push
9 | uv run pre-commit install --hook-type=commit-msg
10 |
11 | # install the project
12 | [group('install')]
13 | install-project:
14 | uv sync --all-groups
15 |
16 | # install github rulesets
17 | [group('install')]
18 | install-rulesets:
19 | #!/usr/bin/env bash
20 | set -euo pipefail
21 | repo=$(gh repo view --json=name --jq=.name)
22 | owner=$(gh repo view --json=owner --jq=.owner.login)
23 | gh api --method POST -H "Accept: application/vnd.github+json" \
24 | "/repos/$owner/$repo/rulesets" --input=".github/rulesets/main.json"
25 |
--------------------------------------------------------------------------------
/tasks/mlflow.just:
--------------------------------------------------------------------------------
1 | # run mlflow tasks
2 | [group('mlflow')]
3 | mlflow: mlflow-doctor mlflow-serve
4 |
5 | # run mlflow doctor
6 | [group('mlflow')]
7 | mlflow-doctor:
8 | uv run mlflow doctor
9 |
10 | # start mlflow server
11 | [group('mlflow')]
12 | mlflow-serve host="127.0.0.1" port="5000" uri="./mlruns":
13 | uv run mlflow server --host={{host}} --port={{port}} --backend-store-uri={{uri}}
14 |
--------------------------------------------------------------------------------
/tasks/package.just:
--------------------------------------------------------------------------------
1 | # run package tasks
2 | [group('package')]
3 | package: package-build
4 |
5 | # build package constraints
6 | [group('package')]
7 | package-constraints constraints="constraints.txt":
8 | uv pip compile pyproject.toml --generate-hashes --output-file={{constraints}}
9 |
10 | # build python package
11 | [group('package')]
12 | package-build constraints="constraints.txt": clean-build package-constraints
13 | uv build --build-constraint={{constraints}} --require-hashes --wheel
14 |
--------------------------------------------------------------------------------
/tasks/project.just:
--------------------------------------------------------------------------------
1 | # run project tasks
2 | [group('project')]
3 | project: project-environment (project-run "tuning") (project-run "training") (project-run "promotion") (project-run "inference") (project-run "evaluations") (project-run "explanations")
4 |
5 | # export environment file
6 | [group('project')]
7 | project-environment: project-requirements
8 | #!/usr/bin/env python3
9 | import json
10 | with open(".python-version", "r") as reader:
11 | python = reader.read().strip() # version
12 | configuration = {"python": python}
13 | with open("requirements.txt", "r") as reader:
14 | dependencies = []
15 | for line in reader.readlines():
16 | dependency = line.split(" ")[0].strip()
17 | if "pywin32" in dependency or "#" in dependency:
18 | continue
19 | dependencies.append(dependency)
20 | configuration["dependencies"] = dependencies
21 | with open("python_env.yaml", "w") as writer:
22 | json.dump(configuration, writer, indent=4)
23 | writer.write("\n") # add new line at the end
24 |
25 | # export requirements file
26 | [group('project')]
27 | project-requirements:
28 | uv export --format=requirements-txt --no-dev --no-hashes \
29 | --no-editable --no-emit-project --output-file=requirements.txt
30 |
31 | # run project job using mlflow
32 | [group('project')]
33 | project-run job:
34 | uv run mlflow run --experiment-name={{REPOSITORY}} --run-name={{capitalize(job)}} -P conf_file=confs/{{job}}.yaml .
35 |
--------------------------------------------------------------------------------
/tests/confs/invalid/1. invalid.yaml:
--------------------------------------------------------------------------------
1 | job:
2 | KIND: UnknownJob
3 |
--------------------------------------------------------------------------------
/tests/confs/valid/0. tuning.yaml:
--------------------------------------------------------------------------------
1 | job:
2 | KIND: TuningJob
3 | inputs:
4 | KIND: ParquetReader
5 | path: "${tests_path:}/data/inputs_sample.parquet"
6 | limit: 1500
7 | targets:
8 | KIND: ParquetReader
9 | path: "${tests_path:}/data/targets_sample.parquet"
10 | limit: 1500
11 | splitter:
12 | KIND: TimeSeriesSplitter
13 | n_splits: 3
14 | test_size: 167 # 1 week
15 |
--------------------------------------------------------------------------------
/tests/confs/valid/1. training.yaml:
--------------------------------------------------------------------------------
1 | job:
2 | KIND: TrainingJob
3 | inputs:
4 | KIND: ParquetReader
5 | path: "${tests_path:}/data/inputs_sample.parquet"
6 | limit: 1500
7 | targets:
8 | KIND: ParquetReader
9 | path: "${tests_path:}/data/targets_sample.parquet"
10 | limit: 1500
11 |
--------------------------------------------------------------------------------
/tests/confs/valid/2. promotion.yaml:
--------------------------------------------------------------------------------
1 | job:
2 | KIND: PromotionJob
3 |
--------------------------------------------------------------------------------
/tests/confs/valid/3. inference.yaml:
--------------------------------------------------------------------------------
1 | job:
2 | KIND: InferenceJob
3 | inputs:
4 | KIND: ParquetReader
5 | path: "${tests_path:}/data/inputs_sample.parquet"
6 | limit: 1500
7 | outputs:
8 | KIND: ParquetWriter
9 | path: "${tmp_path:}/outputs_sample.parquet"
10 |
--------------------------------------------------------------------------------
/tests/confs/valid/5. evaluations.yaml:
--------------------------------------------------------------------------------
1 | job:
2 | KIND: EvaluationsJob
3 | inputs:
4 | KIND: ParquetReader
5 | path: "${tests_path:}/data/inputs_sample.parquet"
6 | limit: 1500
7 | targets:
8 | KIND: ParquetReader
9 | path: "${tests_path:}/data/targets_sample.parquet"
10 | limit: 1500
11 |
--------------------------------------------------------------------------------
/tests/confs/valid/6. explanations.yaml:
--------------------------------------------------------------------------------
1 | job:
2 | KIND: ExplanationsJob
3 | inputs_samples:
4 | KIND: ParquetReader
5 | path: "${tests_path:}/data/inputs_sample.parquet"
6 | limit: 100
7 | models_explanations:
8 | KIND: ParquetWriter
9 | path: "${tmp_path:}/models_explanations.parquet"
10 | samples_explanations:
11 | KIND: ParquetWriter
12 | path: "${tmp_path:}/samples_explanations.parquet"
13 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """Configuration for the tests."""
2 |
3 | # %% IMPORTS
4 |
5 | import os
6 | import typing as T
7 |
8 | import omegaconf
9 | import pytest
10 | from _pytest import logging as pl
11 |
12 | from bikes.core import metrics, models, schemas
13 | from bikes.io import datasets, registries, services
14 | from bikes.utils import searchers, signers, splitters
15 |
16 | # %% CONFIGS
17 |
18 | LIMIT = 1500
19 | N_SPLITS = 3
20 | TEST_SIZE = 24 * 7 # 1 week
21 |
22 | # %% FIXTURES
23 |
24 | # %% - Paths
25 |
26 |
@pytest.fixture(scope="session")
def tests_path() -> str:
    """Return the absolute path of the folder containing this test suite."""
    # resolve relative to this file so tests work from any working directory
    return os.path.dirname(os.path.abspath(__file__))
33 |
34 |
@pytest.fixture(scope="session")
def data_path(tests_path: str) -> str:
    """Return the path of the data folder under the tests folder."""
    folder = os.path.join(tests_path, "data")
    return folder
39 |
40 |
@pytest.fixture(scope="session")
def confs_path(tests_path: str) -> str:
    """Return the path of the confs folder under the tests folder."""
    folder = os.path.join(tests_path, "confs")
    return folder
45 |
46 |
@pytest.fixture(scope="session")
def inputs_path(data_path: str) -> str:
    """Return the file path of the inputs sample dataset."""
    dataset = os.path.join(data_path, "inputs_sample.parquet")
    return dataset
51 |
52 |
@pytest.fixture(scope="session")
def targets_path(data_path: str) -> str:
    """Return the file path of the targets sample dataset."""
    dataset = os.path.join(data_path, "targets_sample.parquet")
    return dataset
57 |
58 |
@pytest.fixture(scope="session")
def outputs_path(data_path: str) -> str:
    """Return the file path of the outputs sample dataset."""
    dataset = os.path.join(data_path, "outputs_sample.parquet")
    return dataset
63 |
64 |
@pytest.fixture(scope="function")
def tmp_outputs_path(tmp_path: str) -> str:
    """Return a per-test temporary path for the outputs dataset."""
    # NOTE(review): pytest's tmp_path is a pathlib.Path; os.path.join accepts it
    return os.path.join(tmp_path, "outputs.parquet")
69 |
70 |
@pytest.fixture(scope="function")
def tmp_models_explanations_path(tmp_path: str) -> str:
    """Return a per-test temporary path for the model explanations dataset."""
    target = os.path.join(tmp_path, "models_explanations.parquet")
    return target
75 |
76 |
@pytest.fixture(scope="function")
def tmp_samples_explanations_path(tmp_path: str) -> str:
    """Return a per-test temporary path for the samples explanations dataset."""
    target = os.path.join(tmp_path, "samples_explanations.parquet")
    return target
81 |
82 |
83 | # %% - Configs
84 |
85 |
@pytest.fixture(scope="session")
def extra_config() -> str:
    """Return an extra config string for scripts, disabling alerts and isolating mlflow.

    The ${tmp_path:} placeholders are resolved through the OmegaConf
    resolver registered in this module.
    """
    text = """
    {
        "job": {
            "alerts_service": {
                "enable": false,
            },
            "mlflow_service": {
                "tracking_uri": "${tmp_path:}/tracking/",
                "registry_uri": "${tmp_path:}/registry/",
            }
        }
    }
    """
    return text
104 |
105 |
106 | # %% - Datasets
107 |
108 |
@pytest.fixture(scope="session")
def inputs_reader(inputs_path: str) -> datasets.ParquetReader:
    """Return a parquet reader over the inputs dataset, capped at LIMIT rows."""
    reader = datasets.ParquetReader(path=inputs_path, limit=LIMIT)
    return reader
113 |
114 |
@pytest.fixture(scope="session")
def inputs_samples_reader(inputs_path: str) -> datasets.ParquetReader:
    """Return a parquet reader over a small sample (100 rows) of the inputs."""
    reader = datasets.ParquetReader(path=inputs_path, limit=100)
    return reader
119 |
120 |
@pytest.fixture(scope="session")
def targets_reader(targets_path: str) -> datasets.ParquetReader:
    """Return a parquet reader over the targets dataset, capped at LIMIT rows."""
    reader = datasets.ParquetReader(path=targets_path, limit=LIMIT)
    return reader
125 |
126 |
@pytest.fixture(scope="session")
def outputs_reader(
    outputs_path: str,
    inputs_reader: datasets.ParquetReader,
    targets_reader: datasets.ParquetReader,
) -> datasets.ParquetReader:
    """Return a reader for the outputs dataset, generating the file if absent."""
    if not os.path.exists(outputs_path):
        # build the outputs once from a baseline model fitted on the samples
        inputs = schemas.InputsSchema.check(inputs_reader.read())
        targets = schemas.TargetsSchema.check(targets_reader.read())
        model = models.BaselineSklearnModel().fit(inputs=inputs, targets=targets)
        outputs = schemas.OutputsSchema.check(model.predict(inputs=inputs))
        datasets.ParquetWriter(path=outputs_path).write(data=outputs)
    return datasets.ParquetReader(path=outputs_path, limit=LIMIT)
143 |
144 |
@pytest.fixture(scope="function")
def tmp_outputs_writer(tmp_outputs_path: str) -> datasets.ParquetWriter:
    """Return a parquet writer targeting the per-test outputs path."""
    writer = datasets.ParquetWriter(path=tmp_outputs_path)
    return writer
149 |
150 |
@pytest.fixture(scope="function")
def tmp_models_explanations_writer(
    tmp_models_explanations_path: str,
) -> datasets.ParquetWriter:
    """Return a parquet writer targeting the per-test model explanations path."""
    writer = datasets.ParquetWriter(path=tmp_models_explanations_path)
    return writer
157 |
158 |
@pytest.fixture(scope="function")
def tmp_samples_explanations_writer(
    tmp_samples_explanations_path: str,
) -> datasets.ParquetWriter:
    """Return a parquet writer targeting the per-test samples explanations path."""
    writer = datasets.ParquetWriter(path=tmp_samples_explanations_path)
    return writer
165 |
166 |
167 | # %% - Dataframes
168 |
169 |
@pytest.fixture(scope="session")
def inputs(inputs_reader: datasets.ParquetReader) -> schemas.Inputs:
    """Return the schema-validated inputs dataframe."""
    return schemas.InputsSchema.check(inputs_reader.read())
175 |
176 |
@pytest.fixture(scope="session")
def inputs_samples(inputs_samples_reader: datasets.ParquetReader) -> schemas.Inputs:
    """Return the schema-validated inputs samples dataframe."""
    return schemas.InputsSchema.check(inputs_samples_reader.read())
182 |
183 |
@pytest.fixture(scope="session")
def targets(targets_reader: datasets.ParquetReader) -> schemas.Targets:
    """Return the schema-validated targets dataframe."""
    return schemas.TargetsSchema.check(targets_reader.read())
189 |
190 |
@pytest.fixture(scope="session")
def outputs(outputs_reader: datasets.ParquetReader) -> schemas.Outputs:
    """Return the schema-validated outputs dataframe."""
    return schemas.OutputsSchema.check(outputs_reader.read())
196 |
197 |
198 | # %% - Splitters
199 |
200 |
@pytest.fixture(scope="session")
def train_test_splitter() -> splitters.TrainTestSplitter:
    """Return a train/test splitter using the shared TEST_SIZE."""
    splitter = splitters.TrainTestSplitter(test_size=TEST_SIZE)
    return splitter
205 |
206 |
@pytest.fixture(scope="session")
def time_series_splitter() -> splitters.TimeSeriesSplitter:
    """Return a time series splitter using the shared N_SPLITS and TEST_SIZE."""
    splitter = splitters.TimeSeriesSplitter(n_splits=N_SPLITS, test_size=TEST_SIZE)
    return splitter
211 |
212 |
213 | # %% - Searchers
214 |
215 |
@pytest.fixture(scope="session")
def searcher() -> searchers.GridCVSearcher:
    """Return a grid searcher over a deliberately tiny parameter grid."""
    return searchers.GridCVSearcher(param_grid={"max_depth": [1, 2], "n_estimators": [3]})
221 |
222 |
223 | # %% - Subsets
224 |
225 |
@pytest.fixture(scope="session")
def train_test_sets(
    train_test_splitter: splitters.TrainTestSplitter,
    inputs: schemas.Inputs,
    targets: schemas.Targets,
) -> tuple[schemas.Inputs, schemas.Targets, schemas.Inputs, schemas.Targets]:
    """Return (inputs_train, targets_train, inputs_test, targets_test) from the splitter."""
    # take the first (and only) split produced by the train/test splitter
    index_train, index_test = next(train_test_splitter.split(inputs=inputs, targets=targets))
    return (
        T.cast(schemas.Inputs, inputs.iloc[index_train]),
        T.cast(schemas.Targets, targets.iloc[index_train]),
        T.cast(schemas.Inputs, inputs.iloc[index_test]),
        T.cast(schemas.Targets, targets.iloc[index_test]),
    )
242 |
243 |
244 | # %% - Models
245 |
246 |
@pytest.fixture(scope="session")
def model(
    train_test_sets: tuple[schemas.Inputs, schemas.Targets, schemas.Inputs, schemas.Targets],
) -> models.BaselineSklearnModel:
    """Return a baseline model fitted on the train split."""
    inputs_train, targets_train, _, _ = train_test_sets
    # fit returns the model itself, so the chained call yields the fitted model
    return models.BaselineSklearnModel().fit(inputs=inputs_train, targets=targets_train)
256 |
257 |
258 | # %% - Metrics
259 |
260 |
@pytest.fixture(scope="session")
def metric() -> metrics.SklearnMetric:
    """Return a sklearn metric with its default configuration."""
    default = metrics.SklearnMetric()
    return default
265 |
266 |
267 | # %% - Signers
268 |
269 |
@pytest.fixture(scope="session")
def signer() -> signers.InferSigner:
    """Return a signer that infers model signatures from data."""
    instance = signers.InferSigner()
    return instance
274 |
275 |
276 | # %% - Services
277 |
278 |
@pytest.fixture(scope="session", autouse=True)
def logger_service() -> T.Generator[services.LoggerService, None, None]:
    """Start the logger service for the whole session and stop it at teardown."""
    logger = services.LoggerService(colorize=False, diagnose=True)
    logger.start()
    yield logger
    logger.stop()
286 |
287 |
@pytest.fixture
def logger_caplog(
    caplog: pl.LogCaptureFixture, logger_service: services.LoggerService
) -> T.Generator[pl.LogCaptureFixture, None, None]:
    """Extend pytest caplog fixture with the logger service (loguru).

    Bridges loguru records into pytest's caplog by adding caplog's handler
    as a temporary loguru sink for the duration of the test.
    """
    # https://loguru.readthedocs.io/en/stable/resources/migration.html#replacing-caplog-fixture-from-pytest-library
    logger = logger_service.logger()
    handler_id = logger.add(
        caplog.handler,
        level=0,  # accept every record here; filtering is delegated to the filter below
        format="{message}",
        filter=lambda record: record["level"].no >= caplog.handler.level,
        enqueue=False,  # Set to 'True' if your test is spawning child processes.
    )
    yield caplog
    # detach the temporary sink so records do not leak into other tests
    logger.remove(handler_id)
304 |
305 |
@pytest.fixture(scope="session", autouse=True)
def alerts_service() -> T.Generator[services.AlertsService, None, None]:
    """Start a disabled alerts service for the whole session, stopping it at teardown."""
    alerter = services.AlertsService(enable=False)
    alerter.start()
    yield alerter
    alerter.stop()
313 |
314 |
@pytest.fixture(scope="function", autouse=True)
def mlflow_service(tmp_path: str) -> T.Generator[services.MlflowService, None, None]:
    """Start an mlflow service backed by per-test temporary stores.

    Function scope keeps tracking/registry data isolated between tests.
    """
    tracker = services.MlflowService(
        tracking_uri=f"{tmp_path}/tracking/",
        registry_uri=f"{tmp_path}/registry/",
        experiment_name="Experiment-Testing",
        registry_name="Registry-Testing",
    )
    tracker.start()
    yield tracker
    tracker.stop()
327 |
328 |
329 | # %% - Resolvers
330 |
331 |
@pytest.fixture(scope="session", autouse=True)
def tests_path_resolver(tests_path: str) -> str:
    """Register the tests path resolver with OmegaConf."""
    # session-wide constant: cache the value and refuse re-registration
    omegaconf.OmegaConf.register_new_resolver(
        "tests_path", lambda: tests_path, use_cache=True, replace=False
    )
    return tests_path
342 |
343 |
@pytest.fixture(scope="function", autouse=True)
def tmp_path_resolver(tmp_path: str) -> str:
    """Register the tmp path resolver with OmegaConf."""
    # per-test value: never cache and overwrite the previous registration
    omegaconf.OmegaConf.register_new_resolver(
        "tmp_path", lambda: tmp_path, use_cache=False, replace=True
    )
    return tmp_path
354 |
355 |
356 | # %% - Signatures
357 |
358 |
@pytest.fixture(scope="session")
def signature(
    signer: signers.Signer, inputs: schemas.Inputs, outputs: schemas.Outputs
) -> signers.Signature:
    """Return the model signature inferred from the sample inputs and outputs."""
    model_signature = signer.sign(inputs=inputs, outputs=outputs)
    return model_signature
365 |
366 |
367 | # %% - Registries
368 |
369 |
@pytest.fixture(scope="session")
def saver() -> registries.CustomSaver:
    """Return a model saver writing under the 'custom-model' artifact path."""
    instance = registries.CustomSaver(path="custom-model")
    return instance
374 |
375 |
@pytest.fixture(scope="session")
def loader() -> registries.CustomLoader:
    """Return a model loader with its default configuration."""
    instance = registries.CustomLoader()
    return instance
380 |
381 |
@pytest.fixture(scope="session")
def register() -> registries.MlflowRegister:
    """Return a model register tagged for test fixtures."""
    return registries.MlflowRegister(tags={"context": "test", "role": "fixture"})
387 |
388 |
@pytest.fixture(scope="function")
def model_version(
    model: models.Model,
    inputs: schemas.Inputs,
    signature: signers.Signature,
    saver: registries.Saver,
    register: registries.Register,
    mlflow_service: services.MlflowService,
) -> registries.Version:
    """Save the test model inside a run and register it, returning the version."""
    run_config = mlflow_service.RunConfig(name="Custom-Run")
    with mlflow_service.run_context(run_config=run_config):
        # the model must be saved within an active run before registration
        model_info = saver.save(model=model, signature=signature, input_example=inputs)
        return register.register(name=mlflow_service.registry_name, model_uri=model_info.model_uri)
404 |
405 |
@pytest.fixture(scope="function")
def model_alias(
    model_version: registries.Version,
    mlflow_service: services.MlflowService,
) -> registries.Alias:
    """Attach the 'Promotion' alias to the registered version and return the aliased model."""
    alias_name = "Promotion"
    client = mlflow_service.client()
    client.set_registered_model_alias(
        name=mlflow_service.registry_name, alias=alias_name, version=model_version.version
    )
    return client.get_model_version_by_alias(name=mlflow_service.registry_name, alias=alias_name)
419 |
--------------------------------------------------------------------------------
/tests/core/test_metrics.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | import mlflow
4 | import pandas as pd
5 | import pytest
6 |
7 | from bikes.core import metrics, models, schemas
8 |
9 | # %% METRICS
10 |
11 |
@pytest.mark.parametrize(
    "name, interval, greater_is_better",
    [
        # intervals encode the sign convention: when greater_is_better is False,
        # the score is expected to be negated (hence the [-inf, 0] interval)
        ("mean_squared_error", [0, float("inf")], True),
        ("mean_absolute_error", [float("-inf"), 0], False),
    ],
)
def test_sklearn_metric(
    name: str,
    interval: tuple[int, int],
    greater_is_better: bool,
    model: models.Model,
    inputs: schemas.Inputs,
    targets: schemas.Targets,
    outputs: schemas.Outputs,
) -> None:
    """Check score/scorer sign conventions and the mlflow metric conversion."""
    # given
    low, high = interval
    # mlflow.evaluate expects targets and predictions in a single dataframe
    data = pd.concat([targets, outputs], axis="columns")
    metric = metrics.SklearnMetric(name=name, greater_is_better=greater_is_better)
    # when
    score = metric.score(targets=targets, outputs=outputs)
    scorer = metric.scorer(model=model, inputs=inputs, targets=targets)
    mlflow_metric = metric.to_mlflow()
    mlflow_results = mlflow.evaluate(
        data=data,
        predictions=schemas.OutputsSchema.prediction,
        targets=schemas.TargetsSchema.cnt,
        extra_metrics=[mlflow_metric],
    )
    # then
    # - score
    assert low <= score <= high, "Score should be in the expected interval!"
    # - scorer
    assert low <= scorer <= high, "Scorer should be in the expected interval!"
    # - mlflow metric
    assert mlflow_metric.name == metric.name, "Mlflow metric name should be the same!" # type: ignore[attr-defined]
    assert (
        mlflow_metric.greater_is_better == metric.greater_is_better # type: ignore[attr-defined]
    ), "Mlflow metric greater is better should be the same!"
    # - mlflow results
    # mlflow reports the raw metric value, while `score` is negated when
    # greater_is_better is False (see intervals above) — hence the sign factor
    assert mlflow_results.metrics == {metric.name: score * (1 if greater_is_better else -1)}, (
        "Mlflow results metrics should have the same name and score!"
    )
56 |
57 |
58 | # %% THRESHOLDS
59 |
60 |
def test_threshold() -> None:
    # given
    threshold = metrics.Threshold(threshold=10, greater_is_better=True)
    # when
    converted = threshold.to_mlflow()
    # then: the mlflow conversion must carry over both attributes unchanged
    assert converted.threshold == threshold.threshold, "Threshold should be the same!"
    assert converted.greater_is_better == threshold.greater_is_better, (
        "Greater is better should be the same!"
    )
71 |
--------------------------------------------------------------------------------
/tests/core/test_models.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | import typing as T
4 |
5 | import pytest
6 |
7 | from bikes.core import models, schemas
8 |
9 | # %% MODELS
10 |
11 |
def test_model(inputs_samples: schemas.Inputs) -> None:
    """Check the Model base class: params handling and NotImplementedError defaults."""
    # given
    # minimal concrete subclass: only fit/predict are implemented, so the
    # explain/get_internal_model defaults from the base class are exercised below
    class MyModel(models.Model):
        KIND: T.Literal["MyModel"] = "MyModel"

        # public
        a: int = 1
        b: int = 2
        # private
        _c: int = 3

        def fit(self, inputs: schemas.Inputs, targets: schemas.Targets) -> T.Self:
            return self

        def predict(self, inputs: schemas.Inputs) -> schemas.Outputs:
            return schemas.Outputs()

    # when
    model = MyModel(a=10)
    params_init = model.get_params()
    params_set_params = model.set_params(b=20).get_params()
    with pytest.raises(NotImplementedError) as explain_model_error:
        model.explain_model()
    with pytest.raises(NotImplementedError) as explain_samples_error:
        model.explain_samples(inputs=inputs_samples)
    with pytest.raises(NotImplementedError) as get_internal_model_error:
        model.get_internal_model()
    # then
    # get_params only exposes public attributes: _c (private) must not appear
    assert params_init == {
        "a": 10,
        "b": 2,
    }, "Model should have the given params after init!"
    assert params_set_params == {
        "a": 10,
        "b": 20,
    }, "Model should have the given params after set_params!"
    assert isinstance(explain_model_error.value, NotImplementedError), (
        "Model should raise NotImplementedError for explain_model_error()!"
    )
    assert isinstance(explain_samples_error.value, NotImplementedError), (
        "Model should raise NotImplementedError for explain_samples_error()!"
    )
    assert isinstance(get_internal_model_error.value, NotImplementedError), (
        "Model should raise NotImplementedError for get_internal_model_error()!"
    )
57 |
58 |
def test_baseline_sklearn_model(
    train_test_sets: tuple[schemas.Inputs, schemas.Targets, schemas.Inputs, schemas.Targets],
) -> None:
    """Check the baseline model lifecycle: params, fit, predict, explanations."""
    # given
    params = {"max_depth": 3, "n_estimators": 5, "random_state": 0}
    inputs_train, targets_train, inputs_test, _ = train_test_sets
    model = models.BaselineSklearnModel().set_params(**params)
    # when
    # get_internal_model must be called BEFORE fit to trigger the not-fitted error
    with pytest.raises(ValueError) as not_fitted_error:
        model.get_internal_model()
    model.fit(inputs=inputs_train, targets=targets_train)
    outputs = model.predict(inputs=inputs_test)
    shap_values = model.explain_samples(inputs=inputs_test)
    feature_importances = model.explain_model()
    # then
    assert not_fitted_error.match("Model is not fitted yet!"), (
        "Model should raise an error when not fitted!"
    )
    # - model
    assert model.get_params() == params, "Model should have the given params!"
    assert model.get_internal_model() is not None, "Internal model should be fitted!"
    # - outputs
    assert outputs.ndim == 2, "Outputs should be a dataframe!"
    # - shap values
    # one row per sample; columns may exceed inputs (e.g. derived/encoded features)
    assert len(shap_values.index) == len(inputs_test.index), (
        "SHAP values should be the same length as inputs!"
    )
    assert len(shap_values.columns) >= len(inputs_test.columns), (
        "SHAP values should have more features than inputs!"
    )
    # - feature importances
    assert feature_importances["importance"].sum() == 1.0, (
        "Feature importances should add up to 1.0!"
    )
    assert len(feature_importances["feature"]) >= len(inputs_train.columns), (
        "Feature importances should have more features than inputs!"
    )
96 |
--------------------------------------------------------------------------------
/tests/core/test_schemas.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | from bikes.core import models, schemas
4 | from bikes.io import datasets
5 |
6 | # %% SCHEMAS
7 |
8 |
def test_inputs_schema(inputs_reader: datasets.Reader) -> None:
    # given & when
    data = inputs_reader.read()
    # then: validation should succeed on the sample data
    assert schemas.InputsSchema.check(data) is not None, "Inputs data should be valid!"
16 |
17 |
def test_targets_schema(targets_reader: datasets.Reader) -> None:
    # given & when
    data = targets_reader.read()
    # then: validation should succeed on the sample data
    assert schemas.TargetsSchema.check(data) is not None, "Targets data should be valid!"
25 |
26 |
def test_outputs_schema(outputs_reader: datasets.Reader) -> None:
    # given & when
    data = outputs_reader.read()
    # then: validation should succeed on the sample data
    assert schemas.OutputsSchema.check(data) is not None, "Outputs data should be valid!"
34 |
35 |
def test_shap_values_schema(
    model: models.Model,
    train_test_sets: tuple[schemas.Inputs, schemas.Targets, schemas.Inputs, schemas.Targets],
) -> None:
    # given
    _, _, inputs_test, _ = train_test_sets
    # when
    data = model.explain_samples(inputs=inputs_test)
    # then
    assert schemas.SHAPValuesSchema.check(data) is not None, "SHAP values data should be valid!"
47 |
48 |
def test_feature_importances_schema(model: models.Model) -> None:
    # given & when
    data = model.explain_model()
    # then
    assert schemas.FeatureImportancesSchema.check(data) is not None, "Feature importance data should be valid!"
56 |
--------------------------------------------------------------------------------
/tests/data/inputs_sample.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/tests/data/inputs_sample.parquet
--------------------------------------------------------------------------------
/tests/data/outputs_sample.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/tests/data/outputs_sample.parquet
--------------------------------------------------------------------------------
/tests/data/targets_sample.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fmind/mlops-python-package/856a5ab74958cec36ae31619636d07c4baf9698f/tests/data/targets_sample.parquet
--------------------------------------------------------------------------------
/tests/io/test_configs.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | import os
4 |
5 | import omegaconf as oc
6 |
7 | from bikes.io import configs
8 |
9 | # %% PARSERS
10 |
11 |
def test_parse_file(tmp_path: str) -> None:
    """Check that a YAML config file on disk is parsed into the expected mapping."""
    # given
    content = """
    a: 1
    b: True
    c: [3, 4]
    """
    config_path = os.path.join(tmp_path, "config.yml")
    with open(config_path, "w", encoding="utf-8") as stream:
        stream.write(content)
    # when
    parsed = configs.parse_file(config_path)
    # then
    expected = {"a": 1, "b": True, "c": [3, 4]}
    assert parsed == expected, "File config should be parsed correctly!"
30 |
31 |
def test_parse_string() -> None:
    """Check that an inline config string is parsed into the expected mapping."""
    # given
    payload = """{"a": 1, "b": 2, "data": [3, 4]}"""
    # when
    parsed = configs.parse_string(payload)
    # then
    expected = {"a": 1, "b": 2, "data": [3, 4]}
    assert parsed == expected, "String config should be parsed correctly!"
43 |
44 |
45 | # %% MERGERS
46 |
47 |
def test_merge_configs() -> None:
    """Check that merging configs keeps all keys and lets later values win."""
    # given
    partials = [oc.OmegaConf.create({"x": index, index: index}) for index in range(3)]
    # when
    merged = configs.merge_configs(partials)
    # then
    # the shared "x" key must come from the last config in the list
    expected = {0: 0, 1: 1, 2: 2, "x": 2}
    assert merged == expected, "Configs should be merged correctly!"
60 |
61 |
62 | # %% CONVERTERS
63 |
64 |
def test_to_object() -> None:
    """Check that an OmegaConf config converts back to a plain Python dict."""
    # given
    values = {"a": 1, "b": True, "c": [3, 4]}
    config = oc.OmegaConf.create(values)
    # when
    converted = configs.to_object(config)
    # then
    assert converted == values, "Object should be the same!"
    assert isinstance(converted, dict), "Object should be a dict!"
78 |
--------------------------------------------------------------------------------
/tests/io/test_datasets.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | import os
4 |
5 | import pytest
6 |
7 | from bikes.core import schemas
8 | from bikes.io import datasets
9 |
10 | # %% READERS
11 |
12 |
@pytest.mark.parametrize("limit", [None, 50])
def test_parquet_reader(limit: int | None, inputs_path: str) -> None:
    """Check reading a parquet file, with and without a row limit, and its lineage."""
    # given
    parquet_reader = datasets.ParquetReader(path=inputs_path, limit=limit)
    # when
    dataframe = parquet_reader.read()
    lineage = parquet_reader.lineage(name="inputs", data=dataframe)
    # then
    # - data
    assert dataframe.ndim == 2, "Data should be a dataframe!"
    if limit is not None:
        assert len(dataframe) == limit, "Data should have the limit size!"
    # - lineage
    assert lineage.name == "inputs", "Lineage name should be inputs!"
    assert lineage.source.uri == inputs_path, "Lineage source uri should be the inputs path!"  # type: ignore[attr-defined]
    schema_names = set(lineage.schema.input_names()) if lineage.schema is not None else None
    assert schema_names == set(dataframe.columns), (
        "Lineage schema names should be the data columns!"
    )
    row_count = lineage.profile["num_rows"]  # type: ignore[index]
    assert row_count == len(dataframe), "Lineage profile should contain the data row count!"
34 |
35 |
36 | # %% WRITERS
37 |
38 |
def test_parquet_writer(targets: schemas.Targets, tmp_outputs_path: str) -> None:
    """Check that writing a targets dataframe creates a file at the given path."""
    # given
    parquet_writer = datasets.ParquetWriter(path=tmp_outputs_path)
    # when
    parquet_writer.write(data=targets)
    # then
    assert os.path.exists(tmp_outputs_path), "Data should be written!"
46 |
--------------------------------------------------------------------------------
/tests/io/test_registries.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | from bikes.core import models, schemas
4 | from bikes.io import registries, services
5 | from bikes.utils import signers
6 |
7 | # %% HELPERS
8 |
9 |
def test_uri_for_model_alias() -> None:
    """Check the URI format produced for a model referenced by alias."""
    # given
    model_name = "testing"
    model_alias = "Champion"
    # when
    uri = registries.uri_for_model_alias(name=model_name, alias=model_alias)
    # then
    assert uri == f"models:/{model_name}@{model_alias}", "The model URI should be valid!"
18 |
19 |
def test_uri_for_model_version() -> None:
    """Check the URI format produced for a model referenced by version number."""
    # given
    model_name = "testing"
    model_version = 1
    # when
    uri = registries.uri_for_model_version(name=model_name, version=model_version)
    # then
    assert uri == f"models:/{model_name}/{model_version}", "The model URI should be valid!"
28 |
29 |
def test_uri_for_model_alias_or_version() -> None:
    """Check that the combined helper dispatches to the alias or version URI."""
    # given
    model_name = "testing"
    model_alias = "Champion"
    model_version = 1
    # when
    alias_uri = registries.uri_for_model_alias_or_version(
        name=model_name, alias_or_version=model_alias
    )
    version_uri = registries.uri_for_model_alias_or_version(
        name=model_name, alias_or_version=model_version
    )
    # then
    expected_alias_uri = registries.uri_for_model_alias(name=model_name, alias=model_alias)
    expected_version_uri = registries.uri_for_model_version(name=model_name, version=model_version)
    assert alias_uri == expected_alias_uri, "The alias URI should be valid!"
    assert version_uri == expected_version_uri, "The version URI should be valid!"
45 |
46 |
47 | # %% SAVERS/LOADERS/REGISTERS
48 |
49 |
def test_custom_pipeline(
    model: models.Model,
    inputs: schemas.Inputs,
    signature: signers.Signature,
    mlflow_service: services.MlflowService,
) -> None:
    """Round-trip a model through the custom (pyfunc) registry path.

    Saves the model with CustomSaver inside an Mlflow run, registers the
    resulting artifact, loads it back via CustomLoader, runs a prediction,
    then checks the run info, registered version, adapter metadata, and
    prediction outputs against the project schema.
    """
    # given
    path = "custom"  # artifact path under the Mlflow run
    name = "Custom"  # registered model name
    tags = {"registry": "mlflow"}
    saver = registries.CustomSaver(path=path)
    loader = registries.CustomLoader()
    register = registries.MlflowRegister(tags=tags)
    run_config = mlflow_service.RunConfig(name="Custom-Run")
    # when
    with mlflow_service.run_context(run_config=run_config) as run:
        info = saver.save(model=model, signature=signature, input_example=inputs)
        version = register.register(name=name, model_uri=info.model_uri)
    model_uri = registries.uri_for_model_version(name=name, version=version.version)
    adapter = loader.load(uri=model_uri)
    outputs = adapter.predict(inputs=inputs)
    # then
    # - uri
    assert model_uri == f"models:/{name}/{version.version}", "The model URI should be valid!"
    # - info
    assert info.run_id == run.info.run_id, "The run id should be the same!"
    assert info.artifact_path == path, "The artifact path should be the same!"
    assert info.signature == signature, "The model signature should be the same!"
    assert info.flavors.get("python_function"), "The model should have a pyfunc flavor!"
    # - version
    assert version.name == name, "The model version name should be the same!"
    assert version.tags == tags, "The model version tags should be the same!"
    assert version.aliases == [], "The model version aliases should be empty!"
    assert version.run_id == run.info.run_id, "The model version run id should be the same!"
    # - adapter
    assert adapter.model.metadata.run_id == version.run_id, (
        "The adapter model run id should be the same!"
    )
    assert adapter.model.metadata.signature == signature, (
        "The adapter model signature should be the same!"
    )
    assert adapter.model.metadata.flavors.get("python_function") is not None, (
        "The adapter model should have a python_function flavor!"
    )
    # - output
    assert schemas.OutputsSchema.check(outputs) is not None, "Outputs should be valid!"
96 |
97 |
def test_builtin_pipeline(
    model: models.Model,
    inputs: schemas.Inputs,
    signature: signers.Signature,
    mlflow_service: services.MlflowService,
) -> None:
    """Round-trip a model through the built-in (sklearn flavor) registry path.

    Same flow as the custom pipeline test, but uses BuiltinSaver/BuiltinLoader
    with an explicit flavor, and additionally checks that the saved model and
    loaded adapter both expose that built-in flavor.
    """
    # given
    path = "builtin"  # artifact path under the Mlflow run
    name = "Builtin"  # registered model name
    flavor = "sklearn"  # built-in Mlflow flavor to save with
    tags = {"registry": "mlflow"}
    saver = registries.BuiltinSaver(path=path, flavor=flavor)
    loader = registries.BuiltinLoader()
    register = registries.MlflowRegister(tags=tags)
    run_config = mlflow_service.RunConfig(name="Builtin-Run")
    # when
    with mlflow_service.run_context(run_config=run_config) as run:
        info = saver.save(model=model, signature=signature, input_example=inputs)
        version = register.register(name=name, model_uri=info.model_uri)
    model_uri = registries.uri_for_model_version(name=name, version=version.version)
    adapter = loader.load(uri=model_uri)
    outputs = adapter.predict(inputs=inputs)
    # then
    # - uri
    assert model_uri == f"models:/{name}/{version.version}", "The model URI should be valid!"
    # - info
    assert info.run_id == run.info.run_id, "The run id should be the same!"
    assert info.artifact_path == path, "The artifact path should be the same!"
    assert info.signature == signature, "The model signature should be the same!"
    assert info.flavors.get("python_function"), "The model should have a pyfunc flavor!"
    assert info.flavors.get(flavor), f"The model should have a built-in model flavor: {flavor}!"
    # - version
    assert version.name == name, "The model version name should be the same!"
    assert version.tags == tags, "The model version tags should be the same!"
    assert version.aliases == [], "The model version aliases should be empty!"
    assert version.run_id == run.info.run_id, "The model version run id should be the same!"
    # - adapter
    assert adapter.model.metadata.run_id == version.run_id, (
        "The adapter model run id should be the same!"
    )
    assert adapter.model.metadata.signature == signature, (
        "The adapter model signature should be the same!"
    )
    assert adapter.model.metadata.flavors.get("python_function") is not None, (
        "The adapter model should have a python_function flavor!"
    )
    assert adapter.model.metadata.flavors.get(flavor), (
        f"The model should have a built-in model flavor: {flavor}!"
    )
    # - output
    assert schemas.OutputsSchema.check(outputs) is not None, "Outputs should be valid!"
149 |
--------------------------------------------------------------------------------
/tests/io/test_services.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | import _pytest.capture as pc
4 | import _pytest.logging as pl
5 | import mlflow
6 | import plyer
7 | import pytest
8 | import pytest_mock as pm
9 |
10 | from bikes.io import services
11 |
12 | # %% SERVICES
13 |
14 |
def test_logger_service(
    logger_service: services.LoggerService, logger_caplog: pl.LogCaptureFixture
) -> None:
    """Check that debug and error records reach the captured logger."""
    # given
    app_logger = logger_service.logger()
    # when
    app_logger.debug("DEBUG")
    app_logger.error("ERROR")
    # then
    assert "DEBUG" in logger_caplog.messages, "Debug message should be logged!"
    assert "ERROR" in logger_caplog.messages, "Error message should be logged!"
27 |
28 |
@pytest.mark.parametrize("enable", [True, False])
def test_alerts_service(
    enable: bool, mocker: pm.MockerFixture, capsys: pc.CaptureFixture[str]
) -> None:
    """Check that alerts use the system notifier when enabled, else stdout.

    Args:
        enable: whether the alerts service should send real notifications.
        mocker: pytest-mock fixture used to stub out the plyer notifier.
        capsys: pytest fixture capturing stdout/stderr.
    """
    # given
    service = services.AlertsService(enable=enable)
    mocker.patch(target="plyer.notification.notify")
    # when
    service.notify(title="test", message="hello")
    # then
    # NOTE: the mock assertions raise on failure by themselves; the previous
    # tuple-wrapped form `(assert_..., "message")` silently discarded the
    # message string and read like an assert without being one.
    if enable:
        plyer.notification.notify.assert_called_once()
        assert capsys.readouterr().out == "", "Notification should not be printed to stdout!"
    else:
        plyer.notification.notify.assert_not_called()
        assert capsys.readouterr().out == "[Bikes] test: hello\n", (
            "Notification should be printed to stdout!"
        )
53 |
54 |
def test_alerts_service__not_supported(
    mocker: pm.MockerFixture, capsys: pc.CaptureFixture[str]
) -> None:
    """Check the stdout fallback message when system notifications are unavailable."""

    # given
    def raise_not_implemented(*args, **kwargs):
        # simulate a platform where plyer cannot deliver notifications
        raise NotImplementedError()

    service = services.AlertsService(enable=True)
    mocker.patch(target="plyer.notification.notify", new=raise_not_implemented)
    # when
    service.notify(title="test", message="hello")
    # then
    captured = capsys.readouterr().out
    assert "Notifications are not supported on this system." in captured
68 |
69 |
def test_mlflow_service(mlflow_service: services.MlflowService) -> None:
    """Check the Mlflow service setup: tracking, registry, client, and run context.

    Starts (and immediately finishes) a run through the service context manager,
    then verifies the service configuration is reflected in the global mlflow
    state, the client, the in-context run data, and the finished run status.
    """
    # given
    service = mlflow_service
    run_config = mlflow_service.RunConfig(
        name="testing",
        tags={"service": "mlflow"},
        description="a test run.",
        log_system_metrics=True,
    )
    # when
    client = service.client()
    with service.run_context(run_config=run_config) as context:
        pass
    finished = client.get_run(run_id=context.info.run_id)
    # then
    # - run
    assert run_config.tags is not None, "Run config tags should be set!"
    # - mlflow
    assert service.tracking_uri == mlflow.get_tracking_uri(), "Tracking URI should be the same!"
    assert service.registry_uri == mlflow.get_registry_uri(), "Registry URI should be the same!"
    assert mlflow.get_experiment_by_name(service.experiment_name), "Experiment should be setup!"
    # - client
    assert service.tracking_uri == client.tracking_uri, "Tracking URI should be the same!"
    # fixed message: this line compares the registry URI, not the tracking URI
    assert service.registry_uri == client._registry_uri, "Registry URI should be the same!"
    assert client.get_experiment_by_name(service.experiment_name), "Experiment should be setup!"
    # - context
    assert context.info.run_name == run_config.name, "Context name should be the same!"
    assert run_config.description in context.data.tags.values(), (
        "Context desc. should be in tags values!"
    )
    # dict items `>` means the context tags strictly contain the config tags
    assert context.data.tags.items() > run_config.tags.items(), (
        "Context tags should be a superset of the given tags!"
    )
    assert context.info.status == "RUNNING", "Context should be running!"
    # - finished
    assert finished.info.status == "FINISHED", "Finished should be finished!"
106 |
--------------------------------------------------------------------------------
/tests/jobs/test_base.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | from bikes.io import services
4 | from bikes.jobs import base
5 |
6 | # %% JOBS
7 |
8 |
def test_job(
    logger_service: services.LoggerService,
    alerts_service: services.AlertsService,
    mlflow_service: services.MlflowService,
) -> None:
    """Check that a concrete Job subclass wires its services and returns locals.

    Args:
        logger_service: logging service injected into the job.
        alerts_service: alerting service injected into the job.
        mlflow_service: Mlflow service injected into the job.
    """

    # given
    class MyJob(base.Job):
        KIND: str = "MyJob"

        def run(self) -> base.Locals:
            a, b = 1, "test"
            return locals()

    job = MyJob(
        logger_service=logger_service,
        alerts_service=alerts_service,
        mlflow_service=mlflow_service,
    )
    # when
    with job as runner:
        out = runner.run()
    # then
    # - inputs (messages fixed: "an Logger" -> "a Logger", "a alerter" -> "an alerts")
    assert hasattr(job, "logger_service"), "Job should have a Logger service!"
    assert hasattr(job, "alerts_service"), "Job should have an alerts service!"
    assert hasattr(job, "mlflow_service"), "Job should have an Mlflow service!"
    # - outputs
    assert set(out) == {"self", "a", "b"}, "Run should return local variables!"
37 |
--------------------------------------------------------------------------------
/tests/jobs/test_evaluations.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | import _pytest.capture as pc
4 | import pytest
5 |
6 | from bikes import jobs
7 | from bikes.core import metrics, schemas
8 | from bikes.io import datasets, registries, services
9 |
10 | # %% JOBS
11 |
12 |
@pytest.mark.parametrize(
    "alias_or_version, thresholds",
    [
        (
            1,
            {
                "mean_squared_error": metrics.Threshold(
                    threshold=float("inf"), greater_is_better=False
                )
            },
        ),
        (
            "Promotion",
            {"r2_score": metrics.Threshold(threshold=-1, greater_is_better=True)},
        ),
        pytest.param(
            "Promotion",
            {"r2_score": metrics.Threshold(threshold=100, greater_is_better=True)},
            marks=pytest.mark.xfail(
                reason="Invalid threshold for metric.",
                raises=metrics.MlflowModelValidationFailedException,
            ),
        ),
    ],
)
def test_evaluations_job(
    alias_or_version: str | int,
    thresholds: dict[str, metrics.Threshold],
    mlflow_service: services.MlflowService,
    alerts_service: services.AlertsService,
    logger_service: services.LoggerService,
    inputs_reader: datasets.ParquetReader,
    targets_reader: datasets.ParquetReader,
    model_alias: registries.Version,
    metric: metrics.SklearnMetric,
    capsys: pc.CaptureFixture[str],
) -> None:
    """Run the evaluations job end-to-end and check its outputs and side effects.

    Covers: run metadata, dataset lineage, the mlflow evaluation dataset,
    extra metrics, validation thresholds, the resulting mlflow runs, and the
    alerting notification. Message fixes only: the run-tags comparison is a
    superset check, and two message typos are corrected.
    """
    # given
    if isinstance(alias_or_version, int):
        assert alias_or_version == model_alias.version, "Model version should be the same!"
    else:
        assert alias_or_version == model_alias.aliases[0], "Model alias should be the same!"
    run_config = mlflow_service.RunConfig(
        name="EvaluationsTest",
        tags={"context": "evaluations"},
        description="Evaluations job.",
    )
    # when
    job = jobs.EvaluationsJob(
        logger_service=logger_service,
        alerts_service=alerts_service,
        mlflow_service=mlflow_service,
        run_config=run_config,
        inputs=inputs_reader,
        targets=targets_reader,
        alias_or_version=alias_or_version,
        metrics=[metric],
        thresholds=thresholds,
    )
    with job as runner:
        out = runner.run()
    # then
    # - vars
    assert set(out) == {
        "self",
        "logger",
        "client",
        "run",
        "inputs",
        "inputs_",
        "inputs_lineage",
        "targets",
        "targets_",
        "targets_lineage",
        "outputs",
        "model",
        "model_uri",
        "dataset",
        "dataset_",
        "extra_metrics",
        "validation_thresholds",
        "evaluations",
    }
    # - run
    assert run_config.tags is not None, "Run config tags should be set!"
    assert out["run"].info.run_name == run_config.name, "Run name should be the same!"
    assert run_config.description in out["run"].data.tags.values(), "Run desc. should be tags!"
    # dict items `>` means the run tags strictly contain the config tags
    assert out["run"].data.tags.items() > run_config.tags.items(), (
        "Run tags should be a superset of the config tags!"
    )
    # - data
    assert out["inputs"].ndim == out["inputs_"].ndim == 2, "Inputs should be a dataframe!"
    assert out["targets"].ndim == out["targets_"].ndim == 2, "Targets should be a dataframe!"
    # - lineage
    assert out["inputs_lineage"].name == "inputs", "Inputs lineage name should be inputs!"
    assert out["inputs_lineage"].source.uri == inputs_reader.path, (
        "Inputs lineage source should be the inputs reader path!"
    )
    assert out["targets_lineage"].name == "targets", "Targets lineage name should be targets!"
    assert out["targets_lineage"].source.uri == targets_reader.path, (
        "Targets lineage source should be the targets reader path!"
    )
    assert out["targets_lineage"].targets == schemas.TargetsSchema.cnt, (
        "Targets lineage target should be cnt!"
    )
    # - outputs
    assert out["outputs"].ndim == 2, "Outputs should be a dataframe!"
    # - model uri
    assert str(alias_or_version) in out["model_uri"], "Model URI should contain the model alias!"
    assert mlflow_service.registry_name in out["model_uri"], (
        "Model URI should contain the registry name!"
    )
    # - model
    assert out["model"].model.metadata.run_id == model_alias.run_id, (
        "Model run id should be the same!"
    )
    assert out["model"].model.metadata.signature is not None, "Model should have a signature!"
    assert out["model"].model.metadata.flavors.get("python_function"), (
        "Model should have a pyfunc flavor!"
    )
    # - dataset
    assert out["dataset"].name == "evaluation", "Dataset name should be evaluation!"
    assert out["dataset"].targets == schemas.TargetsSchema.cnt, (
        "Dataset targets should be the target column!"
    )
    assert out["dataset"].predictions == schemas.OutputsSchema.prediction, (
        "Dataset predictions should be the prediction column!"
    )
    assert out["dataset"].source.to_dict().keys() == {"tags"}, "Dataset source should have tags!"
    # - extra metrics
    assert len(out["extra_metrics"]) == len(job.metrics), (
        "Extra metrics should have the same length as metrics!"
    )
    assert out["extra_metrics"][0].name == job.metrics[0].name, (
        "Extra metrics name should be the same!"
    )
    assert out["extra_metrics"][0].greater_is_better == job.metrics[0].greater_is_better, (
        "Extra metrics greater is better should be the same!"
    )
    # - validation thresholds
    assert out["validation_thresholds"].keys() == thresholds.keys(), (
        "Validation thresholds should have the same keys as thresholds!"
    )
    # - evaluations
    assert out["evaluations"].metrics["example_count"] == inputs_reader.limit, (
        "Evaluations should have the same number of examples as the inputs!"
    )
    assert job.metrics[0].name in out["evaluations"].metrics, "Metric should be logged in Mlflow!"
    # - mlflow tracking
    experiment = mlflow_service.client().get_experiment_by_name(name=mlflow_service.experiment_name)
    assert experiment is not None, "Mlflow Experiment should exist!"
    assert experiment.name == mlflow_service.experiment_name, (
        "Mlflow Experiment name should be the same!"
    )
    runs = mlflow_service.client().search_runs(experiment_ids=experiment.experiment_id)
    assert len(runs) == 2, "There should be two Mlflow runs for training and evaluations!"
    assert metric.name in runs[0].data.metrics, "Metric should be logged in Mlflow!"
    assert runs[0].info.status == "FINISHED", "Mlflow run status should be set as FINISHED!"
    # - alerting service
    assert "Evaluations" in capsys.readouterr().out, "Alerting service should be called!"
173 |
--------------------------------------------------------------------------------
/tests/jobs/test_explanations.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | import _pytest.capture as pc
4 | import pytest
5 |
6 | from bikes import jobs
7 | from bikes.core import models
8 | from bikes.io import datasets, registries, services
9 |
10 | # %% JOBS
11 |
12 |
@pytest.mark.parametrize("alias_or_version", [1, "Promotion"])
def test_explanations_job(
    alias_or_version: str | int,
    mlflow_service: services.MlflowService,
    alerts_service: services.AlertsService,
    logger_service: services.LoggerService,
    inputs_samples_reader: datasets.ParquetReader,
    tmp_models_explanations_writer: datasets.ParquetWriter,
    tmp_samples_explanations_writer: datasets.ParquetWriter,
    model_alias: registries.Version,
    loader: registries.CustomLoader,
    capsys: pc.CaptureFixture[str],
) -> None:
    """Check the explanations job end-to-end for a model alias or a version."""
    # given
    if isinstance(alias_or_version, int):
        assert alias_or_version == model_alias.version, "Model version should be the same!"
    else:
        assert alias_or_version == model_alias.aliases[0], "Model alias should be the same!"
    # when
    job = jobs.ExplanationsJob(
        logger_service=logger_service,
        alerts_service=alerts_service,
        mlflow_service=mlflow_service,
        inputs_samples=inputs_samples_reader,
        models_explanations=tmp_models_explanations_writer,
        samples_explanations=tmp_samples_explanations_writer,
        alias_or_version=alias_or_version,
        loader=loader,
    )
    with job as runner:
        results = runner.run()
    # then
    # - vars
    expected_locals = {
        "self",
        "logger",
        "inputs_samples",
        "model_uri",
        "model",
        "models_explanations",
        "samples_explanations",
    }
    assert set(results) == expected_locals
    # - inputs
    samples = results["inputs_samples"]
    assert samples.ndim == 2, "Inputs samples should be a dataframe!"
    # - model uri
    model_uri = results["model_uri"]
    assert str(alias_or_version) in model_uri, "Model URI should contain the model alias!"
    assert mlflow_service.registry_name in model_uri, (
        "Model URI should contain the registry name!"
    )
    # - model
    assert isinstance(results["model"], models.Model), (
        "Model should be an instance of a project Model!"
    )
    # - model explanations
    model_explanations = results["models_explanations"]
    assert len(model_explanations.index) >= len(samples.columns), (
        "Model explanations should have at least as many columns as inputs samples!"
    )
    # - samples explanations
    sample_explanations = results["samples_explanations"]
    assert len(sample_explanations.index) == len(samples.index), (
        "Samples explanations should have the same number of rows as inputs samples!"
    )
    assert len(sample_explanations.columns) >= len(samples.columns), (
        "Samples explanations should have at least as many columns as inputs samples!"
    )
    # - alerting service
    assert "Explanations Job Finished" in capsys.readouterr().out, (
        "Alerting service should be called!"
    )
79 |
--------------------------------------------------------------------------------
/tests/jobs/test_inference.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | import _pytest.capture as pc
4 | import pytest
5 |
6 | from bikes import jobs
7 | from bikes.io import datasets, registries, services
8 |
9 | # %% JOBS
10 |
11 |
@pytest.mark.parametrize("alias_or_version", [1, "Promotion"])
def test_inference_job(
    alias_or_version: str | int,
    mlflow_service: services.MlflowService,
    alerts_service: services.AlertsService,
    logger_service: services.LoggerService,
    inputs_reader: datasets.ParquetReader,
    tmp_outputs_writer: datasets.ParquetWriter,
    model_alias: registries.Version,
    loader: registries.CustomLoader,
    capsys: pc.CaptureFixture[str],
) -> None:
    """Check the inference job end-to-end for a model alias or a version."""
    # given
    if isinstance(alias_or_version, int):
        assert alias_or_version == model_alias.version, "Model version should be the same!"
    else:
        assert alias_or_version == model_alias.aliases[0], "Model alias should be the same!"
    # when
    job = jobs.InferenceJob(
        logger_service=logger_service,
        alerts_service=alerts_service,
        mlflow_service=mlflow_service,
        inputs=inputs_reader,
        outputs=tmp_outputs_writer,
        alias_or_version=alias_or_version,
        loader=loader,
    )
    with job as runner:
        results = runner.run()
    # then
    # - vars
    expected_locals = {"self", "logger", "inputs", "inputs_", "model_uri", "model", "outputs"}
    assert set(results) == expected_locals
    # - inputs
    assert results["inputs"].ndim == results["inputs_"].ndim == 2, "Inputs should be a dataframe!"
    # - model uri
    model_uri = results["model_uri"]
    assert str(alias_or_version) in model_uri, "Model URI should contain the model alias!"
    assert mlflow_service.registry_name in model_uri, (
        "Model URI should contain the registry name!"
    )
    # - model
    metadata = results["model"].model.metadata
    assert metadata.run_id == model_alias.run_id, "Model run id should be the same!"
    assert metadata.signature is not None, "Model should have a signature!"
    assert metadata.flavors.get("python_function"), "Model should have a pyfunc flavor!"
    # - outputs
    assert results["outputs"].ndim == 2, "Outputs should be a dataframe!"
    # - alerting service
    assert "Inference Job Finished" in capsys.readouterr().out, "Alerting service should be called!"
71 |
--------------------------------------------------------------------------------
/tests/jobs/test_promotion.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | import _pytest.capture as pc
4 | import mlflow
5 | import pytest
6 |
7 | from bikes import jobs
8 | from bikes.io import registries, services
9 |
10 | # %% JOBS
11 |
12 |
@pytest.mark.parametrize(
    "version",
    [
        None,  # latest version
        1,  # specific version
        pytest.param(
            2,
            marks=pytest.mark.xfail(
                reason="Version does not exist.",
                raises=mlflow.exceptions.MlflowException,
            ),
        ),
    ],
)
def test_promotion_job(
    version: int | None,
    mlflow_service: services.MlflowService,
    alerts_service: services.AlertsService,
    logger_service: services.LoggerService,
    model_version: registries.Version,
    capsys: pc.CaptureFixture[str],
) -> None:
    """Check that the promotion job attaches the alias to the expected version."""
    # given
    alias = "Testing"
    # when
    job = jobs.PromotionJob(
        logger_service=logger_service,
        alerts_service=alerts_service,
        mlflow_service=mlflow_service,
        version=version,
        alias=alias,
    )
    with job as runner:
        results = runner.run()
    # then
    # - vars
    expected_locals = {"self", "logger", "client", "name", "version", "model_version"}
    assert set(results) == expected_locals
    # - name
    assert results["name"] == mlflow_service.registry_name, "Model name should be the same!"
    # - version
    assert results["version"] == model_version.version, "Version number should be the same!"
    # - model version
    promoted = results["model_version"]
    assert promoted.name == results["name"], "Model version name should be the same!"
    assert promoted.version == results["version"], (
        "Model version number should be the same!"
    )
    assert promoted.run_id == model_version.run_id, (
        "Model version run id should be the same!"
    )
    assert promoted.aliases == [alias], (
        "Model version aliases should contain the given alias!"
    )
    # - alerting service
    assert "Promotion Job Finished" in capsys.readouterr().out, "Alerting service should be called!"
74 |
--------------------------------------------------------------------------------
/tests/jobs/test_training.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | import _pytest.capture as pc
4 |
5 | from bikes import jobs
6 | from bikes.core import metrics, models, schemas
7 | from bikes.io import datasets, registries, services
8 | from bikes.utils import signers, splitters
9 |
10 | # %% JOBS
11 |
12 |
def test_training_job(
    mlflow_service: services.MlflowService,
    alerts_service: services.AlertsService,
    logger_service: services.LoggerService,
    inputs_reader: datasets.ParquetReader,
    targets_reader: datasets.ParquetReader,
    model: models.BaselineSklearnModel,
    metric: metrics.SklearnMetric,
    train_test_splitter: splitters.TrainTestSplitter,
    saver: registries.CustomSaver,
    signer: signers.InferSigner,
    register: registries.MlflowRegister,
    capsys: pc.CaptureFixture[str],
) -> None:
    """Run the training job end-to-end and validate its local variables, data lineage,
    Mlflow tracking run, model registry entry, and the alerting side effect."""
    # given
    run_config = mlflow_service.RunConfig(
        name="TrainingTest", tags={"context": "training"}, description="Training job."
    )
    splitter = train_test_splitter
    client = mlflow_service.client()
    # when
    job = jobs.TrainingJob(
        logger_service=logger_service,
        alerts_service=alerts_service,
        mlflow_service=mlflow_service,
        run_config=run_config,
        inputs=inputs_reader,
        targets=targets_reader,
        model=model,
        metrics=[metric],
        splitter=splitter,
        saver=saver,
        signer=signer,
        registry=register,
    )
    with job as runner:
        out = runner.run()
    # then
    # - vars: locals() snapshot returned by the job runner
    # (fixed: the original literal listed "inputs_test" twice, which a set silently collapses)
    assert set(out) == {
        "self",
        "logger",
        "client",
        "run",
        "inputs",
        "inputs_",
        "inputs_lineage",
        "targets",
        "targets_",
        "targets_lineage",
        "train_index",
        "test_index",
        "inputs_train",
        "inputs_test",
        "targets_train",
        "targets_test",
        "outputs_test",
        "i",
        "metric",
        "score",
        "model_signature",
        "model_info",
        "model_version",
    }
    # - run
    assert run_config.tags is not None, "Run config tags should be set!"
    assert out["run"].info.run_name == run_config.name, "Run name should be the same!"
    assert run_config.description in out["run"].data.tags.values(), "Run desc. should be tags!"
    # run tags are a strict superset: Mlflow adds its own system tags on top of the config tags
    assert out["run"].data.tags.items() > run_config.tags.items(), (
        "Run config tags should be a subset of the run tags!"
    )
    # - data
    assert out["inputs"].ndim == out["inputs_"].ndim == 2, "Inputs should be a dataframe!"
    assert out["targets"].ndim == out["targets_"].ndim == 2, "Targets should be a dataframe!"
    # - lineage
    assert out["inputs_lineage"].name == "inputs", "Inputs lineage name should be inputs!"
    assert out["inputs_lineage"].source.uri == inputs_reader.path, (
        "Inputs lineage source should be the inputs reader path!"
    )
    assert out["targets_lineage"].name == "targets", "Targets lineage name should be targets!"
    assert out["targets_lineage"].source.uri == targets_reader.path, (
        "Targets lineage source should be the targets reader path!"
    )
    assert out["targets_lineage"].targets == schemas.TargetsSchema.cnt, (
        "Targets lineage target should be cnt!"
    )
    # - splitter
    assert len(out["inputs_train"]) + len(out["inputs_test"]) == len(out["inputs"]), (
        "Train and test inputs should have the same length as inputs!"
    )
    assert len(out["targets_train"]) + len(out["targets_test"]) == len(out["targets"]), (
        "Train and test targets should have the same length as targets!"
    )
    assert len(out["train_index"]) == len(out["inputs_train"]) == len(out["targets_train"]), (
        "Train inputs and targets should have the same length!"
    )
    assert len(out["test_index"]) == len(out["inputs_test"]) == len(out["targets_test"]), (
        "Test inputs and targets should have the same length!"
    )
    # - outputs
    assert out["outputs_test"].shape == out["targets_test"].shape, (
        "Outputs should have the same shape as targets!"
    )
    assert len(out["test_index"]) == len(out["outputs_test"]) == len(out["inputs_test"]), (
        "Outputs should have the same length as inputs!"
    )
    # - i and score
    assert out["i"] == len(job.metrics), "i should be the number of metrics computed!"
    # (fixed: the check is for finiteness, but the message claimed a 0-1 range)
    assert float("-inf") < out["score"] < float("+inf"), "Score should be a finite number!"
    # - model signature
    assert out["model_signature"].inputs is not None, "Model signature inputs should not be None!"
    assert out["model_signature"].outputs is not None, "Model signature outputs should not be None!"
    # - model info
    assert out["model_info"].run_id == out["run"].info.run_id, (
        "Model info run id should be the same!"
    )
    assert out["model_info"].signature == out["model_signature"], (
        "Model info signature should be the same!"
    )
    assert out["model_info"].artifact_path == saver.path, "Model info path should be the same!"
    # - model version
    assert out["model_version"].version == 1, "Model version number should be 1!"
    assert out["model_version"].aliases == [], "Model version aliases should be empty!"
    assert out["model_version"].tags == register.tags, "Model version tags should be the same!"
    assert out["model_version"].name == mlflow_service.registry_name, (
        "Model name should be the same!"
    )
    assert out["model_version"].run_id == out["run"].info.run_id, (
        "Model version run id should be the same!"
    )
    # - mlflow tracking
    experiment = client.get_experiment_by_name(name=mlflow_service.experiment_name)
    assert experiment is not None, "Mlflow Experiment should exist!"
    assert experiment.name == mlflow_service.experiment_name, (
        "Mlflow Experiment name should be the same!"
    )
    runs = client.search_runs(experiment_ids=experiment.experiment_id)
    assert len(runs) == 1, "There should be a single Mlflow run for training!"
    assert metric.name in runs[0].data.metrics, "Metric should be logged in Mlflow!"
    assert runs[0].info.status == "FINISHED", "Mlflow run status should be set as FINISHED!"
    # - mlflow registry
    model_version = client.get_model_version(
        name=mlflow_service.registry_name, version=out["model_version"].version
    )
    assert model_version.run_id == out["run"].info.run_id, (
        "MLFlow model version run id should be the same!"
    )
    # - alerting service
    assert "Training Job Finished" in capsys.readouterr().out, "Alerting service should be called!"
163 |
--------------------------------------------------------------------------------
/tests/jobs/test_tuning.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | import _pytest.capture as pc
4 |
5 | from bikes import jobs
6 | from bikes.core import metrics, models, schemas
7 | from bikes.io import datasets, services
8 | from bikes.utils import searchers, splitters
9 |
10 | # %% JOBS
11 |
12 |
def test_tuning_job(
    mlflow_service: services.MlflowService,
    alerts_service: services.AlertsService,
    logger_service: services.LoggerService,
    inputs_reader: datasets.ParquetReader,
    targets_reader: datasets.ParquetReader,
    model: models.BaselineSklearnModel,
    metric: metrics.SklearnMetric,
    time_series_splitter: splitters.TimeSeriesSplitter,
    searcher: searchers.GridCVSearcher,
    capsys: pc.CaptureFixture[str],
) -> None:
    """Run the tuning job end-to-end and validate its local variables, data lineage,
    search results, Mlflow tracking runs, and the alerting side effect."""
    # given
    run_config = mlflow_service.RunConfig(
        name="TuningTest", tags={"context": "tuning"}, description="Tuning job."
    )
    splitter = time_series_splitter
    client = mlflow_service.client()
    # when
    job = jobs.TuningJob(
        logger_service=logger_service,
        alerts_service=alerts_service,
        mlflow_service=mlflow_service,
        run_config=run_config,
        inputs=inputs_reader,
        targets=targets_reader,
        model=model,
        metric=metric,
        splitter=splitter,
        searcher=searcher,
    )
    with job as runner:
        out = runner.run()
    # then
    # - vars: locals() snapshot returned by the job runner
    assert set(out) == {
        "self",
        "logger",
        "run",
        "inputs",
        "inputs_",
        "inputs_lineage",
        "targets",
        "targets_",
        "targets_lineage",
        "results",
        "best_params",
        "best_score",
    }
    # - run
    assert run_config.tags is not None, "Run config tags should be set!"
    assert out["run"].info.run_name == run_config.name, "Run name should be the same!"
    assert run_config.description in out["run"].data.tags.values(), "Run desc. should be tags!"
    # run tags are a strict superset: Mlflow adds its own system tags on top of the config tags
    assert out["run"].data.tags.items() > run_config.tags.items(), (
        "Run config tags should be a subset of the run tags!"
    )
    # - data
    assert out["inputs"].ndim == out["inputs_"].ndim == 2, "Inputs should be a dataframe!"
    # (fixed: the original compared out["inputs_"].ndim here instead of out["targets_"].ndim)
    assert out["targets"].ndim == out["targets_"].ndim == 2, "Targets should be a dataframe!"
    # - lineage
    assert out["inputs_lineage"].name == "inputs", "Inputs lineage name should be inputs!"
    assert out["inputs_lineage"].source.uri == inputs_reader.path, (
        "Inputs lineage source should be the inputs reader path!"
    )
    assert out["targets_lineage"].name == "targets", "Targets lineage name should be targets!"
    assert out["targets_lineage"].source.uri == targets_reader.path, (
        "Targets lineage source should be the targets reader path!"
    )
    assert out["targets_lineage"].targets == schemas.TargetsSchema.cnt, (
        "Targets lineage target should be cnt!"
    )
    # - results
    assert out["results"].ndim == 2, "Results should be a dataframe!"
    # - best score
    assert float("-inf") < out["best_score"] < float("inf"), (
        "Best score should be between -inf and +inf!"
    )
    # - best params
    assert out["best_params"].keys() == searcher.param_grid.keys(), (
        "Best params should have the same keys!"
    )
    # - mlflow tracking
    experiment = client.get_experiment_by_name(name=mlflow_service.experiment_name)
    assert experiment is not None, "Mlflow experiment should exist!"
    assert experiment.name == mlflow_service.experiment_name, (
        "Mlflow experiment name should be the same!"
    )
    runs = client.search_runs(experiment_ids=experiment.experiment_id)
    assert len(runs) == len(out["results"]) + 1, "Mlflow should have 1 run per result + parent!"
    # - alerting service
    assert "Tuning Job Finished" in capsys.readouterr().out, "Alerting service should be called!"
104 |
--------------------------------------------------------------------------------
/tests/test_scripts.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | import json
4 | import os
5 |
6 | import pydantic as pdt
7 | import pytest
8 | from _pytest import capture as pc
9 |
10 | from bikes import scripts
11 |
12 | # %% SCRIPTS
13 |
14 |
def test_schema(capsys: pc.CaptureFixture[str]) -> None:
    """Print the settings schema and check it is valid JSON on stdout."""
    # given
    argv = ["prog", "--schema"]
    # when
    scripts.main(argv)
    outputs = capsys.readouterr()
    # then
    assert outputs.err == "", "Captured error should be empty!"
    assert json.loads(outputs.out), "Captured output should be a JSON!"
24 |
25 |
@pytest.mark.parametrize(
    "scenario",
    [
        "valid",
        pytest.param(
            "invalid",
            marks=pytest.mark.xfail(
                reason="Invalid config.",
                raises=pdt.ValidationError,
            ),
        ),
    ],
)
def test_main(scenario: str, confs_path: str, extra_config: str) -> None:
    """Run one job per config file of the scenario folder and expect each to succeed.

    The invalid scenario is expected to fail with a pydantic ValidationError (xfail).
    """
    # given
    folder = os.path.join(confs_path, scenario)
    confs = list(sorted(os.listdir(folder)))
    # when
    for conf in confs:  # one job per config
        config = os.path.join(folder, conf)
        argv = [config, "-e", extra_config]
        status = scripts.main(argv=argv)
        # then
        # (fixed: the assertion was outside the loop, so only the last config's status was checked)
        assert status == 0, f"Job should succeed for config: {config}"
50 |
51 |
def test_main__no_configs() -> None:
    """Calling the CLI without any config file should raise a RuntimeError."""
    # given
    empty_args: list[str] = []
    # when
    with pytest.raises(RuntimeError) as error:
        scripts.main(empty_args)
    # then
    assert error.match("No configs provided."), "RuntimeError should be raised!"
60 |
--------------------------------------------------------------------------------
/tests/utils/test_searchers.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | from bikes.core import metrics, models, schemas
4 | from bikes.utils import searchers, splitters
5 |
6 | # %% SEARCHERS
7 |
8 |
def test_grid_cv_searcher(
    model: models.Model,
    metric: metrics.Metric,
    inputs: schemas.Inputs,
    targets: schemas.Targets,
    train_test_splitter: splitters.Splitter,
) -> None:
    """Search a small grid and check the result table, best score, and best params."""
    # given
    grid = {"max_depth": [3, 5, 7]}
    searcher = searchers.GridCVSearcher(param_grid=grid)
    # when
    result, best_score, best_params = searcher.search(
        model=model,
        metric=metric,
        inputs=inputs,
        targets=targets,
        cv=train_test_splitter,
    )
    # then
    # one row per candidate in the grid cross-product
    n_candidates = sum(len(values) for values in grid.values())
    assert set(best_params) == set(grid), "Best params should have the same keys as grid!"
    assert float("-inf") < best_score < float("+inf"), "Best score should be a floating number!"
    assert len(result) == n_candidates, (
        "Results should have one row per candidate!"
    )
33 |
--------------------------------------------------------------------------------
/tests/utils/test_signers.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | from bikes.core import schemas
4 | from bikes.utils import signers
5 |
6 | # %% SIGNERS
7 |
8 |
def test_infer_signer(inputs: schemas.Inputs, outputs: schemas.Outputs) -> None:
    """Infer a model signature and check it covers all input/output columns."""
    # given
    signer = signers.InferSigner()
    # when
    model_signature = signer.sign(inputs=inputs, outputs=outputs)
    signed_inputs = set(model_signature.inputs.input_names())
    signed_outputs = set(model_signature.outputs.input_names())
    # then
    assert signed_inputs == set(inputs.columns), (
        "Signature inputs should contain input column names."
    )
    assert signed_outputs == set(outputs.columns), (
        "Signature outputs should contain output column names."
    )
21 |
--------------------------------------------------------------------------------
/tests/utils/test_splitters.py:
--------------------------------------------------------------------------------
1 | # %% IMPORTS
2 |
3 | from bikes.core import schemas
4 | from bikes.utils import splitters
5 |
6 | # %% SPLITTERS
7 |
8 |
def test_train_test_splitter(inputs: schemas.Inputs, targets: schemas.Targets) -> None:
    """Split once into train/test and check sizes and index validity."""
    # given
    test_size = 50
    splitter = splitters.TrainTestSplitter(shuffle=False, test_size=test_size, random_state=0)
    # when
    n_splits = splitter.get_n_splits(inputs=inputs, targets=targets)
    splits = list(splitter.split(inputs=inputs, targets=targets))
    train_index, test_index = splits[0]  # train/test indexes
    # then
    assert n_splits == len(splits) == 1, "Splitter should return 1 split!"
    assert len(test_index) == test_size, "Test index should have the given size!"
    assert len(train_index) == len(targets) - test_size, (
        "Train index should have the remaining size!"
    )
    assert not inputs.iloc[test_index].empty, "Test index should be a subset of the inputs!"
    assert not targets.iloc[train_index].empty, "Train index should be a subset of the targets!"
29 |
30 |
def test_time_series_splitter(inputs: schemas.Inputs, targets: schemas.Targets) -> None:
    """Split a time series into ordered folds and check sizes and chronological order."""
    # given
    gap, n_folds, test_size = 0, 3, 50
    splitter = splitters.TimeSeriesSplitter(gap=gap, n_splits=n_folds, test_size=test_size)
    # when
    n_splits = splitter.get_n_splits(inputs=inputs, targets=targets)
    splits = list(splitter.split(inputs=inputs, targets=targets))
    # then
    assert n_splits == len(splits), "Splitter should return the given n splits!"
    for i, (train_index, test_index) in enumerate(splits):
        # each fold's train set grows as earlier test windows are folded back in
        expected_train_size = len(inputs) - test_size * (n_splits - i)
        assert len(test_index) == test_size, "Test index should have the given test size!"
        assert len(train_index) == expected_train_size, (
            "Train index should have the cumulative remaining size!"
        )
        assert train_index.max() < test_index.min(), (
            "Train index should always be lower than test index!"
        )
        assert not inputs.iloc[train_index].empty, "Train index should be a subset of the inputs!"
        assert not inputs.iloc[test_index].empty, "Test index should be a subset of the inputs!"
52 |
--------------------------------------------------------------------------------