├── .editorconfig
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── dependabot.yml
│   └── workflows
│       ├── check-links.yml
│       ├── prepare-release.yml
│       ├── publish.yml
│       ├── test_on_master.yml
│       ├── test_on_pr.yml
│       └── test_reusable_workflow.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yml
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── codecov.yml
├── docs
│   ├── Makefile
│   ├── conf.py
│   ├── index.md
│   ├── make.bat
│   └── source
│       ├── 01_introduction
│       │   ├── 01_introduction.md
│       │   ├── 02_motivation.md
│       │   └── index.md
│       ├── 02_getting_started
│       │   ├── 01_installation
│       │   │   ├── 01_installation.md
│       │   │   └── 02_setup.md
│       │   ├── 02_quickstart
│       │   │   ├── 00_intro_tutorial.md
│       │   │   ├── 01_example_project.md
│       │   │   └── 02_first_steps.md
│       │   └── index.md
│       ├── 03_experiment_tracking
│       │   ├── 01_experiment_tracking
│       │   │   ├── 01_configuration.md
│       │   │   ├── 02_version_parameters.md
│       │   │   ├── 03_version_datasets.md
│       │   │   ├── 04_version_models.md
│       │   │   ├── 05_version_metrics.md
│       │   │   └── 06_mlflow_ui.md
│       │   ├── 02_interactive_use
│       │   │   └── 01_notebook_use.md
│       │   └── index.md
│       ├── 04_pipeline_as_model
│       │   ├── 01_pipeline_as_custom_model
│       │   │   ├── 01_mlflow_models.md
│       │   │   ├── 02_scikit_learn_like_pipeline.md
│       │   │   ├── 03_deployment_patterns.md
│       │   │   └── 04_custom_kedro_pipeline_model.md
│       │   ├── 02_framework_ml
│       │   │   ├── 01_why_framework.md
│       │   │   ├── 02_ml_project_components.md
│       │   │   └── 03_framework_solutions.md
│       │   └── index.md
│       ├── 05_API
│       │   ├── 01_python_objects
│       │   │   ├── 01_Datasets.md
│       │   │   ├── 02_Hooks.md
│       │   │   ├── 03_Pipelines.md
│       │   │   ├── 04_CLI.md
│       │   │   └── 05_Configuration.md
│       │   ├── 02_autoapi
│       │   │   ├── kedro_mlflow.config.rst
│       │   │   ├── kedro_mlflow.framework.cli.rst
│       │   │   ├── kedro_mlflow.framework.hooks.rst
│       │   │   ├── kedro_mlflow.io.rst
│       │   │   ├── kedro_mlflow.mlflow.rst
│       │   │   ├── kedro_mlflow.pipeline.rst
│       │   │   └── kedro_mlflow.rst
│       │   └── index.md
│       ├── 06_migration_guide
│       │   ├── index.md
│       │   ├── migration_guide_kedro_experiment_tracking.md
│       │   └── migration_guide_kedro_mlflow.md
│       └── imgs
│           ├── apps_interaction.png
│           ├── blogpost_migrate_experiment_tracking.png
│           ├── default_catalog.png
│           ├── etl_app.png
│           ├── hook_registration_process.png
│           ├── initialized_project.png
│           ├── kedro_viz_params.png
│           ├── logo.png
│           ├── ml_pipeline
│           │   ├── preprocessing
│           │   │   ├── all.PNG
│           │   │   ├── inference.PNG
│           │   │   └── training.PNG
│           │   ├── shared_inputs
│           │   │   ├── all.PNG
│           │   │   ├── inference.PNG
│           │   │   └── training.PNG
│           │   ├── tokenizer
│           │   │   ├── all.PNG
│           │   │   ├── inference.PNG
│           │   │   └── training.PNG
│           │   └── vanilla
│           │       ├── all.PNG
│           │       ├── inference.PNG
│           │       └── training.PNG
│           ├── mlflow_host_page.png
│           ├── mlflow_run.png
│           ├── mlflow_tracking_schema.png
│           ├── mlflow_yml.png
│           ├── once_run_project.png
│           ├── run_with_artifact.png
│           └── updated_catalog.png
├── kedro_mlflow
│   ├── __init__.py
│   ├── config
│   │   ├── __init__.py
│   │   ├── kedro_mlflow_config.py
│   │   └── resolvers.py
│   ├── framework
│   │   ├── __init__.py
│   │   ├── cli
│   │   │   ├── __init__.py
│   │   │   ├── cli.py
│   │   │   └── cli_utils.py
│   │   └── hooks
│   │       ├── __init__.py
│   │       ├── mlflow_hook.py
│   │       └── utils.py
│   ├── io
│   │   ├── __init__.py
│   │   ├── artifacts
│   │   │   ├── __init__.py
│   │   │   └── mlflow_artifact_dataset.py
│   │   ├── catalog
│   │   │   ├── __init__.py
│   │   │   └── switch_catalog_logging.py
│   │   ├── metrics
│   │   │   ├── __init__.py
│   │   │   ├── mlflow_abstract_metric_dataset.py
│   │   │   ├── mlflow_metric_dataset.py
│   │   │   ├── mlflow_metric_history_dataset.py
│   │   │   └── mlflow_metrics_history_dataset.py
│   │   └── models
│   │       ├── __init__.py
│   │       ├── mlflow_abstract_model_dataset.py
│   │       ├── mlflow_model_local_filesystem_dataset.py
│   │       ├── mlflow_model_registry_dataset.py
│   │       └── mlflow_model_tracking_dataset.py
│   ├── mlflow
│   │   ├── __init__.py
│   │   └── kedro_pipeline_model.py
│   ├── pipeline
│   │   ├── __init__.py
│   │   ├── pipeline_ml.py
│   │   └── pipeline_ml_factory.py
│   ├── template
│   │   └── project
│   │       └── mlflow.yml
│   └── utils.py
├── mlc_config.json
├── pyproject.toml
└── tests
    ├── __init__.py
    ├── config
    │   ├── __init__.py
    │   ├── test_get_mlflow_config.py
    │   ├── test_kedro_mlflow_config.py
    │   └── test_resolvers.py
    ├── conftest.py
    ├── framework
    │   ├── __init__.py
    │   ├── cli
    │   │   ├── __init__.py
    │   │   ├── test_cli.py
    │   │   ├── test_cli_modelify.py
    │   │   └── test_cli_utils.py
    │   └── hooks
    │       ├── __init__.py
    │       ├── test_hook_active_run.py
    │       ├── test_hook_deactivate_tracking.py
    │       ├── test_hook_log_artifact.py
    │       ├── test_hook_log_metrics.py
    │       ├── test_hook_log_parameters.py
    │       ├── test_hook_on_pipeline_error.py
    │       ├── test_hook_pipeline_ml.py
    │       ├── test_run_name.py
    │       ├── test_utils_flatten_dict.py
    │       └── test_utils_generate_kedro_command.py
    ├── io
    │   ├── __init__.py
    │   ├── artifacts
    │   │   ├── __init__.py
    │   │   └── test_mlflow_artifact_dataset.py
    │   ├── metrics
    │   │   ├── __init__.py
    │   │   ├── test_mlflow_metric_dataset.py
    │   │   ├── test_mlflow_metric_history_dataset.py
    │   │   └── test_mlflow_metrics_dataset.py
    │   └── models
    │       ├── __init__.py
    │       ├── test_mlflow_model_local_filesystem_dataset.py
    │       ├── test_mlflow_model_registry_dataset.py
    │       └── test_mlflow_model_tracking_dataset.py
    ├── mlflow
    │   ├── __init__.py
    │   └── test_kedro_pipeline_model.py
    ├── pipeline
    │   ├── __init__.py
    │   └── test_pipeline_ml.py
    └── template
        ├── __init__.py
        └── project
            ├── __init__.py
            └── test_mlflow_yml.py
/.editorconfig:
--------------------------------------------------------------------------------
1 | # http://editorconfig.org
2 |
3 | root = true
4 |
5 | [*]
6 | indent_style = space
7 | indent_size = 4
8 | trim_trailing_whitespace = true
9 | insert_final_newline = true
10 | charset = utf-8
11 | end_of_line = lf
12 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: If something isn't working
4 | title: ''
5 | labels: 'Issue: Bug Report'
6 | assignees: ''
7 |
8 | ---
9 |
10 |
11 |
12 | ## Description
13 |
14 |
15 | ## Context
16 |
17 |
18 |
19 | ## Steps to Reproduce
20 |
21 |
26 |
27 | ## Expected Result
28 |
29 |
30 |
31 | ## Actual Result
32 |
33 |
34 |
35 | ```
36 | -- If you received an error, place it here.
37 | ```
38 |
39 | ```
40 | -- Separate them if you have more than one.
41 | ```
42 |
43 | ## Your Environment
44 |
45 |
46 |
47 | * `kedro` and `kedro-mlflow` version used (`pip show kedro` and `pip show kedro-mlflow`):
48 | * Python version used (`python -V`):
49 | * Operating system and version:
50 |
51 | ## Does the bug also happen with the last version on master?
52 |
53 |
62 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Let us know if you have a feature request or enhancement
4 | title: ''
5 | labels: 'Issue: Feature Request'
6 | assignees: ''
7 | ---
8 |
9 |
10 |
11 | ## Description
12 |
13 |
14 | ## Context
15 |
16 |
17 | ## Possible Implementation
18 |
19 |
20 | ## Possible Alternatives
21 |
22 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## Description
2 | Why was this PR created?
3 |
4 | ## Development notes
5 | What have you changed, and how has this been tested?
6 |
7 | ## Checklist
8 |
9 | - [ ] Read the [contributing](https://github.com/Galileo-Galilei/kedro-mlflow/blob/master/CONTRIBUTING.md) guidelines
10 | - [ ] Open this PR as a 'Draft Pull Request' if it is work-in-progress
11 | - [ ] Update the documentation to reflect the code changes
12 | - [ ] Add a description of this change and add your name to the list of supporting contributions in the [`CHANGELOG.md`](https://github.com/Galileo-Galilei/kedro-mlflow/blob/master/CHANGELOG.md) file. Please respect [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) guidelines.
13 | - [ ] Add tests to cover your changes
14 |
15 | ## Notice
16 |
17 | - [ ] I acknowledge and agree that, by checking this box and clicking "Submit Pull Request":
18 |
19 | - I submit this contribution under the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0.txt) and represent that I am entitled to do so on behalf of myself, my employer, or relevant third parties, as applicable.
20 | - I certify that (a) this contribution is my original creation and / or (b) to the extent it is not my original creation, I am authorised to submit this contribution on behalf of the original creator(s) or their licensees.
21 | - I certify that the use of this contribution as authorised by the Apache 2.0 license does not violate the intellectual property rights of anyone else.
22 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: pip
4 | directory: "/"
5 | schedule:
6 | interval: daily
7 | open-pull-requests-limit: 10
8 |
--------------------------------------------------------------------------------
/.github/workflows/check-links.yml:
--------------------------------------------------------------------------------
1 | name: check-links
2 |
3 | on:
4 | push:
5 | branches: [master]
6 | tags:
7 | - '*'
8 | pull_request:
9 | branches: [master]
10 | schedule:
11 | - cron: '0 3 * * 1' # runs at 3 AM every monday
12 |
13 | jobs:
14 | markdown-link-check:
15 | runs-on: ubuntu-latest
16 | steps:
17 | - uses: actions/checkout@v2
18 | - uses: gaurav-nelson/github-action-markdown-link-check@v1
19 | with:
20 | use-quiet-mode: 'yes'
21 | use-verbose-mode: 'yes'
22 | config-file: 'mlc_config.json'
23 |
--------------------------------------------------------------------------------
/.github/workflows/prepare-release.yml:
--------------------------------------------------------------------------------
1 | name: create-release-candidate
2 |
3 | on:
4 | workflow_dispatch:
5 | inputs:
6 | version_part:
7 | description: The part of the version to update (patch, minor or major)
8 | type: choice
9 | options:
10 | - patch
11 | - minor
12 | - major
13 | default: 'patch'
14 | required: true
15 |
16 | jobs:
17 | prepare-release:
18 | runs-on: ubuntu-latest
19 | strategy:
20 | matrix:
21 | python-version: [3.11]
22 | env:
23 | PYTHON_PACKAGE: kedro_mlflow
24 | steps:
25 | - uses: actions/checkout@v4
26 | - name: Install uv
27 | uses: astral-sh/setup-uv@v5
28 | with:
29 | enable-cache: true
30 | - name: Set up Python ${{ matrix.python-version }}
31 | uses: actions/setup-python@v5
32 | with:
33 | python-version: ${{ matrix.python-version }}
34 | - name: Validate inputs
35 | run: |
36 | echo "INPUT_VERSION_PART: ${{ github.event.inputs.version_part }}"
37 | - name: Bump the version number # bump-my-version is a maintained fork of the original bumpversion
38 | id: bump_version
39 | run: |
40 | uv tool install bump-my-version
41 | uvx bump-my-version bump ${{ github.event.inputs.version_part }}
42 | echo "package_version=$(cat $PYTHON_PACKAGE/__init__.py | grep -Po '\d+\.\d+\.\d+')" >> $GITHUB_OUTPUT
43 | - name: Update the CHANGELOG according to 'Keep a Changelog' guidelines
44 | uses: thomaseizinger/keep-a-changelog-new-release@v1
45 | with:
46 | version: ${{ steps.bump_version.outputs.package_version }}
47 | - name: Create a new release branch
48 | run: |
49 | git config user.name github-actions
50 | git config user.email github-actions@github.com
51 | git checkout -b release-${{ steps.bump_version.outputs.package_version }}
52 | git push -u origin release-${{ steps.bump_version.outputs.package_version }}
53 | - name: Commit the changes
54 | run: |
55 | git commit -am "Bump version and CHANGELOG for release ${{ steps.bump_version.outputs.package_version }}"
56 | git push
57 | - name: Open a PR to merge the release to master
58 | id: open_pr
59 | run: |
60 | gh pr create -B master -H release-${{ steps.bump_version.outputs.package_version }} --title "Release ${{ steps.bump_version.outputs.package_version }}" --body "Bump version and CHANGELOG for next release." --assignee "${{ github.repository_owner }}"
61 | echo "pull_request_number=$(gh pr list --base master --json number,createdAt --jq 'sort_by(.createdAt) | reverse | .[0].number')" >> $GITHUB_OUTPUT
62 | env:
63 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
64 | - name: Change the commit message to add PR number
65 | run: |
66 | git commit -a --amend -m ":rocket: Bump version and CHANGELOG for release ${{ steps.bump_version.outputs.package_version }} (#${{ steps.open_pr.outputs.pull_request_number }})"
67 | git push -f
68 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: publish
2 |
3 | on: workflow_dispatch
4 |
5 | jobs:
6 | deploy:
7 | runs-on: ubuntu-latest
8 | environment:
9 | name: release
10 | permissions:
11 | contents: write # IMPORTANT: this permission is mandatory to enable creating a release
12 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
13 | env:
14 | PYTHON_PACKAGE: kedro_mlflow
15 | steps:
16 | - name: Checkout the repo
17 | uses: actions/checkout@v4
18 | with:
19 | fetch-depth: 0 # necessary to enable merging, all the history is needed
20 | - name: Install uv
21 | uses: astral-sh/setup-uv@v5
22 | with:
23 | enable-cache: true
24 | - name: Set up Python
25 | uses: actions/setup-python@v5
26 | with:
27 | python-version: "3.11"
28 | - name: Build package dist from source # A better way will be : https://packaging.python.org/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ but pep 517 is still marked as experimental
29 | run: |
30 | uv build --wheel --sdist
31 | - name: Set dynamically package version as output variable
32 | # see https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-of-setting-an-output-parameter
33 | id: set_package_version
34 | run: |
35 | echo "PACKAGE_VERSION=$(cat $PYTHON_PACKAGE/__init__.py | grep -Po '\d+\.\d+\.\d+')" >> "$GITHUB_OUTPUT"
36 | - name: Create temporary file with the body content for the release
37 | run: |
38 | grep -Poz "## \[${{steps.set_package_version.outputs.PACKAGE_VERSION}}] - \d{4}-\d{2}-\d{2}[\S\s]+?(?=## \[\d+\.\d+\.\d+\]|\[.+\]:)" CHANGELOG.md > release_body.md
39 | - name: Create Release
40 | id: create_release
41 | uses: softprops/action-gh-release@v2
42 | env:
43 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token
44 | with:
45 | tag_name: ${{ steps.set_package_version.outputs.PACKAGE_VERSION }}
46 | name: Release ${{ steps.set_package_version.outputs.PACKAGE_VERSION }}
47 | body_path: ./release_body.md
48 | draft: false
49 | prerelease: false
50 | - name: Rollback Release in case of run failure
51 | if: failure() && steps.create_release.outputs.id != ''
52 | uses: author/action-rollback@stable
53 | with:
54 | # Using a known release ID
55 | release_id: ${{ steps.create_release.outputs.id }}
56 | env:
57 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
58 |
59 | - name: Publish package distributions to PyPI
60 | uses: pypa/gh-action-pypi-publish@release/v1
61 | with:
62 | verbose: true # trace if the upload fails
63 |
--------------------------------------------------------------------------------
/.github/workflows/test_on_master.yml:
--------------------------------------------------------------------------------
1 | name: test
2 |
3 | on:
4 | push:
5 | branches: [master]
6 |
7 | jobs:
8 | lint_and_test:
9 | uses: ./.github/workflows/test_reusable_workflow.yml
10 | strategy:
11 | matrix:
12 | python-version: ["3.9", "3.10", "3.11", "3.12"]
13 | os: [ubuntu-latest, macos-latest, windows-latest]
14 | with:
15 | python-version: ${{ matrix.python-version }}
16 | os: ${{ matrix.os }}
17 | secrets: inherit
18 |
--------------------------------------------------------------------------------
/.github/workflows/test_on_pr.yml:
--------------------------------------------------------------------------------
1 | name: test_on_PR
2 |
3 | on:
4 | pull_request:
5 | branches: [master]
6 |
7 | jobs:
8 | lint_and_test:
9 | uses: ./.github/workflows/test_reusable_workflow.yml
10 | with:
11 | python-version: "3.11"
12 | os: "ubuntu-latest"
13 | secrets: inherit
14 |
--------------------------------------------------------------------------------
/.github/workflows/test_reusable_workflow.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: test
5 |
6 | on:
7 | workflow_call:
8 | inputs:
9 | os:
10 | required: true
11 | type: string
12 | python-version:
13 | required: true
14 | type: string
15 | secrets:
16 | CODECOV_TOKEN:
17 | required: true
18 |
19 | jobs:
20 | lint_and_test:
21 | runs-on: ${{ inputs.os }}
22 | env:
23 | OS: ${{ inputs.os }}
24 | PYTHON_VERSION: ${{ inputs.python-version }}
25 | steps:
26 | - uses: actions/checkout@v3
27 | - name: Install uv
28 | uses: astral-sh/setup-uv@v5
29 | with:
30 | enable-cache: true
31 | - name: Set up Python ${{ inputs.python-version }}
32 | uses: actions/setup-python@v5
33 | with:
34 | python-version: ${{ inputs.python-version }}
35 | - name: Install dependencies
36 | run: |
37 | uv venv
38 | uv pip install .[test]
39 | - name: Check code formatting with ruff
40 | if: inputs.os == 'ubuntu-latest' && inputs.python-version == '3.11' # linting should occur only once in the loop
41 | run: |
42 | uv run ruff format . --check
43 | - name: Check import order and syntax with ruff
44 | if: inputs.os == 'ubuntu-latest' && inputs.python-version == '3.11' # linting should occur only once in the loop
45 | run: |
46 | uv run ruff check .
47 | - name: Test with pytest and generate coverage report
48 | run: |
49 | uv run pytest -x --cov=./ --cov-report=xml -n auto
50 | - name: Upload coverage report to Codecov
51 | uses: codecov/codecov-action@v1
52 | if: inputs.os == 'ubuntu-latest' && inputs.python-version == '3.11' # upload should occur only once in the loop
53 | with:
54 | token: ${{ secrets.CODECOV_TOKEN }} # token is not mandatory but make access more stable
55 | file: ./coverage.xml
56 | env_vars: OS,PYTHON
57 | fail_ci_if_error: true
58 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # vscode
132 | .vscode
133 |
134 | # mlflow
135 | mlruns/
136 |
137 | # ruff
138 | .ruff_cache
139 |
140 | debug/
141 | *.xlsx
142 | *.pptx
143 |
144 | # uv
145 |
146 | uv.lock
147 |
148 | # End of .gitignore
149 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | exclude: ^kedro_mlflow/template/project/run.py$
2 | repos:
3 | - repo: https://github.com/astral-sh/ruff-pre-commit
4 | rev: v0.9.6
5 | hooks:
6 | - id: ruff
7 | args: [--fix, --exit-non-zero-on-fix]
8 | - id: ruff-format
9 | - repo: https://github.com/asottile/blacken-docs
10 | rev: v1.12.1
11 | hooks:
12 | - id: blacken-docs
13 | additional_dependencies: [black==25.1.0]
14 | - repo: https://github.com/pre-commit/pre-commit-hooks
15 | rev: v4.5.0
16 | hooks:
17 | - id: check-case-conflict
18 | - id: check-json
19 | - id: check-merge-conflict
20 | - id: check-toml
21 | - id: check-yaml
22 | - id: debug-statements
23 | - id: end-of-file-fixer
24 | - id: mixed-line-ending
25 | args: [--fix=lf]
26 | - id: trailing-whitespace
27 | args: [--markdown-linebreak-ext=md]
28 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # required since 2023/10 https://blog.readthedocs.com/use-build-os-config/
9 | build:
10 | os: "ubuntu-22.04"
11 | tools:
12 | python: "3.11"
13 |
14 | # Build documentation in the docs/ directory with Sphinx
15 | sphinx:
16 | configuration: docs/conf.py
17 |
18 |
19 | # Optionally build your docs in additional formats such as PDF
20 | formats:
21 | - pdf
22 |
23 | # Optionally set the version of Python and requirements required to build your docs
24 | python:
25 | install:
26 | - method: pip
27 | path: .
28 | extra_requirements:
29 | - doc
30 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 |
2 | # Contributor Covenant Code of Conduct
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.
6 |
7 | ## Our Standards
8 |
9 | Examples of behavior that contributes to creating a positive environment include:
10 |
11 | - Using welcoming and inclusive language
12 | - Being respectful of differing viewpoints and experiences
13 | - Gracefully accepting constructive criticism
14 | - Focusing on what is best for the community
15 | - Showing empathy towards other community members
16 |
17 | Examples of unacceptable behavior by participants include:
18 |
19 | - The use of sexualized language or imagery and unwelcome sexual attention or advances
20 | - Trolling, insulting/derogatory comments, and personal or political attacks
21 | - Public or private harassment
22 | - Publishing others' private information, such as a physical or electronic address, without explicit permission
23 | - Other conduct which could reasonably be considered inappropriate in a professional setting
24 |
25 | ## Our Responsibilities
26 |
27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
28 |
29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
30 |
31 | ## Scope
32 |
33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
34 |
35 | ## Enforcement
36 |
37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at galileo.galilei.github@gmail.com. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
38 |
39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
40 | ## Attribution
41 |
42 | This Code of Conduct is adapted from the Contributor Covenant, version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
43 |
44 | For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq
45 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Development workflow
2 |
3 | The current workflow is the following:
4 |
5 | 1. Open an issue to describe your feature request or your bug fix with a detailed explanation of what you want to achieve.
6 | 2. Fork the repo
7 | 3. Develop locally:
8 | - Install the pre-commit hooks (`pip install pre-commit`, then `pre-commit install`)
9 | - Create a branch based on the master branch (``git checkout -b <branch_name> master``)
10 | - Create a conda environment (``conda create -n <env_name> python==3.10``)
11 | - Activate this environment (``conda activate <env_name>``)
12 | - Install the extra dependencies for tests (`pip install kedro-mlflow[dev,test]`)
13 | - Apply your changes
14 | - Run pre-commit (ruff formatting and linting with ``pre-commit run``)
15 | 4. Submit your changes:
16 | - Ensure test coverage is still 100%
17 | - Update documentation accordingly
18 | - Update `CHANGELOG.md` according to ["Keep a Changelog" guidelines](https://keepachangelog.com/en/1.0.0/)
19 | - Squash all the changes within a single commit as much as possible, and ensure the commit message has the format "[:gitmoji_icon:](https://gitmoji.dev/) Informative description (``#``)"
20 | - Rebase your branch on ``master`` to ensure linear history
21 | - Open a pull request against ``master``
22 | 5. Ask for review:
23 | - Assign the review @Galileo-Galilei
24 | - Wait for review
25 | - Resolve all discussions (go back to step 3.)
26 | 6. The PR will be merged as soon as possible
27 |
28 | **We reserve the right to take over (close or modify) PRs that do not match the workflow or are abandoned.**
29 |
30 |
31 | # Release workflow
32 |
33 | 1. Check the issues:
34 | - Ensure all the [release issues](https://github.com/Galileo-Galilei/kedro-mlflow/milestones) are completed. Move any issues that have not been addressed yet to a later release.
35 | - Create a [new milestone](https://github.com/Galileo-Galilei/kedro-mlflow/milestones)
36 | 2. Create the release candidate:
37 | - Go to the [create-release-candidate action](https://github.com/Galileo-Galilei/kedro-mlflow/actions?query=workflow%3Acreate-release-candidate)
38 | - Click "Run workflow"
39 | - Enter the part of the version to bump (one of ``patch``, ``minor`` or ``major``)
40 | 3. If the workflow has run successfully:
41 | - Go to the newly opened PR named "[Release candidate ``](https://github.com/Galileo-Galilei/kedro-mlflow/pulls)"
42 | - Check that the changelog and version have been properly updated.
43 | - *(If everything is normal, skip this step)* If necessary, pull the branch and make changes
44 | - Merge the PR to master
45 | 4. Check the [publish workflow](https://github.com/Galileo-Galilei/kedro-mlflow/actions?query=workflow%3Apublish) to see whether:
46 | - The package has been uploaded to PyPI successfully
47 | - A GitHub release has been created
48 | 5. If the pipeline has failed, raise an issue to fix the CI, and perform the merge to master manually.
49 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | ignore:
2 | - "setup.py"
3 | - "tests/**/*"
4 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 |
17 |
18 | # -- Project information -----------------------------------------------------
19 | from datetime import datetime
20 |
21 | from kedro_mlflow import __version__ as km_version
22 |
23 | project = "kedro-mlflow"
24 | copyright = f"{datetime.today().year}, Yolan Honoré-Rougé"
25 | author = "Yolan Honoré-Rougé"
26 |
27 |
28 | # The full version, including alpha/beta/rc tags
29 | release = km_version
30 |
31 |
32 | # -- General configuration ---------------------------------------------------
33 |
34 | # Add any Sphinx extension module names here, as strings. They can be
35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
36 | # ones.
37 | extensions = [
38 | "sphinx.ext.autodoc",
39 | "sphinx.ext.autosummary",
40 | "sphinx.ext.napoleon",
41 | "sphinx_click",
42 | # "sphinx_autodoc_typehints",
43 | # "sphinx.ext.doctest",
44 | # "sphinx.ext.todo",
45 | # "sphinx.ext.coverage",
46 | # "sphinx.ext.mathjax",
47 | # "sphinx.ext.ifconfig",
48 | # "sphinx.ext.viewcode",
49 | # "nbsphinx",
50 | "sphinx_design", # responsive web component support
51 | "sphinx_copybutton",
52 | "sphinx_markdown_tables",
53 | "myst_parser",
54 | ]
55 |
56 | myst_enable_extensions = ["colon_fence"]
57 |
58 | # enable autosummary plugin (table of contents for modules/classes/class
59 | # methods)
60 | autosummary_generate = True
61 | autosummary_generate_overwrite = False
62 | napoleon_include_init_with_doc = True
63 |
64 | # enable documentation in markdown
65 | source_suffix = {".rst": "restructuredtext", ".md": "markdown"}
66 |
67 |
68 | # List of patterns, relative to source directory, that match files and
69 | # directories to ignore when looking for source files.
70 | # This pattern also affects html_static_path and html_extra_path.
71 | exclude_patterns = ["_build"]
72 |
73 |
74 | # -- Options for HTML output -------------------------------------------------
75 | # The name of the Pygments (syntax highlighting) style to use.
76 | pygments_style = "sphinx"
77 |
78 | # The theme to use for HTML and HTML Help pages. See the documentation for
79 | # a list of builtin themes.
80 |
81 | html_theme = "pydata_sphinx_theme" # see: https://pydata-sphinx-theme.readthedocs.io/en/stable/user_guide/layout.html
82 |
83 | # useful to create dropdown with the name of the directory as the section name
84 | # see https://stackoverflow.com/questions/36925871/toctree-nested-drop-down:
85 | html_theme_options = {
86 | "logo": {
87 | "image_light": "source/imgs/logo.png",
88 | "image_dark": "source/imgs/logo.png",
89 | },
90 | # https://pydata-sphinx-theme.readthedocs.io/en/stable/user_guide/header-links.html#fontawesome-icons
91 | "icon_links": [
92 | {
93 | "name": "GitHub",
94 | "url": "https://github.com/Galileo-Galilei/kedro-mlflow",
95 | "icon": "fa-brands fa-github",
96 | },
97 | {
98 | "name": "PyPI",
99 | "url": "https://pypi.org/project/kedro-mlflow/",
100 | "icon": "fa-brands fa-python",
101 | },
102 | {
103 | "name": "Slack",
104 | "url": "https://kedro-org.slack.com/",
105 | "icon": "fa-brands fa-slack",
106 | },
107 | ],
108 | "navbar_start": ["navbar-logo"], # "version-switcher" to be configured
109 | "navbar_align": "content",
110 | "header_links_before_dropdown": 4,
111 | "secondary_sidebar_items": ["page-toc", "edit-this-page", "sourcelink"],
112 | "use_edit_page_button": True,
113 | }
114 | html_context = {
115 | "github_user": "Galileo-Galilei",
116 | "github_repo": "kedro-mlflow",
117 | "github_version": "master",
118 | "doc_path": "docs/", # why not "docs/source/"?
119 | "default_mode": "light",
120 | }
121 | html_sidebars = {"index": []}
122 |
123 |
124 | myst_heading_anchors = 5
125 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | myst:
3 | html_meta:
4 | "description lang=en": |
5 | Top-level documentation for kedro-mlflow, with links to the rest
6 | of the site.
7 | html_theme.sidebar_secondary.remove: true
8 | ---
9 |
10 | # The kedro-mlflow plugin
11 |
12 | ```kedro-mlflow``` is a Kedro [plugin](https://docs.kedro.org/en/stable/extend_kedro/plugins.html) to integrate [MLflow](https://www.mlflow.org/) effortlessly inside [Kedro](https://kedro.org/) projects.
13 |
14 | Its main features are **automatic parameters tracking**, **datasets tracking as artifacts**, Kedro **pipelines packaging** and serving, and **automatic synchronisation between training and inference** pipelines. It aims to provide a complete yet modular framework for high reproducibility of machine learning experiments and ease of deployment.
15 |
16 | ::::{grid} 1 1 2 2
17 | :gutter: 3
18 |
19 | :::{grid-item-card}
20 | :link: source/03_experiment_tracking/01_experiment_tracking/01_configuration.html
21 | :link-type: url
22 | :class-header: bg-light
23 |
24 | {fas}`flask fa-xl;pst-color-primary` Experiment tracking
25 | ^^^
26 |
27 | Track the **parameters**, **metrics**, **artifacts** and **models** of your kedro pipelines for reproducibility.
28 | :::
29 |
30 | :::{grid-item-card}
31 | :link: source/04_pipeline_as_model/01_pipeline_as_custom_model/01_mlflow_models.html
32 | :link-type: url
33 | :class-header: bg-light
34 |
35 | {fas}`rocket fa-xl;pst-color-primary` Pipeline as model
36 | ^^^
37 |
38 | Package any kedro pipeline to a **custom mlflow model** for deployment and serving. The custom model for an inference pipeline can be **registered** in mlflow **automatically** at the end of each training in a *scikit-learn* like way.
39 | :::
40 |
41 | ::::
42 |
43 | ## Resources
44 |
45 | ::::{grid} 1 1 4 4
46 | :gutter: 3
47 |
48 | :::{grid-item-card}
49 | :link: source/02_getting_started/01_installation/01_installation.html
50 | :link-type: url
51 | :class-header: bg-light
52 |
53 | {fas}`fa-solid fa-graduation-cap fa-xl;pst-color-primary` Quickstart
54 | ^^^
55 |
56 | Get started in **1 mn** with experiment tracking!
57 | +++
58 | Try out {fas}`arrow-right fa-xl`
59 | :::
60 |
61 | :::{grid-item-card}
62 | :link: https://github.com/Galileo-Galilei/kedro-mlflow-tutorial
63 | :link-type: url
64 | :class-header: bg-light
65 |
66 | {fas}`fa-solid fa-chalkboard-user fa-xl;pst-color-primary` Advanced tutorial
67 | ^^^
68 |
69 | The ``kedro-mlflow-tutorial`` GitHub repo contains a step-by-step tutorial to learn how to use kedro-mlflow as an MLOps framework!
70 |
71 | +++
72 | Try on github {fab}`github;fa-xl`
73 | :::
74 |
75 | :::{grid-item-card}
76 | :link: https://www.youtube.com/watch?v=Az_6UKqbznw
77 | :link-type: url
78 | :class-header: bg-light
79 |
80 | {fas}`fa-solid fa-video fa-xl;pst-color-primary` Demonstration in video
81 | ^^^
82 |
83 | A YouTube video by the Kedro team introducing the plugin, with live coding.
84 |
85 | +++
86 | Watch on youtube {fab}`youtube;fa-xl`
87 | :::
88 |
89 | :::{grid-item-card}
90 | :link: https://youtu.be/mIfJR3CdBUE
91 | :link-type: url
92 | :class-header: bg-light
93 |
94 | {fas}`fa-solid fa-video fa-xl;pst-color-primary` Tackling the ML Reproducibility Curse
95 | ^^^
96 |
97 | A community video by [Oleg Litvinov](https://github.com/OlegBEZb) showcasing how to use the Kedro-MLflow plugin on an end-to-end project.
98 | +++
99 | Watch on YouTube {fab}`youtube;fa-xl`
100 | :::
101 | ::::
102 |
103 | ```{toctree}
104 | ---
105 | maxdepth: 1
106 | hidden: true
107 | ---
108 | source/01_introduction/index
109 | source/02_getting_started/index
110 | source/03_experiment_tracking/index
111 | source/04_pipeline_as_model/index
112 | source/05_API/index
113 | Changelog
114 | source/06_migration_guide/index
115 | ```
116 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/source/01_introduction/02_motivation.md:
--------------------------------------------------------------------------------
1 | # Motivation behind the plugin
2 |
3 | ## When should I use kedro-mlflow?
4 |
5 | Basically, you should use `kedro-mlflow` in **any `Kedro` project which involves machine learning** / deep learning. As stated in the [introduction](https://kedro-mlflow.readthedocs.io/en/latest/source/01_introduction/index.html), `Kedro`'s current versioning (as of version `0.19.10`) is not sufficient for machine learning projects: it lacks a UI and a ``run`` management system. Besides, the `KedroPipelineModel` ability to serve a kedro pipeline as an API or a batch in one line of code is a great addition for collaboration and transition to production.
6 |
7 | If you do not use ``Kedro`` or if you do pure data processing which does not involve *machine learning*, this plugin is not what you are looking for ;-)
8 |
9 | ## Why should I use kedro-mlflow?
10 |
11 | ### Benchmark of existing solutions
12 |
13 | This paragraph gives a (quick) overview of existing solutions for mlflow integration inside Kedro projects.
14 |
15 | ``Mlflow`` is very simple to add to any existing code. It is a 2-step process:
16 |
17 | - add `log_{XXX}` (either param, artifact, metric or model) functions where they are needed inside the code
18 | - add a `MLProject` at the root of the project to enable CLI execution. This file must contain all the possible execution steps (like the `pipeline.py` / `hooks.py` in a kedro project).
19 |
20 | Including mlflow inside a ``kedro project`` is consequently very easy: the logging functions can be added in the code, and the ``MLProject`` is very simple and is composed almost only of the ``kedro run`` command. You can find examples of such implementations:
21 |
22 | - the [medium paper](https://medium.com/quantumblack/deploying-and-versioning-data-pipelines-at-scale-942b1d81b5f5) by QuantumBlack employees.
23 | - the associated [github repo](https://github.com/tgoldenberg/kedro-mlflow-example)
24 | - other examples can be found on GitHub, but AFAIK all of them follow the very same principles.
25 |
26 | ### Enforcing Kedro principles
27 |
28 | Above implementations have the advantage of being very straightforward and *mlflow compliant*, but they break several ``Kedro`` principles:
29 |
30 | - the ``MLFLOW_TRACKING_URI`` which registers the database where runs are logged is declared inside the code instead of a configuration file, which **hinders portability across environments** and makes transition to production more difficult
31 | - the logging of different elements can be put in many places in the ``Kedro`` template (in the code of any function involved in a ``node``, in a ``Hook``, in the ``ProjectContext``, in a ``transformer``...). This is not compliant with the ``Kedro`` template where any object has a dedicated location. We want to avoid the logging to occur anywhere because:
32 | - it is **very error-prone** (one can forget to log one parameter)
33 | - it is **hard to modify** (if you want to remove / add / modify an mlflow action you have to find it in the code)
34 | - it **prevents reuse** (re-usable function must not contain mlflow specific code unrelated to their functional specificities, only their execution must be tracked).
35 |
36 | ``kedro-mlflow`` enforces these best practices while implementing a clear interface for each mlflow action in the Kedro template. The chart below maps each mlflow action to the Python API provided by ``kedro-mlflow`` and the location in the Kedro template where the action should be performed.
37 |
38 | | Mlflow action | Template file | Python API |
39 | | :------------------------ | :-------------- | :--------------------------------------------------------- |
40 | | Set up configuration | ``mlflow.yml`` | ``MlflowHook`` |
41 | | Logging parameters | ``mlflow.yml`` | ``MlflowHook`` |
42 | | Logging artifacts | ``catalog.yml`` | ``MlflowArtifactDataset`` |
43 | | Logging models | ``catalog.yml`` | `MlflowModelTrackingDataset` and `MlflowModelLocalFileSystemDataset` |
44 | | Logging metrics | ``catalog.yml`` | ``MlflowMetricsHistoryDataset`` |
45 | | Logging Pipeline as model | ``hooks.py`` | ``KedroPipelineModel`` and ``pipeline_ml_factory`` |
46 |
47 | `kedro-mlflow` does not currently provide an interface to set tags outside a Kedro ``Pipeline``. Some of the above choices are subject to debate and design decisions (for instance, metrics are often updated in a loop during each epoch / training iteration and it does not always make sense to register the metric between computation steps, e.g. as an I/O operation after a node run).
48 |
49 | ```{note}
50 | You do **not** need any ``MLProject`` file to use mlflow inside your Kedro project. As seen in the [introduction](https://kedro-mlflow.readthedocs.io/en/latest/source/01_introduction/index.html), this file overlaps with Kedro configuration files.
51 | ```
52 |
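53 | To make the table above more concrete, here is a minimal sketch of the Python API behind the "Logging artifacts" row. It is an illustration rather than an excerpt from the official docs: the wrapped dataset class and its import path (``kedro_datasets.pandas.CSVDataset``) are assumptions that depend on your ``kedro`` and ``kedro-mlflow`` versions.
54 |
55 | ```python
56 | # Hypothetical sketch: the catalog.yml declaration of an MlflowArtifactDataset has a Python equivalent.
57 | import pandas as pd
58 |
59 | from kedro_datasets.pandas import CSVDataset  # assumed import path, check your kedro version
60 | from kedro_mlflow.io.artifacts import MlflowArtifactDataset
61 |
62 | # Wrap a regular kedro dataset: saving it writes the csv file locally
63 | # and also logs it as an artifact of the mlflow run.
64 | versioned_csv = MlflowArtifactDataset(
65 |     dataset=dict(type=CSVDataset, filepath="data/08_reporting/report.csv")
66 | )
67 | versioned_csv.save(pd.DataFrame({"a": [1, 2], "b": [3, 4]}))
68 | ```
69 |
70 | In a real project the same declaration lives in ``catalog.yml``, which is why the table maps artifact logging to that file rather than to Python code.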
--------------------------------------------------------------------------------
/docs/source/01_introduction/index.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 | ```{toctree}
4 | :caption: Introduction to kedro-mlflow
5 |
6 | 01_introduction
7 | 02_motivation
8 | ```
9 |
--------------------------------------------------------------------------------
/docs/source/02_getting_started/01_installation/01_installation.md:
--------------------------------------------------------------------------------
1 | # Installation guide
2 |
3 | ## Pre-requisites
4 |
5 | ### Create a virtual environment
6 |
7 | I strongly recommend creating a virtual environment in order to avoid version conflicts between packages. I use ``conda`` in this tutorial.
8 |
9 | I also recommend reading the [Kedro installation guide](https://kedro.readthedocs.io/en/latest/get_started/install.html) to set up your Kedro project.
10 |
11 | ```console
12 | conda create -n <env_name> python=<3.[6-8].X>
13 | ```
14 |
15 | For the rest of the section, we assume the environment is activated:
16 |
17 | ```console
18 | conda activate <env_name>
19 | ```
20 |
21 | ### Check your kedro version
22 |
23 | If you have an existing environment with kedro already installed, make sure its version is above `0.16.0`. `kedro-mlflow` cannot be used with `kedro<0.16.0`, and if you install it in an existing environment, it will reinstall a more up-to-date version of kedro and likely mess your project up until you reinstall the proper version of kedro (the one you originally created the project with).
24 |
25 | ```console
26 | pip show kedro
27 | ```
28 |
29 | should return:
30 |
31 | ```console
32 | Name: kedro
33 | Version: # <-- make sure it is above 0.16.0, <0.17.0
34 | Summary: Kedro helps you build production-ready data and analytics pipelines
35 | Home-page: https://github.com/quantumblacklabs/kedro
36 | Author: QuantumBlack Labs
37 | Author-email: None
38 | License: Apache Software License (Apache 2.0)
39 | Location: <...>\anaconda3\envs\\lib\site-packages
40 | Requires: pip-tools, cachetools, fsspec, toposort, anyconfig, PyYAML, click, pluggy, jmespath, python-json-logger, jupyter-client, setuptools, cookiecutter
41 | ```
42 |
43 | ## Install the plugin
44 |
45 | There are versions of the plugin compatible with ``kedro>=0.16.0`` and ``mlflow>=0.8.0``. ``kedro-mlflow`` stops adding features to a minor version 2 to 6 months after a new kedro release.
46 |
47 | ::::{tab-set}
48 |
49 | :::{tab-item} Install with pip / uv
50 |
51 | You can install ``kedro-mlflow`` plugin from ``PyPi`` with `pip`:
52 |
53 | ```console
54 | pip install --upgrade kedro-mlflow
55 | ```
56 |
57 | If you prefer uv and have it installed, you can use:
58 |
59 | ```console
60 | uv pip install --upgrade kedro-mlflow
61 | ```
62 |
63 |
64 | :::
65 |
66 | :::{tab-item} Install with conda / mamba / micromamba
67 |
68 | You can install ``kedro-mlflow`` plugin with `conda` from the ``conda-forge`` channel:
69 |
70 | ```console
71 | conda install kedro-mlflow -c conda-forge
72 | ```
73 |
74 | :::
75 |
76 | :::{tab-item} Install from github
77 |
78 | You may want to install the master branch from source which has unreleased features:
79 |
80 | ```console
81 | pip install git+https://github.com/Galileo-Galilei/kedro-mlflow.git
82 | ```
83 |
84 | :::
85 |
86 | ::::
87 |
88 |
89 | ## Check the installation
90 |
91 | Enter ``kedro info`` in a terminal with the activated virtual env to check the installation. If it has succeeded, you should see the following ascii art:
92 |
93 | ```console
94 | _ _
95 | | | _____ __| |_ __ ___
96 | | |/ / _ \/ _` | '__/ _ \
97 | | < __/ (_| | | | (_) |
98 | |_|\_\___|\__,_|_| \___/
99 | v0..
100 |
101 | kedro allows teams to create analytics
102 | projects. It is developed as part of
103 | the Kedro initiative at QuantumBlack.
104 |
105 | Installed plugins:
106 | kedro_mlflow: 0.14.0 (hooks:global,project)
107 | ```
108 |
109 | The version ``0.14.0`` of the plugin is installed and has both global and project commands.
110 |
111 | That's it! You are now ready to go!
112 |
113 | ## Available commands
114 |
115 | With the ``kedro mlflow -h`` command outside of a kedro project, you now see the following output:
116 |
117 | ```console
118 | Usage: kedro mlflow [OPTIONS] COMMAND [ARGS]...
119 |
120 | Use mlflow-specific commands inside kedro project.
121 |
122 | Options:
123 | -h, --help Show this message and exit.
124 | ```
125 |
--------------------------------------------------------------------------------
/docs/source/02_getting_started/01_installation/02_setup.md:
--------------------------------------------------------------------------------
1 | # Initialize your Kedro project
2 |
3 | This section assumes that [you have installed `kedro-mlflow` in your virtual environment](https://kedro-mlflow.readthedocs.io/en/latest/source/02_getting_started/01_installation/01_installation.html).
4 |
5 | ## Create a kedro project
6 |
7 | This plugin must be used in an existing kedro project. If you do not have a kedro project yet, you can create it with ``kedro new`` command. [See the kedro docs for a tutorial](https://kedro.readthedocs.io/en/latest/get_started/new_project.html).
8 |
9 | If you do not have a real-world project, you can use a kedro example and [follow the "Quickstart in 1 mn" example](https://kedro-mlflow.readthedocs.io/en/latest/source/02_getting_started/02_quickstart/01_example_project.html) to make a demo of this plugin out of the box.
10 |
11 | ## Activate `kedro-mlflow` in your kedro project
12 |
13 | In order to use the ``kedro-mlflow`` plugin, you need to setup its configuration and declare its hooks.
14 |
15 | ### Setting up the ``kedro-mlflow`` configuration file
16 |
17 |
18 | ``kedro-mlflow`` is [configured](https://kedro-mlflow.readthedocs.io/en/latest/source/05_API/01_python_objects/05_Configuration.html) through an ``mlflow.yml`` file. The recommended way to initialize the `mlflow.yml` is by using [the ``kedro-mlflow`` CLI](https://kedro-mlflow.readthedocs.io/en/latest/source/05_API/01_python_objects/04_CLI.html), but you can create it manually.
19 |
20 | ```{note}
21 | Since ``kedro-mlflow>=0.11.2``, the configuration file is optional. However, the plugin will then use the default ``mlflow`` configuration. Specifically, the runs will be stored in an ``mlruns`` folder at the root of the kedro project since no ``mlflow_tracking_uri`` is configured.
22 | ```
23 |
24 | Set the working directory at the root of your kedro project:
25 |
26 | ```console
27 | cd path/to/your/project
28 | ```
29 |
30 | Run the init command:
31 |
32 | ```console
33 | kedro mlflow init
34 | ```
35 |
36 | you should see the following message:
37 |
38 | ```console
39 | 'conf/local/mlflow.yml' successfully updated.
40 | ```
41 |
42 | *Note: you can create the configuration file in another kedro environment with the `--env` argument:*
43 |
44 | ```console
45 | kedro mlflow init --env=
46 | ```
47 |
48 | ### Declaring ``kedro-mlflow`` hooks
49 |
50 | ``kedro_mlflow`` hooks implementations must be registered with Kedro. There are 2 ways of registering [hooks](https://kedro.readthedocs.io/en/latest/hooks/introduction.html).
51 |
52 | ```{important}
53 | You must register the hook provided by ``kedro-mlflow`` (the ``MlflowHook``) to make the plugin work.
54 | ```
55 |
56 | ::::{tab-set}
57 |
58 | :::{tab-item} `kedro>=0.16.4` - auto-discovery
59 |
60 | If you use `kedro>=0.16.4`, `kedro-mlflow` hooks are auto-registered by default without any action on your side. You can [disable this behaviour](https://kedro.readthedocs.io/en/latest/hooks/introduction.html#disable-auto-registered-plugins-hooks) in your `settings.py` file.
61 |
62 | :::
63 |
64 | :::{tab-item} `kedro>=0.16.0, <=0.16.3` - register in ``settings.py``
65 |
66 | If you have turned off plugin automatic registration, you can register its hooks manually by [adding them to ``settings.py``](https://kedro.readthedocs.io/en/latest/hooks/introduction.html#registering-your-hook-implementations-with-kedro):
67 |
68 | ```python
69 | # /src//settings.py
70 | from kedro_mlflow.framework.hooks import MlflowHook
71 |
72 | HOOKS = (MlflowHook(),)
73 | ```
74 |
75 | :::
76 |
77 | ::::
78 |
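79 | For completeness, here is a minimal sketch of how to opt out of the auto-registration mentioned in the first tab. It relies on kedro's standard ``DISABLE_HOOKS_FOR_PLUGINS`` setting rather than on anything specific to this plugin, so treat it as an illustrative assumption and check the kedro documentation for your version:
80 |
81 | ```python
82 | # /src/<your_package>/settings.py
83 | # Illustrative sketch: tell kedro not to auto-register the hooks shipped by the kedro-mlflow plugin.
84 | DISABLE_HOOKS_FOR_PLUGINS = ("kedro-mlflow",)
85 | ```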
--------------------------------------------------------------------------------
/docs/source/02_getting_started/02_quickstart/00_intro_tutorial.md:
--------------------------------------------------------------------------------
1 | # Goal of the tutorial
2 |
3 | This "Getting started" section demonstrates how to use some basic functionalities of `kedro-mlflow` in an end to end example. It is supposed to be simple and self-contained and is partially redundant with other sections, but far from complete.
4 |
5 | The **section only focuses on the experiment tracking** part and **does _not_ show the "machine learning framework" abilities** of the plugin. The goal is to give a new user a quick glance at some capabilities so that they can decide whether the plugin suits their needs or not. It is totally worth checking the other sections to get a much more complete overview of what this plugin provides.
6 |
--------------------------------------------------------------------------------
/docs/source/02_getting_started/02_quickstart/01_example_project.md:
--------------------------------------------------------------------------------
1 | # Example project
2 |
3 | ## Install the plugin in a virtual environment
4 |
5 | Create a conda environment and install ``kedro-mlflow`` (this will automatically install ``kedro>=0.16.0``).
6 |
7 | ```console
8 | conda create -n km_example python=3.10 --yes
9 | conda activate km_example
10 | pip install kedro-mlflow
11 | ```
12 |
13 | ## Install the toy project
14 |
15 | For this end to end example, we will use the [kedro starter](https://docs.kedro.org/en/stable/starters/starters.html#official-kedro-starters) with the [iris dataset](https://github.com/kedro-org/kedro-starters).
16 |
17 | We use this project because:
18 |
19 | - it covers most of the common use cases
20 | - it is compatible with older versions of ``Kedro`` so newcomers are used to it
21 | - it is maintained by ``Kedro`` maintainers and therefore enforces some best practices.
22 |
23 |
24 | ::::{tab-set}
25 |
26 | :::{tab-item} ``kedro>=0.19.0``
27 |
28 | ```{warning}
29 | For ``kedro>=0.19.0``, the ``pandas-iris`` starter has been removed. It is recommended to use the [``spaceflights-pandas`` starter](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights-pandas) instead.
30 | ```
31 |
32 | :::
33 |
34 | :::{tab-item} ``kedro>=0.16.3,<0.19``
35 |
36 | The default starter is now called "pandas-iris". In a new console, enter:
37 |
38 | ```console
39 | kedro new --starter=pandas-iris
40 | ```
41 |
42 | Answer ``Kedro Mlflow Example``, ``km-example`` and ``km_example`` to the three setup questions of a new kedro project:
43 |
44 | ```console
45 | Project Name:
46 | =============
47 | Please enter a human readable name for your new project.
48 | Spaces and punctuation are allowed.
49 | [New Kedro Project]: Kedro Mlflow Example
50 |
51 | Repository Name:
52 | ================
53 | Please enter a directory name for your new project repository.
54 | Alphanumeric characters, hyphens and underscores are allowed.
55 | Lowercase is recommended.
56 | [kedro-mlflow-example]: km-example
57 |
58 | Python Package Name:
59 | ====================
60 | Please enter a valid Python package name for your project package.
61 | Alphanumeric characters and underscores are allowed.
62 | Lowercase is recommended. Package name must start with a letter or underscore.
63 | [kedro_mlflow_example]: km_example
64 | ```
65 |
66 | :::
67 |
68 | :::{tab-item} ``kedro>=0.16.0, <=0.16.2``
69 |
70 | With older versions of ``Kedro``, the starter option is not available, but the ``kedro new`` command provides an "Include example" question. Answer ``y`` to this question to get the same starter as above. In a new console, enter:
71 |
72 | ```console
73 | kedro new
74 | ```
75 |
76 | Answer ``Kedro Mlflow Example``, ``km-example``, ``km_example`` and ``y`` to the four setup questions of a new kedro project:
77 |
78 | ```console
79 | Project Name:
80 | =============
81 | Please enter a human readable name for your new project.
82 | Spaces and punctuation are allowed.
83 | [New Kedro Project]: Kedro Mlflow Example
84 |
85 | Repository Name:
86 | ================
87 | Please enter a directory name for your new project repository.
88 | Alphanumeric characters, hyphens and underscores are allowed.
89 | Lowercase is recommended.
90 | [kedro-mlflow-example]: km-example
91 |
92 | Python Package Name:
93 | ====================
94 | Please enter a valid Python package name for your project package.
95 | Alphanumeric characters and underscores are allowed.
96 | Lowercase is recommended. Package name must start with a letter or underscore.
97 | [kedro_mlflow_example]: km_example
98 |
99 | Generate Example Pipeline:
100 | ==========================
101 | Do you want to generate an example pipeline in your project?
102 | Good for first-time users. (default=N)
103 | [y/N]: y
104 | ```
105 |
106 | :::
107 |
108 | ::::
109 |
110 | ## Install dependencies
111 |
112 | Move to the project directory:
113 |
114 | ```console
115 | cd km-example
116 | ```
117 |
118 | Install the project dependencies:
119 |
120 | ```{warning}
121 | Do not use the ``kedro install`` command, which [does not install the packages in your activated environment](https://github.com/quantumblacklabs/kedro/issues/589). It has been removed in ``kedro>=0.19``.
122 | ```
123 |
124 | ```console
125 | pip install -r src/requirements.txt
126 | ```
127 |
--------------------------------------------------------------------------------
/docs/source/02_getting_started/index.md:
--------------------------------------------------------------------------------
1 | # {octicon}`mortar-board` Getting started
2 |
3 | ```{toctree}
4 | :caption: Installation
5 |
6 | 01_installation/01_installation
7 | 01_installation/02_setup
8 | 01_installation/03_migration_guide
9 | ```
10 |
11 | ```{toctree}
12 | :caption: Quickstart
13 |
14 | 02_quickstart/00_intro_tutorial
15 | 02_quickstart/01_example_project
16 | 02_quickstart/02_first_steps
17 | ```
18 |
--------------------------------------------------------------------------------
/docs/source/03_experiment_tracking/01_experiment_tracking/02_version_parameters.md:
--------------------------------------------------------------------------------
1 | # Track parameters
2 |
3 | ## Automatic parameters tracking
4 |
5 | Parameters tracking is automatic when the ``MlflowHook`` is added to [the hook list of the ``ProjectContext``](https://kedro-mlflow.readthedocs.io/en/latest/source/02_getting_started/01_installation/02_setup.html). The `mlflow.yml` configuration file has a parameter called ``flatten_dict_params`` which enables you to [log the (key, value) pairs of a ``dict`` parameter as distinct parameters](https://kedro-mlflow.readthedocs.io/en/latest/source/05_API/01_python_objects/02_Hooks.html).
6 |
7 | You **do not need any additional configuration** to benefit from parameters versioning.
8 |
9 | ```{hint}
10 |
11 | **How does ``MlflowHook`` operate under the hood?**
12 |
13 | The [medium post which introduces hooks](https://medium.com/quantumblack/introducing-kedro-hooks-fd5bc4c03ff5) explains in detail the steps ``Kedro`` executes when the user calls the ``kedro run`` command.
14 |
15 | 
16 |
17 | The `MlflowHook` registers the parameters before each node (entry point number 3 in the picture above) by calling `mlflow.log_param(param_name, param_value)` for each parameter of the node.
18 |
19 | ```
20 |
21 | ## Frequently asked questions
22 |
23 | :::{dropdown} How are parameters detected by the plugin?
24 | The hook **detects parameters through their prefix ``params:`` or the value ``parameters``**. These are the [reserved keywords used by Kedro to define parameters](https://docs.kedro.org/en/stable/configuration/parameters.html#how-to-use-parameters) in the ``pipeline.py`` file(s).
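
For example, in a hypothetical ``pipeline.py`` (a minimal sketch, names are illustrative), both ``params:learning_rate`` and the full ``parameters`` dictionary would be detected by the hook and logged to mlflow when the node runs:

```python
from kedro.pipeline import node, pipeline


def train_model(training_data, learning_rate, parameters):  # hypothetical node function
    ...


def create_pipeline(**kwargs):
    return pipeline(
        [
            node(
                func=train_model,
                inputs=["training_data", "params:learning_rate", "parameters"],
                outputs="model",
            )
        ]
    )
```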
25 | :::
26 |
27 | :::{dropdown} Will parameters be recorded if the pipeline fails during execution?
28 | The parameters are registered node by node (and not in a single batch at the beginning of the execution). If the pipeline fails in the middle of its execution, the **parameters of the nodes that have been run will be recorded**, but **not the parameters of the nodes that were not executed**.
29 | :::
30 |
--------------------------------------------------------------------------------
/docs/source/03_experiment_tracking/01_experiment_tracking/04_version_models.md:
--------------------------------------------------------------------------------
1 | # Track models
2 |
3 | ## What is model tracking?
4 |
5 | MLflow allows you to serialize and deserialize models to a common format, track those models in MLflow Tracking and manage them using the MLflow Model Registry. Many popular Machine / Deep Learning frameworks have built-in support through what MLflow calls [flavors](https://www.mlflow.org/docs/latest/models.html#built-in-model-flavors). Even if there is no flavor for your framework of choice, it is easy to [create your own flavor](https://www.mlflow.org/docs/latest/models.html#custom-python-models) and integrate it with MLflow.
6 |
7 | ## How to track models using MLflow in a Kedro project?
8 |
9 | `kedro-mlflow` introduces two new `DataSet` types that can be used in the `DataCatalog`: `MlflowModelTrackingDataset` and `MlflowModelLocalFileSystemDataset`. The two have a very similar API, except that:
10 |
11 | - the ``MlflowModelTrackingDataset`` is used to load from and save to the mlflow artifact store. It accepts an optional `run_id` argument to load from and save to a given run, which must exist in the mlflow server you are logging to.
12 | - the ``MlflowModelLocalFileSystemDataset`` is used to load from and save to a given path. It uses the standard `filepath` argument in the constructor of Kedro DataSets. Note that it **does not log in mlflow**.
13 |
14 | *Note: If you use ``MlflowModelTrackingDataset``, the model will be saved during training in your current run. However, you will need to specify the run id to predict with it (since it is not persisted locally, it will not pick the latest model by default). You may prefer to combine ``MlflowModelLocalFileSystemDataset`` and ``MlflowArtifactDataset`` to persist it both locally and remotely; see further below.*
15 |
16 | Suppose you would like to register a `scikit-learn` model of your `DataCatalog` in mlflow; you can use the following yaml API:
17 |
18 | ```yaml
19 | my_sklearn_model:
20 | type: kedro_mlflow.io.models.MlflowModelTrackingDataset
21 | flavor: mlflow.sklearn
22 | ```
23 |
24 | More information on the available parameters is given in the [dedicated section](https://kedro-mlflow.readthedocs.io/en/latest/source/05_API/01_python_objects/01_Datasets.html#mlflowmodeltrackingdataset).
25 |
26 | You are now able to use ``my_sklearn_model`` in your nodes. Since this model is registered in mlflow, you can also leverage the [mlflow model serving abilities](https://www.mlflow.org/docs/latest/cli.html#mlflow-models-serve) or [batch prediction abilities](https://www.mlflow.org/docs/latest/cli.html#mlflow-models-predict), as well as the [mlflow models registry](https://www.mlflow.org/docs/latest/model-registry.html) to manage the lifecycle of this model.
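
For instance, a training node (a minimal sketch with hypothetical function and dataset names) can simply return the fitted estimator and declare ``my_sklearn_model`` as its output; the dataset defined above will then log it to mlflow when the node runs:

```python
# nodes.py - a minimal sketch
from sklearn.linear_model import LogisticRegression


def train_model(X_train, y_train):
    model = LogisticRegression()
    model.fit(X_train, y_train)
    return model  # map this output to "my_sklearn_model" in the pipeline definition
```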
27 |
28 | ## Frequently asked questions
29 |
30 | :::{dropdown} How is it working under the hood?
31 |
32 | **For ``MlflowModelTrackingDataset``**
33 |
34 | During save, a model object from a node output is logged to mlflow using the ``log_model`` function of the specified ``flavor``. It is logged in the `run_id` run if specified and there is no active run, else in the currently active mlflow run. If the `run_id` is specified and there is an active run, the saving operation will fail. Consequently, it will **never be possible to save in a specific mlflow run_id** if you launch a pipeline with the `kedro run` command, because the `MlflowHook` creates a new run before each pipeline run.
35 |
36 | During load, the model is retrieved from the ``run_id`` if specified, else it is retrieved from the mlflow active run. If there is no mlflow active run, the loading fails. This will never happen if you are using the `kedro run` command, because the `MlflowHook` creates a new run before each pipeline run.
37 |
38 | **For ``MlflowModelLocalFileSystemDataset``**
39 |
40 | During save, a model object from a node output is saved locally under the specified ``filepath`` using the ``save_model`` function of the specified ``flavor``.
41 |
42 | When the model is loaded, the latest version stored locally is read using the ``load_model`` function of the specified ``flavor``. You can also load the model from a specific kedro run by passing the `version` argument to the constructor.
43 | :::
44 |
45 | :::{dropdown} How can I track a custom MLflow model flavor?
46 |
47 | To track a custom MLflow model flavor you need to set the `flavor` parameter to import the module of your custom flavor, and to specify a [pyfunc workflow](https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#pyfunc-create-custom-workflows) which can be set either to `python_model` or `loader_module`. The former is more high level and user friendly and is [recommended by mlflow](https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#which-workflow-is-right-for-my-use-case), while the latter offers more control. We haven't tested the integration of this second workflow in `kedro-mlflow` extensively, and it should be used with caution.
48 |
49 | ```yaml
50 | my_custom_model:
51 | type: kedro_mlflow.io.models.MlflowModelTrackingDataset
52 | flavor: my_package.custom_mlflow_flavor
53 | pyfunc_workflow: python_model # or loader_module
54 | ```
55 |
56 | :::
57 |
60 | :::{dropdown} How can I save model locally and log it in MLflow in one step?
61 |
62 | If you want to save your model both locally and remotely within the same run, you can leverage `MlflowArtifactDataset`:
63 |
64 | ```yaml
65 | sklearn_model:
66 | type: kedro_mlflow.io.artifacts.MlflowArtifactDataset
67 | dataset:
68 | type: kedro_mlflow.io.models.MlflowModelLocalFileSystemDataset
69 | flavor: mlflow.sklearn
70 | filepath: data/06_models/sklearn_model
71 | ```
72 |
73 | This might be useful if you want to always read the latest model saved locally, while also logging it to MLflow each time a new model is trained, for tracking purposes.
74 |
75 | :::
76 |
--------------------------------------------------------------------------------
/docs/source/03_experiment_tracking/01_experiment_tracking/06_mlflow_ui.md:
--------------------------------------------------------------------------------
1 | # Open the mlflow UI
2 |
3 | ## The mlflow user interface
4 |
5 | Mlflow offers a user interface (UI) that enables you to browse the run history.
6 |
7 | ## The ``kedro-mlflow`` helper
8 |
9 | When you use local storage for kedro-mlflow, you can call a [mlflow cli command](https://www.mlflow.org/docs/latest/tracking.html#tracking-ui) to launch the UI if you do not have a [mlflow tracking server configured](https://www.mlflow.org/docs/latest/tracking.html#mlflow-tracking-server-optional).
10 |
11 | To ensure this UI is linked to the tracking uri specified in the configuration, ``kedro-mlflow`` offers the following command:
12 |
13 | ```console
14 | kedro mlflow ui
15 | ```
16 |
17 | which is a wrapper for the ``mlflow ui`` command with the tracking uri (as well as the port and host) specified in the ``mlflow.yml`` file.
18 |
19 | Open ``http://localhost:5000`` in your browser to see the UI after calling the previous command. If your ``mlflow_tracking_uri`` is a ``http[s]`` URL, the command will automatically open it.
20 |
--------------------------------------------------------------------------------
/docs/source/03_experiment_tracking/02_interactive_use/01_notebook_use.md:
--------------------------------------------------------------------------------
1 | # How to use `kedro-mlflow` in a notebook
2 |
3 | ```{important}
4 | You need to install ``ipython`` to access notebook functionalities.
5 | ```
6 |
7 | ## Reminder on mlflow's limitations with interactive use
8 |
9 | Data science project lifecycles are very iterative. Mlflow intends to track parameter changes to improve reproducibility. However, one must be conscious that being able to **execute functions outside of an end to end pipeline** puts a strong burden on the user's shoulders, **because they are in charge of keeping the code execution coherent** by running the notebook cells in the right order. Any back and forth during execution to change some parameters in previous notebook cells and then retrain a model creates an operational risk that the recorded parameter stored in mlflow is different from the real parameter used for training the model.
10 |
11 | To make a long story short: **forget about efficient reproducibility** when using mlflow interactively.
12 |
13 | It may **still be useful to track some experiments results** especially if they are long to run and vary wildly with parameters, e.g. if you are performing hyperparameter tuning.
14 |
15 | These limitations are inherent to the data science process, not to mlflow itself or the plugin.
16 |
17 | ## Setup mlflow configuration in your notebook
18 |
19 | Open your notebook / ipython session with the Kedro CLI:
20 |
21 | ```bash
22 | kedro jupyter notebook
23 | ```
24 |
25 | Or, if you are in JupyterLab, load the kedro extension manually:
26 |
27 | ```
28 | %load_ext kedro.ipython
29 | ```
30 |
31 | Kedro [creates a bunch of global variables](https://kedro.readthedocs.io/en/stable/tools_integration/ipython.html#use-kedro-with-ipython-and-jupyter), including a `session`, a ``context`` and a ``catalog`` which are automatically accessible.
32 |
33 | When the context was created, ``kedro-mlflow`` automatically:
34 |
35 | - loaded and set up (created the tracking uri, exported credentials...) the mlflow configuration from your `mlflow.yml`
36 | - imported ``mlflow``, which is now accessible in your notebook
37 |
38 | If you change your ``mlflow.yml``, reload the kedro extension for the changes to take effect.
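
For instance (a minimal sketch, assuming the extension was already loaded in the session):

```
%reload_ext kedro.ipython
```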
39 |
40 | ## Difference with running through the CLI
41 |
42 | - The DataSets' `load` and `save` methods work as usual. You can call `catalog.save("my_artifact_dataset", data)` inside a cell, and your data will be logged in mlflow properly (assuming "my_artifact_dataset" is a `kedro_mlflow.io.MlflowArtifactDataset`).
43 | - The `hooks` which automatically save all parameters/metrics/artifacts in mlflow will work if you run the session interactively, e.g.:
44 |
45 | ```python
46 | session.run(
47 | pipeline_name="my_ml_pipeline",
48 | tags="training",
49 | from_inputs="data_2",
50 | to_outputs="data_7",
51 | )
52 | ```
53 |
54 | but this is not a very likely workflow in a notebook.
55 |
56 | - If you need to interact manually with the mlflow server, you can use ``context.mlflow.server._mlflow_client`` (see the sketch below).
57 |
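A minimal sketch of such a manual interaction (the experiment name is hypothetical):

```python
client = context.mlflow.server._mlflow_client  # an mlflow.MlflowClient instance

# e.g. list the runs of an experiment to inspect what has been logged
experiment = client.get_experiment_by_name("km_example")
runs = client.search_runs(experiment_ids=[experiment.experiment_id])
```
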
58 | ## Guidelines and best practices suggestions
59 |
60 | During the experimentation phase, you will likely not run entire pipelines (or sub pipelines filtered out between some inputs and outputs). Hence, you cannot benefit from Kedro's ``hooks`` (and hence from ``kedro-mlflow`` tracking). From this moment on, perfect reproducibility is impossible to achieve: there is no chance that you manage to maintain a perfectly linear workflow, as you will go back and forth modifying parameters and code to create your model.
61 |
62 | I suggest to:
63 |
64 | - **focus on versioning parameters and metrics**. The goal is to finetune your hyperparameters and to be able to remember the best setup later. It is not very important at this stage to version all parameters (e.g. preprocessing ones) nor models (after all, you will need an entire pipeline to predict, and it is very unlikely that you will need to reuse these experiment models one day). It may be interesting to use the ``mlflow.autolog()`` feature to get an easy basic setup; see also the short sketch at the end of this section.
65 | - **transition quickly to kedro pipelines**. For instance, when your preprocessing is roughly defined, try to put it in kedro pipelines. You can then use notebooks to experiment / perform hyperparameter tuning while keeping the preprocessing "fixed" to enhance reproducibility. You can run this pipeline interactively with:
66 |
67 | ```python
68 | result = session.run(
69 | pipeline_name="my_preprocessing_pipeline",
70 | tags="training",
71 | from_inputs="data_2",
72 | to_outputs="data_7",
73 | )
74 | ```
75 |
76 | ``result`` is a python `dict` with the outputs of your pipeline (e.g. a "preprocessed_data" ``pandas.DataFrame``), and you can use it interactively in your notebook.
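
As an illustration of the first suggestion above, here is a minimal sketch of manually tracking a hyperparameter tuning trial from a notebook cell (run name, parameter and metric values are hypothetical):

```python
import mlflow

with mlflow.start_run(run_name="hp-tuning-trial-42"):
    mlflow.log_params({"learning_rate": 0.01, "n_estimators": 200})
    mlflow.log_metric("val_rmse", 0.42)
```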
77 |
--------------------------------------------------------------------------------
/docs/source/03_experiment_tracking/index.md:
--------------------------------------------------------------------------------
1 | # {octicon}`beaker` Experiment tracking
2 |
3 | ```{toctree}
4 | :caption: Experiment tracking
5 |
6 | 01_experiment_tracking/01_configuration
7 | 01_experiment_tracking/02_version_parameters
8 | 01_experiment_tracking/03_version_datasets
9 | 01_experiment_tracking/04_version_models
10 | 01_experiment_tracking/05_version_metrics
11 | ```
12 |
13 | ```{toctree}
14 | :caption: Visualise experiments
15 |
16 | 01_experiment_tracking/06_mlflow_ui
17 | ```
18 |
19 | ```{toctree}
20 | :caption: Interactive use
21 |
22 | 02_interactive_use/01_notebook_use
23 | ```
24 |
--------------------------------------------------------------------------------
/docs/source/04_pipeline_as_model/01_pipeline_as_custom_model/01_mlflow_models.md:
--------------------------------------------------------------------------------
1 | # Introduction to mlflow models
2 |
3 | ## What are Mlflow Models ?
4 |
5 | [Mlflow Models are a standardised, agnostic format to store machine learning models](https://www.mlflow.org/docs/latest/models.html). They are intended to be standalone and as portable as possible so that they can be deployed virtually anywhere, and mlflow provides built-in CLI commands to deploy a mlflow model to most common cloud platforms or to serve it as an API.
6 |
7 | A Mlflow Model is composed of:
8 |
9 | - a ``MLmodel`` file, which is a configuration file indicating to mlflow how to load the model. This file may also contain the ``Signature`` of the model (i.e. the ``Schema`` of the input and output of your model, including the column names and order) as well as example data.
10 | - a ``conda.yml`` file which contains the specifications of the virtual conda environment inside which the model should run. It contains the package versions necessary for your model to be executed.
11 | - a ``model.pkl`` (or a ``python_function.pkl`` for a custom model) file containing the trained model.
12 | - an ``artifacts`` folder containing all other data necessary to execute the model
13 |
14 | ```{important}
15 | Mlflow makes it possible to create **custom model "flavors" to convert any object to a Mlflow Model**, provided we have this information. Inside a Kedro project, the ``Pipeline`` and ``DataCatalog`` objects contain all this information. As a consequence, it is easy to create a custom model to convert entire Kedro ``Pipeline``s to mlflow models, and this is the purpose of the ``pipeline_ml_factory`` and ``KedroPipelineModel`` objects that we will present in the following sections.
16 | ```
17 |
18 | ## Pre-requisite for converting a pipeline to a mlflow model
19 |
20 | You can log any Kedro ``Pipeline`` matching the following requirements:
21 |
22 | - one of its inputs must be a ``pandas.DataFrame``, a ``spark.DataFrame`` or a ``numpy.array``. This is the **input which contains the data to predict on**. This can be any Kedro ``AbstractDataset`` which loads data in one of the previous three formats. It can also be a ``MemoryDataset`` and not be persisted in the ``catalog.yml``.
23 | - all its other inputs must be persisted on disk (e.g. the machine learning model must already be trained and saved so we can export it) or declared as "parameters" in the model ``Signature``.
24 |
25 | ```{warning}
26 | If the pipeline has parameters:
27 | - For ``mlflow<2.7.0``, the parameters need to be persisted before exporting the model, which implies that you will not be able to modify them at runtime. This is a limitation of these older ``mlflow`` versions.
28 | - For ``mlflow>=2.7.0``, they can be declared in the signature and modified at runtime. See https://github.com/Galileo-Galilei/kedro-mlflow/issues/445 for more information.
29 | ```
30 |
--------------------------------------------------------------------------------
/docs/source/04_pipeline_as_model/01_pipeline_as_custom_model/02_scikit_learn_like_pipeline.md:
--------------------------------------------------------------------------------
1 | # Scikit-learn like Kedro pipelines - Automatically log the inference pipeline after training
2 |
3 | For consistency, you may want to **log an inference pipeline** (including some data preprocessing and prediction post processing) **automatically after you ran a training pipeline**, with all the artifacts generated during training (the new model, encoders, vectorizers...).
4 |
5 | ```{hint}
6 | You can think of ``pipeline_ml_factory`` as a "**scikit-learn like pipeline in kedro**". Running ``kedro run -p training`` performs scikit-learn's ``pipeline.fit()`` operation, storing all the components we need to reuse later (e.g. the model) as mlflow artifacts, and the inference pipeline as code. Hence, you can later use this mlflow model, which will perform scikit-learn's ``pipeline.predict(new_data)`` operation by running the entire kedro inference pipeline.
7 | ```
8 |
9 | ## Getting started with pipeline_ml_factory
10 |
11 | ```{note}
12 | The code below assumes that, for inference, you want to skip some nodes that are training specific: e.g. you don't want to train the model, you just want to predict with it; you don't want to fit and transform with your encoder, but only transform. Make sure these 2 steps ("train" and "predict", or "fit" and "transform") are separated in 2 different nodes in your pipeline, so you can skip the train / fit step at inference time.
13 | ```
14 |
15 | You can configure your project as follows:
16 |
17 | 1. Install ``kedro-mlflow`` ``MlflowHook`` (this is done automatically if you have installed ``kedro-mlflow`` in a ``kedro>=0.16.5`` project)
18 | 2. Turn your training pipeline in a ``PipelineML`` object with ``pipeline_ml_factory`` function in your ``pipeline_registry.py``:
19 |
20 | ```python
21 | # pipeline_registry.py for kedro>=0.17.2 (hooks.py for kedro>=0.16.5,<0.17.2)
22 | from kedro.pipeline import Pipeline
23 | from kedro_mlflow.pipeline import pipeline_ml_factory
24 | from kedro_mlflow_tutorial.pipelines.ml_app.pipeline import create_ml_pipeline
25 |
26 |
27 | def register_pipelines() -> dict[str, Pipeline]:
28 |     ml_pipeline = create_ml_pipeline()
29 |     training_pipeline_ml = pipeline_ml_factory(
30 |         training=ml_pipeline.only_nodes_with_tags(
31 |             "training"
32 |         ),  # nodes: encode_labels + preprocess + train_model + predict + postprocess + evaluate
33 |         inference=ml_pipeline.only_nodes_with_tags(
34 |             "inference"
35 |         ),  # nodes: preprocess + predict + postprocess
36 |         input_name="instances",
37 |         log_model_kwargs=dict(
38 |             artifact_path="kedro_mlflow_tutorial",
39 |             conda_env={
40 |                 "python": "3.10",  # PROJECT_VERSION below is your package version, defined elsewhere
41 |                 "dependencies": [f"kedro_mlflow_tutorial=={PROJECT_VERSION}"],
42 |             },
43 |             signature="auto",
44 |         ),
45 |     )
46 |     return {"training": training_pipeline_ml}
47 | ```
48 |
49 | 3. Persist all your artifacts locally in the ``catalog.yml``
50 |
51 | ```yaml
52 | label_encoder:
53 | type: pickle.PickleDataset # <- This must be any Kedro Dataset other than "MemoryDataset"
54 | filepath: data/06_models/label_encoder.pkl # <- This must be a local path, no matter what is your mlflow storage (S3 or other)
55 | ```
56 |
57 | and do the same for your model if necessary.
58 |
59 | 4. Launch your training pipeline:
60 |
61 | ```bash
62 | kedro run --pipeline=training
63 | ```
64 |
65 | **The inference pipeline will _automagically_ be logged as a custom mlflow model** (a ``KedroPipelineModel``) **at the end of the training pipeline!**
66 |
67 | 5. Go to the UI, retrieve the run id of your "inference pipeline" model and use it as you want, e.g. in the `catalog.yml`:
68 |
69 | ```yaml
70 | # catalog.yml
71 |
72 | pipeline_inference_model:
73 | type: kedro_mlflow.io.models.MlflowModelTrackingDataset
74 | flavor: mlflow.pyfunc
75 | pyfunc_workflow: python_model
76 | artifact_path: kedro_mlflow_tutorial # the name of your mlflow folder = the model_name in pipeline_ml_factory
77 | run_id:
78 | ```
79 |
80 | Now you can run the entire inference pipeline inside a node as part of another pipeline.
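
For instance, a downstream node (a minimal sketch with hypothetical names) can simply call ``predict`` on the loaded model, which replays the whole inference pipeline under the hood:

```python
# nodes.py - a minimal sketch
def predict_with_inference_pipeline(pipeline_inference_model, new_instances):
    # "pipeline_inference_model" is the dataset declared above in the catalog.yml
    return pipeline_inference_model.predict(new_instances)
```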
81 |
82 | ## Advanced configuration for pipeline_ml_factory
83 |
84 | ### Register the model as a new version in the mlflow registry
85 |
86 | The ``log_model_kwargs`` argument is passed to the underlying [mlflow.pyfunc.log_model](https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model). Specifically, it accepts a ``registered_model_name`` argument:
87 |
88 | ```python
89 | pipeline_ml_factory(
90 | training=ml_pipeline.only_nodes_with_tags("training"),
91 | inference=ml_pipeline.only_nodes_with_tags("inference"),
92 | input_name="instances",
93 | log_model_kwargs=dict(
94 | artifact_path="kedro_mlflow_tutorial",
95 |         registered_model_name="my_inference_pipeline",  # a new version of the "my_inference_pipeline" model will be registered each time you run the "training" pipeline
96 |         conda_env={
97 |             "python": "3.10",
98 | "dependencies": [f"kedro_mlflow_tutorial=={PROJECT_VERSION}"],
99 | },
100 | signature="auto",
101 | ),
102 | )
103 | ```
104 |
105 | ## Complete step by step demo project with code
106 |
107 | A step by step tutorial with code is available in the [kedro-mlflow-tutorial repository on github](https://github.com/Galileo-Galilei/kedro-mlflow-tutorial#serve-the-inference-pipeline-to-a-end-user).
108 |
109 | There are also other resources to understand the rationale:
110 |
111 | - an explanation of the [``PipelineML`` class in the python objects section](https://kedro-mlflow.readthedocs.io/en/latest/source/05_API/01_python_objects/03_Pipelines.html)
112 | - detailed explanations [on this issue](https://github.com/Galileo-Galilei/kedro-mlflow/issues/16) and [this discussion](https://github.com/Galileo-Galilei/kedro-mlflow/discussions/229).
113 | - an example of use in a user project [in this repo](https://github.com/laurids-reichardt/kedro-examples/blob/kedro-mlflow-hotfix2/text-classification/src/text_classification/pipelines/pipeline.py).
114 |
--------------------------------------------------------------------------------
/docs/source/04_pipeline_as_model/01_pipeline_as_custom_model/03_deployment_patterns.md:
--------------------------------------------------------------------------------
1 | # Deployment patterns for kedro pipelines as model
2 |
3 | A step by step tutorial with code is available in the [kedro-mlflow-tutorial repository on github](https://github.com/Galileo-Galilei/kedro-mlflow-tutorial#serve-the-inference-pipeline-to-an-end-user) which explains how to serve the pipeline as an API or for batch predictions.
4 |
5 | ## Deploying a KedroPipelineModel
6 |
7 | ::::{tab-set}
8 |
9 | :::{tab-item} Reuse from a python script
10 |
11 | ```{note}
12 | See tutorial:
13 | ```
14 |
15 | If you want to load and predict with your model from python, the ``load_model`` function of mlflow is what you need:
16 |
17 | ```python
18 | PROJECT_PATH = r""
19 | RUN_ID = ""
20 |
21 | from kedro.framework.startup import bootstrap_project
22 | from kedro.framework.session import KedroSession
23 | from mlflow.pyfunc import load_model
24 |
25 | bootstrap_project(PROJECT_PATH)
26 | session = KedroSession.create(
27 | session_id=1,
28 | project_path=PROJECT_PATH,
29 | package_name="kedro_mlflow_tutorial",
30 | )
31 | local_context = session.load_context() # setup mlflow config
32 |
33 | instances = local_context.io.load("instances")
34 | model = load_model(f"runs:/{RUN_ID}/kedro_mlflow_tutorial")
35 |
36 | predictions = model.predict(
37 | instances
38 | )  # runs ``session.run(pipeline=inference)`` with the artifacts created during training. You should see the kedro logs.
39 | ```
40 |
41 | The ``predictions`` object is a ``pandas.DataFrame`` and can be handled as usual.
42 | :::
43 |
44 | :::{tab-item} Reuse in a kedro pipeline
45 |
46 | ```{note}
47 | See tutorial:
48 | ```
49 |
50 | Say that you want to reuse this trained model in a kedro Pipeline, like the user_app. The easiest way to do it is to add the model to the ``catalog.yml`` file:
51 |
52 | ```yaml
53 | pipeline_inference_model:
54 |   type: kedro_mlflow.io.models.MlflowModelTrackingDataset
55 | flavor: mlflow.pyfunc
56 | pyfunc_workflow: python_model
57 | artifact_path: kedro_mlflow_tutorial # the name of your mlflow folder = the model_name in pipeline_ml_factory
58 | run_id: # put it in globals.yml to help people find out what to modify
59 | ```
60 |
61 | Then you can reuse it in a node to predict with this model, which is the entire inference pipeline as it was at the time you launched the training.
62 |
63 | ```python
64 | # nodes.py
65 | def predict_from_model(model, data):
66 |     return model.predict(data)
67 |
68 |
69 | # pipeline.py
70 | from kedro.pipeline import node, pipeline
71 |
72 | def create_pipeline():
73 |     return pipeline(
74 |         [
75 |             node(
76 |                 func=predict_from_model,
77 |                 inputs={"model": "pipeline_inference_model", "data": "validation_data"},
78 |                 outputs="predictions",
79 |             )
80 |         ]
81 |     )
82 | ```
80 |
81 | :::
82 |
83 | :::{tab-item} Serve the model with mlflow
84 |
85 | ```{note}
86 | See tutorial:
87 | ```
88 |
89 | Mlflow provides helpers to serve the model as an API with one line of code:
90 |
91 | ``mlflow models serve -m "runs://kedro_mlflow_tutorial"``
92 |
93 | This will serve your model as an API (beware: there are known issues on windows). You can test it with:
94 | ``curl -d "{\"columns\":[\"text\"],\"index\":[0,1],\"data\":[[\"This movie is cool\"],[\"awful film\"]]}" -H "Content-Type: application/json" localhost:5000/invocations``
95 | :::
96 |
97 | ::::
98 |
99 | ## Frequently asked questions
100 |
101 | :::{dropdown} How can I pass parameters at runtime to a ``KedroPipelineModel``?
102 |
103 | Since ``kedro-mlflow>0.14.0``, you can pass parameters when predicting with a ``KedroPipelineModel`` object.
104 |
105 | We assume you've trained a model with ``pipeline_ml_factory``. First, load the model, e.g. through the catalog or as described in the previous section:
106 |
107 | ```yaml
108 | # catalog.yml
109 | pipeline_inference_model:
110 | type: kedro_mlflow.io.models.MlflowModelTrackingDataset
111 | flavor: mlflow.pyfunc
112 | pyfunc_workflow: python_model
113 | artifact_path: kedro_mlflow_tutorial # the name of your mlflow folder = the model_name in pipeline_ml_factory
114 | run_id:
115 | ```
116 |
117 | Then, pass params as a dict under the ``params`` argument of the ``predict`` method:
118 |
119 | ```python
120 | model = catalog.load("pipeline_inference_model")  # you can also load it in a node "as usual"
121 | predictions = model.predict(input_data, params={"my_param": ""})
122 | ```
123 |
124 | ```{warning}
125 | This will only work if ``my_param`` is a parameter (i.e. prefixed with ``params:``) of the inference pipeline.
126 | ```
127 |
128 | ```{tip}
129 | Available params are visible in the model signature in the UI.
130 | ```
131 |
132 | :::
133 |
134 | :::{dropdown} How can I change the runner at runtime when predicting with a ``KedroPipelineModel``?
135 |
136 | Assuming the syntax of the previous section, a special key in ``params`` is reserved for the kedro runner:
137 |
138 | ```python
139 | model = catalog.load("pipeline_inference_model")
140 | predictions = model.predict(
141 | input_data, params={"my_param": "", "runner": "ThreadRunner"}
142 | )
143 | ```
144 |
145 | ```{tip}
146 | You can pass any kedro runner, or even a custom runner by using the path to the module: ``params={"runner": "my_package.my_module.MyRunner"}``
147 | ```
148 |
149 | :::
150 |
--------------------------------------------------------------------------------
/docs/source/04_pipeline_as_model/01_pipeline_as_custom_model/04_custom_kedro_pipeline_model.md:
--------------------------------------------------------------------------------
1 | # Custom registering of a ``KedroPipelineModel``
2 |
3 | ```{warning}
4 | The goal of this section is to give machine learning engineers or platform engineers tools to reuse the ``kedro-mlflow`` objects and customize the workflow. This is especially useful if you need heavy customisation or fine grained control over the kedro objects or the mlflow model attributes. It is **very unlikely that you need this section** if you are using a kedro project "in the standard way" as a data scientist; in that case, refer to the section [scikit-learn like pipeline in kedro](https://kedro-mlflow.readthedocs.io/en/stable/source/).
5 | ```
6 |
7 | ## Log a pipeline to mlflow programmatically with the ``KedroPipelineModel`` custom mlflow model
8 |
9 | ```{hint}
10 | When using ``KedroPipelineModel`` programmatically, we focus only on the ``inference`` pipeline. We assume that you already ran the ``training`` pipeline previously, and that you now want to log the ``inference`` pipeline in mlflow manually by retrieving all the objects needed to create the custom model.
11 | ```
12 |
13 | ``kedro-mlflow`` has a ``KedroPipelineModel`` class (which inherits from ``mlflow.pyfunc.PythonModel``) which can turn any kedro ``Pipeline`` object to a Mlflow Model.
14 |
15 | To convert a ``Pipeline`` to a mlflow model, you need to create a ``KedroPipelineModel`` and then log it to mlflow. An example is given in the snippet below:
16 |
17 | ```python
18 | import mlflow
19 | from kedro.framework.session import KedroSession
20 | from kedro.framework.startup import bootstrap_project
21 | from kedro_mlflow.mlflow import KedroPipelineModel
22 | from mlflow.models import infer_signature
23 |
24 | bootstrap_project(r"")
25 | session = KedroSession.create(project_path=r"")
26 |
27 | # "pipeline" is the Pipeline object you want to convert to a mlflow model
28 |
29 | context = session.load_context()  # this sets up the mlflow configuration
30 | catalog = context.catalog
31 | pipeline = context.pipelines[""]
32 | input_name = "instances"
33 |
34 | # artifacts are all the inputs of the inference pipelines that are persisted in the catalog
35 |
36 | # (optional) get the schema of the input dataset
37 | input_data = catalog.load(input_name)
38 | model_signature = infer_signature(
39 |     model_input=input_data
40 | )  # if you want to pass parameters in "predict", you should specify them in the signature
41 |
42 | # you can optionally pass other arguments, like the "copy_mode" to be used for each dataset
43 | kedro_pipeline_model = KedroPipelineModel(
44 |     pipeline=pipeline, catalog=catalog, input_name=input_name
45 | )
46 | artifacts = kedro_pipeline_model.extract_pipeline_artifacts()
47 |
48 | mlflow.pyfunc.log_model(
49 |     artifact_path="model",
50 |     python_model=kedro_pipeline_model,
51 |     artifacts=artifacts,
52 |     conda_env={"python": "3.10.0", "dependencies": ["kedro==0.18.11"]},
53 |     signature=model_signature,
54 | )
55 | ```
56 |
57 | ```{important}
58 | Note that you need to provide the ``log_model`` function a bunch of non-trivial-to-retrieve information (the conda environment, the "artifacts" i.e. the persisted data you need to reuse like tokenizers / ml models / encoders, the model signature i.e. the column names and types and the predict parameters...). The ``KedroPipelineModel`` object has methods like `extract_pipeline_artifacts` to help you, but it needs some work on your side.
59 | ```
60 |
61 | ```{note}
62 | Saving Kedro pipelines as Mlflow Model objects is convenient and enables pipeline serving. However, it does not solve the decorrelation between training and inference: each time one triggers a training pipeline, (s)he must think to save it immediately afterwards. `kedro-mlflow` offers a convenient API through hooks to simplify this workflow, as described in the section [scikit-learn like pipeline in kedro](https://kedro-mlflow.readthedocs.io/en/stable/source/).
63 | ```
64 |
65 | ## Log a pipeline to mlflow with the CLI
66 |
67 | ```{note}
68 | This command is mainly a helper to relog a model manually without retraining (e.g. because you slightly modified the preprocessing or post processing and don't want to train again).
69 | ```
70 |
71 | ```{warning}
72 | We **assume that you already ran the ``training`` pipeline previously**, which created persisted artifacts. Now you want to trigger logging the ``inference`` pipeline in mlflow through the CLI. This is dangerous because the command does not check that your pipeline is working correctly or that the persisted model has not been modified.
73 | ```
74 |
75 | You can log a Kedro ``Pipeline`` to mlflow as a custom model through the CLI with the ``modelify`` command:
76 |
77 | ```bash
78 | kedro mlflow modelify --pipeline= --input-name
79 | ```
80 |
81 | This command will create a new run with an artifact named ``model`` and persist in it the code of your pipeline and all its inputs as artifacts (hence they should have been created *before* running this command, e.g. the model should already be persisted on disk). Open the user interface with ``kedro mlflow ui`` to check the result. You can also:
82 |
83 | - specify the run id in which you want to log the pipeline with the ``--run-id`` argument, and its name with the ``--run-name`` argument.
84 | - pass almost all arguments accepted by [``mlflow.pyfunc.log_model``](https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model), see the list of all accepted arguments in the [API documentation](https://kedro-mlflow.readthedocs.io/en/latest/source/05_API/01_python_objects/04_CLI.html#modelify)
85 |
--------------------------------------------------------------------------------
/docs/source/04_pipeline_as_model/02_framework_ml/02_ml_project_components.md:
--------------------------------------------------------------------------------
1 | # The components of a machine learning application
2 |
3 | ## Definition: apps of a machine learning projects
4 |
5 | A machine learning project is composed of 3 main blocks that I will call "apps" in the rest of the paragraph. These 3 apps are:
6 |
7 | - The *etl_app*, which is the application in charge of bringing the data to the machine learning pipeline
8 | - The *ml_app*, which is the application in charge of managing the machine learning model (including training and inference)
9 | - The *user_app* which is the application in charge of consuming the predictions of the machine learning model and doing the actual business logic with it
10 |
11 | ## Difference between an app and a Kedro pipeline
12 |
13 | Note that the previously defined "apps" are not pipelines in the Kedro sense. On the contrary, each app likely contains several (Kedro?) pipelines.
14 |
15 | The main differences between these apps are:
16 |
17 | - Each app's development / deployment is likely under the responsibility of different people / teams.
18 | - Each app has a different development lifecycle. This implies that development can be parallelized, and releasing one app to fix a bug does not imply releasing the other ones. If your training pipeline is time / resource consuming, you do not want a bugfix in the *user_app* to trigger a retraining of your model, do you?
19 | - Each app has its own orchestration timeline. For instance, the data produced by the etl can be stored independently of whether the *user_app* and the *ml_app* consume them "on the fly" or not.
20 | - The apps do not communicate with one another apart from a clear interface: the data schemas accepted as inputs / outputs of each app.
21 |
22 | ## Apps development lifecycle in a machine learning project
23 |
24 | ### The data scientist creates at least part of the 3 apps
25 |
26 | Note that there are **as many _etl_app_s and _user_app_s** as needed for the different uses of your model. Since **training the model is a specific use, the data scientist will need to create their own _etl_app_ and _user_app_**. These apps will very likely be replaced later by the true business apps dedicated to the model use.
27 |
28 | We saw that the data scientist has to create some code that will be replaced by other people's code when deploying the model. As a consequence, the interactions between these apps must be very clearly defined at the beginning of the project. We claim that it is possible to cover most use cases with the following schema:
29 |
30 | 
31 |
32 | The *ml_app* takes `instances` (i.e. examples of the business object to handle) as input. This implies that the *ml_app* will include some machine learning-specific preprocessing and not only the model training. It also (optionally) takes labels as inputs if the underlying problem is supervised. Even in this situation, the labels will not be known at inference time so the *etl_app* does not necessarily produce them.
33 |
34 | This is a key principle: anyone who wants to consume the model later will need to bring instances of the same business object.
35 |
36 | ### The *etl_app*
37 |
38 | The *etl_app* is the one in charge of bringing the data to the *ml_app*. As a consequence, each different *user_app* will likely have to develop its associated *etl_app* to consume the *ml_app*.
39 |
40 | From the data scientist's point of view, this app will create the training dataset. This app can do very different things:
41 |
42 | - send request over an API
43 | - extract from a database (with SQL, SAS...)
44 | - scrape data from a website
45 | - download data from an URL
46 | - read data from disk
47 | - ...
48 |
49 | For the labels, in addition to the above possibilities, this app can be a **labelling tool** with human labellers who provide the needed "true reference" as labels.
50 |
51 | It is also common to mix several of the above approaches to gather different data sources, and to have different Kedro pipelines in this app.
52 |
53 | Note that during training, this app very likely retrieves batch data from a given time period. This will necessarily be different when using the model, because users often want to use live stream data.
54 |
55 | ### The *ml_app*
56 |
57 | This app is the core of the data scientist's work. It is at least composed of two kedro pipelines:
58 |
59 | - a *training* pipeline, which produces all the artifacts (e.g. any object fitted on data, including obviously the machine learning model itself)
60 | - an *inference* pipeline which takes an instance as input and returns the prediction of the model
61 |
62 | It is quite common to have other pipelines depending on the data scientist's needs (an *evaluation* pipeline which produces metrics for a given model, an *explanation* pipeline which produces explanations for a specific instance, like shap values or pixel importances, ...).
63 |
64 | It is quite common to see data scientists duplicate the code when creating the inference pipeline, because it is written after the training pipeline. **Thanks to kedro tags, it is possible to mark a node to use it in two different pipelines**. Reuse is a key component to improve quality and deployment speed. **Each time a node is created (i.e. a function is called), the data scientist should wonder if it will be used in *training* pipeline only or in both (*training* and *inference*), and tag it accordingly.**
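
As an illustration (a minimal sketch with hypothetical functions and dataset names), a preprocessing node tagged with both ``training`` and ``inference`` is reused by both pipelines, while the fitting node is tagged ``training`` only:

```python
from kedro.pipeline import node, pipeline


def preprocess(instances):  # hypothetical: pure transformation, also needed at inference time
    ...


def train_model(features, labels):  # hypothetical: fitting step, training only
    ...


def create_ml_pipeline(**kwargs):
    return pipeline(
        [
            node(preprocess, inputs="instances", outputs="features", tags=["training", "inference"]),
            node(train_model, inputs=["features", "labels"], outputs="model", tags=["training"]),
        ]
    )
```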
65 |
66 | ### The *user_app*
67 |
68 | The *user_app* must not be aware of how the inference pipeline operates under the hood. The *user_app* must either:
69 |
70 | - take a *run_id* from mlflow to retrieve the model from mlflow and predict with it. This is mainly useful for batch predictions.
71 | - call the served model through an API endpoint and simply retrieve the predictions. This assumes that the model has been served, which is very easy with mlflow.
72 |
73 | After that, the *user_app* can use the predictions and apply any needed business logic to them.
74 |
--------------------------------------------------------------------------------
/docs/source/04_pipeline_as_model/02_framework_ml/03_framework_solutions.md:
--------------------------------------------------------------------------------
1 | # ``kedro-mlflow`` mlops solution
2 |
3 | ## Reminder
4 |
5 | We assume that we want to solve the following challenges among those described in the ["Why we need a mlops framework"](https://kedro-mlflow.readthedocs.io/en/latest/source/04_pipeline_as_model/02_framework_ml/01_why_framework.html#) section:
6 |
7 | - serve pipelines (which handle business objects) instead of models
8 | - synchronize training and inference by packaging inference pipeline at training time
9 |
10 | ## Enforcing these principles with a dedicated tool
11 |
12 | ### Synchronizing training and inference pipeline
13 |
14 | To solve the problem of desynchronization between training and inference, ``kedro-mlflow`` offers a `PipelineML` class (which subclasses Kedro `Pipeline` class). A `PipelineML` is simply a Kedro standard ``Pipeline`` (the "training") which has a reference to another ``Pipeline`` (the "inference"). The two pipelines must share a common input DataSet name, which represents the data you will perform operations on (either train on for the training pipeline, or predict on for the inference pipeline).
15 |
16 | This class implements several methods to compare the ``DataCatalog``s associated to each of the two bound pipelines and performs subsetting operations. This makes it quite difficult to handle directly. Fortunately, ``kedro-mlflow`` provides a convenient API to create ``PipelineML`` objects: the ``pipeline_ml_factory`` function.
17 |
18 | The use of ``pipeline_ml_factory`` is very straightforward, especially if you have used the [project architecture described previously](https://kedro-mlflow.readthedocs.io/en/latest/source/04_pipeline_as_model/02_framework_ml/02_ml_project_components.html). The best place to create such an object is your `hooks.py` file which will look like this:
19 |
20 | ```python
21 | # hooks.py
22 | from kedro.framework.hooks import hook_impl
23 | from kedro.pipeline import Pipeline
24 | from kedro_mlflow.pipeline import pipeline_ml_factory
25 | from kedro_mlflow_tutorial.pipelines.ml_app.pipeline import create_ml_pipeline
26 |
27 | class ProjectHooks:
28 |     @hook_impl
29 |     def register_pipelines(self) -> dict[str, Pipeline]:
30 |         ml_pipeline = create_ml_pipeline()
31 |         # convert your two pipelines to a PipelineML object
32 |         training_pipeline_ml = pipeline_ml_factory(
33 |             training=ml_pipeline.only_nodes_with_tags("training"),
34 |             inference=ml_pipeline.only_nodes_with_tags("inference"),
35 |             input_name="instances",
36 |         )
37 |         return {"__default__": training_pipeline_ml}
38 | ```
39 |
40 | > So, what? We have created a link between our two pipelines, but the gain is not obvious at first glance. The 2 following sections demonstrate that such a construction makes it possible to automatically package and serve the inference pipeline when executing the training one.
41 |
42 | ### Packaging and serving a Kedro Pipeline
43 |
44 | Mlflow offers the possibility to create [custom model classes](https://www.mlflow.org/docs/latest/models.html#custom-python-models). Mlflow offers a variety of tools to package / containerize, deploy and serve such models.
45 |
46 | ``kedro-mlflow`` has a ``KedroPipelineModel`` class (which inherits from ``mlflow.pyfunc.PythonModel``) which can turn any kedro ``PipelineML`` object to a Mlflow Model.
47 |
48 | To convert a ``PipelineML``, you need to declare it as a ``KedroPipelineModel`` and then log it to mlflow:
49 |
50 | ```python
51 | import mlflow
52 | from kedro.framework.context import load_context
53 | from kedro_mlflow.mlflow import KedroPipelineModel
54 | from mlflow.models import infer_signature
55 |
56 | # pipeline_training is your PipelineML object, created as previously
57 | catalog = load_context(".").io
58 |
59 | # artifacts are all the inputs of the inference pipelines that are persisted in the catalog
60 | artifacts = pipeline_training.extract_pipeline_artifacts(catalog)
61 |
62 | # (optional) get the schema of the input dataset
63 | input_data = catalog.load(pipeline_training.input_name)
64 | model_signature = infer_signature(model_input=input_data)
65 |
66 | kedro_model = KedroPipelineModel(pipeline=pipeline_training, catalog=catalog)
67 |
68 | mlflow.pyfunc.log_model(
69 | artifact_path="model",
70 | python_model=kedro_model,
71 | artifacts=artifacts,
72 |     conda_env={"python": "3.10.0", "dependencies": ["kedro==0.18.11"]},
73 | signature=model_signature,
74 | )
75 | ```
76 |
77 | Note that you need to provide the ``log_model`` function a bunch of non-trivial-to-retrieve information (the conda environment, the "artifacts" i.e. the persisted data you need to reuse like tokenizers / ml models / encoders, the model signature i.e. the column names and types...). The ``PipelineML`` object has methods like `extract_pipeline_artifacts` to help you, but it needs some work on your side.
78 |
79 | > Saving Kedro pipelines as Mlflow Model objects is convenient and enables pipeline serving. However, it does not solve the decorrelation between training and inference: each time one triggers a training pipeline, (s)he must think to save it immediately afterwards. Good news: triggering operations at some "execution moment" of a Kedro ``Pipeline`` (like after it finished running) is exactly what hooks are designed for!
80 |
81 | ### kedro-mlflow's magic: inference autologging
82 |
83 | When running the training pipeline, we have all the desired information we want to pass to the ``KedroPipelineModel`` class and the ``mlflow.pyfunc.log_model`` function:
84 |
85 | - the artifacts exist in the DataCatalog if they are persisted
86 | - the "instances" dataset is loaded at the beginning of training, thus we can infer its schema (column names and types)
87 | - the inference and training pipeline codes are retrieved at the same moment, so consistency checks can be performed
88 |
89 | Hence, ``kedro-mlflow`` provides a ``MlflowHook.after_pipeline_run`` hook which performs the following operations:
90 |
91 | - check if the pipeline that has just been run is a ``PipelineML`` object
92 | - in case it is, create the ``KedroPipelineModel`` like above and log it to mlflow
93 |
94 | > We have achieved perfect synchronicity since the exact inference pipeline (with code and artifacts) will be logged in mlflow each time the training pipeline is executed. The model is then accessible in the mlflow UI "artifacts" section and can be downloaded, or [served as an API with the ``mlflow serve`` command](https://www.mlflow.org/docs/latest/cli.html#mlflow-models-serve), or [it can be used in the `catalog.yml` with the `MlflowModelTrackingDataset` for further reuse](https://github.com/Galileo-Galilei/kedro-mlflow-tutorial#serve-the-inference-pipeline-to-a-end-user).
95 |
96 | ### Reuse the model in kedro
97 |
98 | Say that you want to reuse this inference model as the input of another kedro pipeline (one of the "user_app" applications). ``kedro-mlflow`` provides a ``MlflowModelTrackingDataset`` class which can be used in the ``catalog.yml`` file:
99 |
100 | ```yaml
101 | # catalog.yml
102 |
103 | pipeline_inference_model:
104 | type: kedro_mlflow.io.models.MlflowModelTrackingDataset
105 | flavor: mlflow.pyfunc
106 | pyfunc_workflow: python_model
107 | artifact_path: kedro_mlflow_tutorial # the name of your mlflow folder = the model_name in pipeline_ml_factory
108 | run_id:
109 | ```
110 |
--------------------------------------------------------------------------------
/docs/source/04_pipeline_as_model/index.md:
--------------------------------------------------------------------------------
1 |
2 | # {octicon}`rocket` Pipeline as model
3 |
4 | ```{toctree}
5 | :caption: Pipeline as model
6 |
7 | 01_pipeline_as_custom_model/01_mlflow_models
8 | 01_pipeline_as_custom_model/02_scikit_learn_like_pipeline
9 | 01_pipeline_as_custom_model/03_deployment_patterns
10 | 01_pipeline_as_custom_model/04_custom_kedro_pipeline_model
11 | ```
12 |
13 | ```{toctree}
14 | :caption: kedro-mlflow as a mlops framework
15 |
16 | 02_framework_ml/01_why_framework
17 | 02_framework_ml/02_ml_project_components
18 | 02_framework_ml/03_framework_solutions
19 | ```
20 |
--------------------------------------------------------------------------------
/docs/source/05_API/01_python_objects/02_Hooks.md:
--------------------------------------------------------------------------------
1 | # ``Hooks``
2 |
3 | This package provides 1 new hook.
4 |
5 | ## ``MlflowHook``
6 |
7 | This hook:
8 |
9 | 1. manages mlflow settings at the beginning and the end of the run (run start / end).
10 | 2. autologs node parameters each time the pipeline is run (with ``kedro run`` or programmatically).
11 | 3. logs useful information for reproducibility as ``mlflow tags`` (including kedro ``Journal`` information for old kedro versions and the commands used to launch the run).
12 | 4. registers the pipeline as a valid ``mlflow model`` if [it is a ``PipelineML`` instance](https://kedro-mlflow.readthedocs.io/en/latest/source/05_API/01_python_objects/03_Pipelines.html)
13 |
--------------------------------------------------------------------------------
/docs/source/05_API/01_python_objects/03_Pipelines.md:
--------------------------------------------------------------------------------
1 | # Pipelines
2 |
3 | ## ``PipelineML`` and ``pipeline_ml_factory``
4 |
5 | ``PipelineML`` is a new class which extends ``Pipeline`` and enables binding two pipelines (one for training, one for inference) together. This class comes with a ``KedroPipelineModel`` class for logging it in mlflow. A pipeline logged as a mlflow model can be served using the ``mlflow models serve`` and ``mlflow models predict`` commands.
6 |
7 | The ``PipelineML`` class is not intended to be used directly. A ``pipeline_ml_factory`` factory is provided as a user friendly interface.
8 |
9 | Example within kedro template:
10 |
11 | ```python
12 | # in src/PYTHON_PACKAGE/pipeline.py
13 | from kedro.pipeline import Pipeline
14 | from kedro_mlflow.pipeline import pipeline_ml_factory
15 | from PYTHON_PACKAGE.pipelines import data_engineering as de
16 | from PYTHON_PACKAGE.pipelines import data_science as ds
17 |
18 |
19 | def create_pipelines(**kwargs) -> dict[str, Pipeline]:
20 |     data_engineering_pipeline = de.create_pipeline()
21 |     data_science_pipeline = ds.create_pipeline()
22 |     training_pipeline = pipeline_ml_factory(
23 |         training=data_science_pipeline.only_nodes_with_tags(
24 |             "training"
25 |         ),  # or whatever your logic is for filtering
26 |         inference=data_science_pipeline.only_nodes_with_tags("inference"),
27 |     )
28 |
29 |     return {
30 |         "ds": data_science_pipeline,
31 |         "training": training_pipeline,
32 |         "__default__": data_engineering_pipeline + data_science_pipeline,
33 |     }
34 | ```
32 |
33 | Now each time you run ``kedro run --pipeline=training`` (provided you registered the ``MlflowHook`` in your ``run.py``), the full inference pipeline will be registered as a mlflow model (with all the outputs produced by training as artifacts: the machine learning model, but also the *scaler*, *vectorizer*, *imputer*, or whatever object fitted on data that you create in ``training`` and that is used in ``inference``).
34 |
35 | Note that:
36 |
37 | - the `inference` pipeline `input_name` can be a `MemoryDataset` and it must belong to the inference pipeline `inputs`
38 | - apart from `input_name`, all other `inference` pipeline `inputs` must be persisted locally on disk (i.e. they must not be `MemoryDataset`s and must have a local `filepath`)
39 | - the `inference` pipeline `inputs` must belong to the training `outputs` (vectorizer, binarizer, machine learning model...)
40 | - the `inference` pipeline must have one and only one `output`
41 |
42 | ```{caution}
43 | ``PipelineML`` objects do not implement all filtering methods of a regular ``Pipeline``, and you cannot add or substract 2 ``PipelineML`` together. The rationale is that a filtered ``PipelineML`` is not a ``PipelineML`` in general, because the [filtering is not consistent between training and inference](https://github.com/Galileo-Galilei/kedro-mlflow/issues/554). You can see the ones which are supported [in the code](https://github.com/Galileo-Galilei/kedro-mlflow/blob/master/kedro_mlflow/pipeline/pipeline_ml.py#L162).
44 | ```
45 |
46 | You can also directly log a ``PipelineML`` object in ``mlflow`` programmatically:
47 |
```python
from pathlib import Path

import mlflow
from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project
from kedro_mlflow.mlflow import KedroPipelineModel
from mlflow.models import infer_signature

# pipeline_training is your PipelineML object, created as previously
bootstrap_project(Path.cwd())
catalog = KedroSession.create().load_context().catalog

# artifacts are all the inputs of the inference pipelines that are persisted in the catalog
artifacts = pipeline_training.extract_pipeline_artifacts(catalog)

# get the schema of the input dataset
input_data = catalog.load(pipeline_training.input_name)
model_signature = infer_signature(model_input=input_data)

mlflow.pyfunc.log_model(
    artifact_path="model",
    python_model=KedroPipelineModel(pipeline=pipeline_training, catalog=catalog),
    artifacts=artifacts,
    conda_env={"python": "3.10.0", "dependencies": ["kedro==0.18.11"]},
    signature=model_signature,
)
```
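
Once logged, the model can be reloaded and used outside of kedro like any other mlflow pyfunc model. A minimal sketch (replace the run id placeholder with the run in which the model was logged):

```python
import mlflow

# "runs:/<run_id>/model" points to the artifact_path used above
model = mlflow.pyfunc.load_model("runs:/<run_id>/model")
predictions = model.predict(input_data)  # same schema as the `input_name` dataset
```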
72 |
73 | It is also possible to pass arguments to `KedroPipelineModel` to specify the runner or the `copy_mode` of ``MemoryDataset`` for the inference ``Pipeline``. This may be faster, especially for compiled models (e.g. keras, tensorflow...), and is more suitable for an API serving pattern. Since ``kedro-mlflow==0.12.0``, ``copy_mode="assign"`` has become the default.
74 |
75 | ```python
76 | KedroPipelineModel(pipeline=pipeline_training, catalog=catalog, copy_mode="assign")
77 | ```
78 |
79 | Available `copy_mode` values are ``assign``, ``copy`` and ``deepcopy``. It is possible to pass a dictionary to specify a different copy mode for each dataset, as shown below.
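
For instance, a minimal sketch (the dataset names are purely illustrative) mixing copy modes per dataset:

```python
KedroPipelineModel(
    pipeline=pipeline_training,
    catalog=catalog,
    copy_mode={"features": "assign", "predictions": "copy"},
)
```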
80 |
--------------------------------------------------------------------------------
/docs/source/05_API/01_python_objects/04_CLI.md:
--------------------------------------------------------------------------------
1 | # CLI commands
2 |
3 | ## ``init``
4 |
5 | ``kedro mlflow init``: this command is needed to initialize your project. You cannot run any other commands before you run this one once. It performs 2 actions:
6 | - creates a ``mlflow.yml`` configuration file in your ``conf/local`` folder
7 | - replaces the ``src/PYTHON_PACKAGE/run.py`` file with an updated version of the template. If your template has been modified since project creation, a warning will be raised. You can either run ``kedro mlflow init --force`` to ignore this warning (but this will erase your ``run.py``) or [set hooks manually](https://kedro-mlflow.readthedocs.io/en/latest/source/02_getting_started/01_installation/02_setup.html).
8 |
9 | `init` has two arguments:
10 |
11 | - `--env`, which enables specifying another environment where the `mlflow.yml` should be created (e.g. `base`)
12 | - `--force`, which overwrites the `mlflow.yml` if it already exists and replaces it with the default one. Use it with caution!
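
For instance, to create the ``mlflow.yml`` in the ``base`` environment instead of ``local``:

```console
kedro mlflow init --env=base
```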
13 |
14 | ## ``ui``
15 |
16 | ``kedro mlflow ui``: this command opens the mlflow UI (basically launches the ``mlflow ui`` command).
17 |
18 | `ui` accepts the port and host arguments of the [``mlflow ui`` command](https://www.mlflow.org/docs/latest/cli.html#mlflow-ui). The default values will be the ones defined in the [``mlflow.yml`` configuration file under the `ui` key](https://kedro-mlflow.readthedocs.io/en/latest/source/03_experiment_tracking/01_experiment_tracking/01_configuration.html).
19 |
20 | If you provide the arguments at runtime, they will take priority over the ``mlflow.yml``, e.g. if you have:
21 |
22 | ```yaml
23 | # mlflow.yml
24 | ui:
25 | localhost: "0.0.0.0"
26 | port: "5001"
27 | ```
28 |
29 | then
30 |
31 | ```console
32 | kedro mlflow ui --port=5002
33 | ```
34 |
35 | will open the ui on port 5002.
36 |
37 | ## ``modelify``
38 |
39 | ``kedro mlflow modelify``: this command converts a kedro pipeline to an mlflow model and logs it in mlflow. It enables distributing the kedro pipeline as a standalone model and leveraging all mlflow serving capabilities (e.g. serving it as an API).
40 |
41 | `modelify` accepts the following arguments:
42 | 
43 | - ``--pipeline``, ``-p``: The name of the kedro pipeline registered in ``pipeline_registry.py`` that you want to convert to an mlflow model.
44 | - ``--input-name``, ``-i``: The name of the kedro dataset (in ``catalog.yml``) which is the input of your pipeline. It contains the data to predict on.
45 | - ``--infer-signature``: A boolean which indicates if the signature of the input data should be inferred for mlflow or not.
46 | - ``--infer-input-example``: A boolean which indicates if the input_example of the input data should be inferred for mlflow or not.
47 | - ``--run-id``, ``-r``: The id of the mlflow run where the model will be logged. If unspecified, the command creates a new run.
48 | - ``--run-name``: The name of the mlflow run where the model will be logged. Defaults to ``"modelify"``.
49 | - ``--copy-mode``: The copy mode to use when replacing each dataset by a ``MemoryDataset``. Either a string (applied to all datasets) or a dict mapping each dataset to a ``copy_mode``.
50 | - ``--artifact-path``: The artifact path of mlflow.pyfunc.log_model, see https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model
51 | - ``--code-path``: The code path of mlflow.pyfunc.log_model, see https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model
52 | - ``--conda-env``: The conda environment of mlflow.pyfunc.log_model, see https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model
53 | - ``--registered-model-name``: The registered_model_name of mlflow.pyfunc.log_model, see https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model
54 | - ``--await-registration-for``: The await_registration_for of mlflow.pyfunc.log_model, see https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model
55 | - ``--pip-requirements``: The pip_requirements of mlflow.pyfunc.log_model, see https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model
56 | - ``--extra-pip-requirements``: The extra_pip_requirements of mlflow.pyfunc.log_model, see https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model
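
For example, a minimal invocation (the pipeline and dataset names are purely illustrative) which converts a registered ``inference`` pipeline whose input dataset is ``instances``:

```console
kedro mlflow modelify --pipeline=inference --input-name=instances
```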
57 |
--------------------------------------------------------------------------------
/docs/source/05_API/01_python_objects/05_Configuration.md:
--------------------------------------------------------------------------------
1 | # Configuration
2 |
3 | The python object is ``KedroMlflowConfig`` and it can be filled through ``mlflow.yml``.
4 |
5 | More details are coming soon.
6 |
--------------------------------------------------------------------------------
/docs/source/05_API/02_autoapi/kedro_mlflow.config.rst:
--------------------------------------------------------------------------------
1 | Configuration
2 | ====================================
3 |
4 | .. automodule:: kedro_mlflow.config.kedro_mlflow_config
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/source/05_API/02_autoapi/kedro_mlflow.framework.cli.rst:
--------------------------------------------------------------------------------
1 | CLI
2 | ====
3 |
4 | .. click:: kedro_mlflow.framework.cli.cli:init
5 | :prog: init
6 | :nested: full
7 |
8 | .. click:: kedro_mlflow.framework.cli.cli:ui
9 | :prog: ui
10 | :nested: full
11 |
12 | .. click:: kedro_mlflow.framework.cli.cli:modelify
13 | :prog: modelify
14 | :nested: full
15 |
--------------------------------------------------------------------------------
/docs/source/05_API/02_autoapi/kedro_mlflow.framework.hooks.rst:
--------------------------------------------------------------------------------
1 | Hooks
2 | ======
3 |
4 | Node Hook
5 | -----------
6 |
7 | .. automodule:: kedro_mlflow.framework.hooks.mlflow_hook
8 | :members:
9 | :undoc-members:
10 | :show-inheritance:
11 |
--------------------------------------------------------------------------------
/docs/source/05_API/02_autoapi/kedro_mlflow.io.rst:
--------------------------------------------------------------------------------
1 | Datasets
2 | ==================================
3 |
4 | Artifact Dataset
5 | -----------------
6 |
7 | .. automodule:: kedro_mlflow.io.artifacts.mlflow_artifact_dataset
8 | :members:
9 | :undoc-members:
10 | :show-inheritance:
11 |
12 | Metrics Dataset
13 | ----------------
14 |
15 | .. automodule:: kedro_mlflow.io.metrics.mlflow_metric_dataset
16 | :members:
17 | :undoc-members:
18 | :show-inheritance:
19 |
20 | .. automodule:: kedro_mlflow.io.metrics.mlflow_metric_history_dataset
21 | :members:
22 | :undoc-members:
23 | :show-inheritance:
24 |
25 |
26 | .. automodule:: kedro_mlflow.io.metrics.mlflow_metrics_history_dataset
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | Models Dataset
32 | ---------------
33 |
34 | .. automodule:: kedro_mlflow.io.models.mlflow_abstract_model_dataset
35 | :members:
36 | :undoc-members:
37 | :show-inheritance:
38 |
39 | .. automodule:: kedro_mlflow.io.models.mlflow_model_tracking_dataset
40 | :members:
41 | :undoc-members:
42 | :show-inheritance:
43 |
44 | .. automodule:: kedro_mlflow.io.models.mlflow_model_local_filesystem_dataset
45 | :members:
46 | :undoc-members:
47 | :show-inheritance:
48 |
49 | .. automodule:: kedro_mlflow.io.models.mlflow_model_registry_dataset
50 | :members:
51 | :undoc-members:
52 | :show-inheritance:
53 |
--------------------------------------------------------------------------------
/docs/source/05_API/02_autoapi/kedro_mlflow.mlflow.rst:
--------------------------------------------------------------------------------
1 | Custom Mlflow Models
2 | ====================
3 |
4 | .. automodule:: kedro_mlflow.mlflow.kedro_pipeline_model
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/source/05_API/02_autoapi/kedro_mlflow.pipeline.rst:
--------------------------------------------------------------------------------
1 | Pipelines
2 | =========
3 |
4 | .. automodule:: kedro_mlflow.pipeline.pipeline_ml
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
9 | .. automodule:: kedro_mlflow.pipeline.pipeline_ml_factory
10 | :members:
11 | :undoc-members:
12 | :show-inheritance:
13 |
--------------------------------------------------------------------------------
/docs/source/05_API/02_autoapi/kedro_mlflow.rst:
--------------------------------------------------------------------------------
1 | kedro\_mlflow package
2 | =====================
3 |
4 | .. toctree::
5 | :maxdepth: 6
6 |
7 | kedro_mlflow.io
8 | kedro_mlflow.framework.cli
9 | kedro_mlflow.pipeline
10 | kedro_mlflow.mlflow
11 | kedro_mlflow.config
12 | kedro_mlflow.framework.hooks
13 |
--------------------------------------------------------------------------------
/docs/source/05_API/index.md:
--------------------------------------------------------------------------------
1 |
2 | # API
3 |
4 | ```{toctree}
5 | :caption: Python objects
6 |
7 | 01_python_objects/01_Datasets
8 | 01_python_objects/02_Hooks
9 | 01_python_objects/03_Pipelines
10 | 01_python_objects/04_CLI
11 | 01_python_objects/05_Configuration
12 | ```
13 |
14 | ```{toctree}
15 | :caption: API
16 |
17 | 02_autoapi/kedro_mlflow
18 | ```
19 |
--------------------------------------------------------------------------------
/docs/source/06_migration_guide/index.md:
--------------------------------------------------------------------------------
1 | # Migration guides
2 |
3 | ```{toctree}
4 | :caption: Migrating between kedro-mlflow versions
5 |
6 | migration_guide_kedro_mlflow
7 |
8 | ```
9 |
10 | ```{toctree}
11 | :caption: Migrating from kedro-viz experiment tracking to kedro-mlflow
12 |
13 | migration_guide_kedro_experiment_tracking
14 | ```
15 |
--------------------------------------------------------------------------------
/docs/source/06_migration_guide/migration_guide_kedro_experiment_tracking.md:
--------------------------------------------------------------------------------
1 | # Migration guide from kedro-viz experiment tracking
2 |
3 | If you use Kedro's [native experiment tracking functionality](https://docs.kedro.org/projects/kedro-viz/en/v9.2.0/experiment_tracking.html), be aware that it will be deprecated from ``kedro-viz==0.11.0`` onwards.
4 |
5 | The core team suggests migrating to kedro-mlflow and [provides a blog post](https://kedro.org/blog/deprecate-experiment-tracking-kedro-viz) explaining the process.
6 |
7 |
8 | ::::::{grid} 1 2 2 2
9 | :gutter: 3
10 |
11 | :::::{grid-item-card}
12 | :link: https://kedro.org/blog/deprecate-experiment-tracking-kedro-viz
13 | :link-type: url
14 | :shadow: none
15 | :class-card: example-gallery
16 |
17 | :::{image} ../imgs/blogpost_migrate_experiment_tracking.png
18 | :::
19 | :::::
20 |
21 | ::::::
22 |
--------------------------------------------------------------------------------
/docs/source/imgs/apps_interaction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/apps_interaction.png
--------------------------------------------------------------------------------
/docs/source/imgs/blogpost_migrate_experiment_tracking.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/blogpost_migrate_experiment_tracking.png
--------------------------------------------------------------------------------
/docs/source/imgs/default_catalog.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/default_catalog.png
--------------------------------------------------------------------------------
/docs/source/imgs/etl_app.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/etl_app.png
--------------------------------------------------------------------------------
/docs/source/imgs/hook_registration_process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/hook_registration_process.png
--------------------------------------------------------------------------------
/docs/source/imgs/initialized_project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/initialized_project.png
--------------------------------------------------------------------------------
/docs/source/imgs/kedro_viz_params.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/kedro_viz_params.png
--------------------------------------------------------------------------------
/docs/source/imgs/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/logo.png
--------------------------------------------------------------------------------
/docs/source/imgs/ml_pipeline/preprocessing/all.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/preprocessing/all.PNG
--------------------------------------------------------------------------------
/docs/source/imgs/ml_pipeline/preprocessing/inference.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/preprocessing/inference.PNG
--------------------------------------------------------------------------------
/docs/source/imgs/ml_pipeline/preprocessing/training.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/preprocessing/training.PNG
--------------------------------------------------------------------------------
/docs/source/imgs/ml_pipeline/shared_inputs/all.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/shared_inputs/all.PNG
--------------------------------------------------------------------------------
/docs/source/imgs/ml_pipeline/shared_inputs/inference.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/shared_inputs/inference.PNG
--------------------------------------------------------------------------------
/docs/source/imgs/ml_pipeline/shared_inputs/training.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/shared_inputs/training.PNG
--------------------------------------------------------------------------------
/docs/source/imgs/ml_pipeline/tokenizer/all.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/tokenizer/all.PNG
--------------------------------------------------------------------------------
/docs/source/imgs/ml_pipeline/tokenizer/inference.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/tokenizer/inference.PNG
--------------------------------------------------------------------------------
/docs/source/imgs/ml_pipeline/tokenizer/training.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/tokenizer/training.PNG
--------------------------------------------------------------------------------
/docs/source/imgs/ml_pipeline/vanilla/all.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/vanilla/all.PNG
--------------------------------------------------------------------------------
/docs/source/imgs/ml_pipeline/vanilla/inference.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/vanilla/inference.PNG
--------------------------------------------------------------------------------
/docs/source/imgs/ml_pipeline/vanilla/training.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/vanilla/training.PNG
--------------------------------------------------------------------------------
/docs/source/imgs/mlflow_host_page.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/mlflow_host_page.png
--------------------------------------------------------------------------------
/docs/source/imgs/mlflow_run.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/mlflow_run.png
--------------------------------------------------------------------------------
/docs/source/imgs/mlflow_tracking_schema.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/mlflow_tracking_schema.png
--------------------------------------------------------------------------------
/docs/source/imgs/mlflow_yml.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/mlflow_yml.png
--------------------------------------------------------------------------------
/docs/source/imgs/once_run_project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/once_run_project.png
--------------------------------------------------------------------------------
/docs/source/imgs/run_with_artifact.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/run_with_artifact.png
--------------------------------------------------------------------------------
/docs/source/imgs/updated_catalog.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/updated_catalog.png
--------------------------------------------------------------------------------
/kedro_mlflow/__init__.py:
--------------------------------------------------------------------------------
1 | """kedro-mlflow plugin constants"""
2 |
3 | __version__ = "0.14.4"
4 |
5 | import logging
6 |
7 | logging.getLogger(__name__).setLevel(logging.INFO)
8 |
--------------------------------------------------------------------------------
/kedro_mlflow/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/kedro_mlflow/config/__init__.py
--------------------------------------------------------------------------------
/kedro_mlflow/config/resolvers.py:
--------------------------------------------------------------------------------
1 | from mlflow.utils.name_utils import _generate_random_name
2 |
3 |
4 | def resolve_random_name():
5 | # a resolver must have an argument, see: https://github.com/omry/omegaconf/issues/1060
6 | return _generate_random_name()
7 |
--------------------------------------------------------------------------------
/kedro_mlflow/framework/__init__.py:
--------------------------------------------------------------------------------
1 | """``kedro_mlflow.framework`` provides mlflow extensions for Kedro's framework components"""
2 |
--------------------------------------------------------------------------------
/kedro_mlflow/framework/cli/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/kedro_mlflow/framework/cli/__init__.py
--------------------------------------------------------------------------------
/kedro_mlflow/framework/cli/cli_utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Union
3 |
4 | from jinja2 import Environment, FileSystemLoader
5 |
6 |
7 | def render_jinja_template(
8 | src: Union[str, Path], is_cookiecutter=False, **kwargs
9 | ) -> str:
10 | """This functions enable to copy a file and render the
11 | tags (identified by {{ my_tag }}) with the values provided in kwargs.
12 |
13 | Arguments:
14 | src {Union[str, Path]} -- The path to the template which should be rendered
15 |
16 | Returns:
17 | str -- A string that contains all the files with replaced tags.
18 | """
19 | src = Path(src)
20 |
21 | template_loader = FileSystemLoader(searchpath=src.parent.as_posix())
22 | # the keep_trailing_new_line option is mandatory to
23 | # make sure that black formatting will be preserved
24 | template_env = Environment(loader=template_loader, keep_trailing_newline=True)
25 | template = template_env.get_template(src.name)
26 | if is_cookiecutter:
27 | # we need to match tags from a cookiecutter object
28 | # but cookiecutter only deals with folder, not file
29 | # thus we need to create an object with all necessary attributes
30 | class FalseCookieCutter:
31 | def __init__(self, **kwargs):
32 | self.__dict__.update(kwargs)
33 |
34 | parsed_template = template.render(cookiecutter=FalseCookieCutter(**kwargs))
35 | else:
36 | parsed_template = template.render(**kwargs)
37 |
38 | return parsed_template
39 |
40 |
41 | def write_jinja_template(
42 | src: Union[str, Path], dst: Union[str, Path], **kwargs
43 | ) -> None:
44 | """Write a template file and replace tis jinja's tags
45 | (identified by {{ my_tag }}) with the values provided in kwargs.
46 |
47 | Arguments:
48 | src {Union[str, Path]} -- Path to the template which should be rendered
49 | dst {Union[str, Path]} -- Path where the rendered template should be saved
50 | """
51 | dst = Path(dst)
52 | parsed_template = render_jinja_template(src, **kwargs)
53 | with open(dst, "w") as file_handler:
54 | file_handler.write(parsed_template)
55 |
--------------------------------------------------------------------------------
/kedro_mlflow/framework/hooks/__init__.py:
--------------------------------------------------------------------------------
1 | from .mlflow_hook import MlflowHook, mlflow_hook
2 |
3 | __all__ = ["MlflowHook", "mlflow_hook"]
4 |
--------------------------------------------------------------------------------
/kedro_mlflow/framework/hooks/utils.py:
--------------------------------------------------------------------------------
1 | from kedro_mlflow.config.kedro_mlflow_config import KedroMlflowConfig
2 |
3 |
4 | def _assert_mlflow_enabled(
5 | pipeline_name: str, mlflow_config: KedroMlflowConfig
6 | ) -> bool:
7 | # TODO: we may want to enable to filter on tags
8 | # but we need to deal with the case when several tags are passed
9 | # what to do if 1 out of 2 is in the list?
10 | disabled_pipelines = mlflow_config.tracking.disable_tracking.pipelines
11 | if pipeline_name in disabled_pipelines:
12 | return False
13 |
14 | return True
15 |
16 |
17 | def _generate_kedro_command(
18 | tags, node_names, from_nodes, to_nodes, from_inputs, load_versions, pipeline_name
19 | ):
20 | cmd_list = ["kedro", "run"]
21 | SEP = "="
22 | if from_inputs:
23 | cmd_list.append("--from-inputs" + SEP + ",".join(from_inputs))
24 | if from_nodes:
25 | cmd_list.append("--from-nodes" + SEP + ",".join(from_nodes))
26 | if to_nodes:
27 | cmd_list.append("--to-nodes" + SEP + ",".join(to_nodes))
28 | if node_names:
29 | cmd_list.append("--node" + SEP + ",".join(node_names))
30 | if pipeline_name:
31 | cmd_list.append("--pipeline" + SEP + pipeline_name)
32 | if tags:
33 | # "tag" is the name of the command, "tags" the value in run_params
34 | cmd_list.append("--tag" + SEP + ",".join(tags))
35 | if load_versions:
36 | # "load_version" is the name of the command, "load_versions" the value in run_params
37 | formatted_versions = [f"{k}:{v}" for k, v in load_versions.items()]
38 | cmd_list.append("--load-version" + SEP + ",".join(formatted_versions))
39 |
40 | kedro_cmd = " ".join(cmd_list)
41 | return kedro_cmd
42 |
43 |
44 | def _flatten_dict(d: dict, recursive: bool = True, sep: str = ".") -> dict:
45 | def expand(key, value):
46 | if isinstance(value, dict):
47 | new_value = (
48 | _flatten_dict(value, recursive=recursive, sep=sep)
49 | if recursive
50 | else value
51 | )
52 | return [(f"{key}{sep}{k}", v) for k, v in new_value.items()]
53 | else:
54 | return [(f"{key}", value)]
55 |
56 | items = [item for k, v in d.items() for item in expand(k, v)]
57 |
58 | return dict(items)
59 |
--------------------------------------------------------------------------------
/kedro_mlflow/io/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/kedro_mlflow/io/__init__.py
--------------------------------------------------------------------------------
/kedro_mlflow/io/artifacts/__init__.py:
--------------------------------------------------------------------------------
1 | from .mlflow_artifact_dataset import MlflowArtifactDataset
2 |
3 | __all__ = ["MlflowArtifactDataset"]
4 |
--------------------------------------------------------------------------------
/kedro_mlflow/io/catalog/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/kedro_mlflow/io/catalog/__init__.py
--------------------------------------------------------------------------------
/kedro_mlflow/io/catalog/switch_catalog_logging.py:
--------------------------------------------------------------------------------
1 | def switch_catalog_logging(catalog, logging_flag=True):
2 | for name, dataset in catalog._datasets.items():
3 | if type(dataset).__name__.startswith("Mlflow"):
4 | catalog._datasets[name]._logging_activated = logging_flag
5 |
--------------------------------------------------------------------------------
/kedro_mlflow/io/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | from .mlflow_metric_dataset import MlflowMetricDataset
2 | from .mlflow_metric_history_dataset import MlflowMetricHistoryDataset
3 | from .mlflow_metrics_history_dataset import MlflowMetricsHistoryDataset
4 |
5 | __all__ = [
6 | "MlflowMetricDataset",
7 | "MlflowMetricHistoryDataset",
8 | "MlflowMetricsHistoryDataset",
9 | ]
10 |
--------------------------------------------------------------------------------
/kedro_mlflow/io/metrics/mlflow_abstract_metric_dataset.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Optional, Union
2 |
3 | import mlflow
4 | from kedro.io import AbstractDataset
5 | from mlflow.tracking import MlflowClient
6 |
7 |
8 | class MlflowAbstractMetricDataset(AbstractDataset):
9 | def __init__(
10 | self,
11 | key: str = None,
12 | run_id: str = None,
13 | load_args: dict[str, Any] = None,
14 | save_args: dict[str, Any] = None,
15 | metadata: Optional[dict[str, Any]] = None,
16 | ):
17 | """Initialise MlflowMetricsHistoryDataset.
18 |
19 | Args:
20 | run_id (str): The ID of the mlflow run where the metric should be logged
21 | """
22 |
23 | self.key = key
24 | self.run_id = run_id
25 | self._load_args = load_args or {}
26 | self._save_args = save_args or {}
27 | self._logging_activated = True # by default, logging is activated!
28 | self.metadata = metadata
29 |
30 | @property
31 | def run_id(self) -> Union[str, None]:
32 | """Get run id."""
33 |
34 | run = mlflow.active_run()
35 | if (self._run_id is None) and (run is not None):
36 | # if no run_id is specified, we try to retrieve the current run
37 | # this is useful because during a kedro run, we want to be able to retrieve
38 | # the metric from the active run to be able to reload a metric
39 | # without specifying the (unknown) run id
40 | return run.info.run_id
41 |
42 | # else we return the _run_id which can eventually be None.
43 | # In this case, saving will work (a new run will be created)
44 | # but loading will fail,
45 | # according to mlflow's behaviour
46 | return self._run_id
47 |
48 | @run_id.setter
49 | def run_id(self, run_id: str):
50 | self._run_id = run_id
51 |
52 | # we want to be able to turn logging off for an entire pipeline run
53 | # To avoid that a single call to a dataset in the catalog creates a new run automatically
54 | # we want to be able to turn everything off
55 | @property
56 | def _logging_activated(self):
57 | return self.__logging_activated
58 |
59 | @_logging_activated.setter
60 | def _logging_activated(self, flag):
61 | if not isinstance(flag, bool):
62 | raise ValueError(f"_logging_activated must be a boolean, got {type(flag)}")
63 | self.__logging_activated = flag
64 |
65 | def _validate_run_id(self):
66 | if self.run_id is None:
67 | raise ValueError(
68 | "You must either specify a run_id or have a mlflow active run opened. Use mlflow.start_run() if necessary."
69 | )
70 |
71 | def _exists(self) -> bool:
72 | """Check if the metric exists in remote mlflow storage exists.
73 |
74 | Returns:
75 | bool: Does the metric name exist in the given run_id?
76 | """
77 | mlflow_client = MlflowClient()
78 | run_id = self.run_id # will get the active run if nothing is specified
79 | run = mlflow_client.get_run(run_id) if run_id else mlflow.active_run()
80 |
81 | flag_exist = self.key in run.data.metrics.keys() if run else False
82 | return flag_exist
83 |
84 | def _describe(self) -> dict[str, Any]:
85 | """Describe MLflow metrics dataset.
86 |
87 | Returns:
88 | dict[str, Any]: dictionary with MLflow metrics dataset description.
89 | """
90 | return {
91 | "key": self.key,
92 | "run_id": self.run_id,
93 | }
94 |
--------------------------------------------------------------------------------
/kedro_mlflow/io/metrics/mlflow_metric_dataset.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from typing import Any, Optional
3 |
4 | from mlflow.tracking import MlflowClient
5 |
6 | from kedro_mlflow.io.metrics.mlflow_abstract_metric_dataset import (
7 | MlflowAbstractMetricDataset,
8 | )
9 |
10 |
11 | class MlflowMetricDataset(MlflowAbstractMetricDataset):
12 | SUPPORTED_SAVE_MODES = {"overwrite", "append"}
13 | DEFAULT_SAVE_MODE = "overwrite"
14 |
15 | def __init__(
16 | self,
17 | key: str = None,
18 | run_id: str = None,
19 | load_args: dict[str, Any] = None,
20 | save_args: dict[str, Any] = None,
21 | metadata: Optional[dict[str, Any]] = None,
22 | ):
23 | """Initialise MlflowMetricDataset.
24 | Args:
25 | run_id (str): The ID of the mlflow run where the metric should be logged
26 | """
27 |
28 | super().__init__(key, run_id, load_args, save_args, metadata)
29 |
30 | # We add an extra argument mode="overwrite" / "append" to enable updating an existing metric when logging
31 | # this is not an official mlflow argument for log_metric, so we separate it from the others
32 | # "overwrite" corresponds to the default mlflow behaviour
33 | self.mode = self._save_args.pop("mode", self.DEFAULT_SAVE_MODE)
34 |
35 | def _load(self):
36 | self._validate_run_id()
37 | mlflow_client = MlflowClient()
38 | metric_history = mlflow_client.get_metric_history(
39 | run_id=self.run_id, key=self.key
40 | ) # gets active run if no run_id was given
41 |
42 | # the metric history is always a list of mlflow.entities.metric.Metric
43 | # we want the value of the last one stored because this dataset only deals with a single metric
44 | step = self._load_args.get("step")
45 |
46 | if step is None:
47 | # we take the last value recorded
48 | metric_value = metric_history[-1].value
49 | else:
50 | # we should take the last historical value with the given step
51 | # (it is possible to have several values with the same step)
52 | metric_value = next(
53 | metric.value
54 | for metric in reversed(metric_history)
55 | if metric.step == step
56 | )
57 |
58 | return metric_value
59 |
60 | def _save(self, data: float):
61 | if self._logging_activated:
62 | self._validate_run_id()
63 | run_id = self.run_id # we access it once instead of calling self.run_id everywhere to avoid looking for an active run each time
64 |
65 | mlflow_client = MlflowClient()
66 |
67 | # get the metric history if it has been saved previously to ensure
68 | # to retrieve the right data
69 | # reminder: this is True even if no run_id was originally specified but a run is active
70 | metric_history = (
71 | mlflow_client.get_metric_history(run_id=run_id, key=self.key)
72 | if self._exists()
73 | else []
74 | )
75 |
76 | save_args = deepcopy(self._save_args)
77 | step = save_args.pop("step", None)
78 | if step is None:
79 | if self.mode == "overwrite":
80 | step = max([metric.step for metric in metric_history], default=0)
81 | elif self.mode == "append":
82 | # I put a max([]) default to -1 so that default "step" equals 0
83 | step = (
84 | max([metric.step for metric in metric_history], default=-1) + 1
85 | )
86 | else:
87 | raise ValueError(
88 | f"save_args['mode'] must be one of {self.SUPPORTED_SAVE_MODES}, got '{self.mode}' instead."
89 | )
90 |
91 | mlflow_client.log_metric(
92 | run_id=run_id,
93 | key=self.key,
94 | value=data,
95 | step=step,
96 | **save_args,
97 | )
98 |
--------------------------------------------------------------------------------
/kedro_mlflow/io/metrics/mlflow_metric_history_dataset.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Optional, Union
2 |
3 | from mlflow.tracking import MlflowClient
4 |
5 | from kedro_mlflow.io.metrics.mlflow_abstract_metric_dataset import (
6 | MlflowAbstractMetricDataset,
7 | )
8 |
9 |
10 | class MlflowMetricHistoryDataset(MlflowAbstractMetricDataset):
11 | def __init__(
12 | self,
13 | key: str = None,
14 | run_id: str = None,
15 | load_args: dict[str, Any] = None,
16 | save_args: dict[str, Any] = None,
17 | metadata: Optional[dict[str, Any]] = None,
18 | ):
19 | """Initialise MlflowMetricDataset.
20 | Args:
21 | run_id (str): The ID of the mlflow run where the metric should be logged
22 | """
23 |
24 | super().__init__(key, run_id, load_args, save_args, metadata)
25 |
26 | def _load(self):
27 | self._validate_run_id()
28 | mode = self._load_args.get("mode", "list")
29 | mlflow_client = MlflowClient()
30 |
31 | metric_history = mlflow_client.get_metric_history(self.run_id, key=self.key)
32 |
33 | if mode == "list":
34 | simplified_history = [metric.value for metric in metric_history]
35 | elif mode == "dict":
36 | simplified_history = {
37 | metric.step: metric.value for metric in metric_history
38 | }
39 | elif mode == "history":
40 | # history is a list of dicts whose keys are "log_metric" arguments, e.g.:
41 | # [{"step": 0, "value": 0.1}, {"step": 1, "value": 0.2}, {"step": 2, "value": 0.3}]
42 | simplified_history = [
43 | {
44 | "step": metric.step,
45 | "value": metric.value,
46 | "timestamp": metric.timestamp,
47 | }
48 | for metric in metric_history
49 | ]
50 | return simplified_history
51 |
52 | def _save(
53 | self,
54 | data: Union[list[int], dict[int, float], list[dict[str, Union[float, str]]]],
55 | ):
56 | if self._logging_activated:
57 | self._validate_run_id()
58 | run_id = self.run_id
59 |
60 | mode = self._save_args.get("mode", "list")
61 | mlflow_client = MlflowClient()
62 | if mode == "list":
63 | # list is a list of value in sequential order:
64 | # [0.1,0.2,0.3]
65 | for i, value in enumerate(data):
66 | mlflow_client.log_metric(
67 | run_id=run_id, key=self.key, step=i, value=value
68 | )
69 | elif mode == "dict":
70 | # dict is a {step: value} mapping:
71 | # {0: 0.1, 1: 0.2, 2: 0.3}
72 | for step, value in data.items():
73 | mlflow_client.log_metric(
74 | run_id=run_id, key=self.key, step=step, value=value
75 | )
76 | elif mode == "history":
77 | # history is a list of dicts whose keys are "log_metric" arguments, e.g.:
78 | # [{"step": 0, "value": 0.1}, {"step": 1, "value": 0.2}, {"step": 2, "value": 0.3}]
79 | for log_kwargs in data:
80 | mlflow_client.log_metric(run_id=run_id, key=self.key, **log_kwargs)
81 |
--------------------------------------------------------------------------------
/kedro_mlflow/io/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .mlflow_model_local_filesystem_dataset import MlflowModelLocalFileSystemDataset
2 | from .mlflow_model_registry_dataset import MlflowModelRegistryDataset
3 | from .mlflow_model_tracking_dataset import MlflowModelTrackingDataset
4 |
5 | __all__ = [
6 | "MlflowModelLocalFileSystemDataset",
7 | "MlflowModelRegistryDataset",
8 | "MlflowModelTrackingDataset",
9 | ]
10 |
--------------------------------------------------------------------------------
/kedro_mlflow/io/models/mlflow_abstract_model_dataset.py:
--------------------------------------------------------------------------------
1 | from importlib import import_module
2 | from importlib.util import find_spec
3 | from pathlib import Path
4 | from typing import Any, Optional
5 |
6 | from kedro.io import AbstractVersionedDataset, Version
7 | from kedro.io.core import DatasetError
8 |
9 |
10 | class MlflowAbstractModelDataSet(AbstractVersionedDataset):
11 | """
12 | Abstract mother class for model datasets.
13 | """
14 |
15 | def __init__(
16 | self,
17 | filepath: str,
18 | flavor: str,
19 | pyfunc_workflow: Optional[str] = None,
20 | load_args: dict[str, Any] = None,
21 | save_args: dict[str, Any] = None,
22 | version: Version = None,
23 | metadata: Optional[dict[str, Any]] = None,
24 | ) -> None:
25 | """Initialize the Kedro MlflowAbstractModelDataSet.
26 |
27 | Parameters are passed from the Data Catalog.
28 |
29 | During save, the model is first logged to MLflow.
30 | During load, the model is pulled from MLflow run with `run_id`.
31 |
32 | Args:
33 | filepath (str): Path to store the dataset locally.
34 | flavor (str): Built-in or custom MLflow model flavor module.
35 | Must be Python-importable.
36 | pyfunc_workflow (str, optional): Either `python_model` or `loader_module`.
37 | See https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#workflows.
38 | load_args (dict[str, Any], optional): Arguments to `load_model`
39 | function from specified `flavor`. Defaults to {}.
40 | save_args (dict[str, Any], optional): Arguments to `log_model`
41 | function from specified `flavor`. Defaults to {}.
42 | version (Version, optional): Specific version to load.
43 | metadata: Any arbitrary metadata.
44 | This is ignored by Kedro, but may be consumed by users or external plugins.
45 |
46 | Raises:
47 | DatasetError: When passed `flavor` does not exist.
48 | """
49 |
50 | super().__init__(Path(filepath), version)
51 |
52 | self._flavor = flavor
53 | self._pyfunc_workflow = pyfunc_workflow
54 | self._logging_activated = True # by default, it should be True!
55 |
56 | if flavor == "mlflow.pyfunc" and pyfunc_workflow not in (
57 | "python_model",
58 | "loader_module",
59 | ):
60 | raise DatasetError(
61 | "PyFunc models require specifying `pyfunc_workflow` "
62 | "(set to either `python_model` or `loader_module`)"
63 | )
64 |
65 | self._load_args = load_args or {}
66 | self._save_args = save_args or {}
67 | self.metadata = metadata
68 |
69 | try:
70 | self._mlflow_model_module
71 | except ImportError as err:
72 | raise DatasetError(err)
73 |
74 | # we want to be able to turn logging off for an entire pipeline run
75 | # To avoid that a single call to a dataset in the catalog creates a new run automatically
76 | # we want to be able to turn everything off
77 | @property
78 | def _logging_activated(self):
79 | return self.__logging_activated
80 |
81 | @_logging_activated.setter
82 | def _logging_activated(self, flag):
83 | if not isinstance(flag, bool):
84 | raise ValueError(f"_logging_activated must be a boolean, got {type(flag)}")
85 | self.__logging_activated = flag
86 |
87 | # IMPORTANT: _mlflow_model_module is a property to avoid STORING
88 | # the module as an attribute but rather store a string and load on the fly
89 | # The goal is to make this DataSet deepcopiable for compatibility with
90 | # KedroPipelineModel, e.g we can't just do :
91 | # self._mlflow_model_module = self._import_module(self._flavor)
92 |
93 | @property
94 | def _mlflow_model_module(self): # pragma: no cover
95 | pass
96 |
97 | @_mlflow_model_module.getter
98 | def _mlflow_model_module(self):
99 | return self._import_module(self._flavor)
100 |
101 | # TODO: check with Kajetan what was originally intended here
102 | # @classmethod
103 | # def _parse_args(cls, kwargs_dict: dict[str, Any]) -> dict[str, Any]:
104 | # parsed_kargs = {}
105 | # for key, value in kwargs_dict.items():
106 | # if key.endswith("_args"):
107 | # continue
108 | # if f"{key}_args" in kwargs_dict:
109 | # new_value = cls._import_module(value)(
110 | # MlflowModelDataSet._parse_args(kwargs_dict[f"{key}_args"])
111 | # )
112 | # parsed_kargs[key] = new_value
113 | # else:
114 | # parsed_kargs[key] = value
115 | # return parsed_kargs
116 |
117 | @staticmethod
118 | def _import_module(import_path: str) -> Any:
119 | exists = find_spec(import_path)
120 |
121 | if not exists:
122 | raise ImportError(
123 | f"'{import_path}' module not found. Check valid flavor in mlflow documentation: https://www.mlflow.org/docs/latest/python_api/index.html"
124 | )
125 |
126 | return import_module(import_path)
127 |
--------------------------------------------------------------------------------
/kedro_mlflow/io/models/mlflow_model_local_filesystem_dataset.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from os.path import exists
3 | from typing import Any, Optional
4 |
5 | from kedro.io import Version
6 |
7 | from kedro_mlflow.io.models.mlflow_abstract_model_dataset import (
8 | MlflowAbstractModelDataSet,
9 | )
10 |
11 |
12 | class MlflowModelLocalFileSystemDataset(MlflowAbstractModelDataSet):
13 | """Wrapper for saving, logging and loading for all MLflow model flavor."""
14 |
15 | def __init__(
16 | self,
17 | filepath: str,
18 | flavor: str,
19 | pyfunc_workflow: Optional[str] = None,
20 | load_args: dict[str, Any] = None,
21 | save_args: dict[str, Any] = None,
22 | log_args: dict[str, Any] = None,
23 | version: Version = None,
24 | metadata: Optional[dict[str, Any]] = None,
25 | ) -> None:
26 | """Initialize the Kedro MlflowModelDataSet.
27 |
28 | Parameters are passed from the Data Catalog.
29 |
30 | During save, the model is saved locally at `filepath`
31 | During load, the model is loaded from the local `filepath`.
32 |
33 | Args:
34 | flavor (str): Built-in or custom MLflow model flavor module.
35 | Must be Python-importable.
36 | filepath (str): Path to store the dataset locally.
37 | pyfunc_workflow (str, optional): Either `python_model` or `loader_module`.
38 | See https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#workflows.
39 | load_args (dict[str, Any], optional): Arguments to `load_model`
40 | function from specified `flavor`. Defaults to None.
41 | save_args (dict[str, Any], optional): Arguments to `save_model`
42 | function from specified `flavor`. Defaults to None.
43 | version (Version, optional): Kedro version to use. Defaults to None.
44 | metadata: Any arbitrary metadata.
45 | This is ignored by Kedro, but may be consumed by users or external plugins.
46 |
47 | Raises:
48 | DatasetError: When passed `flavor` does not exist.
49 | """
50 | super().__init__(
51 | filepath=filepath,
52 | flavor=flavor,
53 | pyfunc_workflow=pyfunc_workflow,
54 | load_args=load_args,
55 | save_args=save_args,
56 | version=version,
57 | metadata=metadata,
58 | )
59 |
60 | def _load(self) -> Any:
61 | """Loads an MLflow model from local path or from MLflow run.
62 |
63 | Returns:
64 | Any: Deserialized model.
65 | """
66 | return self._mlflow_model_module.load_model(
67 | model_uri=self._get_load_path().as_uri(), **self._load_args
68 | )
69 |
70 | def _save(self, model: Any) -> None:
71 | """Save a model to local path and then logs it to MLflow.
72 |
73 | Args:
74 | model (Any): A model object supported by the given MLflow flavor.
75 | """
76 | save_path = self._get_save_path()
77 | # In case of an unversioned model we need to remove the save path
78 | # because MLflow cannot overwrite the target directory.
79 | if exists(save_path):
80 | shutil.rmtree(save_path)
81 |
82 | if self._flavor == "mlflow.pyfunc":
83 | # PyFunc models utilise either `python_model` or `loader_module`
84 | # workflow. We assign the passed `model` object to one of those keys
85 | # depending on the chosen `pyfunc_workflow`.
86 | self._save_args[self._pyfunc_workflow] = model
87 | self._mlflow_model_module.save_model(save_path, **self._save_args)
88 | else:
89 | # Otherwise we save using the common workflow where first argument is the
90 | # model object and second is the path.
91 | self._mlflow_model_module.save_model(model, save_path, **self._save_args)
92 |
93 | def _describe(self) -> dict[str, Any]:
94 | return dict(
95 | filepath=self._filepath,
96 | flavor=self._flavor,
97 | pyfunc_workflow=self._pyfunc_workflow,
98 | load_args=self._load_args,
99 | save_args=self._save_args,
100 | version=self._version,
101 | )
102 |
--------------------------------------------------------------------------------
/kedro_mlflow/io/models/mlflow_model_registry_dataset.py:
--------------------------------------------------------------------------------
1 | from logging import Logger, getLogger
2 | from typing import Any, Optional, Union
3 |
4 | from kedro.io.core import DatasetError
5 |
6 | from kedro_mlflow.io.models.mlflow_abstract_model_dataset import (
7 | MlflowAbstractModelDataSet,
8 | )
9 |
10 |
11 | class MlflowModelRegistryDataset(MlflowAbstractModelDataSet):
12 | """Wrapper for saving, logging and loading for all MLflow model flavor."""
13 |
14 | def __init__(
15 | self,
16 | model_name: str,
17 | stage_or_version: Union[str, int, None] = None,
18 | alias: Optional[str] = None,
19 | flavor: Optional[str] = "mlflow.pyfunc",
20 | pyfunc_workflow: Optional[str] = "python_model",
21 | load_args: Optional[dict[str, Any]] = None,
22 | metadata: Optional[dict[str, Any]] = None,
23 | ) -> None:
24 | """Initialize the Kedro MlflowModelRegistryDataset.
25 |
26 | Parameters are passed from the Data Catalog.
27 |
28 | During "load", the model is pulled from MLflow model registry by its name.
29 | "save" is not supported.
30 |
31 | Args:
32 | model_name (str): The name of the registered model in the mlflow registry.
33 | stage_or_version (str): A valid stage (either "staging" or "production") or version number for the registered model.
34 | Defaults to "latest", which fetches the last version and the highest "stage" available.
35 | flavor (str): Built-in or custom MLflow model flavor module.
36 | Must be Python-importable.
37 | pyfunc_workflow (str, optional): Either `python_model` or `loader_module`.
38 | See https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#workflows.
39 | load_args (dict[str, Any], optional): Arguments to `load_model`
40 | function from specified `flavor`. Defaults to None.
41 | metadata: Any arbitrary metadata.
42 | This is ignored by Kedro, but may be consumed by users or external plugins.
43 |
44 | Raises:
45 | DatasetError: When passed `flavor` does not exist.
46 | """
47 | super().__init__(
48 | filepath="",
49 | flavor=flavor,
50 | pyfunc_workflow=pyfunc_workflow,
51 | load_args=load_args,
52 | save_args={},
53 | version=None,
54 | metadata=metadata,
55 | )
56 |
57 | if alias is None and stage_or_version is None:
58 | # reassign stage_or_version to "latest"
59 | stage_or_version = "latest"
60 |
61 | if alias and stage_or_version:
62 | raise DatasetError(
63 | f"You cannot specify 'alias' and 'stage_or_version' simultaneously ({alias=} and {stage_or_version=})"
64 | )
65 |
66 | self.model_name = model_name
67 | self.stage_or_version = stage_or_version
68 | self.alias = alias
69 | self.model_uri = (
70 | f"models:/{model_name}@{alias}"
71 | if alias
72 | else f"models:/{model_name}/{stage_or_version}"
73 | )
74 |
75 | @property
76 | def _logger(self) -> Logger:
77 | return getLogger(__name__)
78 |
79 | def _load(self) -> Any:
80 | """Loads an MLflow model from local path or from MLflow run.
81 |
82 | Returns:
83 | Any: Deserialized model.
84 | """
85 |
86 | # If `run_id` is specified, pull the model from MLflow.
87 | # TODO: enable loading from another mlflow conf (with a client with another tracking uri)
88 | # Alternatively, use local path to load the model.
89 | model = self._mlflow_model_module.load_model(
90 | model_uri=self.model_uri, **self._load_args
91 | )
92 |
93 | # log some info because "latest" model is not very informative
94 | # the model itself does not have information about its registry
95 | # because the same run can be registered under several different names
96 | # in the registry. See https://github.com/Galileo-Galilei/kedro-mlflow/issues/552
97 |
98 | self._logger.info(f"Loading model from run_id='{model.metadata.run_id}'")
99 | return model
100 |
101 | def _save(self, model: Any) -> None:
102 | raise NotImplementedError(
103 | "The 'save' method is not implemented for MlflowModelRegistryDataset. You can pass 'registered_model_name' argument in 'MLflowModelTrackingDataset(..., save_args={registered_model_name='my_model'}' to save and register a model in the same step. "
104 | )
105 |
106 | def _describe(self) -> dict[str, Any]:
107 | return dict(
108 | model_uri=self.model_uri,
109 | model_name=self.model_name,
110 | stage_or_version=self.stage_or_version,
111 | alias=self.alias,
112 | flavor=self._flavor,
113 | pyfunc_workflow=self._pyfunc_workflow,
114 | # load_args=self._load_args,
115 | )
116 |
--------------------------------------------------------------------------------
/kedro_mlflow/io/models/mlflow_model_tracking_dataset.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Optional
2 |
3 | import mlflow
4 | from kedro.io.core import DatasetError
5 |
6 | from kedro_mlflow.io.models.mlflow_abstract_model_dataset import (
7 | MlflowAbstractModelDataSet,
8 | )
9 |
10 |
11 | class MlflowModelTrackingDataset(MlflowAbstractModelDataSet):
12 | """Wrapper for saving, logging and loading for all MLflow model flavor."""
13 |
14 | def __init__(
15 | self,
16 | flavor: str,
17 | run_id: Optional[str] = None,
18 | artifact_path: Optional[str] = "model",
19 | pyfunc_workflow: Optional[str] = None,
20 | load_args: Optional[dict[str, Any]] = None,
21 | save_args: Optional[dict[str, Any]] = None,
22 | metadata: Optional[dict[str, Any]] = None,
23 | ) -> None:
24 | """Initialize the Kedro MlflowModelDataSet.
25 |
26 | Parameters are passed from the Data Catalog.
27 |
28 | During save, the model is first logged to MLflow.
29 | During load, the model is pulled from MLflow run with `run_id`.
30 |
31 | Args:
32 | flavor (str): Built-in or custom MLflow model flavor module.
33 | Must be Python-importable.
34 | run_id (Optional[str], optional): MLflow run ID to use to load
35 | the model from or save the model to. Defaults to None.
36 | artifact_path (str, optional): the run relative path to
37 | the model.
38 | pyfunc_workflow (str, optional): Either `python_model` or `loader_module`.
39 | See https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#workflows.
40 | load_args (dict[str, Any], optional): Arguments to `load_model`
41 | function from specified `flavor`. Defaults to None.
42 | save_args (dict[str, Any], optional): Arguments to `log_model`
43 | function from specified `flavor`. Defaults to None.
44 | metadata: Any arbitrary metadata.
45 | This is ignored by Kedro, but may be consumed by users or external plugins.
46 |
47 | Raises:
48 | DatasetError: When passed `flavor` does not exist.
49 | """
50 | super().__init__(
51 | filepath="",
52 | flavor=flavor,
53 | pyfunc_workflow=pyfunc_workflow,
54 | load_args=load_args,
55 | save_args=save_args,
56 | version=None,
57 | metadata=metadata,
58 | )
59 |
60 | self._run_id = run_id
61 | self._artifact_path = artifact_path
62 |
63 | # drop the key which MUST be common to save and load and
64 | # thus is instantiated outside save_args
65 | self._save_args.pop("artifact_path", None)
66 |
67 | @property
68 | def model_uri(self):
69 | run_id = None
70 | if self._run_id:
71 | run_id = self._run_id
72 | elif mlflow.active_run() is not None:
73 | run_id = mlflow.active_run().info.run_id
74 | if run_id is None:
75 | raise DatasetError(
76 | "To access the model_uri, you must either: "
77 |                 "\n - specify 'run_id' "
78 | "\n - have an active run to retrieve data from"
79 | )
80 |
81 | model_uri = f"runs:/{run_id}/{self._artifact_path}"
82 |
83 | return model_uri
84 |
85 | def _load(self) -> Any:
86 |         """Loads an MLflow model from the MLflow run referenced by ``model_uri``.
87 |
88 | Returns:
89 | Any: Deserialized model.
90 | """
91 |
92 |         # the model is always pulled from MLflow through its "runs:/" URI,
93 |         # built from `run_id` (or the active run) and `artifact_path`.
94 |         # TODO: enable loading from another mlflow conf (with a client with another tracking uri)
95 | return self._mlflow_model_module.load_model(
96 | model_uri=self.model_uri, **self._load_args
97 | )
98 |
99 | def _save(self, model: Any) -> None:
100 |         """Saves a model by logging it to the current (or specified) MLflow run.
101 |
102 | Args:
103 | model (Any): A model object supported by the given MLflow flavor.
104 | """
105 | if self._run_id:
106 | if mlflow.active_run():
107 |                 # it is not possible to log to a run other than the currently open one
108 | raise DatasetError(
109 | f"'run_id' cannot be specified (run_id='{self._run_id}') "
110 |                     f"if there is an active mlflow run (active run id='{mlflow.active_run().info.run_id}'). "
111 | f"See the rationale in this issue: https://github.com/Galileo-Galilei/kedro-mlflow/issues/549."
112 | )
113 | else:
114 | # if the run id is specified and there is no opened run,
115 | # open the right run before logging
116 | with mlflow.start_run(run_id=self._run_id):
117 | self._save_model_in_run(model)
118 | else:
119 | # if there is no run_id, log in active run
120 | # OR open automatically a new run to log
121 | self._save_model_in_run(model)
122 |
123 | def _save_model_in_run(self, model):
124 | if self._flavor == "mlflow.pyfunc":
125 |             # PyFunc models use either the `python_model` or the `loader_module`
126 |             # workflow. We assign the passed `model` object to one of those keys
127 |             # depending on the chosen `pyfunc_workflow`.
128 | self._save_args[self._pyfunc_workflow] = model
129 | if self._logging_activated:
130 | self._mlflow_model_module.log_model(
131 | self._artifact_path, **self._save_args
132 | )
133 | elif self._logging_activated:
134 |             # Otherwise we save using the common workflow where the first argument is the
135 |             # model object and the second is the artifact path.
136 | self._mlflow_model_module.log_model(
137 | model, self._artifact_path, **self._save_args
138 | )
139 |
140 | def _describe(self) -> dict[str, Any]:
141 | return dict(
142 | flavor=self._flavor,
143 | run_id=self._run_id,
144 | artifact_path=self._artifact_path,
145 | pyfunc_workflow=self._pyfunc_workflow,
146 | load_args=self._load_args,
147 | save_args=self._save_args,
148 | )
149 |
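As a rough usage sketch (the run id below is a placeholder, not a value from this repository), the same dataset can later reload a logged model from a finished run by passing run_id:

    from kedro_mlflow.io.models import MlflowModelTrackingDataset

    dataset = MlflowModelTrackingDataset(
        flavor="mlflow.sklearn",
        run_id="0123456789abcdef",  # hypothetical run id of a previous training run
        artifact_path="model",
    )

    # builds the "runs:/0123456789abcdef/model" URI and delegates to mlflow.sklearn.load_model
    model = dataset.load()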
--------------------------------------------------------------------------------
/kedro_mlflow/mlflow/__init__.py:
--------------------------------------------------------------------------------
1 | from .kedro_pipeline_model import KedroPipelineModel # noqa: F401
2 |
--------------------------------------------------------------------------------
/kedro_mlflow/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | from .pipeline_ml_factory import pipeline_ml_factory
2 |
3 | __all__ = ["pipeline_ml_factory"]
4 |
--------------------------------------------------------------------------------
/kedro_mlflow/pipeline/pipeline_ml_factory.py:
--------------------------------------------------------------------------------
1 | from kedro.pipeline import Pipeline
2 |
3 | from kedro_mlflow.pipeline.pipeline_ml import PipelineML
4 |
5 |
6 | def pipeline_ml_factory(
7 | training: Pipeline,
8 | inference: Pipeline,
9 | input_name: str = None,
10 | kpm_kwargs=None,
11 | log_model_kwargs=None,
12 | ) -> PipelineML:
13 |     """This function is a helper to create a `PipelineML`
14 |     object directly from two Kedro `Pipeline`s (one for
15 |     training and one for inference).
16 |
17 | Args:
18 | training (Pipeline): The `Pipeline` object that creates
19 | all mlflow artifacts for prediction (the model,
20 | but also encoders, binarizers, tokenizers...).
21 | These artifacts must be persisted in the catalog.yml.
22 | inference (Pipeline): A `Pipeline` object which will be
23 |             stored in mlflow and uses the output(s)
24 | of the training pipeline (namely, the model)
25 | to predict the outcome.
26 | input_name (str, optional): The name of the dataset in
27 | the catalog.yml which the model's user must provide
28 | for prediction (i.e. the data). Defaults to None.
29 | kpm_kwargs:
30 | extra arguments to be passed to `KedroPipelineModel`
31 | when the PipelineML object is automatically saved at the end of a run.
32 | This includes:
33 |             - `copy_mode`: the copy_mode to be used for the underlying datasets
34 | when loaded in memory
35 | - `runner`: the kedro runner to run the model with
36 |         log_model_kwargs:
37 | extra arguments to be passed to `mlflow.pyfunc.log_model`
38 | when the PipelineML object is automatically saved at the end of a run.
39 |             See the mlflow documentation for all available options: https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model
40 |
41 | Returns:
42 | PipelineML: A `PipelineML` which is automatically
43 | discovered by the `MlflowHook` and
44 | contains all the information for logging the
45 |         inference pipeline as an MLflow Model.
46 | """
47 |
48 | pipeline = PipelineML(
49 | nodes=training.nodes,
50 | inference=inference,
51 | input_name=input_name,
52 | kpm_kwargs=kpm_kwargs,
53 | log_model_kwargs=log_model_kwargs,
54 | )
55 | return pipeline
56 |
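A hedged sketch of how pipeline_ml_factory might be wired in a project's pipeline_registry.py; the node functions, tags and dataset names are illustrative, and the "model" output would need to be persisted in catalog.yml so it can be packaged as an mlflow artifact:

    from kedro.pipeline import Pipeline, node

    from kedro_mlflow.pipeline import pipeline_ml_factory

    def train_model(data):
        return 2  # stand-in for real training logic

    def predict(model, data):
        return data * model  # stand-in for real inference logic

    full_pipeline = Pipeline(
        [
            node(train_model, inputs="raw_data", outputs="model", tags=["training"]),
            node(predict, inputs=["model", "raw_data"], outputs="predictions", tags=["inference"]),
        ]
    )

    training_pipeline = pipeline_ml_factory(
        training=full_pipeline.only_nodes_with_tags("training"),
        inference=full_pipeline.only_nodes_with_tags("inference"),
        input_name="raw_data",  # the dataset the end user supplies at prediction time
    )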
--------------------------------------------------------------------------------
/kedro_mlflow/template/project/mlflow.yml:
--------------------------------------------------------------------------------
1 | # SERVER CONFIGURATION -------------------
2 |
3 | # `mlflow_tracking_uri` is the path where the runs will be recorded.
4 | # For more information, see https://www.mlflow.org/docs/latest/tracking.html#where-runs-are-recorded
5 | # kedro-mlflow accepts relative path from the project root.
6 | # For instance, the default `mlruns` will create an `mlruns` folder
7 | # at the root of the project
8 |
9 | # All credentials needed for mlflow must be stored in credentials.yml as a dict;
10 | # they will be exported as environment variables.
11 | # If you want to set some credentials, e.g. AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
12 | # > in `credentials.yml`:
13 | # your_mlflow_credentials:
14 | # AWS_ACCESS_KEY_ID: 132456
15 | # AWS_SECRET_ACCESS_KEY: 132456
16 | # > in this file `mlflow.yml`:
17 | # credentials: mlflow_credentials
18 |
19 | server:
20 | mlflow_tracking_uri: null # if null, will use mlflow.get_tracking_uri() as a default
21 | mlflow_registry_uri: null # if null, mlflow_tracking_uri will be used as mlflow default
22 | credentials: null # must be a valid key in credentials.yml which refers to a dict of sensitive mlflow environment variables (password, tokens...). See top of the file.
23 |     request_header_provider: # this is only useful to deal with expiring tokens, see https://github.com/Galileo-Galilei/kedro-mlflow/issues/357
24 |         type: null # The path to a class, e.g. my_project.pipelines.module.MyClass. Should inherit from https://github.com/mlflow/mlflow/blob/master/mlflow/tracking/request_header/abstract_request_header_provider.py#L4
25 | pass_context: False # should the class be instantiated with "kedro_context" argument?
26 | init_kwargs: {} # any kwargs to pass to the class when it is instantiated
27 |
28 | tracking:
29 |     # You can specify a list of pipeline names for which tracking will be disabled.
30 |     # Running "kedro run --pipeline=<pipeline_name>" will then not log parameters
31 |     # in a new mlflow run
32 |
33 | disable_tracking:
34 |         disable_autologging: True # If True, we force autologging to be disabled. This is useful on Databricks, where autologging is enabled by default and conflicts with the plugin. If False, we keep mlflow's default behaviour (autologging is disabled by default anyway).
35 | pipelines: []
36 |
37 | experiment:
38 | name: {{ python_package }}
39 | create_experiment_kwargs: # will be used only if the experiment does not exist yet and is created.
40 |             artifact_location: null # enables specifying an artifact location for the experiment different from the global one of the mlflow server
41 | tags: null # a dict of tags for the experiment
42 |         restore_if_deleted: True # if the experiment `name` was previously deleted, should we restore it?
43 |
44 | run:
45 | id: null # if `id` is None, a new run will be created
46 | name: null # if `name` is None, pipeline name will be used for the run name. You can use "${km.random_name:}" to generate a random name (mlflow's default)
47 | nested: True # if `nested` is False, you won't be able to launch sub-runs inside your nodes
48 |
49 | params:
50 | dict_params:
51 |             flatten: False # if True, parameters which are dictionaries will be split into multiple parameters when logged in mlflow, one for each key.
52 |             recursive: True # Should the dictionary flattening be applied recursively (i.e. for nested dictionaries)? Not used if `flatten` is False.
53 |             sep: "." # In case of recursive flattening, what separator should be used between the keys? E.g. {hyperparam1: {p1:1, p2:2}} will be logged as hyperparam1.p1 and hyperparam1.p2 in mlflow.
54 |         long_params_strategy: fail # One of ["fail", "tag", "truncate"]. If a parameter is above the mlflow limit (currently 250 characters), what should kedro-mlflow do? -> fail, log it as a tag instead of a parameter, or truncate it to its first 250 characters.
55 |
56 |
57 | # UI-RELATED PARAMETERS -----------------
58 |
59 | ui:
60 |     port: "5000" # the port to use for the ui. Defaults to mlflow's default of 5000.
61 |     host: "127.0.0.1" # the host to use for the ui. Defaults to mlflow's default of "127.0.0.1".
62 |
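The `request_header_provider` section above only points to a class path. A hedged sketch of such a class, assuming mlflow's `RequestHeaderProvider` base class referenced in the comment and a purely illustrative token source:

    from mlflow.tracking.request_header.abstract_request_header_provider import (
        RequestHeaderProvider,
    )

    class MyRequestHeaderProvider(RequestHeaderProvider):
        """Illustrative provider injecting an auth header into every mlflow request."""

        def in_context(self):
            return True  # always active

        def request_headers(self):
            return {"Authorization": "Bearer <token>"}  # placeholder token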
--------------------------------------------------------------------------------
/kedro_mlflow/utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Any, Union
3 |
4 |
5 | def _is_project(project_path: Union[str, Path]) -> bool:
6 | try:
7 |         # untested in the CI, for retrocompatibility with kedro >=0.19.0,<0.19.3
8 | from kedro.framework.startup import _is_project as _ip
9 | except ImportError:
10 | from kedro.utils import _is_project as _ip
11 |
12 | return _ip(project_path)
13 |
14 |
15 | def _find_kedro_project(current_dir: Path) -> Any:
16 | try:
17 |         # untested in the CI, for retrocompatibility with kedro >=0.19.0,<0.19.3
18 | from kedro.framework.startup import _find_kedro_project as _fkp
19 | except ImportError:
20 | from kedro.utils import _find_kedro_project as _fkp
21 |
22 | return _fkp(current_dir)
23 |
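For illustration only, assuming the current working directory sits somewhere inside a Kedro project, these helpers could be used as follows:

    from pathlib import Path

    from kedro_mlflow.utils import _find_kedro_project, _is_project

    # walk up the directory tree looking for a Kedro project root
    project_path = _find_kedro_project(Path.cwd())
    if project_path is not None and _is_project(project_path):
        print(f"Found a Kedro project at {project_path}")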
--------------------------------------------------------------------------------
/mlc_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "aliveStatusCodes": [
3 | 429,
4 | 200
5 | ]
6 | }
7 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | # PEP-518 https://peps.python.org/pep-0518/
2 |
3 | [build-system]
4 | # Minimum requirements for the build system to execute.
5 | requires = ["setuptools>=65.5.1", "setuptools-scm>=8.0"] # PEP 518 specifications
6 | build-backend = "setuptools.build_meta"
7 |
8 | [project]
9 | name = "kedro_mlflow"
10 | authors = [
11 | {name = "Yolan Honoré-Rougé"}
12 | ]
13 | description = "A kedro-plugin to use mlflow in your kedro projects"
14 | requires-python = ">=3.9"
15 | dependencies = [
16 | "kedro>=0.19.0, <0.20.0",
17 | "kedro_datasets",
18 | "mlflow>=2.7.0, <3.0.0",
19 | "pydantic>=1.0.0, <3.0.0"
20 | ]
21 | keywords = [
22 | "kedro-plugin",
23 | "kedro",
24 | "mlflow",
25 | "experiment tracking",
26 | "model versioning",
27 | "model serving",
28 | "machine learning",
29 | "data pipelines",
30 | "data science",
31 | "ml engineering",
32 | "mlops"
33 | ]
34 | license = {text = "Apache Software License (Apache 2.0)"}
35 | classifiers = [
36 | "Development Status :: 4 - Beta",
37 | "Programming Language :: Python :: 3.9",
38 | "Programming Language :: Python :: 3.10",
39 | "Programming Language :: Python :: 3.11",
40 | "Programming Language :: Python :: 3.12",
41 | "Programming Language :: Python :: 3.13",
42 | "Framework :: Kedro",
43 | "Environment :: Plugins",
44 | "Intended Audience :: Developers",
45 | "Operating System :: Microsoft :: Windows",
46 | "Operating System :: MacOS",
47 | "Operating System :: POSIX :: Linux",
48 | ]
49 | dynamic = ["readme", "version"]
50 |
51 | [project.optional-dependencies]
52 | test = [
53 | "pytest>=5.4.0, <9.0.0",
54 | "pytest-cov>=2.8.0, <7.0.0",
55 |     "pytest-lazy-fixtures>=1.0.0, <2.0.0", # pytest==8.0.0 breaks pytest-lazy-fixture (without the final "s"): https://github.com/TvoroG/pytest-lazy-fixture/issues/65
56 | "pytest-mock>=3.1.0, <4.0.0",
57 |     "pytest-xdist>=3.0.0,<4.0.0", # messes up test readability in the console but is much faster in the CI with the "-n auto" option
58 | "ruff>=0.5.0,<0.10.0", # ensure consistency with pre-commit
59 | "scikit-learn>=0.23.0, <1.7.0",
60 | "kedro-datasets[pandas.CSVDataSet]",
61 | ]
62 |
63 | doc = [
64 | "sphinx>=4.5.0,<9.0.0",
65 | "sphinx-markdown-tables~=0.0.15",
66 | "sphinx-click>=3.1,<6.1",
67 | "sphinx_copybutton~=0.5.0",
68 | "myst-parser>=0.17.2,<4.1.0",
69 | "sphinx_design>=0.6.0,<0.7.0",
70 | "pydata-sphinx-theme>=0.16.0,<0.17.0",
71 | ]
72 | dev = [
73 | "pre-commit>=2.0.0,<5.0.0",
74 | "jupyter>=1.0.0,<2.0.0",
75 | ]
76 |
77 | all = [ "kedro_mlflow[test,doc,dev]" ]
78 |
79 | [project.urls]
80 | Source = "https://github.com/Galileo-Galilei/kedro-mlflow"
81 | Documentation = "https://kedro-mlflow.readthedocs.io/en/stable/"
82 | Tracker = "https://github.com/Galileo-Galilei/kedro-mlflow/issues"
83 |
84 | [project.entry-points."kedro.hooks"]
85 | mlflow_hook = "kedro_mlflow.framework.hooks.mlflow_hook:mlflow_hook"
86 |
87 | [project.entry-points."kedro.project_commands"]
88 | kedro_mlflow = "kedro_mlflow.framework.cli.cli:commands"
89 |
90 | [tool.setuptools]
91 | zip-safe = false
92 |
93 | [tool.setuptools.packages.find]
94 | include = ["kedro_mlflow*"]
95 |
96 | [tool.setuptools.package-data]
97 | kedro_mlflow = ["py.typed", "*.yml"]
98 |
99 | [tool.setuptools.dynamic]
100 | readme = {file = "README.md", content-type = "text/markdown"}
101 | version = {attr = "kedro_mlflow.__version__"}
102 |
103 | [tool.pytest.ini_options]
104 | addopts = "--cov=kedro_mlflow --cov-report=html tests/"
105 |
106 | [tool.ruff]
107 | exclude = [
108 | ".bzr",
109 | ".direnv",
110 | ".eggs",
111 | ".git",
112 | ".git-rewrite",
113 | ".hg",
114 | ".ipynb_checkpoints",
115 | ".mypy_cache",
116 | ".nox",
117 | ".pants.d",
118 | ".pyenv",
119 | ".pytest_cache",
120 | ".pytype",
121 | ".ruff_cache",
122 | ".svn",
123 | ".tox",
124 | ".venv",
125 | ".vscode",
126 | "__pypackages__",
127 | "_build",
128 | "buck-out",
129 | "build",
130 | "dist",
131 | "node_modules",
132 | "site-packages",
133 | "venv",
134 | "/template/",
135 | "debug"
136 | ]
137 |
138 | # Same as Black.
139 | line-length = 88
140 | indent-width = 4
141 |
142 | # Assume Python 3.9
143 | target-version = "py39"
144 |
145 | [tool.ruff.lint]
146 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default.
147 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or
148 | # McCabe complexity (`C901`) by default.
149 | select = ["E4", "E7", "E9", "F"]
150 | ignore = []
151 |
152 | # Allow fix for all enabled rules (when `--fix`) is provided.
153 | fixable = ["ALL"]
154 | unfixable = []
155 |
156 | # Allow unused variables when underscore-prefixed.
157 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
158 |
159 | [tool.ruff.format]
160 | # Like Black, use double quotes for strings.
161 | quote-style = "double"
162 |
163 | # Like Black, indent with spaces, rather than tabs.
164 | indent-style = "space"
165 |
166 | # Like Black, respect magic trailing commas.
167 | skip-magic-trailing-comma = false
168 |
169 | # Like Black, automatically detect the appropriate line ending.
170 | line-ending = "auto"
171 |
172 | # Enable auto-formatting of code examples in docstrings. Markdown,
173 | # reStructuredText code/literal blocks and doctests are all supported.
174 | #
175 | # This is currently disabled by default, but it is planned for this
176 | # to be opt-out in the future.
177 | docstring-code-format = false
178 |
179 | # Set the line length limit used when formatting code snippets in
180 | # docstrings.
181 | #
182 | # This only has an effect when the `docstring-code-format` setting is
183 | # enabled.
184 | docstring-code-line-length = "dynamic"
185 |
186 | [tool.bumpversion]
187 | current_version = "0.14.4"
188 |
189 | [[tool.bumpversion.files]]
190 | filename = "kedro_mlflow/__init__.py"
191 |
192 | [[tool.bumpversion.files]]
193 | filename = "README.md"
194 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/__init__.py
--------------------------------------------------------------------------------
/tests/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/config/__init__.py
--------------------------------------------------------------------------------
/tests/config/test_resolvers.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import pytest
4 | import yaml
5 | from kedro.framework.session import KedroSession
6 | from kedro.framework.startup import bootstrap_project
7 | from mlflow.utils.name_utils import (
8 | _GENERATOR_NOUNS,
9 | _GENERATOR_PREDICATES,
10 | )
11 | from omegaconf import OmegaConf
12 |
13 | from kedro_mlflow.config.resolvers import resolve_random_name
14 |
15 |
16 | def _write_yaml(filepath, config):
17 | yaml_str = yaml.dump(config)
18 | filepath.write_text(yaml_str)
19 |
20 |
21 | def _is_mlflow_name(name: str) -> bool:
22 | splitted_name = name.split("-")
23 | flag1 = len(splitted_name) == 3 # noqa: PLR2004
24 | flag2 = splitted_name[0] in _GENERATOR_PREDICATES
25 | flag3 = splitted_name[1] in _GENERATOR_NOUNS
26 | flag4 = re.search(pattern=r"^\d+$", string=splitted_name[2])
27 | return all({flag1, flag2, flag3, flag4})
28 |
29 |
30 | @pytest.fixture
31 | def kedro_project_with_random_name(kedro_project):
32 | # kedro_project is a pytest.fixture in conftest
33 | dict_config = dict(
34 | server=dict(
35 | mlflow_tracking_uri="mlruns",
36 | mlflow_registry_uri=None,
37 | credentials=None,
38 | request_header_provider=dict(type=None, pass_context=False, init_kwargs={}),
39 | ),
40 | tracking=dict(
41 | disable_tracking=dict(pipelines=["my_disabled_pipeline"]),
42 | experiment=dict(name="fake_package", restore_if_deleted=True),
43 | run=dict(id="123456789", name="${km.random_name:}", nested=True),
44 | params=dict(
45 | dict_params=dict(
46 | flatten=True,
47 | recursive=False,
48 | sep="-",
49 | ),
50 | long_params_strategy="truncate",
51 | ),
52 | ),
53 | ui=dict(port="5151", host="localhost"),
54 | )
55 |
56 | _write_yaml(kedro_project / "conf" / "local" / "mlflow.yml", dict_config)
57 | expected = dict_config.copy()
58 | expected["server"]["mlflow_tracking_uri"] = (kedro_project / "mlruns").as_uri()
59 | return kedro_project
60 |
61 |
62 | def test_resolve_random_name_is_valid_mlflow_name():
63 | random_name = resolve_random_name()
64 | assert _is_mlflow_name(random_name)
65 |
66 |
67 | def test_resolve_random_name_is_registered(kedro_project_with_random_name):
68 | bootstrap_project(kedro_project_with_random_name)
69 | with KedroSession.create(project_path=kedro_project_with_random_name) as session:
70 | session.load_context()
71 | assert OmegaConf.has_resolver("km.random_name")
72 |
73 |
74 | def test_resolve_random_name_is_called_in_project(kedro_project_with_random_name):
75 | bootstrap_project(kedro_project_with_random_name)
76 | with KedroSession.create(project_path=kedro_project_with_random_name) as session:
77 | context = session.load_context()
78 | assert _is_mlflow_name(context.mlflow.tracking.run.name)
79 |
80 |
81 | @pytest.mark.skip(reason="kedro 0.19.2 does not take use_cache into account")
82 | def test_resolve_random_name_is_idempotent(kedro_project_with_random_name):
83 | bootstrap_project(kedro_project_with_random_name)
84 | with KedroSession.create(project_path=kedro_project_with_random_name) as session:
85 | context = session.load_context()
86 | assert (
87 | context.config_loader["mlflow"]["tracking"]["run"]["name"]
88 | == context.config_loader["mlflow"]["tracking"]["run"]["name"]
89 |         ) # when called twice, the value should be identical thanks to use_cache; without it the resolver is random and the names would differ
90 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 |
4 | import mlflow
5 | import pytest
6 | from cookiecutter.main import cookiecutter
7 | from kedro import __version__ as kedro_version
8 | from kedro.framework.cli.starters import TEMPLATE_PATH
9 | from mlflow import MlflowClient
10 |
11 | from kedro_mlflow.framework.cli.cli import TEMPLATE_FOLDER_PATH
12 | from kedro_mlflow.framework.cli.cli_utils import write_jinja_template
13 |
14 | _FAKE_PROJECT_NAME = "fake_project"
15 |
16 |
17 | @pytest.fixture
18 | def tracking_uri(tmp_path):
19 | tracking_uri = (tmp_path / "mlruns").as_uri()
20 | return tracking_uri
21 |
22 |
23 | @pytest.fixture
24 | def mlflow_client(tracking_uri):
25 | mlflow.set_tracking_uri(tracking_uri)
26 | client = MlflowClient(tracking_uri)
27 | return client
28 |
29 |
30 | @pytest.fixture(autouse=True)
31 | def cleanup_mlflow_after_runs():
32 | yield # A test function will be run at this point
33 | while mlflow.active_run():
34 | mlflow.end_run()
35 |
36 | # if set_experiment has been called before, it stores the experiment_id
37 | # as a global variable, so if we change the tracking_uri afterwards
38 |     # mlflow is completely lost because the experiment id no longer exists.
39 |     # We just reset it after each test, as in a brand new session.
40 |
41 |     # CAVEAT 1: do not use "from mlflow.tracking.fluent import _active_experiment_id"
42 |     # because, due to how python imports bind names, resetting it would not change the global variable accessed by mlflow
43 |
44 |     # CAVEAT 2: Since this PR: https://github.com/mlflow/mlflow/pull/13456/files
45 |     # we need to reset the experiment ID too because it is now reset in each thread
46 | mlflow.tracking.fluent._active_experiment_id = None
47 | os.environ.pop("MLFLOW_EXPERIMENT_ID", None)
48 | os.environ.pop("MLFLOW_TRACKING_URI", None)
49 | os.environ.pop("MLFLOW_REGISTRY_URI", None)
50 |
51 | # see https://github.com/kedro-org/kedro/blob/859f98217eed12208a922b771a97cbfb82ba7e80/tests/framework/session/test_session.py#L173
52 |
53 |
54 | @pytest.fixture
55 | def kedro_project(tmp_path):
56 |     # TODO: this is also an integration test since this depends on the kedro version
57 | config = {
58 | # "output_dir": tmp_path,
59 | "project_name": _FAKE_PROJECT_NAME,
60 | "repo_name": _FAKE_PROJECT_NAME,
61 | "python_package": _FAKE_PROJECT_NAME,
62 | "kedro_version": kedro_version,
63 | "tools": "['None']",
64 | "example_pipeline": "False",
65 | }
66 |
67 | cookiecutter(
68 | str(TEMPLATE_PATH),
69 | output_dir=tmp_path, # config["output_dir"],
70 | no_input=True,
71 | extra_context=config,
72 | accept_hooks=False,
73 | )
74 |
75 | shutil.rmtree(
76 | tmp_path / _FAKE_PROJECT_NAME / "tests"
77 | ) # avoid conflicts with pytest
78 |
79 | return tmp_path / _FAKE_PROJECT_NAME
80 |
81 |
82 | @pytest.fixture
83 | def kedro_project_with_mlflow_conf(kedro_project):
84 | write_jinja_template(
85 | src=TEMPLATE_FOLDER_PATH / "mlflow.yml",
86 | is_cookiecutter=False,
87 | dst=kedro_project / "conf" / "local" / "mlflow.yml",
88 | python_package="fake_project",
89 | )
90 |
91 | return kedro_project
92 |
--------------------------------------------------------------------------------
/tests/framework/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/framework/__init__.py
--------------------------------------------------------------------------------
/tests/framework/cli/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/framework/cli/__init__.py
--------------------------------------------------------------------------------
/tests/framework/cli/test_cli_utils.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from kedro_mlflow.framework.cli.cli_utils import (
4 | render_jinja_template,
5 | write_jinja_template,
6 | )
7 |
8 |
9 | @pytest.fixture
10 | def template_path(tmp_path):
11 | return tmp_path / "template.py"
12 |
13 |
14 | @pytest.fixture
15 | def jinja_template(template_path):
16 | with open(template_path, "w") as file_handler:
17 | file_handler.write("fake file\n which contains {{ fake_tag }}. Nice, isn't it?")
18 | return "fake file\n which contains 'Hello world!'. Nice, isn't it?"
19 |
20 |
21 | @pytest.fixture
22 | def cookiecutter_template(template_path):
23 | with open(template_path, "w") as file_handler:
24 | file_handler.write(
25 | "fake file\n which contains {{ cookiecutter.fake_tag }}. Nice, isn't it?"
26 | )
27 | return "fake file\n which contains 'Hello world!'. Nice, isn't it?"
28 |
29 |
30 | def test_render_jinja_template(template_path, jinja_template):
31 | rendered = render_jinja_template(src=template_path, fake_tag="'Hello world!'")
32 | assert rendered == jinja_template
33 |
34 |
35 | def test_render_jinja_template_with_cookiecutter_tags(
36 | template_path, cookiecutter_template
37 | ):
38 | rendered = render_jinja_template(
39 | src=template_path, fake_tag="'Hello world!'", is_cookiecutter=True
40 | )
41 | assert rendered == cookiecutter_template
42 |
43 |
44 | def test_write_jinja_template(tmp_path, template_path, jinja_template):
45 | rendered_path = tmp_path / "rendered.py"
46 | write_jinja_template(
47 | src=template_path, dst=rendered_path, fake_tag="'Hello world!'"
48 | )
49 | with open(rendered_path) as file_handler:
50 | rendered = file_handler.read()
51 | assert rendered == jinja_template
52 |
53 |
54 | def test_write_jinja_template_with_cookiecutter_tags(
55 | tmp_path, template_path, cookiecutter_template
56 | ):
57 | rendered_path = tmp_path / "rendered.py"
58 | write_jinja_template(
59 | src=template_path,
60 | dst=rendered_path,
61 | is_cookiecutter=True,
62 | fake_tag="'Hello world!'",
63 | )
64 | with open(rendered_path) as file_handler:
65 | rendered = file_handler.read()
66 | assert rendered == cookiecutter_template
67 |
--------------------------------------------------------------------------------
/tests/framework/hooks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/framework/hooks/__init__.py
--------------------------------------------------------------------------------
/tests/framework/hooks/test_hook_active_run.py:
--------------------------------------------------------------------------------
1 | import mlflow
2 | import pytest
3 | from kedro.framework.session import KedroSession
4 | from kedro.framework.startup import bootstrap_project
5 | from kedro.io import DataCatalog, MemoryDataset
6 | from kedro.pipeline import Pipeline, node
7 |
8 | from kedro_mlflow.framework.hooks import MlflowHook
9 |
10 |
11 | @pytest.fixture
12 | def dummy_run_params(tmp_path):
13 | dummy_run_params = {
14 | "run_id": "",
15 | "project_path": tmp_path.as_posix(),
16 | "env": "local",
17 | "kedro_version": "0.16.5",
18 | "tags": [],
19 | "from_nodes": [],
20 | "to_nodes": [],
21 | "node_names": [],
22 | "from_inputs": [],
23 | "load_versions": [],
24 | "pipeline_name": "my_cool_pipeline",
25 | "extra_params": [],
26 | }
27 | return dummy_run_params
28 |
29 |
30 | @pytest.fixture
31 | def dummy_node():
32 | def fake_fun(arg1, arg2, arg3):
33 | return None
34 |
35 | node_test = node(
36 | func=fake_fun,
37 | inputs={"arg1": "params:param1", "arg2": "foo", "arg3": "parameters"},
38 | outputs="out",
39 | )
40 |
41 | return node_test
42 |
43 |
44 | @pytest.fixture
45 | def dummy_pipeline(dummy_node):
46 | dummy_pipeline = Pipeline([dummy_node])
47 |
48 | return dummy_pipeline
49 |
50 |
51 | @pytest.fixture
52 | def dummy_catalog():
53 | catalog = DataCatalog(
54 | {
55 | "params:param1": 1,
56 | "foo": MemoryDataset(),
57 | "bar": MemoryDataset(),
58 | "parameters": {"param1": 1, "param2": 2},
59 | }
60 | )
61 |
62 | return catalog
63 |
64 |
65 | def test_hook_use_active_run_if_exist_and_do_not_close(
66 | kedro_project,
67 | dummy_run_params,
68 | dummy_pipeline,
69 | dummy_catalog,
70 | ):
71 | mlflow.set_tracking_uri(f"file:///{kedro_project}/mlruns")
72 | with mlflow.start_run():
73 | mlflow_run_id = mlflow.active_run().info.run_id
74 | bootstrap_project(kedro_project)
75 | with KedroSession.create(
76 | project_path=kedro_project,
77 | ) as session:
78 | context = session.load_context()
79 |
80 | mlflow_node_hook = MlflowHook()
81 | mlflow_node_hook.after_context_created(context)
82 | mlflow_node_hook.before_pipeline_run(
83 | run_params=dummy_run_params,
84 | pipeline=dummy_pipeline,
85 | catalog=dummy_catalog,
86 | )
87 |         # check that, after before_pipeline_run, we still have the same run
88 | assert mlflow.active_run().info.run_id == mlflow_run_id
89 |
90 | mlflow_node_hook.after_pipeline_run(
91 | run_params=dummy_run_params,
92 | pipeline=dummy_pipeline,
93 | catalog=dummy_catalog,
94 | )
95 | # the run must still be open
96 | assert mlflow.active_run().info.run_id == mlflow_run_id
97 |
98 | mlflow_node_hook.on_pipeline_error(
99 | error=ValueError,
100 | run_params=dummy_run_params,
101 | pipeline=dummy_pipeline,
102 | catalog=dummy_catalog,
103 | )
104 | # the run must still be open
105 | assert mlflow.active_run().info.run_id == mlflow_run_id
106 |
107 |
108 | def test_hook_active_run_exists_with_different_tracking_uri(
109 | kedro_project,
110 | dummy_run_params,
111 | dummy_pipeline,
112 | dummy_catalog,
113 | ):
114 | # tracking uri is "mlruns2", not "mlruns"
115 | mlflow.set_tracking_uri(f"file:///{kedro_project}/mlruns2")
116 | with mlflow.start_run():
117 | mlflow_run_id = mlflow.active_run().info.run_id
118 | bootstrap_project(kedro_project)
119 | with KedroSession.create(
120 | project_path=kedro_project,
121 | ) as session:
122 | context = session.load_context()
123 |
124 | mlflow_node_hook = MlflowHook()
125 | mlflow_node_hook.after_context_created(context)
126 |
127 | mlflow.log_param("a", "1") # emulate param logging
128 | # the config should be modified
129 | assert (
130 | mlflow_node_hook.mlflow_config.server.mlflow_tracking_uri
131 | == f"file:///{kedro_project}/mlruns2"
132 | )
133 | assert mlflow_node_hook.mlflow_config.tracking.experiment.name == "Default"
134 | assert mlflow_node_hook.mlflow_config.tracking.run.id == mlflow_run_id
135 |
136 | assert mlflow.get_tracking_uri() == f"file:///{kedro_project}/mlruns2"
137 |
138 |         # mlflow.active_run() does not have all data, we should get it through the client: https://www.mlflow.org/docs/latest/python_api/mlflow.html#mlflow.active_run
139 | active_run = mlflow_node_hook.mlflow_config.server._mlflow_client.get_run(
140 | mlflow.active_run().info.run_id
141 | )
142 | assert active_run.data.params == {"a": "1"}
143 |
--------------------------------------------------------------------------------
/tests/framework/hooks/test_hook_log_artifact.py:
--------------------------------------------------------------------------------
1 | import mlflow
2 | import pandas as pd
3 | import pytest
4 | from kedro.framework.hooks import _create_hook_manager
5 | from kedro.framework.hooks.manager import _register_hooks
6 | from kedro.framework.session import KedroSession
7 | from kedro.framework.startup import bootstrap_project
8 | from kedro.io import DataCatalog, MemoryDataset
9 | from kedro.pipeline import Pipeline, node
10 | from kedro.runner import ThreadRunner
11 | from kedro_datasets.pickle import PickleDataset
12 |
13 | from kedro_mlflow.framework.hooks.mlflow_hook import MlflowHook
14 | from kedro_mlflow.io.artifacts import MlflowArtifactDataset
15 |
16 |
17 | @pytest.fixture
18 | def dummy_pipeline():
19 | def preprocess_fun(data):
20 | return data
21 |
22 | def train_fun(data):
23 | return 2
24 |
25 | dummy_pipeline = Pipeline(
26 | [
27 | node(
28 | func=preprocess_fun,
29 | inputs="raw_data",
30 | outputs="data",
31 | ),
32 | node(
33 | func=train_fun,
34 | inputs=["data"],
35 | outputs="model",
36 | ),
37 | ]
38 | )
39 | return dummy_pipeline
40 |
41 |
42 | @pytest.fixture
43 | def dummy_catalog(tmp_path):
44 | dummy_catalog = DataCatalog(
45 | {
46 | "raw_data": MemoryDataset(pd.DataFrame(data=[1], columns=["a"])),
47 | "data": MemoryDataset(),
48 | "model": MlflowArtifactDataset(
49 | dataset=dict(
50 | type=PickleDataset, filepath=(tmp_path / "model.csv").as_posix()
51 | )
52 | ),
53 | }
54 | )
55 | return dummy_catalog
56 |
57 |
58 | @pytest.fixture
59 | def dummy_run_params(tmp_path):
60 | dummy_run_params = {
61 | "project_path": tmp_path.as_posix(),
62 | "env": "local",
63 | "kedro_version": "0.16.0",
64 | "tags": [],
65 | "from_nodes": [],
66 | "to_nodes": [],
67 | "node_names": [],
68 | "from_inputs": [],
69 | "load_versions": [],
70 | "pipeline_name": "my_cool_pipeline",
71 | "extra_params": [],
72 | }
73 | return dummy_run_params
74 |
75 |
76 | def test_mlflow_hook_log_artifacts_within_same_run_with_thread_runner(
77 | kedro_project, dummy_run_params, dummy_pipeline, dummy_catalog
78 | ):
79 | # this test is very specific to a new design introduced in mlflow 2.18 to make it thread safe
80 | # see https://github.com/Galileo-Galilei/kedro-mlflow/issues/613
81 | bootstrap_project(kedro_project)
82 |
83 | with KedroSession.create(project_path=kedro_project) as session:
84 | context = session.load_context() # setup mlflow
85 |
86 | mlflow_hook = MlflowHook()
87 | runner = ThreadRunner() # this is what we want to test
88 |
89 | mlflow_hook.after_context_created(context)
90 | mlflow_hook.after_catalog_created(
91 | catalog=dummy_catalog,
92 |             # `after_catalog_created` is not using any of the arguments below,
93 | # so we are setting them to empty values.
94 | conf_catalog={},
95 | conf_creds={},
96 | feed_dict={},
97 | save_version="",
98 | load_versions="",
99 | )
100 | mlflow_hook.before_pipeline_run(
101 | run_params=dummy_run_params,
102 | pipeline=dummy_pipeline,
103 | catalog=dummy_catalog,
104 | )
105 |
106 |         # we get the run id BEFORE running the pipeline because it may be modified in a different thread
107 | run_id_before_run = mlflow.active_run().info.run_id
108 |
109 | hook_manager = _create_hook_manager()
110 | _register_hooks(hook_manager, (mlflow_hook,))
111 |
112 | runner.run(dummy_pipeline, dummy_catalog, hook_manager)
113 |
114 | run_id_after_run = mlflow.active_run().info.run_id
115 |
116 | # CHECK 1: check that we are not on the second id created by the thread.lock()
117 | assert run_id_before_run == run_id_after_run
118 |
119 | mlflow_hook.after_pipeline_run(
120 | run_params=dummy_run_params,
121 | pipeline=dummy_pipeline,
122 | catalog=dummy_catalog,
123 | )
124 |
125 | mlflow_client = context.mlflow.server._mlflow_client
126 |
127 |         # check that the artifact is associated with the initial run:
128 |
129 | artifacts_list = mlflow_client.list_artifacts(run_id_before_run)
130 | assert len(artifacts_list) == 1
131 |
--------------------------------------------------------------------------------
/tests/framework/hooks/test_hook_on_pipeline_error.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Iterable, Optional
2 |
3 | import mlflow
4 | import pytest
5 | from kedro.config import AbstractConfigLoader, OmegaConfigLoader
6 | from kedro.framework.hooks import hook_impl
7 | from kedro.framework.project import Validator, _ProjectPipelines, _ProjectSettings
8 | from kedro.framework.session import KedroSession
9 | from kedro.framework.startup import bootstrap_project
10 | from kedro.io import DataCatalog
11 | from kedro.pipeline import Pipeline, node
12 | from mlflow.entities import RunStatus
13 | from mlflow.tracking import MlflowClient
14 |
15 | from kedro_mlflow.framework.hooks.mlflow_hook import MlflowHook
16 |
17 |
18 | class DummyProjectHooks:
19 | @hook_impl
20 | def register_config_loader(self, conf_paths: Iterable[str]) -> AbstractConfigLoader:
21 | return OmegaConfigLoader(conf_paths)
22 |
23 | @hook_impl
24 | def register_catalog(
25 | self,
26 | catalog: Optional[dict[str, dict[str, Any]]],
27 | credentials: dict[str, dict[str, Any]],
28 | load_versions: dict[str, str],
29 | save_version: str,
30 | ) -> DataCatalog:
31 | return DataCatalog.from_config(
32 | catalog, credentials, load_versions, save_version
33 | )
34 |
35 |
36 | def _mock_imported_settings_paths(mocker, mock_settings):
37 | for path in [
38 | "kedro.framework.context.context.settings",
39 | "kedro.framework.session.session.settings",
40 | "kedro.framework.project.settings",
41 | ]:
42 | mocker.patch(path, mock_settings)
43 | return mock_settings
44 |
45 |
46 | def _mock_settings_with_hooks(mocker, hooks):
47 | class MockSettings(_ProjectSettings):
48 | _HOOKS = Validator("HOOKS", default=hooks)
49 |
50 | return _mock_imported_settings_paths(mocker, MockSettings())
51 |
52 |
53 | @pytest.fixture
54 | def mock_settings_with_mlflow_hooks(mocker):
55 | return _mock_settings_with_hooks(
56 | mocker,
57 | hooks=(
58 | DummyProjectHooks(),
59 | MlflowHook(),
60 | ),
61 | )
62 |
63 |
64 | @pytest.fixture
65 | def mock_failing_pipeline(mocker):
66 | def failing_node():
67 | mlflow.start_run(nested=True)
68 | raise ValueError("Let's make this pipeline fail")
69 |
70 | def mocked_register_pipelines():
71 | failing_pipeline = Pipeline(
72 | [
73 | node(
74 | func=failing_node,
75 | inputs=None,
76 | outputs="fake_output",
77 | )
78 | ]
79 | )
80 | return {"__default__": failing_pipeline, "pipeline_off": failing_pipeline}
81 |
82 | mocker.patch.object(
83 | _ProjectPipelines,
84 | "_get_pipelines_registry_callable",
85 | return_value=mocked_register_pipelines,
86 | )
87 |
88 |
89 | # @pytest.mark.usefixtures("mock_settings_with_mlflow_hooks")
90 | @pytest.mark.usefixtures("mock_failing_pipeline")
91 | def test_on_pipeline_error(kedro_project_with_mlflow_conf):
92 | tracking_uri = (kedro_project_with_mlflow_conf / "mlruns").as_uri()
93 |
94 | bootstrap_project(kedro_project_with_mlflow_conf)
95 | with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
96 | context = session.load_context()
97 | from logging import getLogger
98 |
99 | LOGGER = getLogger(__name__)
100 | LOGGER.info(f"{mlflow.active_run()=}")
101 | with pytest.raises(ValueError):
102 | LOGGER.info(f"{mlflow.active_run()=}")
103 | session.run()
104 |
105 | # the run we want is the last one in the configuration experiment
106 | mlflow_client = MlflowClient(tracking_uri)
107 | experiment = mlflow_client.get_experiment_by_name(
108 | context.mlflow.tracking.experiment.name
109 | )
110 | failing_run_info = mlflow_client.search_runs(experiment.experiment_id)[-1].info
111 |
112 | assert mlflow.active_run() is None # the run must have been closed
113 | assert failing_run_info.status == RunStatus.to_string(
114 | RunStatus.FAILED
115 | ) # it must be marked as failed
116 |
--------------------------------------------------------------------------------
/tests/framework/hooks/test_run_name.py:
--------------------------------------------------------------------------------
1 | import mlflow
2 | import pytest
3 | from kedro.framework.session import KedroSession
4 | from kedro.framework.startup import bootstrap_project
5 | from kedro.io import DataCatalog
6 | from kedro.pipeline import Pipeline
7 |
8 | from kedro_mlflow.framework.hooks import MlflowHook
9 |
10 |
11 | @pytest.mark.parametrize(
12 | "pipeline_name,expected_mlflow_run_name",
13 | [
14 | ("my_cool_pipeline", "my_cool_pipeline"),
15 | ("__default__", "__default__"),
16 | (None, "__default__"),
17 | ],
18 | )
19 | def test_pipeline_use_pipeline_name_as_run_name(
20 | kedro_project, pipeline_name, expected_mlflow_run_name
21 | ):
22 | dummy_run_params = {
23 | "run_id": "1234",
24 | "project_path": "path/to/project",
25 | "env": "local",
26 | "kedro_version": "X.Y.Z",
27 | "tags": [],
28 | "from_nodes": [],
29 | "to_nodes": [],
30 | "node_names": [],
31 | "from_inputs": [],
32 | "load_versions": [],
33 | "pipeline_name": pipeline_name,
34 | "extra_params": [],
35 | }
36 |
37 | bootstrap_project(kedro_project)
38 | with KedroSession.create(
39 | project_path=kedro_project,
40 | ) as session:
41 | context = session.load_context()
42 |
43 | mlflow_node_hook = MlflowHook()
44 | mlflow_node_hook.after_context_created(context)
45 | mlflow_node_hook.before_pipeline_run(
46 | run_params=dummy_run_params, pipeline=Pipeline([]), catalog=DataCatalog()
47 | )
48 |
49 | assert (
50 | mlflow.active_run().data.tags["mlflow.runName"] == expected_mlflow_run_name
51 | )
52 |
--------------------------------------------------------------------------------
/tests/framework/hooks/test_utils_flatten_dict.py:
--------------------------------------------------------------------------------
1 | from kedro_mlflow.framework.hooks.utils import _flatten_dict
2 |
3 |
4 | def test_flatten_dict_non_nested():
5 | d = dict(a=1, b=2)
6 | assert _flatten_dict(d=d, recursive=True, sep=".") == d
7 | assert _flatten_dict(d=d, recursive=False, sep=".") == d
8 |
9 |
10 | def test_flatten_dict_nested_1_level():
11 | d = dict(a=1, b=dict(c=3, d=4))
12 | flattened = {"a": 1, "b.c": 3, "b.d": 4}
13 | assert _flatten_dict(d=d, recursive=True, sep=".") == flattened
14 | assert _flatten_dict(d=d, recursive=False, sep=".") == flattened
15 |
16 |
17 | def test_flatten_dict_nested_2_levels():
18 | d = dict(a=1, b=dict(c=1, d=dict(e=3, f=5)))
19 |
20 | assert _flatten_dict(d=d, recursive=True, sep=".") == {
21 | "a": 1,
22 | "b.c": 1,
23 | "b.d.e": 3,
24 | "b.d.f": 5,
25 | }
26 | assert _flatten_dict(d=d, recursive=False, sep=".") == {
27 | "a": 1,
28 | "b.c": 1,
29 | "b.d": {"e": 3, "f": 5},
30 | }
31 |
32 |
33 | def test_flatten_dict_nested_3_levels():
34 | d = dict(a=1, b=dict(c=1, d=dict(e=3, f=dict(g=4, h=5))))
35 |
36 | assert _flatten_dict(d=d, recursive=True, sep=".") == {
37 | "a": 1,
38 | "b.c": 1,
39 | "b.d.e": 3,
40 | "b.d.f.g": 4,
41 | "b.d.f.h": 5,
42 | }
43 | assert _flatten_dict(d=d, recursive=False, sep=".") == {
44 | "a": 1,
45 | "b.c": 1,
46 | "b.d": {"e": 3, "f": {"g": 4, "h": 5}},
47 | }
48 |
49 |
50 | def test_flatten_dict_with_float_keys():
51 | d = {0: 1, 1: {3: 1, 4: {"e": 3, 6.7: 5}}}
52 |
53 | assert _flatten_dict(d=d, recursive=True, sep="_") == {
54 | "0": 1,
55 | "1_3": 1,
56 | "1_4_e": 3,
57 | "1_4_6.7": 5,
58 | }
59 | assert (
60 | _flatten_dict(d=d, recursive=False, sep="_")
61 | == {
62 | "0": 1,
63 | "1_3": 1,
64 | "1_4": {
65 | "e": 3,
66 |                 6.7: 5,  # 6.7 is not converted to string, but when the entire dict is logged mlflow will take care of the conversion
67 | },
68 | }
69 | )
70 |
71 |
72 | def test_flatten_dict_with_user_defined_sep():
73 | d = dict(a=1, b=dict(c=1, d=dict(e=3, f=dict(g=4, h=5))))
74 |
75 | assert _flatten_dict(d=d, recursive=True, sep="_") == {
76 | "a": 1,
77 | "b_c": 1,
78 | "b_d_e": 3,
79 | "b_d_f_g": 4,
80 | "b_d_f_h": 5,
81 | }
82 |
--------------------------------------------------------------------------------
/tests/framework/hooks/test_utils_generate_kedro_command.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from kedro_mlflow.framework.hooks.utils import _generate_kedro_command
4 |
5 |
6 | def test_generate_kedro_commands():
7 |     # TODO: add a better test because the formatting of record_data is subject to change
8 |     # We could check that the command is recorded and then rerun properly
9 | record_data = {
10 | "tags": ["tag1", "tag2"],
11 | "from_nodes": ["node1"],
12 | "to_nodes": ["node3"],
13 | "node_names": ["node1", "node2", "node1"],
14 | "from_inputs": ["data_in"],
15 | "load_versions": {"data_inter": "01:23:45"},
16 | "pipeline_name": "fake_pl",
17 | }
18 |
19 | expected = "kedro run --from-inputs=data_in --from-nodes=node1 --to-nodes=node3 --node=node1,node2,node1 --pipeline=fake_pl --tag=tag1,tag2 --load-version=data_inter:01:23:45"
20 | assert _generate_kedro_command(**record_data) == expected
21 |
22 |
23 | @pytest.mark.parametrize("default_value", [None, []])
24 | def test_generate_default_kedro_commands(default_value):
25 |     """This test ensures that _generate_kedro_command accepts both
26 |     `None` and an empty `list` as default values, because the CLI and the interactive
27 |     `Journal` do not use the same default.
28 |
29 | Args:
30 |         default_value: the default used for every unspecified run argument (either `None` or `[]`).
31 | """
32 | record_data = {
33 | "tags": default_value,
34 | "from_nodes": default_value,
35 | "to_nodes": default_value,
36 | "node_names": default_value,
37 | "from_inputs": default_value,
38 | "load_versions": default_value,
39 | "pipeline_name": "fake_pl",
40 | }
41 |
42 | expected = "kedro run --pipeline=fake_pl"
43 | assert _generate_kedro_command(**record_data) == expected
44 |
--------------------------------------------------------------------------------
/tests/io/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/io/__init__.py
--------------------------------------------------------------------------------
/tests/io/artifacts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/io/artifacts/__init__.py
--------------------------------------------------------------------------------
/tests/io/metrics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/io/metrics/__init__.py
--------------------------------------------------------------------------------
/tests/io/metrics/test_mlflow_metric_history_dataset.py:
--------------------------------------------------------------------------------
1 | import mlflow
2 | import pytest
3 | from mlflow.tracking import MlflowClient
4 |
5 | from kedro_mlflow.io.metrics import MlflowMetricHistoryDataset
6 |
7 |
8 | @pytest.fixture
9 | def mlflow_tracking_uri(tmp_path):
10 | tracking_uri = (tmp_path / "mlruns").as_uri()
11 | mlflow.set_tracking_uri(tracking_uri)
12 | return tracking_uri
13 |
14 |
15 | @pytest.fixture
16 | def mlflow_client(mlflow_tracking_uri):
17 | mlflow_client = MlflowClient(mlflow_tracking_uri)
18 | return mlflow_client
19 |
20 |
21 | @pytest.mark.parametrize(
22 | "save_mode,load_mode",
23 | [
24 | ("list", "list"),
25 | ("list", "dict"),
26 | ("dict", "list"),
27 | ("dict", "dict"),
28 | ("history", "list"),
29 | ("history", "dict"),
30 | ("history", "history"),
31 | ],
32 | )
33 | def test_mlflow_metric_history_dataset_save_load(mlflow_client, save_mode, load_mode):
34 | metric_as_list = [0.3, 0.2, 0.1, 0.15, 0.05]
35 | metric_as_dict = dict(enumerate(metric_as_list))
36 | metric_as_history = [
37 | {"step": i, "value": value, "timestamp": 1630235933 + i}
38 | for i, value in metric_as_dict.items()
39 | ]
40 |
41 | mode_metrics_mapping = {
42 | "list": metric_as_list,
43 | "dict": metric_as_dict,
44 | "history": metric_as_history,
45 | }
46 |
47 | metric_ds_model_local_filesystem = MlflowMetricHistoryDataset(
48 | key="my_metric", save_args={"mode": save_mode}
49 | )
50 | with mlflow.start_run():
51 | metric_ds_model_local_filesystem.save(mode_metrics_mapping[save_mode])
52 | run_id = mlflow.active_run().info.run_id
53 |
54 | # check existence
55 | run = mlflow_client.get_run(run_id)
56 | assert "my_metric" in run.data.metrics.keys()
57 |
58 | metric_ds_loader = MlflowMetricHistoryDataset(
59 | key="my_metric", run_id=run_id, load_args={"mode": load_mode}
60 | )
61 |
62 | assert metric_ds_loader.load() == mode_metrics_mapping[load_mode]
63 |
64 |
65 | def test_mlflow_metric_history_dataset_logging_deactivation(mlflow_tracking_uri):
66 | metric_ds = MlflowMetricHistoryDataset(key="inactive_metric")
67 | metric_ds._logging_activated = False
68 | with mlflow.start_run():
69 | metric_ds.save([0.1])
70 | assert metric_ds._exists() is False
71 |
72 |
73 | @pytest.mark.parametrize(
74 | "metadata",
75 | (
76 | None,
77 |         {"description": "My awesome dataset"},
78 | {"string": "bbb", "int": 0},
79 | ),
80 | )
81 | def test_metric_history_dataset_with_metadata(tmp_path, metadata):
82 | metric_ds = MlflowMetricHistoryDataset(
83 | key="hello",
84 | metadata=metadata,
85 | )
86 |
87 | assert metric_ds.metadata == metadata
88 |
89 | # Metadata should not show in _describe
90 | assert "metadata" not in metric_ds._describe()
91 |
--------------------------------------------------------------------------------
/tests/io/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/io/models/__init__.py
--------------------------------------------------------------------------------
/tests/io/models/test_mlflow_model_local_filesystem_dataset.py:
--------------------------------------------------------------------------------
1 | from tempfile import TemporaryDirectory
2 |
3 | import mlflow
4 | import pandas as pd
5 | import pytest
6 | from kedro.io import DataCatalog, MemoryDataset
7 | from kedro.pipeline import Pipeline, node
8 | from kedro_datasets.pickle import PickleDataset
9 | from pytest_lazy_fixtures import lf
10 | from sklearn.linear_model import LinearRegression
11 |
12 | from kedro_mlflow.io.models import MlflowModelLocalFileSystemDataset
13 | from kedro_mlflow.mlflow import KedroPipelineModel
14 | from kedro_mlflow.pipeline import pipeline_ml_factory
15 |
16 |
17 | @pytest.fixture
18 | def linreg_model():
19 | linreg_model = LinearRegression()
20 | return linreg_model
21 |
22 |
23 | @pytest.fixture
24 | def tmp_folder():
25 | tmp_folder = TemporaryDirectory()
26 | return tmp_folder
27 |
28 |
29 | @pytest.fixture
30 | def linreg_path(tmp_path):
31 | linreg_path = tmp_path / "data" / "06_models" / "linreg"
32 | return linreg_path
33 |
34 |
35 | @pytest.fixture
36 | def pipeline_ml_obj():
37 | def preprocess_fun(data):
38 | return data
39 |
40 | def fit_fun(data):
41 | return 2
42 |
43 | def predict_fun(model, data):
44 | return data * model
45 |
46 | full_pipeline = Pipeline(
47 | [
48 | node(
49 | func=preprocess_fun,
50 | inputs="raw_data",
51 | outputs="data",
52 | tags=["inference", "training"],
53 | ),
54 | node(func=fit_fun, inputs="data", outputs="model", tags=["training"]),
55 | node(
56 | func=predict_fun,
57 | inputs=["data", "model"],
58 | outputs="predictions",
59 | tags=["inference"],
60 | ),
61 | ]
62 | )
63 |
64 | pipeline_ml_obj = pipeline_ml_factory(
65 | training=full_pipeline.only_nodes_with_tags("training"),
66 | inference=full_pipeline.only_nodes_with_tags("inference"),
67 | input_name="raw_data",
68 | )
69 |
70 | return pipeline_ml_obj
71 |
72 |
73 | @pytest.fixture
74 | def pipeline_inference(pipeline_ml_obj):
75 | return pipeline_ml_obj.inference
76 |
77 |
78 | @pytest.fixture
79 | def dummy_catalog(tmp_path):
80 | dummy_catalog = DataCatalog(
81 | {
82 | "raw_data": MemoryDataset(),
83 | "data": MemoryDataset(),
84 | "model": PickleDataset(
85 | filepath=(tmp_path / "data" / "06_models" / "model.pkl")
86 | .resolve()
87 | .as_posix()
88 | ),
89 | }
90 | )
91 | dummy_catalog._datasets["model"].save(2) # emulate model fitting
92 |
93 | return dummy_catalog
94 |
95 |
96 | @pytest.fixture
97 | def kedro_pipeline_model(tmp_path, pipeline_ml_obj, dummy_catalog):
98 | kedro_pipeline_model = KedroPipelineModel(
99 | pipeline=pipeline_ml_obj,
100 | catalog=dummy_catalog,
101 | input_name=pipeline_ml_obj.input_name,
102 | )
103 |
104 | return kedro_pipeline_model
105 |
106 |
107 | def test_save_unversioned_under_same_path(
108 | linreg_path,
109 | linreg_model,
110 | ):
111 | model_config = {
112 | "name": "linreg",
113 | "config": {
114 | "type": "kedro_mlflow.io.models.MlflowModelLocalFileSystemDataset",
115 | "flavor": "mlflow.sklearn",
116 | "filepath": linreg_path.as_posix(),
117 | },
118 | }
119 | mlflow_model_ds = MlflowModelLocalFileSystemDataset.from_config(**model_config)
120 | mlflow_model_ds.save(linreg_model)
121 | # check that second save does not fail
122 | # this happens if the underlying folder already exists
123 | mlflow_model_ds.save(linreg_model)
124 |
125 |
126 | @pytest.mark.parametrize("versioned", [False, True])
127 | def test_save_load_local(linreg_path, linreg_model, versioned):
128 | model_config = {
129 | "name": "linreg",
130 | "config": {
131 | "type": "kedro_mlflow.io.models.MlflowModelLocalFileSystemDataset",
132 | "filepath": linreg_path.as_posix(),
133 | "flavor": "mlflow.sklearn",
134 | "versioned": versioned,
135 | },
136 | }
137 | mlflow_model_ds = MlflowModelLocalFileSystemDataset.from_config(**model_config)
138 | mlflow_model_ds.save(linreg_model)
139 |
140 | if versioned:
141 | assert (
142 | linreg_path / mlflow_model_ds._version.save / linreg_path.name
143 | ).exists() # Versioned model saved locally
144 | else:
145 | assert linreg_path.exists() # Unversioned model saved locally
146 |
147 | linreg_model_loaded = mlflow_model_ds.load()
148 | assert isinstance(linreg_model_loaded, LinearRegression)
149 |
150 |
151 | @pytest.mark.parametrize(
152 | "pipeline",
153 | [
154 | (lf("pipeline_ml_obj")), # must work for PipelineML
155 | (lf("pipeline_inference")), # must work for Pipeline
156 | ],
157 | )
158 | def test_pyfunc_flavor_python_model_save_and_load(
159 | tmp_path, tmp_folder, pipeline, dummy_catalog
160 | ):
161 | kedro_pipeline_model = KedroPipelineModel(
162 | pipeline=pipeline,
163 | catalog=dummy_catalog,
164 | input_name="raw_data",
165 | )
166 | artifacts = kedro_pipeline_model.extract_pipeline_artifacts(tmp_folder)
167 |
168 | model_config = {
169 | "name": "kedro_pipeline_model",
170 | "config": {
171 | "type": "kedro_mlflow.io.models.MlflowModelLocalFileSystemDataset",
172 | "filepath": (
173 | tmp_path / "data" / "06_models" / "my_custom_model"
174 | ).as_posix(),
175 | "flavor": "mlflow.pyfunc",
176 | "pyfunc_workflow": "python_model",
177 | "save_args": {
178 | "artifacts": artifacts,
179 | "conda_env": {"python": "3.10.0", "dependencies": ["kedro==0.18.11"]},
180 | },
181 | },
182 | }
183 |
184 | mlflow_model_ds = MlflowModelLocalFileSystemDataset.from_config(**model_config)
185 | mlflow_model_ds.save(kedro_pipeline_model)
186 |
187 | assert mlflow.active_run() is None
188 |
189 | # close the run, create another dataset and reload
190 |     # (emulate a new "kedro run" session)
191 | loaded_model = mlflow_model_ds.load()
192 |
193 |     predictions = loaded_model.predict(pd.DataFrame(data=[1], columns=["a"]))
194 |     # the dummy model multiplies the input by 2, so the prediction for 1 should be 2
195 |     assert (predictions == pd.DataFrame(data=[2], columns=["a"])).all().all()
196 |
197 |
198 | @pytest.mark.parametrize(
199 | "metadata",
200 | (
201 | None,
202 |         {"description": "My awesome dataset"},
203 | {"string": "bbb", "int": 0},
204 | ),
205 | )
206 | def test_metrics_history_dataset_with_metadata(metadata):
207 | mlflow_model_ds = MlflowModelLocalFileSystemDataset(
208 | flavor="mlflow.sklearn",
209 | filepath="/my/file/path",
210 | metadata=metadata,
211 | )
212 |
213 | assert mlflow_model_ds.metadata == metadata
214 |
215 | # Metadata should not show in _describe
216 | assert "metadata" not in mlflow_model_ds._describe()
217 |
--------------------------------------------------------------------------------
/tests/mlflow/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/mlflow/__init__.py
--------------------------------------------------------------------------------
/tests/pipeline/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/pipeline/__init__.py
--------------------------------------------------------------------------------
/tests/template/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/template/__init__.py
--------------------------------------------------------------------------------
/tests/template/project/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/template/project/__init__.py
--------------------------------------------------------------------------------
/tests/template/project/test_mlflow_yml.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import yaml
3 |
4 | from kedro_mlflow.config.kedro_mlflow_config import KedroMlflowConfig
5 | from kedro_mlflow.framework.cli.cli import TEMPLATE_FOLDER_PATH
6 | from kedro_mlflow.framework.cli.cli_utils import write_jinja_template
7 |
8 |
9 | @pytest.fixture
10 | def template_mlflowyml(tmp_path):
11 | # the goal is to discover all potential ".py" files
12 | # but for now there is only "run.py"
13 |     # this is rather a safeguard for future additions
14 | raw_template_path = TEMPLATE_FOLDER_PATH / "mlflow.yml"
15 | rendered_template_path = tmp_path / raw_template_path.name
16 | tags = {
17 | "project_name": "This is a fake project",
18 | "python_package": "fake_project",
19 | "kedro_version": "0.16.0",
20 | }
21 |
22 | write_jinja_template(src=raw_template_path, dst=rendered_template_path, **tags)
23 | return rendered_template_path.as_posix()
24 |
25 |
26 | def test_mlflow_yml_rendering(template_mlflowyml):
27 |     # the mlflow yml file must be consistent with the defaults in KedroMlflowConfig for readability
28 | with open(template_mlflowyml) as file_handler:
29 | mlflow_config = yaml.safe_load(file_handler)
30 |
31 |     # note: using the Pydantic model's construct method skips all validations
32 | # and here we do not want to check the path
33 | expected_config = KedroMlflowConfig.construct(
34 | project_path="fake/path",
35 | tracking=dict(
36 | disable_tracking=dict(pipelines=[], disable_autologging=True),
37 | experiment=dict(
38 | name="fake_project",
39 | create_experiment_kwargs=dict(artifact_location=None, tags=None),
40 | restore_if_deleted=True,
41 | ),
42 | params=dict(
43 | dict_params=dict(flatten=False, recursive=True, sep="."),
44 | long_params_strategy="fail",
45 | ),
46 | run=dict(id=None, name=None, nested=True),
47 | ), # check for proper rendering
48 | )
49 |
50 | assert mlflow_config == expected_config.dict(exclude={"project_path"})
51 |
--------------------------------------------------------------------------------