├── .editorconfig ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml └── workflows │ ├── check-links.yml │ ├── prepare-release.yml │ ├── publish.yml │ ├── test_on_master.yml │ ├── test_on_pr.yml │ └── test_reusable_workflow.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── codecov.yml ├── docs ├── Makefile ├── conf.py ├── index.md ├── make.bat └── source │ ├── 01_introduction │ ├── 01_introduction.md │ ├── 02_motivation.md │ └── index.md │ ├── 02_getting_started │ ├── 01_installation │ │ ├── 01_installation.md │ │ └── 02_setup.md │ ├── 02_quickstart │ │ ├── 00_intro_tutorial.md │ │ ├── 01_example_project.md │ │ └── 02_first_steps.md │ └── index.md │ ├── 03_experiment_tracking │ ├── 01_experiment_tracking │ │ ├── 01_configuration.md │ │ ├── 02_version_parameters.md │ │ ├── 03_version_datasets.md │ │ ├── 04_version_models.md │ │ ├── 05_version_metrics.md │ │ └── 06_mlflow_ui.md │ ├── 02_interactive_use │ │ └── 01_notebook_use.md │ └── index.md │ ├── 04_pipeline_as_model │ ├── 01_pipeline_as_custom_model │ │ ├── 01_mlflow_models.md │ │ ├── 02_scikit_learn_like_pipeline.md │ │ ├── 03_deployment_patterns.md │ │ └── 04_custom_kedro_pipeline_model.md │ ├── 02_framework_ml │ │ ├── 01_why_framework.md │ │ ├── 02_ml_project_components.md │ │ └── 03_framework_solutions.md │ └── index.md │ ├── 05_API │ ├── 01_python_objects │ │ ├── 01_Datasets.md │ │ ├── 02_Hooks.md │ │ ├── 03_Pipelines.md │ │ ├── 04_CLI.md │ │ └── 05_Configuration.md │ ├── 02_autoapi │ │ ├── kedro_mlflow.config.rst │ │ ├── kedro_mlflow.framework.cli.rst │ │ ├── kedro_mlflow.framework.hooks.rst │ │ ├── kedro_mlflow.io.rst │ │ ├── kedro_mlflow.mlflow.rst │ │ ├── kedro_mlflow.pipeline.rst │ │ └── kedro_mlflow.rst │ └── index.md │ ├── 06_migration_guide │ ├── index.md │ ├── migration_guide_kedro_experiment_tracking.md │ └── migration_guide_kedro_mlflow.md │ └── imgs │ ├── apps_interaction.png │ ├── blogpost_migrate_experiment_tracking.png │ ├── default_catalog.png │ ├── etl_app.png │ ├── hook_registration_process.png │ ├── initialized_project.png │ ├── kedro_viz_params.png │ ├── logo.png │ ├── ml_pipeline │ ├── preprocessing │ │ ├── all.PNG │ │ ├── inference.PNG │ │ └── training.PNG │ ├── shared_inputs │ │ ├── all.PNG │ │ ├── inference.PNG │ │ └── training.PNG │ ├── tokenizer │ │ ├── all.PNG │ │ ├── inference.PNG │ │ └── training.PNG │ └── vanilla │ │ ├── all.PNG │ │ ├── inference.PNG │ │ └── training.PNG │ ├── mlflow_host_page.png │ ├── mlflow_run.png │ ├── mlflow_tracking_schema.png │ ├── mlflow_yml.png │ ├── once_run_project.png │ ├── run_with_artifact.png │ └── updated_catalog.png ├── kedro_mlflow ├── __init__.py ├── config │ ├── __init__.py │ ├── kedro_mlflow_config.py │ └── resolvers.py ├── framework │ ├── __init__.py │ ├── cli │ │ ├── __init__.py │ │ ├── cli.py │ │ └── cli_utils.py │ └── hooks │ │ ├── __init__.py │ │ ├── mlflow_hook.py │ │ └── utils.py ├── io │ ├── __init__.py │ ├── artifacts │ │ ├── __init__.py │ │ └── mlflow_artifact_dataset.py │ ├── catalog │ │ ├── __init__.py │ │ └── switch_catalog_logging.py │ ├── metrics │ │ ├── __init__.py │ │ ├── mlflow_abstract_metric_dataset.py │ │ ├── mlflow_metric_dataset.py │ │ ├── mlflow_metric_history_dataset.py │ │ └── mlflow_metrics_history_dataset.py │ └── models │ │ ├── __init__.py │ │ ├── mlflow_abstract_model_dataset.py │ │ ├── mlflow_model_local_filesystem_dataset.py │ │ ├── 
mlflow_model_registry_dataset.py │ │ └── mlflow_model_tracking_dataset.py ├── mlflow │ ├── __init__.py │ └── kedro_pipeline_model.py ├── pipeline │ ├── __init__.py │ ├── pipeline_ml.py │ └── pipeline_ml_factory.py ├── template │ └── project │ │ └── mlflow.yml └── utils.py ├── mlc_config.json ├── pyproject.toml └── tests ├── __init__.py ├── config ├── __init__.py ├── test_get_mlflow_config.py ├── test_kedro_mlflow_config.py └── test_resolvers.py ├── conftest.py ├── framework ├── __init__.py ├── cli │ ├── __init__.py │ ├── test_cli.py │ ├── test_cli_modelify.py │ └── test_cli_utils.py └── hooks │ ├── __init__.py │ ├── test_hook_active_run.py │ ├── test_hook_deactivate_tracking.py │ ├── test_hook_log_artifact.py │ ├── test_hook_log_metrics.py │ ├── test_hook_log_parameters.py │ ├── test_hook_on_pipeline_error.py │ ├── test_hook_pipeline_ml.py │ ├── test_run_name.py │ ├── test_utils_flatten_dict.py │ └── test_utils_generate_kedro_command.py ├── io ├── __init__.py ├── artifacts │ ├── __init__.py │ └── test_mlflow_artifact_dataset.py ├── metrics │ ├── __init__.py │ ├── test_mlflow_metric_dataset.py │ ├── test_mlflow_metric_history_dataset.py │ └── test_mlflow_metrics_dataset.py └── models │ ├── __init__.py │ ├── test_mlflow_model_local_filesystem_dataset.py │ ├── test_mlflow_model_registry_dataset.py │ └── test_mlflow_model_tracking_dataset.py ├── mlflow ├── __init__.py └── test_kedro_pipeline_model.py ├── pipeline ├── __init__.py └── test_pipeline_ml.py └── template ├── __init__.py └── project ├── __init__.py └── test_mlflow_yml.py /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: If something isn't working 4 | title: '' 5 | labels: 'Issue: Bug Report' 6 | assignees: '' 7 | 8 | --- 9 | 10 | <!-- **_If you like the repo, please give it a :star:_** --> 11 | 12 | ## Description 13 | <!-- Short description of the problem here. --> 14 | 15 | ## Context 16 | 17 | <!-- How has this bug affected you? What were you trying to accomplish? --> 18 | 19 | ## Steps to Reproduce 20 | 21 | <!-- Please provide a detailed description. A Minimal Reproducible Example would really help to solve your issue faster (see this [Stack Overflow thread](https://stackoverflow.com/help/minimal-reproducible-example) to see how to create a good "reprex"). A link to a github repo is even better. 22 | 23 | 1. [First Step] 24 | 2. [Second Step] 25 | 3. [And so on...] --> 26 | 27 | ## Expected Result 28 | 29 | <!-- Tell us what should happen. --> 30 | 31 | ## Actual Result 32 | 33 | <!-- Tell us what happens instead. --> 34 | 35 | ``` 36 | -- If you received an error, place it here. 37 | ``` 38 | 39 | ``` 40 | -- Separate them if you have more than one. 
41 | ``` 42 | 43 | ## Your Environment 44 | 45 | <!-- Include as many relevant details about the environment in which you experienced the bug: --> 46 | 47 | * `kedro` and `kedro-mlflow` version used (`pip show kedro` and `pip show kedro-mlflow`): 48 | * Python version used (`python -V`): 49 | * Operating system and version: 50 | 51 | ## Does the bug also happen with the latest version on master? 52 | 53 | <!-- The plugin is still in early development and known bugs are fixed as soon as we can. If you are lucky, your bug is already fixed on the `master` branch, which is the most up to date. This branch contains our most recent developments, which are not published on PyPI yet. 54 | 55 | In your environment, please try: 56 | 57 | ```bash 58 | pip install --upgrade git+https://github.com/Galileo-Galilei/kedro-mlflow 59 | ``` 60 | 61 | and check if you can reproduce the error. If you can't, just wait for the next release or use the master branch at your own risk! --> 62 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Let us know if you have a feature request or enhancement 4 | title: '<Title>' 5 | labels: 'Issue: Feature Request' 6 | assignees: '' 7 | --- 8 | 9 | <!-- **_If you like the repo, please give it a :star:_** --> 10 | 11 | ## Description 12 | <!-- A clear and concise description of what you want to achieve. An image or a code example is worth a thousand words! --> 13 | 14 | ## Context 15 | <!-- Why is this change important to you? How would you use it? How can it benefit other users? --> 16 | 17 | ## Possible Implementation 18 | <!-- (Optional) Suggest an idea for implementing the addition or change. --> 19 | 20 | ## Possible Alternatives 21 | <!-- (Optional) Describe any alternative solutions or features you've considered. --> 22 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | Why was this PR created? 3 | 4 | ## Development notes 5 | What have you changed, and how has this been tested? 6 | 7 | ## Checklist 8 | 9 | - [ ] Read the [contributing](https://github.com/Galileo-Galilei/kedro-mlflow/blob/master/CONTRIBUTING.md) guidelines 10 | - [ ] Open this PR as a 'Draft Pull Request' if it is work-in-progress 11 | - [ ] Update the documentation to reflect the code changes 12 | - [ ] Add a description of this change and add your name to the list of supporting contributions in the [`CHANGELOG.md`](https://github.com/Galileo-Galilei/kedro-mlflow/blob/master/CHANGELOG.md) file. Please respect [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) guidelines. 13 | - [ ] Add tests to cover your changes 14 | 15 | ## Notice 16 | 17 | - [ ] I acknowledge and agree that, by checking this box and clicking "Submit Pull Request": 18 | 19 | - I submit this contribution under the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0.txt) and represent that I am entitled to do so on behalf of myself, my employer, or relevant third parties, as applicable. 20 | - I certify that (a) this contribution is my original creation and / or (b) to the extent it is not my original creation, I am authorised to submit this contribution on behalf of the original creator(s) or their licensees.
21 | - I certify that the use of this contribution as authorised by the Apache 2.0 license does not violate the intellectual property rights of anyone else. 22 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | open-pull-requests-limit: 10 8 | -------------------------------------------------------------------------------- /.github/workflows/check-links.yml: -------------------------------------------------------------------------------- 1 | name: check-links 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | tags: 7 | - '*' 8 | pull_request: 9 | branches: [master] 10 | schedule: 11 | - cron: '0 3 * * 1' # runs at 3 AM every monday 12 | 13 | jobs: 14 | markdown-link-check: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v2 18 | - uses: gaurav-nelson/github-action-markdown-link-check@v1 19 | with: 20 | use-quiet-mode: 'yes' 21 | use-verbose-mode: 'yes' 22 | config-file: 'mlc_config.json' 23 | -------------------------------------------------------------------------------- /.github/workflows/prepare-release.yml: -------------------------------------------------------------------------------- 1 | name: create-release-candidate 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | version_part: 7 | description: The part of the version to update (patch, minor or major) 8 | type: choice 9 | options: 10 | - patch 11 | - minor 12 | - major 13 | default: 'patch' 14 | required: true 15 | 16 | jobs: 17 | prepare-release: 18 | runs-on: ubuntu-latest 19 | strategy: 20 | matrix: 21 | python-version: [3.11] 22 | env: 23 | PYTHON_PACKAGE: kedro_mlflow 24 | steps: 25 | - uses: actions/checkout@v4 26 | - name: Install uv 27 | uses: astral-sh/setup-uv@v5 28 | with: 29 | enable-cache: true 30 | - name: Set up Python ${{ matrix.python-version }} 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version: ${{ matrix.python-version }} 34 | - name: Validate inputs 35 | run: | 36 | echo "INPUT_VERSION_PART: ${{ github.event.inputs.version_part }}" 37 | - name: Bump the version number # bump2version is a maintained fork of original bumpversion 38 | id: bump_version 39 | run: | 40 | uv tool install bump-my-version 41 | uvx bump-my-version bump ${{ github.event.inputs.version_part }} 42 | echo "package_version=$(cat $PYTHON_PACKAGE/__init__.py | grep -Po '\d+\.\d+\.\d+')" >> $GITHUB_OUTPUT 43 | - name: Update the CHANGELOG according to 'Keep a Changelog' guidelines 44 | uses: thomaseizinger/keep-a-changelog-new-release@v1 45 | with: 46 | version: ${{ steps.bump_version.outputs.package_version }} 47 | - name: Create a new release branch 48 | run: | 49 | git config user.name github-actions 50 | git config user.email github-actions@github.com 51 | git checkout -b release-${{ steps.bump_version.outputs.package_version }} 52 | git push -u origin release-${{ steps.bump_version.outputs.package_version }} 53 | - name: Commit the changes 54 | run: | 55 | git commit -am "Bump version and CHANGELOG for release ${{ steps.bump_version.outputs.package_version }}" 56 | git push 57 | - name: Open a PR to merge the release to master 58 | id: open_pr 59 | run: | 60 | gh pr create -B master -H release-${{ steps.bump_version.outputs.package_version }} --title "Release ${{ steps.bump_version.outputs.package_version }}" --body "Bump version and CHANGELOG for next 
release." --assignee "${{ github.repository_owner }}" 61 | echo "pull_request_number=$(gh pr list --base master --json number,createdAt --jq 'sort_by(.createdAt) | reverse | .[0].number')" >> $GITHUB_OUTPUT 62 | env: 63 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 64 | - name: Change the commit message to add PR number 65 | run: | 66 | git commit -a --amend -m ":rocket: Bump version and CHANGELOG for release ${{ steps.bump_version.outputs.package_version }} (#${{ steps.open_pr.outputs.pull_request_number }})" 67 | git push -f 68 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: publish 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | deploy: 7 | runs-on: ubuntu-latest 8 | environment: 9 | name: release 10 | permissions: 11 | contents: write # IMPORTANT: this permission is mandatory to enable creating a release 12 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 13 | env: 14 | PYTHON_PACKAGE: kedro_mlflow 15 | steps: 16 | - name: Checkout the repo 17 | uses: actions/checkout@v4 18 | with: 19 | fetch-depth: 0 # necessary to enable merging, all the history is needed 20 | - name: Install uv 21 | uses: astral-sh/setup-uv@v5 22 | with: 23 | enable-cache: true 24 | - name: Set up Python 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: "3.11" 28 | - name: Build package dist from source # A better way will be : https://packaging.python.org/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ but pep 517 is still marked as experimental 29 | run: | 30 | uv build --wheel --sdist 31 | - name: Set dynamically package version as output variable 32 | # see https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-of-setting-an-output-parameter 33 | id: set_package_version 34 | run: | 35 | echo "PACKAGE_VERSION=$(cat $PYTHON_PACKAGE/__init__.py | grep -Po '\d+\.\d+\.\d+')" >> "$GITHUB_OUTPUT" 36 | - name: Create temporary file with the body content for the release 37 | run: | 38 | grep -Poz "## \[${{steps.set_package_version.outputs.PACKAGE_VERSION}}] - \d{4}-\d{2}-\d{2}[\S\s]+?(?=## \[\d+\.\d+\.\d+\]|\[.+\]:)" CHANGELOG.md > release_body.md 39 | - name: Create Release 40 | id: create_release 41 | uses: softprops/action-gh-release@v2 42 | env: 43 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token 44 | with: 45 | tag_name: ${{ steps.set_package_version.outputs.PACKAGE_VERSION }} 46 | name: Release ${{ steps.set_package_version.outputs.PACKAGE_VERSION }} 47 | body_path: ./release_body.md 48 | draft: false 49 | prerelease: false 50 | - name: Rollback Release in case of run failure 51 | if: failure() && steps.create_release.outputs.id != '' 52 | uses: author/action-rollback@stable 53 | with: 54 | # Using a known release ID 55 | release_id: ${{ steps.create_release.outputs.id }} 56 | env: 57 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 58 | 59 | - name: Publish package distributions to PyPI 60 | uses: pypa/gh-action-pypi-publish@release/v1 61 | with: 62 | verbose: true # trace if the upload fails 63 | -------------------------------------------------------------------------------- /.github/workflows/test_on_master.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | 7 | jobs: 8 | 
lint_and_test: 9 | uses: ./.github/workflows/test_reusable_workflow.yml 10 | strategy: 11 | matrix: 12 | python-version: ["3.9", "3.10", "3.11", "3.12"] 13 | os: [ubuntu-latest, macos-latest, windows-latest] 14 | with: 15 | python-version: ${{ matrix.python-version }} 16 | os: ${{ matrix.os }} 17 | secrets: inherit 18 | -------------------------------------------------------------------------------- /.github/workflows/test_on_pr.yml: -------------------------------------------------------------------------------- 1 | name: test_on_PR 2 | 3 | on: 4 | pull_request: 5 | branches: [master] 6 | 7 | jobs: 8 | lint_and_test: 9 | uses: ./.github/workflows/test_reusable_workflow.yml 10 | with: 11 | python-version: "3.11" 12 | os: "ubuntu-latest" 13 | secrets: inherit 14 | -------------------------------------------------------------------------------- /.github/workflows/test_reusable_workflow.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: test 5 | 6 | on: 7 | workflow_call: 8 | inputs: 9 | os: 10 | required: true 11 | type: string 12 | python-version: 13 | required: true 14 | type: string 15 | secrets: 16 | CODECOV_TOKEN: 17 | required: true 18 | 19 | jobs: 20 | lint_and_test: 21 | runs-on: ${{ inputs.os }} 22 | env: 23 | OS: ${{ inputs.os }} 24 | PYTHON_VERSION: ${{ inputs.python-version }} 25 | steps: 26 | - uses: actions/checkout@v3 27 | - name: Install uv 28 | uses: astral-sh/setup-uv@v5 29 | with: 30 | enable-cache: true 31 | - name: Set up Python ${{ inputs.python-version }} 32 | uses: actions/setup-python@v5 33 | with: 34 | python-version: ${{ inputs.python-version }} 35 | - name: Install dependencies 36 | run: | 37 | uv venv 38 | uv pip install .[test] 39 | - name: Check code formatting with ruff 40 | if: ${{ inputs.os }} == 'ubuntu-latest' && ${{ inputs.python-version }} == '3.11' # linting should occur only once in the loop 41 | run: | 42 | uv run ruff format . --check 43 | - name: Check import order and syntax with ruff 44 | if: ${{ inputs.os }} == 'ubuntu-latest' && ${{ inputs.python-version }} == '3.11' # linting should occur only once in the loop 45 | run: | 46 | uv run ruff check . 
47 | - name: Test with pytest and generate coverage report 48 | run: | 49 | uv run pytest -x --cov=./ --cov-report=xml -n auto 50 | - name: Upload coverage report to Codecov 51 | uses: codecov/codecov-action@v1 52 | if: ${{ inputs.os }} == 'ubuntu-latest' && ${{ inputs.python-version }} == '3.11' # upload should occur only once in the loop 53 | with: 54 | token: ${{ secrets.CODECOV_TOKEN }} # token is not mandatory but make access more stable 55 | file: ./coverage.xml 56 | env_vars: OS,PYTHON 57 | fail_ci_if_error: true 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # vscode 132 | .vscode 133 | 134 | # mlflow 135 | mlruns/ 136 | 137 | # ruff 138 | .ruff_cache 139 | 140 | debug/ 141 | *.xlsx 142 | *.pptx 143 | 144 | # uv 145 | 146 | uv.lock 147 | 148 | # End of .gitignore 149 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: ^kedro_mlflow/template/project/run.py$ 2 | repos: 3 | - repo: https://github.com/astral-sh/ruff-pre-commit 4 | rev: v0.9.6 5 | hooks: 6 | - id: ruff 7 | args: [--fix, --exit-non-zero-on-fix] 8 | - id: ruff-format 9 | - repo: https://github.com/asottile/blacken-docs 10 | rev: v1.12.1 11 | hooks: 12 | - id: blacken-docs 13 | additional_dependencies: [black==25.1.0] 14 | - repo: https://github.com/pre-commit/pre-commit-hooks 15 | rev: v4.5.0 16 | hooks: 17 | - id: check-case-conflict 18 | - id: check-json 19 | - id: check-merge-conflict 20 | - id: check-toml 21 | - id: check-yaml 22 | - id: debug-statements 23 | - id: end-of-file-fixer 24 | - id: mixed-line-ending 25 | args: [--fix=lf] 26 | - id: trailing-whitespace 27 | args: [--markdown-linebreak-ext=md] 28 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # required since 2023/10 https://blog.readthedocs.com/use-build-os-config/ 9 | build: 10 | os: "ubuntu-22.04" 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | 19 | # Optionally build your docs in additional formats such as PDF 20 | formats: 21 | - pdf 22 | 23 | # Optionally set the version of Python and requirements required to build your docs 24 | python: 25 | install: 26 | - method: pip 27 | path: . 28 | extra_requirements: 29 | - doc 30 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributor Covenant Code of Conduct 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 
6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | - Using welcoming and inclusive language 12 | - Being respectful of differing viewpoints and experiences 13 | - Gracefully accepting constructive criticism 14 | - Focusing on what is best for the community 15 | - Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | - The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | - Trolling, insulting/derogatory comments, and personal or political attacks 21 | - Public or private harassment 22 | - Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | - Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at galileo.galilei.github@gmail.com. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | Attribution 41 | 42 | This Code of Conduct is adapted from the Contributor Covenant, version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 43 | 44 | For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq 45 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Development workflow 2 | 3 | The current workflow is the following: 4 | 5 | 1. Open an issue to describe your feature request or your bug fix with a detailed explanation of what you want to achieve. 6 | 2. Fork the repo 7 | 3. 
Develop locally: 8 | - Install the pre-commit hooks (`pip install pre-commit`, then `pre-commit install`) 9 | - Create a branch based on the master branch (``git checkout -b <prefix-branchname> master``) 10 | - Create a conda environment (``conda create -n <your-env-name> python==3.10``) 11 | - Activate this environment (`conda activate <your-env-name>`) 12 | - Install the extra dependencies for tests (`pip install kedro-mlflow[dev,test]`) 13 | - Apply your changes 14 | - Run the pre-commit checks (formatting, linting and import order with ``pre-commit run``) 15 | 4. Submit your changes: 16 | - Ensure test coverage is still 100% 17 | - Update the documentation accordingly 18 | - Update `CHANGELOG.md` according to the ["Keep a Changelog" guidelines](https://keepachangelog.com/en/1.0.0/) 19 | - Squash all the changes into a single commit as much as possible, and ensure the commit message has the format "[:gitmoji_icon:](https://gitmoji.dev/) Informative description (``#<issue-number>``)" 20 | - Rebase your branch on ``master`` to ensure a linear history 21 | - Open a pull request against ``master`` 22 | 5. Ask for review: 23 | - Assign the review to @Galileo-Galilei 24 | - Wait for the review 25 | - Resolve all discussions (go back to step 3.) 26 | 6. The PR will be merged as soon as possible 27 | 28 | **We reserve the right to take over (suppress or modify) PRs that do not match the workflow or are abandoned.** 29 | 30 | 31 | # Release workflow 32 | 33 | 1. Check the issues: 34 | - Ensure all the [release issues](https://github.com/Galileo-Galilei/kedro-mlflow/milestones) are completed. If necessary, move any issues that have not been addressed yet to a later release. 35 | - Create a [new milestone](https://github.com/Galileo-Galilei/kedro-mlflow/milestones) 36 | 2. Create the release candidate: 37 | - Go to the [create-release-candidate action](https://github.com/Galileo-Galilei/kedro-mlflow/actions?query=workflow%3Acreate-release-candidate) 38 | - Click "Run workflow" 39 | - Enter the part of the version to bump (one of `<major>.<minor>.<patch>`) 40 | 3. If the workflow has run successfully: 41 | - Go to the newly opened PR named "[Release candidate `<version>`](https://github.com/Galileo-Galilei/kedro-mlflow/pulls)" 42 | - Check that the changelog and version have been properly updated. 43 | - *(If everything is normal, skip this step)* If necessary, pull the branch and make changes 44 | - Merge the PR to master 45 | 4. Check the [publish workflow](https://github.com/Galileo-Galilei/kedro-mlflow/actions?query=workflow%3Apublish) to see whether: 46 | - The package has been uploaded to PyPI successfully 47 | - A GitHub release has been created 48 | 5. If the pipeline has failed, please raise an issue to fix the CI, and merge to master manually. 49 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | ignore: 2 | - "setup.py" 3 | - "tests/**/*" 4 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help".
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | from datetime import datetime 20 | 21 | from kedro_mlflow import __version__ as km_version 22 | 23 | project = "kedro-mlflow" 24 | copyright = f"{datetime.today().year}, Yolan Honoré-Rougé" 25 | author = "Yolan Honoré-Rougé" 26 | 27 | 28 | # The full version, including alpha/beta/rc tags 29 | release = km_version 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 | # ones. 37 | extensions = [ 38 | "sphinx.ext.autodoc", 39 | "sphinx.ext.autosummary", 40 | "sphinx.ext.napoleon", 41 | "sphinx_click", 42 | # "sphinx_autodoc_typehints", 43 | # "sphinx.ext.doctest", 44 | # "sphinx.ext.todo", 45 | # "sphinx.ext.coverage", 46 | # "sphinx.ext.mathjax", 47 | # "sphinx.ext.ifconfig", 48 | # "sphinx.ext.viewcode", 49 | # "nbsphinx", 50 | "sphinx_design", # responsive web component support 51 | "sphinx_copybutton", 52 | "sphinx_markdown_tables", 53 | "myst_parser", 54 | ] 55 | 56 | myst_enable_extensions = ["colon_fence"] 57 | 58 | # enable autosummary plugin (table of contents for modules/classes/class 59 | # methods) 60 | autosummary_generate = True 61 | autosummary_generate_overwrite = False 62 | napoleon_include_init_with_doc = True 63 | 64 | # enable documentation in markdown 65 | source_suffix = {".rst": "restructuredtext", ".md": "markdown"} 66 | 67 | 68 | # List of patterns, relative to source directory, that match files and 69 | # directories to ignore when looking for source files. 70 | # This pattern also affects html_static_path and html_extra_path. 71 | exclude_patterns = ["_build"] 72 | 73 | 74 | # -- Options for HTML output ------------------------------------------------- 75 | # The name of the Pygments (syntax highlighting) style to use. 76 | pygments_style = "sphinx" 77 | 78 | # The theme to use for HTML and HTML Help pages. See the documentation for 79 | # a list of builtin themes. 
80 | 81 | html_theme = "pydata_sphinx_theme" # see: https://pydata-sphinx-theme.readthedocs.io/en/stable/user_guide/layout.html 82 | 83 | # useful to create dropdown with the name of the directory as the section name 84 | # see https://stackoverflow.com/questions/36925871/toctree-nested-drop-down: 85 | html_theme_options = { 86 | "logo": { 87 | "image_light": "source/imgs/logo.png", 88 | "image_dark": "source/imgs/logo.png", 89 | }, 90 | # https://pydata-sphinx-theme.readthedocs.io/en/stable/user_guide/header-links.html#fontawesome-icons 91 | "icon_links": [ 92 | { 93 | "name": "GitHub", 94 | "url": "https://github.com/Galileo-Galilei/kedro-mlflow", 95 | "icon": "fa-brands fa-github", 96 | }, 97 | { 98 | "name": "PyPI", 99 | "url": "https://pypi.org/project/kedro-mlflow/", 100 | "icon": "fa-brands fa-python", 101 | }, 102 | { 103 | "name": "Slack", 104 | "url": "https://kedro-org.slack.com/", 105 | "icon": "fa-brands fa-slack", 106 | }, 107 | ], 108 | "navbar_start": ["navbar-logo"], # "version-switcher" to be configured 109 | "navbar_align": "content", 110 | "header_links_before_dropdown": 4, 111 | "secondary_sidebar_items": ["page-toc", "edit-this-page", "sourcelink"], 112 | "use_edit_page_button": True, 113 | } 114 | html_context = { 115 | "github_user": "Galileo-Galilei", 116 | "github_repo": "kedro-mlflow", 117 | "github_version": "master", 118 | "doc_path": "docs/", # why not "docs/source/"? 119 | "default_mode": "light", 120 | } 121 | html_sidebars = {"index": []} 122 | 123 | 124 | myst_heading_anchors = 5 125 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | myst: 3 | html_meta: 4 | "description lang=en": | 5 | Top-level documentation for kedro-mlflow, with links to the rest 6 | of the site. 7 | html_theme.sidebar_secondary.remove: true 8 | --- 9 | 10 | # The kedro-mlflow plugin 11 | 12 | ```kedro-mlflow``` is a Kedro [plugin](https://docs.kedro.org/en/stable/extend_kedro/plugins.html) to integrate [MLflow](https://www.mlflow.org/) effortlessly inside [Kedro](https://kedro.org/) projects. 13 | 14 | Its main features are **automatic parameters tracking**, **datasets tracking as artifacts**, Kedro **pipelines packaging** and serving and **automatic synchronisation between training and inference** pipelines. It aims at providing a complete yet modular framework for high reproducibility of machine learning experiments and ease of deployment. 15 | 16 | ::::{grid} 1 1 2 2 17 | :gutter: 3 18 | 19 | :::{grid-item-card} 20 | :link: source/03_experiment_tracking/01_experiment_tracking/01_configuration.html 21 | :link-type: url 22 | :class-header: bg-light 23 | 24 | {fas}`flask fa-xl;pst-color-primary` Experiment tracking 25 | ^^^ 26 | 27 | Track the **parameters**, **metrics**, **artifacts** and **models** of your kedro pipelines for reproducibility. 28 | ::: 29 | 30 | :::{grid-item-card} 31 | :link: source/04_pipeline_as_model/01_pipeline_as_custom_model/01_mlflow_models.html 32 | :link-type: url 33 | :class-header: bg-light 34 | 35 | {fas}`rocket fa-xl;pst-color-primary` Pipeline as model 36 | ^^^ 37 | 38 | Package any kedro pipeline to a **custom mlflow model** for deployment and serving. The custom model for an inference pipeline can be **registered** in mlflow **automatically** at the end of each training in a *scikit-learn* like way. 
39 | ::: 40 | 41 | :::: 42 | 43 | ## Resources 44 | 45 | ::::{grid} 1 1 4 4 46 | :gutter: 3 47 | 48 | :::{grid-item-card} 49 | :link: source/02_getting_started/01_installation/01_installation.html 50 | :link-type: url 51 | :class-header: bg-light 52 | 53 | {fas}`fa-solid fa-graduation-cap fa-xl;pst-color-primary` Quickstart 54 | ^^^ 55 | 56 | Get started in **1 mn** with experiment tracking! 57 | +++ 58 | Try out {fas}`arrow-right fa-xl` 59 | ::: 60 | 61 | :::{grid-item-card} 62 | :link: https://github.com/Galileo-Galilei/kedro-mlflow-tutorial 63 | :link-type: url 64 | :class-header: bg-light 65 | 66 | {fas}`fa-solid fa-chalkboard-user fa-xl;pst-color-primary` Advanced tutorial 67 | ^^^ 68 | 69 | The ``kedro-mlflow-tutorial`` github repo contains a step-by-step tutorial to learn how to use kedro-mlflow as a MLOps framework! 70 | 71 | +++ 72 | Try on github {fab}`github;fa-xl` 73 | ::: 74 | 75 | :::{grid-item-card} 76 | :link: https://www.youtube.com/watch?v=Az_6UKqbznw 77 | :link-type: url 78 | :class-header: bg-light 79 | 80 | {fas}`fa-solid fa-video fa-xl;pst-color-primary` Demonstration in video 81 | ^^^ 82 | 83 | A youtube video by the kedro team to introduce the plugin, with live coding. 84 | 85 | +++ 86 | Watch on youtube {fab}`youtube;fa-xl` 87 | ::: 88 | 89 | :::{grid-item-card} 90 | :link: https://youtu.be/mIfJR3CdBUE 91 | :link-type: url 92 | :class-header: bg-light 93 | 94 | {fas}`fa-solid fa-video fa-xl;pst-color-primary` Tackling the ML Reproducibility Curse 95 | ^^^ 96 | 97 | A community video by [Oleg Litvinov](https://github.com/OlegBEZb) showcasing how to use the Kedro-MLflow plugin on an end to end project. 98 | +++ 99 | Watch on YouTube {fab}`youtube;fa-xl` 100 | ::: 101 | :::: 102 | 103 | ```{toctree} 104 | --- 105 | maxdepth: 1 106 | hidden: true 107 | --- 108 | source/01_introduction/index 109 | source/02_getting_started/index 110 | source/03_experiment_tracking/index 111 | source/04_pipeline_as_model/index 112 | source/05_API/index 113 | Changelog <https://github.com/Galileo-Galilei/kedro-mlflow/releases> 114 | source/06_migration_guide/index 115 | ``` 116 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/01_introduction/02_motivation.md: -------------------------------------------------------------------------------- 1 | # Motivation behind the plugin 2 | 3 | ## When should I use kedro-mlflow? 
4 | 5 | Basically, you should use `kedro-mlflow` in **any `Kedro` project which involves machine learning** / deep learning. As stated in the [introduction](https://kedro-mlflow.readthedocs.io/en/latest/source/01_introduction/index.html), `Kedro`'s current versioning (as of version `0.19.10`) is not sufficient for machine learning projects: it lacks a UI and a ``run`` management system. Besides, the ability of `KedroPipelineModel` to serve a kedro pipeline as an API or a batch in one line of code is a great addition for collaboration and transition to production. 6 | 7 | If you do not use ``Kedro`` or if you do pure data processing which does not involve *machine learning*, this plugin is not what you are looking for ;-) 8 | 9 | ## Why should I use kedro-mlflow? 10 | 11 | ### Benchmark of existing solutions 12 | 13 | This paragraph gives a (quick) overview of existing solutions for mlflow integration inside Kedro projects. 14 | 15 | ``Mlflow`` is very simple to add to any existing code. It is a two-step process: 16 | 17 | - add `log_{XXX}` (either param, artifact, metric or model) functions where they are needed inside the code 18 | - add an `MLProject` file at the root of the project to enable CLI execution. This file must contain all the possible execution steps (like the `pipeline.py` / `hooks.py` in a kedro project). 19 | 20 | Including mlflow inside a ``kedro project`` is consequently very easy: the logging functions can be added in the code, and the ``MLProject`` is very simple and is composed almost only of the ``kedro run`` command. You can find examples of such implementations: 21 | 22 | - the [Medium post](https://medium.com/quantumblack/deploying-and-versioning-data-pipelines-at-scale-942b1d81b5f5) by QuantumBlack employees 23 | - the associated [github repo](https://github.com/tgoldenberg/kedro-mlflow-example) 24 | - other examples can be found on GitHub, but as far as we know they all follow the very same principles. 25 | 26 | ### Enforcing Kedro principles 27 | 28 | The above implementations have the advantage of being very straightforward and *mlflow compliant*, but they break several ``Kedro`` principles: 29 | 30 | - the ``MLFLOW_TRACKING_URI`` which registers the database where runs are logged is declared inside the code instead of a configuration file, which **hinders portability across environments** and makes the transition to production more difficult 31 | - the logging of different elements can be put in many places in the ``Kedro`` template (in the code of any function involved in a ``node``, in a ``Hook``, in the ``ProjectContext``, in a ``transformer``...). This is not compliant with the ``Kedro`` template, where each object has a dedicated location. We want to avoid logging calls scattered anywhere in the code because: 32 | - it is **very error-prone** (one can easily forget to log a parameter) 33 | - it is **hard to modify** (if you want to remove / add / modify an mlflow action, you have to find it in the code) 34 | - it **prevents reuse** (re-usable functions must not contain mlflow-specific code unrelated to their functional purpose; only their execution must be tracked). 35 | 36 | ``kedro-mlflow`` enforces these best practices while implementing a clear interface for each mlflow action in the Kedro template. The chart below maps each mlflow action to the Python API provided by ``kedro-mlflow`` and the location in the Kedro template where the action should be performed.
37 | 38 | | Mlflow action | Template file | Python API | 39 | | :------------------------ | :-------------- | :--------------------------------------------------------- | 40 | | Set up configuration | ``mlflow.yml`` | ``MlflowHook`` | 41 | | Logging parameters | ``mlflow.yml`` | ``MlflowHook`` | 42 | | Logging artifacts | ``catalog.yml`` | ``MlflowArtifactDataset`` | 43 | | Logging models | ``catalog.yml`` | `MlflowModelTrackingDataset` and `MlflowModelLocalFileSystemDataset` | 44 | | Logging metrics | ``catalog.yml`` | ``MlflowMetricsHistoryDataset`` | 45 | | Logging Pipeline as model | ``hooks.py`` | ``KedroPipelineModel`` and ``pipeline_ml_factory`` | 46 | 47 | `kedro-mlflow` does not currently provide an interface to set tags outside a Kedro ``Pipeline``. Some of the above decisions are subject to debate and design trade-offs (for instance, metrics are often updated in a loop during each epoch / training iteration and it does not always make sense to register the metric between computation steps, e.g. as an I/O operation after a node run). 48 | 49 | ```{note} 50 | You do **not** need any ``MLProject`` file to use mlflow inside your Kedro project. As seen in the [introduction](https://kedro-mlflow.readthedocs.io/en/latest/source/01_introduction/index.html), this file overlaps with Kedro configuration files. 51 | ``` 52 | -------------------------------------------------------------------------------- /docs/source/01_introduction/index.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | ```{toctree} 4 | :caption: Introduction to kedro-mlflow 5 | 6 | 01_introduction 7 | 02_motivation 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/source/02_getting_started/01_installation/01_installation.md: -------------------------------------------------------------------------------- 1 | # Installation guide 2 | 3 | ## Pre-requisites 4 | 5 | ### Create a virtual environment 6 | 7 | I strongly recommend creating a virtual environment in order to avoid version conflicts between packages. I use ``conda`` in this tutorial. 8 | 9 | I also recommend reading the [Kedro installation guide](https://kedro.readthedocs.io/en/latest/get_started/install.html) to set up your Kedro project. 10 | 11 | ```console 12 | conda create -n <your-environment-name> python=<3.[6-8].X> 13 | ``` 14 | 15 | For the rest of the section, we assume the environment is activated: 16 | 17 | ```console 18 | conda activate <your-environment-name> 19 | ``` 20 | 21 | ### Check your kedro version 22 | 23 | If you have an existing environment with kedro already installed, make sure its version is above `0.16.0`. `kedro-mlflow` cannot be used with `kedro<0.16.0`, and if you install it in an existing environment, it will reinstall a more up-to-date version of kedro and likely mess your project up until you reinstall the proper version of kedro (the one you originally created the project with).
24 | 25 | ```console 26 | pip show kedro 27 | ``` 28 | 29 | should return: 30 | 31 | ```console 32 | Name: kedro 33 | Version: <your-kedro-version> # <-- make sure it is above 0.16.0, <0.17.0 34 | Summary: Kedro helps you build production-ready data and analytics pipelines 35 | Home-page: https://github.com/quantumblacklabs/kedro 36 | Author: QuantumBlack Labs 37 | Author-email: None 38 | License: Apache Software License (Apache 2.0) 39 | Location: <...>\anaconda3\envs\<your-environment-name>\lib\site-packages 40 | Requires: pip-tools, cachetools, fsspec, toposort, anyconfig, PyYAML, click, pluggy, jmespath, python-json-logger, jupyter-client, setuptools, cookiecutter 41 | ``` 42 | 43 | ## Install the plugin 44 | 45 | There are versions of the plugin compatible up to ``kedro>=0.16.0`` and ``mlflow>=0.8.0``. ``kedro-mlflow`` stops adding features to a minor version 2 to 6 months after a new kedro release. 46 | 47 | ::::{tab-set} 48 | 49 | :::{tab-item} Install with pip / uv 50 | 51 | You can install ``kedro-mlflow`` plugin from ``PyPi`` with `pip`: 52 | 53 | ```console 54 | pip install --upgrade kedro-mlflow 55 | ``` 56 | 57 | If you prefer uv and have it installed, you can use: 58 | 59 | ```console 60 | uv pip install --upgrade kedro-mlflow 61 | ``` 62 | 63 | 64 | ::: 65 | 66 | :::{tab-item} Install with conda / mamba / micromamba 67 | 68 | You can install ``kedro-mlflow`` plugin with `conda` from the ``conda-forge`` channel: 69 | 70 | ```console 71 | conda install kedro-mlflow -c conda-forge 72 | ``` 73 | 74 | ::: 75 | 76 | :::{tab-item} Install from github 77 | 78 | You may want to install the master branch from source which has unreleased features: 79 | 80 | ```console 81 | pip install git+https://github.com/Galileo-Galilei/kedro-mlflow.git 82 | ``` 83 | 84 | ::: 85 | 86 | :::: 87 | 88 | 89 | ## Check the installation 90 | 91 | Enter ``kedro info`` in a terminal with the activated virtual env to check the installation. If it has succeeded, you should see the following ascii art: 92 | 93 | ```console 94 | _ _ 95 | | | _____ __| |_ __ ___ 96 | | |/ / _ \/ _` | '__/ _ \ 97 | | < __/ (_| | | | (_) | 98 | |_|\_\___|\__,_|_| \___/ 99 | v0.<minor>.<patch> 100 | 101 | kedro allows teams to create analytics 102 | projects. It is developed as part of 103 | the Kedro initiative at QuantumBlack. 104 | 105 | Installed plugins: 106 | kedro_mlflow: 0.14.0 (hooks:global,project) 107 | ``` 108 | 109 | The version ``0.14.0`` of the plugin is installed and has both global and project commands. 110 | 111 | That's it! You are now ready to go! 112 | 113 | ## Available commands 114 | 115 | With the ``kedro mlflow -h`` command outside of a kedro project, you now see the following output: 116 | 117 | ```console 118 | Usage: kedro mlflow [OPTIONS] COMMAND [ARGS]... 119 | 120 | Use mlflow-specific commands inside kedro project. 121 | 122 | Options: 123 | -h, --help Show this message and exit. 124 | ``` 125 | -------------------------------------------------------------------------------- /docs/source/02_getting_started/01_installation/02_setup.md: -------------------------------------------------------------------------------- 1 | # Initialize your Kedro project 2 | 3 | This section assume that [you have installed `kedro-mlflow` in your virtual environment](https://kedro-mlflow.readthedocs.io/en/latest/source/02_getting_started/01_installation/01_installation.html). 4 | 5 | ## Create a kedro project 6 | 7 | This plugin must be used in an existing kedro project. 
If you do not have a kedro project yet, you can create one with the ``kedro new`` command. [See the kedro docs for a tutorial](https://kedro.readthedocs.io/en/latest/get_started/new_project.html). 8 | 9 | If you do not have a real-world project, you can use a kedro example and [follow the "Quickstart in 1 mn" example](https://kedro-mlflow.readthedocs.io/en/latest/source/02_getting_started/02_quickstart/01_example_project.html) to make a demo of this plugin out of the box. 10 | 11 | ## Activate `kedro-mlflow` in your kedro project 12 | 13 | In order to use the ``kedro-mlflow`` plugin, you need to set up its configuration and declare its hooks. 14 | 15 | ### Setting up the ``kedro-mlflow`` configuration file 16 | 17 | 18 | ``kedro-mlflow`` is [configured](https://kedro-mlflow.readthedocs.io/en/latest/source//05_API/01_python_objects/05_Configuration.html) through an ``mlflow.yml`` file. The recommended way to initialize the `mlflow.yml` is by using [the ``kedro-mlflow`` CLI](https://kedro-mlflow.readthedocs.io/en/latest/source/05_API/01_python_objects/04_CLI.html), but you can create it manually. 19 | 20 | ```{note} 21 | Since ``kedro-mlflow>=0.11.2``, the configuration file is optional. However, the plugin will then use the default ``mlflow`` configuration. Specifically, the runs will be stored in an ``mlruns`` folder at the root of the kedro project, since no ``mlflow_tracking_uri`` is configured. 22 | ``` 23 | 24 | Set the working directory at the root of your kedro project: 25 | 26 | ```console 27 | cd path/to/your/project 28 | ``` 29 | 30 | Run the init command: 31 | 32 | ```console 33 | kedro mlflow init 34 | ``` 35 | 36 | You should see the following message: 37 | 38 | ```console 39 | 'conf/local/mlflow.yml' successfully updated. 40 | ``` 41 | 42 | *Note: you can create the configuration file in another kedro environment with the `--env` argument:* 43 | 44 | ```console 45 | kedro mlflow init --env=<other-environment> 46 | ``` 47 | 48 | ### Declaring ``kedro-mlflow`` hooks 49 | 50 | ``kedro_mlflow`` hook implementations must be registered with Kedro. There are two ways of registering [hooks](https://kedro.readthedocs.io/en/latest/hooks/introduction.html). 51 | 52 | ```{important} 53 | You must register the hook provided by ``kedro-mlflow`` (the ``MlflowHook``) to make the plugin work. 54 | ``` 55 | 56 | ::::{tab-set} 57 | 58 | :::{tab-item} `kedro>=0.16.4` - auto-discovery 59 | 60 | If you use `kedro>=0.16.4`, `kedro-mlflow` hooks are auto-registered by default without any action from your side. You can [disable this behaviour](https://kedro.readthedocs.io/en/latest/hooks/introduction.html#disable-auto-registered-plugins-hooks) in your `settings.py` file.
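For instance, here is a minimal sketch of what opting out could look like, assuming you deliberately want to turn the plugin's hooks off (``DISABLE_HOOKS_FOR_PLUGINS`` is the kedro setting used to disable auto-registered plugin hooks):

```python
# <your_project>/src/<your_project>/settings.py
# Hypothetical example: opt out of kedro-mlflow's auto-registered hooks.
# Only add this if you really want to deactivate the plugin.
DISABLE_HOOKS_FOR_PLUGINS = ("kedro-mlflow",)
```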
61 | 62 | ::: 63 | 64 | :::{tab-item} `kedro>=0.16.0, <=0.16.3` - register in ``settings.py`` 65 | 66 | If you have turned off plugin automatic registration, you can register its hooks manually by [adding them to ``settings.py``](https://kedro.readthedocs.io/en/latest/hooks/introduction.html#registering-your-hook-implementations-with-kedro): 67 | 68 | ```python 69 | # <your_project>/src/<your_project>/settings.py 70 | from kedro_mlflow.framework.hooks import MlflowHook 71 | 72 | HOOKS = (MlflowHook(),) 73 | ``` 74 | 75 | ::: 76 | 77 | :::: 78 | -------------------------------------------------------------------------------- /docs/source/02_getting_started/02_quickstart/00_intro_tutorial.md: -------------------------------------------------------------------------------- 1 | # Goal of the tutorial 2 | 3 | This "Getting started" section demonstrates how to use some basic functionalities of `kedro-mlflow` in an end to end example. It is supposed to be simple and self-contained and is partially redundant with other sections, but far from complete. 4 | 5 | The **section only focuses on experiment tracking** part and **does _not_ show the "machine learning framework" abilities** of the plugin. The goal is to give to a new user a quick glance to some capabilities so that he can decide whether the plugin suits its needs or not. It is totally worth checking the other sections to have a much more complete overview of what this plugin provides. 6 | -------------------------------------------------------------------------------- /docs/source/02_getting_started/02_quickstart/01_example_project.md: -------------------------------------------------------------------------------- 1 | # Example project 2 | 3 | ## Install the plugin in a virtual environment 4 | 5 | Create a conda environment and install ``kedro-mlflow`` (this will automatically install ``kedro>=0.16.0``). 6 | 7 | ```console 8 | conda create -n km_example python=3.10 --yes 9 | conda activate km_example 10 | pip install kedro-mlflow 11 | ``` 12 | 13 | ## Install the toy project 14 | 15 | For this end to end example, we will use the [kedro starter](https://docs.kedro.org/en/stable/starters/starters.html#official-kedro-starters) with the [iris dataset](https://github.com/kedro-org/kedro-starters). 16 | 17 | We use this project because: 18 | 19 | - it covers most of the common use cases 20 | - it is compatible with older version of ``Kedro`` so newcomers are used to it 21 | - it is maintained by ``Kedro`` maintainers and therefore enforces some best practices. 22 | 23 | 24 | ::::{tab-set} 25 | 26 | :::{tab-item} ``kedro>=0.19.0`` 27 | 28 | ```{warning} 29 | For ``kedro>=0.19.0``, ``pandas-iris`` starter has been removed. It is recommended to install [``spaceflights-pandas`` starter instead](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights-pandas). 30 | ``` 31 | 32 | ::: 33 | 34 | :::{tab-item} ``kedro>=0.16.3,<0.19`` 35 | 36 | The default starter is now called "pandas-iris". In a new console, enter: 37 | 38 | ```console 39 | kedro new --starter=pandas-iris 40 | ``` 41 | 42 | Answer ``Kedro Mlflow Example``, ``km-example`` and ``km_example`` to the three setup questions of a new kedro project: 43 | 44 | ```console 45 | Project Name: 46 | ============= 47 | Please enter a human readable name for your new project. 48 | Spaces and punctuation are allowed. 49 | [New Kedro Project]: Kedro Mlflow Example 50 | 51 | Repository Name: 52 | ================ 53 | Please enter a directory name for your new project repository. 
54 | Alphanumeric characters, hyphens and underscores are allowed. 55 | Lowercase is recommended. 56 | [kedro-mlflow-example]: km-example 57 | 58 | Python Package Name: 59 | ==================== 60 | Please enter a valid Python package name for your project package. 61 | Alphanumeric characters and underscores are allowed. 62 | Lowercase is recommended. Package name must start with a letter or underscore. 63 | [kedro_mlflow_example]: km_example 64 | ``` 65 | 66 | ::: 67 | 68 | :::{tab-item} ``kedro>=0.16.0, <=0.16.2`` 69 | 70 | With older versions of ``Kedro``, the starter option is not available, but this ``kedro new`` provides an "Include example" question. Answer ``y`` to this question to get the same starter as above. In a new console, enter: 71 | 72 | ```console 73 | kedro new 74 | ``` 75 | 76 | Answer ``Kedro Mlflow Example``, ``km-example``, ``km_example`` and ``y`` to the four setup questions of a new kedro project: 77 | 78 | ```console 79 | Project Name: 80 | ============= 81 | Please enter a human readable name for your new project. 82 | Spaces and punctuation are allowed. 83 | [New Kedro Project]: Kedro Mlflow Example 84 | 85 | Repository Name: 86 | ================ 87 | Please enter a directory name for your new project repository. 88 | Alphanumeric characters, hyphens and underscores are allowed. 89 | Lowercase is recommended. 90 | [kedro-mlflow-example]: km-example 91 | 92 | Python Package Name: 93 | ==================== 94 | Please enter a valid Python package name for your project package. 95 | Alphanumeric characters and underscores are allowed. 96 | Lowercase is recommended. Package name must start with a letter or underscore. 97 | [kedro_mlflow_example]: km_example 98 | 99 | Generate Example Pipeline: 100 | ========================== 101 | Do you want to generate an example pipeline in your project? 102 | Good for first-time users. (default=N) 103 | [y/N]: y 104 | ``` 105 | 106 | ::: 107 | 108 | :::: 109 | 110 | ## Install dependencies 111 | 112 | Move to the project directory: 113 | 114 | ```console 115 | cd km-example 116 | ``` 117 | 118 | Install the project dependencies : 119 | 120 | ```{warning} 121 | Do not use ``kedro install`` commands which [does not install the packages in your activated environment](https://github.com/quantumblacklabs/kedro/issues/589). It has been removed in ``kedro>=0.19``. 
122 | ``` 123 | 124 | ```console 125 | pip install -r src/requirements.txt 126 | ``` 127 | -------------------------------------------------------------------------------- /docs/source/02_getting_started/index.md: -------------------------------------------------------------------------------- 1 | # {octicon}`mortar-board` Getting started 2 | 3 | ```{toctree} 4 | :caption: Installation 5 | 6 | 01_installation/01_installation 7 | 01_installation/02_setup 8 | 01_installation/03_migration_guide 9 | ``` 10 | 11 | ```{toctree} 12 | :caption: Quickstart 13 | 14 | 02_quickstart/00_intro_tutorial 15 | 02_quickstart/01_example_project 16 | 02_quickstart/02_first_steps 17 | ``` 18 | -------------------------------------------------------------------------------- /docs/source/03_experiment_tracking/01_experiment_tracking/02_version_parameters.md: -------------------------------------------------------------------------------- 1 | # Track parameters 2 | 3 | ## Automatic parameters tracking 4 | 5 | Parameters tracking is automatic when the ``MlflowHook`` is added to [the hook list of the ``ProjectContext``](https://kedro-mlflow.readthedocs.io/en/latest/source/02_getting_started/01_installation/02_setup.html). The `mlflow.yml` configuration file has a parameter called ``flatten_dict_params`` which enables to [log as distinct parameters the (key, value) pairs of a ```dict`` parameter](https://kedro-mlflow.readthedocs.io/en/latest/source/05_API/01_python_objects/02_Hooks.html). 6 | 7 | You **do not need any additional configuration** to benefit from parameters versioning. 8 | 9 | ```{hint} 10 | 11 | **How does ``MlflowHook`` operates under the hood?** 12 | 13 | The [medium post which introduces hooks](https://medium.com/quantumblack/introducing-kedro-hooks-fd5bc4c03ff5) explains in detail the steps ``Kedro`` executes when the user calls the ``kedro run`` command. 14 | 15 | ![](../../imgs/hook_registration_process.png) 16 | 17 | The `MlflowHook` registers the parameters before each node (entry point number 3 on above picture) by calling `mlflow.log_parameter(param_name, param_value)` on each parameters of the node. 18 | 19 | ``` 20 | 21 | ## Frequently asked questions 22 | 23 | :::{dropdown} How are parameters detected by the plugin? 24 | The hook **detects parameters through their prefix ``params:`` or the value ``parameters``**. These are the [reserved keywords used by Kedro to define parameters](https://docs.kedro.org/en/stable/configuration/parameters.html#how-to-use-parameters) in the ``pipeline.py`` file(s). 25 | ::: 26 | 27 | :::{dropdown} Will parameters be recorded if the pipeline fails during execution? 28 | The parameters are registered node by node (and not in a single batch at the beginning of the execution). If the pipeline fails in the middle of its execution, the **parameters of the nodes who have been run will be recorded**, but **not the parameters of non executed nodes**. 29 | ::: 30 | -------------------------------------------------------------------------------- /docs/source/03_experiment_tracking/01_experiment_tracking/04_version_models.md: -------------------------------------------------------------------------------- 1 | # Track models 2 | 3 | ## What is model tracking? 4 | 5 | MLflow allows to serialize and deserialize models to a common format, track those models in MLflow Tracking and manage them using MLflow Model Registry. 
Many popular Machine / Deep Learning frameworks have built-in support through what MLflow calls [flavors](https://www.mlflow.org/docs/latest/models.html#built-in-model-flavors). Even if there is no flavor for your framework of choice, it is easy to [create your own flavor](https://www.mlflow.org/docs/latest/models.html#custom-python-models) and integrate it with MLflow. 6 | 7 | ## How to track models using MLflow in a Kedro project? 8 | 9 | `kedro-mlflow` introduces two new `DataSet` types that can be used in the `DataCatalog`, called `MlflowModelTrackingDataset` and `MlflowModelLocalFileSystemDataset`. The two have very similar APIs, except that: 10 | 11 | - the ``MlflowModelTrackingDataset`` is used to load from and save to the mlflow artifact store. It uses an optional `run_id` argument to load and save from a given `run_id`, which must exist in the mlflow server you are logging to. 12 | - the ``MlflowModelLocalFileSystemDataset`` is used to load from and save to a given path. It uses the standard `filepath` argument in the constructor of Kedro DataSets. Note that it **does not log in mlflow**. 13 | 14 | *Note: If you use ``MlflowModelTrackingDataset``, the model will be saved during training in your current run. However, you will need to specify the run id to predict with (since it is not persisted locally, it will not pick the latest model by default). You may prefer to combine ``MlflowModelLocalFileSystemDataset`` and ``MlflowArtifactDataset`` to persist it both locally and remotely, see further.* 15 | 16 | Suppose you would like to register a `scikit-learn` model of your `DataCatalog` in mlflow; you can use the following yaml API: 17 | 18 | ```yaml 19 | my_sklearn_model: 20 | type: kedro_mlflow.io.models.MlflowModelTrackingDataset 21 | flavor: mlflow.sklearn 22 | ``` 23 | 24 | More information on available parameters is available in the [dedicated section](https://kedro-mlflow.readthedocs.io/en/latest/source/05_API/01_python_objects/01_Datasets.html#mlflowmodeltrackingdataset). 25 | 26 | You are now able to use ``my_sklearn_model`` in your nodes. Since this model is registered in mlflow, you can also leverage the [mlflow model serving abilities](https://www.mlflow.org/docs/latest/cli.html#mlflow-models-serve) or [predicting on batch abilities](https://www.mlflow.org/docs/latest/cli.html#mlflow-models-predict), as well as the [mlflow models registry](https://www.mlflow.org/docs/latest/model-registry.html) to manage the lifecycle of this model. 27 | 28 | ## Frequently asked questions 29 | 30 | :::{dropdown} How does it work under the hood? 31 | 32 | **For ``MlflowModelTrackingDataset``** 33 | 34 | During save, a model object from a node output is logged to mlflow using the ``log_model`` function of the specified ``flavor``. It is logged in the `run_id` run if specified and there is no active run, else in the currently active mlflow run. If the `run_id` is specified and there is an active run, the saving operation will fail. Consequently, it will **never be possible to save in a specific mlflow run_id** if you launch a pipeline with the `kedro run` command, because the `MlflowHook` creates a new run before each pipeline run. 35 | 36 | During load, the model is retrieved from the ``run_id`` if specified, else it is retrieved from the active mlflow run. If there is no active mlflow run, the loading fails. This will never happen if you are using the `kedro run` command, because the `MlflowHook` creates a new run before each pipeline run.
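To load a model from a specific past run rather than from the active one, you can pin the ``run_id`` directly in the catalog entry, as done elsewhere in this documentation. A minimal sketch (the run id is a placeholder and must exist on your tracking server):

```yaml
my_sklearn_model:
  type: kedro_mlflow.io.models.MlflowModelTrackingDataset
  flavor: mlflow.sklearn
  run_id: <an-existing-run-id>  # placeholder: copy it from the mlflow UI
```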
37 | 38 | **For ``MlflowModelLocalFileSystemDataset``** 39 | 40 | During save, a model object from a node output is saved locally under the specified ``filepath`` using the ``save_model`` function of the specified ``flavor``. 41 | 42 | When the model is loaded, the latest version stored locally is read using the ``load_model`` function of the specified ``flavor``. You can also load a model from a specific kedro run by specifying the `version` argument to the constructor. 43 | ::: 44 | 45 | :::{dropdown} How can I track a custom MLflow model flavor? 46 | 47 | To track a custom MLflow model flavor, you need to set the `flavor` parameter to import the module of your custom flavor and to specify a [pyfunc workflow](https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#pyfunc-create-custom-workflows), which can be set either to `python_model` or `loader_module`. The former is more high-level and user-friendly and is [recommended by mlflow](https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#which-workflow-is-right-for-my-use-case), while the latter offers more control. We haven't tested the integration of this second workflow in `kedro-mlflow` extensively, and it should be used with caution. 48 | 49 | ```yaml 50 | my_custom_model: 51 | type: kedro_mlflow.io.models.MlflowModelTrackingDataset 52 | flavor: my_package.custom_mlflow_flavor 53 | pyfunc_workflow: python_model # or loader_module 54 | ``` 55 | 56 | ::: 57 | 58 | 59 | 60 | :::{dropdown} How can I save a model locally and log it in MLflow in one step? 61 | 62 | If you want to save your model both locally and remotely within the same run, you can leverage `MlflowArtifactDataset`: 63 | 64 | ```yaml 65 | sklearn_model: 66 | type: kedro_mlflow.io.artifacts.MlflowArtifactDataset 67 | dataset: 68 | type: kedro_mlflow.io.models.MlflowModelLocalFileSystemDataset 69 | flavor: mlflow.sklearn 70 | filepath: data/06_models/sklearn_model 71 | ``` 72 | 73 | This might be useful if you want to always read the latest model saved locally and log it to MLflow each time a new model is trained, for tracking purposes. 74 | 75 | ::: 76 | -------------------------------------------------------------------------------- /docs/source/03_experiment_tracking/01_experiment_tracking/06_mlflow_ui.md: -------------------------------------------------------------------------------- 1 | # Open the mlflow UI 2 | 3 | ## The mlflow user interface 4 | 5 | Mlflow offers a user interface (UI) that enables browsing the run history. 6 | 7 | ## The ``kedro-mlflow`` helper 8 | 9 | When you use local storage for mlflow tracking, you can call a [mlflow cli command](https://www.mlflow.org/docs/latest/tracking.html#tracking-ui) to launch the UI if you do not have a [mlflow tracking server configured](https://www.mlflow.org/docs/latest/tracking.html#mlflow-tracking-server-optional). 10 | 11 | To ensure this UI is linked to the tracking uri specified in your configuration, ``kedro-mlflow`` offers the following command: 12 | 13 | ```console 14 | kedro mlflow ui 15 | ``` 16 | 17 | which is a wrapper around the ``mlflow ui`` command, using the tracking uri (as well as the port and host) specified in the ``mlflow.yml`` file. 18 | 19 | Open ``http://localhost:5000`` in your browser to see the UI after calling the previous command. If your ``mlflow_tracking_uri`` is a ``http[s]`` URL, the command will automatically open it.
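If the defaults stored in ``mlflow.yml`` do not suit you, you can override them at launch time; a minimal sketch (the ``ui`` command accepts the host and port arguments of ``mlflow ui``, and the values below are placeholders):

```console
kedro mlflow ui --host=0.0.0.0 --port=5001
```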
20 | -------------------------------------------------------------------------------- /docs/source/03_experiment_tracking/02_interactive_use/01_notebook_use.md: -------------------------------------------------------------------------------- 1 | # How to use `kedro-mlflow` in a notebook 2 | 3 | ```{important} 4 | You need to install ``ipython`` to access notebook functionalities. 5 | ``` 6 | 7 | ## Reminder on mlflow's limitations with interactive use 8 | 9 | Data science project lifecycle are very iterative. Mlflow intends to track parameters changes to improve reproducibility. However, one must be conscious that being able to **execute functions outside of a end to end pipeline** puts a strong burden on the user shoulders **because he is in charge to make the code execution coherent** by running the notebooks cells in the right order. Any back and forth during execution to change some parameters in a previous notebook cells and then retrain a model creates an operational risk that the recorded parameter stored in mlflow is different than the real parameter used for training the model. 10 | 11 | To make a long story short: **forget about efficient reproducibility** when using mlflow interactively. 12 | 13 | It may **still be useful to track some experiments results** especially if they are long to run and vary wildly with parameters, e.g. if you are performing hyperparameter tuning. 14 | 15 | These limitations are inherent to the data science process, not to mlflow itself or the plugin. 16 | 17 | ## Setup mlflow configuration in your notebook 18 | 19 | Open your notebook / ipython session with the Kedro CLI: 20 | 21 | ```bash 22 | kedro jupyter notebook 23 | ``` 24 | 25 | Or if you are on JupyterLab, 26 | 27 | ``` 28 | %load_ext kedro.ipython 29 | ``` 30 | 31 | Kedro [creates a bunch of global variables](https://kedro.readthedocs.io/en/stable/tools_integration/ipython.html#use-kedro-with-ipython-and-jupyter), including a `session`, a ``context`` and a ``catalog`` which are automatically accessible. 32 | 33 | When the context was created, ``kedro-mlflow`` automatically: 34 | 35 | - loaded and setup (create the tracking uri, export credentials...) the mlflow configuration of your `mlflow.yml` 36 | - import ``mlflow`` which is now accessible in your notebook 37 | 38 | If you change your ``mlflow.yml``, reload the kedro extension for the changes to take effect. 39 | 40 | ## Difference with running through the CLI 41 | 42 | - The DataSets `load` and `save` methods works as usual. You can call `catalog.save("my_artifact_dataset", data)` inside a cell, and your data will be logged in mlflow properly (assuming "my_artifact_dataset" is a `kedro_mlflow.io.MlflowArtifactDataset`). 43 | - The `hooks` which automatically save all parameters/metrics/artifacts in mlflow will work if you run the session interactively, e.g.: 44 | 45 | ```python 46 | session.run( 47 | pipeline_name="my_ml_pipeline", 48 | tags="training", 49 | from_inputs="data_2", 50 | to_outputs="data_7", 51 | ) 52 | ``` 53 | 54 | but it is not very likely in a notebook. 55 | 56 | - if you need to interact manually with the mlflow server, you can use ``context.mlflow.server._mlflow_client``. 57 | 58 | ## Guidelines and best practices suggestions 59 | 60 | During experimentation phase, you will likely not run entire pipelines (or sub pipelines filtered out between some inputs and outputs). Hence, you cannot benefit from Kedro's ``hooks`` (and hence from ``kedro-mlflow`` tracking). 
From this moment on, perfect reproducbility is impossible to achieve: there is no chance that you manage to maintain a perfectly linear workflow, as you will go back and forth modifying parameters and code to create your model. 61 | 62 | I suggest to : 63 | 64 | - **focus on versioning parameters and metrics**. The goal is to finetune your hyperparameters and to be able to remember later the best setup. It is not very important to this stage to version all parameters (e.g. preprocessing ones) nor models (after all you will need an entire pipeline to predict and it is very unlikely that you will need to reuse these experiment models one day.) It may be interesting to use ``mlflow.autolog()`` feature to have a easy basic setup. 65 | - **transition quickly to kedro pipelines**. For instance, when you preprocessing is roughly defined, try to put it in kedro pipelines. You can then use notebooks to experiment / perfom hyperparameter tuning while keeping preprocessing "fixed" to enhance reproducibility. You can run this pipeline interactively with : 66 | 67 | ```python 68 | result = session.run( 69 | pipeline_name="my_preprocessing_pipeline", 70 | tags="training", 71 | from_inputs="data_2", 72 | to_outputs="data_7", 73 | ) 74 | ``` 75 | 76 | ``result`` is a python `dict` with the outputs of your pipeline (e.g. a "preprocessed_data" ``pandas.DataFrame``), and you can use it interactively in your notebook. 77 | -------------------------------------------------------------------------------- /docs/source/03_experiment_tracking/index.md: -------------------------------------------------------------------------------- 1 | # {octicon}`beaker` Experiment tracking 2 | 3 | ```{toctree} 4 | :caption: Experiment tracking 5 | 6 | 01_experiment_tracking/01_configuration 7 | 01_experiment_tracking/02_version_parameters 8 | 01_experiment_tracking/03_version_datasets 9 | 01_experiment_tracking/04_version_models 10 | 01_experiment_tracking/05_version_metrics 11 | ``` 12 | 13 | ```{toctree} 14 | :caption: Visualise experiments 15 | 16 | 01_experiment_tracking/06_mlflow_ui 17 | ``` 18 | 19 | ```{toctree} 20 | :caption: Interactive use 21 | 22 | 02_interactive_use/01_notebook_use 23 | ``` 24 | -------------------------------------------------------------------------------- /docs/source/04_pipeline_as_model/01_pipeline_as_custom_model/01_mlflow_models.md: -------------------------------------------------------------------------------- 1 | # Introduction to mlflow models 2 | 3 | ## What are Mlflow Models ? 4 | 5 | [Mlflow Models are a standardised agnostic format to store machine learning models](https://www.mlflow.org/docs/latest/models.html). They intend to be standalone to be as portable as possible to be deployed virtually anywhere and mlflow provides built-in CLI commands to deploy a mlflow model to most common cloud platforms or to create an API. 6 | 7 | A Mlflow Model is composed of: 8 | 9 | - a ``MLModel`` file which is a configuration file to indicate to mlflow how to load the model. This file may also contain the ``Signature`` of the model (i.e. the ``Schema`` of the input and output of your model, including the columns names and order) as well as example data. 10 | - a ``conda.yml`` file which contains the specifications of the virtual conda environment inside which the model should run. It contains the packages versions necessary for your model to be executed. 11 | - a ``model.pkl`` (or a ``python_function.pkl`` for custom model) file containing the trained model. 
12 | - an ``artifacts`` folder containing all other data necessary to execute the models 13 | 14 | ```{important} 15 | Mlflow enable to create **custom models "flavors" to convert any object to a Mlflow Model** provided we have these informations. Inside a Kedro project, the ``Pipeline`` and ``DataCatalog`` objects contain all these informations. As a consequence, it is easy to create a custom model to convert entire Kedro ``Pipeline``s to mlflow models, and it the purpose of ``pipeline_ml_factory`` and ``KedroPipelineModel`` that we will present in the following sections. 16 | ``` 17 | 18 | ## Pre-requisite for converting a pipeline to a mlflow model 19 | 20 | You can log any Kedro ``Pipeline`` matching the following requirements: 21 | 22 | - one of its input must be a ``pandas.DataFrame``, a ``spark.DataFrame`` or a ``numpy.array``. This is the **input which contains the data to predict on**. This can be any Kedro ``AbstractDataset`` which loads data in one of the previous three formats. It can also be a ``MemoryDataset`` and not be persisted in the ``catalog.yml``. 23 | - all its other inputs must be persisted on disk (e.g. if the machine learning model must already be trained and saved so we can export it) or declared as "parameters" in the model ``Signature``. 24 | 25 | ```{warning} 26 | If the pipeline has parameters : 27 | - For ``mlflow<2.7.0`` the parameters need to be persisted before exporting the model, which implies that you will not be able to modify them at runtime. This is a limitation of ``mlflow<2.6.0`` 28 | - For ``mlflow>=2.7.0`` , they can be declared in the signature and modified at runtime. See https://github.com/Galileo-Galilei/kedro-mlflow/issues/445 for more information. 29 | ``` 30 | -------------------------------------------------------------------------------- /docs/source/04_pipeline_as_model/01_pipeline_as_custom_model/02_scikit_learn_like_pipeline.md: -------------------------------------------------------------------------------- 1 | # Scikit-learn like Kedro pipelines - Automatically log the inference pipeline after training 2 | 3 | For consistency, you may want to **log an inference pipeline** (including some data preprocessing and prediction post processing) **automatically after you ran a training pipeline**, with all the artifacts generated during training (the new model, encoders, vectorizers...). 4 | 5 | ```{hint} 6 | You can think of ``pipeline_ml_factory`` as "**scikit-learn like pipeline in kedro**". Running ``kedro run -p training`` performs the scikit-learn's ``pipeline.fit()`` operation, storing all components (e.g. a model) we need to reuse further as mlflow artifacts and the inference pipeline as code. Hence, you can later use this mlflow model which will perform the scikit-learn's ``pipeline.predict(new_data)`` operation by running the entire kedro inference pipeline. 7 | ``` 8 | 9 | ## Getting started with pipeline_ml_factory 10 | 11 | ```{note} 12 | Below code assume that for inference, you want to skip some nodes that are training specific, e.g. you don't want to train the model, you just want to predict with it ; you don't want to fit and transform with you encoder, but only transform. Make sure these 2 steps ("train" and "predict", or "fit and "transform") are separated in 2 differnt nodes in your pipeline, so you can skip the train / transform step at inference time. 13 | ``` 14 | 15 | You can configure your project as follows: 16 | 17 | 1. 
Install ``kedro-mlflow`` ``MlflowHook`` (this is done automatically if you have installed ``kedro-mlflow`` in a ``kedro>=0.16.5`` project) 18 | 2. Turn your training pipeline in a ``PipelineML`` object with ``pipeline_ml_factory`` function in your ``pipeline_registry.py``: 19 | 20 | ```python 21 | # pipeline_registry.py for kedro>=0.17.2 (hooks.py for ``kedro>=0.16.5, <0.17.2) 22 | 23 | from kedro_mlflow_tutorial.pipelines.ml_app.pipeline import create_ml_pipeline 24 | 25 | 26 | def register_pipelines(self) -> [str, Pipeline]: 27 | ml_pipeline = create_ml_pipeline() 28 | training_pipeline_ml = pipeline_ml_factory( 29 | training=ml_pipeline.only_nodes_with_tags( 30 | "training" 31 | ), # nodes : encode_labels + preprocess + train_model + predict + postprocess + evaluate 32 | inference=ml_pipeline.only_nodes_with_tags( 33 | "inference" 34 | ), # nodes : preprocess + predict + postprocess 35 | input_name="instances", 36 | log_model_kwargs=dict( 37 | artifact_path="kedro_mlflow_tutorial", 38 | conda_env={ 39 | "python": 3.10, 40 | "dependencies": [f"kedro_mlflow_tutorial=={PROJECT_VERSION}"], 41 | }, 42 | signature="auto", 43 | ), 44 | ) 45 | 46 | return {"training": training_pipeline_ml} 47 | ``` 48 | 49 | 3. Persist all your artifacts locally in the ``catalog.yml`` 50 | 51 | ```yaml 52 | label_encoder: 53 | type: pickle.PickleDataset # <- This must be any Kedro Dataset other than "MemoryDataset" 54 | filepath: data/06_models/label_encoder.pkl # <- This must be a local path, no matter what is your mlflow storage (S3 or other) 55 | ``` 56 | 57 | and as well for your model if necessary. 58 | 59 | 4. Launch your training pipeline: 60 | 61 | ```bash 62 | kedro run --pipeline=training 63 | ``` 64 | 65 | **The inference pipeline will _automagically_ be logged as a custom mlflow model** (a ``KedroPipelineModel``) **at the end of the training pipeline!**. 66 | 67 | 5. Go to the UI, retrieve the run id of your "inference pipeline" model and use it as you want, e.g. in the `catalog.yml`: 68 | 69 | ```yaml 70 | # catalog.yml 71 | 72 | pipeline_inference_model: 73 | type: kedro_mlflow.io.models.MlflowModelTrackingDataset 74 | flavor: mlflow.pyfunc 75 | pyfunc_workflow: python_model 76 | artifact_path: kedro_mlflow_tutorial # the name of your mlflow folder = the model_name in pipeline_ml_factory 77 | run_id: <your-run-id> 78 | ``` 79 | 80 | Now you can run the entire inference pipeline inside a node as part of another pipeline. 81 | 82 | ## Advanced configuration for pipeline_ml_factory 83 | 84 | ### Register the model as a new version in the mlflow registry 85 | 86 | The ``log_model_kwargs`` argument is passed to the underlying [mlflow.pyfunc.log_model](https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model). 
Specifically, it accepts a ``registered_model_name`` argument: 87 | 88 | ```python 89 | pipeline_ml_factory( 90 | training=ml_pipeline.only_nodes_with_tags("training"), 91 | inference=ml_pipeline.only_nodes_with_tags("inference"), 92 | input_name="instances", 93 | log_model_kwargs=dict( 94 | artifact_path="kedro_mlflow_tutorial", 95 | registered_model_name="my_inference_pipeline", # a new version of the "my_inference_pipeline" model will be registered each time you run the "training" pipeline 96 | conda_env={ 97 | "python": "3.10", 98 | "dependencies": [f"kedro_mlflow_tutorial=={PROJECT_VERSION}"], 99 | }, 100 | signature="auto", 101 | ), 102 | ) 103 | ``` 104 | 105 | ## Complete step by step demo project with code 106 | 107 | A step by step tutorial with code is available in the [kedro-mlflow-tutorial repository on github](https://github.com/Galileo-Galilei/kedro-mlflow-tutorial#serve-the-inference-pipeline-to-a-end-user). 108 | 109 | You also have other resources to understand the rationale: 110 | 111 | - an explanation of the [``PipelineML`` class in the python objects section](https://kedro-mlflow.readthedocs.io/en/latest/source/05_API/01_python_objects/03_Pipelines.html) 112 | - detailed explanations [on this issue](https://github.com/Galileo-Galilei/kedro-mlflow/issues/16) and [this discussion](https://github.com/Galileo-Galilei/kedro-mlflow/discussions/229). 113 | - an example of use in a user project [in this repo](https://github.com/laurids-reichardt/kedro-examples/blob/kedro-mlflow-hotfix2/text-classification/src/text_classification/pipelines/pipeline.py). 114 | -------------------------------------------------------------------------------- /docs/source/04_pipeline_as_model/01_pipeline_as_custom_model/03_deployment_patterns.md: -------------------------------------------------------------------------------- 1 | # Deployment patterns for kedro pipelines as model 2 | 3 | A step by step tutorial with code is available in the [kedro-mlflow-tutorial repository on github](https://github.com/Galileo-Galilei/kedro-mlflow-tutorial#serve-the-inference-pipeline-to-an-end-user) which explains how to serve the pipeline as an API or a batch. 4 | 5 | ## Deploying a KedroPipelineModel 6 | 7 | ::::{tab-set} 8 | 9 | :::{tab-item} Reuse from a python script 10 | 11 | ```{note} 12 | See tutorial: <https://github.com/Galileo-Galilei/kedro-mlflow-tutorial?tab=readme-ov-file#scenario-1-reuse-from-a-python-script> 13 | ``` 14 | 15 | If you want to load and predict with your model from python, the ``load_model`` function of mlflow is what you need: 16 | 17 | ```python 18 | PROJECT_PATH = r"<your/project/path>" 19 | RUN_ID = "<your-run-id>" 20 | 21 | from kedro.framework.startup import bootstrap_project 22 | from kedro.framework.session import KedroSession 23 | from mlflow.pyfunc import load_model 24 | 25 | bootstrap_project(PROJECT_PATH) 26 | session = KedroSession.create( 27 | session_id=1, 28 | project_path=PROJECT_PATH, 29 | package_name="kedro_mlflow_tutorial", 30 | ) 31 | local_context = session.load_context() # setup mlflow config 32 | 33 | instances = local_context.io.load("instances") 34 | model = load_model(f"runs:/{RUN_ID}/kedro_mlflow_tutorial") 35 | 36 | predictions = model.predict( 37 | instances 38 | ) # runs ``session.run(pipeline=inference)`` with the artifacts created during training. You should see the kedro logs. 39 | ``` 40 | The ``predictions`` object is a ``pandas.DataFrame`` and can be handled as usual.
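Since ``predictions`` is a plain ``pandas.DataFrame``, you can inspect or persist it like any other dataframe; a minimal sketch (the output path is arbitrary):

```python
print(predictions.head())  # quick sanity check of the predictions
predictions.to_csv("data/07_model_output/predictions.csv", index=False)  # persist wherever suits your project
```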
42 | ::: 43 | 44 | :::{tab-item} Reuse in a kedro pipeline 45 | 46 | ```{note} 47 | See tutorial: <https://github.com/Galileo-Galilei/kedro-mlflow-tutorial?tab=readme-ov-file#scenario-2-reuse-in-a-kedro-pipeline> 48 | ``` 49 | 50 | Say that you want to reuse this trained model in a kedro Pipeline, like the user_app. The easiest way to do it is to add the model to the catalog.yml file: 51 | 52 | ```yaml 53 | pipeline_inference_model: 54 | type: kedro_mlflow.io.models.MlflowModelTrackingDataset 55 | flavor: mlflow.pyfunc 56 | pyfunc_workflow: python_model 57 | artifact_path: kedro_mlflow_tutorial # the name of your mlflow folder = the model_name in pipeline_ml_factory 58 | run_id: <your-run-id> # put it in globals.yml to help people find out what to modify 59 | ``` 60 | 61 | Then you can reuse it in a node to predict with this model, which is the entire inference pipeline as it was when you launched the training. 62 | 63 | ```python 64 | # nodes.py 65 | def predict_from_model(model, data): 66 | return model.predict(data) 67 | 68 | 69 | # pipeline.py (requires: from kedro.pipeline import node, pipeline) 70 | def create_pipeline(): 71 | return pipeline( 72 | [ 73 | node( 74 | func=predict_from_model, 75 | inputs={"model": "pipeline_inference_model", "data": "validation_data"}, outputs="predictions", # datasets are referenced by name; pick any name for the output 76 | ) 77 | ] 78 | ) 79 | ``` 80 | 81 | ::: 82 | 83 | :::{tab-item} Serve the model with mlflow 84 | 85 | ```{note} 86 | See tutorial: <https://github.com/Galileo-Galilei/kedro-mlflow-tutorial?tab=readme-ov-file#scenario-3-serve-the-model-with-mlflow> 87 | ``` 88 | 89 | Mlflow provides helpers to serve the model as an API with one line of code: 90 | 91 | ``mlflow models serve -m "runs:/<your-model-run-id>/kedro_mlflow_tutorial"`` 92 | 93 | This will serve your model as an API (beware: there are known issues on Windows). You can test it with: 94 | ``curl -d "{\"columns\":[\"text\"],\"index\":[0,1],\"data\":[[\"This movie is cool\"],[\"awful film\"]]}" -H "Content-Type: application/json" localhost:5000/invocations`` 95 | ::: 96 | 97 | :::: 98 | 99 | ## Frequently asked questions 100 | 101 | :::{dropdown} How can I pass parameters at runtime to a ``KedroPipelineModel``? 102 | 103 | Since ``kedro-mlflow>0.14.0``, you can pass parameters when predicting with a ``KedroPipelineModel`` object. 104 | 105 | We assume you've trained a model with ``pipeline_ml_factory``. First, load the model, e.g. through the catalog or as described in the previous section: 106 | 107 | ```yaml 108 | # catalog.yml 109 | pipeline_inference_model: 110 | type: kedro_mlflow.io.models.MlflowModelTrackingDataset 111 | flavor: mlflow.pyfunc 112 | pyfunc_workflow: python_model 113 | artifact_path: kedro_mlflow_tutorial # the name of your mlflow folder = the model_name in pipeline_ml_factory 114 | run_id: <your-run-id> 115 | ``` 116 | 117 | Then, pass params as a dict under the ``params`` argument of the ``predict`` method: 118 | 119 | ```python 120 | model = catalog.load("pipeline_inference_model") # You can also load it in a node "as usual" 121 | predictions = model.predict(input_data, params={"my_param": "<my_param_value>"}) 122 | ``` 123 | 124 | ```{warning} 125 | This will only work if ``my_param`` is a parameter (i.e. prefixed with ``params:``) of the inference pipeline. 126 | ``` 127 | 128 | ```{tip} 129 | Available params are visible in the model signature in the UI. 130 | ``` 131 | 132 | ::: 133 | 134 | :::{dropdown} How can I change the runner at runtime when predicting with a ``KedroPipelineModel``?
135 | 136 | Assuming the syntax of previous section, a special key in "params" is reserved for the kedro runner: 137 | 138 | ```python 139 | catalog.load("pipeline_inference_model") 140 | predictions = model.predict( 141 | input_data, params={"my_param": "<my_param_value>", "runner": "ThreadRunner"} 142 | ) 143 | ``` 144 | 145 | ```{tip} 146 | You can pass any kedro runner, or even a custom runner by using the path to the module: ``params={"runner": "my_package.my_module.MyRunner"}`` 147 | ``` 148 | 149 | ::: 150 | -------------------------------------------------------------------------------- /docs/source/04_pipeline_as_model/01_pipeline_as_custom_model/04_custom_kedro_pipeline_model.md: -------------------------------------------------------------------------------- 1 | # Custom registering of a ``KedroPipelineModel`` 2 | 3 | ```{warning} 4 | The goal of this section is to give tool to machine learning engineer or platform engineer to reuse the objects and customize the workflow. This is specially useful in case you need high customisation or fine grained control of the kedro objects or the mlflow model attributes. This is **very unlikely you need this section** if you are using a kedro project "in the standard way" as a data scientist, in which case you should refer to the section [scikit-learn like pipeline in kedro](https://kedro-mlflow.readthedocs.io/en/stable/source/). 5 | ``` 6 | 7 | ## Log a pipeline to mlflow programatically with ``KedroPipelineModel`` custom mlflow model 8 | 9 | ```{hint} 10 | When using the ``KedroPipelineModel`` programatically, we focus only on the ``inference`` pipeline. We assume That you already ran the ``training`` pipeline previously, and that you now want to log the ``inference`` pipeline in mlflow manually by retrieveing all the needed objects to create the custom model. 11 | ``` 12 | 13 | ``kedro-mlflow`` has a ``KedroPipelineModel`` class (which inherits from ``mlflow.pyfunc.PythonModel``) which can turn any kedro ``Pipeline`` object to a Mlflow Model. 14 | 15 | To convert a ``Pipeline`` to a mlflow model, you need to create a ``KedroPipelineModel`` and then log it to mlflow. 
An example is given in the snippet below: 16 | 17 | ```python 18 | import mlflow 19 | from kedro.framework.session import KedroSession 20 | from kedro.framework.startup import bootstrap_project 21 | from kedro_mlflow.mlflow import KedroPipelineModel 22 | bootstrap_project(r"<path/to/project>") 23 | session = KedroSession.create(project_path=r"<path/to/project>") 24 | 25 | # "pipeline" is the Pipeline object you want to convert to a mlflow model 26 | 27 | context = session.load_context() # this sets up the mlflow configuration 28 | catalog = context.catalog 29 | pipeline = context.pipelines["<my-pipeline>"] 30 | input_name = "instances" 31 | from mlflow.models import infer_signature 32 | 33 | # artifacts are all the inputs of the inference pipelines that are persisted in the catalog 34 | 35 | # (optional) get the schema of the input dataset 36 | input_data = catalog.load(input_name) 37 | model_signature = infer_signature( 38 | model_input=input_data 39 | ) # if you want to pass parameters in "predict", you should specify them in the signature 40 | 41 | # you can optionally pass other arguments, like the "copy_mode" to be used for each dataset 42 | kedro_pipeline_model = KedroPipelineModel( 43 | pipeline=pipeline, catalog=catalog, input_name=input_name 44 | ) 45 | 46 | artifacts = kedro_pipeline_model.extract_pipeline_artifacts() 47 | 48 | mlflow.pyfunc.log_model( 49 | artifact_path="model", 50 | python_model=kedro_pipeline_model, 51 | artifacts=artifacts, 52 | conda_env={"python": "3.10.0", "dependencies": ["kedro==0.18.11"]}, 53 | signature=model_signature, 54 | ) 55 | ``` 56 | 57 | ```{important} 58 | Note that you need to provide the ``log_model`` function a bunch of non-trivial-to-retrieve information (the conda environment, the "artifacts" i.e. the persisted data you need to reuse like tokenizers / ml models / encoders, the model signature i.e. the column names and types, and the predict parameters...). The ``KedroPipelineModel`` object has methods like `extract_pipeline_artifacts` to help you, but it needs some work on your side. 59 | ``` 60 | 61 | ```{note} 62 | Saving Kedro pipelines as Mlflow Model objects is convenient and enables pipeline serving. However, it does not solve the decorrelation between training and inference: each time one triggers a training pipeline, they must remember to save it immediately afterwards. `kedro-mlflow` offers a convenient API through hooks to simplify this workflow, as described in the section [scikit-learn like pipeline in kedro](https://kedro-mlflow.readthedocs.io/en/stable/source/). 63 | ``` 64 | 65 | ## Log a pipeline to mlflow with the CLI 66 | 67 | ```{note} 68 | This command is mainly a helper to relog a model manually without retraining (e.g. because you slightly modified the preprocessing or post-processing and don't want to train again). 69 | ``` 70 | 71 | ```{warning} 72 | We **assume that you already ran the ``training`` pipeline previously**, which created persisted artifacts. Now you want to trigger logging the ``inference`` pipeline in mlflow through the CLI. This is dangerous because the command does not check that your pipeline is working correctly or that the persisted model has not been modified.
73 | ``` 74 | 75 | You can log a Kedro ``Pipeline`` to mlflow as a custom model through the CLI with the ``modelify`` command: 76 | 77 | ```bash 78 | kedro mlflow modelify --pipeline=<your-inference-pipeline> --input-name <name-in-catalog-of-input-data> 79 | ``` 80 | 81 | This command will create a new run with an artifact named ``model`` and persist the code of your pipeline and all its inputs as artifacts (hence they should have been created *before* running this command, e.g. the model should already be persisted on the disk). Open the user interface with ``kedro mlflow ui`` to check the result. You can also: 82 | 83 | - specify the run id in which you want to log the pipeline with the ``--run-id`` argument, and its name with the ``--run-name`` argument. 84 | - pass almost all arguments accepted by [``mlflow.pyfunc.log_model``](https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model), see the list of all accepted arguments in the [API documentation](https://kedro-mlflow.readthedocs.io/en/latest/source/05_API/01_python_objects/04_CLI.html#modelify) 85 | -------------------------------------------------------------------------------- /docs/source/04_pipeline_as_model/02_framework_ml/02_ml_project_components.md: -------------------------------------------------------------------------------- 1 | # The components of a machine learning application 2 | 3 | ## Definition: apps of a machine learning project 4 | 5 | A machine learning project is composed of 3 main blocks that I will call "apps" in the rest of the paragraph. These 3 apps are: 6 | 7 | - The *etl_app*, which is the application in charge of bringing the data to the machine learning pipeline 8 | - The *ml_app*, which is the application in charge of managing the machine learning model (including training and inference) 9 | - The *user_app*, which is the application in charge of consuming the predictions of the machine learning model and doing the actual business logic with it 10 | 11 | ## Difference between an app and a Kedro pipeline 12 | 13 | Note that the previously defined "apps" are not pipelines in the Kedro sense. On the contrary, each app likely contains several (Kedro?) pipelines. 14 | 15 | The main differences between these apps are: 16 | 17 | - Each app development / deployment is likely under the responsibility of different people / teams. 18 | - Each app has a different development lifecycle. It implies that development can be parallelized, and releasing one app to fix a bug does not imply releasing the other ones. If your training pipeline is time- / resource-consuming, you do not want a bugfix in the *user_app* to trigger a retraining of your model, do you? 19 | - Each app has its own orchestration timeline. For instance, the data produced by the etl can be stored independently of whether the *user_app* and the *ml_app* consume them "on the fly" or not. 20 | - The apps do not communicate with each other apart from a clear interface: the data schema accepted as inputs / outputs of each app. 21 | 22 | ## Apps development lifecycle in a machine learning project 23 | 24 | ### The data scientist creates at least part of the 3 apps 25 | 26 | Note that there are **as many _etl_app_ and _user_app_** as needed for the different uses of your model. Since **training the model is a specific use, the data scientist will need to create their own _etl_app_ and _user_app_**. These apps will very likely be replaced later by the true business app dedicated to the model use.
27 | 28 | We saw that the data scientist has to create some code that will be replaced by other people code when deploying the model. As a consequence, the interactions between these apps must be very clearly defined at the beginning of the project. We claim that it is possible to cover most use case with the following schema: 29 | 30 | ![apps_interaction](../../imgs/apps_interaction.png) 31 | 32 | The *ml_app* takes `instances` (i.e. examples of the business object to handle) as input. This implies that the *ml_app* will include some machine learning-specific preprocessing and not only the model training. It also (optionally) takes labels as inputs if the underlying problem is supervised. Even in this situation, the labels will not be known at inference time so the *etl_app* does not necessarily produce them. 33 | 34 | This is a key principle: anyone who wants to consume the model later will need to bring instances of the same business object. 35 | 36 | ### The *etl_app* 37 | 38 | The *etl_app* is the one in charge of bringing the data to the *ml_app*. As a consequence, each different *user_app* will likely have to develop its associated *etl_app* to consume the *ml_app*. 39 | 40 | From the data scientist point of view, this app will create the training dataset. This app can do very different things: 41 | 42 | - send request over an API 43 | - extract from a database (with SQL, SAS...) 44 | - scrape data from a website 45 | - download data from an URL 46 | - read data from disk 47 | - ... 48 | 49 | For the labels, in addition of above possibility, this app can be a **labelling tool** with human labellers who provide the needed "true reference" as labels. 50 | 51 | It is also common to mix several of above approaches to gather different data sources, and to have different Kedro pipelines in this app. 52 | 53 | Note that during a training, this app very likely retrieves batch data from a given time period. This will necessarily be different when using the model, because the user often want to use live stream data. 54 | 55 | ### The *ml_app* 56 | 57 | This app is the core of the data scientist work. It is at least composed of two kedro pipelines: 58 | 59 | - a *training* pipeline, which produces all the artifacts (e.g. any object fitted on data, including obviously the machine learning model itself) 60 | - an *inference* pipeline which takes an instance as input and returns the prediction of the model 61 | 62 | It is quite common to have other pipelines depending on the data scientist needs (an *evaluation* pipelines which produces metrics for a given model, an *explanation* pipeline to produce explanation for a specific instance like shap values or importance pixel, ...). 63 | 64 | It is quite common to see data scientists duplicate the code when creating the inference pipeline, because it is written after the training pipeline. **Thanks to kedro tags, it is possible to mark a node to use it in two different pipelines**. Reuse is a key component to improve quality and deployment speed. **Each time a node is created (i.e. a function is called), the data scientist should wonder if it will be used in *training* pipeline only or in both (*training* and *inference*), and tag it accordingly.** 65 | 66 | ### The *user_app* 67 | 68 | The *user_app* must not be aware of how the inference pipeline operates under the hood. The *user_app* must either: 69 | 70 | - takes a *run_id* from mlflow to retrieve the model from mlflow and predict with it. This is mainly useful for batch predictions. 
71 | - call the served model from an API endpoint and only get predictions as inputs. This assumes that the model has been served, which is very easy with mlflow. 72 | 73 | After that, the *user_app* can use the predictions and apply any needed business logic to them. 74 | -------------------------------------------------------------------------------- /docs/source/04_pipeline_as_model/02_framework_ml/03_framework_solutions.md: -------------------------------------------------------------------------------- 1 | # ``kedro-mlflow`` mlops solution 2 | 3 | ## Reminder 4 | 5 | We assume that we want to solve the following challenges among those described in ["Why we need a mlops framework"](https://kedro-mlflow.readthedocs.io/en/latest/source/04_pipeline_as_model/02_framework_ml/01_why_framework.html#) section: 6 | 7 | - serve pipelines (which handles business objects) instead of models 8 | - synchronize training and inference by packaging inference pipeline at training time 9 | 10 | ## Enforcing these principles with a dedicated tool 11 | 12 | ### Synchronizing training and inference pipeline 13 | 14 | To solve the problem of desynchronization between training and inference, ``kedro-mlflow`` offers a `PipelineML` class (which subclasses Kedro `Pipeline` class). A `PipelineML` is simply a Kedro standard ``Pipeline`` (the "training") which has a reference to another ``Pipeline`` (the "inference"). The two pipelines must share a common input DataSet name, which represents the data you will perform operations on (either train on for the training pipeline, or predict on for the inference pipeline). 15 | 16 | This class implements several methods to compare the ``DataCatalog``s associated to each of the two binded pipelines and performs subsetting oparations. This makes it quite difficult to handle directly. Fortunately, ``kedro-mlflow`` provides a convenient API to create ``PipelineML`` objects: the ``pipeline_ml_factory`` function. 17 | 18 | The use of ``pipeline_ml_factory`` is very straightforward, especially if you have used the [project architecture described previously](https://kedro-mlflow.readthedocs.io/en/latest/source/04_pipeline_as_model/02_framework_ml/02_ml_project_components.html). The best place to create such an object is your `hooks.py` file which will look like this: 19 | 20 | ```python 21 | # hooks.py 22 | from kedro_mlflow_tutorial.pipelines.ml_app.pipeline import create_ml_pipeline 23 | 24 | 25 | class ProjectHooks: 26 | @hook_impl 27 | def register_pipelines(self) -> [str, Pipeline]: 28 | ml_pipeline = create_ml_pipeline() 29 | 30 | # convert your two pipelines to a PipelinML object 31 | training_pipeline_ml = pipeline_ml_factory( 32 | training=ml_pipeline.only_nodes_with_tags("training"), 33 | inference=ml_pipeline.only_nodes_with_tags("inference"), 34 | input_name="instances", 35 | ) 36 | 37 | return {"__default__": training_pipeline_ml} 38 | ``` 39 | 40 | > So, what? We have created a link between our two pipelines, but the gain is not obvious at first glance. The 2 following sections demonstrates that such a construction enables to package and serve automatically the inference pipeline when executing the training one. 41 | 42 | ### Packaging and serving a Kedro Pipeline 43 | 44 | Mlflow offers the possibility to create [custom model class](https://www.mlflow.org/docs/latest/models.html#custom-python-models). Mlflow offers a variety of tool to package/containerize, deploy and serve such models. 
45 | 46 | ``kedro-mlflow`` has a ``KedroPipelineModel`` class (which inherits from ``mlflow.pyfunc.PythonModel``) which can turn any kedro ``PipelineML`` object into a Mlflow Model. 47 | 48 | To convert a ``PipelineML``, you need to declare it as a ``KedroPipelineModel`` and then log it to mlflow: 49 | 50 | ```python 51 | import mlflow 52 | from kedro.framework.context import load_context 53 | from kedro_mlflow.mlflow import KedroPipelineModel 54 | from mlflow.models import infer_signature 55 | 56 | # pipeline_training is your PipelineML object, created as previously 57 | catalog = load_context(".").io 58 | 59 | # artifacts are all the inputs of the inference pipelines that are persisted in the catalog 60 | artifacts = pipeline_training.extract_pipeline_artifacts(catalog) 61 | 62 | # (optional) get the schema of the input dataset 63 | input_data = catalog.load(pipeline_training.input_name) 64 | model_signature = infer_signature(model_input=input_data) 65 | 66 | kedro_model = KedroPipelineModel(pipeline=pipeline_training, catalog=catalog) 67 | 68 | mlflow.pyfunc.log_model( 69 | artifact_path="model", 70 | python_model=kedro_model, 71 | artifacts=artifacts, 72 | conda_env={"python": "3.10.0", "dependencies": ["kedro==0.18.11"]}, 73 | signature=model_signature, 74 | ) 75 | ``` 76 | 77 | Note that you need to provide the ``log_model`` function a bunch of non-trivial-to-retrieve information (the conda environment, the "artifacts" i.e. the persisted data you need to reuse like tokenizers / ml models / encoders, the model signature i.e. the column names and types...). The ``PipelineML`` object has methods like `extract_pipeline_artifacts` to help you, but it needs some work on your side. 78 | 79 | > Saving Kedro pipelines as Mlflow Model objects is convenient and enables pipeline serving. However, it does not solve the decorrelation between training and inference: each time one triggers a training pipeline, they must remember to save it immediately afterwards. Good news: triggering operations at some "execution moment" of a Kedro ``Pipeline`` (like after it finished running) is exactly what hooks are designed for! 80 | 81 | ### kedro-mlflow's magic: inference autologging 82 | 83 | When running the training pipeline, we have all the desired information we want to pass to the ``KedroPipelineModel`` class and ``mlflow.pyfunc.log_model`` function: 84 | 85 | - the artifacts exist in the DataCatalog if they are persisted 86 | - the "instances" dataset is loaded at the beginning of training, thus we can infer its schema (column names and types) 87 | - the inference and training pipeline codes are retrieved at the same moment, so consistency checks can be performed 88 | 89 | Hence, ``kedro-mlflow`` provides a ``MlflowHook.after_pipeline_run`` hook which performs the following operations: 90 | 91 | - check if the pipeline that has just been run is a ``PipelineML`` object 92 | - in case it is, create the ``KedroPipelineModel`` like above and log it to mlflow 93 | 94 | > We have achieved perfect synchronicity since the exact inference pipeline (with code, and artifacts) will be logged in mlflow each time the training pipeline is executed.
The model is then accessible in the mlflow UI "artifacts" section and can be downloaded, or [served as an API with the ``mlflow serve`` command](https://www.mlflow.org/docs/latest/cli.html#mlflow-models-serve), or [it can be used in the `catalog.yml` with the `MlflowModelTrackingDataset` for further reuse](https://github.com/Galileo-Galilei/kedro-mlflow-tutorial#serve-the-inference-pipeline-to-a-end-user). 95 | 96 | ### Reuse the model in kedro 97 | 98 | Say that you want to reuse this inference model as the input of another kedro pipeline (one of the "user_app" applications). ``kedro-mlflow`` provides a ``MlflowModelTrackingDataset`` class which can be used in the ``catalog.yml`` file: 99 | 100 | ```yaml 101 | # catalog.yml 102 | 103 | pipeline_inference_model: 104 | type: kedro_mlflow.io.models.MlflowModelTrackingDataset 105 | flavor: mlflow.pyfunc 106 | pyfunc_workflow: python_model 107 | artifact_path: kedro_mlflow_tutorial # the name of your mlflow folder = the model_name in pipeline_ml_factory 108 | run_id: <your-run-id> 109 | ``` 110 | -------------------------------------------------------------------------------- /docs/source/04_pipeline_as_model/index.md: -------------------------------------------------------------------------------- 1 | 2 | # {octicon}`rocket` Pipeline as model 3 | 4 | ```{toctree} 5 | :caption: Pipeline as model 6 | 7 | 01_pipeline_as_custom_model/01_mlflow_models 8 | 01_pipeline_as_custom_model/02_scikit_learn_like_pipeline 9 | 01_pipeline_as_custom_model/03_deployment_patterns 10 | 01_pipeline_as_custom_model/04_custom_kedro_pipeline_model 11 | ``` 12 | 13 | ```{toctree} 14 | :caption: kedro-mlflow as a mlops framework 15 | 16 | 02_framework_ml/01_why_framework 17 | 02_framework_ml/02_ml_project_components 18 | 02_framework_ml/03_framework_solutions 19 | ``` 20 | -------------------------------------------------------------------------------- /docs/source/05_API/01_python_objects/02_Hooks.md: -------------------------------------------------------------------------------- 1 | # ``Hooks`` 2 | 3 | This package provides 1 new hook. 4 | 5 | ## ``MlflowHook`` 6 | 7 | This hook: 8 | 9 | 1. manages mlflow settings at the beginning and the end of the run (run start / end). 10 | 2. autologs node parameters each time the pipeline is run (with ``kedro run`` or programmatically). 11 | 3. logs useful information for reproducibility as ``mlflow tags`` (including kedro ``Journal`` information for old kedro versions and the commands used to launch the run). 12 | 4. registers the pipeline as a valid ``mlflow model`` if [it is a ``PipelineML`` instance](https://kedro-mlflow.readthedocs.io/en/latest/source/05_API/01_python_objects/03_Pipelines.html) 13 | -------------------------------------------------------------------------------- /docs/source/05_API/01_python_objects/03_Pipelines.md: -------------------------------------------------------------------------------- 1 | # Pipelines 2 | 3 | ## ``PipelineML`` and ``pipeline_ml_factory`` 4 | 5 | ``PipelineML`` is a new class which extends ``Pipeline`` and enables binding two pipelines (one for training, one for inference) together. This class comes with a ``KedroPipelineModel`` class for logging it in mlflow. A pipeline logged as a mlflow model can be served using the ``mlflow models serve`` and ``mlflow models predict`` commands. 6 | 7 | The ``PipelineML`` class is not intended to be used directly. A ``pipeline_ml_factory`` factory is provided as a user-friendly interface.
8 | 9 | Example within kedro template: 10 | 11 | ```python 12 | # in src/PYTHON_PACKAGE/pipeline.py 13 | 14 | from PYTHON_PACKAGE.pipelines import data_science as ds 15 | from kedro.pipeline import Pipeline 16 | from kedro_mlflow.pipeline import pipeline_ml_factory 17 | def create_pipelines(**kwargs) -> dict[str, Pipeline]: 18 | data_science_pipeline = ds.create_pipeline() 19 | training_pipeline = pipeline_ml_factory( 20 | training=data_science_pipeline.only_nodes_with_tags( 21 | "training" 22 | ), # or whatever your logic is for filtering 23 | inference=data_science_pipeline.only_nodes_with_tags("inference"), 24 | ) 25 | 26 | return { 27 | "ds": data_science_pipeline, 28 | "training": training_pipeline, 29 | "__default__": data_science_pipeline, # add your other pipelines (e.g. data engineering) here if you have some 30 | } 31 | ``` 32 | 33 | Now each time you run ``kedro run --pipeline=training`` (provided you registered ``MlflowHook`` in your ``run.py``), the full inference pipeline will be registered as a mlflow model (with all the outputs produced by training as artifacts: the machine learning model, but also the *scaler*, *vectorizer*, *imputer*, or whatever object fitted on data you create in ``training`` and that is used in ``inference``). 34 | 35 | Note that: 36 | 37 | - the `inference` pipeline `input_name` can be a `MemoryDataset` and it must belong to the inference pipeline `inputs` 38 | - Apart from `input_name`, all other `inference` pipeline `inputs` must be persisted locally on disk (i.e. they must not be `MemoryDataset` and must have a local `filepath`) 39 | - the `inference` pipeline `inputs` must belong to the training `outputs` (vectorizer, binarizer, machine learning model...) 40 | - the `inference` pipeline must have one and only one `output` 41 | 42 | ```{caution} 43 | ``PipelineML`` objects do not implement all filtering methods of a regular ``Pipeline``, and you cannot add or subtract 2 ``PipelineML`` together. The rationale is that a filtered ``PipelineML`` is not a ``PipelineML`` in general, because the [filtering is not consistent between training and inference](https://github.com/Galileo-Galilei/kedro-mlflow/issues/554). You can see the ones which are supported [in the code](https://github.com/Galileo-Galilei/kedro-mlflow/blob/master/kedro_mlflow/pipeline/pipeline_ml.py#L162). 44 | ``` 45 | 46 | You can also directly log a ``PipelineML`` object in ``mlflow`` programmatically: 47 | 48 | ```python 49 | import mlflow 50 | from kedro.framework.context import load_context 51 | from kedro_mlflow.mlflow import KedroPipelineModel 52 | from mlflow.models import infer_signature 53 | 54 | # pipeline_training is your PipelineML object, created as previously 55 | catalog = load_context(".").io 56 | 57 | # artifacts are all the inputs of the inference pipelines that are persisted in the catalog 58 | artifacts = pipeline_training.extract_pipeline_artifacts(catalog) 59 | 60 | # get the schema of the input dataset 61 | input_data = catalog.load(pipeline_training.input_name) 62 | model_signature = infer_signature(model_input=input_data) 63 | 64 | mlflow.pyfunc.log_model( 65 | artifact_path="model", 66 | python_model=KedroPipelineModel(pipeline=pipeline_training, catalog=catalog), 67 | artifacts=artifacts, 68 | conda_env={"python": "3.10.0", "dependencies": ["kedro==0.18.11"]}, 69 | signature=model_signature, 70 | ) 71 | ``` 72 | 73 | It is also possible to pass arguments to `KedroPipelineModel` to specify the runner or the copy_mode of ``MemoryDataset`` for the inference ``Pipeline``. This may be faster especially for compiled models (e.g. keras, tensorflow...), and more suitable for an API serving pattern.
Since ``kedro-mlflow==0.12.0``, ``copy_mode="assign"`` has become the default. 74 | 75 | ```python 76 | KedroPipelineModel(pipeline=pipeline_training, catalog=catalog, copy_mode="assign") 77 | ``` 78 | 79 | The available `copy_mode` values are ``assign``, ``copy`` and ``deepcopy``. It is also possible to pass a dictionary to specify a different copy mode for each dataset. 80 | -------------------------------------------------------------------------------- /docs/source/05_API/01_python_objects/04_CLI.md: -------------------------------------------------------------------------------- 1 | # CLI commands 2 | 3 | ## ``init`` 4 | 5 | ``kedro mlflow init``: this command is needed to initialize your project. You cannot run any other commands before you run this one once. It performs two actions: 6 | - creates a ``mlflow.yml`` configuration file in your ``conf/local`` folder 7 | - replaces the ``src/PYTHON_PACKAGE/run.py`` file with an updated version of the template. If your template has been modified since project creation, a warning will be raised. You can either run ``kedro mlflow init --force`` to ignore this warning (but this will erase your ``run.py``) or [set hooks manually](https://kedro-mlflow.readthedocs.io/en/latest/source/02_getting_started/01_installation/02_setup.html). 8 | 9 | `init` has two arguments: 10 | 11 | - `--env`, which enables specifying another environment where the `mlflow.yml` should be created (e.g. `base`) 12 | - `--force`, which overwrites the `mlflow.yml` if it already exists and replaces it with the default one. Use it with caution! 13 | 14 | ## ``ui`` 15 | 16 | ``kedro mlflow ui``: this command opens the mlflow UI (basically, it launches the ``mlflow ui`` command). 17 | 18 | `ui` accepts the port and host arguments of the [``mlflow ui`` command](https://www.mlflow.org/docs/latest/cli.html#mlflow-ui). The default values used will be the ones defined in the [``mlflow.yml`` configuration file under the `ui` key](https://kedro-mlflow.readthedocs.io/en/latest/source/03_experiment_tracking/01_experiment_tracking/01_configuration.html). 19 | 20 | If you provide the arguments at runtime, they will take priority over the ``mlflow.yml``, e.g. if you have: 21 | 22 | ```yaml 23 | # mlflow.yml 24 | ui: 25 | localhost: "0.0.0.0" 26 | port: "5001" 27 | ``` 28 | 29 | then 30 | 31 | ```console 32 | kedro mlflow ui --port=5002 33 | ``` 34 | 35 | will open the ui on port 5002. 36 | 37 | ## ``modelify`` 38 | 39 | ``kedro mlflow modelify``: this command converts a kedro pipeline to an mlflow model and logs it in mlflow. It enables distributing the kedro pipeline as a standalone model and leverages all mlflow serving capabilities (e.g. as an API). 40 | 41 | `modelify` accepts the following arguments: 42 | 43 | - ``--pipeline``, ``-p``: The name of the kedro pipeline registered in ``pipeline_registry.py`` that you want to convert to an mlflow model. 44 | - ``--input-name``, ``-i``: The name of the kedro dataset (in ``catalog.yml``) which is the input of your pipeline. It contains the data to predict on. 45 | - ``--infer-signature``: A boolean which indicates whether the signature of the input data should be inferred for mlflow. 46 | - ``--infer-input-example``: A boolean which indicates whether the ``input_example`` of the input data should be inferred for mlflow. 47 | - ``--run-id``, ``-r``: The id of the mlflow run where the model will be logged. If unspecified, the command creates a new run. 48 | - ``--run-name``: The name of the mlflow run where the model will be logged. Defaults to ``"modelify"``.
49 | - ``--copy-mode``: The copy mode to use when replacing each dataset by a ``MemoryDataset``. Either a string (applied to all datasets) or a dict mapping each dataset to a ``copy_mode``. 50 | - ``--artifact-path``: The artifact_path of mlflow.pyfunc.log_model, see https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model 51 | - ``--code-path``: The code_path of mlflow.pyfunc.log_model, see https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model 52 | - ``--conda-env``: The conda environment of mlflow.pyfunc.log_model, see https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model 53 | - ``--registered-model-name``: The registered_model_name of mlflow.pyfunc.log_model, see https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model 54 | - ``--await-registration-for``: The await_registration_for of mlflow.pyfunc.log_model, see https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model 55 | - ``--pip-requirements``: The pip_requirements of mlflow.pyfunc.log_model, see https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model 56 | - ``--extra-pip-requirements``: The extra_pip_requirements of mlflow.pyfunc.log_model, see https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model 57 | -------------------------------------------------------------------------------- /docs/source/05_API/01_python_objects/05_Configuration.md: -------------------------------------------------------------------------------- 1 | # Configuration 2 | 3 | The python object is ``KedroMlflowConfig`` and it can be filled through ``mlflow.yml``. 4 | 5 | More details are coming soon. 6 | -------------------------------------------------------------------------------- /docs/source/05_API/02_autoapi/kedro_mlflow.config.rst: -------------------------------------------------------------------------------- 1 | Configuration 2 | ==================================== 3 | 4 | .. automodule:: kedro_mlflow.config.kedro_mlflow_config 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/05_API/02_autoapi/kedro_mlflow.framework.cli.rst: -------------------------------------------------------------------------------- 1 | CLI 2 | ==== 3 | 4 | .. click:: kedro_mlflow.framework.cli.cli:init 5 | :prog: init 6 | :nested: full 7 | 8 | .. click:: kedro_mlflow.framework.cli.cli:ui 9 | :prog: ui 10 | :nested: full 11 | 12 | .. click:: kedro_mlflow.framework.cli.cli:modelify 13 | :prog: modelify 14 | :nested: full 15 | -------------------------------------------------------------------------------- /docs/source/05_API/02_autoapi/kedro_mlflow.framework.hooks.rst: -------------------------------------------------------------------------------- 1 | Hooks 2 | ====== 3 | 4 | Node Hook 5 | ----------- 6 | 7 | .. automodule:: kedro_mlflow.framework.hooks.mlflow_hook 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | -------------------------------------------------------------------------------- /docs/source/05_API/02_autoapi/kedro_mlflow.io.rst: -------------------------------------------------------------------------------- 1 | Datasets 2 | ================================== 3 | 4 | Artifact Dataset 5 | ----------------- 6 | 7 | ..
automodule:: kedro_mlflow.io.artifacts.mlflow_artifact_dataset 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | Metrics Dataset 13 | ---------------- 14 | 15 | .. automodule:: kedro_mlflow.io.metrics.mlflow_metric_dataset 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | .. automodule:: kedro_mlflow.io.metrics.mlflow_metric_history_dataset 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | 26 | .. automodule:: kedro_mlflow.io.metrics.mlflow_metrics_history_dataset 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Models Dataset 32 | --------------- 33 | 34 | .. automodule:: kedro_mlflow.io.models.mlflow_abstract_model_dataset 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | .. automodule:: kedro_mlflow.io.models.mlflow_model_tracking_dataset 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | .. automodule:: kedro_mlflow.io.models.mlflow_model_local_filesystem_dataset 45 | :members: 46 | :undoc-members: 47 | :show-inheritance: 48 | 49 | .. automodule:: kedro_mlflow.io.models.mlflow_model_registry_dataset 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | -------------------------------------------------------------------------------- /docs/source/05_API/02_autoapi/kedro_mlflow.mlflow.rst: -------------------------------------------------------------------------------- 1 | Custom Mlflow Models 2 | ==================== 3 | 4 | .. automodule:: kedro_mlflow.mlflow.kedro_pipeline_model 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/05_API/02_autoapi/kedro_mlflow.pipeline.rst: -------------------------------------------------------------------------------- 1 | Pipelines 2 | ========= 3 | 4 | .. automodule:: kedro_mlflow.pipeline.pipeline_ml 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | .. automodule:: kedro_mlflow.pipeline.pipeline_ml_factory 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | -------------------------------------------------------------------------------- /docs/source/05_API/02_autoapi/kedro_mlflow.rst: -------------------------------------------------------------------------------- 1 | kedro\_mlflow package 2 | ===================== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 6 6 | 7 | kedro_mlflow.io 8 | kedro_mlflow.framework.cli 9 | kedro_mlflow.pipeline 10 | kedro_mlflow.mlflow 11 | kedro_mlflow.config 12 | kedro_mlflow.framework.hooks 13 | -------------------------------------------------------------------------------- /docs/source/05_API/index.md: -------------------------------------------------------------------------------- 1 | 2 | # API 3 | 4 | ```{toctree} 5 | :caption: Python objects 6 | 7 | 01_python_objects/01_Datasets 8 | 01_python_objects/02_Hooks 9 | 01_python_objects/03_Pipelines 10 | 01_python_objects/04_CLI 11 | 01_python_objects/05_Configuration 12 | ``` 13 | 14 | ```{toctree} 15 | :caption: API 16 | 17 | 02_autoapi/kedro_mlflow 18 | ``` 19 | -------------------------------------------------------------------------------- /docs/source/06_migration_guide/index.md: -------------------------------------------------------------------------------- 1 | # Migration guides 2 | 3 | ```{toctree} 4 | :caption: Migrating between kedro-mlflow versions 5 | 6 | migration_guide_kedro_mlflow 7 | 8 | ``` 9 | 10 | ```{toctree} 11 | :caption: Migrating from kedro-viz experiment tracking to kedro-mlflow 12 | 13 | migration_guide_kedro_experiment_tracking 14 | ``` 15 | -------------------------------------------------------------------------------- /docs/source/06_migration_guide/migration_guide_kedro_experiment_tracking.md: -------------------------------------------------------------------------------- 1 | # Migration guide from kedro-viz experiment tracking 2 | 3 | If you use Kedro's [native experiment tracking functionality](https://docs.kedro.org/projects/kedro-viz/en/v9.2.0/experiment_tracking.html), be aware that it is deprecated as of ``kedro-viz==0.11.0``. 4 | 5 | The core team suggests migrating to kedro-mlflow and [provides a blog post](https://kedro.org/blog/deprecate-experiment-tracking-kedro-viz) explaining the process.
6 | 7 | 8 | ::::::{grid} 1 2 2 2 9 | :gutter: 3 10 | 11 | :::::{grid-item-card} 12 | :link: https://kedro.org/blog/deprecate-experiment-tracking-kedro-viz 13 | :link-type: url 14 | :shadow: none 15 | :class-card: example-gallery 16 | 17 | :::{image} ../imgs/blogpost_migrate_experiment_tracking.png 18 | ::: 19 | ::::: 20 | 21 | :::::: 22 | -------------------------------------------------------------------------------- /docs/source/imgs/apps_interaction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/apps_interaction.png -------------------------------------------------------------------------------- /docs/source/imgs/blogpost_migrate_experiment_tracking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/blogpost_migrate_experiment_tracking.png -------------------------------------------------------------------------------- /docs/source/imgs/default_catalog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/default_catalog.png -------------------------------------------------------------------------------- /docs/source/imgs/etl_app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/etl_app.png -------------------------------------------------------------------------------- /docs/source/imgs/hook_registration_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/hook_registration_process.png -------------------------------------------------------------------------------- /docs/source/imgs/initialized_project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/initialized_project.png -------------------------------------------------------------------------------- /docs/source/imgs/kedro_viz_params.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/kedro_viz_params.png -------------------------------------------------------------------------------- /docs/source/imgs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/logo.png -------------------------------------------------------------------------------- /docs/source/imgs/ml_pipeline/preprocessing/all.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/preprocessing/all.PNG 
-------------------------------------------------------------------------------- /docs/source/imgs/ml_pipeline/preprocessing/inference.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/preprocessing/inference.PNG -------------------------------------------------------------------------------- /docs/source/imgs/ml_pipeline/preprocessing/training.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/preprocessing/training.PNG -------------------------------------------------------------------------------- /docs/source/imgs/ml_pipeline/shared_inputs/all.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/shared_inputs/all.PNG -------------------------------------------------------------------------------- /docs/source/imgs/ml_pipeline/shared_inputs/inference.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/shared_inputs/inference.PNG -------------------------------------------------------------------------------- /docs/source/imgs/ml_pipeline/shared_inputs/training.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/shared_inputs/training.PNG -------------------------------------------------------------------------------- /docs/source/imgs/ml_pipeline/tokenizer/all.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/tokenizer/all.PNG -------------------------------------------------------------------------------- /docs/source/imgs/ml_pipeline/tokenizer/inference.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/tokenizer/inference.PNG -------------------------------------------------------------------------------- /docs/source/imgs/ml_pipeline/tokenizer/training.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/tokenizer/training.PNG -------------------------------------------------------------------------------- /docs/source/imgs/ml_pipeline/vanilla/all.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/vanilla/all.PNG -------------------------------------------------------------------------------- /docs/source/imgs/ml_pipeline/vanilla/inference.PNG: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/vanilla/inference.PNG -------------------------------------------------------------------------------- /docs/source/imgs/ml_pipeline/vanilla/training.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/ml_pipeline/vanilla/training.PNG -------------------------------------------------------------------------------- /docs/source/imgs/mlflow_host_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/mlflow_host_page.png -------------------------------------------------------------------------------- /docs/source/imgs/mlflow_run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/mlflow_run.png -------------------------------------------------------------------------------- /docs/source/imgs/mlflow_tracking_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/mlflow_tracking_schema.png -------------------------------------------------------------------------------- /docs/source/imgs/mlflow_yml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/mlflow_yml.png -------------------------------------------------------------------------------- /docs/source/imgs/once_run_project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/once_run_project.png -------------------------------------------------------------------------------- /docs/source/imgs/run_with_artifact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/run_with_artifact.png -------------------------------------------------------------------------------- /docs/source/imgs/updated_catalog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/docs/source/imgs/updated_catalog.png -------------------------------------------------------------------------------- /kedro_mlflow/__init__.py: -------------------------------------------------------------------------------- 1 | """kedro-mlflow plugin constants""" 2 | 3 | __version__ = "0.14.4" 4 | 5 | import logging 6 | 7 | logging.getLogger(__name__).setLevel(logging.INFO) 8 | -------------------------------------------------------------------------------- /kedro_mlflow/config/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/kedro_mlflow/config/__init__.py -------------------------------------------------------------------------------- /kedro_mlflow/config/resolvers.py: -------------------------------------------------------------------------------- 1 | from mlflow.utils.name_utils import _generate_random_name 2 | 3 | 4 | def resolve_random_name(): 5 | # a resolver must have an argument, see: https://github.com/omry/omegaconf/issues/1060 6 | return _generate_random_name() 7 | -------------------------------------------------------------------------------- /kedro_mlflow/framework/__init__.py: -------------------------------------------------------------------------------- 1 | """``kedro_mlflow.framework`` provides mlflow extensions for Kedro's framework components""" 2 | -------------------------------------------------------------------------------- /kedro_mlflow/framework/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/kedro_mlflow/framework/cli/__init__.py -------------------------------------------------------------------------------- /kedro_mlflow/framework/cli/cli_utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Union 3 | 4 | from jinja2 import Environment, FileSystemLoader 5 | 6 | 7 | def render_jinja_template( 8 | src: Union[str, Path], is_cookiecutter=False, **kwargs 9 | ) -> str: 10 | """This functions enable to copy a file and render the 11 | tags (identified by {{ my_tag }}) with the values provided in kwargs. 12 | 13 | Arguments: 14 | src {Union[str, Path]} -- The path to the template which should be rendered 15 | 16 | Returns: 17 | str -- A string that contains all the files with replaced tags. 18 | """ 19 | src = Path(src) 20 | 21 | template_loader = FileSystemLoader(searchpath=src.parent.as_posix()) 22 | # the keep_trailing_new_line option is mandatory to 23 | # make sure that black formatting will be preserved 24 | template_env = Environment(loader=template_loader, keep_trailing_newline=True) 25 | template = template_env.get_template(src.name) 26 | if is_cookiecutter: 27 | # we need to match tags from a cookiecutter object 28 | # but cookiecutter only deals with folder, not file 29 | # thus we need to create an object with all necessary attributes 30 | class FalseCookieCutter: 31 | def __init__(self, **kwargs): 32 | self.__dict__.update(kwargs) 33 | 34 | parsed_template = template.render(cookiecutter=FalseCookieCutter(**kwargs)) 35 | else: 36 | parsed_template = template.render(**kwargs) 37 | 38 | return parsed_template 39 | 40 | 41 | def write_jinja_template( 42 | src: Union[str, Path], dst: Union[str, Path], **kwargs 43 | ) -> None: 44 | """Write a template file and replace tis jinja's tags 45 | (identified by {{ my_tag }}) with the values provided in kwargs. 
46 | 47 | Arguments: 48 | src {Union[str, Path]} -- Path to the template which should be rendered 49 | dst {Union[str, Path]} -- Path where the rendered template should be saved 50 | """ 51 | dst = Path(dst) 52 | parsed_template = render_jinja_template(src, **kwargs) 53 | with open(dst, "w") as file_handler: 54 | file_handler.write(parsed_template) 55 | -------------------------------------------------------------------------------- /kedro_mlflow/framework/hooks/__init__.py: -------------------------------------------------------------------------------- 1 | from .mlflow_hook import MlflowHook, mlflow_hook 2 | 3 | __all__ = ["MlflowHook", "mlflow_hook"] 4 | -------------------------------------------------------------------------------- /kedro_mlflow/framework/hooks/utils.py: -------------------------------------------------------------------------------- 1 | from kedro_mlflow.config.kedro_mlflow_config import KedroMlflowConfig 2 | 3 | 4 | def _assert_mlflow_enabled( 5 | pipeline_name: str, mlflow_config: KedroMlflowConfig 6 | ) -> bool: 7 | # TODO: we may want to enable to filter on tags 8 | # but we need to deal with the case when several tags are passed 9 | # what to do if 1 out of 2 is in the list? 10 | disabled_pipelines = mlflow_config.tracking.disable_tracking.pipelines 11 | if pipeline_name in disabled_pipelines: 12 | return False 13 | 14 | return True 15 | 16 | 17 | def _generate_kedro_command( 18 | tags, node_names, from_nodes, to_nodes, from_inputs, load_versions, pipeline_name 19 | ): 20 | cmd_list = ["kedro", "run"] 21 | SEP = "=" 22 | if from_inputs: 23 | cmd_list.append("--from-inputs" + SEP + ",".join(from_inputs)) 24 | if from_nodes: 25 | cmd_list.append("--from-nodes" + SEP + ",".join(from_nodes)) 26 | if to_nodes: 27 | cmd_list.append("--to-nodes" + SEP + ",".join(to_nodes)) 28 | if node_names: 29 | cmd_list.append("--node" + SEP + ",".join(node_names)) 30 | if pipeline_name: 31 | cmd_list.append("--pipeline" + SEP + pipeline_name) 32 | if tags: 33 | # "tag" is the name of the command, "tags" the value in run_params 34 | cmd_list.append("--tag" + SEP + ",".join(tags)) 35 | if load_versions: 36 | # "load_version" is the name of the command, "load_versions" the value in run_params 37 | formatted_versions = [f"{k}:{v}" for k, v in load_versions.items()] 38 | cmd_list.append("--load-version" + SEP + ",".join(formatted_versions)) 39 | 40 | kedro_cmd = " ".join(cmd_list) 41 | return kedro_cmd 42 | 43 | 44 | def _flatten_dict(d: dict, recursive: bool = True, sep: str = ".") -> dict: 45 | def expand(key, value): 46 | if isinstance(value, dict): 47 | new_value = ( 48 | _flatten_dict(value, recursive=recursive, sep=sep) 49 | if recursive 50 | else value 51 | ) 52 | return [(f"{key}{sep}{k}", v) for k, v in new_value.items()] 53 | else: 54 | return [(f"{key}", value)] 55 | 56 | items = [item for k, v in d.items() for item in expand(k, v)] 57 | 58 | return dict(items) 59 | -------------------------------------------------------------------------------- /kedro_mlflow/io/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/kedro_mlflow/io/__init__.py -------------------------------------------------------------------------------- /kedro_mlflow/io/artifacts/__init__.py: -------------------------------------------------------------------------------- 1 | from .mlflow_artifact_dataset import MlflowArtifactDataset 2 | 3 | __all__ = 
["MlflowArtifactDataset"] 4 | -------------------------------------------------------------------------------- /kedro_mlflow/io/catalog/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/kedro_mlflow/io/catalog/__init__.py -------------------------------------------------------------------------------- /kedro_mlflow/io/catalog/switch_catalog_logging.py: -------------------------------------------------------------------------------- 1 | def switch_catalog_logging(catalog, logging_flag=True): 2 | for name, dataset in catalog._datasets.items(): 3 | if type(dataset).__name__.startswith("Mlflow"): 4 | catalog._datasets[name]._logging_activated = logging_flag 5 | -------------------------------------------------------------------------------- /kedro_mlflow/io/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from .mlflow_metric_dataset import MlflowMetricDataset 2 | from .mlflow_metric_history_dataset import MlflowMetricHistoryDataset 3 | from .mlflow_metrics_history_dataset import MlflowMetricsHistoryDataset 4 | 5 | __all__ = [ 6 | "MlflowMetricDataset", 7 | "MlflowMetricHistoryDataset", 8 | "MlflowMetricsHistoryDataset", 9 | ] 10 | -------------------------------------------------------------------------------- /kedro_mlflow/io/metrics/mlflow_abstract_metric_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional, Union 2 | 3 | import mlflow 4 | from kedro.io import AbstractDataset 5 | from mlflow.tracking import MlflowClient 6 | 7 | 8 | class MlflowAbstractMetricDataset(AbstractDataset): 9 | def __init__( 10 | self, 11 | key: str = None, 12 | run_id: str = None, 13 | load_args: dict[str, Any] = None, 14 | save_args: dict[str, Any] = None, 15 | metadata: Optional[dict[str, Any]] = None, 16 | ): 17 | """Initialise MlflowMetricsHistoryDataset. 18 | 19 | Args: 20 | run_id (str): The ID of the mlflow run where the metric should be logged 21 | """ 22 | 23 | self.key = key 24 | self.run_id = run_id 25 | self._load_args = load_args or {} 26 | self._save_args = save_args or {} 27 | self._logging_activated = True # by default, logging is activated! 28 | self.metadata = metadata 29 | 30 | @property 31 | def run_id(self) -> Union[str, None]: 32 | """Get run id.""" 33 | 34 | run = mlflow.active_run() 35 | if (self._run_id is None) and (run is not None): 36 | # if no run_id is specified, we try to retrieve the current run 37 | # this is useful because during a kedro run, we want to be able to retrieve 38 | # the metric from the active run to be able to reload a metric 39 | # without specifying the (unknown) run id 40 | return run.info.run_id 41 | 42 | # else we return the _run_id which can eventually be None. 
43 | # In this case, saving will work (a new run will be created) 44 | # but loading will fail, 45 | # according to mlflow's behaviour 46 | return self._run_id 47 | 48 | @run_id.setter 49 | def run_id(self, run_id: str): 50 | self._run_id = run_id 51 | 52 | # we want to be able to turn logging off for an entire pipeline run 53 | # To avoid that a single call to a dataset in the catalog creates a new run automatically 54 | # we want to be able to turn everything off 55 | @property 56 | def _logging_activated(self): 57 | return self.__logging_activated 58 | 59 | @_logging_activated.setter 60 | def _logging_activated(self, flag): 61 | if not isinstance(flag, bool): 62 | raise ValueError(f"_logging_activated must be a boolean, got {type(flag)}") 63 | self.__logging_activated = flag 64 | 65 | def _validate_run_id(self): 66 | if self.run_id is None: 67 | raise ValueError( 68 | "You must either specify a run_id or have a mlflow active run opened. Use mlflow.start_run() if necessary." 69 | ) 70 | 71 | def _exists(self) -> bool: 72 | """Check if the metric exists in remote mlflow storage exists. 73 | 74 | Returns: 75 | bool: Does the metric name exist in the given run_id? 76 | """ 77 | mlflow_client = MlflowClient() 78 | run_id = self.run_id # will get the active run if nothing is specified 79 | run = mlflow_client.get_run(run_id) if run_id else mlflow.active_run() 80 | 81 | flag_exist = self.key in run.data.metrics.keys() if run else False 82 | return flag_exist 83 | 84 | def _describe(self) -> dict[str, Any]: 85 | """Describe MLflow metrics dataset. 86 | 87 | Returns: 88 | dict[str, Any]: dictionary with MLflow metrics dataset description. 89 | """ 90 | return { 91 | "key": self.key, 92 | "run_id": self.run_id, 93 | } 94 | -------------------------------------------------------------------------------- /kedro_mlflow/io/metrics/mlflow_metric_dataset.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import Any, Optional 3 | 4 | from mlflow.tracking import MlflowClient 5 | 6 | from kedro_mlflow.io.metrics.mlflow_abstract_metric_dataset import ( 7 | MlflowAbstractMetricDataset, 8 | ) 9 | 10 | 11 | class MlflowMetricDataset(MlflowAbstractMetricDataset): 12 | SUPPORTED_SAVE_MODES = {"overwrite", "append"} 13 | DEFAULT_SAVE_MODE = "overwrite" 14 | 15 | def __init__( 16 | self, 17 | key: str = None, 18 | run_id: str = None, 19 | load_args: dict[str, Any] = None, 20 | save_args: dict[str, Any] = None, 21 | metadata: Optional[dict[str, Any]] = None, 22 | ): 23 | """Initialise MlflowMetricDataset. 
24 | Args: 25 | run_id (str): The ID of the mlflow run where the metric should be logged 26 | """ 27 | 28 | super().__init__(key, run_id, load_args, save_args, metadata) 29 | 30 | # We add an extra argument mode="overwrite" / "append" to enable logging update an existing metric 31 | # this is not an offical mlflow argument for log_metric, so we separate it from the others 32 | # "overwrite" corresponds to the default mlflow behaviour 33 | self.mode = self._save_args.pop("mode", self.DEFAULT_SAVE_MODE) 34 | 35 | def _load(self): 36 | self._validate_run_id() 37 | mlflow_client = MlflowClient() 38 | metric_history = mlflow_client.get_metric_history( 39 | run_id=self.run_id, key=self.key 40 | ) # gets active run if no run_id was given 41 | 42 | # the metric history is always a list of mlflow.entities.metric.Metric 43 | # we want the value of the last one stored because this dataset only deal with one single metric 44 | step = self._load_args.get("step") 45 | 46 | if step is None: 47 | # we take the last value recorded 48 | metric_value = metric_history[-1].value 49 | else: 50 | # we should take the last historical value with the given step 51 | # (it is possible to have several values with the same step) 52 | metric_value = next( 53 | metric.value 54 | for metric in reversed(metric_history) 55 | if metric.step == step 56 | ) 57 | 58 | return metric_value 59 | 60 | def _save(self, data: float): 61 | if self._logging_activated: 62 | self._validate_run_id() 63 | run_id = self.run_id # we access it once instead of calling self.run_id everywhere to avoid looking or an active run each time 64 | 65 | mlflow_client = MlflowClient() 66 | 67 | # get the metric history if it has been saved previously to ensure 68 | # to retrieve the right data 69 | # reminder: this is True even if no run_id was originally specified but a run is active 70 | metric_history = ( 71 | mlflow_client.get_metric_history(run_id=run_id, key=self.key) 72 | if self._exists() 73 | else [] 74 | ) 75 | 76 | save_args = deepcopy(self._save_args) 77 | step = save_args.pop("step", None) 78 | if step is None: 79 | if self.mode == "overwrite": 80 | step = max([metric.step for metric in metric_history], default=0) 81 | elif self.mode == "append": 82 | # I put a max([]) default to -1 so that default "step" equals 0 83 | step = ( 84 | max([metric.step for metric in metric_history], default=-1) + 1 85 | ) 86 | else: 87 | raise ValueError( 88 | f"save_args['mode'] must be one of {self.SUPPORTED_SAVE_MODES}, got '{self.mode}' instead." 89 | ) 90 | 91 | mlflow_client.log_metric( 92 | run_id=run_id, 93 | key=self.key, 94 | value=data, 95 | step=step, 96 | **save_args, 97 | ) 98 | -------------------------------------------------------------------------------- /kedro_mlflow/io/metrics/mlflow_metric_history_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional, Union 2 | 3 | from mlflow.tracking import MlflowClient 4 | 5 | from kedro_mlflow.io.metrics.mlflow_abstract_metric_dataset import ( 6 | MlflowAbstractMetricDataset, 7 | ) 8 | 9 | 10 | class MlflowMetricHistoryDataset(MlflowAbstractMetricDataset): 11 | def __init__( 12 | self, 13 | key: str = None, 14 | run_id: str = None, 15 | load_args: dict[str, Any] = None, 16 | save_args: dict[str, Any] = None, 17 | metadata: Optional[dict[str, Any]] = None, 18 | ): 19 | """Initialise MlflowMetricDataset. 
20 | Args: 21 | run_id (str): The ID of the mlflow run where the metric should be logged 22 | """ 23 | 24 | super().__init__(key, run_id, load_args, save_args, metadata) 25 | 26 | def _load(self): 27 | self._validate_run_id() 28 | mode = self._load_args.get("mode", "list") 29 | mlflow_client = MlflowClient() 30 | 31 | metric_history = mlflow_client.get_metric_history(self.run_id, key=self.key) 32 | 33 | if mode == "list": 34 | simplified_history = [metric.value for metric in metric_history] 35 | elif mode == "dict": 36 | simplified_history = { 37 | metric.step: metric.value for metric in metric_history 38 | } 39 | elif mode == "history": 40 | # history is a list of dict whom keys are "log_metric" arguments. The following is equivalent to dict mode: 41 | # [{"step": 0, "value": 0.1}, {"step": 1, "value": 0.2}, {"step": 2, "value": 0.3}] 42 | simplified_history = [ 43 | { 44 | "step": metric.step, 45 | "value": metric.value, 46 | "timestamp": metric.timestamp, 47 | } 48 | for metric in metric_history 49 | ] 50 | return simplified_history 51 | 52 | def _save( 53 | self, 54 | data: Union[list[int], dict[int, float], list[dict[str, Union[float, str]]]], 55 | ): 56 | if self._logging_activated: 57 | self._validate_run_id() 58 | run_id = self.run_id 59 | 60 | mode = self._save_args.get("mode", "list") 61 | mlflow_client = MlflowClient() 62 | if mode == "list": 63 | # list is a list of value in sequential order: 64 | # [0.1,0.2,0.3] 65 | for i, value in enumerate(data): 66 | mlflow_client.log_metric( 67 | run_id=run_id, key=self.key, step=i, value=value 68 | ) 69 | elif mode == "dict": 70 | # dict is a {step: value} mapping: 71 | # [{0: 0.1}, {1: 0.2}, {2: 0.3}] 72 | for step, value in data.items(): 73 | mlflow_client.log_metric( 74 | run_id=run_id, key=self.key, step=step, value=value 75 | ) 76 | elif mode == "history": 77 | # history is a list of dict whom keys are "log_metric" arguments. The following is equivalent to dict mode: 78 | # [{"step": 0, "value": 0.1}, {"step": 1, "value": 0.2}, {"step": 2, "value": 0.3}] 79 | for log_kwargs in data: 80 | mlflow_client.log_metric(run_id=run_id, key=self.key, **log_kwargs) 81 | -------------------------------------------------------------------------------- /kedro_mlflow/io/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .mlflow_model_local_filesystem_dataset import MlflowModelLocalFileSystemDataset 2 | from .mlflow_model_registry_dataset import MlflowModelRegistryDataset 3 | from .mlflow_model_tracking_dataset import MlflowModelTrackingDataset 4 | 5 | __all__ = [ 6 | "MlflowModelLocalFileSystemDataset", 7 | "MlflowModelRegistryDataset", 8 | "MlflowModelTrackingDataset", 9 | ] 10 | -------------------------------------------------------------------------------- /kedro_mlflow/io/models/mlflow_abstract_model_dataset.py: -------------------------------------------------------------------------------- 1 | from importlib import import_module 2 | from importlib.util import find_spec 3 | from pathlib import Path 4 | from typing import Any, Optional 5 | 6 | from kedro.io import AbstractVersionedDataset, Version 7 | from kedro.io.core import DatasetError 8 | 9 | 10 | class MlflowAbstractModelDataSet(AbstractVersionedDataset): 11 | """ 12 | Abstract mother class for model datasets. 
13 | """ 14 | 15 | def __init__( 16 | self, 17 | filepath: str, 18 | flavor: str, 19 | pyfunc_workflow: Optional[str] = None, 20 | load_args: dict[str, Any] = None, 21 | save_args: dict[str, Any] = None, 22 | version: Version = None, 23 | metadata: Optional[dict[str, Any]] = None, 24 | ) -> None: 25 | """Initialize the Kedro MlflowAbstractModelDataSet. 26 | 27 | Parameters are passed from the Data Catalog. 28 | 29 | During save, the model is first logged to MLflow. 30 | During load, the model is pulled from MLflow run with `run_id`. 31 | 32 | Args: 33 | filepath (str): Path to store the dataset locally. 34 | flavor (str): Built-in or custom MLflow model flavor module. 35 | Must be Python-importable. 36 | pyfunc_workflow (str, optional): Either `python_model` or `loader_module`. 37 | See https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#workflows. 38 | load_args (dict[str, Any], optional): Arguments to `load_model` 39 | function from specified `flavor`. Defaults to {}. 40 | save_args (dict[str, Any], optional): Arguments to `log_model` 41 | function from specified `flavor`. Defaults to {}. 42 | version (Version, optional): Specific version to load. 43 | metadata: Any arbitrary metadata. 44 | This is ignored by Kedro, but may be consumed by users or external plugins. 45 | 46 | Raises: 47 | DatasetError: When passed `flavor` does not exist. 48 | """ 49 | 50 | super().__init__(Path(filepath), version) 51 | 52 | self._flavor = flavor 53 | self._pyfunc_workflow = pyfunc_workflow 54 | self._logging_activated = True # by default, it should be True! 55 | 56 | if flavor == "mlflow.pyfunc" and pyfunc_workflow not in ( 57 | "python_model", 58 | "loader_module", 59 | ): 60 | raise DatasetError( 61 | "PyFunc models require specifying `pyfunc_workflow` " 62 | "(set to either `python_model` or `loader_module`)" 63 | ) 64 | 65 | self._load_args = load_args or {} 66 | self._save_args = save_args or {} 67 | self.metadata = metadata 68 | 69 | try: 70 | self._mlflow_model_module 71 | except ImportError as err: 72 | raise DatasetError(err) 73 | 74 | # we want to be able to turn logging off for an entire pipeline run 75 | # To avoid that a single call to a dataset in the catalog creates a new run automatically 76 | # we want to be able to turn everything off 77 | @property 78 | def _logging_activated(self): 79 | return self.__logging_activated 80 | 81 | @_logging_activated.setter 82 | def _logging_activated(self, flag): 83 | if not isinstance(flag, bool): 84 | raise ValueError(f"_logging_activated must be a boolean, got {type(flag)}") 85 | self.__logging_activated = flag 86 | 87 | # IMPORTANT: _mlflow_model_module is a property to avoid STORING 88 | # the module as an attribute but rather store a string and load on the fly 89 | # The goal is to make this DataSet deepcopiable for compatibility with 90 | # KedroPipelineModel, e.g we can't just do : 91 | # self._mlflow_model_module = self._import_module(self._flavor) 92 | 93 | @property 94 | def _mlflow_model_module(self): # pragma: no cover 95 | pass 96 | 97 | @_mlflow_model_module.getter 98 | def _mlflow_model_module(self): 99 | return self._import_module(self._flavor) 100 | 101 | # TODO: check with Kajetan what was originally intended here 102 | # @classmethod 103 | # def _parse_args(cls, kwargs_dict: dict[str, Any]) -> dict[str, Any]: 104 | # parsed_kargs = {} 105 | # for key, value in kwargs_dict.items(): 106 | # if key.endswith("_args"): 107 | # continue 108 | # if f"{key}_args" in kwargs_dict: 109 | # new_value = cls._import_module(value)( 110 
| # MlflowModelDataSet._parse_args(kwargs_dict[f"{key}_args"]) 111 | # ) 112 | # parsed_kargs[key] = new_value 113 | # else: 114 | # parsed_kargs[key] = value 115 | # return parsed_kargs 116 | 117 | @staticmethod 118 | def _import_module(import_path: str) -> Any: 119 | exists = find_spec(import_path) 120 | 121 | if not exists: 122 | raise ImportError( 123 | f"'{import_path}' module not found. Check valid flavor in mlflow documentation: https://www.mlflow.org/docs/latest/python_api/index.html" 124 | ) 125 | 126 | return import_module(import_path) 127 | -------------------------------------------------------------------------------- /kedro_mlflow/io/models/mlflow_model_local_filesystem_dataset.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from os.path import exists 3 | from typing import Any, Optional 4 | 5 | from kedro.io import Version 6 | 7 | from kedro_mlflow.io.models.mlflow_abstract_model_dataset import ( 8 | MlflowAbstractModelDataSet, 9 | ) 10 | 11 | 12 | class MlflowModelLocalFileSystemDataset(MlflowAbstractModelDataSet): 13 | """Wrapper for saving, logging and loading for all MLflow model flavor.""" 14 | 15 | def __init__( 16 | self, 17 | filepath: str, 18 | flavor: str, 19 | pyfunc_workflow: Optional[str] = None, 20 | load_args: dict[str, Any] = None, 21 | save_args: dict[str, Any] = None, 22 | log_args: dict[str, Any] = None, 23 | version: Version = None, 24 | metadata: Optional[dict[str, Any]] = None, 25 | ) -> None: 26 | """Initialize the Kedro MlflowModelDataSet. 27 | 28 | Parameters are passed from the Data Catalog. 29 | 30 | During save, the model is saved locally at `filepath` 31 | During load, the model is loaded from the local `filepath`. 32 | 33 | Args: 34 | flavor (str): Built-in or custom MLflow model flavor module. 35 | Must be Python-importable. 36 | filepath (str): Path to store the dataset locally. 37 | pyfunc_workflow (str, optional): Either `python_model` or `loader_module`. 38 | See https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#workflows. 39 | load_args (dict[str, Any], optional): Arguments to `load_model` 40 | function from specified `flavor`. Defaults to None. 41 | save_args (dict[str, Any], optional): Arguments to `save_model` 42 | function from specified `flavor`. Defaults to None. 43 | version (Version, optional): Kedro version to use. Defaults to None. 44 | metadata: Any arbitrary metadata. 45 | This is ignored by Kedro, but may be consumed by users or external plugins. 46 | 47 | Raises: 48 | DatasetError: When passed `flavor` does not exist. 49 | """ 50 | super().__init__( 51 | filepath=filepath, 52 | flavor=flavor, 53 | pyfunc_workflow=pyfunc_workflow, 54 | load_args=load_args, 55 | save_args=save_args, 56 | version=version, 57 | metadata=metadata, 58 | ) 59 | 60 | def _load(self) -> Any: 61 | """Loads an MLflow model from local path or from MLflow run. 62 | 63 | Returns: 64 | Any: Deserialized model. 65 | """ 66 | return self._mlflow_model_module.load_model( 67 | model_uri=self._get_load_path().as_uri(), **self._load_args 68 | ) 69 | 70 | def _save(self, model: Any) -> None: 71 | """Save a model to local path and then logs it to MLflow. 72 | 73 | Args: 74 | model (Any): A model object supported by the given MLflow flavor. 75 | """ 76 | save_path = self._get_save_path() 77 | # In case of an unversioned model we need to remove the save path 78 | # because MLflow cannot overwrite the target directory. 
79 | if exists(save_path): 80 | shutil.rmtree(save_path) 81 | 82 | if self._flavor == "mlflow.pyfunc": 83 | # PyFunc models utilise either `python_model` or `loader_module` 84 | # workflow. We we assign the passed `model` object to one of those keys 85 | # depending on the chosen `pyfunc_workflow`. 86 | self._save_args[self._pyfunc_workflow] = model 87 | self._mlflow_model_module.save_model(save_path, **self._save_args) 88 | else: 89 | # Otherwise we save using the common workflow where first argument is the 90 | # model object and second is the path. 91 | self._mlflow_model_module.save_model(model, save_path, **self._save_args) 92 | 93 | def _describe(self) -> dict[str, Any]: 94 | return dict( 95 | filepath=self._filepath, 96 | flavor=self._flavor, 97 | pyfunc_workflow=self._pyfunc_workflow, 98 | load_args=self._load_args, 99 | save_args=self._save_args, 100 | version=self._version, 101 | ) 102 | -------------------------------------------------------------------------------- /kedro_mlflow/io/models/mlflow_model_registry_dataset.py: -------------------------------------------------------------------------------- 1 | from logging import Logger, getLogger 2 | from typing import Any, Optional, Union 3 | 4 | from kedro.io.core import DatasetError 5 | 6 | from kedro_mlflow.io.models.mlflow_abstract_model_dataset import ( 7 | MlflowAbstractModelDataSet, 8 | ) 9 | 10 | 11 | class MlflowModelRegistryDataset(MlflowAbstractModelDataSet): 12 | """Wrapper for saving, logging and loading for all MLflow model flavor.""" 13 | 14 | def __init__( 15 | self, 16 | model_name: str, 17 | stage_or_version: Union[str, int, None] = None, 18 | alias: Optional[str] = None, 19 | flavor: Optional[str] = "mlflow.pyfunc", 20 | pyfunc_workflow: Optional[str] = "python_model", 21 | load_args: Optional[dict[str, Any]] = None, 22 | metadata: Optional[dict[str, Any]] = None, 23 | ) -> None: 24 | """Initialize the Kedro MlflowModelRegistryDataset. 25 | 26 | Parameters are passed from the Data Catalog. 27 | 28 | During "load", the model is pulled from MLflow model registry by its name. 29 | "save" is not supported. 30 | 31 | Args: 32 | model_name (str): The name of the registered model is the mlflow registry 33 | stage_or_version (str): A valid stage (either "staging" or "production") or version number for the registred model. 34 | Default to "latest" which fetch the last version and the higher "stage" available. 35 | flavor (str): Built-in or custom MLflow model flavor module. 36 | Must be Python-importable. 37 | pyfunc_workflow (str, optional): Either `python_model` or `loader_module`. 38 | See https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#workflows. 39 | load_args (dict[str, Any], optional): Arguments to `load_model` 40 | function from specified `flavor`. Defaults to None. 41 | metadata: Any arbitrary metadata. 42 | This is ignored by Kedro, but may be consumed by users or external plugins. 43 | 44 | Raises: 45 | DatasetError: When passed `flavor` does not exist. 
46 | """ 47 | super().__init__( 48 | filepath="", 49 | flavor=flavor, 50 | pyfunc_workflow=pyfunc_workflow, 51 | load_args=load_args, 52 | save_args={}, 53 | version=None, 54 | metadata=metadata, 55 | ) 56 | 57 | if alias is None and stage_or_version is None: 58 | # reassign stage_or_version to "latest" 59 | stage_or_version = "latest" 60 | 61 | if alias and stage_or_version: 62 | raise DatasetError( 63 | f"You cannot specify 'alias' and 'stage_or_version' simultaneously ({alias=} and {stage_or_version=})" 64 | ) 65 | 66 | self.model_name = model_name 67 | self.stage_or_version = stage_or_version 68 | self.alias = alias 69 | self.model_uri = ( 70 | f"models:/{model_name}@{alias}" 71 | if alias 72 | else f"models:/{model_name}/{stage_or_version}" 73 | ) 74 | 75 | @property 76 | def _logger(self) -> Logger: 77 | return getLogger(__name__) 78 | 79 | def _load(self) -> Any: 80 | """Loads an MLflow model from local path or from MLflow run. 81 | 82 | Returns: 83 | Any: Deserialized model. 84 | """ 85 | 86 | # If `run_id` is specified, pull the model from MLflow. 87 | # TODO: enable loading from another mlflow conf (with a client with another tracking uri) 88 | # Alternatively, use local path to load the model. 89 | model = self._mlflow_model_module.load_model( 90 | model_uri=self.model_uri, **self._load_args 91 | ) 92 | 93 | # log some info because "latest" model is not very informative 94 | # the model itself does not have information about its registry 95 | # because the same run can be registered under several different names 96 | # in the registry. See https://github.com/Galileo-Galilei/kedro-mlflow/issues/552 97 | 98 | self._logger.info(f"Loading model from run_id='{model.metadata.run_id}'") 99 | return model 100 | 101 | def _save(self, model: Any) -> None: 102 | raise NotImplementedError( 103 | "The 'save' method is not implemented for MlflowModelRegistryDataset. You can pass 'registered_model_name' argument in 'MLflowModelTrackingDataset(..., save_args={registered_model_name='my_model'}' to save and register a model in the same step. " 104 | ) 105 | 106 | def _describe(self) -> dict[str, Any]: 107 | return dict( 108 | model_uri=self.model_uri, 109 | model_name=self.model_name, 110 | stage_or_version=self.stage_or_version, 111 | alias=self.alias, 112 | flavor=self._flavor, 113 | pyfunc_workflow=self._pyfunc_workflow, 114 | # load_args=self._load_args, 115 | ) 116 | -------------------------------------------------------------------------------- /kedro_mlflow/io/models/mlflow_model_tracking_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | 3 | import mlflow 4 | from kedro.io.core import DatasetError 5 | 6 | from kedro_mlflow.io.models.mlflow_abstract_model_dataset import ( 7 | MlflowAbstractModelDataSet, 8 | ) 9 | 10 | 11 | class MlflowModelTrackingDataset(MlflowAbstractModelDataSet): 12 | """Wrapper for saving, logging and loading for all MLflow model flavor.""" 13 | 14 | def __init__( 15 | self, 16 | flavor: str, 17 | run_id: Optional[str] = None, 18 | artifact_path: Optional[str] = "model", 19 | pyfunc_workflow: Optional[str] = None, 20 | load_args: Optional[dict[str, Any]] = None, 21 | save_args: Optional[dict[str, Any]] = None, 22 | metadata: Optional[dict[str, Any]] = None, 23 | ) -> None: 24 | """Initialize the Kedro MlflowModelDataSet. 25 | 26 | Parameters are passed from the Data Catalog. 27 | 28 | During save, the model is first logged to MLflow. 
29 | During load, the model is pulled from MLflow run with `run_id`. 30 | 31 | Args: 32 | flavor (str): Built-in or custom MLflow model flavor module. 33 | Must be Python-importable. 34 | run_id (Optional[str], optional): MLflow run ID to use to load 35 | the model from or save the model to. Defaults to None. 36 | artifact_path (str, optional): the run relative path to 37 | the model. 38 | pyfunc_workflow (str, optional): Either `python_model` or `loader_module`. 39 | See https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#workflows. 40 | load_args (dict[str, Any], optional): Arguments to `load_model` 41 | function from specified `flavor`. Defaults to None. 42 | save_args (dict[str, Any], optional): Arguments to `log_model` 43 | function from specified `flavor`. Defaults to None. 44 | metadata: Any arbitrary metadata. 45 | This is ignored by Kedro, but may be consumed by users or external plugins. 46 | 47 | Raises: 48 | DatasetError: When passed `flavor` does not exist. 49 | """ 50 | super().__init__( 51 | filepath="", 52 | flavor=flavor, 53 | pyfunc_workflow=pyfunc_workflow, 54 | load_args=load_args, 55 | save_args=save_args, 56 | version=None, 57 | metadata=metadata, 58 | ) 59 | 60 | self._run_id = run_id 61 | self._artifact_path = artifact_path 62 | 63 | # drop the key which MUST be common to save and load and 64 | # thus is instantiated outside save_args 65 | self._save_args.pop("artifact_path", None) 66 | 67 | @property 68 | def model_uri(self): 69 | run_id = None 70 | if self._run_id: 71 | run_id = self._run_id 72 | elif mlflow.active_run() is not None: 73 | run_id = mlflow.active_run().info.run_id 74 | if run_id is None: 75 | raise DatasetError( 76 | "To access the model_uri, you must either: " 77 | "\n - specifiy 'run_id' " 78 | "\n - have an active run to retrieve data from" 79 | ) 80 | 81 | model_uri = f"runs:/{run_id}/{self._artifact_path}" 82 | 83 | return model_uri 84 | 85 | def _load(self) -> Any: 86 | """Loads an MLflow model from local path or from MLflow run. 87 | 88 | Returns: 89 | Any: Deserialized model. 90 | """ 91 | 92 | # If `run_id` is specified, pull the model from MLflow. 93 | # TODO: enable loading from another mlflow conf (with a client with another tracking uri) 94 | # Alternatively, use local path to load the model. 95 | return self._mlflow_model_module.load_model( 96 | model_uri=self.model_uri, **self._load_args 97 | ) 98 | 99 | def _save(self, model: Any) -> None: 100 | """Save a model to local path and then logs it to MLflow. 101 | 102 | Args: 103 | model (Any): A model object supported by the given MLflow flavor. 104 | """ 105 | if self._run_id: 106 | if mlflow.active_run(): 107 | # it is not possible to log in a run which is not the current opened one 108 | raise DatasetError( 109 | f"'run_id' cannot be specified (run_id='{self._run_id}') " 110 | f"if there is an mlflow active run (active run id='{mlflow.active_run().info.run_id}') " 111 | f"See the rationale in this issue: https://github.com/Galileo-Galilei/kedro-mlflow/issues/549." 
112 | ) 113 | else: 114 | # if the run id is specified and there is no opened run, 115 | # open the right run before logging 116 | with mlflow.start_run(run_id=self._run_id): 117 | self._save_model_in_run(model) 118 | else: 119 | # if there is no run_id, log in active run 120 | # OR open automatically a new run to log 121 | self._save_model_in_run(model) 122 | 123 | def _save_model_in_run(self, model): 124 | if self._flavor == "mlflow.pyfunc": 125 | # PyFunc models utilise either `python_model` or `loader_module` 126 | # workflow. We assign the passed `model` object to one of those keys 127 | # depending on the chosen `pyfunc_workflow`. 128 | self._save_args[self._pyfunc_workflow] = model 129 | if self._logging_activated: 130 | self._mlflow_model_module.log_model( 131 | self._artifact_path, **self._save_args 132 | ) 133 | elif self._logging_activated: 134 | # Otherwise we save using the common workflow where the first argument is the 135 | # model object and the second is the path. 136 | self._mlflow_model_module.log_model( 137 | model, self._artifact_path, **self._save_args 138 | ) 139 | 140 | def _describe(self) -> dict[str, Any]: 141 | return dict( 142 | flavor=self._flavor, 143 | run_id=self._run_id, 144 | artifact_path=self._artifact_path, 145 | pyfunc_workflow=self._pyfunc_workflow, 146 | load_args=self._load_args, 147 | save_args=self._save_args, 148 | ) 149 | -------------------------------------------------------------------------------- /kedro_mlflow/mlflow/__init__.py: -------------------------------------------------------------------------------- 1 | from .kedro_pipeline_model import KedroPipelineModel # noqa: F401 2 | -------------------------------------------------------------------------------- /kedro_mlflow/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from .pipeline_ml_factory import pipeline_ml_factory 2 | 3 | __all__ = ["pipeline_ml_factory"] 4 | -------------------------------------------------------------------------------- /kedro_mlflow/pipeline/pipeline_ml_factory.py: -------------------------------------------------------------------------------- 1 | from kedro.pipeline import Pipeline 2 | 3 | from kedro_mlflow.pipeline.pipeline_ml import PipelineML 4 | 5 | 6 | def pipeline_ml_factory( 7 | training: Pipeline, 8 | inference: Pipeline, 9 | input_name: str = None, 10 | kpm_kwargs=None, 11 | log_model_kwargs=None, 12 | ) -> PipelineML: 13 | """This function is a helper to create a `PipelineML` 14 | object directly from two Kedro `Pipelines` (one for 15 | training and one for inference). 16 | 17 | Args: 18 | training (Pipeline): The `Pipeline` object that creates 19 | all mlflow artifacts for prediction (the model, 20 | but also encoders, binarizers, tokenizers...). 21 | These artifacts must be persisted in the catalog.yml. 22 | inference (Pipeline): A `Pipeline` object which will be 23 | stored in mlflow and uses the output(s) 24 | of the training pipeline (namely, the model) 25 | to predict the outcome. 26 | input_name (str, optional): The name of the dataset in 27 | the catalog.yml which the model's user must provide 28 | for prediction (i.e. the data). Defaults to None. 29 | kpm_kwargs: 30 | extra arguments to be passed to `KedroPipelineModel` 31 | when the PipelineML object is automatically saved at the end of a run.
32 | This includes: 33 | - `copy_mode`: the copy_mode to be used for underlying dataset 34 | when loaded in memory 35 | - `runner`: the kedro runner to run the model with 36 | log_model_kwargs: 37 | extra arguments to be passed to `mlflow.pyfunc.log_model` 38 | when the PipelineML object is automatically saved at the end of a run. 39 | See the mlflow documentation for all available options: https://www.mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.log_model 40 | 41 | Returns: 42 | PipelineML: A `PipelineML` which is automatically 43 | discovered by the `MlflowHook` and 44 | contains all the information for logging the 45 | inference pipeline as an Mlflow Model. 46 | """ 47 | 48 | pipeline = PipelineML( 49 | nodes=training.nodes, 50 | inference=inference, 51 | input_name=input_name, 52 | kpm_kwargs=kpm_kwargs, 53 | log_model_kwargs=log_model_kwargs, 54 | ) 55 | return pipeline 56 | -------------------------------------------------------------------------------- /kedro_mlflow/template/project/mlflow.yml: -------------------------------------------------------------------------------- 1 | # SERVER CONFIGURATION ------------------- 2 | 3 | # `mlflow_tracking_uri` is the path where the runs will be recorded. 4 | # For more information, see https://www.mlflow.org/docs/latest/tracking.html#where-runs-are-recorded 5 | # kedro-mlflow accepts relative paths from the project root. 6 | # For instance, the default `mlruns` will create an mlruns folder 7 | # at the root of the project 8 | 9 | # All credentials needed for mlflow must be stored in credentials.yml as a dict 10 | # they will be exported as environment variables 11 | # If you want to set some credentials, e.g. AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY 12 | # > in `credentials.yml`: 13 | # your_mlflow_credentials: 14 | # AWS_ACCESS_KEY_ID: 132456 15 | # AWS_SECRET_ACCESS_KEY: 132456 16 | # > in this file `mlflow.yml`: 17 | # credentials: mlflow_credentials 18 | 19 | server: 20 | mlflow_tracking_uri: null # if null, will use mlflow.get_tracking_uri() as a default 21 | mlflow_registry_uri: null # if null, mlflow_tracking_uri will be used as mlflow default 22 | credentials: null # must be a valid key in credentials.yml which refers to a dict of sensitive mlflow environment variables (password, tokens...). See top of the file. 23 | request_header_provider: # this is only useful to deal with expiring tokens, see https://github.com/Galileo-Galilei/kedro-mlflow/issues/357 24 | type: null # The path to a class: my_project.pipelines.module.MyClass. Should inherit from https://github.com/mlflow/mlflow/blob/master/mlflow/tracking/request_header/abstract_request_header_provider.py#L4 25 | pass_context: False # should the class be instantiated with the "kedro_context" argument? 26 | init_kwargs: {} # any kwargs to pass to the class when it is instantiated 27 | 28 | tracking: 29 | # You can specify a list of pipeline names for which tracking will be disabled 30 | # Running "kedro run --pipeline=<pipeline_name>" will not log parameters 31 | # in a new mlflow run 32 | 33 | disable_tracking: 34 | disable_autologging: True # If True, we force autologging to be disabled. This is useful on databricks, where autologging is enabled by default and conflicts with the plugin. If False, we keep the default behaviour, which is disabled by default anyway. 35 | pipelines: [] 36 | 37 | experiment: 38 | name: {{ python_package }} 39 | create_experiment_kwargs: # will be used only if the experiment does not exist yet and is created.
40 | artifact_location: null # allows you to specify an artifact location for the experiment different from the global one of the mlflow server 41 | tags: null # a dict of tags for the experiment 42 | restore_if_deleted: True # if the experiment `name` was previously deleted, should we restore it? 43 | 44 | run: 45 | id: null # if `id` is None, a new run will be created 46 | name: null # if `name` is None, pipeline name will be used for the run name. You can use "${km.random_name:}" to generate a random name (mlflow's default) 47 | nested: True # if `nested` is False, you won't be able to launch sub-runs inside your nodes 48 | 49 | params: 50 | dict_params: 51 | flatten: False # if True, parameters which are dictionaries will be split into multiple parameters when logged in mlflow, one for each key. 52 | recursive: True # Should the dictionary flattening be applied recursively (i.e. for nested dictionaries)? Not used if `flatten` is False. 53 | sep: "." # In case of recursive flattening, what separator should be used between the keys? E.g. {hyperparam1: {p1:1, p2:2}} will be logged as hyperparam1.p1 and hyperparam1.p2 in mlflow. 54 | long_params_strategy: fail # One of ["fail", "tag", "truncate"]. If a parameter is above the mlflow limit (currently 250 characters), what should kedro-mlflow do? -> fail, set it as a tag instead of a parameter, or truncate it to its first 250 characters? 55 | 56 | 57 | # UI-RELATED PARAMETERS ----------------- 58 | 59 | ui: 60 | port: "5000" # the port to use for the ui. 5000 is the mlflow default. 61 | host: "127.0.0.1" # the host to use for the ui. "127.0.0.1" is the mlflow default. 62 | -------------------------------------------------------------------------------- /kedro_mlflow/utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any, Union 3 | 4 | 5 | def _is_project(project_path: Union[str, Path]) -> bool: 6 | try: 7 | # untested in the CI, for retrocompatibility with kedro >=0.19.0,<0.19.3 8 | from kedro.framework.startup import _is_project as _ip 9 | except ImportError: 10 | from kedro.utils import _is_project as _ip 11 | 12 | return _ip(project_path) 13 | 14 | 15 | def _find_kedro_project(current_dir: Path) -> Any: 16 | try: 17 | # untested in the CI, for retrocompatibility with kedro >=0.19.0,<0.19.3 18 | from kedro.framework.startup import _find_kedro_project as _fkp 19 | except ImportError: 20 | from kedro.utils import _find_kedro_project as _fkp 21 | 22 | return _fkp(current_dir) 23 | -------------------------------------------------------------------------------- /mlc_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "aliveStatusCodes": [ 3 | 429, 4 | 200 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # PEP-518 https://peps.python.org/pep-0518/ 2 | 3 | [build-system] 4 | # Minimum requirements for the build system to execute.
5 | requires = ["setuptools>=65.5.1", "setuptools-scm>=8.0"] # PEP 518 specifications 6 | build-backend = "setuptools.build_meta" 7 | 8 | [project] 9 | name = "kedro_mlflow" 10 | authors = [ 11 | {name = "Yolan Honoré-Rougé"} 12 | ] 13 | description = "A kedro-plugin to use mlflow in your kedro projects" 14 | requires-python = ">=3.9" 15 | dependencies = [ 16 | "kedro>=0.19.0, <0.20.0", 17 | "kedro_datasets", 18 | "mlflow>=2.7.0, <3.0.0", 19 | "pydantic>=1.0.0, <3.0.0" 20 | ] 21 | keywords = [ 22 | "kedro-plugin", 23 | "kedro", 24 | "mlflow", 25 | "experiment tracking", 26 | "model versioning", 27 | "model serving", 28 | "machine learning", 29 | "data pipelines", 30 | "data science", 31 | "ml engineering", 32 | "mlops" 33 | ] 34 | license = {text = "Apache Software License (Apache 2.0)"} 35 | classifiers = [ 36 | "Development Status :: 4 - Beta", 37 | "Programming Language :: Python :: 3.9", 38 | "Programming Language :: Python :: 3.10", 39 | "Programming Language :: Python :: 3.11", 40 | "Programming Language :: Python :: 3.12", 41 | "Programming Language :: Python :: 3.13", 42 | "Framework :: Kedro", 43 | "Environment :: Plugins", 44 | "Intended Audience :: Developers", 45 | "Operating System :: Microsoft :: Windows", 46 | "Operating System :: MacOS", 47 | "Operating System :: POSIX :: Linux", 48 | ] 49 | dynamic = ["readme", "version"] 50 | 51 | [project.optional-dependencies] 52 | test = [ 53 | "pytest>=5.4.0, <9.0.0", 54 | "pytest-cov>=2.8.0, <7.0.0", 55 | "pytest-lazy-fixtures>=1.0.0, <2.0.0", # pytest==8.0.0 breaks pytest-lazy-fixture (without final S) : https://github.com/TvoroG/pytest-lazy-fixture/issues/65 56 | "pytest-mock>=3.1.0, <4.0.0", 57 | "pytest-xdist>=3.0.0,<4.0.0", # mess up the test readibility in the console but is much faster for the CI with "-n auto" option 58 | "ruff>=0.5.0,<0.10.0", # ensure consistency with pre-commit 59 | "scikit-learn>=0.23.0, <1.7.0", 60 | "kedro-datasets[pandas.CSVDataSet]", 61 | ] 62 | 63 | doc = [ 64 | "sphinx>=4.5.0,<9.0.0", 65 | "sphinx-markdown-tables~=0.0.15", 66 | "sphinx-click>=3.1,<6.1", 67 | "sphinx_copybutton~=0.5.0", 68 | "myst-parser>=0.17.2,<4.1.0", 69 | "sphinx_design>=0.6.0,<0.7.0", 70 | "pydata-sphinx-theme>=0.16.0,<0.17.0", 71 | ] 72 | dev = [ 73 | "pre-commit>=2.0.0,<5.0.0", 74 | "jupyter>=1.0.0,<2.0.0", 75 | ] 76 | 77 | all = [ "kedro_mlflow[test,doc,dev]" ] 78 | 79 | [project.urls] 80 | Source = "https://github.com/Galileo-Galilei/kedro-mlflow" 81 | Documentation = "https://kedro-mlflow.readthedocs.io/en/stable/" 82 | Tracker = "https://github.com/Galileo-Galilei/kedro-mlflow/issues" 83 | 84 | [project.entry-points."kedro.hooks"] 85 | mlflow_hook = "kedro_mlflow.framework.hooks.mlflow_hook:mlflow_hook" 86 | 87 | [project.entry-points."kedro.project_commands"] 88 | kedro_mlflow = "kedro_mlflow.framework.cli.cli:commands" 89 | 90 | [tool.setuptools] 91 | zip-safe = false 92 | 93 | [tool.setuptools.packages.find] 94 | include = ["kedro_mlflow*"] 95 | 96 | [tool.setuptools.package-data] 97 | kedro_mlflow = ["py.typed", "*.yml"] 98 | 99 | [tool.setuptools.dynamic] 100 | readme = {file = "README.md", content-type = "text/markdown"} 101 | version = {attr = "kedro_mlflow.__version__"} 102 | 103 | [tool.pytest.ini_options] 104 | addopts = "--cov=kedro_mlflow --cov-report=html tests/" 105 | 106 | [tool.ruff] 107 | exclude = [ 108 | ".bzr", 109 | ".direnv", 110 | ".eggs", 111 | ".git", 112 | ".git-rewrite", 113 | ".hg", 114 | ".ipynb_checkpoints", 115 | ".mypy_cache", 116 | ".nox", 117 | ".pants.d", 118 | ".pyenv", 119 | 
".pytest_cache", 120 | ".pytype", 121 | ".ruff_cache", 122 | ".svn", 123 | ".tox", 124 | ".venv", 125 | ".vscode", 126 | "__pypackages__", 127 | "_build", 128 | "buck-out", 129 | "build", 130 | "dist", 131 | "node_modules", 132 | "site-packages", 133 | "venv", 134 | "/template/", 135 | "debug" 136 | ] 137 | 138 | # Same as Black. 139 | line-length = 88 140 | indent-width = 4 141 | 142 | # Assume Python 3.9 143 | target-version = "py39" 144 | 145 | [tool.ruff.lint] 146 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 147 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or 148 | # McCabe complexity (`C901`) by default. 149 | select = ["E4", "E7", "E9", "F"] 150 | ignore = [] 151 | 152 | # Allow fix for all enabled rules (when `--fix`) is provided. 153 | fixable = ["ALL"] 154 | unfixable = [] 155 | 156 | # Allow unused variables when underscore-prefixed. 157 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 158 | 159 | [tool.ruff.format] 160 | # Like Black, use double quotes for strings. 161 | quote-style = "double" 162 | 163 | # Like Black, indent with spaces, rather than tabs. 164 | indent-style = "space" 165 | 166 | # Like Black, respect magic trailing commas. 167 | skip-magic-trailing-comma = false 168 | 169 | # Like Black, automatically detect the appropriate line ending. 170 | line-ending = "auto" 171 | 172 | # Enable auto-formatting of code examples in docstrings. Markdown, 173 | # reStructuredText code/literal blocks and doctests are all supported. 174 | # 175 | # This is currently disabled by default, but it is planned for this 176 | # to be opt-out in the future. 177 | docstring-code-format = false 178 | 179 | # Set the line length limit used when formatting code snippets in 180 | # docstrings. 181 | # 182 | # This only has an effect when the `docstring-code-format` setting is 183 | # enabled. 
184 | docstring-code-line-length = "dynamic" 185 | 186 | [tool.bumpversion] 187 | current_version = "0.14.4" 188 | 189 | [[tool.bumpversion.files]] 190 | filename = "kedro_mlflow/__init__.py" 191 | 192 | [[tool.bumpversion.files]] 193 | filename = "README.md" 194 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/__init__.py -------------------------------------------------------------------------------- /tests/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/config/__init__.py -------------------------------------------------------------------------------- /tests/config/test_resolvers.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | import yaml 5 | from kedro.framework.session import KedroSession 6 | from kedro.framework.startup import bootstrap_project 7 | from mlflow.utils.name_utils import ( 8 | _GENERATOR_NOUNS, 9 | _GENERATOR_PREDICATES, 10 | ) 11 | from omegaconf import OmegaConf 12 | 13 | from kedro_mlflow.config.resolvers import resolve_random_name 14 | 15 | 16 | def _write_yaml(filepath, config): 17 | yaml_str = yaml.dump(config) 18 | filepath.write_text(yaml_str) 19 | 20 | 21 | def _is_mlflow_name(name: str) -> bool: 22 | splitted_name = name.split("-") 23 | flag1 = len(splitted_name) == 3 # noqa: PLR2004 24 | flag2 = splitted_name[0] in _GENERATOR_PREDICATES 25 | flag3 = splitted_name[1] in _GENERATOR_NOUNS 26 | flag4 = re.search(pattern=r"^\d+$", string=splitted_name[2]) 27 | return all({flag1, flag2, flag3, flag4}) 28 | 29 | 30 | @pytest.fixture 31 | def kedro_project_with_random_name(kedro_project): 32 | # kedro_project is a pytest.fixture in conftest 33 | dict_config = dict( 34 | server=dict( 35 | mlflow_tracking_uri="mlruns", 36 | mlflow_registry_uri=None, 37 | credentials=None, 38 | request_header_provider=dict(type=None, pass_context=False, init_kwargs={}), 39 | ), 40 | tracking=dict( 41 | disable_tracking=dict(pipelines=["my_disabled_pipeline"]), 42 | experiment=dict(name="fake_package", restore_if_deleted=True), 43 | run=dict(id="123456789", name="${km.random_name:}", nested=True), 44 | params=dict( 45 | dict_params=dict( 46 | flatten=True, 47 | recursive=False, 48 | sep="-", 49 | ), 50 | long_params_strategy="truncate", 51 | ), 52 | ), 53 | ui=dict(port="5151", host="localhost"), 54 | ) 55 | 56 | _write_yaml(kedro_project / "conf" / "local" / "mlflow.yml", dict_config) 57 | expected = dict_config.copy() 58 | expected["server"]["mlflow_tracking_uri"] = (kedro_project / "mlruns").as_uri() 59 | return kedro_project 60 | 61 | 62 | def test_resolve_random_name_is_valid_mlflow_name(): 63 | random_name = resolve_random_name() 64 | assert _is_mlflow_name(random_name) 65 | 66 | 67 | def test_resolve_random_name_is_registered(kedro_project_with_random_name): 68 | bootstrap_project(kedro_project_with_random_name) 69 | with KedroSession.create(project_path=kedro_project_with_random_name) as session: 70 | session.load_context() 71 | assert OmegaConf.has_resolver("km.random_name") 72 | 73 | 74 | def test_resolve_random_name_is_called_in_project(kedro_project_with_random_name): 75 | 
bootstrap_project(kedro_project_with_random_name) 76 | with KedroSession.create(project_path=kedro_project_with_random_name) as session: 77 | context = session.load_context() 78 | assert _is_mlflow_name(context.mlflow.tracking.run.name) 79 | 80 | 81 | @pytest.mark.skip(reason="kedro 0.19.2 does not take use_cache into account") 82 | def test_resolve_random_name_is_idempotent(kedro_project_with_random_name): 83 | bootstrap_project(kedro_project_with_random_name) 84 | with KedroSession.create(project_path=kedro_project_with_random_name) as session: 85 | context = session.load_context() 86 | assert ( 87 | context.config_loader["mlflow"]["tracking"]["run"]["name"] 88 | == context.config_loader["mlflow"]["tracking"]["run"]["name"] 89 | ) # when called twice, should be different is no use_cache because the resolver is random 90 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | import mlflow 5 | import pytest 6 | from cookiecutter.main import cookiecutter 7 | from kedro import __version__ as kedro_version 8 | from kedro.framework.cli.starters import TEMPLATE_PATH 9 | from mlflow import MlflowClient 10 | 11 | from kedro_mlflow.framework.cli.cli import TEMPLATE_FOLDER_PATH 12 | from kedro_mlflow.framework.cli.cli_utils import write_jinja_template 13 | 14 | _FAKE_PROJECT_NAME = "fake_project" 15 | 16 | 17 | @pytest.fixture 18 | def tracking_uri(tmp_path): 19 | tracking_uri = (tmp_path / "mlruns").as_uri() 20 | return tracking_uri 21 | 22 | 23 | @pytest.fixture 24 | def mlflow_client(tracking_uri): 25 | mlflow.set_tracking_uri(tracking_uri) 26 | client = MlflowClient(tracking_uri) 27 | return client 28 | 29 | 30 | @pytest.fixture(autouse=True) 31 | def cleanup_mlflow_after_runs(): 32 | yield # A test function will be run at this point 33 | while mlflow.active_run(): 34 | mlflow.end_run() 35 | 36 | # if set_experiment has been called before, it stores the experiment_id 37 | # as a global variable, so if we change the tracking_uri afterwards 38 | # mlflow is completly lost because the experiment id no longer exists 39 | # we just reset it after a test, like in a brand new session 40 | 41 | # CAVEAT 1 : do not import from "mlflow.tracking.fluent import _active_experiment_id" 42 | # because due to python namespacing import, it will not change the global variable accessed by mlflow 43 | 44 | # CAVEAT 2 : Since this PR: https://github.com/mlflow/mlflow/pull/13456/files 45 | # we need to reset experiment ID too because its now resetted in each thread 46 | mlflow.tracking.fluent._active_experiment_id = None 47 | os.environ.pop("MLFLOW_EXPERIMENT_ID", None) 48 | os.environ.pop("MLFLOW_TRACKING_URI", None) 49 | os.environ.pop("MLFLOW_REGISTRY_URI", None) 50 | 51 | # see https://github.com/kedro-org/kedro/blob/859f98217eed12208a922b771a97cbfb82ba7e80/tests/framework/session/test_session.py#L173 52 | 53 | 54 | @pytest.fixture 55 | def kedro_project(tmp_path): 56 | # TODO : this is also an integration test since this depends from the kedro version 57 | config = { 58 | # "output_dir": tmp_path, 59 | "project_name": _FAKE_PROJECT_NAME, 60 | "repo_name": _FAKE_PROJECT_NAME, 61 | "python_package": _FAKE_PROJECT_NAME, 62 | "kedro_version": kedro_version, 63 | "tools": "['None']", 64 | "example_pipeline": "False", 65 | } 66 | 67 | cookiecutter( 68 | str(TEMPLATE_PATH), 69 | output_dir=tmp_path, # config["output_dir"], 70 | no_input=True, 71 | 
extra_context=config, 72 | accept_hooks=False, 73 | ) 74 | 75 | shutil.rmtree( 76 | tmp_path / _FAKE_PROJECT_NAME / "tests" 77 | ) # avoid conflicts with pytest 78 | 79 | return tmp_path / _FAKE_PROJECT_NAME 80 | 81 | 82 | @pytest.fixture 83 | def kedro_project_with_mlflow_conf(kedro_project): 84 | write_jinja_template( 85 | src=TEMPLATE_FOLDER_PATH / "mlflow.yml", 86 | is_cookiecutter=False, 87 | dst=kedro_project / "conf" / "local" / "mlflow.yml", 88 | python_package="fake_project", 89 | ) 90 | 91 | return kedro_project 92 | -------------------------------------------------------------------------------- /tests/framework/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/framework/__init__.py -------------------------------------------------------------------------------- /tests/framework/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/framework/cli/__init__.py -------------------------------------------------------------------------------- /tests/framework/cli/test_cli_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from kedro_mlflow.framework.cli.cli_utils import ( 4 | render_jinja_template, 5 | write_jinja_template, 6 | ) 7 | 8 | 9 | @pytest.fixture 10 | def template_path(tmp_path): 11 | return tmp_path / "template.py" 12 | 13 | 14 | @pytest.fixture 15 | def jinja_template(template_path): 16 | with open(template_path, "w") as file_handler: 17 | file_handler.write("fake file\n which contains {{ fake_tag }}. Nice, isn't it?") 18 | return "fake file\n which contains 'Hello world!'. Nice, isn't it?" 19 | 20 | 21 | @pytest.fixture 22 | def cookiecutter_template(template_path): 23 | with open(template_path, "w") as file_handler: 24 | file_handler.write( 25 | "fake file\n which contains {{ cookiecutter.fake_tag }}. Nice, isn't it?" 26 | ) 27 | return "fake file\n which contains 'Hello world!'. Nice, isn't it?" 
28 | 29 | 30 | def test_render_jinja_template(template_path, jinja_template): 31 | rendered = render_jinja_template(src=template_path, fake_tag="'Hello world!'") 32 | assert rendered == jinja_template 33 | 34 | 35 | def test_render_jinja_template_with_cookiecutter_tags( 36 | template_path, cookiecutter_template 37 | ): 38 | rendered = render_jinja_template( 39 | src=template_path, fake_tag="'Hello world!'", is_cookiecutter=True 40 | ) 41 | assert rendered == cookiecutter_template 42 | 43 | 44 | def test_write_jinja_template(tmp_path, template_path, jinja_template): 45 | rendered_path = tmp_path / "rendered.py" 46 | write_jinja_template( 47 | src=template_path, dst=rendered_path, fake_tag="'Hello world!'" 48 | ) 49 | with open(rendered_path) as file_handler: 50 | rendered = file_handler.read() 51 | assert rendered == jinja_template 52 | 53 | 54 | def test_write_jinja_template_with_cookiecutter_tags( 55 | tmp_path, template_path, cookiecutter_template 56 | ): 57 | rendered_path = tmp_path / "rendered.py" 58 | write_jinja_template( 59 | src=template_path, 60 | dst=rendered_path, 61 | is_cookiecutter=True, 62 | fake_tag="'Hello world!'", 63 | ) 64 | with open(rendered_path) as file_handler: 65 | rendered = file_handler.read() 66 | assert rendered == cookiecutter_template 67 | -------------------------------------------------------------------------------- /tests/framework/hooks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/framework/hooks/__init__.py -------------------------------------------------------------------------------- /tests/framework/hooks/test_hook_active_run.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | import pytest 3 | from kedro.framework.session import KedroSession 4 | from kedro.framework.startup import bootstrap_project 5 | from kedro.io import DataCatalog, MemoryDataset 6 | from kedro.pipeline import Pipeline, node 7 | 8 | from kedro_mlflow.framework.hooks import MlflowHook 9 | 10 | 11 | @pytest.fixture 12 | def dummy_run_params(tmp_path): 13 | dummy_run_params = { 14 | "run_id": "", 15 | "project_path": tmp_path.as_posix(), 16 | "env": "local", 17 | "kedro_version": "0.16.5", 18 | "tags": [], 19 | "from_nodes": [], 20 | "to_nodes": [], 21 | "node_names": [], 22 | "from_inputs": [], 23 | "load_versions": [], 24 | "pipeline_name": "my_cool_pipeline", 25 | "extra_params": [], 26 | } 27 | return dummy_run_params 28 | 29 | 30 | @pytest.fixture 31 | def dummy_node(): 32 | def fake_fun(arg1, arg2, arg3): 33 | return None 34 | 35 | node_test = node( 36 | func=fake_fun, 37 | inputs={"arg1": "params:param1", "arg2": "foo", "arg3": "parameters"}, 38 | outputs="out", 39 | ) 40 | 41 | return node_test 42 | 43 | 44 | @pytest.fixture 45 | def dummy_pipeline(dummy_node): 46 | dummy_pipeline = Pipeline([dummy_node]) 47 | 48 | return dummy_pipeline 49 | 50 | 51 | @pytest.fixture 52 | def dummy_catalog(): 53 | catalog = DataCatalog( 54 | { 55 | "params:param1": 1, 56 | "foo": MemoryDataset(), 57 | "bar": MemoryDataset(), 58 | "parameters": {"param1": 1, "param2": 2}, 59 | } 60 | ) 61 | 62 | return catalog 63 | 64 | 65 | def test_hook_use_active_run_if_exist_and_do_not_close( 66 | kedro_project, 67 | dummy_run_params, 68 | dummy_pipeline, 69 | dummy_catalog, 70 | ): 71 | mlflow.set_tracking_uri(f"file:///{kedro_project}/mlruns") 72 | with mlflow.start_run(): 73 | 
mlflow_run_id = mlflow.active_run().info.run_id 74 | bootstrap_project(kedro_project) 75 | with KedroSession.create( 76 | project_path=kedro_project, 77 | ) as session: 78 | context = session.load_context() 79 | 80 | mlflow_node_hook = MlflowHook() 81 | mlflow_node_hook.after_context_created(context) 82 | mlflow_node_hook.before_pipeline_run( 83 | run_params=dummy_run_params, 84 | pipeline=dummy_pipeline, 85 | catalog=dummy_catalog, 86 | ) 87 | # check after before_pipeline_run, we should still have the same run 88 | assert mlflow.active_run().info.run_id == mlflow_run_id 89 | 90 | mlflow_node_hook.after_pipeline_run( 91 | run_params=dummy_run_params, 92 | pipeline=dummy_pipeline, 93 | catalog=dummy_catalog, 94 | ) 95 | # the run must still be open 96 | assert mlflow.active_run().info.run_id == mlflow_run_id 97 | 98 | mlflow_node_hook.on_pipeline_error( 99 | error=ValueError, 100 | run_params=dummy_run_params, 101 | pipeline=dummy_pipeline, 102 | catalog=dummy_catalog, 103 | ) 104 | # the run must still be open 105 | assert mlflow.active_run().info.run_id == mlflow_run_id 106 | 107 | 108 | def test_hook_active_run_exists_with_different_tracking_uri( 109 | kedro_project, 110 | dummy_run_params, 111 | dummy_pipeline, 112 | dummy_catalog, 113 | ): 114 | # tracking uri is "mlruns2", not "mlruns" 115 | mlflow.set_tracking_uri(f"file:///{kedro_project}/mlruns2") 116 | with mlflow.start_run(): 117 | mlflow_run_id = mlflow.active_run().info.run_id 118 | bootstrap_project(kedro_project) 119 | with KedroSession.create( 120 | project_path=kedro_project, 121 | ) as session: 122 | context = session.load_context() 123 | 124 | mlflow_node_hook = MlflowHook() 125 | mlflow_node_hook.after_context_created(context) 126 | 127 | mlflow.log_param("a", "1") # emulate param logging 128 | # the config should be modified 129 | assert ( 130 | mlflow_node_hook.mlflow_config.server.mlflow_tracking_uri 131 | == f"file:///{kedro_project}/mlruns2" 132 | ) 133 | assert mlflow_node_hook.mlflow_config.tracking.experiment.name == "Default" 134 | assert mlflow_node_hook.mlflow_config.tracking.run.id == mlflow_run_id 135 | 136 | assert mlflow.get_tracking_uri() == f"file:///{kedro_project}/mlruns2" 137 | 138 | # mlflow.active_run() does not have all data, we should get it trhough the client: https://www.mlflow.org/docs/latest/python_api/mlflow.html#mlflow.active_run 139 | active_run = mlflow_node_hook.mlflow_config.server._mlflow_client.get_run( 140 | mlflow.active_run().info.run_id 141 | ) 142 | assert active_run.data.params == {"a": "1"} 143 | -------------------------------------------------------------------------------- /tests/framework/hooks/test_hook_log_artifact.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | import pandas as pd 3 | import pytest 4 | from kedro.framework.hooks import _create_hook_manager 5 | from kedro.framework.hooks.manager import _register_hooks 6 | from kedro.framework.session import KedroSession 7 | from kedro.framework.startup import bootstrap_project 8 | from kedro.io import DataCatalog, MemoryDataset 9 | from kedro.pipeline import Pipeline, node 10 | from kedro.runner import ThreadRunner 11 | from kedro_datasets.pickle import PickleDataset 12 | 13 | from kedro_mlflow.framework.hooks.mlflow_hook import MlflowHook 14 | from kedro_mlflow.io.artifacts import MlflowArtifactDataset 15 | 16 | 17 | @pytest.fixture 18 | def dummy_pipeline(): 19 | def preprocess_fun(data): 20 | return data 21 | 22 | def train_fun(data): 23 | return 2 24 | 25 | 
dummy_pipeline = Pipeline( 26 | [ 27 | node( 28 | func=preprocess_fun, 29 | inputs="raw_data", 30 | outputs="data", 31 | ), 32 | node( 33 | func=train_fun, 34 | inputs=["data"], 35 | outputs="model", 36 | ), 37 | ] 38 | ) 39 | return dummy_pipeline 40 | 41 | 42 | @pytest.fixture 43 | def dummy_catalog(tmp_path): 44 | dummy_catalog = DataCatalog( 45 | { 46 | "raw_data": MemoryDataset(pd.DataFrame(data=[1], columns=["a"])), 47 | "data": MemoryDataset(), 48 | "model": MlflowArtifactDataset( 49 | dataset=dict( 50 | type=PickleDataset, filepath=(tmp_path / "model.csv").as_posix() 51 | ) 52 | ), 53 | } 54 | ) 55 | return dummy_catalog 56 | 57 | 58 | @pytest.fixture 59 | def dummy_run_params(tmp_path): 60 | dummy_run_params = { 61 | "project_path": tmp_path.as_posix(), 62 | "env": "local", 63 | "kedro_version": "0.16.0", 64 | "tags": [], 65 | "from_nodes": [], 66 | "to_nodes": [], 67 | "node_names": [], 68 | "from_inputs": [], 69 | "load_versions": [], 70 | "pipeline_name": "my_cool_pipeline", 71 | "extra_params": [], 72 | } 73 | return dummy_run_params 74 | 75 | 76 | def test_mlflow_hook_log_artifacts_within_same_run_with_thread_runner( 77 | kedro_project, dummy_run_params, dummy_pipeline, dummy_catalog 78 | ): 79 | # this test is very specific to a new design introduced in mlflow 2.18 to make it thread safe 80 | # see https://github.com/Galileo-Galilei/kedro-mlflow/issues/613 81 | bootstrap_project(kedro_project) 82 | 83 | with KedroSession.create(project_path=kedro_project) as session: 84 | context = session.load_context() # setup mlflow 85 | 86 | mlflow_hook = MlflowHook() 87 | runner = ThreadRunner() # this is what we want to test 88 | 89 | mlflow_hook.after_context_created(context) 90 | mlflow_hook.after_catalog_created( 91 | catalog=dummy_catalog, 92 | # `after_catalog_created` is not using any of arguments bellow, 93 | # so we are setting them to empty values. 
94 | conf_catalog={}, 95 | conf_creds={}, 96 | feed_dict={}, 97 | save_version="", 98 | load_versions="", 99 | ) 100 | mlflow_hook.before_pipeline_run( 101 | run_params=dummy_run_params, 102 | pipeline=dummy_pipeline, 103 | catalog=dummy_catalog, 104 | ) 105 | 106 | # we get the run id BEFORE running the pipeline because it was modified in different thread 107 | run_id_before_run = mlflow.active_run().info.run_id 108 | 109 | hook_manager = _create_hook_manager() 110 | _register_hooks(hook_manager, (mlflow_hook,)) 111 | 112 | runner.run(dummy_pipeline, dummy_catalog, hook_manager) 113 | 114 | run_id_after_run = mlflow.active_run().info.run_id 115 | 116 | # CHECK 1: check that we are not on the second id created by the thread.lock() 117 | assert run_id_before_run == run_id_after_run 118 | 119 | mlflow_hook.after_pipeline_run( 120 | run_params=dummy_run_params, 121 | pipeline=dummy_pipeline, 122 | catalog=dummy_catalog, 123 | ) 124 | 125 | mlflow_client = context.mlflow.server._mlflow_client 126 | 127 | # check that the artifact is assocaied to the initial run: 128 | 129 | artifacts_list = mlflow_client.list_artifacts(run_id_before_run) 130 | assert len(artifacts_list) == 1 131 | -------------------------------------------------------------------------------- /tests/framework/hooks/test_hook_on_pipeline_error.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Iterable, Optional 2 | 3 | import mlflow 4 | import pytest 5 | from kedro.config import AbstractConfigLoader, OmegaConfigLoader 6 | from kedro.framework.hooks import hook_impl 7 | from kedro.framework.project import Validator, _ProjectPipelines, _ProjectSettings 8 | from kedro.framework.session import KedroSession 9 | from kedro.framework.startup import bootstrap_project 10 | from kedro.io import DataCatalog 11 | from kedro.pipeline import Pipeline, node 12 | from mlflow.entities import RunStatus 13 | from mlflow.tracking import MlflowClient 14 | 15 | from kedro_mlflow.framework.hooks.mlflow_hook import MlflowHook 16 | 17 | 18 | class DummyProjectHooks: 19 | @hook_impl 20 | def register_config_loader(self, conf_paths: Iterable[str]) -> AbstractConfigLoader: 21 | return OmegaConfigLoader(conf_paths) 22 | 23 | @hook_impl 24 | def register_catalog( 25 | self, 26 | catalog: Optional[dict[str, dict[str, Any]]], 27 | credentials: dict[str, dict[str, Any]], 28 | load_versions: dict[str, str], 29 | save_version: str, 30 | ) -> DataCatalog: 31 | return DataCatalog.from_config( 32 | catalog, credentials, load_versions, save_version 33 | ) 34 | 35 | 36 | def _mock_imported_settings_paths(mocker, mock_settings): 37 | for path in [ 38 | "kedro.framework.context.context.settings", 39 | "kedro.framework.session.session.settings", 40 | "kedro.framework.project.settings", 41 | ]: 42 | mocker.patch(path, mock_settings) 43 | return mock_settings 44 | 45 | 46 | def _mock_settings_with_hooks(mocker, hooks): 47 | class MockSettings(_ProjectSettings): 48 | _HOOKS = Validator("HOOKS", default=hooks) 49 | 50 | return _mock_imported_settings_paths(mocker, MockSettings()) 51 | 52 | 53 | @pytest.fixture 54 | def mock_settings_with_mlflow_hooks(mocker): 55 | return _mock_settings_with_hooks( 56 | mocker, 57 | hooks=( 58 | DummyProjectHooks(), 59 | MlflowHook(), 60 | ), 61 | ) 62 | 63 | 64 | @pytest.fixture 65 | def mock_failing_pipeline(mocker): 66 | def failing_node(): 67 | mlflow.start_run(nested=True) 68 | raise ValueError("Let's make this pipeline fail") 69 | 70 | def mocked_register_pipelines(): 71 | 
failing_pipeline = Pipeline( 72 | [ 73 | node( 74 | func=failing_node, 75 | inputs=None, 76 | outputs="fake_output", 77 | ) 78 | ] 79 | ) 80 | return {"__default__": failing_pipeline, "pipeline_off": failing_pipeline} 81 | 82 | mocker.patch.object( 83 | _ProjectPipelines, 84 | "_get_pipelines_registry_callable", 85 | return_value=mocked_register_pipelines, 86 | ) 87 | 88 | 89 | # @pytest.mark.usefixtures("mock_settings_with_mlflow_hooks") 90 | @pytest.mark.usefixtures("mock_failing_pipeline") 91 | def test_on_pipeline_error(kedro_project_with_mlflow_conf): 92 | tracking_uri = (kedro_project_with_mlflow_conf / "mlruns").as_uri() 93 | 94 | bootstrap_project(kedro_project_with_mlflow_conf) 95 | with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session: 96 | context = session.load_context() 97 | from logging import getLogger 98 | 99 | LOGGER = getLogger(__name__) 100 | LOGGER.info(f"{mlflow.active_run()=}") 101 | with pytest.raises(ValueError): 102 | LOGGER.info(f"{mlflow.active_run()=}") 103 | session.run() 104 | 105 | # the run we want is the last one in the configuration experiment 106 | mlflow_client = MlflowClient(tracking_uri) 107 | experiment = mlflow_client.get_experiment_by_name( 108 | context.mlflow.tracking.experiment.name 109 | ) 110 | failing_run_info = mlflow_client.search_runs(experiment.experiment_id)[-1].info 111 | 112 | assert mlflow.active_run() is None # the run must have been closed 113 | assert failing_run_info.status == RunStatus.to_string( 114 | RunStatus.FAILED 115 | ) # it must be marked as failed 116 | -------------------------------------------------------------------------------- /tests/framework/hooks/test_run_name.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | import pytest 3 | from kedro.framework.session import KedroSession 4 | from kedro.framework.startup import bootstrap_project 5 | from kedro.io import DataCatalog 6 | from kedro.pipeline import Pipeline 7 | 8 | from kedro_mlflow.framework.hooks import MlflowHook 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "pipeline_name,expected_mlflow_run_name", 13 | [ 14 | ("my_cool_pipeline", "my_cool_pipeline"), 15 | ("__default__", "__default__"), 16 | (None, "__default__"), 17 | ], 18 | ) 19 | def test_pipeline_use_pipeline_name_as_run_name( 20 | kedro_project, pipeline_name, expected_mlflow_run_name 21 | ): 22 | dummy_run_params = { 23 | "run_id": "1234", 24 | "project_path": "path/to/project", 25 | "env": "local", 26 | "kedro_version": "X.Y.Z", 27 | "tags": [], 28 | "from_nodes": [], 29 | "to_nodes": [], 30 | "node_names": [], 31 | "from_inputs": [], 32 | "load_versions": [], 33 | "pipeline_name": pipeline_name, 34 | "extra_params": [], 35 | } 36 | 37 | bootstrap_project(kedro_project) 38 | with KedroSession.create( 39 | project_path=kedro_project, 40 | ) as session: 41 | context = session.load_context() 42 | 43 | mlflow_node_hook = MlflowHook() 44 | mlflow_node_hook.after_context_created(context) 45 | mlflow_node_hook.before_pipeline_run( 46 | run_params=dummy_run_params, pipeline=Pipeline([]), catalog=DataCatalog() 47 | ) 48 | 49 | assert ( 50 | mlflow.active_run().data.tags["mlflow.runName"] == expected_mlflow_run_name 51 | ) 52 | -------------------------------------------------------------------------------- /tests/framework/hooks/test_utils_flatten_dict.py: -------------------------------------------------------------------------------- 1 | from kedro_mlflow.framework.hooks.utils import _flatten_dict 2 | 3 | 4 | def 
test_flatten_dict_non_nested(): 5 | d = dict(a=1, b=2) 6 | assert _flatten_dict(d=d, recursive=True, sep=".") == d 7 | assert _flatten_dict(d=d, recursive=False, sep=".") == d 8 | 9 | 10 | def test_flatten_dict_nested_1_level(): 11 | d = dict(a=1, b=dict(c=3, d=4)) 12 | flattened = {"a": 1, "b.c": 3, "b.d": 4} 13 | assert _flatten_dict(d=d, recursive=True, sep=".") == flattened 14 | assert _flatten_dict(d=d, recursive=False, sep=".") == flattened 15 | 16 | 17 | def test_flatten_dict_nested_2_levels(): 18 | d = dict(a=1, b=dict(c=1, d=dict(e=3, f=5))) 19 | 20 | assert _flatten_dict(d=d, recursive=True, sep=".") == { 21 | "a": 1, 22 | "b.c": 1, 23 | "b.d.e": 3, 24 | "b.d.f": 5, 25 | } 26 | assert _flatten_dict(d=d, recursive=False, sep=".") == { 27 | "a": 1, 28 | "b.c": 1, 29 | "b.d": {"e": 3, "f": 5}, 30 | } 31 | 32 | 33 | def test_flatten_dict_nested_3_levels(): 34 | d = dict(a=1, b=dict(c=1, d=dict(e=3, f=dict(g=4, h=5)))) 35 | 36 | assert _flatten_dict(d=d, recursive=True, sep=".") == { 37 | "a": 1, 38 | "b.c": 1, 39 | "b.d.e": 3, 40 | "b.d.f.g": 4, 41 | "b.d.f.h": 5, 42 | } 43 | assert _flatten_dict(d=d, recursive=False, sep=".") == { 44 | "a": 1, 45 | "b.c": 1, 46 | "b.d": {"e": 3, "f": {"g": 4, "h": 5}}, 47 | } 48 | 49 | 50 | def test_flatten_dict_with_float_keys(): 51 | d = {0: 1, 1: {3: 1, 4: {"e": 3, 6.7: 5}}} 52 | 53 | assert _flatten_dict(d=d, recursive=True, sep="_") == { 54 | "0": 1, 55 | "1_3": 1, 56 | "1_4_e": 3, 57 | "1_4_6.7": 5, 58 | } 59 | assert ( 60 | _flatten_dict(d=d, recursive=False, sep="_") 61 | == { 62 | "0": 1, 63 | "1_3": 1, 64 | "1_4": { 65 | "e": 3, 66 | 6.7: 5, # 6.7 is not converted to string, but when the entire dict will be logged mlflow will take care of the conversion 67 | }, 68 | } 69 | ) 70 | 71 | 72 | def test_flatten_dict_with_used_defined_sep(): 73 | d = dict(a=1, b=dict(c=1, d=dict(e=3, f=dict(g=4, h=5)))) 74 | 75 | assert _flatten_dict(d=d, recursive=True, sep="_") == { 76 | "a": 1, 77 | "b_c": 1, 78 | "b_d_e": 3, 79 | "b_d_f_g": 4, 80 | "b_d_f_h": 5, 81 | } 82 | -------------------------------------------------------------------------------- /tests/framework/hooks/test_utils_generate_kedro_command.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from kedro_mlflow.framework.hooks.utils import _generate_kedro_command 4 | 5 | 6 | def test_generate_kedro_commands(): 7 | # TODO : add a better test because the formatting of record_data is subject to change 8 | # We could check that the command is recored and then rerun properly 9 | record_data = { 10 | "tags": ["tag1", "tag2"], 11 | "from_nodes": ["node1"], 12 | "to_nodes": ["node3"], 13 | "node_names": ["node1", "node2", "node1"], 14 | "from_inputs": ["data_in"], 15 | "load_versions": {"data_inter": "01:23:45"}, 16 | "pipeline_name": "fake_pl", 17 | } 18 | 19 | expected = "kedro run --from-inputs=data_in --from-nodes=node1 --to-nodes=node3 --node=node1,node2,node1 --pipeline=fake_pl --tag=tag1,tag2 --load-version=data_inter:01:23:45" 20 | assert _generate_kedro_command(**record_data) == expected 21 | 22 | 23 | @pytest.mark.parametrize("default_value", [None, []]) 24 | def test_generate_default_kedro_commands(default_value): 25 | """This test ensures that the _generate_kedro_comands accepts both 26 | `None` and empty `list` as default value, because CLI and interactive 27 | `Journal` do not use the same default. 
28 | 29 | Args: 30 | default_value ([type]): [description] 31 | """ 32 | record_data = { 33 | "tags": default_value, 34 | "from_nodes": default_value, 35 | "to_nodes": default_value, 36 | "node_names": default_value, 37 | "from_inputs": default_value, 38 | "load_versions": default_value, 39 | "pipeline_name": "fake_pl", 40 | } 41 | 42 | expected = "kedro run --pipeline=fake_pl" 43 | assert _generate_kedro_command(**record_data) == expected 44 | -------------------------------------------------------------------------------- /tests/io/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/io/__init__.py -------------------------------------------------------------------------------- /tests/io/artifacts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/io/artifacts/__init__.py -------------------------------------------------------------------------------- /tests/io/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/io/metrics/__init__.py -------------------------------------------------------------------------------- /tests/io/metrics/test_mlflow_metric_history_dataset.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | import pytest 3 | from mlflow.tracking import MlflowClient 4 | 5 | from kedro_mlflow.io.metrics import MlflowMetricHistoryDataset 6 | 7 | 8 | @pytest.fixture 9 | def mlflow_tracking_uri(tmp_path): 10 | tracking_uri = (tmp_path / "mlruns").as_uri() 11 | mlflow.set_tracking_uri(tracking_uri) 12 | return tracking_uri 13 | 14 | 15 | @pytest.fixture 16 | def mlflow_client(mlflow_tracking_uri): 17 | mlflow_client = MlflowClient(mlflow_tracking_uri) 18 | return mlflow_client 19 | 20 | 21 | @pytest.mark.parametrize( 22 | "save_mode,load_mode", 23 | [ 24 | ("list", "list"), 25 | ("list", "dict"), 26 | ("dict", "list"), 27 | ("dict", "dict"), 28 | ("history", "list"), 29 | ("history", "dict"), 30 | ("history", "history"), 31 | ], 32 | ) 33 | def test_mlflow_metric_history_dataset_save_load(mlflow_client, save_mode, load_mode): 34 | metric_as_list = [0.3, 0.2, 0.1, 0.15, 0.05] 35 | metric_as_dict = dict(enumerate(metric_as_list)) 36 | metric_as_history = [ 37 | {"step": i, "value": value, "timestamp": 1630235933 + i} 38 | for i, value in metric_as_dict.items() 39 | ] 40 | 41 | mode_metrics_mapping = { 42 | "list": metric_as_list, 43 | "dict": metric_as_dict, 44 | "history": metric_as_history, 45 | } 46 | 47 | metric_ds_model_local_filesystem = MlflowMetricHistoryDataset( 48 | key="my_metric", save_args={"mode": save_mode} 49 | ) 50 | with mlflow.start_run(): 51 | metric_ds_model_local_filesystem.save(mode_metrics_mapping[save_mode]) 52 | run_id = mlflow.active_run().info.run_id 53 | 54 | # check existence 55 | run = mlflow_client.get_run(run_id) 56 | assert "my_metric" in run.data.metrics.keys() 57 | 58 | metric_ds_loader = MlflowMetricHistoryDataset( 59 | key="my_metric", run_id=run_id, load_args={"mode": load_mode} 60 | ) 61 | 62 | assert metric_ds_loader.load() == mode_metrics_mapping[load_mode] 63 | 64 | 65 | def 
test_mlflow_metric_history_dataset_logging_deactivation(mlflow_tracking_uri): 66 | metric_ds = MlflowMetricHistoryDataset(key="inactive_metric") 67 | metric_ds._logging_activated = False 68 | with mlflow.start_run(): 69 | metric_ds.save([0.1]) 70 | assert metric_ds._exists() is False 71 | 72 | 73 | @pytest.mark.parametrize( 74 | "metadata", 75 | ( 76 | None, 77 | {"description": "My awsome dataset"}, 78 | {"string": "bbb", "int": 0}, 79 | ), 80 | ) 81 | def test_metric_history_dataset_with_metadata(tmp_path, metadata): 82 | metric_ds = MlflowMetricHistoryDataset( 83 | key="hello", 84 | metadata=metadata, 85 | ) 86 | 87 | assert metric_ds.metadata == metadata 88 | 89 | # Metadata should not show in _describe 90 | assert "metadata" not in metric_ds._describe() 91 | -------------------------------------------------------------------------------- /tests/io/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/io/models/__init__.py -------------------------------------------------------------------------------- /tests/io/models/test_mlflow_model_local_filesystem_dataset.py: -------------------------------------------------------------------------------- 1 | from tempfile import TemporaryDirectory 2 | 3 | import mlflow 4 | import pandas as pd 5 | import pytest 6 | from kedro.io import DataCatalog, MemoryDataset 7 | from kedro.pipeline import Pipeline, node 8 | from kedro_datasets.pickle import PickleDataset 9 | from pytest_lazy_fixtures import lf 10 | from sklearn.linear_model import LinearRegression 11 | 12 | from kedro_mlflow.io.models import MlflowModelLocalFileSystemDataset 13 | from kedro_mlflow.mlflow import KedroPipelineModel 14 | from kedro_mlflow.pipeline import pipeline_ml_factory 15 | 16 | 17 | @pytest.fixture 18 | def linreg_model(): 19 | linreg_model = LinearRegression() 20 | return linreg_model 21 | 22 | 23 | @pytest.fixture 24 | def tmp_folder(): 25 | tmp_folder = TemporaryDirectory() 26 | return tmp_folder 27 | 28 | 29 | @pytest.fixture 30 | def linreg_path(tmp_path): 31 | linreg_path = tmp_path / "data" / "06_models" / "linreg" 32 | return linreg_path 33 | 34 | 35 | @pytest.fixture 36 | def pipeline_ml_obj(): 37 | def preprocess_fun(data): 38 | return data 39 | 40 | def fit_fun(data): 41 | return 2 42 | 43 | def predict_fun(model, data): 44 | return data * model 45 | 46 | full_pipeline = Pipeline( 47 | [ 48 | node( 49 | func=preprocess_fun, 50 | inputs="raw_data", 51 | outputs="data", 52 | tags=["inference", "training"], 53 | ), 54 | node(func=fit_fun, inputs="data", outputs="model", tags=["training"]), 55 | node( 56 | func=predict_fun, 57 | inputs=["data", "model"], 58 | outputs="predictions", 59 | tags=["inference"], 60 | ), 61 | ] 62 | ) 63 | 64 | pipeline_ml_obj = pipeline_ml_factory( 65 | training=full_pipeline.only_nodes_with_tags("training"), 66 | inference=full_pipeline.only_nodes_with_tags("inference"), 67 | input_name="raw_data", 68 | ) 69 | 70 | return pipeline_ml_obj 71 | 72 | 73 | @pytest.fixture 74 | def pipeline_inference(pipeline_ml_obj): 75 | return pipeline_ml_obj.inference 76 | 77 | 78 | @pytest.fixture 79 | def dummy_catalog(tmp_path): 80 | dummy_catalog = DataCatalog( 81 | { 82 | "raw_data": MemoryDataset(), 83 | "data": MemoryDataset(), 84 | "model": PickleDataset( 85 | filepath=(tmp_path / "data" / "06_models" / "model.pkl") 86 | .resolve() 87 | .as_posix() 88 | ), 89 | } 90 | ) 91 | 
dummy_catalog._datasets["model"].save(2) # emulate model fitting 92 | 93 | return dummy_catalog 94 | 95 | 96 | @pytest.fixture 97 | def kedro_pipeline_model(tmp_path, pipeline_ml_obj, dummy_catalog): 98 | kedro_pipeline_model = KedroPipelineModel( 99 | pipeline=pipeline_ml_obj, 100 | catalog=dummy_catalog, 101 | input_name=pipeline_ml_obj.input_name, 102 | ) 103 | 104 | return kedro_pipeline_model 105 | 106 | 107 | def test_save_unversioned_under_same_path( 108 | linreg_path, 109 | linreg_model, 110 | ): 111 | model_config = { 112 | "name": "linreg", 113 | "config": { 114 | "type": "kedro_mlflow.io.models.MlflowModelLocalFileSystemDataset", 115 | "flavor": "mlflow.sklearn", 116 | "filepath": linreg_path.as_posix(), 117 | }, 118 | } 119 | mlflow_model_ds = MlflowModelLocalFileSystemDataset.from_config(**model_config) 120 | mlflow_model_ds.save(linreg_model) 121 | # check that second save does not fail 122 | # this happens if the underlying folder already exists 123 | mlflow_model_ds.save(linreg_model) 124 | 125 | 126 | @pytest.mark.parametrize("versioned", [False, True]) 127 | def test_save_load_local(linreg_path, linreg_model, versioned): 128 | model_config = { 129 | "name": "linreg", 130 | "config": { 131 | "type": "kedro_mlflow.io.models.MlflowModelLocalFileSystemDataset", 132 | "filepath": linreg_path.as_posix(), 133 | "flavor": "mlflow.sklearn", 134 | "versioned": versioned, 135 | }, 136 | } 137 | mlflow_model_ds = MlflowModelLocalFileSystemDataset.from_config(**model_config) 138 | mlflow_model_ds.save(linreg_model) 139 | 140 | if versioned: 141 | assert ( 142 | linreg_path / mlflow_model_ds._version.save / linreg_path.name 143 | ).exists() # Versioned model saved locally 144 | else: 145 | assert linreg_path.exists() # Unversioned model saved locally 146 | 147 | linreg_model_loaded = mlflow_model_ds.load() 148 | assert isinstance(linreg_model_loaded, LinearRegression) 149 | 150 | 151 | @pytest.mark.parametrize( 152 | "pipeline", 153 | [ 154 | (lf("pipeline_ml_obj")), # must work for PipelineML 155 | (lf("pipeline_inference")), # must work for Pipeline 156 | ], 157 | ) 158 | def test_pyfunc_flavor_python_model_save_and_load( 159 | tmp_path, tmp_folder, pipeline, dummy_catalog 160 | ): 161 | kedro_pipeline_model = KedroPipelineModel( 162 | pipeline=pipeline, 163 | catalog=dummy_catalog, 164 | input_name="raw_data", 165 | ) 166 | artifacts = kedro_pipeline_model.extract_pipeline_artifacts(tmp_folder) 167 | 168 | model_config = { 169 | "name": "kedro_pipeline_model", 170 | "config": { 171 | "type": "kedro_mlflow.io.models.MlflowModelLocalFileSystemDataset", 172 | "filepath": ( 173 | tmp_path / "data" / "06_models" / "my_custom_model" 174 | ).as_posix(), 175 | "flavor": "mlflow.pyfunc", 176 | "pyfunc_workflow": "python_model", 177 | "save_args": { 178 | "artifacts": artifacts, 179 | "conda_env": {"python": "3.10.0", "dependencies": ["kedro==0.18.11"]}, 180 | }, 181 | }, 182 | } 183 | 184 | mlflow_model_ds = MlflowModelLocalFileSystemDataset.from_config(**model_config) 185 | mlflow_model_ds.save(kedro_pipeline_model) 186 | 187 | assert mlflow.active_run() is None 188 | 189 | # close the run, create another dataset and reload 190 | # (emulate a new "kedro run" with the launch of the ) 191 | loaded_model = mlflow_model_ds.load() 192 | 193 | loaded_model.predict(pd.DataFrame(data=[1], columns=["a"])) == pd.DataFrame( 194 | data=[2], columns=["a"] 195 | ) 196 | 197 | 198 | @pytest.mark.parametrize( 199 | "metadata", 200 | ( 201 | None, 202 | {"description": "My awsome dataset"}, 203 | {"string": 
"bbb", "int": 0}, 204 | ), 205 | ) 206 | def test_metrics_history_dataset_with_metadata(metadata): 207 | mlflow_model_ds = MlflowModelLocalFileSystemDataset( 208 | flavor="mlflow.sklearn", 209 | filepath="/my/file/path", 210 | metadata=metadata, 211 | ) 212 | 213 | assert mlflow_model_ds.metadata == metadata 214 | 215 | # Metadata should not show in _describe 216 | assert "metadata" not in mlflow_model_ds._describe() 217 | -------------------------------------------------------------------------------- /tests/mlflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/mlflow/__init__.py -------------------------------------------------------------------------------- /tests/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/pipeline/__init__.py -------------------------------------------------------------------------------- /tests/template/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/template/__init__.py -------------------------------------------------------------------------------- /tests/template/project/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Galileo-Galilei/kedro-mlflow/880c040663ac784899516d9d987adcd4b4cf49b4/tests/template/project/__init__.py -------------------------------------------------------------------------------- /tests/template/project/test_mlflow_yml.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import yaml 3 | 4 | from kedro_mlflow.config.kedro_mlflow_config import KedroMlflowConfig 5 | from kedro_mlflow.framework.cli.cli import TEMPLATE_FOLDER_PATH 6 | from kedro_mlflow.framework.cli.cli_utils import write_jinja_template 7 | 8 | 9 | @pytest.fixture 10 | def template_mlflowyml(tmp_path): 11 | # the goal is to discover all potential ".py" files 12 | # but for now there is only "run.py" 13 | # this is rather a safeguard for further add 14 | raw_template_path = TEMPLATE_FOLDER_PATH / "mlflow.yml" 15 | rendered_template_path = tmp_path / raw_template_path.name 16 | tags = { 17 | "project_name": "This is a fake project", 18 | "python_package": "fake_project", 19 | "kedro_version": "0.16.0", 20 | } 21 | 22 | write_jinja_template(src=raw_template_path, dst=rendered_template_path, **tags) 23 | return rendered_template_path.as_posix() 24 | 25 | 26 | def test_mlflow_yml_rendering(template_mlflowyml): 27 | # the mlflow yml file must be consistent with the default in KedroMlflowConfig for readibility 28 | with open(template_mlflowyml) as file_handler: 29 | mlflow_config = yaml.safe_load(file_handler) 30 | 31 | # note: Using Pydantic model Construct method skip all validations 32 | # and here we do not want to check the path 33 | expected_config = KedroMlflowConfig.construct( 34 | project_path="fake/path", 35 | tracking=dict( 36 | disable_tracking=dict(pipelines=[], disable_autologging=True), 37 | experiment=dict( 38 | name="fake_project", 39 | create_experiment_kwargs=dict(artifact_location=None, tags=None), 40 | restore_if_deleted=True, 41 | ), 42 | params=dict( 43 | 
dict_params=dict(flatten=False, recursive=True, sep="."), 44 | long_params_strategy="fail", 45 | ), 46 | run=dict(id=None, name=None, nested=True), 47 | ), # check for proper rendering 48 | ) 49 | 50 | assert mlflow_config == expected_config.dict(exclude={"project_path"}) 51 | --------------------------------------------------------------------------------
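The model datasets shown in this dump are normally declared in the Data Catalog, but they can also be used directly in Python. The sketch below is illustrative only and is not part of the repository: it assumes `MlflowModelTrackingDataset` is exported from `kedro_mlflow.io.models` (mirroring the import style used in the tests for `MlflowModelLocalFileSystemDataset`), that mlflow and scikit-learn are installed, and that the tracking URI is a throwaway placeholder.

import mlflow
from sklearn.linear_model import LinearRegression

from kedro_mlflow.io.models import MlflowModelTrackingDataset  # assumed export, see tests above

# Placeholder local tracking store for the example.
mlflow.set_tracking_uri("file:///tmp/mlruns")

model_ds = MlflowModelTrackingDataset(
    flavor="mlflow.sklearn",  # any importable mlflow model flavor module
    artifact_path="model",    # run-relative path under which the model is logged
)

with mlflow.start_run():
    model_ds.save(LinearRegression())  # logs the model into the active run
    reloaded = model_ds.load()         # reloads it from "runs:/<active run id>/model"

As `_save` in the tracking dataset shows, passing an explicit `run_id` while a different run is active raises a `DatasetError`; without a `run_id`, the dataset logs into (or opens) the active run as in the sketch.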