├── .github ├── actions │ └── starter_template_test │ │ └── action.yml └── workflows │ ├── ci.yml │ └── image-optimizer.yml ├── .gitignore ├── .pytest.ini ├── LICENSE ├── README.md ├── copier.yaml ├── requirements.txt ├── template ├── .assets │ ├── cloud_mcp.png │ ├── cloud_mcp_predictions.png │ ├── cloud_mcp_screenshot.png │ ├── feature_engineering_pipeline.png │ ├── inference_pipeline.png │ ├── pipeline_overview.png │ └── training_pipeline.png ├── .dockerignore ├── README.md ├── configs │ ├── feature_engineering.yaml │ ├── inference.yaml │ ├── training_rf.yaml │ └── training_sgd.yaml ├── license ├── license_header ├── pipelines │ ├── __init__.py │ ├── feature_engineering.py │ ├── inference.py │ └── training.py ├── quickstart.ipynb ├── requirements.txt ├── run.py ├── steps │ ├── __init__.py │ ├── data_loader.py │ ├── data_preprocessor.py │ ├── data_splitter.py │ ├── inference_predict.py │ ├── inference_preprocessor.py │ ├── model_evaluator.py │ ├── model_promoter.py │ └── model_trainer.py ├── utils │ ├── __init__.py │ └── preprocess.py ├── {% if open_source_license %}LICENSE{% endif %} └── {{ _copier_conf.answers_file }} ├── test-requirements.txt └── tests ├── conftest.py └── test_starter_template.py /.github/actions/starter_template_test/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Run STARTER template tests' 2 | inputs: 3 | stack-name: 4 | description: 'Name of ZenML stack to build (see `tests/conftest.py:configure_stack()`)' 5 | type: string 6 | required: true 7 | ref-zenml: 8 | description: 'Ref of ZenML package' 9 | type: string 10 | required: false 11 | default: '' 12 | ref-template: 13 | description: 'Ref of this template repo' 14 | type: string 15 | required: false 16 | default: '' 17 | python-version: 18 | description: 'Python version' 19 | type: string 20 | required: false 21 | default: '3.9' 22 | 23 | runs: 24 | using: "composite" 25 | steps: 26 | - name: Check out repository code 27 | uses: actions/checkout@v3 28 | with: 29 | repository: zenml-io/zenml-project-templates 30 | ref: ${{ inputs.ref-template }} 31 | path: ./local_checkout 32 | 33 | - name: Set up Python 34 | uses: actions/setup-python@v4 35 | with: 36 | python-version: ${{ inputs.python-version }} 37 | 38 | - name: Configure git (non-Windows) 39 | if: ${{ runner.os != 'Windows' }} 40 | shell: bash 41 | run: | 42 | git config --global user.email "info@zenml.io" 43 | git config --global user.name "ZenML GmbH" 44 | 45 | - name: Configure git (Windows) 46 | if: ${{ runner.os == 'Windows' }} 47 | shell: bash 48 | run: | 49 | "C:\Program Files\Git\bin\git.exe" config --global user.email "info@zenml.io" 50 | "C:\Program Files\Git\bin\git.exe" config --global user.name "ZenML GmbH" 51 | 52 | - name: Install wheel 53 | shell: bash 54 | run: | 55 | pip install wheel uv 56 | 57 | - name: Install ZenML 58 | if: ${{ inputs.ref-zenml != '' }} 59 | shell: bash 60 | run: | 61 | uv pip install --system "git+https://github.com/zenml-io/zenml.git@${{ inputs.ref-zenml }}" "zenml[server]@git+https://github.com/zenml-io/zenml.git@${{ inputs.ref-zenml }}" 62 | 63 | - name: Install ZenML 64 | if: ${{ inputs.ref-zenml == '' }} 65 | shell: bash 66 | run: | 67 | uv pip install --system zenml "zenml[server]" 68 | 69 | - name: Concatenate requirements 70 | shell: bash 71 | run: | 72 | zenml integration export-requirements -o ./local_checkout/integration-requirements.txt sklearn pandas 73 | cat ./local_checkout/requirements.txt ./local_checkout/test-requirements.txt 
./local_checkout/integration-requirements.txt >> ./local_checkout/all-requirements.txt 74 | 75 | - name: Install requirements 76 | shell: bash 77 | run: | 78 | uv pip install --system -r ./local_checkout/all-requirements.txt 79 | 80 | - name: Run pytests 81 | shell: bash 82 | env: 83 | ZENML_STACK_NAME: ${{ inputs.stack-name }} 84 | run: | 85 | pytest ./local_checkout/tests 86 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | ref-template: 7 | description: 'Branch or tag ref to check out for template' 8 | type: string 9 | required: false 10 | ref-zenml: 11 | description: 'Branch or tag ref to check out for ZenML' 12 | type: string 13 | required: false 14 | workflow_call: 15 | inputs: 16 | ref-template: 17 | description: 'Branch or tag ref to check out for template' 18 | type: string 19 | required: false 20 | ref-zenml: 21 | description: 'Branch or tag ref to check out for ZenML' 22 | type: string 23 | required: false 24 | push: 25 | branches: ["main", "develop"] 26 | paths-ignore: ["README.md"] 27 | pull_request: 28 | types: [opened, synchronize, ready_for_review] 29 | paths-ignore: ["README.md"] 30 | 31 | concurrency: 32 | # New commit on branch cancels running workflows of the same branch 33 | group: ${{ github.workflow }}-${{ github.ref }} 34 | cancel-in-progress: true 35 | 36 | jobs: 37 | run-tests: 38 | runs-on: ${{ matrix.os }} 39 | strategy: 40 | fail-fast: false 41 | matrix: 42 | stack-name: [local] 43 | os: [windows-latest, ubuntu-latest, macos-latest] 44 | python-version: ["3.9", "3.10", "3.11", "3.12"] 45 | env: 46 | ZENML_DEBUG: true 47 | ZENML_ANALYTICS_OPT_IN: false 48 | ZENML_LOGGING_VERBOSITY: INFO 49 | steps: 50 | - name: Check out repository code 51 | uses: actions/checkout@v3 52 | 53 | - name: Run tests 54 | uses: ./.github/actions/starter_template_test 55 | with: 56 | stack-name: ${{ matrix.stack-name }} 57 | python-version: ${{ matrix.python-version }} 58 | ref-zenml: ${{ inputs.ref-zenml || 'feature/followup-run-metadata' }} 59 | ref-template: ${{ inputs.ref-template || github.ref }} 60 | -------------------------------------------------------------------------------- /.github/workflows/image-optimizer.yml: -------------------------------------------------------------------------------- 1 | name: Compress Images 2 | on: 3 | pull_request: 4 | # Run Image Actions when JPG, JPEG, PNG or WebP files are added or changed. 5 | # See https://help.github.com/en/actions/automating-your-workflow-with-github-actions/workflow-syntax-for-github-actions#onpushpull_requestpaths for reference. 6 | paths: 7 | - '**.jpg' 8 | - '**.jpeg' 9 | - '**.png' 10 | - '**.webp' 11 | jobs: 12 | build: 13 | # Only run on non-draft PRs within the same repository. 14 | if: github.event.pull_request.head.repo.full_name == github.repository && github.event.pull_request.draft == false 15 | name: calibreapp/image-actions 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout Repo 19 | uses: actions/checkout@v3 20 | 21 | - name: Compress Images 22 | uses: calibreapp/image-actions@main 23 | with: 24 | # The `GITHUB_TOKEN` is automatically generated by GitHub and scoped only to the repository that is currently running the action. By default, the action can’t update Pull Requests initiated from forked repositories. 
25 | # See https://docs.github.com/en/actions/reference/authentication-in-a-workflow and https://help.github.com/en/articles/virtual-environments-for-github-actions#token-permissions 26 | githubToken: ${{ secrets.GITHUB_TOKEN }} 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # PyCharm Stuff 102 | .idea 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | *.zen 135 | .vscode 136 | .local 137 | -------------------------------------------------------------------------------- /.pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = 3 | -s 4 | testpaths = 5 | tests -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 | 
189 | Copyright [yyyy] [name of copyright owner]
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 📜 ZenML Starter Template
2 | 
3 | This repository contains a starter template from which a simple ZenML project
4 | can be generated easily. It contains a collection of steps, pipelines, stack
5 | configurations, and other useful resources that can get you started with ZenML.
6 | 
7 | 🔥 **Do you have a personal project powered by ZenML that you would like to see here?**
8 | 
9 | At ZenML, we are looking for design partnerships and collaboration to help us
10 | better understand the real-world scenarios in which MLOps is being used and to
11 | build the best possible experience for our users. If you are interested in
12 | sharing all or parts of your project with us in the form of a ZenML project
13 | template, please [join our Slack](https://zenml.io/slack/) and leave us a
14 | message!
15 | 
16 | ## 📦 Prerequisites
17 | 
18 | To use the templates, you need to have ZenML and its `templates` extras
19 | installed:
20 | 
21 | ```bash
22 | pip install "zenml[templates]"
23 | ```
24 | 
25 | ## 🚀 Generate a ZenML Project
26 | 
27 | You can generate a project from one of the existing templates by using the
28 | `--template` flag with the `zenml init` command:
29 | 
30 | ```bash
31 | zenml init --template
32 | ```
33 | 
34 | Under the hood, ZenML uses the popular [Copier](https://copier.readthedocs.io/en/stable/)
35 | library and a set of Jinja2 templates to generate the project. So you may also
36 | interact with Copier directly to generate a project, e.g.:
37 | 
38 | ```bash
39 | copier gh:zenml-io/template-starter
40 | ```
41 | 
42 | You will be prompted to select the project template and enter various values for
43 | the template variables. Once you have entered them, the project will be
44 | generated in the indicated path.
45 | 
46 | To update an already generated project with different parameters, you can run
47 | the same command again.
If you want to skip the prompts, reuse the values you
48 | already entered, and overwrite all files in the existing project, you can run:
49 | 
50 | ```bash
51 | copier -wf gh:zenml-io/template-starter
52 | ```
53 | 
--------------------------------------------------------------------------------
/copier.yaml:
--------------------------------------------------------------------------------
1 | --- # GLOBAL PROMPT --------------------------------
2 | project_name:
3 |   type: str
4 |   help: Short name for your project
5 |   default: ZenML Starter
6 | version:
7 |   type: str
8 |   help: |
9 |     Version of your project
10 |   default: "0.1.0"
11 | open_source_license:
12 |   type: str
13 |   help: >-
14 |     The license under which your project will be released
15 |   choices:
16 |     Apache Software License 2.0: apache
17 |     MIT license: mit
18 |     BSD license: bsd
19 |     ISC license: isc
20 |     GNU General Public License v3: gpl3
21 |     Not open source: none
22 |   default: apache
23 | full_name:
24 |   type: str
25 |   help: >-
26 |     The name of the person/entity holding the copyright
27 |   default: ZenML GmbH
28 |   when: "{{ open_source_license }}"
29 | email:
30 |   type: str
31 |   help: >-
32 |     The email of the person/entity holding the copyright
33 |   default: info@zenml.io
34 |   when: "{{ open_source_license }}"
35 | 
36 | # CONFIGURATION -------------------------
37 | _templates_suffix: ""
38 | _subdirectory: "./template"
39 | _exclude:
40 |   - license
41 |   - license_header
42 | _tasks:
43 |   # Remove unused imports and variables
44 |   - >-
45 |     {% if _copier_conf.os == 'windows' %}
46 |     echo "Auto-formatting not supported on Windows"
47 |     {% else %}
48 |     {{ _copier_python }} -m ruff check --select F401,F841 --fix \
49 |     --exclude "__init__.py" --isolated \
50 |     steps pipelines run.py > /dev/null 2>&1 || true
51 |     {% endif %}
52 |   # Sort imports
53 |   - >-
54 |     {% if _copier_conf.os == 'windows' %}
55 |     echo "Auto-formatting not supported on Windows"
56 |     {% else %}
57 |     {{ _copier_python }} -m ruff check --select I \
58 |     --fix --ignore D \
59 |     steps pipelines run.py > /dev/null 2>&1 || true
60 |     {% endif %}
61 |   # Auto-format code
62 |   - >-
63 |     {% if _copier_conf.os == 'windows' %}
64 |     echo "Auto-formatting not supported on Windows"
65 |     {% else %}
66 |     {{ _copier_python }} -m black \
67 |     --exclude '' --include '\.pyi?$' -l 79 \
68 |     steps pipelines run.py > /dev/null 2>&1 || true
69 |     {% endif %}
70 |   - |
71 |     echo "Congratulations, your project has been generated in the '{{ _copier_conf.dst_path }}' directory."
72 |     echo "You can now run the following commands to get started:"
73 |     echo "  cd {{ _copier_conf.dst_path }}"
74 |     echo "  pip install -r requirements.txt"
75 |     echo "  # Start the ZenML UI (optional; you'll also need the zenml[server] Python"
76 |     echo "  # package installed)"
77 |     echo "  zenml login --local"
78 |     echo "  python run.py"
79 |     echo "Next, you should take a look at the '{{ _copier_conf.dst_path }}/README.md' file in the generated project."
80 |     echo "Happy coding!"
81 | 82 | _jinja_extensions: 83 | - jinja2_time.TimeExtension 84 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn 2 | copier 3 | jinja2-time 4 | zenml[server]>=0.52.0 5 | notebook 6 | pyyaml-include<2.0 7 | -------------------------------------------------------------------------------- /template/.assets/cloud_mcp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zenml-io/template-starter/665ed6a723863315551c732eb22648b8c89a615b/template/.assets/cloud_mcp.png -------------------------------------------------------------------------------- /template/.assets/cloud_mcp_predictions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zenml-io/template-starter/665ed6a723863315551c732eb22648b8c89a615b/template/.assets/cloud_mcp_predictions.png -------------------------------------------------------------------------------- /template/.assets/cloud_mcp_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zenml-io/template-starter/665ed6a723863315551c732eb22648b8c89a615b/template/.assets/cloud_mcp_screenshot.png -------------------------------------------------------------------------------- /template/.assets/feature_engineering_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zenml-io/template-starter/665ed6a723863315551c732eb22648b8c89a615b/template/.assets/feature_engineering_pipeline.png -------------------------------------------------------------------------------- /template/.assets/inference_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zenml-io/template-starter/665ed6a723863315551c732eb22648b8c89a615b/template/.assets/inference_pipeline.png -------------------------------------------------------------------------------- /template/.assets/pipeline_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zenml-io/template-starter/665ed6a723863315551c732eb22648b8c89a615b/template/.assets/pipeline_overview.png -------------------------------------------------------------------------------- /template/.assets/training_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zenml-io/template-starter/665ed6a723863315551c732eb22648b8c89a615b/template/.assets/training_pipeline.png -------------------------------------------------------------------------------- /template/.dockerignore: -------------------------------------------------------------------------------- 1 | .venv* 2 | .requirements* -------------------------------------------------------------------------------- /template/README.md: -------------------------------------------------------------------------------- 1 | # :running: MLOps 101 with ZenML 2 | 3 | Build your first MLOps pipelines with ZenML. 4 | 5 | ## :earth_americas: Overview 6 | 7 | This repository is a minimalistic MLOps project intended as a starting point to learn how to put ML workflows in production. It features: 8 | 9 | - A feature engineering pipeline that loads data and prepares it for training. 
10 | - A training pipeline that loads the preprocessed dataset and trains a model. 11 | - A batch inference pipeline that runs predictions on the trained model with new data. 12 | 13 | This is a representation of how it will all come together: 14 | 15 | Pipelines Overview 16 | 17 | Along the way we will also show you how to: 18 | 19 | - Structure your code into MLOps pipelines. 20 | - Automatically version, track, and cache data, models, and other artifacts. 21 | - Transition your ML models from development to production. 22 | 23 | ## 🏃 Run on Colab 24 | 25 | You can use Google Colab to see ZenML in action, no signup / installation required! 26 | 27 | Open In Colab 28 | 29 | ## :computer: Run Locally 30 | 31 | To run locally, install ZenML and pull this quickstart: 32 | 33 | ```shell 34 | # Install ZenML 35 | pip install "zenml[server]" 36 | 37 | # clone the ZenML repository 38 | git clone https://github.com/zenml-io/zenml.git 39 | cd zenml/examples/mlops_starter 40 | ``` 41 | 42 | Now we're ready to start. You have two options for running the quickstart locally: 43 | 44 | #### Option 1 - Interactively explore the quickstart using Jupyter Notebook: 45 | ```bash 46 | pip install notebook 47 | jupyter notebook 48 | # open quickstart.ipynb 49 | ``` 50 | 51 | #### Option 2 - Execute the whole ML pipeline from a Python script: 52 | ```bash 53 | # Install required zenml integrations 54 | zenml integration install sklearn pandas -y 55 | 56 | # Initialize ZenML 57 | zenml init 58 | 59 | # Start the ZenServer to enable dashboard access 60 | zenml login --local 61 | 62 | # Run the feature engineering pipeline 63 | python run.py --feature-pipeline 64 | 65 | # Run the training pipeline 66 | python run.py --training-pipeline 67 | 68 | # Run the training pipeline with versioned artifacts 69 | python run.py --training-pipeline --train-dataset-version-name=1 --test-dataset-version-name=1 70 | 71 | # Run the inference pipeline 72 | python run.py --inference-pipeline 73 | ``` 74 | 75 | ## 🌵 Learning MLOps with ZenML 76 | 77 | This project is also a great source of learning about some fundamental MLOps concepts. In sum, there are four exemplary steps happening, that can be mapped onto many other projects: 78 | 79 |
80 | 🥇 Step 1: Load your data and execute feature engineering
81 | 
82 | We'll start off by importing our data. In this project, we'll be working with
83 | [the Breast Cancer](https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic) dataset
84 | which is publicly available on the UCI Machine Learning Repository. The task is a classification
85 | problem: predicting whether a patient is diagnosed with breast cancer or not.
86 | 
87 | When you're getting started with a machine learning problem you'll want to do
88 | something similar to this: import your data and get it in the right shape for
89 | your training. Here are the typical steps within a feature engineering pipeline.
90 | 
91 | The steps are defined in the [steps](steps/) directory, while the [pipelines](pipelines/) directory has the pipeline code that connects them together.
92 | 
93 | Feature engineering pipeline
94 | 
95 | To execute the feature engineering pipeline, run:
96 | 
97 | ```shell
98 | python run.py --feature-pipeline
99 | ```
100 | 
101 | After the pipeline has run, it will produce logs like:
102 | 
103 | ```shell
104 | The latest feature engineering pipeline produced the following artifacts:
105 | 
106 | 1. Train Dataset - Name: dataset_trn, Version Name: 1
107 | 2. Test Dataset: Name: dataset_tst, Version Name: 1
108 | ```
109 | 
110 | We will use these versions in the next pipeline.
111 | 
112 | 
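You can also inspect these versioned datasets from Python, using the same ZenML `Client` API that the pipelines in this project use internally. A minimal sketch, assuming the artifact names (`dataset_trn`, `dataset_tst`) and the version name (`1`) printed in the logs above:

```python
from zenml.client import Client

client = Client()

# Fetch the versioned artifacts by the names and version printed above.
dataset_trn = client.get_artifact_version("dataset_trn", "1")
dataset_tst = client.get_artifact_version("dataset_tst", "1")

# Materialize the underlying pandas DataFrames to inspect them.
train_df = dataset_trn.load()
test_df = dataset_tst.load()
print(train_df.shape, test_df.shape)
```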
113 | 114 |
115 | ⌚ Step 2: Training pipeline
116 | 
117 | Now that our data is prepared, it makes sense to train some models to get a sense of how difficult the task is. The Breast Cancer dataset is sufficiently complex that a perfectly-behaved model is unlikely, but training a few candidates tells us what a reasonable baseline looks like.
118 | 
119 | We'll start with two simple models, an SGD Classifier and a Random Forest
120 | Classifier, both batteries-included from `sklearn`. We'll train them on the
121 | same data and then compare their performance.
122 | 
123 | Training pipeline
124 | 
125 | Run it using the dataset version names from the first step:
126 | 
127 | ```shell
128 | # You can omit the `--train-dataset-version-name` and `--test-dataset-version-name`
129 | # flags to use the latest versions
130 | python run.py --training-pipeline --train-dataset-version-name 1 --test-dataset-version-name 1
131 | ```
132 | 
133 | To track these models, ZenML offers a *Model Control Plane*, which is a central register of all your ML models.
134 | Each run of the training pipeline will produce a ZenML Model Version.
135 | 
136 | ```shell
137 | zenml model list
138 | ```
139 | 
140 | This will show you a new `breast_cancer_classifier` model with two versions, `sgd` and `rf`. You can find out how this was configured in the [YAML pipeline configuration files](configs/).
141 | 
142 | If you are a [ZenML Pro](https://zenml.io/pro) user, you can see all of this visualized in the dashboard:
143 | 
144 | Model Control Plane
145 | 
146 | There is a lot more you can do with ZenML models, including the ability to
147 | track metrics by attaching metadata to them, or to persist them in a model
148 | registry. These topics are explored in more depth in the
149 | [ZenML docs](https://docs.zenml.io).
150 | 
151 | 
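The same information is available programmatically. Here is a short sketch using the ZenML client — the model and version names come from the configs above, while the artifact name `sklearn_classifier` is an assumption based on how this template's trainer step names its output:

```python
from zenml.client import Client

client = Client()

# Fetch one of the two model versions created by the training runs.
rf_version = client.get_model_version("breast_cancer_classifier", "rf")
print(rf_version.name, rf_version.stage)

# The trained classifier is attached to the model version as an artifact,
# so it can be loaded straight back into memory.
classifier = rf_version.get_artifact("sklearn_classifier").load()
```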
152 | 153 |
154 | 💯 Step 3: Promoting the best model to production
155 | 
156 | For now, we will use the ZenML model control plane to promote our best
157 | model to `production`. You can do this by simply setting the `stage` of
158 | your chosen model version to `production`.
159 | 
160 | ```shell
161 | zenml model version update breast_cancer_classifier rf --stage production
162 | ```
163 | 
164 | While we've demonstrated a manual promotion process for clarity, a closer look at the [promoter code](steps/model_promoter.py) reveals that the training pipeline is designed to automate this step. It evaluates the latest model against established production metrics and, if the new model outperforms the existing one on the test set, automatically promotes it to production. Here is an overview of the process:
165 | 
166 | Model Control Plane
167 | 
168 | Again, if you are a [ZenML Pro](https://zenml.io/pro) user, you can see all of this in the cloud dashboard.
169 | 
170 | 
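The core of that automated promotion is small enough to sketch here. This is a condensed illustration of the idea rather than the verbatim step — the real [promoter code](steps/model_promoter.py) also compares the new accuracy against the current production version before switching stages:

```python
from zenml import get_step_context, step


@step
def promote_if_better(accuracy: float, baseline: float = 0.8) -> bool:
    """Condensed promotion logic: promote when accuracy clears a baseline."""
    is_promoted = accuracy >= baseline
    if is_promoted:
        # Move the model version linked to this pipeline run into the
        # `production` stage; `force=True` demotes the previous holder.
        get_step_context().model.set_stage(stage="production", force=True)
    return is_promoted
```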
171 | 172 |
173 | 🫅 Step 4: Consuming the model in production
174 | 
175 | Once the model is promoted, we can consume the right model version in our
176 | batch inference pipeline directly. Let's see how that works.
177 | 
178 | The batch inference pipeline simply takes the model marked as `production` and runs inference on it
179 | with live data. The critical step here is the `inference_predict` step, where we load the model into memory and generate predictions. Apart from loading the model, we must also load the preprocessing pipeline that we fitted during feature engineering,
180 | so that at inference time we can apply exactly the same transformations that were applied at training time. Let's bring it all together:
181 | 
182 | ZenML automatically links all artifacts to the `production` model version as well, including the predictions
183 | that were returned in the pipeline. This completes the MLOps loop of training to inference:
184 | 
185 | Inference pipeline
186 | 
187 | You can also see all predictions ever created as a complete history in the dashboard (again, only for [ZenML Pro](https://zenml.io/pro) users):
188 | 
189 | Model Control Plane
190 | 
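If you later want to pull the production model and its predictions into your own code, a sketch along these lines should work. The artifact names `sklearn_classifier` and `predictions` are assumptions based on the steps in this template, so adjust them if you rename the outputs:

```python
from zenml import Model

# Reference whichever model version currently holds the `production` stage,
# exactly as configs/inference.yaml does.
production_model = Model(name="breast_cancer_classifier", version="production")

# Load the trained classifier and the latest batch predictions.
classifier = production_model.get_artifact("sklearn_classifier").load()
predictions = production_model.get_artifact("predictions").load()
print(predictions)
```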
192 | 
193 | ## :bulb: Learn More
194 | 
195 | You're a legit MLOps engineer now! You trained two models, evaluated them against
196 | a test set, registered the best one with the ZenML model control plane,
197 | and served some predictions. You also learned how to iterate on your models and
198 | data by using some of the ZenML utility abstractions. You saw how to view your
199 | artifacts and stacks via the client as well as the ZenML Dashboard.
200 | 
201 | If you want to learn more about ZenML as a tool, then the
202 | [:page_facing_up: **ZenML Docs**](https://docs.zenml.io/) are the perfect place
203 | to get started. In particular, the [Production Guide](https://docs.zenml.io/user-guide/production-guide/)
204 | goes into more detail on how to transition these same pipelines into production on the cloud.
205 | 
206 | The best way to get a production ZenML instance up and running with all batteries included is [ZenML Pro](https://zenml.io/pro). Check it out!
207 | 
208 | Also, make sure to join our
209 | [Slack Community](https://zenml.io/slack)
210 | to become part of the ZenML family!
211 | 
--------------------------------------------------------------------------------
/template/configs/feature_engineering.yaml:
--------------------------------------------------------------------------------
1 | # environment configuration
2 | settings:
3 |   docker:
4 |     required_integrations:
5 |       - sklearn
6 |       - pandas
7 |     requirements:
8 |       - pyarrow
9 | 
10 | # pipeline configuration
11 | test_size: 0.35
--------------------------------------------------------------------------------
/template/configs/inference.yaml:
--------------------------------------------------------------------------------
1 | # environment configuration
2 | settings:
3 |   docker:
4 |     required_integrations:
5 |       - sklearn
6 |       - pandas
7 |     requirements:
8 |       - pyarrow
9 | 
10 | # configuration of the Model Control Plane
11 | model:
12 |   name: "breast_cancer_classifier"
13 |   version: "production"
14 |   license: Apache 2.0
15 |   description: A breast cancer classifier
16 |   tags: ["breast_cancer", "classifier"]
--------------------------------------------------------------------------------
/template/configs/training_rf.yaml:
--------------------------------------------------------------------------------
1 | # environment configuration
2 | settings:
3 |   docker:
4 |     required_integrations:
5 |       - sklearn
6 |       - pandas
7 |     requirements:
8 |       - pyarrow
9 | 
10 | # configuration of the Model Control Plane
11 | model:
12 |   name: breast_cancer_classifier
13 |   version: rf
14 |   license: Apache 2.0
15 |   description: A breast cancer classifier
16 |   tags: ["breast_cancer", "classifier"]
17 | 
18 | # Configure the pipeline
19 | parameters:
20 |   model_type: "rf"  # Choose between rf/sgd
21 | 
--------------------------------------------------------------------------------
/template/configs/training_sgd.yaml:
--------------------------------------------------------------------------------
1 | # environment configuration
2 | settings:
3 |   docker:
4 |     required_integrations:
5 |       - sklearn
6 |       - pandas
7 |     requirements:
8 |       - pyarrow
9 | 
10 | # configuration of the Model Control Plane
11 | model:
12 |   name: breast_cancer_classifier
13 |   version: sgd
14 |   license: Apache 2.0
15 |   description: A breast cancer classifier
16 |   tags: ["breast_cancer", "classifier"]
17 | 
18 | # Configure the pipeline
19 | parameters:
20 |   model_type: "sgd"  # Choose between rf/sgd
--------------------------------------------------------------------------------
/template/license:
-------------------------------------------------------------------------------- 1 | {% if open_source_license == 'mit' -%} 2 | MIT License 3 | 4 | Copyright (c) {{ full_name }} {% now 'local', '%Y' %} 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | {% elif open_source_license == 'bsd' %} 24 | 25 | BSD License 26 | 27 | Copyright (c) {{ full_name }} {% now 'local', '%Y' %}. All rights reserved. 28 | 29 | Redistribution and use in source and binary forms, with or without modification, 30 | are permitted provided that the following conditions are met: 31 | 32 | * Redistributions of source code must retain the above copyright notice, this 33 | list of conditions and the following disclaimer. 34 | 35 | * Redistributions in binary form must reproduce the above copyright notice, this 36 | list of conditions and the following disclaimer in the documentation and/or 37 | other materials provided with the distribution. 38 | 39 | * Neither the name of the copyright holder nor the names of its 40 | contributors may be used to endorse or promote products derived from this 41 | software without specific prior written permission. 42 | 43 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 44 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 45 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 46 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 47 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 48 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 49 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 50 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 51 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 52 | OF THE POSSIBILITY OF SUCH DAMAGE. 53 | {% elif open_source_license == 'isc' -%} 54 | ISC License 55 | 56 | Copyright (c) {{ full_name }} {% now 'local', '%Y' %} 57 | 58 | Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. 59 | 60 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 61 | {% elif open_source_license == 'apache' -%} 62 | Apache Software License 2.0 63 | 64 | Copyright (c) {{ full_name }} {% now 'local', '%Y' %}. All rights reserved. 65 | 66 | Licensed under the Apache License, Version 2.0 (the "License"); 67 | you may not use this file except in compliance with the License. 68 | You may obtain a copy of the License at 69 | 70 | http://www.apache.org/licenses/LICENSE-2.0 71 | 72 | Unless required by applicable law or agreed to in writing, software 73 | distributed under the License is distributed on an "AS IS" BASIS, 74 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 75 | See the License for the specific language governing permissions and 76 | limitations under the License. 77 | {% elif open_source_license == 'gpl3' -%} 78 | GNU GENERAL PUBLIC LICENSE 79 | Version 3, 29 June 2007 80 | 81 | {{ project_short_description }} 82 | Copyright (C) {{ full_name }} {% now 'local', '%Y' %} 83 | 84 | This program is free software: you can redistribute it and/or modify 85 | it under the terms of the GNU General Public License as published by 86 | the Free Software Foundation, either version 3 of the License, or 87 | (at your option) any later version. 88 | 89 | This program is distributed in the hope that it will be useful, 90 | but WITHOUT ANY WARRANTY; without even the implied warranty of 91 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 92 | GNU General Public License for more details. 93 | 94 | You should have received a copy of the GNU General Public License 95 | along with this program. If not, see . 96 | 97 | Also add information on how to contact you by electronic and paper mail. 98 | 99 | You should also get your employer (if you work as a programmer) or school, 100 | if any, to sign a "copyright disclaimer" for the program, if necessary. 101 | For more information on this, and how to apply and follow the GNU GPL, see 102 | . 103 | 104 | The GNU General Public License does not permit incorporating your program 105 | into proprietary programs. If your program is a subroutine library, you 106 | may consider it more useful to permit linking proprietary applications with 107 | the library. If this is what you want to do, use the GNU Lesser General 108 | Public License instead of this License. But first, please read 109 | . 
110 | {% endif %} -------------------------------------------------------------------------------- /template/license_header: -------------------------------------------------------------------------------- 1 | {%- macro license() %}{% include 'template/license' %}{% endmacro -%} 2 | {{ license() | replace('\n', '\n# ') }} -------------------------------------------------------------------------------- /template/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # {% include 'template/license_header' %} 2 | 3 | from .feature_engineering import feature_engineering 4 | from .inference import inference 5 | from .training import training 6 | -------------------------------------------------------------------------------- /template/pipelines/feature_engineering.py: -------------------------------------------------------------------------------- 1 | # {% include 'template/license_header' %} 2 | 3 | from typing import List, Optional 4 | 5 | from steps import ( 6 | data_loader, 7 | data_preprocessor, 8 | data_splitter, 9 | ) 10 | from zenml import pipeline 11 | from zenml.logger import get_logger 12 | 13 | logger = get_logger(__name__) 14 | 15 | 16 | @pipeline 17 | def feature_engineering( 18 | test_size: float = 0.2, 19 | drop_na: Optional[bool] = None, 20 | normalize: Optional[bool] = None, 21 | drop_columns: Optional[List[str]] = None, 22 | target: Optional[str] = "target", 23 | random_state: int = 17, 24 | ): 25 | """ 26 | Feature engineering pipeline. 27 | 28 | This is a pipeline that loads the data, processes it and splits 29 | it into train and test sets. 30 | 31 | Args: 32 | test_size: Size of holdout set for training 0.0..1.0 33 | drop_na: If `True` NA values will be removed from dataset 34 | normalize: If `True` dataset will be normalized with MinMaxScaler 35 | drop_columns: List of columns to drop from dataset 36 | target: Name of target column in dataset 37 | random_state: Random state to configure the data loader 38 | 39 | Returns: 40 | The processed datasets (dataset_trn, dataset_tst). 41 | """ 42 | # Link all the steps together by calling them and passing the output 43 | # of one step as the input of the next step. 44 | raw_data = data_loader(random_state=random_state, target=target) 45 | dataset_trn, dataset_tst = data_splitter( 46 | dataset=raw_data, 47 | test_size=test_size, 48 | ) 49 | dataset_trn, dataset_tst, _ = data_preprocessor( 50 | dataset_trn=dataset_trn, 51 | dataset_tst=dataset_tst, 52 | drop_na=drop_na, 53 | normalize=normalize, 54 | drop_columns=drop_columns, 55 | target=target, 56 | random_state=random_state, 57 | ) 58 | return dataset_trn, dataset_tst 59 | -------------------------------------------------------------------------------- /template/pipelines/inference.py: -------------------------------------------------------------------------------- 1 | # {% include 'template/license_header' %} 2 | 3 | from steps import ( 4 | data_loader, 5 | inference_predict, 6 | inference_preprocessor, 7 | ) 8 | from zenml import get_pipeline_context, pipeline 9 | from zenml.logger import get_logger 10 | 11 | logger = get_logger(__name__) 12 | 13 | 14 | @pipeline 15 | def inference(random_state: int, target: str): 16 | """ 17 | Model inference pipeline. 18 | 19 | This is a pipeline that loads the inference data, processes it with 20 | the same preprocessing pipeline used in training, and runs inference 21 | with the trained model. 22 | 23 | Args: 24 | random_state: Random state for reproducibility. 
25 | target: Name of target column in dataset. 26 | """ 27 | # Get the production model artifact 28 | model = get_pipeline_context().model.get_artifact("sklearn_classifier") 29 | 30 | # Get the preprocess pipeline artifact associated with this version 31 | preprocess_pipeline = get_pipeline_context().model.get_artifact( 32 | "preprocess_pipeline" 33 | ) 34 | 35 | # Link all the steps together by calling them and passing the output 36 | # of one step as the input of the next step. 37 | df_inference = data_loader(random_state=random_state, is_inference=True) 38 | df_inference = inference_preprocessor( 39 | dataset_inf=df_inference, 40 | preprocess_pipeline=preprocess_pipeline, 41 | target=target, 42 | ) 43 | inference_predict( 44 | model=model, 45 | dataset_inf=df_inference, 46 | ) 47 | -------------------------------------------------------------------------------- /template/pipelines/training.py: -------------------------------------------------------------------------------- 1 | # {% include 'template/license_header' %} 2 | 3 | from typing import Optional 4 | from uuid import UUID 5 | 6 | from steps import model_evaluator, model_promoter, model_trainer 7 | from zenml import pipeline 8 | from zenml.client import Client 9 | from zenml.logger import get_logger 10 | 11 | from pipelines import ( 12 | feature_engineering, 13 | ) 14 | 15 | logger = get_logger(__name__) 16 | 17 | 18 | @pipeline 19 | def training( 20 | train_dataset_id: Optional[UUID] = None, 21 | test_dataset_id: Optional[UUID] = None, 22 | target: Optional[str] = "target", 23 | model_type: Optional[str] = "sgd", 24 | ): 25 | """ 26 | Model training pipeline. 27 | 28 | This is a pipeline that loads the data from a preprocessing pipeline, 29 | trains a model on it and evaluates the model. If it is the first model 30 | to be trained, it will be promoted to production. If not, it will be 31 | promoted only if it has a higher accuracy than the current production 32 | model version. 33 | 34 | Args: 35 | train_dataset_id: ID of the train dataset produced by feature engineering. 36 | test_dataset_id: ID of the test dataset produced by feature engineering. 37 | target: Name of target column in dataset. 38 | model_type: The type of model to train. 39 | """ 40 | # Link all the steps together by calling them and passing the output 41 | # of one step as the input of the next step. 42 | 43 | # Execute Feature Engineering Pipeline 44 | if train_dataset_id is None or test_dataset_id is None: 45 | dataset_trn, dataset_tst = feature_engineering() 46 | else: 47 | client = Client() 48 | dataset_trn = client.get_artifact_version(name_id_or_prefix=train_dataset_id) 49 | dataset_tst = client.get_artifact_version(name_id_or_prefix=test_dataset_id) 50 | 51 | model = model_trainer(dataset_trn=dataset_trn, target=target, model_type=model_type) 52 | 53 | acc = model_evaluator( 54 | model=model, 55 | dataset_trn=dataset_trn, 56 | dataset_tst=dataset_tst, 57 | target=target, 58 | ) 59 | 60 | model_promoter(accuracy=acc) 61 | -------------------------------------------------------------------------------- /template/quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "63ab391a", 6 | "metadata": {}, 7 | "source": [ 8 | "# Intro to MLOps using ZenML\n", 9 | "\n", 10 | "## 🌍 Overview\n", 11 | "\n", 12 | "This repository is a minimalistic MLOps project intended as a starting point to learn how to put ML workflows in production. 
It features: \n", 13 | "\n", 14 | "- A feature engineering pipeline that loads data and prepares it for training.\n", 15 | "- A training pipeline that loads the preprocessed dataset and trains a model.\n", 16 | "- A batch inference pipeline that runs predictions on the trained model with new data.\n", 17 | "\n", 18 | "Follow along this notebook to understand how you can use ZenML to productionalize your ML workflows!\n", 19 | "\n", 20 | "\"Pipelines" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "8f466b16", 26 | "metadata": {}, 27 | "source": [ 28 | "## Run on Colab\n", 29 | "\n", 30 | "You can use Google Colab to see ZenML in action, no signup / installation\n", 31 | "required!\n", 32 | "\n", 33 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](\n", 34 | "https://colab.research.google.com/github/zenml-io/zenml/blob/main/examples/mlops_starter/quickstart.ipynb)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "66b2977c", 40 | "metadata": {}, 41 | "source": [ 42 | "# 👶 Step 0. Install Requirements\n", 43 | "\n", 44 | "Let's install ZenML to get started. First we'll install the latest version of\n", 45 | "ZenML as well as the `sklearn` integration of ZenML:" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "id": "ce2f40eb", 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "!pip install \"zenml[server]\"" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "5aad397e", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "from zenml.environment import Environment\n", 66 | "\n", 67 | "if Environment.in_google_colab():\n", 68 | " # Install Cloudflare Tunnel binary\n", 69 | " !wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb && dpkg -i cloudflared-linux-amd64.deb\n", 70 | "\n", 71 | " # Pull required modules from this example\n", 72 | " !git clone -b main https://github.com/zenml-io/zenml\n", 73 | " !cp -r zenml/examples/quickstart/* .\n", 74 | " !rm -rf zenml" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "f76f562e", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "!zenml integration install sklearn -y\n", 85 | "\n", 86 | "import IPython\n", 87 | "\n", 88 | "IPython.Application.instance().kernel.do_shutdown(restart=True)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "id": "3b044374", 94 | "metadata": {}, 95 | "source": [ 96 | "Please wait for the installation to complete before running subsequent cells. At\n", 97 | "the end of the installation, the notebook kernel will automatically restart." 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "966ce581", 103 | "metadata": {}, 104 | "source": [ 105 | "## ☁️ Step 1: Connect to ZenML Pro\n", 106 | "\n", 107 | "If you are using [ZenML Pro](https://zenml.io/pro), execute the following\n", 108 | "cell with your tenant URL. Otherwise ignore.\n", 109 | "\n", 110 | "ZenML Pro is a managed service that provides a hosted ZenML environment. It\n", 111 | "allows you to run your pipelines on the cloud, manage your metadata, and\n", 112 | "collaborate with your team. Sign up [here](https://zenml.io/pro) for\n", 113 | "a free trial and to get started!" 
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "id": "e2587315", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "zenml_server_url = \"PLEASE_UPDATE_ME\" # in the form \"https://URL_TO_SERVER\"\n", 124 | "\n", 125 | "!zenml login $zenml_server_url" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "id": "081d5616", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "# Initialize ZenML and set the default stack\n", 136 | "!zenml init\n", 137 | "\n", 138 | "!zenml stack set default" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "id": "79f775f2", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# Do the imports at the top\n", 149 | "import random\n", 150 | "from typing import List, Optional\n", 151 | "from uuid import UUID\n", 152 | "\n", 153 | "import pandas as pd\n", 154 | "from sklearn.datasets import load_breast_cancer\n", 155 | "from steps import (\n", 156 | " data_loader,\n", 157 | " data_preprocessor,\n", 158 | " data_splitter,\n", 159 | " inference_preprocessor,\n", 160 | " model_evaluator,\n", 161 | ")\n", 162 | "from typing_extensions import Annotated\n", 163 | "from zenml import Model, get_step_context, pipeline, step\n", 164 | "from zenml.client import Client\n", 165 | "from zenml.logger import get_logger\n", 166 | "\n", 167 | "logger = get_logger(__name__)\n", 168 | "\n", 169 | "# Initialize the ZenML client to fetch objects from the ZenML Server\n", 170 | "client = Client()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "id": "35e48460", 176 | "metadata": {}, 177 | "source": [ 178 | "## 🥇 Step 2: Load your data and execute feature engineering\n", 179 | "\n", 180 | "We'll start off by importing our data. In this quickstart we'll be working with\n", 181 | "[the Breast Cancer](https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic) dataset\n", 182 | "which is publicly available on the UCI Machine Learning Repository. The task is a classification\n", 183 | "problem, to predict whether a patient is diagnosed with breast cancer or not.\n", 184 | "\n", 185 | "When you're getting started with a machine learning problem you'll want to do\n", 186 | "something similar to this: import your data and get it in the right shape for\n", 187 | "your training. 
ZenML mostly gets out of your way when you're writing your Python\n",
188 |     "code, as you'll see from the following cell.\n",
189 |     "\n",
190 |     "![Feature engineering pipeline](.assets/feature_engineering_pipeline.png)"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": null,
196 |    "id": "3cd974d1",
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "@step\n",
201 |     "def data_loader_simplified(\n",
202 |     "    random_state: int, is_inference: bool = False, target: str = \"target\"\n",
203 |     ") -> Annotated[pd.DataFrame, \"dataset\"]:  # We name the dataset\n",
204 |     "    \"\"\"Dataset reader step.\"\"\"\n",
205 |     "    dataset = load_breast_cancer(as_frame=True)\n",
206 |     "    inference_size = int(len(dataset.target) * 0.05)\n",
207 |     "    dataset: pd.DataFrame = dataset.frame\n",
208 |     "    inference_subset = dataset.sample(inference_size, random_state=random_state)\n",
209 |     "    if is_inference:\n",
210 |     "        dataset = inference_subset\n",
211 |     "        dataset.drop(columns=target, inplace=True)\n",
212 |     "    else:\n",
213 |     "        dataset.drop(inference_subset.index, inplace=True)\n",
214 |     "    dataset.reset_index(drop=True, inplace=True)\n",
215 |     "    logger.info(f\"Dataset with {len(dataset)} records loaded!\")\n",
216 |     "    return dataset"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "markdown",
221 |    "id": "1e8ba4c6",
222 |    "metadata": {},
223 |    "source": [
224 |     "The whole function is decorated with the `@step` decorator, which\n",
225 |     "tells ZenML to track this function as a step in the pipeline. This means that\n",
226 |     "ZenML will automatically version, track, and cache the data that is produced by\n",
227 |     "this function as an `artifact`. This is a very powerful feature, as it means that you can\n",
228 |     "reproduce your data at any point in the future, even if the original data source\n",
229 |     "changes or disappears. \n",
230 |     "\n",
231 |     "Note the use of the `Annotated` type hint (from `typing_extensions`) in the output of the\n",
232 |     "step. We're using this to give a name to the output of the step, which will make\n",
233 |     "it possible to access it via a keyword later on.\n",
234 |     "\n",
235 |     "You'll also notice that we have included type hints for the outputs\n",
236 |     "to the function. These are not only useful for anyone reading your code, but also\n",
237 |     "help ZenML process your data in a way appropriate to the specific data types."
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "markdown",
242 |    "id": "b6286b67",
243 |    "metadata": {},
244 |    "source": [
245 |     "ZenML is built in a way that allows you to experiment with your data and build\n",
246 |     "your pipelines as you work, so if you want to call this function to see how it\n",
247 |     "works, you can just call it directly. Here we take a look at the first few rows\n",
248 |     "of your training dataset."
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": null,
254 |    "id": "d838e2ea",
255 |    "metadata": {},
256 |    "outputs": [],
257 |    "source": [
258 |     "df = data_loader_simplified(random_state=42)\n",
259 |     "df.head()"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "markdown",
264 |    "id": "28c05291",
265 |    "metadata": {},
266 |    "source": [
267 |     "Everything looks as we'd expect and the values are all in the right format 🥳.\n",
268 |     "\n",
269 |     "We're now at the point where we can bring this step (and some others) together into a single\n",
270 |     "pipeline, the top-level organising entity for code in ZenML. Creating such a pipeline is\n",
271 |     "as simple as adding a `@pipeline` decorator to a function. 
This specific\n", 272 | "pipeline doesn't return a value, but that option is available to you if you need." 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "id": "b50a9537", 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "@pipeline\n", 283 | "def feature_engineering(\n", 284 | " test_size: float = 0.3,\n", 285 | " drop_na: Optional[bool] = None,\n", 286 | " normalize: Optional[bool] = None,\n", 287 | " drop_columns: Optional[List[str]] = None,\n", 288 | " target: Optional[str] = \"target\",\n", 289 | " random_state: int = 17,\n", 290 | "):\n", 291 | " \"\"\"Feature engineering pipeline.\"\"\"\n", 292 | " # Link all the steps together by calling them and passing the output\n", 293 | " # of one step as the input of the next step.\n", 294 | " raw_data = data_loader(random_state=random_state, target=target)\n", 295 | " dataset_trn, dataset_tst = data_splitter(\n", 296 | " dataset=raw_data,\n", 297 | " test_size=test_size,\n", 298 | " )\n", 299 | " dataset_trn, dataset_tst, _ = data_preprocessor(\n", 300 | " dataset_trn=dataset_trn,\n", 301 | " dataset_tst=dataset_tst,\n", 302 | " drop_na=drop_na,\n", 303 | " normalize=normalize,\n", 304 | " drop_columns=drop_columns,\n", 305 | " target=target,\n", 306 | " random_state=random_state,\n", 307 | " )" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "id": "7cd73c23", 313 | "metadata": {}, 314 | "source": [ 315 | "We're ready to run the pipeline now, which we can do just as with the step - by calling the\n", 316 | "pipeline function itself:" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "id": "1e0aa9af", 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "feature_engineering()" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "id": "1785c303", 332 | "metadata": {}, 333 | "source": [ 334 | "Let's run this again with a slightly different test size, to create more datasets:" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "id": "658c0570-2607-4b97-a72d-d45c92633e48", 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "feature_engineering(test_size=0.25)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "id": "64bb7206", 350 | "metadata": {}, 351 | "source": [ 352 | "Notice the second time around, the data loader step was **cached**, while the rest of the pipeline was rerun. \n", 353 | "This is because ZenML automatically determined that nothing had changed in the data loader step, \n", 354 | "so it didn't need to rerun it." 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "id": "5bc6849d-31ac-4c08-9ca2-cf7f5f35ccbf", 360 | "metadata": {}, 361 | "source": [ 362 | "Let's run this again with a slightly different test size and random state, to disable the cache and to create more datasets:" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "id": "1e1d8546", 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "feature_engineering(test_size=0.25, random_state=104)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "id": "6c42078a", 378 | "metadata": {}, 379 | "source": [ 380 | "At this point you might be interested to view your pipeline runs in the ZenML\n", 381 | "Dashboard. In case you are not using a hosted instance of ZenML, you can spin this up by executing the next cell. 
This will start a\n", 382 | "server which you can access by clicking on the link that appears in the output\n", 383 | "of the cell.\n", 384 | "\n", 385 | "Log into the Dashboard using default credentials (username 'default' and\n", 386 | "password left blank). From there you can inspect the pipeline or the specific\n", 387 | "pipeline run.\n" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "id": "8cd3cc8c", 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "from zenml.environment import Environment\n", 398 | "from zenml.zen_stores.rest_zen_store import RestZenStore\n", 399 | "\n", 400 | "if not isinstance(client.zen_store, RestZenStore):\n", 401 | " # Only spin up a local Dashboard in case you aren't already connected to a remote server\n", 402 | " if Environment.in_google_colab():\n", 403 | " # run ZenML through a cloudflare tunnel to get a public endpoint\n", 404 | " !zenml login --local --port 8237 & cloudflared tunnel --url http://localhost:8237\n", 405 | " else:\n", 406 | " !zenml login --local" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "id": "e8471f93", 412 | "metadata": {}, 413 | "source": [ 414 | "We can also fetch the pipeline from the server and view the results directly in the notebook:" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "id": "f208b200", 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "client = Client()\n", 425 | "run = client.get_pipeline(\"feature_engineering\").last_run\n", 426 | "print(run.name)" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "id": "a037f09d", 432 | "metadata": {}, 433 | "source": [ 434 | "We can also see the data artifacts that were produced by the last step of the pipeline:" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "id": "34283e89", 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "run.steps[\"data_preprocessor\"].outputs" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "id": "bceb0312", 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "# Read one of the datasets. This is the one with a 0.25 test split\n", 455 | "run.steps[\"data_preprocessor\"].outputs[\"dataset_trn\"].load()" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "id": "26d26436", 461 | "metadata": {}, 462 | "source": [ 463 | "We can also get the artifacts directly. 
Each time you create a new pipeline run, a new `artifact version` is created.\n",
464 |     "\n",
465 |     "You can fetch these artifacts and their versions using the `client`: "
466 |    ]
467 |   },
468 |   {
469 |    "cell_type": "code",
470 |    "execution_count": null,
471 |    "id": "c8f90647",
472 |    "metadata": {},
473 |    "outputs": [],
474 |    "source": [
475 |     "# Get artifact version from our run\n",
476 |     "dataset_trn_artifact_version_via_run = run.steps[\"data_preprocessor\"].outputs[\n",
477 |     "    \"dataset_trn\"\n",
478 |     "]\n",
479 |     "\n",
480 |     "# Get latest version from client directly\n",
481 |     "dataset_trn_artifact_version = client.get_artifact_version(\"dataset_trn\")\n",
482 |     "\n",
483 |     "# This should be true if our run is the latest run and no artifact has been produced\n",
484 |     "# in the intervening time\n",
485 |     "dataset_trn_artifact_version_via_run.id == dataset_trn_artifact_version.id"
486 |    ]
487 |   },
488 |   {
489 |    "cell_type": "code",
490 |    "execution_count": null,
491 |    "id": "3f9d3dfd",
492 |    "metadata": {},
493 |    "outputs": [],
494 |    "source": [
495 |     "# Fetch the rest of the artifacts\n",
496 |     "dataset_tst_artifact_version = client.get_artifact_version(\"dataset_tst\")\n",
497 |     "preprocessing_pipeline_artifact_version = client.get_artifact_version(\n",
498 |     "    \"preprocess_pipeline\"\n",
499 |     ")"
500 |    ]
501 |   },
502 |   {
503 |    "cell_type": "markdown",
504 |    "id": "7a7d1b04",
505 |    "metadata": {},
506 |    "source": [
507 |     "If you started with a fresh install, then you would have two versions corresponding\n",
508 |     "to the two pipelines that we ran above. We can even load an artifact version into memory: "
509 |    ]
510 |   },
511 |   {
512 |    "cell_type": "code",
513 |    "execution_count": null,
514 |    "id": "c82aca75",
515 |    "metadata": {},
516 |    "outputs": [],
517 |    "source": [
518 |     "# Load an artifact to verify you can fetch it\n",
519 |     "dataset_trn_artifact_version.load()"
520 |    ]
521 |   },
522 |   {
523 |    "cell_type": "markdown",
524 |    "id": "5963509e",
525 |    "metadata": {},
526 |    "source": [
527 |     "We'll use these artifacts from above in our next pipeline."
528 |    ]
529 |   },
530 |   {
531 |    "cell_type": "markdown",
532 |    "id": "8c28b474",
533 |    "metadata": {},
534 |    "source": [
535 |     "# ⌚ Step 3: Training pipeline"
536 |    ]
537 |   },
538 |   {
539 |    "cell_type": "markdown",
540 |    "id": "87909827",
541 |    "metadata": {},
542 |    "source": [
543 |     "Now that we have our data, it makes sense to train some models to get a sense of\n",
544 |     "how difficult the task is. The Breast Cancer dataset is sufficiently large and complex \n",
545 |     "that it's unlikely we'll be able to train a model that behaves perfectly, \n",
546 |     "but we can get a sense of what a reasonable baseline looks like.\n",
547 |     "\n",
548 |     "We'll start with two simple models, an SGD Classifier and a Random Forest\n",
549 |     "Classifier, both batteries-included from `sklearn`. 
We'll train them both on the\n",
550 |     "same data and then compare their performance.\n",
551 |     "\n",
552 |     "![Training pipeline](.assets/training_pipeline.png)"
553 |    ]
554 |   },
555 |   {
556 |    "cell_type": "code",
557 |    "execution_count": null,
558 |    "id": "fccf1bd9",
559 |    "metadata": {},
560 |    "outputs": [],
561 |    "source": [
562 |     "import pandas as pd\n",
563 |     "from sklearn.base import ClassifierMixin\n",
564 |     "from sklearn.ensemble import RandomForestClassifier\n",
565 |     "from sklearn.linear_model import SGDClassifier\n",
566 |     "from typing_extensions import Annotated\n",
567 |     "from zenml import ArtifactConfig, step\n",
568 |     "from zenml.logger import get_logger\n",
569 |     "\n",
570 |     "logger = get_logger(__name__)\n",
571 |     "\n",
572 |     "\n",
573 |     "@step\n",
574 |     "def model_trainer(\n",
575 |     "    dataset_trn: pd.DataFrame,\n",
576 |     "    model_type: str = \"sgd\",\n",
577 |     ") -> Annotated[\n",
578 |     "    ClassifierMixin, ArtifactConfig(name=\"sklearn_classifier\", is_model_artifact=True)\n",
579 |     "]:\n",
580 |     "    \"\"\"Configure and train a model on the training dataset.\"\"\"\n",
581 |     "    target = \"target\"\n",
582 |     "    if model_type == \"sgd\":\n",
583 |     "        model = SGDClassifier()\n",
584 |     "    elif model_type == \"rf\":\n",
585 |     "        model = RandomForestClassifier()\n",
586 |     "    else:\n",
587 |     "        raise ValueError(f\"Unknown model type {model_type}\")\n",
588 |     "\n",
589 |     "    logger.info(f\"Training model {model}...\")\n",
590 |     "\n",
591 |     "    model.fit(\n",
592 |     "        dataset_trn.drop(columns=[target]),\n",
593 |     "        dataset_trn[target],\n",
594 |     "    )\n",
595 |     "    return model"
596 |    ]
597 |   },
598 |   {
599 |    "cell_type": "markdown",
600 |    "id": "73a00008",
601 |    "metadata": {},
602 |    "source": [
603 |     "Our training step can return two different kinds of `sklearn` classifier\n",
604 |     "models, so we use the generic `ClassifierMixin` type hint for the return type."
605 |    ]
606 |   },
607 |   {
608 |    "cell_type": "markdown",
609 |    "id": "a5f22174",
610 |    "metadata": {},
611 |    "source": [
612 |     "ZenML allows you to load any version of any dataset that is tracked by the framework\n",
613 |     "directly into a pipeline using the `Client().get_artifact_version` interface. This is very convenient\n",
614 |     "in this case, as we'd like to send our preprocessed dataset from the older pipeline directly\n",
615 |     "into the training pipeline."
616 |    ]
617 |   },
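  {
   "cell_type": "markdown",
   "id": "b3a2e1c0",
   "metadata": {},
   "source": [
    "If you want to pin an exact dataset version instead of the latest one, `get_artifact_version` also accepts a version name as a second argument; this is how `run.py` in this template loads versioned datasets. A minimal sketch (the version string `\"1\"` is only a placeholder; substitute a version name from your own runs):\n",
    "\n",
    "```python\n",
    "dataset_trn_v1 = client.get_artifact_version(\"dataset_trn\", \"1\")\n",
    "```"
   ]
  },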
618 |   {
619 |    "cell_type": "code",
620 |    "execution_count": null,
621 |    "id": "1aa98f2f",
622 |    "metadata": {},
623 |    "outputs": [],
624 |    "source": [
625 |     "@pipeline\n",
626 |     "def training(\n",
627 |     "    train_dataset_id: Optional[UUID] = None,\n",
628 |     "    test_dataset_id: Optional[UUID] = None,\n",
629 |     "    model_type: str = \"sgd\",\n",
630 |     "    min_train_accuracy: float = 0.0,\n",
631 |     "    min_test_accuracy: float = 0.0,\n",
632 |     "):\n",
633 |     "    \"\"\"Model training pipeline.\"\"\"\n",
634 |     "    if train_dataset_id is None or test_dataset_id is None:\n",
635 |     "        # If we don't pass the IDs, this will run the feature engineering pipeline\n",
636 |     "        dataset_trn, dataset_tst = feature_engineering()\n",
637 |     "    else:\n",
638 |     "        # Load the datasets from an older pipeline\n",
639 |     "        dataset_trn = client.get_artifact_version(name_id_or_prefix=train_dataset_id)\n",
640 |     "        dataset_tst = client.get_artifact_version(name_id_or_prefix=test_dataset_id)\n",
641 |     "\n",
642 |     "    trained_model = model_trainer(\n",
643 |     "        dataset_trn=dataset_trn,\n",
644 |     "        model_type=model_type,\n",
645 |     "    )\n",
646 |     "\n",
647 |     "    model_evaluator(\n",
648 |     "        model=trained_model,\n",
649 |     "        dataset_trn=dataset_trn,\n",
650 |     "        dataset_tst=dataset_tst,\n",
651 |     "        min_train_accuracy=min_train_accuracy,\n",
652 |     "        min_test_accuracy=min_test_accuracy,\n",
653 |     "    )"
654 |    ]
655 |   },
656 |   {
657 |    "cell_type": "markdown",
658 |    "id": "88b70fd3",
659 |    "metadata": {},
660 |    "source": [
661 |     "The end goal of this quick baseline evaluation is to understand which of the two\n",
662 |     "models performs better. We'll use the `model_evaluator` step to compare the two\n",
663 |     "models. This step takes in the model from the trainer step, and computes its score\n",
664 |     "over the testing set."
665 |    ]
666 |   },
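  {
   "cell_type": "markdown",
   "id": "a7e4c9d1",
   "metadata": {},
   "source": [
    "For reference, the imported `model_evaluator` step (defined in `steps/model_evaluator.py`) essentially computes `sklearn` accuracy scores on both splits and returns the test accuracy, roughly:\n",
    "\n",
    "```python\n",
    "trn_acc = model.score(dataset_trn.drop(columns=[\"target\"]), dataset_trn[\"target\"])\n",
    "tst_acc = model.score(dataset_tst.drop(columns=[\"target\"]), dataset_tst[\"target\"])\n",
    "```\n",
    "\n",
    "It also logs a warning if either accuracy falls below the configured minimum thresholds."
   ]
  },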
667 |   {
668 |    "cell_type": "code",
669 |    "execution_count": null,
670 |    "id": "c64885ac",
671 |    "metadata": {},
672 |    "outputs": [],
673 |    "source": [
674 |     "# Use a random forest model with the chosen datasets.\n",
675 |     "# We need to pass the IDs of the datasets into the function\n",
676 |     "training(\n",
677 |     "    model_type=\"rf\",\n",
678 |     "    train_dataset_id=dataset_trn_artifact_version.id,\n",
679 |     "    test_dataset_id=dataset_tst_artifact_version.id,\n",
680 |     ")\n",
681 |     "\n",
682 |     "rf_run = client.get_pipeline(\"training\").last_run"
683 |    ]
684 |   },
685 |   {
686 |    "cell_type": "code",
687 |    "execution_count": null,
688 |    "id": "4300c82f",
689 |    "metadata": {},
690 |    "outputs": [],
691 |    "source": [
692 |     "# Use an SGD classifier\n",
693 |     "training(\n",
694 |     "    model_type=\"sgd\",\n",
695 |     "    train_dataset_id=dataset_trn_artifact_version.id,\n",
696 |     "    test_dataset_id=dataset_tst_artifact_version.id,\n",
697 |     ")\n",
698 |     "\n",
699 |     "sgd_run = client.get_pipeline(\"training\").last_run"
700 |    ]
701 |   },
702 |   {
703 |    "cell_type": "markdown",
704 |    "id": "43f1a68a",
705 |    "metadata": {},
706 |    "source": [
707 |     "You can see from the logs already how our model training went: the\n",
708 |     "`RandomForestClassifier` performed considerably better than the `SGDClassifier`.\n",
709 |     "We can use the ZenML `Client` to verify this:"
710 |    ]
711 |   },
712 |   {
713 |    "cell_type": "code",
714 |    "execution_count": null,
715 |    "id": "d95810b1",
716 |    "metadata": {},
717 |    "outputs": [],
718 |    "source": [
719 |     "# The evaluator returns a float value with the accuracy\n",
720 |     "rf_run.steps[\"model_evaluator\"].output.load() > sgd_run.steps[\n",
721 |     "    \"model_evaluator\"\n",
722 |     "].output.load()"
723 |    ]
724 |   },
725 |   {
726 |    "cell_type": "markdown",
727 |    "id": "e256d145",
728 |    "metadata": {},
729 |    "source": [
730 |     "# 💯 Step 4: Associating a model with your pipeline"
731 |    ]
732 |   },
733 |   {
734 |    "cell_type": "markdown",
735 |    "id": "927978f3",
736 |    "metadata": {},
737 |    "source": [
738 |     "You can see it is relatively easy to train ML models using ZenML pipelines. But it can be somewhat clunky to track\n",
739 |     "all the models produced as you develop your experiments and use cases. 
Luckily, ZenML offers a *Model Control Plane*,\n",
740 |     "which is a central register of all your ML models.\n",
741 |     "\n",
742 |     "You can easily create a ZenML Model and associate it with your pipelines using the `Model` object:"
743 |    ]
744 |   },
745 |   {
746 |    "cell_type": "code",
747 |    "execution_count": null,
748 |    "id": "99ca00c0",
749 |    "metadata": {},
750 |    "outputs": [],
751 |    "source": [
752 |     "pipeline_settings = {}\n",
753 |     "\n",
754 |     "# Let's add some metadata to the model to make it identifiable\n",
755 |     "pipeline_settings[\"model\"] = Model(\n",
756 |     "    name=\"breast_cancer_classifier\",\n",
757 |     "    license=\"Apache 2.0\",\n",
758 |     "    description=\"A breast cancer classifier\",\n",
759 |     "    tags=[\"breast_cancer\", \"classifier\"],\n",
760 |     ")"
761 |    ]
762 |   },
763 |   {
764 |    "cell_type": "code",
765 |    "execution_count": null,
766 |    "id": "0e78a520",
767 |    "metadata": {},
768 |    "outputs": [],
769 |    "source": [
770 |     "# Let's train the SGD model and set the version name to \"sgd\"\n",
771 |     "pipeline_settings[\"model\"].version = \"sgd\"\n",
772 |     "\n",
773 |     "# the `with_options` method allows us to pass in pipeline settings\n",
774 |     "# and returns a configured pipeline\n",
775 |     "training_configured = training.with_options(**pipeline_settings)\n",
776 |     "\n",
777 |     "# We can now run this as usual\n",
778 |     "training_configured(\n",
779 |     "    model_type=\"sgd\",\n",
780 |     "    train_dataset_id=dataset_trn_artifact_version.id,\n",
781 |     "    test_dataset_id=dataset_tst_artifact_version.id,\n",
782 |     ")"
783 |    ]
784 |   },
785 |   {
786 |    "cell_type": "code",
787 |    "execution_count": null,
788 |    "id": "9b8e0002",
789 |    "metadata": {},
790 |    "outputs": [],
791 |    "source": [
792 |     "# Let's train the RF model and set the version name to \"rf\"\n",
793 |     "pipeline_settings[\"model\"].version = \"rf\"\n",
794 |     "\n",
795 |     "# the `with_options` method allows us to pass in pipeline settings\n",
796 |     "# and returns a configured pipeline\n",
797 |     "training_configured = training.with_options(**pipeline_settings)\n",
798 |     "\n",
799 |     "# Let's run it again to make sure we have two versions\n",
800 |     "training_configured(\n",
801 |     "    model_type=\"rf\",\n",
802 |     "    train_dataset_id=dataset_trn_artifact_version.id,\n",
803 |     "    test_dataset_id=dataset_tst_artifact_version.id,\n",
804 |     ")"
805 |    ]
806 |   },
807 |   {
808 |    "cell_type": "markdown",
809 |    "id": "09597223",
810 |    "metadata": {},
811 |    "source": [
812 |     "This time, running the pipeline twice has created two associated **model versions**.\n",
813 |     "You can list your ZenML models and their versions as follows:"
814 |    ]
815 |   },
816 |   {
817 |    "cell_type": "code",
818 |    "execution_count": null,
819 |    "id": "fbb25913",
820 |    "metadata": {},
821 |    "outputs": [],
822 |    "source": [
823 |     "zenml_model = client.get_model(\"breast_cancer_classifier\")\n",
824 |     "print(zenml_model)\n",
825 |     "\n",
826 |     "print(f\"Model {zenml_model.name} has {len(zenml_model.versions)} versions\")\n",
827 |     "\n",
828 |     "zenml_model.versions[0].version, zenml_model.versions[1].version"
829 |    ]
830 |   },
831 |   {
832 |    "cell_type": "markdown",
833 |    "id": "e82cfac2",
834 |    "metadata": {},
835 |    "source": [
836 |     "The interesting part is that ZenML went ahead and linked all artifacts produced by the\n",
837 |     "pipelines to that model version, including the two pickle files that represent our\n",
838 |     "SGD and RandomForest classifier. 
We can see all artifacts directly from the model\n",
839 |     "version object:"
840 |    ]
841 |   },
842 |   {
843 |    "cell_type": "code",
844 |    "execution_count": null,
845 |    "id": "31211413",
846 |    "metadata": {},
847 |    "outputs": [],
848 |    "source": [
849 |     "# Let's load the RF version\n",
850 |     "rf_zenml_model_version = client.get_model_version(\"breast_cancer_classifier\", \"rf\")\n",
851 |     "\n",
852 |     "# We can now load our classifier directly as well\n",
853 |     "random_forest_classifier = rf_zenml_model_version.get_artifact(\n",
854 |     "    \"sklearn_classifier\"\n",
855 |     ").load()\n",
856 |     "\n",
857 |     "random_forest_classifier"
858 |    ]
859 |   },
860 |   {
861 |    "cell_type": "markdown",
862 |    "id": "53517a9a",
863 |    "metadata": {},
864 |    "source": [
865 |     "If you are a [ZenML Pro](https://zenml.io/pro) user, you can see all of this visualized in the dashboard:\n",
866 |     "\n",
867 |     "![Model Control Plane screenshot](.assets/cloud_mcp_screenshot.png)"
868 |    ]
869 |   },
870 |   {
871 |    "cell_type": "markdown",
872 |    "id": "eb645dde",
873 |    "metadata": {},
874 |    "source": [
875 |     "There is a lot more you can do with ZenML models, including the ability to\n",
876 |     "track metrics by adding metadata to them, or having them persist in a model\n",
877 |     "registry. However, these topics can be explored more in the\n",
878 |     "[ZenML docs](https://docs.zenml.io).\n",
879 |     "\n",
880 |     "For now, we will use the ZenML model control plane to promote our best\n",
881 |     "model to `production`. You can do this by simply setting the `stage` of\n",
882 |     "your chosen model version to `production`."
883 |    ]
884 |   },
885 |   {
886 |    "cell_type": "code",
887 |    "execution_count": null,
888 |    "id": "26b718f8",
889 |    "metadata": {},
890 |    "outputs": [],
891 |    "source": [
892 |     "# Set our best classifier to production\n",
893 |     "rf_zenml_model_version.set_stage(\"production\", force=True)"
894 |    ]
895 |   },
896 |   {
897 |    "cell_type": "markdown",
898 |    "id": "9fddf3d0",
899 |    "metadata": {},
900 |    "source": [
901 |     "Of course, normally one would only promote the model after comparing it to all other model\n",
902 |     "versions and running some additional tests. But that's a more advanced use case. See the\n",
903 |     "[e2e_batch example](https://github.com/zenml-io/zenml/tree/main/examples/e2e) to get\n",
904 |     "more insight into that sort of flow!"
905 |    ]
906 |   },
907 |   {
908 |    "cell_type": "markdown",
909 |    "id": "2ecbc8cf",
910 |    "metadata": {},
911 |    "source": [
912 |     "![Model versions in the Model Control Plane](.assets/cloud_mcp.png)"
913 |    ]
914 |   },
915 |   {
916 |    "cell_type": "markdown",
917 |    "id": "8f1146db",
918 |    "metadata": {},
919 |    "source": [
920 |     "Once the model is promoted, we can consume the right model version in our\n",
921 |     "batch inference pipeline directly. Let's see how that works."
922 |    ]
923 |   },
924 |   {
925 |    "cell_type": "markdown",
926 |    "id": "d6306f14",
927 |    "metadata": {},
928 |    "source": [
929 |     "# 🫅 Step 5: Consuming the model in production"
930 |    ]
931 |   },
932 |   {
933 |    "cell_type": "markdown",
934 |    "id": "b51f3108",
935 |    "metadata": {},
936 |    "source": [
937 |     "The batch inference pipeline simply takes the model marked as `production` and runs inference on it\n",
938 |     "with `live data`. 
The critical step here is the `inference_predict` step, where we load the model in memory\n",
939 |     "and generate predictions:\n",
940 |     "\n",
941 |     "![Inference pipeline](.assets/inference_pipeline.png)"
942 |    ]
943 |   },
944 |   {
945 |    "cell_type": "code",
946 |    "execution_count": null,
947 |    "id": "92c4c7dc",
948 |    "metadata": {},
949 |    "outputs": [],
950 |    "source": [
951 |     "@step\n",
952 |     "def inference_predict(dataset_inf: pd.DataFrame) -> Annotated[pd.Series, \"predictions\"]:\n",
953 |     "    \"\"\"Predictions step.\"\"\"\n",
954 |     "    # Get the model\n",
955 |     "    model = get_step_context().model\n",
956 |     "\n",
957 |     "    # run prediction from memory\n",
958 |     "    predictor = model.load_artifact(\"sklearn_classifier\")\n",
959 |     "    predictions = predictor.predict(dataset_inf)\n",
960 |     "\n",
961 |     "    predictions = pd.Series(predictions, name=\"predicted\")\n",
962 |     "\n",
963 |     "    return predictions"
964 |    ]
965 |   },
966 |   {
967 |    "cell_type": "markdown",
968 |    "id": "3aeb227b",
969 |    "metadata": {},
970 |    "source": [
971 |     "Apart from loading the model, we must also load the preprocessing pipeline that we ran during feature engineering,\n",
972 |     "so that we can apply at inference time exactly the same steps that we applied at training time. Let's bring it all together:"
973 |    ]
974 |   },
975 |   {
976 |    "cell_type": "code",
977 |    "execution_count": null,
978 |    "id": "37c409bd",
979 |    "metadata": {},
980 |    "outputs": [],
981 |    "source": [
982 |     "@pipeline\n",
983 |     "def inference(preprocess_pipeline_id: UUID):\n",
984 |     "    \"\"\"Model batch inference pipeline\"\"\"\n",
985 |     "    # random_state = client.get_artifact_version(name_id_or_prefix=preprocess_pipeline_id).metadata[\"random_state\"]\n",
986 |     "    # target = client.get_artifact_version(name_id_or_prefix=preprocess_pipeline_id).run_metadata['target']\n",
987 |     "    random_state = 42\n",
988 |     "    target = \"target\"\n",
989 |     "\n",
990 |     "    df_inference = data_loader(random_state=random_state, is_inference=True)\n",
991 |     "    df_inference = inference_preprocessor(\n",
992 |     "        dataset_inf=df_inference,\n",
993 |     "        # We use the preprocess pipeline from the feature engineering pipeline\n",
994 |     "        preprocess_pipeline=client.get_artifact_version(\n",
995 |     "            name_id_or_prefix=preprocess_pipeline_id\n",
996 |     "        ),\n",
997 |     "        target=target,\n",
998 |     "    )\n",
999 |     "    inference_predict(\n",
1000 |     "        dataset_inf=df_inference,\n",
1001 |     "    )"
1002 |    ]
1003 |   },
1004 |   {
1005 |    "cell_type": "markdown",
1006 |    "id": "c7afe7be",
1007 |    "metadata": {},
1008 |    "source": [
1009 |     "The way to load the right model is to pass the `production` stage into the `Model` config this time.\n",
1010 |     "This ensures that the production model is always loaded, decoupled from all other pipelines:"
1011 |    ]
1012 |   },
1013 |   {
1014 |    "cell_type": "code",
1015 |    "execution_count": null,
1016 |    "id": "61bf5939",
1017 |    "metadata": {},
1018 |    "outputs": [],
1019 |    "source": [
1020 |     "pipeline_settings = {\"enable_cache\": False}\n",
1021 |     "\n",
1022 |     "# Let's add some metadata to the model to make it identifiable\n",
1023 |     "pipeline_settings[\"model\"] = Model(\n",
1024 |     "    name=\"breast_cancer_classifier\",\n",
1025 |     "    version=\"production\",  # We can pass in the stage name here!\n",
1026 |     "    license=\"Apache 2.0\",\n",
1027 |     "    description=\"A breast cancer classifier\",\n",
1028 |     "    tags=[\"breast_cancer\", \"classifier\"],\n",
1029 |     ")"
1030 |    ]
1031 |   },
1032 |   {
1033 |    "cell_type": "code",
1034 |    "execution_count": null,
1035 |    "id": "ff3402f1",
1036 |    "metadata": {},
1037 |    "outputs": [],
1038 |    "source": [
1039 |     "# the 
`with_options` method allows us to pass in pipeline settings\n",
1040 |     "# and returns a configured pipeline\n",
1041 |     "inference_configured = inference.with_options(**pipeline_settings)\n",
1042 |     "\n",
1043 |     "# Run the batch inference pipeline\n",
1044 |     "# We need to pass in the ID of the preprocessing done in the feature engineering pipeline\n",
1045 |     "# in order to avoid training-serving skew\n",
1046 |     "inference_configured(preprocess_pipeline_id=preprocessing_pipeline_artifact_version.id)"
1047 |    ]
1048 |   },
1049 |   {
1050 |    "cell_type": "markdown",
1051 |    "id": "2935d1fa",
1052 |    "metadata": {},
1053 |    "source": [
1054 |     "ZenML automatically links all artifacts to the `production` model version as well, including the predictions\n",
1055 |     "that were returned in the pipeline. This completes the MLOps loop of training to inference:"
1056 |    ]
1057 |   },
1058 |   {
1059 |    "cell_type": "code",
1060 |    "execution_count": null,
1061 |    "id": "e191d019",
1062 |    "metadata": {},
1063 |    "outputs": [],
1064 |    "source": [
1065 |     "# Fetch production model\n",
1066 |     "production_model_version = client.get_model_version(\n",
1067 |     "    \"breast_cancer_classifier\", \"production\"\n",
1068 |     ")\n",
1069 |     "\n",
1070 |     "# Get the predictions artifact\n",
1071 |     "production_model_version.get_artifact(\"predictions\").load()"
1072 |    ]
1073 |   },
1074 |   {
1075 |    "cell_type": "markdown",
1076 |    "id": "b0a73cdf",
1077 |    "metadata": {},
1078 |    "source": [
1079 |     "You can also see all predictions ever created as a complete history in the dashboard:\n",
1080 |     "\n",
1081 |     "![Model predictions history](.assets/cloud_mcp_predictions.png)"
1082 |    ]
1083 |   },
1084 |   {
1085 |    "cell_type": "markdown",
1086 |    "id": "594ee4fc-f102-4b99-bdc3-2f1670c87679",
1087 |    "metadata": {},
1088 |    "source": [
1089 |     "## Congratulations!\n",
1090 |     "\n",
1091 |     "You're a legit MLOps engineer now! You trained two models, evaluated them against\n",
1092 |     "a test set, registered the best one with the ZenML model control plane,\n",
1093 |     "and served some predictions. You also learned how to iterate on your models and\n",
1094 |     "data by using some of the ZenML utility abstractions. You saw how to view your\n",
1095 |     "artifacts and models via the client as well as the ZenML Dashboard.\n",
1096 |     "\n",
1097 |     "## Further exploration\n",
1098 |     "\n",
1099 |     "This was just the tip of the iceberg of what ZenML can do; check out the [**docs**](https://docs.zenml.io/) to learn more\n",
1100 |     "about the capabilities of ZenML. For example, you might want to:\n",
1101 |     "\n",
1102 |     "- [Deploy ZenML](https://docs.zenml.io/user-guides/production-guide/connect-deployed-zenml) to collaborate with your colleagues.\n",
1103 |     "- Run the same pipeline on a [cloud MLOps stack in production](https://docs.zenml.io/user-guides/production-guide/cloud-stack).\n",
1104 |     "- Track your metrics in an experiment tracker like [MLflow](https://docs.zenml.io/stacks-and-components/component-guide/experiment-trackers/mlflow).\n",
1105 |     "\n",
1106 |     "## What next?\n",
1107 |     "\n",
1108 |     "* If you have questions or feedback... join our [**Slack Community**](https://zenml.io/slack) and become part of the ZenML family!\n",
1109 |     "* If you want to quickly get started with ZenML, check out [ZenML Pro](https://zenml.io/pro)."
1110 | ] 1111 | } 1112 | ], 1113 | "metadata": { 1114 | "kernelspec": { 1115 | "display_name": "Python 3 (ipykernel)", 1116 | "language": "python", 1117 | "name": "python3" 1118 | }, 1119 | "language_info": { 1120 | "codemirror_mode": { 1121 | "name": "ipython", 1122 | "version": 3 1123 | }, 1124 | "file_extension": ".py", 1125 | "mimetype": "text/x-python", 1126 | "name": "python", 1127 | "nbconvert_exporter": "python", 1128 | "pygments_lexer": "ipython3", 1129 | "version": "3.11.3" 1130 | } 1131 | }, 1132 | "nbformat": 4, 1133 | "nbformat_minor": 5 1134 | } 1135 | -------------------------------------------------------------------------------- /template/requirements.txt: -------------------------------------------------------------------------------- 1 | zenml[server]>=0.50.0 2 | notebook 3 | scikit-learn 4 | pyarrow 5 | pandas 6 | -------------------------------------------------------------------------------- /template/run.py: -------------------------------------------------------------------------------- 1 | # {% include 'template/license_header' %} 2 | 3 | import os 4 | from typing import Optional 5 | 6 | import click 7 | import yaml 8 | from pipelines import ( 9 | feature_engineering, 10 | inference, 11 | training, 12 | ) 13 | from zenml.client import Client 14 | from zenml.logger import get_logger 15 | 16 | logger = get_logger(__name__) 17 | 18 | 19 | @click.command( 20 | help=""" 21 | ZenML Starter project. 22 | 23 | Run the ZenML starter project with basic options. 24 | 25 | Examples: 26 | 27 | \b 28 | # Run the feature engineering pipeline 29 | python run.py --feature-pipeline 30 | 31 | \b 32 | # Run the training pipeline 33 | python run.py --training-pipeline 34 | 35 | \b 36 | # Run the training pipeline with versioned artifacts 37 | python run.py --training-pipeline --train-dataset-version-name=1 --test-dataset-version-name=1 38 | 39 | \b 40 | # Run the inference pipeline 41 | python run.py --inference-pipeline 42 | 43 | """ 44 | ) 45 | @click.option( 46 | "--train-dataset-name", 47 | default="dataset_trn", 48 | type=click.STRING, 49 | help="The name of the train dataset produced by feature engineering.", 50 | ) 51 | @click.option( 52 | "--train-dataset-version-name", 53 | default=None, 54 | type=click.STRING, 55 | help="Version of the train dataset produced by feature engineering. " 56 | "If not specified, a new version will be created.", 57 | ) 58 | @click.option( 59 | "--test-dataset-name", 60 | default="dataset_tst", 61 | type=click.STRING, 62 | help="The name of the test dataset produced by feature engineering.", 63 | ) 64 | @click.option( 65 | "--test-dataset-version-name", 66 | default=None, 67 | type=click.STRING, 68 | help="Version of the test dataset produced by feature engineering. 
" 69 | "If not specified, a new version will be created.", 70 | ) 71 | @click.option( 72 | "--feature-pipeline", 73 | is_flag=True, 74 | default=False, 75 | help="Whether to run the pipeline that creates the dataset.", 76 | ) 77 | @click.option( 78 | "--training-pipeline", 79 | is_flag=True, 80 | default=False, 81 | help="Whether to run the pipeline that trains the model.", 82 | ) 83 | @click.option( 84 | "--inference-pipeline", 85 | is_flag=True, 86 | default=False, 87 | help="Whether to run the pipeline that performs inference.", 88 | ) 89 | @click.option( 90 | "--no-cache", 91 | is_flag=True, 92 | default=False, 93 | help="Disable caching for the pipeline run.", 94 | ) 95 | def main( 96 | train_dataset_name: str = "dataset_trn", 97 | train_dataset_version_name: Optional[str] = None, 98 | test_dataset_name: str = "dataset_tst", 99 | test_dataset_version_name: Optional[str] = None, 100 | feature_pipeline: bool = False, 101 | training_pipeline: bool = False, 102 | inference_pipeline: bool = False, 103 | no_cache: bool = False, 104 | ): 105 | """Main entry point for the pipeline execution. 106 | 107 | This entrypoint is where everything comes together: 108 | 109 | * configuring pipeline with the required parameters 110 | (some of which may come from command line arguments, but most 111 | of which comes from the YAML config files) 112 | * launching the pipeline 113 | 114 | Args: 115 | train_dataset_name: The name of the train dataset produced by feature engineering. 116 | train_dataset_version_name: Version of the train dataset produced by feature engineering. 117 | If not specified, a new version will be created. 118 | test_dataset_name: The name of the test dataset produced by feature engineering. 119 | test_dataset_version_name: Version of the test dataset produced by feature engineering. 120 | If not specified, a new version will be created. 121 | feature_pipeline: Whether to run the pipeline that creates the dataset. 122 | training_pipeline: Whether to run the pipeline that trains the model. 123 | inference_pipeline: Whether to run the pipeline that performs inference. 124 | no_cache: If `True` cache will be disabled. 125 | """ 126 | client = Client() 127 | 128 | config_folder = os.path.join( 129 | os.path.dirname(os.path.realpath(__file__)), 130 | "configs", 131 | ) 132 | 133 | # Execute Feature Engineering Pipeline 134 | if feature_pipeline: 135 | pipeline_args = {} 136 | if no_cache: 137 | pipeline_args["enable_cache"] = False 138 | pipeline_args["config_path"] = os.path.join( 139 | config_folder, "feature_engineering.yaml" 140 | ) 141 | run_args_feature = {} 142 | feature_engineering.with_options(**pipeline_args)(**run_args_feature) 143 | logger.info("Feature Engineering pipeline finished successfully!\n") 144 | 145 | train_dataset_artifact = client.get_artifact_version(train_dataset_name) 146 | test_dataset_artifact = client.get_artifact_version(test_dataset_name) 147 | logger.info( 148 | "The latest feature engineering pipeline produced the following " 149 | f"artifacts: \n\n1. Train Dataset - Name: {train_dataset_name}, " 150 | f"Version Name: {train_dataset_artifact.version} \n2. 
Test Dataset: " 151 | f"Name: {test_dataset_name}, Version Name: {test_dataset_artifact.version}" 152 | ) 153 | 154 | # Execute Training Pipeline 155 | if training_pipeline: 156 | run_args_train = {} 157 | 158 | # If train_dataset_version_name is specified, use versioned artifacts 159 | if train_dataset_version_name or test_dataset_version_name: 160 | # However, both train and test dataset versions must be specified 161 | assert ( 162 | train_dataset_version_name is not None 163 | and test_dataset_version_name is not None 164 | ) 165 | train_dataset_artifact_version = client.get_artifact_version( 166 | train_dataset_name, train_dataset_version_name 167 | ) 168 | # If train dataset is specified, test dataset must be specified 169 | test_dataset_artifact_version = client.get_artifact_version( 170 | test_dataset_name, test_dataset_version_name 171 | ) 172 | # Use versioned artifacts 173 | run_args_train["train_dataset_id"] = train_dataset_artifact_version.id 174 | run_args_train["test_dataset_id"] = test_dataset_artifact_version.id 175 | 176 | # Run the SGD pipeline 177 | pipeline_args = {} 178 | if no_cache: 179 | pipeline_args["enable_cache"] = False 180 | pipeline_args["config_path"] = os.path.join(config_folder, "training_sgd.yaml") 181 | training.with_options(**pipeline_args)(**run_args_train) 182 | logger.info("Training pipeline with SGD finished successfully!\n\n") 183 | 184 | # Run the RF pipeline 185 | pipeline_args = {} 186 | if no_cache: 187 | pipeline_args["enable_cache"] = False 188 | pipeline_args["config_path"] = os.path.join(config_folder, "training_rf.yaml") 189 | training.with_options(**pipeline_args)(**run_args_train) 190 | logger.info("Training pipeline with RF finished successfully!\n\n") 191 | 192 | if inference_pipeline: 193 | run_args_inference = {} 194 | pipeline_args = {"enable_cache": False} 195 | pipeline_args["config_path"] = os.path.join(config_folder, "inference.yaml") 196 | 197 | # Configure the pipeline 198 | inference_configured = inference.with_options(**pipeline_args) 199 | 200 | # Fetch the production model 201 | with open(pipeline_args["config_path"], "r") as f: 202 | config = yaml.load(f, Loader=yaml.SafeLoader) 203 | zenml_model = client.get_model_version( 204 | config["model"]["name"], config["model"]["version"] 205 | ) 206 | preprocess_pipeline_artifact = zenml_model.get_artifact("preprocess_pipeline") 207 | 208 | # Use the metadata of feature engineering pipeline artifact 209 | # to get the random state and target column 210 | random_state = preprocess_pipeline_artifact.run_metadata["random_state"] 211 | target = preprocess_pipeline_artifact.run_metadata["target"] 212 | run_args_inference["random_state"] = random_state 213 | run_args_inference["target"] = target 214 | 215 | # Run the pipeline 216 | inference_configured(**run_args_inference) 217 | logger.info("Inference pipeline finished successfully!") 218 | 219 | 220 | if __name__ == "__main__": 221 | main() 222 | -------------------------------------------------------------------------------- /template/steps/__init__.py: -------------------------------------------------------------------------------- 1 | # {% include 'template/license_header' %} 2 | 3 | from .data_loader import ( 4 | data_loader, 5 | ) 6 | from .data_preprocessor import ( 7 | data_preprocessor, 8 | ) 9 | from .data_splitter import ( 10 | data_splitter, 11 | ) 12 | from .inference_predict import ( 13 | inference_predict, 14 | ) 15 | from .inference_preprocessor import ( 16 | inference_preprocessor, 17 | ) 18 | from .model_evaluator 
import (
19 |     model_evaluator,
20 | )
21 | from .model_promoter import (
22 |     model_promoter,
23 | )
24 | from .model_trainer import (
25 |     model_trainer,
26 | )
27 | 
--------------------------------------------------------------------------------
/template/steps/data_loader.py:
--------------------------------------------------------------------------------
1 | # {% include 'template/license_header' %}
2 | 
3 | import pandas as pd
4 | from sklearn.datasets import load_breast_cancer
5 | from typing_extensions import Annotated
6 | from zenml import step
7 | from zenml.logger import get_logger
8 | 
9 | logger = get_logger(__name__)
10 | 
11 | 
12 | @step
13 | def data_loader(
14 |     random_state: int, is_inference: bool = False, target: str = "target"
15 | ) -> Annotated[pd.DataFrame, "dataset"]:
16 |     """Dataset reader step.
17 | 
18 |     This is an example of a dataset reader step that loads the Breast Cancer dataset.
19 | 
20 |     This step is parameterized, which allows you to configure the step
21 |     independently of the step code, before running it in a pipeline.
22 |     In this example, the step can be configured to return either the training subset
23 |     or a small inference subset with the target column removed. See the documentation for more information:
24 | 
25 |     https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters
26 | 
27 |     Args:
28 |         random_state: Random state for sampling.
29 |         is_inference: If `True` subset will be returned and target column
30 |             will be removed from dataset.
31 |         target: Name of the target column in the dataset.
32 | 
33 |     Returns:
34 |         The dataset artifact as a Pandas DataFrame.
35 |     """
36 |     dataset = load_breast_cancer(as_frame=True)
37 |     inference_size = int(len(dataset.target) * 0.05)
38 |     dataset: pd.DataFrame = dataset.frame
39 |     inference_subset = dataset.sample(inference_size, random_state=random_state)
40 |     if is_inference:
41 |         dataset = inference_subset
42 |         dataset.drop(columns=target, inplace=True)
43 |     else:
44 |         dataset.drop(inference_subset.index, inplace=True)
45 |     dataset.reset_index(drop=True, inplace=True)
46 |     logger.info(f"Dataset with {len(dataset)} records loaded!")
47 |     return dataset
48 | 
--------------------------------------------------------------------------------
/template/steps/data_preprocessor.py:
--------------------------------------------------------------------------------
1 | # {% include 'template/license_header' %}
2 | 
3 | from typing import List, Optional, Tuple
4 | 
5 | import pandas as pd
6 | from sklearn.pipeline import Pipeline
7 | from sklearn.preprocessing import MinMaxScaler
8 | from typing_extensions import Annotated
9 | from utils.preprocess import ColumnsDropper, DataFrameCaster, NADropper
10 | from zenml import log_metadata, step
11 | 
12 | 
13 | @step
14 | def data_preprocessor(
15 |     random_state: int,
16 |     dataset_trn: pd.DataFrame,
17 |     dataset_tst: pd.DataFrame,
18 |     drop_na: Optional[bool] = None,
19 |     normalize: Optional[bool] = None,
20 |     drop_columns: Optional[List[str]] = None,
21 |     target: Optional[str] = "target",
22 | ) -> Tuple[
23 |     Annotated[pd.DataFrame, "dataset_trn"],
24 |     Annotated[pd.DataFrame, "dataset_tst"],
25 |     Annotated[Pipeline, "preprocess_pipeline"],
26 | ]:
27 |     """Data preprocessor step.
28 | 
29 |     This is an example of a data processor step that prepares the data so that
30 |     it is suitable for model training. It takes in a dataset as an input step
31 |     artifact and performs any necessary preprocessing steps like cleaning,
32 |     feature engineering, feature selection, etc. 
It then returns the processed 33 | dataset as a step output artifact. 34 | 35 | This step is parameterized, which allows you to configure the step 36 | independently of the step code, before running it in a pipeline. 37 | In this example, the step can be configured to drop NA values, drop some 38 | columns and normalize numerical columns. See the documentation for more 39 | information: 40 | 41 | https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters 42 | 43 | Args: 44 | random_state: Random state for sampling. 45 | dataset_trn: The train dataset. 46 | dataset_tst: The test dataset. 47 | drop_na: If `True` all NA rows will be dropped. 48 | normalize: If `True` all numeric fields will be normalized. 49 | drop_columns: List of column names to drop. 50 | target: Name of target column in dataset. 51 | 52 | Returns: 53 | The processed datasets (dataset_trn, dataset_tst) and fitted `Pipeline` object. 54 | """ 55 | # We use the sklearn pipeline to chain together multiple preprocessing steps 56 | preprocess_pipeline = Pipeline([("passthrough", "passthrough")]) 57 | if drop_na: 58 | preprocess_pipeline.steps.append(("drop_na", NADropper())) 59 | if drop_columns: 60 | # Drop columns 61 | preprocess_pipeline.steps.append(("drop_columns", ColumnsDropper(drop_columns))) 62 | if normalize: 63 | # Normalize the data 64 | preprocess_pipeline.steps.append(("normalize", MinMaxScaler())) 65 | preprocess_pipeline.steps.append(("cast", DataFrameCaster(dataset_trn.columns))) 66 | dataset_trn = preprocess_pipeline.fit_transform(dataset_trn) 67 | dataset_tst = preprocess_pipeline.transform(dataset_tst) 68 | 69 | # Log metadata so we can load it in the inference pipeline 70 | log_metadata( 71 | metadata={"random_state": random_state, "target": target}, 72 | artifact_name="preprocess_pipeline", 73 | infer_artifact=True, 74 | ) 75 | return dataset_trn, dataset_tst, preprocess_pipeline 76 | -------------------------------------------------------------------------------- /template/steps/data_splitter.py: -------------------------------------------------------------------------------- 1 | # {% include 'template/license_header' %} 2 | 3 | from typing import Tuple 4 | 5 | import pandas as pd 6 | from sklearn.model_selection import train_test_split 7 | from typing_extensions import Annotated 8 | from zenml import step 9 | 10 | 11 | @step 12 | def data_splitter( 13 | dataset: pd.DataFrame, test_size: float = 0.2 14 | ) -> Tuple[ 15 | Annotated[pd.DataFrame, "raw_dataset_trn"], 16 | Annotated[pd.DataFrame, "raw_dataset_tst"], 17 | ]: 18 | """Dataset splitter step. 19 | 20 | This is an example of a dataset splitter step that splits the data 21 | into train and test set before passing it to ML model. 22 | 23 | This step is parameterized, which allows you to configure the step 24 | independently of the step code, before running it in a pipeline. 25 | In this example, the step can be configured to use different test 26 | set sizes. See the documentation for more information: 27 | 28 | https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters 29 | 30 | Args: 31 | dataset: Dataset read from source. 32 | test_size: 0.0..1.0 defining portion of test set. 33 | 34 | Returns: 35 | The split dataset: dataset_trn, dataset_tst. 
36 | """ 37 | dataset_trn, dataset_tst = train_test_split( 38 | dataset, 39 | test_size=test_size, 40 | random_state=42, 41 | shuffle=True, 42 | ) 43 | dataset_trn = pd.DataFrame(dataset_trn, columns=dataset.columns) 44 | dataset_tst = pd.DataFrame(dataset_tst, columns=dataset.columns) 45 | return dataset_trn, dataset_tst 46 | -------------------------------------------------------------------------------- /template/steps/inference_predict.py: -------------------------------------------------------------------------------- 1 | # Apache Software License 2.0 2 | # 3 | # Copyright (c) ZenML GmbH 2023. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from typing import Any 19 | 20 | import pandas as pd 21 | from typing_extensions import Annotated 22 | from zenml import step 23 | from zenml.logger import get_logger 24 | 25 | logger = get_logger(__name__) 26 | 27 | 28 | @step 29 | def inference_predict( 30 | model: Any, 31 | dataset_inf: pd.DataFrame, 32 | ) -> Annotated[pd.Series, "predictions"]: 33 | """Predictions step. 34 | 35 | This is an example of a predictions step that takes the data and model in 36 | and returns predicted values. 37 | 38 | This step is parameterized, which allows you to configure the step 39 | independently of the step code, before running it in a pipeline. 40 | In this example, the step can be configured to use different input data. 41 | See the documentation for more information: 42 | 43 | https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters 44 | 45 | Args: 46 | model: Trained model. 47 | dataset_inf: The inference dataset. 48 | 49 | Returns: 50 | The predictions as pandas series 51 | """ 52 | # run prediction from memory 53 | predictions = model.predict(dataset_inf) 54 | 55 | predictions = pd.Series(predictions, name="predicted") 56 | return predictions 57 | -------------------------------------------------------------------------------- /template/steps/inference_preprocessor.py: -------------------------------------------------------------------------------- 1 | # Apache Software License 2.0 2 | # 3 | # Copyright (c) ZenML GmbH 2023. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | import pandas as pd 19 | from sklearn.pipeline import Pipeline 20 | from typing_extensions import Annotated 21 | from zenml import step 22 | 23 | 24 | @step 25 | def inference_preprocessor( 26 | dataset_inf: pd.DataFrame, 27 | preprocess_pipeline: Pipeline, 28 | target: str, 29 | ) -> Annotated[pd.DataFrame, "inference_dataset"]: 30 | """Data preprocessor step. 31 | 32 | This is an example of a data processor step that prepares the data so that 33 | it is suitable for model inference. It takes in a dataset as an input step 34 | artifact and performs any necessary preprocessing steps based on pretrained 35 | preprocessing pipeline. 36 | 37 | Args: 38 | dataset_inf: The inference dataset. 39 | preprocess_pipeline: Pretrained `Pipeline` to process dataset. 40 | target: Name of target columns in dataset. 41 | 42 | Returns: 43 | The processed dataframe: dataset_inf. 44 | """ 45 | # artificially adding `target` column to avoid Pipeline issues 46 | dataset_inf[target] = pd.Series([1] * dataset_inf.shape[0]) 47 | dataset_inf = preprocess_pipeline.transform(dataset_inf) 48 | dataset_inf.drop(columns=[target], inplace=True) 49 | return dataset_inf 50 | -------------------------------------------------------------------------------- /template/steps/model_evaluator.py: -------------------------------------------------------------------------------- 1 | # {% include 'template/license_header' %} 2 | 3 | from typing import Optional 4 | 5 | import pandas as pd 6 | from sklearn.base import ClassifierMixin 7 | 8 | from zenml import log_metadata, step 9 | from zenml.client import Client 10 | from zenml.logger import get_logger 11 | 12 | logger = get_logger(__name__) 13 | 14 | 15 | @step 16 | def model_evaluator( 17 | model: ClassifierMixin, 18 | dataset_trn: pd.DataFrame, 19 | dataset_tst: pd.DataFrame, 20 | min_train_accuracy: float = 0.0, 21 | min_test_accuracy: float = 0.0, 22 | target: Optional[str] = "target", 23 | ) -> float: 24 | """Evaluate a trained model. 25 | 26 | This is an example of a model evaluation step that takes in a model artifact 27 | previously trained by another step in your pipeline, and a training 28 | and validation data set pair which it uses to evaluate the model's 29 | performance. The model metrics are then returned as step output artifacts 30 | (in this case, the model accuracy on the train and test set). 31 | 32 | The suggested step implementation also outputs some warnings if the model 33 | performance does not meet some minimum criteria. This is just an example of 34 | how you can use steps to monitor your model performance and alert you if 35 | something goes wrong. As an alternative, you can raise an exception in the 36 | step to force the pipeline run to fail early and all subsequent steps to 37 | be skipped. 38 | 39 | This step is parameterized to configure the step independently of the step code, 40 | before running it in a pipeline. In this example, the step can be configured 41 | to use different values for the acceptable model performance thresholds and 42 | to control whether the pipeline run should fail if the model performance 43 | does not meet the minimum criteria. See the documentation for more 44 | information: 45 | 46 | https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters 47 | 48 | Args: 49 | model: The pre-trained model artifact. 50 | dataset_trn: The train dataset. 51 | dataset_tst: The test dataset. 52 | min_train_accuracy: Minimal acceptable training accuracy value. 
--------------------------------------------------------------------------------
/template/steps/model_evaluator.py:
--------------------------------------------------------------------------------
1 | # {% include 'template/license_header' %}
2 | 
3 | from typing import Optional
4 | 
5 | import pandas as pd
6 | from sklearn.base import ClassifierMixin
7 | 
8 | from zenml import log_metadata, step
9 | from zenml.client import Client
10 | from zenml.logger import get_logger
11 | 
12 | logger = get_logger(__name__)
13 | 
14 | 
15 | @step
16 | def model_evaluator(
17 |     model: ClassifierMixin,
18 |     dataset_trn: pd.DataFrame,
19 |     dataset_tst: pd.DataFrame,
20 |     min_train_accuracy: float = 0.0,
21 |     min_test_accuracy: float = 0.0,
22 |     target: Optional[str] = "target",
23 | ) -> float:
24 |     """Evaluate a trained model.
25 | 
26 |     This is an example of a model evaluation step that takes in a model artifact
27 |     previously trained by another step in your pipeline, and a training
28 |     and test data set pair which it uses to evaluate the model's
29 |     performance. The model accuracy on the test set is returned as a step
30 |     output artifact, and both train and test accuracies are attached as
31 |     metadata to the trained classifier artifact version.
32 | 
33 |     The suggested step implementation also outputs some warnings if the model
34 |     performance does not meet some minimum criteria. This is just an example of
35 |     how you can use steps to monitor your model performance and alert you if
36 |     something goes wrong. As an alternative, you can raise an exception in the
37 |     step to force the pipeline run to fail early and all subsequent steps to
38 |     be skipped.
39 | 
40 |     This step is parameterized to configure the step independently of the step code,
41 |     before running it in a pipeline. In this example, the step can be configured
42 |     to use different values for the acceptable model performance thresholds and
43 |     to control whether the pipeline run should fail if the model performance
44 |     does not meet the minimum criteria. See the documentation for more
45 |     information:
46 | 
47 |     https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters
48 | 
49 |     Args:
50 |         model: The pre-trained model artifact.
51 |         dataset_trn: The train dataset.
52 |         dataset_tst: The test dataset.
53 |         min_train_accuracy: Minimal acceptable training accuracy value.
54 |         min_test_accuracy: Minimal acceptable testing accuracy value.
55 |         target: Name of the target column in the dataset.
56 | 
57 |     Returns:
58 |         The model accuracy on the test set.
59 |     """
60 |     # Calculate the model accuracy on the train and test set
61 |     trn_acc = model.score(
62 |         dataset_trn.drop(columns=[target]),
63 |         dataset_trn[target],
64 |     )
65 |     tst_acc = model.score(
66 |         dataset_tst.drop(columns=[target]),
67 |         dataset_tst[target],
68 |     )
69 |     logger.info(f"Train accuracy={trn_acc * 100:.2f}%")
70 |     logger.info(f"Test accuracy={tst_acc * 100:.2f}%")
71 | 
72 |     # Collect a warning for every threshold that is not met and log them all
73 |     messages = []
74 |     if trn_acc < min_train_accuracy:
75 |         messages.append(
76 |             f"Train accuracy {trn_acc * 100:.2f}% is below {min_train_accuracy * 100:.2f}%!"
77 |         )
78 |     if tst_acc < min_test_accuracy:
79 |         messages.append(
80 |             f"Test accuracy {tst_acc * 100:.2f}% is below {min_test_accuracy * 100:.2f}%!"
81 |         )
82 |     for message in messages:
83 |         logger.warning(message)
84 | 
85 |     # Attach both accuracies as metadata to the classifier artifact version
86 |     client = Client()
87 |     latest_classifier = client.get_artifact_version("sklearn_classifier")
88 | 
89 |     log_metadata(
90 |         metadata={
91 |             "train_accuracy": float(trn_acc),
92 |             "test_accuracy": float(tst_acc),
93 |         },
94 |         artifact_version_id=latest_classifier.id,
95 |     )
96 | 
97 |     return float(tst_acc)
98 | 
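The metadata logged here is exactly what `model_promoter` (below) reads back when it compares a freshly trained model against the one already in a stage. A hedged sketch of that read path, assuming the model name this template registers ("breast_cancer_classifier", the name the test suite also deletes) and that a version is already in the "production" stage:

from zenml.client import Client

client = Client()
# Model name assumed from this template; adjust if you renamed it.
stage_model = client.get_model_version("breast_cancer_classifier", "production")
prod_accuracy = (
    stage_model.get_artifact("sklearn_classifier")
    .run_metadata["test_accuracy"]
)
print(f"Production test accuracy: {prod_accuracy}")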
--------------------------------------------------------------------------------
/template/steps/model_promoter.py:
--------------------------------------------------------------------------------
1 | # {% include 'template/license_header' %}
2 | 
3 | from zenml import get_step_context, step
4 | from zenml.client import Client
5 | from zenml.logger import get_logger
6 | 
7 | logger = get_logger(__name__)
8 | 
9 | 
10 | @step
11 | def model_promoter(accuracy: float, stage: str = "production") -> bool:
12 |     """Model promoter step.
13 | 
14 |     This is an example of a step that conditionally promotes a model. It takes
15 |     in the accuracy of the model and the stage to promote the model to. If the
16 |     accuracy is below 80%, the model is not promoted. If it is above 80%, the
17 |     model is promoted to the stage indicated in the parameters. If there is
18 |     already a model in the indicated stage, the model with the higher accuracy
19 |     is promoted.
20 | 
21 |     Args:
22 |         accuracy: Accuracy of the model.
23 |         stage: Which stage to promote the model to.
24 | 
25 |     Returns:
26 |         Whether the model was promoted or not.
27 |     """
28 |     is_promoted = False
29 | 
30 |     if accuracy < 0.8:
31 |         logger.info(
32 |             f"Model accuracy {accuracy * 100:.2f}% is below 80%! Not promoting model."
33 |         )
34 |     else:
35 |         # Get the model in the current context
36 |         current_model = get_step_context().model
37 | 
38 |         # Get the model that is currently in the target stage, if any
39 |         client = Client()
40 |         try:
41 |             stage_model = client.get_model_version(
42 |                 current_model.name, stage
43 |             )
44 |             # Compare the current model's accuracy against the staged model's
45 |             prod_accuracy = (
46 |                 stage_model.get_artifact("sklearn_classifier")
47 |                 .run_metadata["test_accuracy"]
48 |             )
49 |             # Promote only if the current model scores strictly better
50 |             is_promoted = float(accuracy) > float(prod_accuracy)
51 |         except KeyError:
52 |             # No model in the target stage yet, so promote the current one
53 |             is_promoted = True
54 | 
55 |         if is_promoted:
56 |             current_model.set_stage(stage, force=True)
57 |             logger.info(f"Model promoted to {stage}!")
58 | 
59 |     return is_promoted
60 | 
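The promotion rule itself is easy to state in isolation. A small pure-Python distillation of the decision (no ZenML involved), which can be handy for unit-testing the logic separately from the step:

from typing import Optional

def should_promote(
    accuracy: float,
    prod_accuracy: Optional[float],
    floor: float = 0.8,
) -> bool:
    """Decide promotion: beat the hard floor, then beat the incumbent, if any."""
    if accuracy < floor:
        return False  # below the hard 80% floor: never promote
    if prod_accuracy is None:
        return True  # nothing in the target stage yet: promote
    return accuracy > prod_accuracy  # promote only on a strict improvement

assert should_promote(0.85, None) is True
assert should_promote(0.85, 0.90) is False
assert should_promote(0.75, 0.70) is False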
--------------------------------------------------------------------------------
/template/steps/model_trainer.py:
--------------------------------------------------------------------------------
1 | # {% include 'template/license_header' %}
2 | 
3 | from typing import Optional
4 | 
5 | import pandas as pd
6 | from sklearn.base import ClassifierMixin
7 | from sklearn.ensemble import RandomForestClassifier
8 | from sklearn.linear_model import SGDClassifier
9 | from typing_extensions import Annotated
10 | from zenml import ArtifactConfig, step
11 | from zenml.logger import get_logger
12 | 
13 | logger = get_logger(__name__)
14 | 
15 | 
16 | @step
17 | def model_trainer(
18 |     dataset_trn: pd.DataFrame,
19 |     model_type: str = "sgd",
20 |     target: Optional[str] = "target",
21 | ) -> Annotated[
22 |     ClassifierMixin, ArtifactConfig(name="sklearn_classifier", is_model_artifact=True)
23 | ]:
24 |     """Configure and train a model on the training dataset.
25 | 
26 |     This is an example of a model training step that takes in a dataset artifact
27 |     previously loaded and pre-processed by other steps in your pipeline, then
28 |     configures and trains a model on it. The model is then returned as a step
29 |     output artifact.
30 | 
31 |     Args:
32 |         dataset_trn: The preprocessed train dataset.
33 |         model_type: The type of model to train.
34 |         target: The name of the target column in the dataset.
35 | 
36 |     Returns:
37 |         The trained model artifact.
38 | 
39 |     Raises:
40 |         ValueError: If the model type is not supported.
41 |     """
42 |     # Initialize the model with the hyperparameters indicated in the step
43 |     # parameters and train it on the training set.
44 |     if model_type == "sgd":
45 |         model = SGDClassifier()
46 |     elif model_type == "rf":
47 |         model = RandomForestClassifier()
48 |     else:
49 |         raise ValueError(f"Unknown model type {model_type}")
50 |     logger.info(f"Training model {model}...")
51 | 
52 |     model.fit(
53 |         dataset_trn.drop(columns=[target]),
54 |         dataset_trn[target],
55 |     )
56 |     return model
57 | 
--------------------------------------------------------------------------------
/template/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # {% include 'template/license_header' %}
2 | 
--------------------------------------------------------------------------------
/template/utils/preprocess.py:
--------------------------------------------------------------------------------
1 | # {% include 'template/license_header' %}
2 | 
3 | from typing import Union
4 | 
5 | import pandas as pd
6 | 
7 | 
8 | class NADropper:
9 |     """Support class to drop NA values in sklearn Pipeline."""
10 | 
11 |     def fit(self, *args, **kwargs):
12 |         return self
13 | 
14 |     def transform(self, X: Union[pd.DataFrame, pd.Series]):
15 |         return X.dropna()
16 | 
17 | 
18 | class ColumnsDropper:
19 |     """Support class to drop specific columns in sklearn Pipeline."""
20 | 
21 |     def __init__(self, columns):
22 |         self.columns = columns
23 | 
24 |     def fit(self, *args, **kwargs):
25 |         return self
26 | 
27 |     def transform(self, X: Union[pd.DataFrame, pd.Series]):
28 |         return X.drop(columns=self.columns)
29 | 
30 | 
31 | class DataFrameCaster:
32 |     """Support class to cast type back to pd.DataFrame in sklearn Pipeline."""
33 | 
34 |     def __init__(self, columns):
35 |         self.columns = columns
36 | 
37 |     def fit(self, *args, **kwargs):
38 |         return self
39 | 
40 |     def transform(self, X):
41 |         return pd.DataFrame(X, columns=self.columns)
42 | 
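These helpers are plain duck-typed transformers (they only need `fit` and `transform`), so they compose directly into an sklearn `Pipeline`. A hedged sketch of how such a preprocessing pipeline could be assembled; the `MinMaxScaler` step and the data are illustrative additions, and the import path assumes a generated project where this module lands at `utils/preprocess.py`:

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from utils.preprocess import ColumnsDropper, DataFrameCaster, NADropper  # assumed path

raw = pd.DataFrame(
    {"id": [1, 2, 3], "x": [0.1, None, 0.9], "target": [0, 1, 1]}
)
pipe = Pipeline(
    [
        ("drop_na", NADropper()),                       # drop the row with a missing x
        ("drop_id", ColumnsDropper(columns=["id"])),    # remove the identifier column
        ("scale", MinMaxScaler()),                      # returns a bare ndarray...
        ("cast", DataFrameCaster(columns=["x", "target"])),  # ...so cast it back
    ]
)
processed = pipe.fit(raw).transform(raw)
print(processed)

Note how `DataFrameCaster` earns its keep: scikit-learn transformers like `MinMaxScaler` emit NumPy arrays, and casting back to a DataFrame preserves column names for the steps downstream.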
--------------------------------------------------------------------------------
/template/{% if open_source_license %}LICENSE{% endif %}:
--------------------------------------------------------------------------------
1 | {% include 'template/license' %}
--------------------------------------------------------------------------------
/template/{{ _copier_conf.answers_file }}:
--------------------------------------------------------------------------------
1 | # Changes here will be overwritten by Copier
2 | {{ _copier_answers|to_nice_yaml -}}
--------------------------------------------------------------------------------
/test-requirements.txt:
--------------------------------------------------------------------------------
1 | autopep8
2 | pytest
3 | pytest-randomly
4 | ruff
5 | black
6 | isort
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) ZenML GmbH 2023. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at:
6 | #
7 | #     https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12 | # or implied. See the License for the specific language governing
13 | # permissions and limitations under the License.
14 | 
15 | 
16 | import contextlib
17 | import os
18 | import shutil
19 | import sys
20 | from typing import Generator
21 | 
22 | import pytest
23 | from zenml.client import Client
24 | from zenml.config.global_config import GlobalConfiguration
25 | from zenml.constants import ENV_ZENML_CONFIG_PATH
26 | from zenml.enums import StackComponentType
27 | 
28 | 
29 | def configure_stack():
30 |     """Create and activate the ZenML stack requested via `ZENML_STACK_NAME`."""
31 |     stack_name = os.environ.get("ZENML_STACK_NAME", "local")
32 |     zenml_client = Client()
33 | 
34 |     if stack_name == "local":
35 |         components = {}
36 |         for component in [
37 |             ("local", "local", StackComponentType.ORCHESTRATOR),
38 |             ("local", "local", StackComponentType.ARTIFACT_STORE),
39 |         ]:
40 |             zenml_client.create_stack_component(*component, {})
41 |             components[component[2]] = component[0]
42 |         zenml_client.create_stack("local", components=components)
43 |         zenml_client.activate_stack("local")
44 |     else:
45 |         raise RuntimeError(f"Stack {stack_name} not supported")
46 | 
47 | 
48 | @pytest.fixture(scope="module")
49 | def clean_zenml_client(
50 |     tmp_path_factory: pytest.TempPathFactory,
51 | ) -> Generator[Client, None, None]:
52 |     """Fixture to initialize and use a clean local default ZenML client.
53 | 
54 |     This fixture creates a clean ZenML client with its own global
55 |     configuration and local database.
56 | 
57 |     Args:
58 |         tmp_path_factory: A pytest fixture that provides a temporary directory.
59 | 
60 |     Yields:
61 |         A clean ZenML client.
62 |     """
63 |     # save the current global configuration and client singleton instances
64 |     # to restore them later, then reset them
65 |     orig_cwd = os.getcwd()
66 |     original_config = GlobalConfiguration.get_instance()
67 |     original_client = Client.get_instance()
68 |     orig_config_path = os.getenv("ZENML_CONFIG_PATH")
69 | 
70 |     GlobalConfiguration._reset_instance()
71 |     Client._reset_instance()
72 | 
73 |     # change the working directory to a fresh temp path
74 |     tmp_path = tmp_path_factory.mktemp("pytest-clean-client")
75 |     os.chdir(tmp_path)
76 | 
77 |     os.environ[ENV_ZENML_CONFIG_PATH] = str(tmp_path / "zenml")
78 |     os.environ["ZENML_ANALYTICS_OPT_IN"] = "false"
79 | 
80 |     # initialize the global config client and store at the new path
81 |     gc = GlobalConfiguration()
82 |     gc.analytics_opt_in = False
83 |     client = Client()
84 |     _ = client.zen_store
85 | 
86 |     # prepare stack configuration
87 |     configure_stack()
88 | 
89 |     yield client
90 | 
91 |     # restore the global configuration path
92 |     if orig_config_path:
93 |         os.environ[ENV_ZENML_CONFIG_PATH] = orig_config_path
94 |     else:
95 |         del os.environ[ENV_ZENML_CONFIG_PATH]
96 | 
97 |     # restore the global configuration and the client
98 |     GlobalConfiguration._reset_instance(original_config)
99 |     Client._reset_instance(original_client)
100 | 
101 |     # remove all traces, and change working directory back to base path
102 |     os.chdir(orig_cwd)
103 |     if sys.platform == "win32":
104 |         # Windows file locks can make cleanup flaky, so ignore failures here
105 |         with contextlib.suppress(Exception):
106 |             shutil.rmtree(str(tmp_path))
107 |     else:
108 |         shutil.rmtree(str(tmp_path))
109 | 
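A hedged sketch of how a test consumes this fixture: pytest injects it by parameter name, and `scope="module"` means each test module gets one fresh client. The `active_stack_model` property is assumed from recent ZenML client versions; the stack name comes from `configure_stack` above:

def test_runs_on_clean_client(clean_zenml_client):
    # The injected client points at a throwaway ZENML_CONFIG_PATH with the
    # "local" stack already created and activated by configure_stack()
    assert clean_zenml_client.active_stack_model.name == "local"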
--------------------------------------------------------------------------------
/tests/test_starter_template.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) ZenML GmbH 2023. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at:
6 | #
7 | #     https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12 | # or implied. See the License for the specific language governing
13 | # permissions and limitations under the License.
14 | 
15 | 
16 | import os
17 | import pathlib
18 | import shutil
19 | import subprocess
20 | import sys
21 | from typing import Optional
22 | 
23 | import pytest
24 | from copier import Worker
25 | from zenml.client import Client
26 | from zenml.enums import ExecutionStatus
27 | 
28 | TEMPLATE_DIRECTORY = str(pathlib.Path(__file__).parent.parent)
29 | 
30 | 
31 | def generate_and_run_project(
32 |     tmp_path_factory: pytest.TempPathFactory,
33 |     open_source_license: Optional[str] = "apache",
34 |     product_name: str = "starter_project",
35 | ):
36 |     """Generate and run the starter project with different options."""
37 |     answers = {
38 |         "project_name": "Pytest Templated Project",
39 |         "version": "0.0.1",
40 |         "open_source_license": str(open_source_license).lower(),
41 |         "product_name": product_name,
42 |     }
43 |     if open_source_license:
44 |         answers["email"] = "pytest@zenml.io"
45 |         answers["full_name"] = "Pytest"
46 | 
47 |     # generate the template in a temp path
48 |     current_dir = os.getcwd()
49 |     dst_path = tmp_path_factory.mktemp("pytest-template")
50 |     os.chdir(str(dst_path))
51 |     with Worker(
52 |         src_path=TEMPLATE_DIRECTORY,
53 |         dst_path=str(dst_path),
54 |         data=answers,
55 |         unsafe=True,
56 |         vcs_ref="HEAD",
57 |     ) as worker:
58 |         worker.run_copy()
59 | 
60 |     # run the project
61 |     call = [
62 |         sys.executable,
63 |         "run.py",
64 |         "--training-pipeline",
65 |         "--feature-pipeline",
66 |         "--inference-pipeline",
67 |         "--no-cache",
68 |     ]
69 | 
70 |     try:
71 |         subprocess.check_output(
72 |             call,
73 |             cwd=str(dst_path),
74 |             env=os.environ.copy(),
75 |             stderr=subprocess.STDOUT,
76 |         )
77 |     except subprocess.CalledProcessError as e:
78 |         raise RuntimeError(
79 |             f"Failed to run project generated with parameters: {answers}\n"
80 |             f"{e.output.decode()}"
81 |         ) from e
82 | 
83 |     # check that each pipeline run completed successfully
84 |     for pipeline_name, run_count in [
85 |         ("training", 2),
86 |         ("inference", 1),
87 |         ("feature_engineering", 1),
88 |     ]:
89 |         pipeline = Client().get_pipeline(pipeline_name)
90 |         assert pipeline
91 |         runs = pipeline.runs
92 |         assert len(runs) == run_count
93 |         assert runs[0].status == ExecutionStatus.COMPLETED
94 | 
95 |         # clean up the pipeline after checking it
96 |         Client().delete_pipeline(pipeline_name)
97 |     Client().delete_model("breast_cancer_classifier")
98 | 
99 |     os.chdir(current_dir)
100 |     shutil.rmtree(dst_path)
101 | 
102 | 
103 | @pytest.mark.parametrize("open_source_license", ["mit", None], ids=["oss", "css"])
104 | def test_generate_license(
105 |     clean_zenml_client,
106 |     tmp_path_factory: pytest.TempPathFactory,
107 |     open_source_license: Optional[str],
108 | ):
109 |     """Test generating licenses."""
110 |     generate_and_run_project(
111 |         tmp_path_factory=tmp_path_factory,
112 |         open_source_license=open_source_license,
113 |     )
114 | 
115 | 
product_name="custom_product_name", 128 | ) 129 | --------------------------------------------------------------------------------