├── .github ├── actions │ └── starter_template_test │ │ └── action.yml └── workflows │ ├── ci.yml │ └── image-optimizer.yml ├── .gitignore ├── .pytest.ini ├── LICENSE ├── README.md ├── copier.yaml ├── requirements.txt ├── template ├── .assets │ ├── cloud_mcp.png │ ├── cloud_mcp_predictions.png │ ├── cloud_mcp_screenshot.png │ ├── feature_engineering_pipeline.png │ ├── inference_pipeline.png │ ├── pipeline_overview.png │ └── training_pipeline.png ├── .dockerignore ├── README.md ├── configs │ ├── feature_engineering.yaml │ ├── inference.yaml │ ├── training_rf.yaml │ └── training_sgd.yaml ├── license ├── license_header ├── pipelines │ ├── __init__.py │ ├── feature_engineering.py │ ├── inference.py │ └── training.py ├── quickstart.ipynb ├── requirements.txt ├── run.py ├── steps │ ├── __init__.py │ ├── data_loader.py │ ├── data_preprocessor.py │ ├── data_splitter.py │ ├── inference_predict.py │ ├── inference_preprocessor.py │ ├── model_evaluator.py │ ├── model_promoter.py │ └── model_trainer.py ├── utils │ ├── __init__.py │ └── preprocess.py ├── {% if open_source_license %}LICENSE{% endif %} └── {{ _copier_conf.answers_file }} ├── test-requirements.txt └── tests ├── conftest.py └── test_starter_template.py /.github/actions/starter_template_test/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Run STARTER template tests' 2 | inputs: 3 | stack-name: 4 | description: 'Name of ZenML stack to build (see `tests/conftest.py:configure_stack()`)' 5 | type: string 6 | required: true 7 | ref-zenml: 8 | description: 'Ref of ZenML package' 9 | type: string 10 | required: false 11 | default: '' 12 | ref-template: 13 | description: 'Ref of this template repo' 14 | type: string 15 | required: false 16 | default: '' 17 | python-version: 18 | description: 'Python version' 19 | type: string 20 | required: false 21 | default: '3.9' 22 | 23 | runs: 24 | using: "composite" 25 | steps: 26 | - name: Check out repository code 27 | uses: actions/checkout@v3 28 | with: 29 | repository: zenml-io/zenml-project-templates 30 | ref: ${{ inputs.ref-template }} 31 | path: ./local_checkout 32 | 33 | - name: Set up Python 34 | uses: actions/setup-python@v4 35 | with: 36 | python-version: ${{ inputs.python-version }} 37 | 38 | - name: Configure git (non-Windows) 39 | if: ${{ runner.os != 'Windows' }} 40 | shell: bash 41 | run: | 42 | git config --global user.email "info@zenml.io" 43 | git config --global user.name "ZenML GmbH" 44 | 45 | - name: Configure git (Windows) 46 | if: ${{ runner.os == 'Windows' }} 47 | shell: bash 48 | run: | 49 | "C:\Program Files\Git\bin\git.exe" config --global user.email "info@zenml.io" 50 | "C:\Program Files\Git\bin\git.exe" config --global user.name "ZenML GmbH" 51 | 52 | - name: Install wheel 53 | shell: bash 54 | run: | 55 | pip install wheel uv 56 | 57 | - name: Install ZenML 58 | if: ${{ inputs.ref-zenml != '' }} 59 | shell: bash 60 | run: | 61 | uv pip install --system "git+https://github.com/zenml-io/zenml.git@${{ inputs.ref-zenml }}" "zenml[server]@git+https://github.com/zenml-io/zenml.git@${{ inputs.ref-zenml }}" 62 | 63 | - name: Install ZenML 64 | if: ${{ inputs.ref-zenml == '' }} 65 | shell: bash 66 | run: | 67 | uv pip install --system zenml "zenml[server]" 68 | 69 | - name: Concatenate requirements 70 | shell: bash 71 | run: | 72 | zenml integration export-requirements -o ./local_checkout/integration-requirements.txt sklearn pandas 73 | cat ./local_checkout/requirements.txt ./local_checkout/test-requirements.txt 
./local_checkout/integration-requirements.txt >> ./local_checkout/all-requirements.txt 74 | 75 | - name: Install requirements 76 | shell: bash 77 | run: | 78 | uv pip install --system -r ./local_checkout/all-requirements.txt 79 | 80 | - name: Run pytests 81 | shell: bash 82 | env: 83 | ZENML_STACK_NAME: ${{ inputs.stack-name }} 84 | run: | 85 | pytest ./local_checkout/tests 86 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | ref-template: 7 | description: 'Branch or tag ref to check out for template' 8 | type: string 9 | required: false 10 | ref-zenml: 11 | description: 'Branch or tag ref to check out for ZenML' 12 | type: string 13 | required: false 14 | workflow_call: 15 | inputs: 16 | ref-template: 17 | description: 'Branch or tag ref to check out for template' 18 | type: string 19 | required: false 20 | ref-zenml: 21 | description: 'Branch or tag ref to check out for ZenML' 22 | type: string 23 | required: false 24 | push: 25 | branches: ["main", "develop"] 26 | paths-ignore: ["README.md"] 27 | pull_request: 28 | types: [opened, synchronize, ready_for_review] 29 | paths-ignore: ["README.md"] 30 | 31 | concurrency: 32 | # New commit on branch cancels running workflows of the same branch 33 | group: ${{ github.workflow }}-${{ github.ref }} 34 | cancel-in-progress: true 35 | 36 | jobs: 37 | run-tests: 38 | runs-on: ${{ matrix.os }} 39 | strategy: 40 | fail-fast: false 41 | matrix: 42 | stack-name: [local] 43 | os: [windows-latest, ubuntu-latest, macos-latest] 44 | python-version: ["3.9", "3.10", "3.11", "3.12"] 45 | env: 46 | ZENML_DEBUG: true 47 | ZENML_ANALYTICS_OPT_IN: false 48 | ZENML_LOGGING_VERBOSITY: INFO 49 | steps: 50 | - name: Check out repository code 51 | uses: actions/checkout@v3 52 | 53 | - name: Run tests 54 | uses: ./.github/actions/starter_template_test 55 | with: 56 | stack-name: ${{ matrix.stack-name }} 57 | python-version: ${{ matrix.python-version }} 58 | ref-zenml: ${{ inputs.ref-zenml || 'feature/followup-run-metadata' }} 59 | ref-template: ${{ inputs.ref-template || github.ref }} 60 | -------------------------------------------------------------------------------- /.github/workflows/image-optimizer.yml: -------------------------------------------------------------------------------- 1 | name: Compress Images 2 | on: 3 | pull_request: 4 | # Run Image Actions when JPG, JPEG, PNG or WebP files are added or changed. 5 | # See https://help.github.com/en/actions/automating-your-workflow-with-github-actions/workflow-syntax-for-github-actions#onpushpull_requestpaths for reference. 6 | paths: 7 | - '**.jpg' 8 | - '**.jpeg' 9 | - '**.png' 10 | - '**.webp' 11 | jobs: 12 | build: 13 | # Only run on non-draft PRs within the same repository. 14 | if: github.event.pull_request.head.repo.full_name == github.repository && github.event.pull_request.draft == false 15 | name: calibreapp/image-actions 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout Repo 19 | uses: actions/checkout@v3 20 | 21 | - name: Compress Images 22 | uses: calibreapp/image-actions@main 23 | with: 24 | # The `GITHUB_TOKEN` is automatically generated by GitHub and scoped only to the repository that is currently running the action. By default, the action can’t update Pull Requests initiated from forked repositories. 
25 | # See https://docs.github.com/en/actions/reference/authentication-in-a-workflow and https://help.github.com/en/articles/virtual-environments-for-github-actions#token-permissions 26 | githubToken: ${{ secrets.GITHUB_TOKEN }} 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # PyCharm Stuff 102 | .idea 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | *.zen 135 | .vscode 136 | .local 137 | -------------------------------------------------------------------------------- /.pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = 3 | -s 4 | testpaths = 5 | tests -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 | 
189 | Copyright [yyyy] [name of copyright owner]
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 📜 ZenML Starter Template
2 | 
3 | This repository contains a starter template from which a simple ZenML project
4 | can be generated easily. It contains a collection of steps, pipelines, stack
5 | configurations, and other useful resources that can get you started with ZenML.
6 | 
7 | 🔥 **Do you have a personal project powered by ZenML that you would like to see here?**
8 | 
9 | At ZenML, we are looking for design partnerships and collaboration to help us
10 | better understand the real-world scenarios in which MLOps is being used and to
11 | build the best possible experience for our users. If you are interested in
12 | sharing all or parts of your project with us in the form of a ZenML project
13 | template, please [join our Slack](https://zenml.io/slack/) and leave us a
14 | message!
15 | 
16 | ## 📦 Prerequisites
17 | 
18 | To use the templates, you need to have ZenML and its `templates` extras
19 | installed:
20 | 
21 | ```bash
22 | pip install "zenml[templates]"
23 | ```
24 | 
25 | ## 🚀 Generate a ZenML Project
26 | 
27 | You can generate a project from one of the existing templates by using the
28 | `--template` flag with the `zenml init` command:
29 | 
30 | ```bash
31 | zenml init --template
32 | ```
33 | 
34 | Under the hood, ZenML uses the popular [Copier](https://copier.readthedocs.io/en/stable/)
35 | library and a set of Jinja2 templates to generate the project. So you may also
36 | interact with Copier directly to generate a project, e.g.:
37 | 
38 | ```bash
39 | copier gh:zenml-io/template-starter
40 | ```
41 | 
42 | You will be prompted to select the project template and enter various values for
43 | the template variables. Once you have entered them, the project will be
44 | generated in the indicated path.
45 | 
46 | To update an already generated project with different parameters, you can run
47 | the same command again.
If you want to skip the prompts, reuse the values you
48 | already entered, and overwrite all files in the existing project, you can run:
49 | 
50 | ```bash
51 | copier -wf gh:zenml-io/template-starter
52 | ```
53 | 
--------------------------------------------------------------------------------
/copier.yaml:
--------------------------------------------------------------------------------
1 | --- # GLOBAL PROMPT --------------------------------
2 | project_name:
3 |   type: str
4 |   help: Short name for your project
5 |   default: ZenML Starter
6 | version:
7 |   type: str
8 |   help: |
9 |     Version of your project
10 |   default: "0.1.0"
11 | open_source_license:
12 |   type: str
13 |   help: >-
14 |     The license under which your project will be released
15 |   choices:
16 |     Apache Software License 2.0: apache
17 |     MIT license: mit
18 |     BSD license: bsd
19 |     ISC license: isc
20 |     GNU General Public License v3: gpl3
21 |     Not open source: none
22 |   default: apache
23 | full_name:
24 |   type: str
25 |   help: >-
26 |     The name of the person/entity holding the copyright
27 |   default: ZenML GmbH
28 |   when: "{{ open_source_license }}"
29 | email:
30 |   type: str
31 |   help: >-
32 |     The email of the person/entity holding the copyright
33 |   default: info@zenml.io
34 |   when: "{{ open_source_license }}"
35 | 
36 | # CONFIGURATION -------------------------
37 | _templates_suffix: ""
38 | _subdirectory: "./template"
39 | _exclude:
40 |   - license
41 |   - license_header
42 | _tasks:
43 |   # Remove unused imports and variables
44 |   - >-
45 |     {% if _copier_conf.os == 'windows' %}
46 |     echo "Auto-formatting not supported on Windows"
47 |     {% else %}
48 |     {{ _copier_python }} -m ruff check --select F401,F841 --fix \
49 |     --exclude "__init__.py" --isolated \
50 |     steps pipelines run.py > /dev/null 2>&1 || true
51 |     {% endif %}
52 |   # Sort imports
53 |   - >-
54 |     {% if _copier_conf.os == 'windows' %}
55 |     echo "Auto-formatting not supported on Windows"
56 |     {% else %}
57 |     {{ _copier_python }} -m ruff check --select I \
58 |     --fix --ignore D \
59 |     steps pipelines run.py > /dev/null 2>&1 || true
60 |     {% endif %}
61 |   # Auto-format code
62 |   - >-
63 |     {% if _copier_conf.os == 'windows' %}
64 |     echo "Auto-formatting not supported on Windows"
65 |     {% else %}
66 |     {{ _copier_python }} -m black \
67 |     --exclude '' --include '\.pyi?$' -l 79 \
68 |     steps pipelines run.py > /dev/null 2>&1 || true
69 |     {% endif %}
70 |   - |
71 |     echo "Congratulations, your project has been generated in the '{{ _copier_conf.dst_path }}' directory."
72 |     echo "You can now run the following commands to get started:"
73 |     echo "  cd {{ _copier_conf.dst_path }}"
74 |     echo "  pip install -r requirements.txt"
75 |     echo "  # Start the ZenML UI (optional; you'll also need the zenml[server] Python"
76 |     echo "  # package installed)"
77 |     echo "  zenml login --local"
78 |     echo "  python run.py"
79 |     echo "Next, you should take a look at the '{{ _copier_conf.dst_path }}/README.md' file in the generated project."
80 |     echo "Happy coding!"
81 | 82 | _jinja_extensions: 83 | - jinja2_time.TimeExtension 84 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn 2 | copier 3 | jinja2-time 4 | zenml[server]>=0.52.0 5 | notebook 6 | pyyaml-include<2.0 7 | -------------------------------------------------------------------------------- /template/.assets/cloud_mcp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zenml-io/template-starter/665ed6a723863315551c732eb22648b8c89a615b/template/.assets/cloud_mcp.png -------------------------------------------------------------------------------- /template/.assets/cloud_mcp_predictions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zenml-io/template-starter/665ed6a723863315551c732eb22648b8c89a615b/template/.assets/cloud_mcp_predictions.png -------------------------------------------------------------------------------- /template/.assets/cloud_mcp_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zenml-io/template-starter/665ed6a723863315551c732eb22648b8c89a615b/template/.assets/cloud_mcp_screenshot.png -------------------------------------------------------------------------------- /template/.assets/feature_engineering_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zenml-io/template-starter/665ed6a723863315551c732eb22648b8c89a615b/template/.assets/feature_engineering_pipeline.png -------------------------------------------------------------------------------- /template/.assets/inference_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zenml-io/template-starter/665ed6a723863315551c732eb22648b8c89a615b/template/.assets/inference_pipeline.png -------------------------------------------------------------------------------- /template/.assets/pipeline_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zenml-io/template-starter/665ed6a723863315551c732eb22648b8c89a615b/template/.assets/pipeline_overview.png -------------------------------------------------------------------------------- /template/.assets/training_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zenml-io/template-starter/665ed6a723863315551c732eb22648b8c89a615b/template/.assets/training_pipeline.png -------------------------------------------------------------------------------- /template/.dockerignore: -------------------------------------------------------------------------------- 1 | .venv* 2 | .requirements* -------------------------------------------------------------------------------- /template/README.md: -------------------------------------------------------------------------------- 1 | # :running: MLOps 101 with ZenML 2 | 3 | Build your first MLOps pipelines with ZenML. 4 | 5 | ## :earth_americas: Overview 6 | 7 | This repository is a minimalistic MLOps project intended as a starting point to learn how to put ML workflows in production. It features: 8 | 9 | - A feature engineering pipeline that loads data and prepares it for training. 
10 | - A training pipeline that loads the preprocessed dataset and trains a model. 11 | - A batch inference pipeline that runs predictions on the trained model with new data. 12 | 13 | This is a representation of how it will all come together: 14 | 15 | Pipelines Overview 16 | 17 | Along the way we will also show you how to: 18 | 19 | - Structure your code into MLOps pipelines. 20 | - Automatically version, track, and cache data, models, and other artifacts. 21 | - Transition your ML models from development to production. 22 | 23 | ## 🏃 Run on Colab 24 | 25 | You can use Google Colab to see ZenML in action, no signup / installation required! 26 | 27 | Open In Colab 28 | 29 | ## :computer: Run Locally 30 | 31 | To run locally, install ZenML and pull this quickstart: 32 | 33 | ```shell 34 | # Install ZenML 35 | pip install "zenml[server]" 36 | 37 | # clone the ZenML repository 38 | git clone https://github.com/zenml-io/zenml.git 39 | cd zenml/examples/mlops_starter 40 | ``` 41 | 42 | Now we're ready to start. You have two options for running the quickstart locally: 43 | 44 | #### Option 1 - Interactively explore the quickstart using Jupyter Notebook: 45 | ```bash 46 | pip install notebook 47 | jupyter notebook 48 | # open quickstart.ipynb 49 | ``` 50 | 51 | #### Option 2 - Execute the whole ML pipeline from a Python script: 52 | ```bash 53 | # Install required zenml integrations 54 | zenml integration install sklearn pandas -y 55 | 56 | # Initialize ZenML 57 | zenml init 58 | 59 | # Start the ZenServer to enable dashboard access 60 | zenml login --local 61 | 62 | # Run the feature engineering pipeline 63 | python run.py --feature-pipeline 64 | 65 | # Run the training pipeline 66 | python run.py --training-pipeline 67 | 68 | # Run the training pipeline with versioned artifacts 69 | python run.py --training-pipeline --train-dataset-version-name=1 --test-dataset-version-name=1 70 | 71 | # Run the inference pipeline 72 | python run.py --inference-pipeline 73 | ``` 74 | 75 | ## 🌵 Learning MLOps with ZenML 76 | 77 | This project is also a great source of learning about some fundamental MLOps concepts. In sum, there are four exemplary steps happening, that can be mapped onto many other projects: 78 | 79 |
80 | 🥇 Step 1: Load your data and execute feature engineering
81 | 
82 | We'll start off by importing our data. In this project, we'll be working with
83 | [the Breast Cancer](https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic) dataset
84 | which is publicly available on the UCI Machine Learning Repository. The task is a classification
85 | problem: predicting whether a patient is diagnosed with breast cancer or not.
86 | 
87 | When you're getting started with a machine learning problem you'll want to do
88 | something similar to this: import your data and get it in the right shape for
89 | your training. Here are the typical steps within a feature engineering pipeline.
90 | 
91 | The steps are defined in the [steps](steps/) directory, while the [pipelines](pipelines/) directory has the pipeline code that connects them together.
92 | 
93 | Feature engineering pipeline
94 | 
95 | To execute the feature engineering pipeline, run:
96 | 
97 | ```shell
98 | python run.py --feature-pipeline
99 | ```
100 | 
101 | After the pipeline has run, it will produce logs like:
102 | 
103 | ```shell
104 | The latest feature engineering pipeline produced the following artifacts:
105 | 
106 | 1. Train Dataset - Name: dataset_trn, Version Name: 1
107 | 2. Test Dataset: Name: dataset_tst, Version Name: 1
108 | ```
109 | 
110 | We will use these versions in the next pipeline.
111 | 
112 | 
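You can also inspect these versioned datasets from Python, using the same ZenML `Client` API that the pipelines in this project use internally. A minimal sketch, assuming the artifact names (`dataset_trn`, `dataset_tst`) and the version name (`1`) printed in the logs above:

```python
from zenml.client import Client

client = Client()

# Fetch the versioned artifacts by the names and version printed above.
dataset_trn = client.get_artifact_version("dataset_trn", "1")
dataset_tst = client.get_artifact_version("dataset_tst", "1")

# Materialize the underlying pandas DataFrames to inspect them.
train_df = dataset_trn.load()
test_df = dataset_tst.load()
print(train_df.shape, test_df.shape)
```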
113 | 114 |
115 | ⌚ Step 2: Training pipeline
116 | 
117 | Now that our data is prepared, it makes sense to train some models to get a sense of how difficult the task is. The Breast Cancer dataset is sufficiently complex that a perfectly-behaved model is unlikely, but training a few candidates tells us what a reasonable baseline looks like.
118 | 
119 | We'll start with two simple models, an SGD Classifier and a Random Forest
120 | Classifier, both batteries-included from `sklearn`. We'll train them on the
121 | same data and then compare their performance.
122 | 
123 | Training pipeline
124 | 
125 | Run it using the dataset version names from the first step:
126 | 
127 | ```shell
128 | # You can omit the `--train-dataset-version-name` and `--test-dataset-version-name`
129 | # flags to use the latest versions
130 | python run.py --training-pipeline --train-dataset-version-name 1 --test-dataset-version-name 1
131 | ```
132 | 
133 | To track these models, ZenML offers a *Model Control Plane*, which is a central register of all your ML models.
134 | Each run of the training pipeline will produce a ZenML Model Version.
135 | 
136 | ```shell
137 | zenml model list
138 | ```
139 | 
140 | This will show you a new `breast_cancer_classifier` model with two versions, `sgd` and `rf`. You can find out how this was configured in the [YAML pipeline configuration files](configs/).
141 | 
142 | If you are a [ZenML Pro](https://zenml.io/pro) user, you can see all of this visualized in the dashboard:
143 | 
144 | Model Control Plane
145 | 
146 | There is a lot more you can do with ZenML models, including the ability to
147 | track metrics by attaching metadata to them, or to persist them in a model
148 | registry. These topics are explored in more depth in the
149 | [ZenML docs](https://docs.zenml.io).
150 | 
151 | 
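The same information is available programmatically. Here is a short sketch using the ZenML client — the model and version names come from the configs above, while the artifact name `sklearn_classifier` is an assumption based on how this template's trainer step names its output:

```python
from zenml.client import Client

client = Client()

# Fetch one of the two model versions created by the training runs.
rf_version = client.get_model_version("breast_cancer_classifier", "rf")
print(rf_version.name, rf_version.stage)

# The trained classifier is attached to the model version as an artifact,
# so it can be loaded straight back into memory.
classifier = rf_version.get_artifact("sklearn_classifier").load()
```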
152 | 153 |
154 | 💯 Step 3: Promoting the best model to production
155 | 
156 | For now, we will use the ZenML model control plane to promote our best
157 | model to `production`. You can do this by simply setting the `stage` of
158 | your chosen model version to `production`.
159 | 
160 | ```shell
161 | zenml model version update breast_cancer_classifier rf --stage production
162 | ```
163 | 
164 | While we've demonstrated a manual promotion process for clarity, a closer look at the [promoter code](steps/model_promoter.py) reveals that the training pipeline is designed to automate this step. It evaluates the latest model against established production metrics and, if the new model outperforms the existing one on the test set, automatically promotes it to production. Here is an overview of the process:
165 | 
166 | Model Control Plane
167 | 
168 | Again, if you are a [ZenML Pro](https://zenml.io/pro) user, you can see all of this in the cloud dashboard.
169 | 
170 | 
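The core of that automated promotion is small enough to sketch here. This is a condensed illustration of the idea rather than the verbatim step — the real [promoter code](steps/model_promoter.py) also compares the new accuracy against the current production version before switching stages:

```python
from zenml import get_step_context, step


@step
def promote_if_better(accuracy: float, baseline: float = 0.8) -> bool:
    """Condensed promotion logic: promote when accuracy clears a baseline."""
    is_promoted = accuracy >= baseline
    if is_promoted:
        # Move the model version linked to this pipeline run into the
        # `production` stage; `force=True` demotes the previous holder.
        get_step_context().model.set_stage(stage="production", force=True)
    return is_promoted
```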
171 | 172 |
173 | 🫅 Step 4: Consuming the model in production
174 | 
175 | Once the model is promoted, we can consume the right model version in our
176 | batch inference pipeline directly. Let's see how that works.
177 | 
178 | The batch inference pipeline simply takes the model marked as `production` and runs inference on it
179 | with live data. The critical step here is the `inference_predict` step, where we load the model into memory and generate predictions. Apart from loading the model, we must also load the preprocessing pipeline that we fitted during feature engineering,
180 | so that at inference time we can apply exactly the same transformations that were applied at training time. Let's bring it all together:
181 | 
182 | ZenML automatically links all artifacts to the `production` model version as well, including the predictions
183 | that were returned in the pipeline. This completes the MLOps loop of training to inference:
184 | 
185 | Inference pipeline
186 | 
187 | You can also see all predictions ever created as a complete history in the dashboard (again, only for [ZenML Pro](https://zenml.io/pro) users):
188 | 
189 | Model Control Plane
190 | 
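If you later want to pull the production model and its predictions into your own code, a sketch along these lines should work. The artifact names `sklearn_classifier` and `predictions` are assumptions based on the steps in this template, so adjust them if you rename the outputs:

```python
from zenml import Model

# Reference whichever model version currently holds the `production` stage,
# exactly as configs/inference.yaml does.
production_model = Model(name="breast_cancer_classifier", version="production")

# Load the trained classifier and the latest batch predictions.
classifier = production_model.get_artifact("sklearn_classifier").load()
predictions = production_model.get_artifact("predictions").load()
print(predictions)
```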
192 | 
193 | ## :bulb: Learn More
194 | 
195 | You're a legit MLOps engineer now! You trained two models, evaluated them against
196 | a test set, registered the best one with the ZenML model control plane,
197 | and served some predictions. You also learned how to iterate on your models and
198 | data by using some of the ZenML utility abstractions. You saw how to view your
199 | artifacts and stacks via the client as well as the ZenML Dashboard.
200 | 
201 | If you want to learn more about ZenML as a tool, then the
202 | [:page_facing_up: **ZenML Docs**](https://docs.zenml.io/) are the perfect place
203 | to get started. In particular, the [Production Guide](https://docs.zenml.io/user-guide/production-guide/)
204 | goes into more detail on how to transition these same pipelines into production on the cloud.
205 | 
206 | The best way to get a production ZenML instance up and running with all batteries included is [ZenML Pro](https://zenml.io/pro). Check it out!
207 | 
208 | Also, make sure to join our
209 | [Slack Community](https://zenml.io/slack)
210 | to become part of the ZenML family!
211 | 
--------------------------------------------------------------------------------
/template/configs/feature_engineering.yaml:
--------------------------------------------------------------------------------
1 | # environment configuration
2 | settings:
3 |   docker:
4 |     required_integrations:
5 |       - sklearn
6 |       - pandas
7 |     requirements:
8 |       - pyarrow
9 | 
10 | # pipeline configuration
11 | test_size: 0.35
--------------------------------------------------------------------------------
/template/configs/inference.yaml:
--------------------------------------------------------------------------------
1 | # environment configuration
2 | settings:
3 |   docker:
4 |     required_integrations:
5 |       - sklearn
6 |       - pandas
7 |     requirements:
8 |       - pyarrow
9 | 
10 | # configuration of the Model Control Plane
11 | model:
12 |   name: "breast_cancer_classifier"
13 |   version: "production"
14 |   license: Apache 2.0
15 |   description: A breast cancer classifier
16 |   tags: ["breast_cancer", "classifier"]
--------------------------------------------------------------------------------
/template/configs/training_rf.yaml:
--------------------------------------------------------------------------------
1 | # environment configuration
2 | settings:
3 |   docker:
4 |     required_integrations:
5 |       - sklearn
6 |       - pandas
7 |     requirements:
8 |       - pyarrow
9 | 
10 | # configuration of the Model Control Plane
11 | model:
12 |   name: breast_cancer_classifier
13 |   version: rf
14 |   license: Apache 2.0
15 |   description: A breast cancer classifier
16 |   tags: ["breast_cancer", "classifier"]
17 | 
18 | # Configure the pipeline
19 | parameters:
20 |   model_type: "rf"  # Choose between rf/sgd
21 | 
--------------------------------------------------------------------------------
/template/configs/training_sgd.yaml:
--------------------------------------------------------------------------------
1 | # environment configuration
2 | settings:
3 |   docker:
4 |     required_integrations:
5 |       - sklearn
6 |       - pandas
7 |     requirements:
8 |       - pyarrow
9 | 
10 | # configuration of the Model Control Plane
11 | model:
12 |   name: breast_cancer_classifier
13 |   version: sgd
14 |   license: Apache 2.0
15 |   description: A breast cancer classifier
16 |   tags: ["breast_cancer", "classifier"]
17 | 
18 | # Configure the pipeline
19 | parameters:
20 |   model_type: "sgd"  # Choose between rf/sgd
--------------------------------------------------------------------------------
/template/license:
-------------------------------------------------------------------------------- 1 | {% if open_source_license == 'mit' -%} 2 | MIT License 3 | 4 | Copyright (c) {{ full_name }} {% now 'local', '%Y' %} 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | {% elif open_source_license == 'bsd' %} 24 | 25 | BSD License 26 | 27 | Copyright (c) {{ full_name }} {% now 'local', '%Y' %}. All rights reserved. 28 | 29 | Redistribution and use in source and binary forms, with or without modification, 30 | are permitted provided that the following conditions are met: 31 | 32 | * Redistributions of source code must retain the above copyright notice, this 33 | list of conditions and the following disclaimer. 34 | 35 | * Redistributions in binary form must reproduce the above copyright notice, this 36 | list of conditions and the following disclaimer in the documentation and/or 37 | other materials provided with the distribution. 38 | 39 | * Neither the name of the copyright holder nor the names of its 40 | contributors may be used to endorse or promote products derived from this 41 | software without specific prior written permission. 42 | 43 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 44 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 45 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 46 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 47 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 48 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 49 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 50 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 51 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 52 | OF THE POSSIBILITY OF SUCH DAMAGE. 53 | {% elif open_source_license == 'isc' -%} 54 | ISC License 55 | 56 | Copyright (c) {{ full_name }} {% now 'local', '%Y' %} 57 | 58 | Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. 59 | 60 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 61 | {% elif open_source_license == 'apache' -%} 62 | Apache Software License 2.0 63 | 64 | Copyright (c) {{ full_name }} {% now 'local', '%Y' %}. All rights reserved. 65 | 66 | Licensed under the Apache License, Version 2.0 (the "License"); 67 | you may not use this file except in compliance with the License. 68 | You may obtain a copy of the License at 69 | 70 | http://www.apache.org/licenses/LICENSE-2.0 71 | 72 | Unless required by applicable law or agreed to in writing, software 73 | distributed under the License is distributed on an "AS IS" BASIS, 74 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 75 | See the License for the specific language governing permissions and 76 | limitations under the License. 77 | {% elif open_source_license == 'gpl3' -%} 78 | GNU GENERAL PUBLIC LICENSE 79 | Version 3, 29 June 2007 80 | 81 | {{ project_short_description }} 82 | Copyright (C) {{ full_name }} {% now 'local', '%Y' %} 83 | 84 | This program is free software: you can redistribute it and/or modify 85 | it under the terms of the GNU General Public License as published by 86 | the Free Software Foundation, either version 3 of the License, or 87 | (at your option) any later version. 88 | 89 | This program is distributed in the hope that it will be useful, 90 | but WITHOUT ANY WARRANTY; without even the implied warranty of 91 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 92 | GNU General Public License for more details. 93 | 94 | You should have received a copy of the GNU General Public License 95 | along with this program. If not, see . 96 | 97 | Also add information on how to contact you by electronic and paper mail. 98 | 99 | You should also get your employer (if you work as a programmer) or school, 100 | if any, to sign a "copyright disclaimer" for the program, if necessary. 101 | For more information on this, and how to apply and follow the GNU GPL, see 102 | . 103 | 104 | The GNU General Public License does not permit incorporating your program 105 | into proprietary programs. If your program is a subroutine library, you 106 | may consider it more useful to permit linking proprietary applications with 107 | the library. If this is what you want to do, use the GNU Lesser General 108 | Public License instead of this License. But first, please read 109 | . 
110 | {% endif %} -------------------------------------------------------------------------------- /template/license_header: -------------------------------------------------------------------------------- 1 | {%- macro license() %}{% include 'template/license' %}{% endmacro -%} 2 | {{ license() | replace('\n', '\n# ') }} -------------------------------------------------------------------------------- /template/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # {% include 'template/license_header' %} 2 | 3 | from .feature_engineering import feature_engineering 4 | from .inference import inference 5 | from .training import training 6 | -------------------------------------------------------------------------------- /template/pipelines/feature_engineering.py: -------------------------------------------------------------------------------- 1 | # {% include 'template/license_header' %} 2 | 3 | from typing import List, Optional 4 | 5 | from steps import ( 6 | data_loader, 7 | data_preprocessor, 8 | data_splitter, 9 | ) 10 | from zenml import pipeline 11 | from zenml.logger import get_logger 12 | 13 | logger = get_logger(__name__) 14 | 15 | 16 | @pipeline 17 | def feature_engineering( 18 | test_size: float = 0.2, 19 | drop_na: Optional[bool] = None, 20 | normalize: Optional[bool] = None, 21 | drop_columns: Optional[List[str]] = None, 22 | target: Optional[str] = "target", 23 | random_state: int = 17, 24 | ): 25 | """ 26 | Feature engineering pipeline. 27 | 28 | This is a pipeline that loads the data, processes it and splits 29 | it into train and test sets. 30 | 31 | Args: 32 | test_size: Size of holdout set for training 0.0..1.0 33 | drop_na: If `True` NA values will be removed from dataset 34 | normalize: If `True` dataset will be normalized with MinMaxScaler 35 | drop_columns: List of columns to drop from dataset 36 | target: Name of target column in dataset 37 | random_state: Random state to configure the data loader 38 | 39 | Returns: 40 | The processed datasets (dataset_trn, dataset_tst). 41 | """ 42 | # Link all the steps together by calling them and passing the output 43 | # of one step as the input of the next step. 44 | raw_data = data_loader(random_state=random_state, target=target) 45 | dataset_trn, dataset_tst = data_splitter( 46 | dataset=raw_data, 47 | test_size=test_size, 48 | ) 49 | dataset_trn, dataset_tst, _ = data_preprocessor( 50 | dataset_trn=dataset_trn, 51 | dataset_tst=dataset_tst, 52 | drop_na=drop_na, 53 | normalize=normalize, 54 | drop_columns=drop_columns, 55 | target=target, 56 | random_state=random_state, 57 | ) 58 | return dataset_trn, dataset_tst 59 | -------------------------------------------------------------------------------- /template/pipelines/inference.py: -------------------------------------------------------------------------------- 1 | # {% include 'template/license_header' %} 2 | 3 | from steps import ( 4 | data_loader, 5 | inference_predict, 6 | inference_preprocessor, 7 | ) 8 | from zenml import get_pipeline_context, pipeline 9 | from zenml.logger import get_logger 10 | 11 | logger = get_logger(__name__) 12 | 13 | 14 | @pipeline 15 | def inference(random_state: int, target: str): 16 | """ 17 | Model inference pipeline. 18 | 19 | This is a pipeline that loads the inference data, processes it with 20 | the same preprocessing pipeline used in training, and runs inference 21 | with the trained model. 22 | 23 | Args: 24 | random_state: Random state for reproducibility. 
25 | target: Name of target column in dataset. 26 | """ 27 | # Get the production model artifact 28 | model = get_pipeline_context().model.get_artifact("sklearn_classifier") 29 | 30 | # Get the preprocess pipeline artifact associated with this version 31 | preprocess_pipeline = get_pipeline_context().model.get_artifact( 32 | "preprocess_pipeline" 33 | ) 34 | 35 | # Link all the steps together by calling them and passing the output 36 | # of one step as the input of the next step. 37 | df_inference = data_loader(random_state=random_state, is_inference=True) 38 | df_inference = inference_preprocessor( 39 | dataset_inf=df_inference, 40 | preprocess_pipeline=preprocess_pipeline, 41 | target=target, 42 | ) 43 | inference_predict( 44 | model=model, 45 | dataset_inf=df_inference, 46 | ) 47 | -------------------------------------------------------------------------------- /template/pipelines/training.py: -------------------------------------------------------------------------------- 1 | # {% include 'template/license_header' %} 2 | 3 | from typing import Optional 4 | from uuid import UUID 5 | 6 | from steps import model_evaluator, model_promoter, model_trainer 7 | from zenml import pipeline 8 | from zenml.client import Client 9 | from zenml.logger import get_logger 10 | 11 | from pipelines import ( 12 | feature_engineering, 13 | ) 14 | 15 | logger = get_logger(__name__) 16 | 17 | 18 | @pipeline 19 | def training( 20 | train_dataset_id: Optional[UUID] = None, 21 | test_dataset_id: Optional[UUID] = None, 22 | target: Optional[str] = "target", 23 | model_type: Optional[str] = "sgd", 24 | ): 25 | """ 26 | Model training pipeline. 27 | 28 | This is a pipeline that loads the data from a preprocessing pipeline, 29 | trains a model on it and evaluates the model. If it is the first model 30 | to be trained, it will be promoted to production. If not, it will be 31 | promoted only if it has a higher accuracy than the current production 32 | model version. 33 | 34 | Args: 35 | train_dataset_id: ID of the train dataset produced by feature engineering. 36 | test_dataset_id: ID of the test dataset produced by feature engineering. 37 | target: Name of target column in dataset. 38 | model_type: The type of model to train. 39 | """ 40 | # Link all the steps together by calling them and passing the output 41 | # of one step as the input of the next step. 42 | 43 | # Execute Feature Engineering Pipeline 44 | if train_dataset_id is None or test_dataset_id is None: 45 | dataset_trn, dataset_tst = feature_engineering() 46 | else: 47 | client = Client() 48 | dataset_trn = client.get_artifact_version(name_id_or_prefix=train_dataset_id) 49 | dataset_tst = client.get_artifact_version(name_id_or_prefix=test_dataset_id) 50 | 51 | model = model_trainer(dataset_trn=dataset_trn, target=target, model_type=model_type) 52 | 53 | acc = model_evaluator( 54 | model=model, 55 | dataset_trn=dataset_trn, 56 | dataset_tst=dataset_tst, 57 | target=target, 58 | ) 59 | 60 | model_promoter(accuracy=acc) 61 | -------------------------------------------------------------------------------- /template/quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "63ab391a", 6 | "metadata": {}, 7 | "source": [ 8 | "# Intro to MLOps using ZenML\n", 9 | "\n", 10 | "## 🌍 Overview\n", 11 | "\n", 12 | "This repository is a minimalistic MLOps project intended as a starting point to learn how to put ML workflows in production. 
It features: \n", 13 | "\n", 14 | "- A feature engineering pipeline that loads data and prepares it for training.\n", 15 | "- A training pipeline that loads the preprocessed dataset and trains a model.\n", 16 | "- A batch inference pipeline that runs predictions on the trained model with new data.\n", 17 | "\n", 18 | "Follow along this notebook to understand how you can use ZenML to productionalize your ML workflows!\n", 19 | "\n", 20 | "\"Pipelines" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "8f466b16", 26 | "metadata": {}, 27 | "source": [ 28 | "## Run on Colab\n", 29 | "\n", 30 | "You can use Google Colab to see ZenML in action, no signup / installation\n", 31 | "required!\n", 32 | "\n", 33 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](\n", 34 | "https://colab.research.google.com/github/zenml-io/zenml/blob/main/examples/mlops_starter/quickstart.ipynb)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "66b2977c", 40 | "metadata": {}, 41 | "source": [ 42 | "# 👶 Step 0. Install Requirements\n", 43 | "\n", 44 | "Let's install ZenML to get started. First we'll install the latest version of\n", 45 | "ZenML as well as the `sklearn` integration of ZenML:" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "id": "ce2f40eb", 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "!pip install \"zenml[server]\"" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "5aad397e", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "from zenml.environment import Environment\n", 66 | "\n", 67 | "if Environment.in_google_colab():\n", 68 | " # Install Cloudflare Tunnel binary\n", 69 | " !wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb && dpkg -i cloudflared-linux-amd64.deb\n", 70 | "\n", 71 | " # Pull required modules from this example\n", 72 | " !git clone -b main https://github.com/zenml-io/zenml\n", 73 | " !cp -r zenml/examples/quickstart/* .\n", 74 | " !rm -rf zenml" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "f76f562e", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "!zenml integration install sklearn -y\n", 85 | "\n", 86 | "import IPython\n", 87 | "\n", 88 | "IPython.Application.instance().kernel.do_shutdown(restart=True)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "id": "3b044374", 94 | "metadata": {}, 95 | "source": [ 96 | "Please wait for the installation to complete before running subsequent cells. At\n", 97 | "the end of the installation, the notebook kernel will automatically restart." 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "966ce581", 103 | "metadata": {}, 104 | "source": [ 105 | "## ☁️ Step 1: Connect to ZenML Pro\n", 106 | "\n", 107 | "If you are using [ZenML Pro](https://zenml.io/pro), execute the following\n", 108 | "cell with your tenant URL. Otherwise ignore.\n", 109 | "\n", 110 | "ZenML Pro is a managed service that provides a hosted ZenML environment. It\n", 111 | "allows you to run your pipelines on the cloud, manage your metadata, and\n", 112 | "collaborate with your team. Sign up [here](https://zenml.io/pro) for\n", 113 | "a free trial and to get started!" 
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "id": "e2587315", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "zenml_server_url = \"PLEASE_UPDATE_ME\" # in the form \"https://URL_TO_SERVER\"\n", 124 | "\n", 125 | "!zenml login $zenml_server_url" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "id": "081d5616", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "# Initialize ZenML and set the default stack\n", 136 | "!zenml init\n", 137 | "\n", 138 | "!zenml stack set default" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "id": "79f775f2", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# Do the imports at the top\n", 149 | "import random\n", 150 | "from typing import List, Optional\n", 151 | "from uuid import UUID\n", 152 | "\n", 153 | "import pandas as pd\n", 154 | "from sklearn.datasets import load_breast_cancer\n", 155 | "from steps import (\n", 156 | " data_loader,\n", 157 | " data_preprocessor,\n", 158 | " data_splitter,\n", 159 | " inference_preprocessor,\n", 160 | " model_evaluator,\n", 161 | ")\n", 162 | "from typing_extensions import Annotated\n", 163 | "from zenml import Model, get_step_context, pipeline, step\n", 164 | "from zenml.client import Client\n", 165 | "from zenml.logger import get_logger\n", 166 | "\n", 167 | "logger = get_logger(__name__)\n", 168 | "\n", 169 | "# Initialize the ZenML client to fetch objects from the ZenML Server\n", 170 | "client = Client()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "id": "35e48460", 176 | "metadata": {}, 177 | "source": [ 178 | "## 🥇 Step 2: Load your data and execute feature engineering\n", 179 | "\n", 180 | "We'll start off by importing our data. In this quickstart we'll be working with\n", 181 | "[the Breast Cancer](https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic) dataset\n", 182 | "which is publicly available on the UCI Machine Learning Repository. The task is a classification\n", 183 | "problem, to predict whether a patient is diagnosed with breast cancer or not.\n", 184 | "\n", 185 | "When you're getting started with a machine learning problem you'll want to do\n", 186 | "something similar to this: import your data and get it in the right shape for\n", 187 | "your training. 
ZenML mostly gets out of your way when you're writing your Python\n",
188 |     "code, as you'll see from the following cell.\n",
189 |     "\n",
190 |     "![Feature engineering pipeline](.assets/feature_engineering_pipeline.png)"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": null,
196 |    "id": "3cd974d1",
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "@step\n",
201 |     "def data_loader_simplified(\n",
202 |     "    random_state: int, is_inference: bool = False, target: str = \"target\"\n",
203 |     ") -> Annotated[pd.DataFrame, \"dataset\"]:  # We name the dataset\n",
204 |     "    \"\"\"Dataset reader step.\"\"\"\n",
205 |     "    dataset = load_breast_cancer(as_frame=True)\n",
206 |     "    inference_size = int(len(dataset.target) * 0.05)\n",
207 |     "    dataset: pd.DataFrame = dataset.frame\n",
208 |     "    inference_subset = dataset.sample(inference_size, random_state=random_state)\n",
209 |     "    if is_inference:\n",
210 |     "        dataset = inference_subset\n",
211 |     "        dataset.drop(columns=target, inplace=True)\n",
212 |     "    else:\n",
213 |     "        dataset.drop(inference_subset.index, inplace=True)\n",
214 |     "    dataset.reset_index(drop=True, inplace=True)\n",
215 |     "    logger.info(f\"Dataset with {len(dataset)} records loaded!\")\n",
216 |     "    return dataset"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "markdown",
221 |    "id": "1e8ba4c6",
222 |    "metadata": {},
223 |    "source": [
224 |     "The whole function is decorated with the `@step` decorator, which\n",
225 |     "tells ZenML to track this function as a step in the pipeline. This means that\n",
226 |     "ZenML will automatically version, track, and cache the data that is produced by\n",
227 |     "this function as an `artifact`. This is a very powerful feature, as it means that you can\n",
228 |     "reproduce your data at any point in the future, even if the original data source\n",
229 |     "changes or disappears. \n",
230 |     "\n",
231 |     "Note the use of the `Annotated` type hint (from `typing_extensions`) in the output of the\n",
232 |     "step. We're using this to give a name to the output of the step, which will make\n",
233 |     "it possible to access it via a keyword later on.\n",
234 |     "\n",
235 |     "You'll also notice that we have included type hints for the outputs\n",
236 |     "to the function. These are not only useful for anyone reading your code, but also\n",
237 |     "help ZenML process your data in a way appropriate to the specific data types."
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "markdown",
242 |    "id": "b6286b67",
243 |    "metadata": {},
244 |    "source": [
245 |     "ZenML is built in a way that allows you to experiment with your data and build\n",
246 |     "your pipelines as you work, so if you want to call this function to see how it\n",
247 |     "works, you can just call it directly. Here we take a look at the first few rows\n",
248 |     "of your training dataset."
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": null,
254 |    "id": "d838e2ea",
255 |    "metadata": {},
256 |    "outputs": [],
257 |    "source": [
258 |     "df = data_loader_simplified(random_state=42)\n",
259 |     "df.head()"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "markdown",
264 |    "id": "28c05291",
265 |    "metadata": {},
266 |    "source": [
267 |     "Everything looks as we'd expect and the values are all in the right format 🥳.\n",
268 |     "\n",
269 |     "We're now at the point where we can bring this step (and some others) together into a single\n",
270 |     "pipeline, the top-level organising entity for code in ZenML. Creating such a pipeline is\n",
271 |     "as simple as adding a `@pipeline` decorator to a function. 
This specific\n", 272 | "pipeline doesn't return a value, but that option is available to you if you need." 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "id": "b50a9537", 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "@pipeline\n", 283 | "def feature_engineering(\n", 284 | " test_size: float = 0.3,\n", 285 | " drop_na: Optional[bool] = None,\n", 286 | " normalize: Optional[bool] = None,\n", 287 | " drop_columns: Optional[List[str]] = None,\n", 288 | " target: Optional[str] = \"target\",\n", 289 | " random_state: int = 17,\n", 290 | "):\n", 291 | " \"\"\"Feature engineering pipeline.\"\"\"\n", 292 | " # Link all the steps together by calling them and passing the output\n", 293 | " # of one step as the input of the next step.\n", 294 | " raw_data = data_loader(random_state=random_state, target=target)\n", 295 | " dataset_trn, dataset_tst = data_splitter(\n", 296 | " dataset=raw_data,\n", 297 | " test_size=test_size,\n", 298 | " )\n", 299 | " dataset_trn, dataset_tst, _ = data_preprocessor(\n", 300 | " dataset_trn=dataset_trn,\n", 301 | " dataset_tst=dataset_tst,\n", 302 | " drop_na=drop_na,\n", 303 | " normalize=normalize,\n", 304 | " drop_columns=drop_columns,\n", 305 | " target=target,\n", 306 | " random_state=random_state,\n", 307 | " )" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "id": "7cd73c23", 313 | "metadata": {}, 314 | "source": [ 315 | "We're ready to run the pipeline now, which we can do just as with the step - by calling the\n", 316 | "pipeline function itself:" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "id": "1e0aa9af", 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "feature_engineering()" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "id": "1785c303", 332 | "metadata": {}, 333 | "source": [ 334 | "Let's run this again with a slightly different test size, to create more datasets:" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "id": "658c0570-2607-4b97-a72d-d45c92633e48", 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "feature_engineering(test_size=0.25)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "id": "64bb7206", 350 | "metadata": {}, 351 | "source": [ 352 | "Notice the second time around, the data loader step was **cached**, while the rest of the pipeline was rerun. \n", 353 | "This is because ZenML automatically determined that nothing had changed in the data loader step, \n", 354 | "so it didn't need to rerun it." 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "id": "5bc6849d-31ac-4c08-9ca2-cf7f5f35ccbf", 360 | "metadata": {}, 361 | "source": [ 362 | "Let's run this again with a slightly different test size and random state, to disable the cache and to create more datasets:" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "id": "1e1d8546", 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "feature_engineering(test_size=0.25, random_state=104)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "id": "6c42078a", 378 | "metadata": {}, 379 | "source": [ 380 | "At this point you might be interested to view your pipeline runs in the ZenML\n", 381 | "Dashboard. In case you are not using a hosted instance of ZenML, you can spin this up by executing the next cell. 
This will start a\n", 382 | "server which you can access by clicking on the link that appears in the output\n", 383 | "of the cell.\n", 384 | "\n", 385 | "Log into the Dashboard using default credentials (username 'default' and\n", 386 | "password left blank). From there you can inspect the pipeline or the specific\n", 387 | "pipeline run.\n" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "id": "8cd3cc8c", 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "from zenml.environment import Environment\n", 398 | "from zenml.zen_stores.rest_zen_store import RestZenStore\n", 399 | "\n", 400 | "if not isinstance(client.zen_store, RestZenStore):\n", 401 | " # Only spin up a local Dashboard in case you aren't already connected to a remote server\n", 402 | " if Environment.in_google_colab():\n", 403 | " # run ZenML through a cloudflare tunnel to get a public endpoint\n", 404 | " !zenml login --local --port 8237 & cloudflared tunnel --url http://localhost:8237\n", 405 | " else:\n", 406 | " !zenml login --local" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "id": "e8471f93", 412 | "metadata": {}, 413 | "source": [ 414 | "We can also fetch the pipeline from the server and view the results directly in the notebook:" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "id": "f208b200", 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "client = Client()\n", 425 | "run = client.get_pipeline(\"feature_engineering\").last_run\n", 426 | "print(run.name)" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "id": "a037f09d", 432 | "metadata": {}, 433 | "source": [ 434 | "We can also see the data artifacts that were produced by the last step of the pipeline:" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "id": "34283e89", 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "run.steps[\"data_preprocessor\"].outputs" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "id": "bceb0312", 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "# Read one of the datasets. This is the one with a 0.25 test split\n", 455 | "run.steps[\"data_preprocessor\"].outputs[\"dataset_trn\"].load()" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "id": "26d26436", 461 | "metadata": {}, 462 | "source": [ 463 | "We can also get the artifacts directly. 
Each time you create a new pipeline run, a new `artifact version` is created.\n",
464 |     "\n",
465 |     "You can fetch these artifacts and their versions using the `client`: "
466 |    ]
467 |   },
468 |   {
469 |    "cell_type": "code",
470 |    "execution_count": null,
471 |    "id": "c8f90647",
472 |    "metadata": {},
473 |    "outputs": [],
474 |    "source": [
475 |     "# Get artifact version from our run\n",
476 |     "dataset_trn_artifact_version_via_run = run.steps[\"data_preprocessor\"].outputs[\n",
477 |     "    \"dataset_trn\"\n",
478 |     "]\n",
479 |     "\n",
480 |     "# Get latest version from client directly\n",
481 |     "dataset_trn_artifact_version = client.get_artifact_version(\"dataset_trn\")\n",
482 |     "\n",
483 |     "# This should be true if our run is the latest run and no artifact has been produced\n",
484 |     "# in the intervening time\n",
485 |     "dataset_trn_artifact_version_via_run.id == dataset_trn_artifact_version.id"
486 |    ]
487 |   },
488 |   {
489 |    "cell_type": "code",
490 |    "execution_count": null,
491 |    "id": "3f9d3dfd",
492 |    "metadata": {},
493 |    "outputs": [],
494 |    "source": [
495 |     "# Fetch the rest of the artifacts\n",
496 |     "dataset_tst_artifact_version = client.get_artifact_version(\"dataset_tst\")\n",
497 |     "preprocessing_pipeline_artifact_version = client.get_artifact_version(\n",
498 |     "    \"preprocess_pipeline\"\n",
499 |     ")"
500 |    ]
501 |   },
502 |   {
503 |    "cell_type": "markdown",
504 |    "id": "7a7d1b04",
505 |    "metadata": {},
506 |    "source": [
507 |     "If you started with a fresh install, then you would have two versions corresponding\n",
508 |     "to the two pipelines that we ran above. We can even load an artifact version into memory: "
509 |    ]
510 |   },
511 |   {
512 |    "cell_type": "code",
513 |    "execution_count": null,
514 |    "id": "c82aca75",
515 |    "metadata": {},
516 |    "outputs": [],
517 |    "source": [
518 |     "# Load an artifact to verify you can fetch it\n",
519 |     "dataset_trn_artifact_version.load()"
520 |    ]
521 |   },
522 |   {
523 |    "cell_type": "markdown",
524 |    "id": "5963509e",
525 |    "metadata": {},
526 |    "source": [
527 |     "We'll use these artifacts from above in our next pipeline."
528 |    ]
529 |   },
530 |   {
531 |    "cell_type": "markdown",
532 |    "id": "8c28b474",
533 |    "metadata": {},
534 |    "source": [
535 |     "# ⌚ Step 3: Training pipeline"
536 |    ]
537 |   },
538 |   {
539 |    "cell_type": "markdown",
540 |    "id": "87909827",
541 |    "metadata": {},
542 |    "source": [
543 |     "Now that we have our data, it makes sense to train some models to get a sense of\n",
544 |     "how difficult the task is. The Breast Cancer dataset is sufficiently large and complex \n",
545 |     "that it's unlikely we'll be able to train a model that behaves perfectly, \n",
546 |     "but we can get a sense of what a reasonable baseline looks like.\n",
547 |     "\n",
548 |     "We'll start with two simple models, an SGD Classifier and a Random Forest\n",
549 |     "Classifier, both batteries-included from `sklearn`. 
We'll train them both on the\n",
550 |     "same data and then compare their performance.\n",
551 |     "\n",
552 |     "![Training pipeline](.assets/training_pipeline.png)"
553 |    ]
554 |   },
555 |   {
556 |    "cell_type": "code",
557 |    "execution_count": null,
558 |    "id": "fccf1bd9",
559 |    "metadata": {},
560 |    "outputs": [],
561 |    "source": [
562 |     "import pandas as pd\n",
563 |     "from sklearn.base import ClassifierMixin\n",
564 |     "from sklearn.ensemble import RandomForestClassifier\n",
565 |     "from sklearn.linear_model import SGDClassifier\n",
566 |     "from typing_extensions import Annotated\n",
567 |     "from zenml import ArtifactConfig, step\n",
568 |     "from zenml.logger import get_logger\n",
569 |     "\n",
570 |     "logger = get_logger(__name__)\n",
571 |     "\n",
572 |     "\n",
573 |     "@step\n",
574 |     "def model_trainer(\n",
575 |     "    dataset_trn: pd.DataFrame,\n",
576 |     "    model_type: str = \"sgd\",\n",
577 |     ") -> Annotated[\n",
578 |     "    ClassifierMixin, ArtifactConfig(name=\"sklearn_classifier\", is_model_artifact=True)\n",
579 |     "]:\n",
580 |     "    \"\"\"Configure and train a model on the training dataset.\"\"\"\n",
581 |     "    target = \"target\"\n",
582 |     "    if model_type == \"sgd\":\n",
583 |     "        model = SGDClassifier()\n",
584 |     "    elif model_type == \"rf\":\n",
585 |     "        model = RandomForestClassifier()\n",
586 |     "    else:\n",
587 |     "        raise ValueError(f\"Unknown model type {model_type}\")\n",
588 |     "\n",
589 |     "    logger.info(f\"Training model {model}...\")\n",
590 |     "\n",
591 |     "    model.fit(\n",
592 |     "        dataset_trn.drop(columns=[target]),\n",
593 |     "        dataset_trn[target],\n",
594 |     "    )\n",
595 |     "    return model"
596 |    ]
597 |   },
598 |   {
599 |    "cell_type": "markdown",
600 |    "id": "73a00008",
601 |    "metadata": {},
602 |    "source": [
603 |     "Our training step can return two different kinds of `sklearn` classifier\n",
604 |     "models, so we use the generic `ClassifierMixin` type hint for the return type."
605 |    ]
606 |   },
607 |   {
608 |    "cell_type": "markdown",
609 |    "id": "a5f22174",
610 |    "metadata": {},
611 |    "source": [
612 |     "ZenML allows you to load any version of any dataset that is tracked by the framework\n",
613 |     "directly into a pipeline using the `Client().get_artifact_version` interface. This is very convenient\n",
614 |     "in this case, as we'd like to send our preprocessed dataset from the older pipeline directly\n",
615 |     "into the training pipeline."
616 |    ]
617 |   },
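  {
   "cell_type": "markdown",
   "id": "b3a2e1c0",
   "metadata": {},
   "source": [
    "If you want to pin an exact dataset version instead of the latest one, `get_artifact_version` also accepts a version name as a second argument; this is how `run.py` in this template loads versioned datasets. A minimal sketch (the version string `\"1\"` is only a placeholder; substitute a version name from your own runs):\n",
    "\n",
    "```python\n",
    "dataset_trn_v1 = client.get_artifact_version(\"dataset_trn\", \"1\")\n",
    "```"
   ]
  },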
618 |   {
619 |    "cell_type": "code",
620 |    "execution_count": null,
621 |    "id": "1aa98f2f",
622 |    "metadata": {},
623 |    "outputs": [],
624 |    "source": [
625 |     "@pipeline\n",
626 |     "def training(\n",
627 |     "    train_dataset_id: Optional[UUID] = None,\n",
628 |     "    test_dataset_id: Optional[UUID] = None,\n",
629 |     "    model_type: str = \"sgd\",\n",
630 |     "    min_train_accuracy: float = 0.0,\n",
631 |     "    min_test_accuracy: float = 0.0,\n",
632 |     "):\n",
633 |     "    \"\"\"Model training pipeline.\"\"\"\n",
634 |     "    if train_dataset_id is None or test_dataset_id is None:\n",
635 |     "        # If we don't pass the IDs, this will run the feature engineering pipeline\n",
636 |     "        dataset_trn, dataset_tst = feature_engineering()\n",
637 |     "    else:\n",
638 |     "        # Load the datasets from an older pipeline\n",
639 |     "        dataset_trn = client.get_artifact_version(name_id_or_prefix=train_dataset_id)\n",
640 |     "        dataset_tst = client.get_artifact_version(name_id_or_prefix=test_dataset_id)\n",
641 |     "\n",
642 |     "    trained_model = model_trainer(\n",
643 |     "        dataset_trn=dataset_trn,\n",
644 |     "        model_type=model_type,\n",
645 |     "    )\n",
646 |     "\n",
647 |     "    model_evaluator(\n",
648 |     "        model=trained_model,\n",
649 |     "        dataset_trn=dataset_trn,\n",
650 |     "        dataset_tst=dataset_tst,\n",
651 |     "        min_train_accuracy=min_train_accuracy,\n",
652 |     "        min_test_accuracy=min_test_accuracy,\n",
653 |     "    )"
654 |    ]
655 |   },
656 |   {
657 |    "cell_type": "markdown",
658 |    "id": "88b70fd3",
659 |    "metadata": {},
660 |    "source": [
661 |     "The end goal of this quick baseline evaluation is to understand which of the two\n",
662 |     "models performs better. We'll use the `model_evaluator` step to compare the two\n",
663 |     "models. This step takes in the model from the trainer step, and computes its score\n",
664 |     "over the testing set."
665 |    ]
666 |   },
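  {
   "cell_type": "markdown",
   "id": "a7e4c9d1",
   "metadata": {},
   "source": [
    "For reference, the imported `model_evaluator` step (defined in `steps/model_evaluator.py`) essentially computes `sklearn` accuracy scores on both splits and returns the test accuracy, roughly:\n",
    "\n",
    "```python\n",
    "trn_acc = model.score(dataset_trn.drop(columns=[\"target\"]), dataset_trn[\"target\"])\n",
    "tst_acc = model.score(dataset_tst.drop(columns=[\"target\"]), dataset_tst[\"target\"])\n",
    "```\n",
    "\n",
    "It also logs a warning if either accuracy falls below the configured minimum thresholds."
   ]
  },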
667 |   {
668 |    "cell_type": "code",
669 |    "execution_count": null,
670 |    "id": "c64885ac",
671 |    "metadata": {},
672 |    "outputs": [],
673 |    "source": [
674 |     "# Use a random forest model with the chosen datasets.\n",
675 |     "# We need to pass the IDs of the datasets into the function\n",
676 |     "training(\n",
677 |     "    model_type=\"rf\",\n",
678 |     "    train_dataset_id=dataset_trn_artifact_version.id,\n",
679 |     "    test_dataset_id=dataset_tst_artifact_version.id,\n",
680 |     ")\n",
681 |     "\n",
682 |     "rf_run = client.get_pipeline(\"training\").last_run"
683 |    ]
684 |   },
685 |   {
686 |    "cell_type": "code",
687 |    "execution_count": null,
688 |    "id": "4300c82f",
689 |    "metadata": {},
690 |    "outputs": [],
691 |    "source": [
692 |     "# Use an SGD classifier\n",
693 |     "training(\n",
694 |     "    model_type=\"sgd\",\n",
695 |     "    train_dataset_id=dataset_trn_artifact_version.id,\n",
696 |     "    test_dataset_id=dataset_tst_artifact_version.id,\n",
697 |     ")\n",
698 |     "\n",
699 |     "sgd_run = client.get_pipeline(\"training\").last_run"
700 |    ]
701 |   },
702 |   {
703 |    "cell_type": "markdown",
704 |    "id": "43f1a68a",
705 |    "metadata": {},
706 |    "source": [
707 |     "You can see from the logs already how our model training went: the\n",
708 |     "`RandomForestClassifier` performed considerably better than the `SGDClassifier`.\n",
709 |     "We can use the ZenML `Client` to verify this:"
710 |    ]
711 |   },
712 |   {
713 |    "cell_type": "code",
714 |    "execution_count": null,
715 |    "id": "d95810b1",
716 |    "metadata": {},
717 |    "outputs": [],
718 |    "source": [
719 |     "# The evaluator returns a float value with the accuracy\n",
720 |     "rf_run.steps[\"model_evaluator\"].output.load() > sgd_run.steps[\n",
721 |     "    \"model_evaluator\"\n",
722 |     "].output.load()"
723 |    ]
724 |   },
725 |   {
726 |    "cell_type": "markdown",
727 |    "id": "e256d145",
728 |    "metadata": {},
729 |    "source": [
730 |     "# 💯 Step 4: Associating a model with your pipeline"
731 |    ]
732 |   },
733 |   {
734 |    "cell_type": "markdown",
735 |    "id": "927978f3",
736 |    "metadata": {},
737 |    "source": [
738 |     "You can see it is relatively easy to train ML models using ZenML pipelines. But it can be somewhat clunky to track\n",
739 |     "all the models produced as you develop your experiments and use cases. 
Luckily, ZenML offers a *Model Control Plane*,\n",
740 |     "which is a central register of all your ML models.\n",
741 |     "\n",
742 |     "You can easily create a ZenML Model and associate it with your pipelines using the `Model` object:"
743 |    ]
744 |   },
745 |   {
746 |    "cell_type": "code",
747 |    "execution_count": null,
748 |    "id": "99ca00c0",
749 |    "metadata": {},
750 |    "outputs": [],
751 |    "source": [
752 |     "pipeline_settings = {}\n",
753 |     "\n",
754 |     "# Let's add some metadata to the model to make it identifiable\n",
755 |     "pipeline_settings[\"model\"] = Model(\n",
756 |     "    name=\"breast_cancer_classifier\",\n",
757 |     "    license=\"Apache 2.0\",\n",
758 |     "    description=\"A breast cancer classifier\",\n",
759 |     "    tags=[\"breast_cancer\", \"classifier\"],\n",
760 |     ")"
761 |    ]
762 |   },
763 |   {
764 |    "cell_type": "code",
765 |    "execution_count": null,
766 |    "id": "0e78a520",
767 |    "metadata": {},
768 |    "outputs": [],
769 |    "source": [
770 |     "# Let's train the SGD model and set the version name to \"sgd\"\n",
771 |     "pipeline_settings[\"model\"].version = \"sgd\"\n",
772 |     "\n",
773 |     "# the `with_options` method allows us to pass in pipeline settings\n",
774 |     "# and returns a configured pipeline\n",
775 |     "training_configured = training.with_options(**pipeline_settings)\n",
776 |     "\n",
777 |     "# We can now run this as usual\n",
778 |     "training_configured(\n",
779 |     "    model_type=\"sgd\",\n",
780 |     "    train_dataset_id=dataset_trn_artifact_version.id,\n",
781 |     "    test_dataset_id=dataset_tst_artifact_version.id,\n",
782 |     ")"
783 |    ]
784 |   },
785 |   {
786 |    "cell_type": "code",
787 |    "execution_count": null,
788 |    "id": "9b8e0002",
789 |    "metadata": {},
790 |    "outputs": [],
791 |    "source": [
792 |     "# Let's train the RF model and set the version name to \"rf\"\n",
793 |     "pipeline_settings[\"model\"].version = \"rf\"\n",
794 |     "\n",
795 |     "# the `with_options` method allows us to pass in pipeline settings\n",
796 |     "# and returns a configured pipeline\n",
797 |     "training_configured = training.with_options(**pipeline_settings)\n",
798 |     "\n",
799 |     "# Let's run it again to make sure we have two versions\n",
800 |     "training_configured(\n",
801 |     "    model_type=\"rf\",\n",
802 |     "    train_dataset_id=dataset_trn_artifact_version.id,\n",
803 |     "    test_dataset_id=dataset_tst_artifact_version.id,\n",
804 |     ")"
805 |    ]
806 |   },
807 |   {
808 |    "cell_type": "markdown",
809 |    "id": "09597223",
810 |    "metadata": {},
811 |    "source": [
812 |     "This time, running the pipeline twice has created two associated **model versions**.\n",
813 |     "You can list your ZenML models and their versions as follows:"
814 |    ]
815 |   },
816 |   {
817 |    "cell_type": "code",
818 |    "execution_count": null,
819 |    "id": "fbb25913",
820 |    "metadata": {},
821 |    "outputs": [],
822 |    "source": [
823 |     "zenml_model = client.get_model(\"breast_cancer_classifier\")\n",
824 |     "print(zenml_model)\n",
825 |     "\n",
826 |     "print(f\"Model {zenml_model.name} has {len(zenml_model.versions)} versions\")\n",
827 |     "\n",
828 |     "zenml_model.versions[0].version, zenml_model.versions[1].version"
829 |    ]
830 |   },
831 |   {
832 |    "cell_type": "markdown",
833 |    "id": "e82cfac2",
834 |    "metadata": {},
835 |    "source": [
836 |     "The interesting part is that ZenML went ahead and linked all artifacts produced by the\n",
837 |     "pipelines to that model version, including the two pickle files that represent our\n",
838 |     "SGD and RandomForest classifier. 
We can see all artifacts directly from the model\n",
839 |     "version object:"
840 |    ]
841 |   },
842 |   {
843 |    "cell_type": "code",
844 |    "execution_count": null,
845 |    "id": "31211413",
846 |    "metadata": {},
847 |    "outputs": [],
848 |    "source": [
849 |     "# Let's load the RF version\n",
850 |     "rf_zenml_model_version = client.get_model_version(\"breast_cancer_classifier\", \"rf\")\n",
851 |     "\n",
852 |     "# We can now load our classifier directly as well\n",
853 |     "random_forest_classifier = rf_zenml_model_version.get_artifact(\n",
854 |     "    \"sklearn_classifier\"\n",
855 |     ").load()\n",
856 |     "\n",
857 |     "random_forest_classifier"
858 |    ]
859 |   },
860 |   {
861 |    "cell_type": "markdown",
862 |    "id": "53517a9a",
863 |    "metadata": {},
864 |    "source": [
865 |     "If you are a [ZenML Pro](https://zenml.io/pro) user, you can see all of this visualized in the dashboard:\n",
866 |     "\n",
867 |     "![Model Control Plane screenshot](.assets/cloud_mcp_screenshot.png)"
868 |    ]
869 |   },
870 |   {
871 |    "cell_type": "markdown",
872 |    "id": "eb645dde",
873 |    "metadata": {},
874 |    "source": [
875 |     "There is a lot more you can do with ZenML models, including the ability to\n",
876 |     "track metrics by adding metadata to them, or having them persist in a model\n",
877 |     "registry. However, these topics can be explored more in the\n",
878 |     "[ZenML docs](https://docs.zenml.io).\n",
879 |     "\n",
880 |     "For now, we will use the ZenML model control plane to promote our best\n",
881 |     "model to `production`. You can do this by simply setting the `stage` of\n",
882 |     "your chosen model version to `production`."
883 |    ]
884 |   },
885 |   {
886 |    "cell_type": "code",
887 |    "execution_count": null,
888 |    "id": "26b718f8",
889 |    "metadata": {},
890 |    "outputs": [],
891 |    "source": [
892 |     "# Set our best classifier to production\n",
893 |     "rf_zenml_model_version.set_stage(\"production\", force=True)"
894 |    ]
895 |   },
896 |   {
897 |    "cell_type": "markdown",
898 |    "id": "9fddf3d0",
899 |    "metadata": {},
900 |    "source": [
901 |     "Of course, normally one would only promote the model after comparing it to all other model\n",
902 |     "versions and running some additional tests. But that's a more advanced use case. See the\n",
903 |     "[e2e_batch example](https://github.com/zenml-io/zenml/tree/main/examples/e2e) to get\n",
904 |     "more insight into that sort of flow!"
905 |    ]
906 |   },
907 |   {
908 |    "cell_type": "markdown",
909 |    "id": "2ecbc8cf",
910 |    "metadata": {},
911 |    "source": [
912 |     "![Model versions in the Model Control Plane](.assets/cloud_mcp.png)"
913 |    ]
914 |   },
915 |   {
916 |    "cell_type": "markdown",
917 |    "id": "8f1146db",
918 |    "metadata": {},
919 |    "source": [
920 |     "Once the model is promoted, we can consume the right model version in our\n",
921 |     "batch inference pipeline directly. Let's see how that works."
922 |    ]
923 |   },
924 |   {
925 |    "cell_type": "markdown",
926 |    "id": "d6306f14",
927 |    "metadata": {},
928 |    "source": [
929 |     "# 🫅 Step 5: Consuming the model in production"
930 |    ]
931 |   },
932 |   {
933 |    "cell_type": "markdown",
934 |    "id": "b51f3108",
935 |    "metadata": {},
936 |    "source": [
937 |     "The batch inference pipeline simply takes the model marked as `production` and runs inference on it\n",
938 |     "with `live data`. 
The critical step here is the `inference_predict` step, where we load the model in memory\n",
939 |     "and generate predictions:\n",
940 |     "\n",
941 |     "![Inference pipeline](.assets/inference_pipeline.png)"
942 |    ]
943 |   },
944 |   {
945 |    "cell_type": "code",
946 |    "execution_count": null,
947 |    "id": "92c4c7dc",
948 |    "metadata": {},
949 |    "outputs": [],
950 |    "source": [
951 |     "@step\n",
952 |     "def inference_predict(dataset_inf: pd.DataFrame) -> Annotated[pd.Series, \"predictions\"]:\n",
953 |     "    \"\"\"Predictions step.\"\"\"\n",
954 |     "    # Get the model\n",
955 |     "    model = get_step_context().model\n",
956 |     "\n",
957 |     "    # run prediction from memory\n",
958 |     "    predictor = model.load_artifact(\"sklearn_classifier\")\n",
959 |     "    predictions = predictor.predict(dataset_inf)\n",
960 |     "\n",
961 |     "    predictions = pd.Series(predictions, name=\"predicted\")\n",
962 |     "\n",
963 |     "    return predictions"
964 |    ]
965 |   },
966 |   {
967 |    "cell_type": "markdown",
968 |    "id": "3aeb227b",
969 |    "metadata": {},
970 |    "source": [
971 |     "Apart from loading the model, we must also load the preprocessing pipeline that we ran during feature engineering,\n",
972 |     "so that we can apply at inference time exactly the same steps that we applied at training time. Let's bring it all together:"
973 |    ]
974 |   },
975 |   {
976 |    "cell_type": "code",
977 |    "execution_count": null,
978 |    "id": "37c409bd",
979 |    "metadata": {},
980 |    "outputs": [],
981 |    "source": [
982 |     "@pipeline\n",
983 |     "def inference(preprocess_pipeline_id: UUID):\n",
984 |     "    \"\"\"Model batch inference pipeline\"\"\"\n",
985 |     "    # random_state = client.get_artifact_version(name_id_or_prefix=preprocess_pipeline_id).metadata[\"random_state\"]\n",
986 |     "    # target = client.get_artifact_version(name_id_or_prefix=preprocess_pipeline_id).run_metadata['target']\n",
987 |     "    random_state = 42\n",
988 |     "    target = \"target\"\n",
989 |     "\n",
990 |     "    df_inference = data_loader(random_state=random_state, is_inference=True)\n",
991 |     "    df_inference = inference_preprocessor(\n",
992 |     "        dataset_inf=df_inference,\n",
993 |     "        # We use the preprocess pipeline from the feature engineering pipeline\n",
994 |     "        preprocess_pipeline=client.get_artifact_version(\n",
995 |     "            name_id_or_prefix=preprocess_pipeline_id\n",
996 |     "        ),\n",
997 |     "        target=target,\n",
998 |     "    )\n",
999 |     "    inference_predict(\n",
1000 |     "        dataset_inf=df_inference,\n",
1001 |     "    )"
1002 |    ]
1003 |   },
1004 |   {
1005 |    "cell_type": "markdown",
1006 |    "id": "c7afe7be",
1007 |    "metadata": {},
1008 |    "source": [
1009 |     "The way to load the right model is to pass the `production` stage into the `Model` config this time.\n",
1010 |     "This ensures that the production model is always loaded, decoupled from all other pipelines:"
1011 |    ]
1012 |   },
1013 |   {
1014 |    "cell_type": "code",
1015 |    "execution_count": null,
1016 |    "id": "61bf5939",
1017 |    "metadata": {},
1018 |    "outputs": [],
1019 |    "source": [
1020 |     "pipeline_settings = {\"enable_cache\": False}\n",
1021 |     "\n",
1022 |     "# Let's add some metadata to the model to make it identifiable\n",
1023 |     "pipeline_settings[\"model\"] = Model(\n",
1024 |     "    name=\"breast_cancer_classifier\",\n",
1025 |     "    version=\"production\",  # We can pass in the stage name here!\n",
1026 |     "    license=\"Apache 2.0\",\n",
1027 |     "    description=\"A breast cancer classifier\",\n",
1028 |     "    tags=[\"breast_cancer\", \"classifier\"],\n",
1029 |     ")"
1030 |    ]
1031 |   },
1032 |   {
1033 |    "cell_type": "code",
1034 |    "execution_count": null,
1035 |    "id": "ff3402f1",
1036 |    "metadata": {},
1037 |    "outputs": [],
1038 |    "source": [
1039 |     "# the 
`with_options` method allows us to pass in pipeline settings\n",
1040 |     "# and returns a configured pipeline\n",
1041 |     "inference_configured = inference.with_options(**pipeline_settings)\n",
1042 |     "\n",
1043 |     "# Run the batch inference pipeline\n",
1044 |     "# We need to pass in the ID of the preprocessing done in the feature engineering pipeline\n",
1045 |     "# in order to avoid training-serving skew\n",
1046 |     "inference_configured(preprocess_pipeline_id=preprocessing_pipeline_artifact_version.id)"
1047 |    ]
1048 |   },
1049 |   {
1050 |    "cell_type": "markdown",
1051 |    "id": "2935d1fa",
1052 |    "metadata": {},
1053 |    "source": [
1054 |     "ZenML automatically links all artifacts to the `production` model version as well, including the predictions\n",
1055 |     "that were returned in the pipeline. This completes the MLOps loop of training to inference:"
1056 |    ]
1057 |   },
1058 |   {
1059 |    "cell_type": "code",
1060 |    "execution_count": null,
1061 |    "id": "e191d019",
1062 |    "metadata": {},
1063 |    "outputs": [],
1064 |    "source": [
1065 |     "# Fetch production model\n",
1066 |     "production_model_version = client.get_model_version(\n",
1067 |     "    \"breast_cancer_classifier\", \"production\"\n",
1068 |     ")\n",
1069 |     "\n",
1070 |     "# Get the predictions artifact\n",
1071 |     "production_model_version.get_artifact(\"predictions\").load()"
1072 |    ]
1073 |   },
1074 |   {
1075 |    "cell_type": "markdown",
1076 |    "id": "b0a73cdf",
1077 |    "metadata": {},
1078 |    "source": [
1079 |     "You can also see all predictions ever created as a complete history in the dashboard:\n",
1080 |     "\n",
1081 |     "![Model predictions history](.assets/cloud_mcp_predictions.png)"
1082 |    ]
1083 |   },
1084 |   {
1085 |    "cell_type": "markdown",
1086 |    "id": "594ee4fc-f102-4b99-bdc3-2f1670c87679",
1087 |    "metadata": {},
1088 |    "source": [
1089 |     "## Congratulations!\n",
1090 |     "\n",
1091 |     "You're a legit MLOps engineer now! You trained two models, evaluated them against\n",
1092 |     "a test set, registered the best one with the ZenML model control plane,\n",
1093 |     "and served some predictions. You also learned how to iterate on your models and\n",
1094 |     "data by using some of the ZenML utility abstractions. You saw how to view your\n",
1095 |     "artifacts and models via the client as well as the ZenML Dashboard.\n",
1096 |     "\n",
1097 |     "## Further exploration\n",
1098 |     "\n",
1099 |     "This was just the tip of the iceberg of what ZenML can do; check out the [**docs**](https://docs.zenml.io/) to learn more\n",
1100 |     "about the capabilities of ZenML. For example, you might want to:\n",
1101 |     "\n",
1102 |     "- [Deploy ZenML](https://docs.zenml.io/user-guides/production-guide/connect-deployed-zenml) to collaborate with your colleagues.\n",
1103 |     "- Run the same pipeline on a [cloud MLOps stack in production](https://docs.zenml.io/user-guides/production-guide/cloud-stack).\n",
1104 |     "- Track your metrics in an experiment tracker like [MLflow](https://docs.zenml.io/stacks-and-components/component-guide/experiment-trackers/mlflow).\n",
1105 |     "\n",
1106 |     "## What next?\n",
1107 |     "\n",
1108 |     "* If you have questions or feedback... join our [**Slack Community**](https://zenml.io/slack) and become part of the ZenML family!\n",
1109 |     "* If you want to quickly get started with ZenML, check out [ZenML Pro](https://zenml.io/pro)."
1110 | ] 1111 | } 1112 | ], 1113 | "metadata": { 1114 | "kernelspec": { 1115 | "display_name": "Python 3 (ipykernel)", 1116 | "language": "python", 1117 | "name": "python3" 1118 | }, 1119 | "language_info": { 1120 | "codemirror_mode": { 1121 | "name": "ipython", 1122 | "version": 3 1123 | }, 1124 | "file_extension": ".py", 1125 | "mimetype": "text/x-python", 1126 | "name": "python", 1127 | "nbconvert_exporter": "python", 1128 | "pygments_lexer": "ipython3", 1129 | "version": "3.11.3" 1130 | } 1131 | }, 1132 | "nbformat": 4, 1133 | "nbformat_minor": 5 1134 | } 1135 | -------------------------------------------------------------------------------- /template/requirements.txt: -------------------------------------------------------------------------------- 1 | zenml[server]>=0.50.0 2 | notebook 3 | scikit-learn 4 | pyarrow 5 | pandas 6 | -------------------------------------------------------------------------------- /template/run.py: -------------------------------------------------------------------------------- 1 | # {% include 'template/license_header' %} 2 | 3 | import os 4 | from typing import Optional 5 | 6 | import click 7 | import yaml 8 | from pipelines import ( 9 | feature_engineering, 10 | inference, 11 | training, 12 | ) 13 | from zenml.client import Client 14 | from zenml.logger import get_logger 15 | 16 | logger = get_logger(__name__) 17 | 18 | 19 | @click.command( 20 | help=""" 21 | ZenML Starter project. 22 | 23 | Run the ZenML starter project with basic options. 24 | 25 | Examples: 26 | 27 | \b 28 | # Run the feature engineering pipeline 29 | python run.py --feature-pipeline 30 | 31 | \b 32 | # Run the training pipeline 33 | python run.py --training-pipeline 34 | 35 | \b 36 | # Run the training pipeline with versioned artifacts 37 | python run.py --training-pipeline --train-dataset-version-name=1 --test-dataset-version-name=1 38 | 39 | \b 40 | # Run the inference pipeline 41 | python run.py --inference-pipeline 42 | 43 | """ 44 | ) 45 | @click.option( 46 | "--train-dataset-name", 47 | default="dataset_trn", 48 | type=click.STRING, 49 | help="The name of the train dataset produced by feature engineering.", 50 | ) 51 | @click.option( 52 | "--train-dataset-version-name", 53 | default=None, 54 | type=click.STRING, 55 | help="Version of the train dataset produced by feature engineering. " 56 | "If not specified, a new version will be created.", 57 | ) 58 | @click.option( 59 | "--test-dataset-name", 60 | default="dataset_tst", 61 | type=click.STRING, 62 | help="The name of the test dataset produced by feature engineering.", 63 | ) 64 | @click.option( 65 | "--test-dataset-version-name", 66 | default=None, 67 | type=click.STRING, 68 | help="Version of the test dataset produced by feature engineering. 
" 69 | "If not specified, a new version will be created.", 70 | ) 71 | @click.option( 72 | "--feature-pipeline", 73 | is_flag=True, 74 | default=False, 75 | help="Whether to run the pipeline that creates the dataset.", 76 | ) 77 | @click.option( 78 | "--training-pipeline", 79 | is_flag=True, 80 | default=False, 81 | help="Whether to run the pipeline that trains the model.", 82 | ) 83 | @click.option( 84 | "--inference-pipeline", 85 | is_flag=True, 86 | default=False, 87 | help="Whether to run the pipeline that performs inference.", 88 | ) 89 | @click.option( 90 | "--no-cache", 91 | is_flag=True, 92 | default=False, 93 | help="Disable caching for the pipeline run.", 94 | ) 95 | def main( 96 | train_dataset_name: str = "dataset_trn", 97 | train_dataset_version_name: Optional[str] = None, 98 | test_dataset_name: str = "dataset_tst", 99 | test_dataset_version_name: Optional[str] = None, 100 | feature_pipeline: bool = False, 101 | training_pipeline: bool = False, 102 | inference_pipeline: bool = False, 103 | no_cache: bool = False, 104 | ): 105 | """Main entry point for the pipeline execution. 106 | 107 | This entrypoint is where everything comes together: 108 | 109 | * configuring pipeline with the required parameters 110 | (some of which may come from command line arguments, but most 111 | of which comes from the YAML config files) 112 | * launching the pipeline 113 | 114 | Args: 115 | train_dataset_name: The name of the train dataset produced by feature engineering. 116 | train_dataset_version_name: Version of the train dataset produced by feature engineering. 117 | If not specified, a new version will be created. 118 | test_dataset_name: The name of the test dataset produced by feature engineering. 119 | test_dataset_version_name: Version of the test dataset produced by feature engineering. 120 | If not specified, a new version will be created. 121 | feature_pipeline: Whether to run the pipeline that creates the dataset. 122 | training_pipeline: Whether to run the pipeline that trains the model. 123 | inference_pipeline: Whether to run the pipeline that performs inference. 124 | no_cache: If `True` cache will be disabled. 125 | """ 126 | client = Client() 127 | 128 | config_folder = os.path.join( 129 | os.path.dirname(os.path.realpath(__file__)), 130 | "configs", 131 | ) 132 | 133 | # Execute Feature Engineering Pipeline 134 | if feature_pipeline: 135 | pipeline_args = {} 136 | if no_cache: 137 | pipeline_args["enable_cache"] = False 138 | pipeline_args["config_path"] = os.path.join( 139 | config_folder, "feature_engineering.yaml" 140 | ) 141 | run_args_feature = {} 142 | feature_engineering.with_options(**pipeline_args)(**run_args_feature) 143 | logger.info("Feature Engineering pipeline finished successfully!\n") 144 | 145 | train_dataset_artifact = client.get_artifact_version(train_dataset_name) 146 | test_dataset_artifact = client.get_artifact_version(test_dataset_name) 147 | logger.info( 148 | "The latest feature engineering pipeline produced the following " 149 | f"artifacts: \n\n1. Train Dataset - Name: {train_dataset_name}, " 150 | f"Version Name: {train_dataset_artifact.version} \n2. 
Test Dataset: " 151 | f"Name: {test_dataset_name}, Version Name: {test_dataset_artifact.version}" 152 | ) 153 | 154 | # Execute Training Pipeline 155 | if training_pipeline: 156 | run_args_train = {} 157 | 158 | # If train_dataset_version_name is specified, use versioned artifacts 159 | if train_dataset_version_name or test_dataset_version_name: 160 | # However, both train and test dataset versions must be specified 161 | assert ( 162 | train_dataset_version_name is not None 163 | and test_dataset_version_name is not None 164 | ) 165 | train_dataset_artifact_version = client.get_artifact_version( 166 | train_dataset_name, train_dataset_version_name 167 | ) 168 | # If train dataset is specified, test dataset must be specified 169 | test_dataset_artifact_version = client.get_artifact_version( 170 | test_dataset_name, test_dataset_version_name 171 | ) 172 | # Use versioned artifacts 173 | run_args_train["train_dataset_id"] = train_dataset_artifact_version.id 174 | run_args_train["test_dataset_id"] = test_dataset_artifact_version.id 175 | 176 | # Run the SGD pipeline 177 | pipeline_args = {} 178 | if no_cache: 179 | pipeline_args["enable_cache"] = False 180 | pipeline_args["config_path"] = os.path.join(config_folder, "training_sgd.yaml") 181 | training.with_options(**pipeline_args)(**run_args_train) 182 | logger.info("Training pipeline with SGD finished successfully!\n\n") 183 | 184 | # Run the RF pipeline 185 | pipeline_args = {} 186 | if no_cache: 187 | pipeline_args["enable_cache"] = False 188 | pipeline_args["config_path"] = os.path.join(config_folder, "training_rf.yaml") 189 | training.with_options(**pipeline_args)(**run_args_train) 190 | logger.info("Training pipeline with RF finished successfully!\n\n") 191 | 192 | if inference_pipeline: 193 | run_args_inference = {} 194 | pipeline_args = {"enable_cache": False} 195 | pipeline_args["config_path"] = os.path.join(config_folder, "inference.yaml") 196 | 197 | # Configure the pipeline 198 | inference_configured = inference.with_options(**pipeline_args) 199 | 200 | # Fetch the production model 201 | with open(pipeline_args["config_path"], "r") as f: 202 | config = yaml.load(f, Loader=yaml.SafeLoader) 203 | zenml_model = client.get_model_version( 204 | config["model"]["name"], config["model"]["version"] 205 | ) 206 | preprocess_pipeline_artifact = zenml_model.get_artifact("preprocess_pipeline") 207 | 208 | # Use the metadata of feature engineering pipeline artifact 209 | # to get the random state and target column 210 | random_state = preprocess_pipeline_artifact.run_metadata["random_state"] 211 | target = preprocess_pipeline_artifact.run_metadata["target"] 212 | run_args_inference["random_state"] = random_state 213 | run_args_inference["target"] = target 214 | 215 | # Run the pipeline 216 | inference_configured(**run_args_inference) 217 | logger.info("Inference pipeline finished successfully!") 218 | 219 | 220 | if __name__ == "__main__": 221 | main() 222 | -------------------------------------------------------------------------------- /template/steps/__init__.py: -------------------------------------------------------------------------------- 1 | # {% include 'template/license_header' %} 2 | 3 | from .data_loader import ( 4 | data_loader, 5 | ) 6 | from .data_preprocessor import ( 7 | data_preprocessor, 8 | ) 9 | from .data_splitter import ( 10 | data_splitter, 11 | ) 12 | from .inference_predict import ( 13 | inference_predict, 14 | ) 15 | from .inference_preprocessor import ( 16 | inference_preprocessor, 17 | ) 18 | from .model_evaluator 
import (
19 |     model_evaluator,
20 | )
21 | from .model_promoter import (
22 |     model_promoter,
23 | )
24 | from .model_trainer import (
25 |     model_trainer,
26 | )
27 | 
--------------------------------------------------------------------------------
/template/steps/data_loader.py:
--------------------------------------------------------------------------------
1 | # {% include 'template/license_header' %}
2 | 
3 | import pandas as pd
4 | from sklearn.datasets import load_breast_cancer
5 | from typing_extensions import Annotated
6 | from zenml import step
7 | from zenml.logger import get_logger
8 | 
9 | logger = get_logger(__name__)
10 | 
11 | 
12 | @step
13 | def data_loader(
14 |     random_state: int, is_inference: bool = False, target: str = "target"
15 | ) -> Annotated[pd.DataFrame, "dataset"]:
16 |     """Dataset reader step.
17 | 
18 |     This is an example of a dataset reader step that loads the Breast Cancer dataset.
19 | 
20 |     This step is parameterized, which allows you to configure the step
21 |     independently of the step code, before running it in a pipeline.
22 |     In this example, the step can be configured to return either the training subset
23 |     or a small inference subset with the target column removed. See the documentation for more information:
24 | 
25 |     https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters
26 | 
27 |     Args:
28 |         random_state: Random state for sampling.
29 |         is_inference: If `True` subset will be returned and target column
30 |             will be removed from dataset.
31 |         target: Name of the target column in the dataset.
32 | 
33 |     Returns:
34 |         The dataset artifact as a Pandas DataFrame.
35 |     """
36 |     dataset = load_breast_cancer(as_frame=True)
37 |     inference_size = int(len(dataset.target) * 0.05)
38 |     dataset: pd.DataFrame = dataset.frame
39 |     inference_subset = dataset.sample(inference_size, random_state=random_state)
40 |     if is_inference:
41 |         dataset = inference_subset
42 |         dataset.drop(columns=target, inplace=True)
43 |     else:
44 |         dataset.drop(inference_subset.index, inplace=True)
45 |     dataset.reset_index(drop=True, inplace=True)
46 |     logger.info(f"Dataset with {len(dataset)} records loaded!")
47 |     return dataset
48 | 
--------------------------------------------------------------------------------
/template/steps/data_preprocessor.py:
--------------------------------------------------------------------------------
1 | # {% include 'template/license_header' %}
2 | 
3 | from typing import List, Optional, Tuple
4 | 
5 | import pandas as pd
6 | from sklearn.pipeline import Pipeline
7 | from sklearn.preprocessing import MinMaxScaler
8 | from typing_extensions import Annotated
9 | from utils.preprocess import ColumnsDropper, DataFrameCaster, NADropper
10 | from zenml import log_metadata, step
11 | 
12 | 
13 | @step
14 | def data_preprocessor(
15 |     random_state: int,
16 |     dataset_trn: pd.DataFrame,
17 |     dataset_tst: pd.DataFrame,
18 |     drop_na: Optional[bool] = None,
19 |     normalize: Optional[bool] = None,
20 |     drop_columns: Optional[List[str]] = None,
21 |     target: Optional[str] = "target",
22 | ) -> Tuple[
23 |     Annotated[pd.DataFrame, "dataset_trn"],
24 |     Annotated[pd.DataFrame, "dataset_tst"],
25 |     Annotated[Pipeline, "preprocess_pipeline"],
26 | ]:
27 |     """Data preprocessor step.
28 | 
29 |     This is an example of a data processor step that prepares the data so that
30 |     it is suitable for model training. It takes in a dataset as an input step
31 |     artifact and performs any necessary preprocessing steps like cleaning,
32 |     feature engineering, feature selection, etc. 
It then returns the processed 33 | dataset as a step output artifact. 34 | 35 | This step is parameterized, which allows you to configure the step 36 | independently of the step code, before running it in a pipeline. 37 | In this example, the step can be configured to drop NA values, drop some 38 | columns and normalize numerical columns. See the documentation for more 39 | information: 40 | 41 | https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters 42 | 43 | Args: 44 | random_state: Random state for sampling. 45 | dataset_trn: The train dataset. 46 | dataset_tst: The test dataset. 47 | drop_na: If `True` all NA rows will be dropped. 48 | normalize: If `True` all numeric fields will be normalized. 49 | drop_columns: List of column names to drop. 50 | target: Name of target column in dataset. 51 | 52 | Returns: 53 | The processed datasets (dataset_trn, dataset_tst) and fitted `Pipeline` object. 54 | """ 55 | # We use the sklearn pipeline to chain together multiple preprocessing steps 56 | preprocess_pipeline = Pipeline([("passthrough", "passthrough")]) 57 | if drop_na: 58 | preprocess_pipeline.steps.append(("drop_na", NADropper())) 59 | if drop_columns: 60 | # Drop columns 61 | preprocess_pipeline.steps.append(("drop_columns", ColumnsDropper(drop_columns))) 62 | if normalize: 63 | # Normalize the data 64 | preprocess_pipeline.steps.append(("normalize", MinMaxScaler())) 65 | preprocess_pipeline.steps.append(("cast", DataFrameCaster(dataset_trn.columns))) 66 | dataset_trn = preprocess_pipeline.fit_transform(dataset_trn) 67 | dataset_tst = preprocess_pipeline.transform(dataset_tst) 68 | 69 | # Log metadata so we can load it in the inference pipeline 70 | log_metadata( 71 | metadata={"random_state": random_state, "target": target}, 72 | artifact_name="preprocess_pipeline", 73 | infer_artifact=True, 74 | ) 75 | return dataset_trn, dataset_tst, preprocess_pipeline 76 | -------------------------------------------------------------------------------- /template/steps/data_splitter.py: -------------------------------------------------------------------------------- 1 | # {% include 'template/license_header' %} 2 | 3 | from typing import Tuple 4 | 5 | import pandas as pd 6 | from sklearn.model_selection import train_test_split 7 | from typing_extensions import Annotated 8 | from zenml import step 9 | 10 | 11 | @step 12 | def data_splitter( 13 | dataset: pd.DataFrame, test_size: float = 0.2 14 | ) -> Tuple[ 15 | Annotated[pd.DataFrame, "raw_dataset_trn"], 16 | Annotated[pd.DataFrame, "raw_dataset_tst"], 17 | ]: 18 | """Dataset splitter step. 19 | 20 | This is an example of a dataset splitter step that splits the data 21 | into train and test set before passing it to ML model. 22 | 23 | This step is parameterized, which allows you to configure the step 24 | independently of the step code, before running it in a pipeline. 25 | In this example, the step can be configured to use different test 26 | set sizes. See the documentation for more information: 27 | 28 | https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters 29 | 30 | Args: 31 | dataset: Dataset read from source. 32 | test_size: 0.0..1.0 defining portion of test set. 33 | 34 | Returns: 35 | The split dataset: dataset_trn, dataset_tst. 
36 | """ 37 | dataset_trn, dataset_tst = train_test_split( 38 | dataset, 39 | test_size=test_size, 40 | random_state=42, 41 | shuffle=True, 42 | ) 43 | dataset_trn = pd.DataFrame(dataset_trn, columns=dataset.columns) 44 | dataset_tst = pd.DataFrame(dataset_tst, columns=dataset.columns) 45 | return dataset_trn, dataset_tst 46 | -------------------------------------------------------------------------------- /template/steps/inference_predict.py: -------------------------------------------------------------------------------- 1 | # Apache Software License 2.0 2 | # 3 | # Copyright (c) ZenML GmbH 2023. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from typing import Any 19 | 20 | import pandas as pd 21 | from typing_extensions import Annotated 22 | from zenml import step 23 | from zenml.logger import get_logger 24 | 25 | logger = get_logger(__name__) 26 | 27 | 28 | @step 29 | def inference_predict( 30 | model: Any, 31 | dataset_inf: pd.DataFrame, 32 | ) -> Annotated[pd.Series, "predictions"]: 33 | """Predictions step. 34 | 35 | This is an example of a predictions step that takes the data and model in 36 | and returns predicted values. 37 | 38 | This step is parameterized, which allows you to configure the step 39 | independently of the step code, before running it in a pipeline. 40 | In this example, the step can be configured to use different input data. 41 | See the documentation for more information: 42 | 43 | https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters 44 | 45 | Args: 46 | model: Trained model. 47 | dataset_inf: The inference dataset. 48 | 49 | Returns: 50 | The predictions as pandas series 51 | """ 52 | # run prediction from memory 53 | predictions = model.predict(dataset_inf) 54 | 55 | predictions = pd.Series(predictions, name="predicted") 56 | return predictions 57 | -------------------------------------------------------------------------------- /template/steps/inference_preprocessor.py: -------------------------------------------------------------------------------- 1 | # Apache Software License 2.0 2 | # 3 | # Copyright (c) ZenML GmbH 2023. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | import pandas as pd 19 | from sklearn.pipeline import Pipeline 20 | from typing_extensions import Annotated 21 | from zenml import step 22 | 23 | 24 | @step 25 | def inference_preprocessor( 26 | dataset_inf: pd.DataFrame, 27 | preprocess_pipeline: Pipeline, 28 | target: str, 29 | ) -> Annotated[pd.DataFrame, "inference_dataset"]: 30 | """Data preprocessor step. 31 | 32 | This is an example of a data processor step that prepares the data so that 33 | it is suitable for model inference. It takes in a dataset as an input step 34 | artifact and performs any necessary preprocessing steps based on pretrained 35 | preprocessing pipeline. 36 | 37 | Args: 38 | dataset_inf: The inference dataset. 39 | preprocess_pipeline: Pretrained `Pipeline` to process dataset. 40 | target: Name of target columns in dataset. 41 | 42 | Returns: 43 | The processed dataframe: dataset_inf. 44 | """ 45 | # artificially adding `target` column to avoid Pipeline issues 46 | dataset_inf[target] = pd.Series([1] * dataset_inf.shape[0]) 47 | dataset_inf = preprocess_pipeline.transform(dataset_inf) 48 | dataset_inf.drop(columns=[target], inplace=True) 49 | return dataset_inf 50 | -------------------------------------------------------------------------------- /template/steps/model_evaluator.py: -------------------------------------------------------------------------------- 1 | # {% include 'template/license_header' %} 2 | 3 | from typing import Optional 4 | 5 | import pandas as pd 6 | from sklearn.base import ClassifierMixin 7 | 8 | from zenml import log_metadata, step 9 | from zenml.client import Client 10 | from zenml.logger import get_logger 11 | 12 | logger = get_logger(__name__) 13 | 14 | 15 | @step 16 | def model_evaluator( 17 | model: ClassifierMixin, 18 | dataset_trn: pd.DataFrame, 19 | dataset_tst: pd.DataFrame, 20 | min_train_accuracy: float = 0.0, 21 | min_test_accuracy: float = 0.0, 22 | target: Optional[str] = "target", 23 | ) -> float: 24 | """Evaluate a trained model. 25 | 26 | This is an example of a model evaluation step that takes in a model artifact 27 | previously trained by another step in your pipeline, and a training 28 | and validation data set pair which it uses to evaluate the model's 29 | performance. The model metrics are then returned as step output artifacts 30 | (in this case, the model accuracy on the train and test set). 31 | 32 | The suggested step implementation also outputs some warnings if the model 33 | performance does not meet some minimum criteria. This is just an example of 34 | how you can use steps to monitor your model performance and alert you if 35 | something goes wrong. As an alternative, you can raise an exception in the 36 | step to force the pipeline run to fail early and all subsequent steps to 37 | be skipped. 38 | 39 | This step is parameterized to configure the step independently of the step code, 40 | before running it in a pipeline. In this example, the step can be configured 41 | to use different values for the acceptable model performance thresholds and 42 | to control whether the pipeline run should fail if the model performance 43 | does not meet the minimum criteria. See the documentation for more 44 | information: 45 | 46 | https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters 47 | 48 | Args: 49 | model: The pre-trained model artifact. 50 | dataset_trn: The train dataset. 51 | dataset_tst: The test dataset. 52 | min_train_accuracy: Minimal acceptable training accuracy value. 
--------------------------------------------------------------------------------
/template/steps/model_evaluator.py:
--------------------------------------------------------------------------------
1 | # {% include 'template/license_header' %}
2 | 
3 | from typing import Optional
4 | 
5 | import pandas as pd
6 | from sklearn.base import ClassifierMixin
7 | 
8 | from zenml import log_metadata, step
9 | from zenml.client import Client
10 | from zenml.logger import get_logger
11 | 
12 | logger = get_logger(__name__)
13 | 
14 | 
15 | @step
16 | def model_evaluator(
17 |     model: ClassifierMixin,
18 |     dataset_trn: pd.DataFrame,
19 |     dataset_tst: pd.DataFrame,
20 |     min_train_accuracy: float = 0.0,
21 |     min_test_accuracy: float = 0.0,
22 |     target: Optional[str] = "target",
23 | ) -> float:
24 |     """Evaluate a trained model.
25 | 
26 |     This is an example of a model evaluation step that takes in a model artifact
27 |     previously trained by another step in your pipeline, and a training
28 |     and test data set pair which it uses to evaluate the model's
29 |     performance. The model accuracy on the test set is returned as a step
30 |     output artifact, and both train and test accuracies are attached as
31 |     metadata to the trained classifier artifact version.
32 | 
33 |     The suggested step implementation also outputs some warnings if the model
34 |     performance does not meet some minimum criteria. This is just an example of
35 |     how you can use steps to monitor your model performance and alert you if
36 |     something goes wrong. As an alternative, you can raise an exception in the
37 |     step to force the pipeline run to fail early and all subsequent steps to
38 |     be skipped.
39 | 
40 |     This step is parameterized to configure the step independently of the step code,
41 |     before running it in a pipeline. In this example, the step can be configured
42 |     to use different values for the acceptable model performance thresholds and
43 |     to control whether the pipeline run should fail if the model performance
44 |     does not meet the minimum criteria. See the documentation for more
45 |     information:
46 | 
47 |     https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters
48 | 
49 |     Args:
50 |         model: The pre-trained model artifact.
51 |         dataset_trn: The train dataset.
52 |         dataset_tst: The test dataset.
53 |         min_train_accuracy: Minimal acceptable training accuracy value.
54 |         min_test_accuracy: Minimal acceptable testing accuracy value.
55 |         target: Name of the target column in the dataset.
56 | 
57 |     Returns:
58 |         The model accuracy on the test set.
59 |     """
60 |     # Calculate the model accuracy on the train and test set
61 |     trn_acc = model.score(
62 |         dataset_trn.drop(columns=[target]),
63 |         dataset_trn[target],
64 |     )
65 |     tst_acc = model.score(
66 |         dataset_tst.drop(columns=[target]),
67 |         dataset_tst[target],
68 |     )
69 |     logger.info(f"Train accuracy={trn_acc * 100:.2f}%")
70 |     logger.info(f"Test accuracy={tst_acc * 100:.2f}%")
71 | 
72 |     # Collect a warning for every threshold that is not met and log them all
73 |     messages = []
74 |     if trn_acc < min_train_accuracy:
75 |         messages.append(
76 |             f"Train accuracy {trn_acc * 100:.2f}% is below {min_train_accuracy * 100:.2f}%!"
77 |         )
78 |     if tst_acc < min_test_accuracy:
79 |         messages.append(
80 |             f"Test accuracy {tst_acc * 100:.2f}% is below {min_test_accuracy * 100:.2f}%!"
81 |         )
82 |     for message in messages:
83 |         logger.warning(message)
84 | 
85 |     # Attach both accuracies as metadata to the classifier artifact version
86 |     client = Client()
87 |     latest_classifier = client.get_artifact_version("sklearn_classifier")
88 | 
89 |     log_metadata(
90 |         metadata={
91 |             "train_accuracy": float(trn_acc),
92 |             "test_accuracy": float(tst_acc),
93 |         },
94 |         artifact_version_id=latest_classifier.id,
95 |     )
96 | 
97 |     return float(tst_acc)
98 | 
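The metadata logged here is exactly what `model_promoter` (below) reads back when it compares a freshly trained model against the one already in a stage. A hedged sketch of that read path, assuming the model name this template registers ("breast_cancer_classifier", the name the test suite also deletes) and that a version is already in the "production" stage:

from zenml.client import Client

client = Client()
# Model name assumed from this template; adjust if you renamed it.
stage_model = client.get_model_version("breast_cancer_classifier", "production")
prod_accuracy = (
    stage_model.get_artifact("sklearn_classifier")
    .run_metadata["test_accuracy"]
)
print(f"Production test accuracy: {prod_accuracy}")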
--------------------------------------------------------------------------------
/template/steps/model_promoter.py:
--------------------------------------------------------------------------------
1 | # {% include 'template/license_header' %}
2 | 
3 | from zenml import get_step_context, step
4 | from zenml.client import Client
5 | from zenml.logger import get_logger
6 | 
7 | logger = get_logger(__name__)
8 | 
9 | 
10 | @step
11 | def model_promoter(accuracy: float, stage: str = "production") -> bool:
12 |     """Model promoter step.
13 | 
14 |     This is an example of a step that conditionally promotes a model. It takes
15 |     in the accuracy of the model and the stage to promote the model to. If the
16 |     accuracy is below 80%, the model is not promoted. If it is above 80%, the
17 |     model is promoted to the stage indicated in the parameters. If there is
18 |     already a model in the indicated stage, the model with the higher accuracy
19 |     is promoted.
20 | 
21 |     Args:
22 |         accuracy: Accuracy of the model.
23 |         stage: Which stage to promote the model to.
24 | 
25 |     Returns:
26 |         Whether the model was promoted or not.
27 |     """
28 |     is_promoted = False
29 | 
30 |     if accuracy < 0.8:
31 |         logger.info(
32 |             f"Model accuracy {accuracy * 100:.2f}% is below 80%! Not promoting model."
33 |         )
34 |     else:
35 |         # Get the model in the current context
36 |         current_model = get_step_context().model
37 | 
38 |         # Get the model that is currently in the target stage, if any
39 |         client = Client()
40 |         try:
41 |             stage_model = client.get_model_version(
42 |                 current_model.name, stage
43 |             )
44 |             # Compare the current model's accuracy against the staged model's
45 |             prod_accuracy = (
46 |                 stage_model.get_artifact("sklearn_classifier")
47 |                 .run_metadata["test_accuracy"]
48 |             )
49 |             # Promote only if the current model scores strictly better
50 |             is_promoted = float(accuracy) > float(prod_accuracy)
51 |         except KeyError:
52 |             # No model in the target stage yet, so promote the current one
53 |             is_promoted = True
54 | 
55 |         if is_promoted:
56 |             current_model.set_stage(stage, force=True)
57 |             logger.info(f"Model promoted to {stage}!")
58 | 
59 |     return is_promoted
60 | 
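The promotion rule itself is easy to state in isolation. A small pure-Python distillation of the decision (no ZenML involved), which can be handy for unit-testing the logic separately from the step:

from typing import Optional

def should_promote(
    accuracy: float,
    prod_accuracy: Optional[float],
    floor: float = 0.8,
) -> bool:
    """Decide promotion: beat the hard floor, then beat the incumbent, if any."""
    if accuracy < floor:
        return False  # below the hard 80% floor: never promote
    if prod_accuracy is None:
        return True  # nothing in the target stage yet: promote
    return accuracy > prod_accuracy  # promote only on a strict improvement

assert should_promote(0.85, None) is True
assert should_promote(0.85, 0.90) is False
assert should_promote(0.75, 0.70) is False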
--------------------------------------------------------------------------------
/template/steps/model_trainer.py:
--------------------------------------------------------------------------------
1 | # {% include 'template/license_header' %}
2 | 
3 | from typing import Optional
4 | 
5 | import pandas as pd
6 | from sklearn.base import ClassifierMixin
7 | from sklearn.ensemble import RandomForestClassifier
8 | from sklearn.linear_model import SGDClassifier
9 | from typing_extensions import Annotated
10 | from zenml import ArtifactConfig, step
11 | from zenml.logger import get_logger
12 | 
13 | logger = get_logger(__name__)
14 | 
15 | 
16 | @step
17 | def model_trainer(
18 |     dataset_trn: pd.DataFrame,
19 |     model_type: str = "sgd",
20 |     target: Optional[str] = "target",
21 | ) -> Annotated[
22 |     ClassifierMixin, ArtifactConfig(name="sklearn_classifier", is_model_artifact=True)
23 | ]:
24 |     """Configure and train a model on the training dataset.
25 | 
26 |     This is an example of a model training step that takes in a dataset artifact
27 |     previously loaded and pre-processed by other steps in your pipeline, then
28 |     configures and trains a model on it. The model is then returned as a step
29 |     output artifact.
30 | 
31 |     Args:
32 |         dataset_trn: The preprocessed train dataset.
33 |         model_type: The type of model to train.
34 |         target: The name of the target column in the dataset.
35 | 
36 |     Returns:
37 |         The trained model artifact.
38 | 
39 |     Raises:
40 |         ValueError: If the model type is not supported.
41 |     """
42 |     # Initialize the model with the hyperparameters indicated in the step
43 |     # parameters and train it on the training set.
44 |     if model_type == "sgd":
45 |         model = SGDClassifier()
46 |     elif model_type == "rf":
47 |         model = RandomForestClassifier()
48 |     else:
49 |         raise ValueError(f"Unknown model type {model_type}")
50 |     logger.info(f"Training model {model}...")
51 | 
52 |     model.fit(
53 |         dataset_trn.drop(columns=[target]),
54 |         dataset_trn[target],
55 |     )
56 |     return model
57 | 
--------------------------------------------------------------------------------
/template/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # {% include 'template/license_header' %}
2 | 
--------------------------------------------------------------------------------
/template/utils/preprocess.py:
--------------------------------------------------------------------------------
1 | # {% include 'template/license_header' %}
2 | 
3 | from typing import Union
4 | 
5 | import pandas as pd
6 | 
7 | 
8 | class NADropper:
9 |     """Support class to drop NA values in sklearn Pipeline."""
10 | 
11 |     def fit(self, *args, **kwargs):
12 |         return self
13 | 
14 |     def transform(self, X: Union[pd.DataFrame, pd.Series]):
15 |         return X.dropna()
16 | 
17 | 
18 | class ColumnsDropper:
19 |     """Support class to drop specific columns in sklearn Pipeline."""
20 | 
21 |     def __init__(self, columns):
22 |         self.columns = columns
23 | 
24 |     def fit(self, *args, **kwargs):
25 |         return self
26 | 
27 |     def transform(self, X: Union[pd.DataFrame, pd.Series]):
28 |         return X.drop(columns=self.columns)
29 | 
30 | 
31 | class DataFrameCaster:
32 |     """Support class to cast type back to pd.DataFrame in sklearn Pipeline."""
33 | 
34 |     def __init__(self, columns):
35 |         self.columns = columns
36 | 
37 |     def fit(self, *args, **kwargs):
38 |         return self
39 | 
40 |     def transform(self, X):
41 |         return pd.DataFrame(X, columns=self.columns)
42 | 
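These helpers are plain duck-typed transformers (they only need `fit` and `transform`), so they compose directly into an sklearn `Pipeline`. A hedged sketch of how such a preprocessing pipeline could be assembled; the `MinMaxScaler` step and the data are illustrative additions, and the import path assumes a generated project where this module lands at `utils/preprocess.py`:

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from utils.preprocess import ColumnsDropper, DataFrameCaster, NADropper  # assumed path

raw = pd.DataFrame(
    {"id": [1, 2, 3], "x": [0.1, None, 0.9], "target": [0, 1, 1]}
)
pipe = Pipeline(
    [
        ("drop_na", NADropper()),                       # drop the row with a missing x
        ("drop_id", ColumnsDropper(columns=["id"])),    # remove the identifier column
        ("scale", MinMaxScaler()),                      # returns a bare ndarray...
        ("cast", DataFrameCaster(columns=["x", "target"])),  # ...so cast it back
    ]
)
processed = pipe.fit(raw).transform(raw)
print(processed)

Note how `DataFrameCaster` earns its keep: scikit-learn transformers like `MinMaxScaler` emit NumPy arrays, and casting back to a DataFrame preserves column names for the steps downstream.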
--------------------------------------------------------------------------------
/template/{% if open_source_license %}LICENSE{% endif %}:
--------------------------------------------------------------------------------
1 | {% include 'template/license' %}
--------------------------------------------------------------------------------
/template/{{ _copier_conf.answers_file }}:
--------------------------------------------------------------------------------
1 | # Changes here will be overwritten by Copier
2 | {{ _copier_answers|to_nice_yaml -}}
--------------------------------------------------------------------------------
/test-requirements.txt:
--------------------------------------------------------------------------------
1 | autopep8
2 | pytest
3 | pytest-randomly
4 | ruff
5 | black
6 | isort
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) ZenML GmbH 2023. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at:
6 | #
7 | #     https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12 | # or implied. See the License for the specific language governing
13 | # permissions and limitations under the License.
14 | 
15 | 
16 | import contextlib
17 | import os
18 | import shutil
19 | import sys
20 | from typing import Generator
21 | 
22 | import pytest
23 | from zenml.client import Client
24 | from zenml.config.global_config import GlobalConfiguration
25 | from zenml.constants import ENV_ZENML_CONFIG_PATH
26 | from zenml.enums import StackComponentType
27 | 
28 | 
29 | def configure_stack():
30 |     """Create and activate the ZenML stack requested via `ZENML_STACK_NAME`."""
31 |     stack_name = os.environ.get("ZENML_STACK_NAME", "local")
32 |     zenml_client = Client()
33 | 
34 |     if stack_name == "local":
35 |         components = {}
36 |         for component in [
37 |             ("local", "local", StackComponentType.ORCHESTRATOR),
38 |             ("local", "local", StackComponentType.ARTIFACT_STORE),
39 |         ]:
40 |             zenml_client.create_stack_component(*component, {})
41 |             components[component[2]] = component[0]
42 |         zenml_client.create_stack("local", components=components)
43 |         zenml_client.activate_stack("local")
44 |     else:
45 |         raise RuntimeError(f"Stack {stack_name} not supported")
46 | 
47 | 
48 | @pytest.fixture(scope="module")
49 | def clean_zenml_client(
50 |     tmp_path_factory: pytest.TempPathFactory,
51 | ) -> Generator[Client, None, None]:
52 |     """Fixture to initialize and use a clean local default ZenML client.
53 | 
54 |     This fixture creates a clean ZenML client with its own global
55 |     configuration and local database.
56 | 
57 |     Args:
58 |         tmp_path_factory: A pytest fixture that provides a temporary directory.
59 | 
60 |     Yields:
61 |         A clean ZenML client.
62 |     """
63 |     # save the current global configuration and client singleton instances
64 |     # to restore them later, then reset them
65 |     orig_cwd = os.getcwd()
66 |     original_config = GlobalConfiguration.get_instance()
67 |     original_client = Client.get_instance()
68 |     orig_config_path = os.getenv("ZENML_CONFIG_PATH")
69 | 
70 |     GlobalConfiguration._reset_instance()
71 |     Client._reset_instance()
72 | 
73 |     # change the working directory to a fresh temp path
74 |     tmp_path = tmp_path_factory.mktemp("pytest-clean-client")
75 |     os.chdir(tmp_path)
76 | 
77 |     os.environ[ENV_ZENML_CONFIG_PATH] = str(tmp_path / "zenml")
78 |     os.environ["ZENML_ANALYTICS_OPT_IN"] = "false"
79 | 
80 |     # initialize the global config client and store at the new path
81 |     gc = GlobalConfiguration()
82 |     gc.analytics_opt_in = False
83 |     client = Client()
84 |     _ = client.zen_store
85 | 
86 |     # prepare stack configuration
87 |     configure_stack()
88 | 
89 |     yield client
90 | 
91 |     # restore the global configuration path
92 |     if orig_config_path:
93 |         os.environ[ENV_ZENML_CONFIG_PATH] = orig_config_path
94 |     else:
95 |         del os.environ[ENV_ZENML_CONFIG_PATH]
96 | 
97 |     # restore the global configuration and the client
98 |     GlobalConfiguration._reset_instance(original_config)
99 |     Client._reset_instance(original_client)
100 | 
101 |     # remove all traces, and change working directory back to base path
102 |     os.chdir(orig_cwd)
103 |     if sys.platform == "win32":
104 |         # Windows file locks can make cleanup flaky, so ignore failures here
105 |         with contextlib.suppress(Exception):
106 |             shutil.rmtree(str(tmp_path))
107 |     else:
108 |         shutil.rmtree(str(tmp_path))
109 | 
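A hedged sketch of how a test consumes this fixture: pytest injects it by parameter name, and `scope="module"` means each test module gets one fresh client. The `active_stack_model` property is assumed from recent ZenML client versions; the stack name comes from `configure_stack` above:

def test_runs_on_clean_client(clean_zenml_client):
    # The injected client points at a throwaway ZENML_CONFIG_PATH with the
    # "local" stack already created and activated by configure_stack()
    assert clean_zenml_client.active_stack_model.name == "local"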
--------------------------------------------------------------------------------
/tests/test_starter_template.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) ZenML GmbH 2023. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at:
6 | #
7 | #     https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12 | # or implied. See the License for the specific language governing
13 | # permissions and limitations under the License.
14 | 
15 | 
16 | import os
17 | import pathlib
18 | import shutil
19 | import subprocess
20 | import sys
21 | from typing import Optional
22 | 
23 | import pytest
24 | from copier import Worker
25 | from zenml.client import Client
26 | from zenml.enums import ExecutionStatus
27 | 
28 | TEMPLATE_DIRECTORY = str(pathlib.Path(__file__).parent.parent)
29 | 
30 | 
31 | def generate_and_run_project(
32 |     tmp_path_factory: pytest.TempPathFactory,
33 |     open_source_license: Optional[str] = "apache",
34 |     product_name: str = "starter_project",
35 | ):
36 |     """Generate and run the starter project with different options."""
37 |     answers = {
38 |         "project_name": "Pytest Templated Project",
39 |         "version": "0.0.1",
40 |         "open_source_license": str(open_source_license).lower(),
41 |         "product_name": product_name,
42 |     }
43 |     if open_source_license:
44 |         answers["email"] = "pytest@zenml.io"
45 |         answers["full_name"] = "Pytest"
46 | 
47 |     # generate the template in a temp path
48 |     current_dir = os.getcwd()
49 |     dst_path = tmp_path_factory.mktemp("pytest-template")
50 |     os.chdir(str(dst_path))
51 |     with Worker(
52 |         src_path=TEMPLATE_DIRECTORY,
53 |         dst_path=str(dst_path),
54 |         data=answers,
55 |         unsafe=True,
56 |         vcs_ref="HEAD",
57 |     ) as worker:
58 |         worker.run_copy()
59 | 
60 |     # run the project
61 |     call = [
62 |         sys.executable,
63 |         "run.py",
64 |         "--training-pipeline",
65 |         "--feature-pipeline",
66 |         "--inference-pipeline",
67 |         "--no-cache",
68 |     ]
69 | 
70 |     try:
71 |         subprocess.check_output(
72 |             call,
73 |             cwd=str(dst_path),
74 |             env=os.environ.copy(),
75 |             stderr=subprocess.STDOUT,
76 |         )
77 |     except subprocess.CalledProcessError as e:
78 |         raise RuntimeError(
79 |             f"Failed to run project generated with parameters: {answers}\n"
80 |             f"{e.output.decode()}"
81 |         ) from e
82 | 
83 |     # check that each pipeline run completed successfully
84 |     for pipeline_name, run_count in [
85 |         ("training", 2),
86 |         ("inference", 1),
87 |         ("feature_engineering", 1),
88 |     ]:
89 |         pipeline = Client().get_pipeline(pipeline_name)
90 |         assert pipeline
91 |         runs = pipeline.runs
92 |         assert len(runs) == run_count
93 |         assert runs[0].status == ExecutionStatus.COMPLETED
94 | 
95 |         # clean up the pipeline after checking it
96 |         Client().delete_pipeline(pipeline_name)
97 |     Client().delete_model("breast_cancer_classifier")
98 | 
99 |     os.chdir(current_dir)
100 |     shutil.rmtree(dst_path)
101 | 
102 | 
103 | @pytest.mark.parametrize("open_source_license", ["mit", None], ids=["oss", "css"])
104 | def test_generate_license(
105 |     clean_zenml_client,
106 |     tmp_path_factory: pytest.TempPathFactory,
107 |     open_source_license: Optional[str],
108 | ):
109 |     """Test generating licenses."""
110 |     generate_and_run_project(
111 |         tmp_path_factory=tmp_path_factory,
112 |         open_source_license=open_source_license,
113 |     )
114 | 
115 | 
product_name="custom_product_name", 128 | ) 129 | --------------------------------------------------------------------------------