├── .github └── workflows │ └── mlrun.yml ├── LICENSE ├── README.md ├── docs ├── flow.png ├── mlrun.png ├── pipeline.png ├── pr.png ├── slack.png └── use-this.png ├── gitops_project.ipynb ├── project.yaml └── workflow.py /.github/workflows/mlrun.yml: -------------------------------------------------------------------------------- 1 | name: mlrun-project-workflow 2 | on: [issue_comment] 3 | 4 | jobs: 5 | submit-project: 6 | if: github.event.issue.pull_request != null && startsWith(github.event.comment.body, '/run') 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v2 11 | - name: Set up Python 3.6 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: '3.6' 15 | architecture: 'x64' 16 | 17 | - name: Install mlrun 18 | run: python -m pip install mlrun 19 | - name: Submit project 20 | run: python -m mlrun project ./ --git-issue "${{github.event.issue.number}}" --git-repo ${GITHUB_REPOSITORY} -w -x commit=${COMMIT:33} -r main ${CMD:5} 21 | env: 22 | V3IO_USERNAME: ${{ secrets.V3IO_USERNAME }} 23 | V3IO_PASSWORD: ${{ secrets.V3IO_PASSWORD }} 24 | V3IO_API: ${{ secrets.V3IO_API }} 25 | V3IO_ACCESS_KEY: ${{ secrets.V3IO_ACCESS_KEY }} 26 | MLRUN_DBPATH: ${{ secrets.MLRUN_DBPATH }} 27 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 28 | SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} 29 | CMD: ${{ github.event.comment.body}} 30 | COMMIT: ${{ github.sha}} 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ML Pipeline Automation and CI/CD Using GitHub Actions, Kubeflow and MLRun 2 | 3 | Machine learning (ML) pipelines allow us to automate multi-stage workflow which comprise of 4 | data ingestion, data preparation, model training, validation and finally deployment. 5 | 6 | Every time our code, data or parameters change we may want to re-evaluate our model accuracy and performance before we deploy. 7 | This resembles the CI/CD practice for delivering code to production with the additional aspects of data and parameter/configuration versioning, 8 | and may require more powerful resources (computation cluster, GPUs, data processing engines, etc.). 
9 | 10 | This template repo demonstrates how you can automate the development, testing, and deployment 11 | of machine learning projects using the following tools: 12 | 13 | * [**GitHub actions**](https://github.com/features/actions) - used for code and metadata versioning, workflow triggering, and process tracking 14 | * [**Kubeflow Pipelines**](https://www.kubeflow.org/docs/pipelines/overview/pipelines-overview/) - Used to execute ML pipeline steps on a (remote) Kubernetes cluster 15 | * [**MLRun**](https://github.com/mlrun/mlrun) - Used for end to end MLOps automation and tracking, [read more below](#mlrun-overview). 16 | 17 | To clone and run with your own environment or on [**iguazio data science platform**](https://www.iguazio.com/), check the [**instructions below**](#how-to-run-with-your-cluster). 18 | 19 | ## How Does It Work? 20 | 21 | This repo represents an **mlrun project**; mlrun projects consist of **Functions** (code), **Artifacts** (data), **Workflows**, and **Parameters/secrets**. 22 | The [**project.yaml**](project.yaml) file lists all of those elements. 23 | 24 | Project elements can be linked (e.g. point to a library function which runs AutoML or data analysis, point to code/notebook files, point to external data objects, workflow files, etc.), 25 | or they can be embedded (e.g. store function code + configuration, workflow steps, etc.), in this example we show how to combine both. 26 | 27 | The project file, workflow and embedded/linked code were generated by running the [**gitops_project notebook**](gitops_project.ipynb), 28 | you can modify it to your needs, this is based on code from [MLRun Demos repo](https://github.com/mlrun/demos), 29 | where you can find more end to end ML Pipeline examples. 
30 | 31 | When we change one of the elements (the project.yaml file or one of the other linked code/metadata files) and open a pull request (PR) 32 | we can type `/run` in our PR, this will trigger running the ML Pipeline (as specified in the [workflow file](workflow.py)). 33 | Once the pipeline starts, a comment will be added to your PR with a link to MLRun UI (allowing to track the progress), and when the ML Pipeline completes 34 | MLRUn will write a result summary as a comment back into your PR with links to more details and data artifacts 35 | 36 | **Flow diagram:** 37 | 38 |


39 | 40 | **This is an example of the PR comments:** 41 | 42 |


43 | 44 | **This is an example of the summary report sent to `Slack`:** 45 | 46 |


47 | 48 | **The Kubeflow pipeline graph** 49 | 50 |


51 | 52 | **MLRun UI showing the AutoML results (linked to from the PR)** 53 | 54 |


55 | 56 | ## What Is MLRun? 57 | 58 | MLRun is the first and currently only integrated open-source framework for end to end MLOps automation, it: 59 | * Orchestrates job/pipeline from simple code or pre-baked functions (via Kubeflow and various k8s CRDs) 60 | * Runs, tracks and version projects comprising of experiments, jobs/functions, data, code, models and more. 61 | * Provides an open marketplace for various ML, DL, Analytics, MLOps functions 62 | * Runs iterative AutoML, Hyper-param, or data analysis tasks on a distributed cluster 63 | * Automates deployment of models and real-time data processing functions using (Nuclio) real-time serverless engine 64 | 65 | Read more in [mlrun/mlrun](https://github.com/mlrun/mlrun) 66 | 67 | ## How To Run With Your Cluster 68 | 69 | ### Prerequisites 70 | 71 | You need access to a working Kubernetes cluster with Kubeflow, Nuclio, and MLRun (see [installing MLRun](https://github.com/mlrun/mlrun#installation))
72 | Or use [**iguazio data science platform**](https://www.iguazio.com/) with all of those pre-installed and managed. 73 | 74 | ### Clone and setup 75 | 76 | #### 1. Copy this repo to your own GitHub account by clicking the `Use this template` button 77 | 78 |


79 | 80 | #### 2. Configure the required secrets and addresses 81 | 82 | Under the repo settings select the `secrets` tab and configure the following: 83 | * `MLRUN_DBPATH` - remote URL to mlrun service (e.g. `https://`) 84 | * `SLACK_WEBHOOK` - optional, if you would like to get run summary into your slack 85 | 86 | When using Iguazio platform you should set the following: 87 | * `V3IO_USERNAME` - Iguazio platform username 88 | * `V3IO_ACCESS_KEY` - Iguazio V3IO data layer credentials (copy from your user settings) 89 | * `V3IO_PASSWORD` - user password 90 | * `V3IO_API` - V3IO data access API url (copy from the services screen) 91 | 92 | When using the open source version and a secure API gateway you can use the following secrets 93 | * `MLRUN_HTTPDB__USER` - remote username 94 | * `MLRUN_HTTPDB__PASSWORD` (for basic auth) or `MLRUN_HTTPDB__TOKEN` (for Bearer token) 95 | 96 | 97 | ### Customize 98 | 99 | Update and run the [**gitops_project notebook**](gitops_project.ipynb), 100 | The Notebook will generate the data ingestion function, the workflow code and the [**project.yaml**](project.yaml) files. 101 | You can also run the workflow from the notebook, or you can run it from the pull request. 102 | 103 | ### Run from a PR 104 | 105 | change the project.yaml file, the workflow, or other elements and create a pull request, 106 | once the PR is opened type `/run` in your PR. 107 | 108 | For troubleshooting go to the `Actions` tab to see GitHub Actions workflow progress. 
109 | 110 | -------------------------------------------------------------------------------- /docs/flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-github-actions/06d375eba8c79058ea7f1da6ebfc0758973c891f/docs/flow.png -------------------------------------------------------------------------------- /docs/mlrun.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-github-actions/06d375eba8c79058ea7f1da6ebfc0758973c891f/docs/mlrun.png -------------------------------------------------------------------------------- /docs/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-github-actions/06d375eba8c79058ea7f1da6ebfc0758973c891f/docs/pipeline.png -------------------------------------------------------------------------------- /docs/pr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-github-actions/06d375eba8c79058ea7f1da6ebfc0758973c891f/docs/pr.png -------------------------------------------------------------------------------- /docs/slack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-github-actions/06d375eba8c79058ea7f1da6ebfc0758973c891f/docs/slack.png -------------------------------------------------------------------------------- /docs/use-this.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlrun/demo-github-actions/06d375eba8c79058ea7f1da6ebfc0758973c891f/docs/use-this.png -------------------------------------------------------------------------------- /gitops_project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 
| "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Demonstrate Git Based ML Pipeline Automation\n", 8 | " --------------------------------------------------------------------\n", 9 | "\n", 10 | "Creating a local function, running predefined functions, creating and running a full ML pipeline with local and library functions.\n", 11 | "\n", 12 | "#### **notebook how-to's**\n", 13 | "* Create and test a simple function\n", 14 | "* Examine data using serverless (containarized) `describe` function\n", 15 | "* Create an automated ML pipeline from various library functions\n", 16 | "* Running and tracking the pipeline results and artifacts" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Create and Test a Local Ingestion/Data-prep Function (e.g. Iris Data Generator)\n", 24 | "Import nuclio SDK and magics, do not remove the cell and comment !!!" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# nuclio: ignore\n", 34 | "import nuclio" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "Specify function dependencies and configuration" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "%nuclio: setting spec.image to 'mlrun/ml-models'\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "%nuclio config spec.image = \"mlrun/ml-models\"" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "#### Function code\n", 66 | "Generate the iris dataset and log the dataframe (as csv or parquet file)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "import os\n", 76 | "from sklearn.datasets import load_iris\n", 77 | "from 
sklearn.model_selection import train_test_split\n", 78 | "import numpy as np\n", 79 | "from sklearn.metrics import accuracy_score\n", 80 | "from mlrun.artifacts import TableArtifact, PlotArtifact\n", 81 | "import pandas as pd\n", 82 | "\n", 83 | "def iris_generator(context, format='csv'):\n", 84 | " iris = load_iris()\n", 85 | " iris_dataset = pd.DataFrame(data=iris.data, columns=iris.feature_names)\n", 86 | " iris_labels = pd.DataFrame(data=iris.target, columns=['label'])\n", 87 | " iris_dataset = pd.concat([iris_dataset, iris_labels], axis=1)\n", 88 | " \n", 89 | " context.logger.info('saving iris dataframe to {}'.format(context.artifact_path))\n", 90 | " context.log_dataset('iris_dataset', df=iris_dataset, format=format, index=False)\n" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "The following end-code annotation tells ```nuclio``` to stop parsing the notebook from this cell. _**Please do not remove this cell**_:" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "# nuclio: end-code\n", 107 | "# marks the end of a code section" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## Create a project to host our functions, jobs and artifacts\n", 115 | "\n", 116 | "Projects are used to package multiple functions, workflows, and artifacts. 
We usually store project code and definitions in a Git archive.\n", 117 | "\n", 118 | "The following code creates a new project in a local dir and initialize git tracking on that" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 5, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "from os import path\n", 128 | "from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io\n", 129 | "mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'\n", 130 | "\n", 131 | "# specify artifacts target location\n", 132 | "artifact_path = mlconf.artifact_path or path.abspath('./')\n", 133 | "project_name = 'gitops-project'" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 6, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "from mlrun import new_project, code_to_function\n", 143 | "project_dir = './'\n", 144 | "skproj = new_project(project_name, project_dir)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "\n", 152 | "### Run/test the data generator function locally\n", 153 | "\n", 154 | "The functions above can be tested locally. Parameters, inputs, and outputs can be specified in the API or the `Task` object.
\n", 155 | "when using `run_local()` the function inputs and outputs are automatically recorded by MLRun experiment and data tracking DB.\n", 156 | "\n", 157 | "In each run we can specify the function, inputs, parameters/hyper-parameters, etc... For more details, see the [mlrun_basics notebook](mlrun_basics.ipynb)." 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 7, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "> 2020-07-29 10:38:35,433 [info] starting run iris_gen uid=3e340d3561ca402c91e9bb09b1631dd4 -> http://mlrun-api:8080\n", 170 | "> 2020-07-29 10:38:35,518 [info] saving iris dataframe to /User/demo-github-actions/data\n" 171 | ] 172 | }, 173 | { 174 | "data": { 175 | "text/html": [ 176 | "\n", 310 | "
\n", 311 | "
\n", 312 | "\n", 325 | "\n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
gitops-project0Jul 29 10:38:35completediris_gen
v3io_user=admin
kind=handler
owner=admin
host=jupyter-58d8fdb6fc-nmqbq
iris_dataset
\n", 357 | "
\n", 358 | "
\n", 359 | "
\n", 360 | " Title\n", 361 | " ×\n", 362 | "
\n", 363 | " \n", 364 | "
\n", 365 | "
\n" 366 | ], 367 | "text/plain": [ 368 | "" 369 | ] 370 | }, 371 | "metadata": {}, 372 | "output_type": "display_data" 373 | }, 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "to track results use .show() or .logs() or in CLI: \n", 379 | "!mlrun get run 3e340d3561ca402c91e9bb09b1631dd4 --project gitops-project , !mlrun logs 3e340d3561ca402c91e9bb09b1631dd4 --project gitops-project\n", 380 | "> 2020-07-29 10:38:35,641 [info] run executed, status=completed\n" 381 | ] 382 | } 383 | ], 384 | "source": [ 385 | "# run the function locally\n", 386 | "gen = run_local(name='iris_gen', handler=iris_generator, \n", 387 | " project=project_name, artifact_path=path.join(artifact_path, 'data')) " 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "#### Convert our local code to a distributed serverless function object " 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 8, 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "text/plain": [ 405 | "" 406 | ] 407 | }, 408 | "execution_count": 8, 409 | "metadata": {}, 410 | "output_type": "execute_result" 411 | } 412 | ], 413 | "source": [ 414 | "gen_func = code_to_function(name='gen_iris', kind='job')\n", 415 | "skproj.set_function(gen_func)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "metadata": {}, 428 | "source": [ 429 | "## Analyze the dataset features (useing marketplace function)\n", 430 | "load dataset analysis function (`describe`) from the function hub (marketplace), and print its doc." 
431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 15, 436 | "metadata": {}, 437 | "outputs": [ 438 | { 439 | "name": "stdout", 440 | "output_type": "stream", 441 | "text": [ 442 | "function: describe\n", 443 | "describe and visualizes dataset stats\n", 444 | "default handler: summarize\n", 445 | "entry points:\n", 446 | " summarize: Summarize a table\n", 447 | " context(MLClientCtx) - the function context, default=\n", 448 | " table(DataItem) - MLRun input pointing to pandas dataframe (csv/parquet file path), default=\n", 449 | " label_column(str) - ground truth column label, default=None\n", 450 | " class_labels(List[str]) - label for each class in tables and plots, default=[]\n", 451 | " plot_hist(bool) - (True) set this to False for large tables, default=True\n", 452 | " plots_dest(str) - destination folder of summary plots (relative to artifact_path), default=plots\n", 453 | " update_dataset - when the table is a registered dataset update the charts in-place, default=False\n" 454 | ] 455 | } 456 | ], 457 | "source": [ 458 | "skproj.set_function('hub://describe', 'describe')\n", 459 | "skproj.func('describe').doc()" 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "metadata": {}, 465 | "source": [ 466 | "### Run the describe function on our dataset (as a Kubernetes job)\n", 467 | " using shared file system mount (`mount_v3io`) with our notebook." 
468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 16, 473 | "metadata": {}, 474 | "outputs": [ 475 | { 476 | "name": "stdout", 477 | "output_type": "stream", 478 | "text": [ 479 | "> 2020-07-29 12:46:52,341 [info] starting run describe-summarize uid=301ab10adbf34adb898f0751c7f0f0b4 -> http://mlrun-api:8080\n", 480 | "> 2020-07-29 12:46:52,497 [info] Job is running in the background, pod: describe-summarize-r9tvz\n", 481 | "> 2020-07-29 12:47:01,761 [info] run executed, status=completed\n", 482 | "final state: succeeded\n" 483 | ] 484 | }, 485 | { 486 | "data": { 487 | "text/html": [ 488 | "\n", 622 | "
\n", 623 | "
\n", 624 | "\n", 637 | "\n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | "
projectuiditerstartstatenamelabelsinputsparametersresultsartifacts
gitops-project0Jul 29 12:46:57completeddescribe-summarize
v3io_user=admin
kind=job
owner=admin
host=describe-summarize-r9tvz
table
label_column=label
histograms
violin
imbalance
imbalance-weights-vec
correlation-matrix
correlation
\n", 669 | "
\n", 670 | "
\n", 671 | "
\n", 672 | " Title\n", 673 | " ×\n", 674 | "
\n", 675 | " \n", 676 | "
\n", 677 | "
\n" 678 | ], 679 | "text/plain": [ 680 | "" 681 | ] 682 | }, 683 | "metadata": {}, 684 | "output_type": "display_data" 685 | }, 686 | { 687 | "name": "stdout", 688 | "output_type": "stream", 689 | "text": [ 690 | "to track results use .show() or .logs() or in CLI: \n", 691 | "!mlrun get run 301ab10adbf34adb898f0751c7f0f0b4 --project gitops-project , !mlrun logs 301ab10adbf34adb898f0751c7f0f0b4 --project gitops-project\n", 692 | "> 2020-07-29 12:47:11,671 [info] run executed, status=completed\n" 693 | ] 694 | }, 695 | { 696 | "data": { 697 | "text/plain": [ 698 | "" 699 | ] 700 | }, 701 | "execution_count": 16, 702 | "metadata": {}, 703 | "output_type": "execute_result" 704 | } 705 | ], 706 | "source": [ 707 | "skproj.func('describe').apply(mount_v3io()).run(params={'label_column': 'label'}, \n", 708 | " inputs={\"table\": gen.outputs['iris_dataset']}, \n", 709 | " artifact_path=artifact_path)" 710 | ] 711 | }, 712 | { 713 | "cell_type": "markdown", 714 | "metadata": {}, 715 | "source": [ 716 | "## Create a Fully Automated ML Pipeline\n", 717 | "\n", 718 | "#### Add more functions to our project to be used in our pipeline (from the functions hub/marketplace)\n", 719 | "\n", 720 | "AutoML training (classifier), Model validation (test_classifier), Real-time model server, and Model REST API Tester" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 9, 726 | "metadata": {}, 727 | "outputs": [ 728 | { 729 | "data": { 730 | "text/plain": [ 731 | "" 732 | ] 733 | }, 734 | "execution_count": 9, 735 | "metadata": {}, 736 | "output_type": "execute_result" 737 | } 738 | ], 739 | "source": [ 740 | "skproj.set_function('hub://sklearn_classifier', 'train')\n", 741 | "skproj.set_function('hub://test_classifier', 'test')\n", 742 | "skproj.set_function('hub://model_server', 'serving')\n", 743 | "skproj.set_function('hub://model_server_tester', 'live_tester')\n", 744 | "#print(skproj.to_yaml())" 745 | ] 746 | }, 747 | { 748 | "cell_type": "markdown", 749 | 
"metadata": {}, 750 | "source": [ 751 | "#### Define and save a pipeline \n", 752 | "\n", 753 | "The following workflow definition will be written into a file, it describes a Kubeflow execution graph (DAG)
\n", 754 | "and how functions and data are connected to form an end to end pipeline. \n", 755 | "\n", 756 | "* Build the iris generator (ingest) function container \n", 757 | "* Ingest the iris data\n", 758 | "* Analyze the dataset (describe)\n", 759 | "* Train and test the model\n", 760 | "* Deploy the model as a real-time serverless function\n", 761 | "* Test the serverless function REST API with test dataset\n", 762 | "\n", 763 | "Check the code below to see how functions objects are initialized and used (by name) inside the workflow.
\n", 764 | "The `workflow.py` file has two parts, initialize the function objects and define pipeline dsl (connect the function inputs and outputs).\n", 765 | "\n", 766 | "> Note: the pipeline can include CI steps like building container images and deploying models as illustrated in the following example.\n" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": 17, 772 | "metadata": {}, 773 | "outputs": [ 774 | { 775 | "name": "stdout", 776 | "output_type": "stream", 777 | "text": [ 778 | "Overwriting ./workflow.py\n" 779 | ] 780 | } 781 | ], 782 | "source": [ 783 | "%%writefile ./workflow.py\n", 784 | "from kfp import dsl\n", 785 | "from mlrun import mount_v3io, NewTask\n", 786 | "\n", 787 | "\n", 788 | "funcs = {}\n", 789 | "this_project = None\n", 790 | "DATASET = 'iris_dataset'\n", 791 | "LABELS = \"label\"\n", 792 | "\n", 793 | "# init functions is used to configure function resources and local settings\n", 794 | "def init_functions(functions: dict, project=None, secrets=None):\n", 795 | " for f in functions.values():\n", 796 | " f.apply(mount_v3io())\n", 797 | " \n", 798 | " # uncomment this line to collect the inference results into a stream\n", 799 | " # and specify a path in V3IO (/)\n", 800 | " #functions['serving'].set_env('INFERENCE_STREAM', 'users/admin/model_stream')\n", 801 | "\n", 802 | " \n", 803 | "@dsl.pipeline(\n", 804 | " name=\"Demo training pipeline\",\n", 805 | " description=\"Shows how to use mlrun.\"\n", 806 | ")\n", 807 | "def kfpipeline():\n", 808 | " \n", 809 | " # run the ingestion function with the new image and params\n", 810 | " ingest = funcs['gen-iris'].as_step(\n", 811 | " name=\"get-data\",\n", 812 | " handler='iris_generator',\n", 813 | " params={'format': 'pq'},\n", 814 | " outputs=[DATASET])\n", 815 | "\n", 816 | " # analyze our dataset\n", 817 | " describe = funcs[\"describe\"].as_step(\n", 818 | " name=\"summary\",\n", 819 | " params={\"label_column\": LABELS},\n", 820 | " inputs={\"table\": 
ingest.outputs[DATASET]})\n", 821 | " \n", 822 | " # train with hyper-paremeters\n", 823 | " train = funcs[\"train\"].as_step(\n", 824 | " name=\"train\",\n", 825 | " params={\"sample\" : -1,\n", 826 | " \"label_column\" : LABELS,\n", 827 | " \"test_size\" : 0.10},\n", 828 | " hyperparams={'model_pkg_class': [\"sklearn.ensemble.RandomForestClassifier\",\n", 829 | " \"sklearn.linear_model.LogisticRegression\",\n", 830 | " \"sklearn.ensemble.AdaBoostClassifier\"]},\n", 831 | " selector='max.accuracy',\n", 832 | " inputs={\"dataset\" : ingest.outputs[DATASET]},\n", 833 | " labels={\"commit\": this_project.params.get('commit', '')},\n", 834 | " outputs=['model', 'test_set'])\n", 835 | "\n", 836 | " # test and visualize our model\n", 837 | " test = funcs[\"test\"].as_step(\n", 838 | " name=\"test\",\n", 839 | " params={\"label_column\": LABELS},\n", 840 | " inputs={\"models_path\" : train.outputs['model'],\n", 841 | " \"test_set\" : train.outputs['test_set']})\n", 842 | "\n", 843 | " # deploy our model as a serverless function\n", 844 | " deploy = funcs[\"serving\"].deploy_step(models={f\"{DATASET}_v1\": train.outputs['model']},\n", 845 | " tag=this_project.params.get('commit', 'v1'))\n", 846 | "\n", 847 | " # test out new model server (via REST API calls)\n", 848 | " tester = funcs[\"live_tester\"].as_step(name='model-tester',\n", 849 | " params={'addr': deploy.outputs['endpoint'], 'model': f\"{DATASET}_v1\"},\n", 850 | " inputs={'table': train.outputs['test_set']})\n" 851 | ] 852 | }, 853 | { 854 | "cell_type": "code", 855 | "execution_count": 18, 856 | "metadata": {}, 857 | "outputs": [], 858 | "source": [ 859 | "# register the workflow file as \"main\", embed the workflow code into the project YAML\n", 860 | "skproj.set_workflow('main', 'workflow.py')" 861 | ] 862 | }, 863 | { 864 | "cell_type": "markdown", 865 | "metadata": {}, 866 | "source": [ 867 | "Save the project definitions to a file (project.yaml), it is recommended to commit all changes to a Git repo." 
868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": 22, 873 | "metadata": {}, 874 | "outputs": [], 875 | "source": [ 876 | "skproj.artifact_path = 'v3io:///users/{{run.user}}/pipe/{{workflow.uid}}'\n", 877 | "skproj.save()" 878 | ] 879 | }, 880 | { 881 | "cell_type": "markdown", 882 | "metadata": {}, 883 | "source": [ 884 | "\n", 885 | "## Run a pipeline workflow manually (not via git PR)\n", 886 | "\n", 887 | "This section is not used for the git automation, rather demo how to run the workflow from the notebook\n", 888 | "\n", 889 | "use the `run` method to execute a workflow, you can provide alternative arguments and specify the default target for workflow artifacts.
\n", 890 | "The workflow ID is returned and can be used to track the progress or you can use the hyperlinks\n", 891 | "\n", 892 | "> Note: The same command can be issued through CLI commands:
\n", 893 | " `mlrun project my-proj/ -r main -p \"v3io:///users/{{run.user}}/mlrun/kfp/{{workflow.uid}}/\"`\n", 894 | "\n", 895 | "The `dirty` flag allow us to run a project with uncommited changes (when the notebook is in the same git dir it will always be dirty)
\n", 896 | "The `watch` flag will wait for the pipeline to complete" 897 | ] 898 | }, 899 | { 900 | "cell_type": "code", 901 | "execution_count": 23, 902 | "metadata": {}, 903 | "outputs": [], 904 | "source": [ 905 | "# If you want to get slack notification after the run with result summary, set the env var below\n", 906 | "# %env SLACK_WEBHOOK=" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": 24, 912 | "metadata": {}, 913 | "outputs": [ 914 | { 915 | "data": { 916 | "text/html": [ 917 | "Experiment link here" 918 | ], 919 | "text/plain": [ 920 | "" 921 | ] 922 | }, 923 | "metadata": {}, 924 | "output_type": "display_data" 925 | }, 926 | { 927 | "data": { 928 | "text/html": [ 929 | "Run link here" 930 | ], 931 | "text/plain": [ 932 | "" 933 | ] 934 | }, 935 | "metadata": {}, 936 | "output_type": "display_data" 937 | }, 938 | { 939 | "name": "stdout", 940 | "output_type": "stream", 941 | "text": [ 942 | "> 2020-07-29 13:04:18,155 [info] Pipeline run id=8f462295-2154-428a-b861-4ec8be504832, check UI or DB for progress\n", 943 | "> 2020-07-29 13:04:18,156 [info] waiting for pipeline run completion\n" 944 | ] 945 | }, 946 | { 947 | "data": { 948 | "text/html": [ 949 | "

Run Results

Workflow 8f462295-2154-428a-b861-4ec8be504832 finished, status=Succeeded
click the hyper links below to see detailed results
\n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | "
uidstartstatenameresultsartifacts
Jul 29 13:05:03completedmodel-tester
total_tests=15
errors=0
match=14
avg_latency=11446
min_latency=11047
max_latency=12131
latency
Jul 29 13:04:54completedtest
accuracy=0.9333333333333333
test-error=0.06666666666666667
auc-micro=0.9655555555555556
auc-weighted=0.9888888888888889
f1-score=0.9137254901960784
precision_score=0.8888888888888888
recall_score=0.9629629629629629
confusion-matrix
feature-importances
precision-recall-multiclass
roc-multiclass
test_set_preds
Jul 29 13:04:37completedsummary
histograms
violin
imbalance
imbalance-weights-vec
correlation-matrix
correlation
Jul 29 13:04:36completedtrain
best_iteration=1
accuracy=0.9705882352941176
test-error=0.029411764705882353
auc-micro=0.9969723183391004
auc-weighted=0.9949732620320856
f1-score=0.9679633867276888
precision_score=0.9666666666666667
recall_score=0.9722222222222222
test_set
confusion-matrix
feature-importances
precision-recall-multiclass
roc-multiclass
model
iteration_results
Jul 29 13:04:26completedget-data
iris_dataset
" 1003 | ], 1004 | "text/plain": [ 1005 | "" 1006 | ] 1007 | }, 1008 | "metadata": {}, 1009 | "output_type": "display_data" 1010 | } 1011 | ], 1012 | "source": [ 1013 | "run_id = skproj.run(\n", 1014 | " 'main', arguments={}, \n", 1015 | " dirty=True, watch=True)" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "markdown", 1020 | "metadata": {}, 1021 | "source": [ 1022 | "**[back to top](#top)**" 1023 | ] 1024 | } 1025 | ], 1026 | "metadata": { 1027 | "kernelspec": { 1028 | "display_name": "Python 3", 1029 | "language": "python", 1030 | "name": "python3" 1031 | }, 1032 | "language_info": { 1033 | "codemirror_mode": { 1034 | "name": "ipython", 1035 | "version": 3 1036 | }, 1037 | "file_extension": ".py", 1038 | "mimetype": "text/x-python", 1039 | "name": "python", 1040 | "nbconvert_exporter": "python", 1041 | "pygments_lexer": "ipython3", 1042 | "version": "3.7.6" 1043 | } 1044 | }, 1045 | "nbformat": 4, 1046 | "nbformat_minor": 4 1047 | } 1048 | -------------------------------------------------------------------------------- /project.yaml: -------------------------------------------------------------------------------- 1 | name: gitops-project 2 | functions: 3 | - name: gen-iris 4 | spec: 5 | kind: job 6 | metadata: 7 | name: gen-iris 8 | tag: '' 9 | project: gitops-project 10 | spec: 11 | command: '' 12 | args: [] 13 | image: mlrun/ml-models 14 | env: [] 15 | default_handler: '' 16 | entry_points: 17 | iris_generator: 18 | name: iris_generator 19 | doc: '' 20 | parameters: 21 | - name: context 22 | default: '' 23 | - name: format 24 | default: csv 25 | outputs: 26 | - default: '' 27 | lineno: 11 28 | description: '' 29 | build: 30 | functionSourceCode: 
IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmZyb20gc2tsZWFybi5kYXRhc2V0cyBpbXBvcnQgbG9hZF9pcmlzCmZyb20gc2tsZWFybi5tb2RlbF9zZWxlY3Rpb24gaW1wb3J0IHRyYWluX3Rlc3Rfc3BsaXQKaW1wb3J0IG51bXB5IGFzIG5wCmZyb20gc2tsZWFybi5tZXRyaWNzIGltcG9ydCBhY2N1cmFjeV9zY29yZQpmcm9tIG1scnVuLmFydGlmYWN0cyBpbXBvcnQgVGFibGVBcnRpZmFjdCwgUGxvdEFydGlmYWN0CmltcG9ydCBwYW5kYXMgYXMgcGQKCmRlZiBpcmlzX2dlbmVyYXRvcihjb250ZXh0LCBmb3JtYXQ9J2NzdicpOgogICAgaXJpcyA9IGxvYWRfaXJpcygpCiAgICBpcmlzX2RhdGFzZXQgPSBwZC5EYXRhRnJhbWUoZGF0YT1pcmlzLmRhdGEsIGNvbHVtbnM9aXJpcy5mZWF0dXJlX25hbWVzKQogICAgaXJpc19sYWJlbHMgPSBwZC5EYXRhRnJhbWUoZGF0YT1pcmlzLnRhcmdldCwgY29sdW1ucz1bJ2xhYmVsJ10pCiAgICBpcmlzX2RhdGFzZXQgPSBwZC5jb25jYXQoW2lyaXNfZGF0YXNldCwgaXJpc19sYWJlbHNdLCBheGlzPTEpCiAgICAKICAgIGNvbnRleHQubG9nZ2VyLmluZm8oJ3NhdmluZyBpcmlzIGRhdGFmcmFtZSB0byB7fScuZm9ybWF0KGNvbnRleHQuYXJ0aWZhY3RfcGF0aCkpCiAgICBjb250ZXh0LmxvZ19kYXRhc2V0KCdpcmlzX2RhdGFzZXQnLCBkZj1pcmlzX2RhdGFzZXQsIGZvcm1hdD1mb3JtYXQsIGluZGV4PUZhbHNlKQoK 31 | commands: [] 32 | code_origin: https://github.com/mlrun/demo-github-actions.git#0e717588b1354d3d60cd96ba5c352d71aace0552 33 | - url: hub://sklearn_classifier 34 | name: train 35 | - url: hub://test_classifier 36 | name: test 37 | - url: hub://model_server 38 | name: serving 39 | - url: hub://model_server_tester 40 | name: live_tester 41 | - url: hub://describe 42 | name: describe 43 | workflows: 44 | - name: main 45 | path: workflow.py 46 | artifacts: [] 47 | artifact_path: v3io:///users/{{run.user}}/pipe/{{workflow.uid}} 48 | -------------------------------------------------------------------------------- /workflow.py: -------------------------------------------------------------------------------- 1 | from kfp import dsl 2 | from mlrun import mount_v3io, NewTask 3 | 4 | 5 | funcs = {} 6 | this_project = None 7 | DATASET = 'iris_dataset' 8 | LABELS = "label" 9 | 10 | # init functions is used to configure function resources and local settings 11 | def init_functions(functions: dict, 
project=None, secrets=None): 12 | for f in functions.values(): 13 | f.apply(mount_v3io()) 14 | 15 | # uncomment this line to collect the inference results into a stream 16 | # and specify a path in V3IO (/) 17 | #functions['serving'].set_env('INFERENCE_STREAM', 'users/admin/model_stream') 18 | 19 | 20 | @dsl.pipeline( 21 | name="Demo training pipeline", 22 | description="Shows how to use mlrun." 23 | ) 24 | def kfpipeline(): 25 | 26 | # run the ingestion function with the new image and params 27 | ingest = funcs['gen-iris'].as_step( 28 | name="get-data", 29 | handler='iris_generator', 30 | params={'format': 'pq'}, 31 | outputs=[DATASET]) 32 | 33 | # analyze our dataset 34 | describe = funcs["describe"].as_step( 35 | name="summary", 36 | params={"label_column": LABELS}, 37 | inputs={"table": ingest.outputs[DATASET]}) 38 | 39 | # train with hyper-paremeters 40 | train = funcs["train"].as_step( 41 | name="train", 42 | params={"sample" : -1, 43 | "label_column" : LABELS, 44 | "test_size" : 0.10}, 45 | hyperparams={'model_pkg_class': ["sklearn.ensemble.RandomForestClassifier", 46 | "sklearn.linear_model.LogisticRegression", 47 | "sklearn.ensemble.AdaBoostClassifier"]}, 48 | selector='max.accuracy', 49 | inputs={"dataset" : ingest.outputs[DATASET]}, 50 | labels={"commit": this_project.params.get('commit', '')}, 51 | outputs=['model', 'test_set']) 52 | 53 | # test and visualize our model 54 | test = funcs["test"].as_step( 55 | name="test", 56 | params={"label_column": LABELS}, 57 | inputs={"models_path" : train.outputs['model'], 58 | "test_set" : train.outputs['test_set']}) 59 | 60 | # deploy our model as a serverless function 61 | deploy = funcs["serving"].deploy_step(models={f"{DATASET}_v1": train.outputs['model']}, 62 | tag=this_project.params.get('commit', 'v1')) 63 | 64 | # test out new model server (via REST API calls) 65 | tester = funcs["live_tester"].as_step(name='model-tester', 66 | params={'addr': deploy.outputs['endpoint'], 'model': f"{DATASET}_v1"}, 67 | 
inputs={'table': train.outputs['test_set']}) 68 | --------------------------------------------------------------------------------