├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── codeql.yml │ ├── integration_test_minio_gitops.yaml │ ├── linter-py.yaml │ └── main.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── charts └── mlflow-controller │ ├── .helmignore │ ├── Chart.yaml │ ├── README.md │ ├── templates │ ├── _helpers.tpl │ ├── deployment-ui.yaml │ ├── deployment.yaml │ ├── gitops-cm.yaml │ ├── ingress.yaml │ ├── mlflow-cm.yaml │ ├── service.yaml │ └── serviceaccount.yaml │ └── values.yaml ├── doc ├── Mlflow Deployment controller.drawio ├── doc.md └── gitops.md ├── examples ├── argo-manifest │ ├── mlflow-controller-production.yaml │ ├── mlflow-controller.yaml │ ├── mlflow.yaml │ └── seldon-core.yaml ├── gitops │ └── gitops.ipynb ├── notebook │ ├── deploy.yaml │ └── mlflow.ipynb └── readme.md ├── main.py ├── mlflow_controller ├── __init__.py ├── controller.py ├── gitops.py ├── mlflow_direct.py ├── mlservers │ ├── kserve.py │ ├── rclone.py │ ├── seldon.py │ └── utils.py ├── registries │ ├── mlflow.py │ └── mlflow_backend.py └── utils │ └── var_extract.py ├── requirements.txt ├── test.py ├── tests ├── docker_build_push.sh ├── install_gitea.sh ├── install_istio.sh ├── install_kserve.sh ├── install_kserve_deployment_controller.sh ├── install_mlflow.sh ├── install_seldon_core.sh ├── install_seldon_deployment_controller.sh ├── kind-cluster-1-24.yaml ├── log_mlflow_model.sh ├── mlflow-cm.yaml ├── mlflow │ ├── iris.py │ ├── list_model.py │ └── test_deploy.py ├── pf_mlflow.sh ├── repo-test │ ├── production │ │ ├── kserve-s3.yaml │ │ └── seldon-s3.yaml │ └── staging │ │ ├── kserve-s3.yaml │ │ ├── kserve-s3t.yaml │ │ ├── kserve-sa.yaml │ │ ├── seldon-s3.yaml │ │ ├── seldon-secret.yaml │ │ └── seldon-single-model.yaml └── setup_git_repo.sh ├── tox.ini └── ui ├── Dockerfile ├── app.py ├── pages ├── deployments.py ├── logs.py ├── not_found_404.py └── seldon.py ├── requirements.txt └── seldon_deployments ├── card.py └── data.py 
/.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 
18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "main", gh-pages ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "main" ] 20 | schedule: 21 | - cron: '33 5 * * 0' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Use only 'java' to analyze code written in Java, Kotlin or both 38 | # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both 39 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 40 | 41 | steps: 42 | - name: Checkout repository 43 | uses: actions/checkout@v3 44 | 45 | # Initializes the CodeQL tools for scanning. 46 | - name: Initialize CodeQL 47 | uses: github/codeql-action/init@v2 48 | with: 49 | languages: ${{ matrix.language }} 50 | # If you wish to specify custom queries, you can do so here or in a config file. 
51 | # By default, queries listed here will override any specified in a config file. 52 | # Prefix the list here with "+" to use these queries and those in the config file. 53 | 54 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 55 | # queries: security-extended,security-and-quality 56 | 57 | 58 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). 59 | # If this step fails, then you should remove it and run the build manually (see below) 60 | - name: Autobuild 61 | uses: github/codeql-action/autobuild@v2 62 | 63 | # ℹ️ Command-line programs to run using the OS shell. 64 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 65 | 66 | # If the Autobuild fails above, remove it and uncomment the following three lines. 67 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 
68 | 69 | # - run: | 70 | # echo "Run, Build Application using script" 71 | # ./location_of_script_within_repo/buildscript.sh 72 | 73 | - name: Perform CodeQL Analysis 74 | uses: github/codeql-action/analyze@v2 75 | with: 76 | category: "/language:${{matrix.language}}" 77 | -------------------------------------------------------------------------------- /.github/workflows/integration_test_minio_gitops.yaml: -------------------------------------------------------------------------------- 1 | name: Integration test gitops in KinD [minio] 2 | on: 3 | pull_request: 4 | 5 | 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | mlserver: [seldon,kserve] 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v3 16 | with: 17 | ref: ${{ github.event.pull_request.head.sha }} 18 | 19 | - uses: engineerd/setup-kind@v0.5.0 20 | with: 21 | skipClusterCreation: "true" 22 | version: v0.17.0 23 | 24 | - name: Install Helm 25 | uses: azure/setup-helm@v1 26 | with: 27 | version: v3.8.1 28 | 29 | 30 | - name: Create KinD Cluster 31 | run: kind create cluster --config tests/kind-cluster-1-24.yaml 32 | 33 | - name: Testing 34 | run: | 35 | kubectl cluster-info 36 | kubectl get pods -n kube-system 37 | echo "current-context:" $(kubectl config current-context) 38 | echo "environment-kubeconfig:" ${KUBECONFIG} 39 | kubectl get nodes 40 | kubectl wait --for=condition=Ready nodes --all --timeout=600s 41 | 42 | - name: Install Gitea 43 | run: ./tests/install_gitea.sh 44 | 45 | - name: setup git repo 46 | run: ./tests/setup_git_repo.sh 47 | 48 | - name: Install mlflow 49 | run: ./tests/install_mlflow.sh 50 | 51 | - name: PF Mlflow 52 | run: ./tests/pf_mlflow.sh 53 | - uses: actions/setup-python@v4 54 | with: 55 | python-version: '3.7' 56 | 57 | - name: Log model Mlflow 58 | run: ./tests/log_mlflow_model.sh 59 | 60 | - name: Install Kserve 61 | run: ./tests/install_kserve.sh 62 | 63 | - name: Install Seldon Core 64 | run: 
./tests/install_seldon_core.sh 65 | 66 | - name: Build and Push image 67 | run: ./tests/docker_build_push.sh 68 | 69 | - name: Install deployment controller ${{matrix.mlserver}} 70 | run: ./tests/install_${{matrix.mlserver}}_deployment_controller.sh 71 | env: 72 | mlserver: ${{matrix.mlserver}} 73 | 74 | -------------------------------------------------------------------------------- /.github/workflows/linter-py.yaml: -------------------------------------------------------------------------------- 1 | name: linter 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | lint-python: 7 | runs-on: ubuntu-latest 8 | env: 9 | PYTHON: 3.8 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Setup Python 13 | id: setup-python 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: "3.8" 17 | architecture: x64 18 | - name: Upgrade pip version 19 | run: | 20 | pip install --upgrade "pip>=21.3.1,<22.1" 21 | - name: requirements.txt 22 | id: pip-requirements 23 | run: | 24 | pip install isort black flake8 25 | 26 | - name: Lint python 27 | run: make lint-python-check -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Release Charts 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - 'charts/**' 9 | permissions: write-all 10 | 11 | jobs: 12 | release: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v2 17 | with: 18 | fetch-depth: 0 19 | 20 | - name: Configure Git 21 | run: | 22 | git config user.name "$GITHUB_ACTOR" 23 | git config user.email "$GITHUB_ACTOR@users.noreply.github.com" 24 | 25 | - name: Install Helm 26 | uses: azure/setup-helm@v1 27 | with: 28 | version: v3.8.1 29 | 30 | - name: Run chart-releaser 31 | uses: helm/chart-releaser-action@v1.4.0 32 | env: 33 | CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" 34 | docker: 35 | runs-on: ubuntu-latest 36 | needs: 
["release"] 37 | steps: 38 | - name: Checkout 39 | uses: actions/checkout@v2 40 | with: 41 | fetch-depth: 0 42 | - 43 | name: Set up QEMU 44 | uses: docker/setup-qemu-action@v2 45 | - 46 | name: Set up Docker Buildx 47 | uses: docker/setup-buildx-action@v2 48 | 49 | - name: 'Get Previous tag' 50 | id: previoustag 51 | uses: "WyriHaximus/github-action-get-previous-tag@v1" 52 | with: 53 | fallback: 1.0.0 54 | - 55 | name: Login to DockerHub 56 | uses: docker/login-action@v2 57 | with: 58 | username: ${{ secrets.DOCKERHUB_USERNAME }} 59 | password: ${{ secrets.DOCKERHUB_TOKEN }} 60 | - 61 | name: Build and push 62 | uses: docker/build-push-action@v3 63 | with: 64 | push: true 65 | tags: tachyongroup/mlflow-deployment-controller:${{ steps.previoustag.outputs.tag }} 66 | 67 | - 68 | name: Build and push 69 | uses: docker/build-push-action@v3 70 | with: 71 | push: true 72 | context: ui/ 73 | tags: tachyongroup/mlflow-deployment-controller-ui:${{ steps.previoustag.outputs.tag }} 74 | 75 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | mdc/* 6 | *.DS_Store 7 | tmp/* 8 | # C extensions 9 | *.so 10 | scripts/* 11 | env* 12 | mdc/* 13 | *DS_Store 14 | live.py 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | cover/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | .pybuilder/ 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | # For a library or package, you might want to ignore these files since the code is 93 | # intended to run in multiple environments; otherwise, check them in: 94 | # .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/#use-with-ide 116 | .pdm.toml 117 | 118 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 119 | __pypackages__/ 120 | 121 | # Celery stuff 122 | celerybeat-schedule 123 | celerybeat.pid 124 | 125 | # SageMath parsed files 126 | *.sage.py 127 | 128 | # Environments 129 | .env 130 | .venv 131 | env/ 132 | venv/ 133 | ENV/ 134 | env.bak/ 135 | venv.bak/ 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # mkdocs documentation 145 | /site 146 | 147 | # mypy 148 | .mypy_cache/ 149 | .dmypy.json 150 | dmypy.json 151 | 152 | # Pyre type checker 153 | .pyre/ 154 | 155 | # pytype static type analyzer 156 | .pytype/ 157 | 158 | # Cython debug symbols 159 | cython_debug/ 160 | 161 | # PyCharm 162 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 163 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 164 | # and can be added to the global gitignore or merged into this file. For a more nuclear 165 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 166 | #.idea/ 167 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8.16-slim-buster 2 | RUN apt-get -y update 3 | RUN apt-get -y install git 4 | COPY requirements.txt requirements.txt 5 | RUN pip install -r requirements.txt 6 | RUN pip install protobuf==3.20 7 | WORKDIR /app 8 | COPY . 
/app 9 | CMD ["python", "main.py"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 ROCKET9 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ROOT_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 2 | 3 | lint-python: 4 | cd ${ROOT_DIR}; python -m isort . --recursive --atomic 5 | cd ${ROOT_DIR}; python -m black . 6 | cd ${ROOT_DIR}; python -m flake8 mlflow_controller/ 7 | cd ${ROOT_DIR}; python -m flake8 ui/ 8 | # autoflake --remove-all-unused-imports -i -r . 
9 | 10 | 11 | lint-python-check: 12 | # cd ${ROOT_DIR}; python -m isort mlflow_controller/ --check-only 13 | cd ${ROOT_DIR}; python -m flake8 mlflow_controller/ 14 | cd ${ROOT_DIR}; python -m black --check mlflow_controller 15 | cd ${ROOT_DIR}; python -m black --check ui -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | Hellopmlops 3 | 4 |   5 | 6 | 7 |
8 | 9 |

Mlflow Deployment Controller

10 | 11 | [![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/mlflow-deployment-controller)](https://artifacthub.io/packages/search?repo=mlflow-deployment-controller) 12 | 13 | 14 |
15 | 16 | ## :dart: About ## 17 | 18 | Mlflow does not have integration with model servers (e.g. Seldon-core) for automated deployment of models when they are registered or promoted to different stages. Mlflow deployment controller tries to solve this problem. Mlflow deployment controller is a Python-based controller which periodically checks the state between mlflow and the model server's CRDs in k8s and acts accordingly. Every stage in Mlflow needs a separate controller, as in the real world we would have different clusters for each stage. You can configure the controller to manage the state for a certain stage based on the use case. 19 | 20 | Screenshot 2022-12-17 at 5 36 52 PM 21 | 22 | 23 | ## :rocket: Technologies ## 24 | 25 | The following tools were used in this project: 26 | 27 | - [Seldon-Core](https://docs.seldon.io/projects/seldon-core/en/latest/index.html) 28 | - [Mlflow](https://www.mlflow.org/docs/latest/index.html) 29 | 30 | ## :white_check_mark: Requirements ## 31 | 32 | Before starting :checkered_flag:, you need to have [Helm](https://helm.sh/docs/helm/helm_install/) 33 | 34 | ## :checkered_flag: Starting ## 35 | 36 | ```bash 37 | $ helm repo add rocket9-code https://rocket9-code.github.io/helm-charts 38 | 39 | $ helm install mlflow-deployment-controller rocket9-code/mlflow-deployment-controller 40 | 41 | ``` 42 | 43 | ## To Setup Deployment controller in different environments 44 | 45 | ### For Staging environment 46 | 47 | Deployment controller will look for models logged with deploy.yaml in Mlflow Staging Environment and deploys the model in staging Namespace 48 | 49 | ```bash 50 | $ helm repo add rocket9-code https://rocket9-code.github.io/mlflow-deployment-controller/ 51 | 52 | $ helm install mlflow-deployment-controller-staging rocket9-code/mlflow-deployment-controller --set mlflow.stage=Staging --set mlflow.namespace=staging 53 | 54 | ``` 55 | 56 | ### For Production environment 57 | 58 | Deployment controller will look for models logged with deploy.yaml in
Mlflow Production Environment and deploys the model in production Namespace 59 | 60 | ```bash 61 | $ helm repo add rocket9-code https://rocket9-code.github.io/helm-charts 62 | 63 | $ helm install mlflow-deployment-controller-production rocket9-code/mlflow-deployment-controller --set mlflow.stage=Production --set mlflow.namespace=production 64 | 65 | ``` 66 | 67 | Quick Start using argocd 68 | --- 69 | 70 | Setup Mlflow and Mlflow controllers for different stages using argocd 71 | 72 | ``` 73 | kubectl apply -f examples/argo-manifest 74 | ``` 75 | 76 | #### Log a Mlflow model with Seldon deployment configuration with the name deploy.yaml 77 | 78 | The Model Uri parameter will be overwritten by the controller, so it can be left blank 79 | 80 | Screenshot 2022-07-10 at 6 26 01 PM 81 | 82 | If any Model in mlflow is registered with deploy.yaml, the deployment controller will start deploying or managing the model server based on the config 83 | 84 | Screenshot 2022-07-10 at 6 25 47 PM 85 | 86 | 87 | 88 | Once the Model is logged with deploy.yaml, the deployment controller will deploy the model to the predefined namespace. 89 | Currently, the deployment controller does not have a UI (but it is in our roadmap), so you can check the logs of the Mlflow deployment controller to see the model deployment and any errors 90 | 91 | 92 | 93 | ``` 94 | kubectl logs -f deployment/mlflow-deploment-controller 95 | ``` 96 | 97 | 98 | 99 | Screenshot 2022-07-10 at 6 27 11 PM 100 | 101 | 102 | https://user-images.githubusercontent.com/62284209/182024746-1fa281ac-a388-467e-98cd-98e9f40a0ed0.mp4 103 | 104 | 105 | ## Gitops based deployment controller 106 | 107 | The Gitops based deployment controller helps to version control seldon deployments as well as version control the models in ml registries in an automated way.
108 | The controller expects a templated variable in place of the modelUri in the deployment files, which will be updated by the controller with the latest version 109 | available from the registry's given stage. For example, if a controller is set up for the prod namespace and the production stage in mlflow and is looking at the git repository 110 | under the folder production, it will get the manifests from the git repo's folder and the latest version from mlflow and deploy the model servers. 111 | 112 | 113 | 114 | Create a new repository for the deployment controller and create a seldon manifest. In place of modelUri, use this template '{{ mlflow.blob["iris demo1"] }}' 115 | to specify the model metadata. The syntax of the template is {{ registry.backend["MODEL NAME IN REGISTRY"]}} 116 | 117 | Example deployment file deploying multiple models in seldon-core 118 |
119 | Expand me 120 | 121 | ``` 122 | apiVersion: machinelearning.seldon.io/v1 123 | kind: SeldonDeployment 124 | metadata: 125 | name: mlflow-var 126 | spec: 127 | name: iris 128 | predictors: 129 | - graph: 130 | children: 131 | - name: step-one 132 | modelUri: '{{ mlflow.blob["iris demo1"] }}' 133 | envSecretRefName: seldon-rclone-secret 134 | implementation: MLFLOW_SERVER 135 | type: MODEL 136 | children: 137 | - name: step-two 138 | modelUri: '{{ mlflow.blob["iris demo2"] }}' 139 | envSecretRefName: seldon-rclone-secret 140 | implementation: MLFLOW_SERVER 141 | type: MODEL 142 | children: [] 143 | - name: step-three 144 | implementation: MLFLOW_SERVER 145 | modelUri: '{{ mlflow.blob["iris demo3"] }}' 146 | envSecretRefName: seldon-rclone-secret 147 | type: MODEL 148 | children: [] 149 | implementation: MLFLOW_SERVER 150 | modelUri: '{{ mlflow.blob["iris demo4"] }}' 151 | envSecretRefName: seldon-rclone-secret 152 | logger: 153 | url: http://broker-ingress.knative-eventing.svc.cluster.local/demo/default 154 | mode: all 155 | name: classifier 156 | name: default 157 | replicas: 1 158 | ``` 159 |
160 | 161 | 162 | The template values are updated by the controller with the latest version the registry as below and submitted to the kubernetes api 163 | 164 |
165 | Expand me 166 | 167 | ``` 168 | apiVersion: machinelearning.seldon.io/v1 169 | kind: SeldonDeployment 170 | metadata: 171 | name: mlflow-var 172 | namespace: staging 173 | spec: 174 | name: iris 175 | predictors: 176 | - graph: 177 | children: 178 | - children: 179 | - children: [] 180 | envSecretRefName: seldon-rclone-secret 181 | implementation: MLFLOW_SERVER 182 | modelUri: '{{ mlflow.blob["iris demo2"] }}' 183 | name: step-two 184 | type: MODEL 185 | envSecretRefName: seldon-rclone-secret 186 | implementation: MLFLOW_SERVER 187 | modelUri: '{{ mlflow.blob["iris demo1"] }}' 188 | name: step-one 189 | type: MODEL 190 | - children: [] 191 | envSecretRefName: seldon-rclone-secret 192 | implementation: MLFLOW_SERVER 193 | modelUri: >- 194 | wasbs://artifacts/mlflow/10/262bee84b7dd4b039973084383880b57/artifacts/model 195 | name: step-three 196 | type: MODEL 197 | envSecretRefName: seldon-rclone-secret 198 | implementation: MLFLOW_SERVER 199 | logger: 200 | mode: all 201 | url: >- 202 | http://broker-ingress.knative-eventing.svc.cluster.local/demo/default 203 | modelUri: '{{ mlflow.blob["iris demo4"] }}' 204 | name: classifier 205 | name: default 206 | ``` 207 |
208 | 209 | 210 | To enable gitops in the controller 211 | 212 | ``` 213 | ! helm repo add rocket9-code https://rocket9-code.github.io/helm-charts 214 | 215 | ! helm install mlflow-controller rocket9-code/mlflow-deployment-controller -n mlflow --set gitops.enabled=true 216 | ``` 217 | Supported values 218 | registries: mlflow 219 | backend: blob , gcs , s3 220 | 221 | Future releases may add support for azureml registries and databricks mlflow 222 | 223 | 224 | ## To Setup Deployment controller in different environments with Gitops Enabled 225 | 226 | ### For Staging environment 227 | 228 | Deployment controller will look for yaml files in the staging folder and models in the Mlflow Staging Environment and deploys the model in staging Namespace 229 | 230 | ```bash 231 | $ helm repo add rocket9-code https://rocket9-code.github.io/mlflow-deployment-controller/ 232 | 233 | $ helm install mlflow-deployment-controller-staging rocket9-code/mlflow-deployment-controller --set gitops.enabled=true \ 234 | --set gitops.repository=github.com/rocket9-code/model-deployments \ 235 | --set gitops.deploymentLocation=staging --set mlflow.stage=Staging \ 236 | --set mlflow.namespace=staging 237 | 238 | ``` 239 | 240 | ### For Production environment 241 | 242 | Deployment controller will look for yaml files in the production folder and models in the Mlflow Production Environment and deploys the model in production Namespace 243 | 244 | ```bash 245 | $ helm repo add rocket9-code https://rocket9-code.github.io/helm-charts 246 | 247 | $ helm install mlflow-deployment-controller-production rocket9-code/mlflow-deployment-controller --set gitops.enabled=true \ 248 | --set gitops.repository=github.com/rocket9-code/model-deployments \ 249 | --set gitops.deploymentLocation=production --set mlflow.stage=Production \ 250 | --set mlflow.namespace=production 251 | 252 | ``` 253 | 254 | A quick start example is available at examples/gitops 255 | 256 | Support matrix 257 | | Ml endpoints | Seldon core | Kserve | Databricks
| Azure ml | Vertex AI | SageMaker | 258 | |-----|---------|---------|---------|---------|---------|---------| 259 | | Registries | | | | | | 260 | | mlflow oss gcs | :white_check_mark: | :white_check_mark: | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | 261 | | mlflow oss blob | :white_check_mark: | :white_check_mark: | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | 262 | | mlflow oss s3 | :white_check_mark: | :white_check_mark: | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | 263 | | databricks mlflow| ✖️ (in roadmap) | ✖️ (in roadmap) | --- | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | 264 | | azureml | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | 265 | | vertexai registry | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | 266 | 267 | ## :memo: License ## 268 | 269 | This project is under license from MIT. For more details, see the [LICENSE](LICENSE.md) file. 270 | 271 | Back to top 272 | -------------------------------------------------------------------------------- /charts/mlflow-controller/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /charts/mlflow-controller/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: mlflow-controller 3 | description: A Helm chart for Mlflow Deployment Controller and MDC ui 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # 8 | # Library charts provide useful utilities or functions for the chart developer. They're included as 9 | # a dependency of application charts to inject those utilities and functions into the rendering 10 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 11 | type: application 12 | # This is the chart version. This version number should be incremented each time you make changes 13 | # to the chart and its templates, including the app version. 14 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 15 | version: 0.1.8 16 | 17 | # This is the version number of the application being deployed. This version number should be 18 | # incremented each time you make changes to the application. Versions are not expected to 19 | # follow Semantic Versioning. They should reflect the version the application is using. 20 | # It is recommended to use it with quotes. 
21 | appVersion: "0.1.8" 22 | -------------------------------------------------------------------------------- /charts/mlflow-controller/README.md: -------------------------------------------------------------------------------- 1 | # mlflow-controller 2 | 3 | ![Version: 0.1.6](https://img.shields.io/badge/Version-0.1.6-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.1.6](https://img.shields.io/badge/AppVersion-0.1.6-informational?style=flat-square) 4 | 5 | A Helm chart for Mlflow Deployment Controller 6 | 7 | ## Values 8 | 9 | | Key | Type | Default | Description | 10 | |-----|------|---------|-------------| 11 | | affinity | object | `{}` | affinity | 12 | | envFromSecret | string | `""` | additional ENV from secret | 13 | | fullnameOverride | string | `""` | | 14 | | gitops.BRANCH | string | `"main"` | | 15 | | gitops.deploymentLocation | string | `"/"` | deployment files folder location | 16 | | gitops.enabled | bool | `true` | enable/disable gitops | 17 | | gitops.gitPasswordSecretKey | string | `"githubtoken"` | git password secret key | 18 | | gitops.gitPasswordSecretName | string | `"github-secret"` | git password secret name | 19 | | gitops.gitUser | string | `"mdcadmin"` | git username | 20 | | gitops.protocol | string | `"https"` | git repo protocol | 21 | | gitops.repository | string | `"github.com/rocket9-code/model-deployments"` | git repository | 22 | | image.pullPolicy | string | `"Always"` | image pull policy | 23 | | image.repository | string | `"tachyongroup/mlflow-deployment-controller"` | image repository | 24 | | image.tag | string | `"mlflow-controller-0.1.6"` | image tag | 25 | | imagePullSecrets | list | `[]` | | 26 | | mlflow.MLFLOW_TRACKING_URI | string | `"http://mlflow-service:5000"` | mlflow tracking uri | 27 | | mlflow.backend | string | `"blob"` | Object Storage Used by mlflow supported gcs , blob , s3 | 28 | | mlflow.enabled | bool | 
`true` | | 29 | | mlflow.namespace | string | `"staging"` | Namespace model to be deployed | 30 | | mlflow.stage | string | `"Staging"` | Stage To be Tracked From Mlflow | 31 | | mlserver | string | `"seldon"` | mlserver one of [seldon, kserve] | 32 | | nameOverride | string | `""` | | 33 | | nodeSelector | object | `{}` | node selector | 34 | | podAnnotations | object | `{}` | pod annotations | 35 | | podSecurityContext | object | `{}` | | 36 | | replicaCount | int | `1` | replica count | 37 | | resources | object | `{}` | cpu memory resource config | 38 | | securityContext | object | `{}` | security context | 39 | | serviceAccount.annotations | object | `{}` | Annotations to add to the service account | 40 | | serviceAccount.create | bool | `true` | Specifies whether a service account should be created | 41 | | serviceAccount.name | string | `""` | If not set and create is true, a name is generated using the fullname template | 42 | | tolerations | list | `[]` | tolerations | 43 | 44 | ---------------------------------------------- 45 | Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0) 46 | -------------------------------------------------------------------------------- /charts/mlflow-controller/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "mlflow-controller.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 
12 | */}} 13 | {{- define "mlflow-controller.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "mlflow-controller.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "mlflow-controller.labels" -}} 37 | helm.sh/chart: {{ include "mlflow-controller.chart" . }} 38 | {{ include "mlflow-controller.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "mlflow-controller.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "mlflow-controller.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "mlflow-controller.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "mlflow-controller.fullname" .) 
.Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | -------------------------------------------------------------------------------- /charts/mlflow-controller/templates/deployment-ui.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.ui.enabled }} 2 | apiVersion: apps/v1 3 | kind: Deployment 4 | metadata: 5 | name: {{ include "mlflow-controller.fullname" . }}-ui 6 | labels: 7 | app: mlflow-controller-ui 8 | spec: 9 | {{- if not .Values.autoscaling.enabled }} 10 | replicas: {{ .Values.replicaCount }} 11 | {{- end }} 12 | selector: 13 | matchLabels: 14 | app: mlflow-controller-ui 15 | template: 16 | metadata: 17 | {{- with .Values.podAnnotations }} 18 | annotations: 19 | {{- toYaml . | nindent 8 }} 20 | {{- end }} 21 | labels: 22 | app: mlflow-controller-ui 23 | spec: 24 | {{- with .Values.imagePullSecrets }} 25 | imagePullSecrets: 26 | {{- toYaml . | nindent 8 }} 27 | {{- end }} 28 | serviceAccountName: {{ include "mlflow-controller.serviceAccountName" . }} 29 | securityContext: 30 | {{- toYaml .Values.podSecurityContext | nindent 8 }} 31 | containers: 32 | - name: {{ .Chart.Name }} 33 | securityContext: 34 | {{- toYaml .Values.securityContext | nindent 12 }} 35 | image: "{{ .Values.ui.image.repository }}:{{ .Values.ui.image.tag | default .Chart.AppVersion }}" 36 | imagePullPolicy: {{ .Values.image.pullPolicy }} 37 | env: 38 | - name: seldon_url 39 | value: {{ .Values.ui.seldon_url }} 40 | - name: namespace 41 | value: {{ .Values.mlflow.namespace }} 42 | ports: 43 | - containerPort: 8000 44 | name: http 45 | resources: 46 | {{- toYaml .Values.resources | nindent 12 }} 47 | {{- with .Values.nodeSelector }} 48 | nodeSelector: 49 | {{- toYaml . | nindent 8 }} 50 | {{- end }} 51 | {{- with .Values.affinity }} 52 | affinity: 53 | {{- toYaml . 
| nindent 8 }} 54 | {{- end }} 55 | {{- with .Values.tolerations }} 56 | tolerations: 57 | {{- toYaml . | nindent 8 }} 58 | {{- end }} 59 | {{- end }} 60 | -------------------------------------------------------------------------------- /charts/mlflow-controller/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{ include "mlflow-controller.fullname" . }} 5 | labels: 6 | {{- include "mlflow-controller.labels" . | nindent 4 }} 7 | spec: 8 | replicas: {{ .Values.replicaCount }} 9 | selector: 10 | matchLabels: 11 | {{- include "mlflow-controller.selectorLabels" . | nindent 6 }} 12 | template: 13 | metadata: 14 | {{- with .Values.podAnnotations }} 15 | annotations: 16 | {{- toYaml . | nindent 8 }} 17 | {{- end }} 18 | labels: 19 | {{- include "mlflow-controller.selectorLabels" . | nindent 8 }} 20 | spec: 21 | {{- with .Values.imagePullSecrets }} 22 | imagePullSecrets: 23 | {{- toYaml . | nindent 8 }} 24 | {{- end }} 25 | serviceAccountName: {{ include "mlflow-controller.serviceAccountName" . }} 26 | securityContext: 27 | {{- toYaml .Values.podSecurityContext | nindent 8 }} 28 | containers: 29 | - name: {{ .Chart.Name }} 30 | securityContext: 31 | {{- toYaml .Values.securityContext | nindent 12 }} 32 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" 33 | imagePullPolicy: {{ .Values.image.pullPolicy }} 34 | resources: 35 | {{- toYaml .Values.resources | nindent 12 }} 36 | env: 37 | - name: ML_SERVER 38 | value: {{ .Values.mlserver }} 39 | {{- if .Values.gitops.gitPasswordSecretName }} 40 | - name: GIT_PASSWORD 41 | valueFrom: 42 | secretKeyRef: 43 | name: {{ .Values.gitops.gitPasswordSecretName }} 44 | key: {{ .Values.gitops.gitPasswordSecretKey }} 45 | optional: false 46 | {{- end }} 47 | envFrom: 48 | - configMapRef: 49 | name: {{ include "mlflow-controller.fullname" . 
}}-mlflow-cm 50 | {{- if .Values.envFromSecret }} 51 | - secretRef: 52 | name: {{ .Values.envFromSecret }} 53 | {{- end }} 54 | {{- if .Values.gitops.enabled }} 55 | - configMapRef: 56 | name: {{ include "mlflow-controller.fullname" . }}-gitops-cm 57 | {{- end }} 58 | {{- with .Values.nodeSelector }} 59 | nodeSelector: 60 | {{- toYaml . | nindent 8 }} 61 | {{- end }} 62 | {{- with .Values.affinity }} 63 | affinity: 64 | {{- toYaml . | nindent 8 }} 65 | {{- end }} 66 | {{- with .Values.tolerations }} 67 | tolerations: 68 | {{- toYaml . | nindent 8 }} 69 | {{- end }} 70 | -------------------------------------------------------------------------------- /charts/mlflow-controller/templates/gitops-cm.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.gitops.enabled }} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ include "mlflow-controller.fullname" . }}-gitops-cm 6 | labels: 7 | {{- include "mlflow-controller.labels" . | nindent 4 }} 8 | data: 9 | GIT_USER: {{ .Values.gitops.gitUser }} 10 | MANIFEST_LOCATION: {{ .Values.gitops.deploymentLocation }} 11 | GIT_REPO: {{ .Values.gitops.repository }} 12 | BRANCH: {{ .Values.gitops.BRANCH }} 13 | GITOPS_ENABLED: "True" 14 | GIT_PROTOCOL: {{ .Values.gitops.protocol }} 15 | {{- end }} -------------------------------------------------------------------------------- /charts/mlflow-controller/templates/ingress.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.ingress.enabled -}} 2 | {{- $fullName := include "mlflow-controller.fullname" . 
-}} 3 | {{- $svcPort := .Values.service.port -}} 4 | {{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} 5 | {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} 6 | {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} 7 | {{- end }} 8 | {{- end }} 9 | {{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} 10 | apiVersion: networking.k8s.io/v1 11 | {{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} 12 | apiVersion: networking.k8s.io/v1beta1 13 | {{- else -}} 14 | apiVersion: extensions/v1beta1 15 | {{- end }} 16 | kind: Ingress 17 | metadata: 18 | name: {{ $fullName }} 19 | labels: 20 | app: mlflow-controller-ui 21 | {{- with .Values.ingress.annotations }} 22 | annotations: 23 | {{- toYaml . | nindent 4 }} 24 | {{- end }} 25 | spec: 26 | {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} 27 | ingressClassName: {{ .Values.ingress.className }} 28 | {{- end }} 29 | {{- if .Values.ingress.tls }} 30 | tls: 31 | {{- range .Values.ingress.tls }} 32 | - hosts: 33 | {{- range .hosts }} 34 | - {{ . 
| quote }} 35 | {{- end }} 36 | secretName: {{ .secretName }} 37 | {{- end }} 38 | {{- end }} 39 | rules: 40 | {{- range .Values.ingress.hosts }} 41 | - host: {{ .host | quote }} 42 | http: 43 | paths: 44 | {{- range .paths }} 45 | - path: {{ .path }} 46 | {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} 47 | pathType: {{ .pathType }} 48 | {{- end }} 49 | backend: 50 | {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} 51 | service: 52 | name: {{ $fullName }}-ui 53 | port: 54 | number: {{ $svcPort }} 55 | {{- else }} 56 | serviceName: {{ $fullName }} 57 | servicePort: {{ $svcPort }} 58 | {{- end }} 59 | {{- end }} 60 | {{- end }} 61 | {{- end }} 62 | -------------------------------------------------------------------------------- /charts/mlflow-controller/templates/mlflow-cm.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.mlflow.enabled }} 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ include "mlflow-controller.fullname" . }}-mlflow-cm 6 | labels: 7 | {{- include "mlflow-controller.labels" . | nindent 4 }} 8 | data: 9 | MLFLOW_TRACKING_URI: {{ .Values.mlflow.MLFLOW_TRACKING_URI }} 10 | stage: {{ .Values.mlflow.stage }} 11 | namespace: {{ .Values.mlflow.namespace }} 12 | backend: {{ .Values.mlflow.backend }} 13 | MLFLOW_ENABLED: "True" 14 | {{- end }} -------------------------------------------------------------------------------- /charts/mlflow-controller/templates/service.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.ui.enabled }} 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: {{ include "mlflow-controller.fullname" . 
}}-ui 6 | labels: 7 | app: mlflow-controller-ui 8 | spec: 9 | type: {{ .Values.service.type }} 10 | ports: 11 | - port: {{ .Values.service.port }} 12 | targetPort: http 13 | protocol: TCP 14 | name: http 15 | selector: 16 | app: mlflow-controller-ui 17 | {{- end }} 18 | -------------------------------------------------------------------------------- /charts/mlflow-controller/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.create -}} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "mlflow-controller.serviceAccountName" . }} 6 | labels: 7 | {{- include "mlflow-controller.labels" . | nindent 4 }} 8 | {{- with .Values.serviceAccount.annotations }} 9 | annotations: 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | 13 | --- 14 | 15 | apiVersion: rbac.authorization.k8s.io/v1 16 | kind: ClusterRole 17 | metadata: 18 | name: {{ include "mlflow-controller.serviceAccountName" . 
}} 19 | rules: 20 | - apiGroups: 21 | - machinelearning.seldon.io 22 | resources: 23 | - seldondeployments 24 | verbs: 25 | - get 26 | - list 27 | - watch 28 | - create 29 | - delete 30 | - deletecollection 31 | - patch 32 | - update 33 | - apiGroups: 34 | - "apps" 35 | resources: 36 | - deployments 37 | verbs: 38 | - get 39 | - list 40 | - apiGroups: 41 | - serving.kserve.io 42 | resources: 43 | - inferenceservices 44 | - inferenceservices/status 45 | verbs: 46 | - get 47 | - list 48 | - watch 49 | - create 50 | - delete 51 | - deletecollection 52 | - patch 53 | - update 54 | - apiGroups: 55 | - serving.knative.dev 56 | resources: 57 | - services 58 | - services/status 59 | - routes 60 | - routes/status 61 | - configurations 62 | - configurations/status 63 | - revisions 64 | - revisions/status 65 | verbs: 66 | - get 67 | - list 68 | --- 69 | 70 | apiVersion: rbac.authorization.k8s.io/v1 71 | kind: ClusterRoleBinding 72 | metadata: 73 | name: {{ include "mlflow-controller.serviceAccountName" . }} 74 | roleRef: 75 | apiGroup: rbac.authorization.k8s.io 76 | kind: ClusterRole 77 | name: {{ include "mlflow-controller.serviceAccountName" . }} 78 | subjects: 79 | - kind: ServiceAccount 80 | name: {{ include "mlflow-controller.serviceAccountName" . }} 81 | namespace: "{{.Release.Namespace}}" 82 | 83 | {{- end }} 84 | -------------------------------------------------------------------------------- /charts/mlflow-controller/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for mlflow-controller. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | # -- replica count 5 | replicaCount: 1 6 | 7 | image: 8 | # -- image repository 9 | repository: tachyongroup/mlflow-deployment-controller 10 | # -- image pull policy 11 | pullPolicy: Always 12 | # -- image tag 13 | tag: "mlflow-controller-0.1.8" 14 | 15 | imagePullSecrets: [] 16 | nameOverride: "" 17 | fullnameOverride: "" 18 | 19 | ui: 20 | enabled: true 21 | 22 | seldon_url: https://seldon.mlops.wianai.com 23 | image: 24 | repository: tachyongroup/mlflow-deployment-controller-ui 25 | tag: "mlflow-controller-0.1.8" 26 | pullPolicy: Always 27 | 28 | mlflow: 29 | enabled: true 30 | # -- mlflow tracking uri 31 | MLFLOW_TRACKING_URI: http://mlflow-service:5000 32 | # -- Stage To be Tracked From Mlflow 33 | stage: Staging 34 | # -- Namespace model to be deployed 35 | namespace: staging 36 | # -- Object Storage Used by mlflow supported gcs , blob , s3 37 | backend: "blob" 38 | 39 | # -- mlserver one of [seldon, kserve] 40 | mlserver: seldon 41 | 42 | gitops: 43 | # -- enable/disable gitops 44 | enabled: true 45 | # -- git repository 46 | repository: github.com/rocket9-code/model-deployments 47 | # -- git repo protocol 48 | protocol: https 49 | # -- deployment files folder location 50 | deploymentLocation: staging/ 51 | # -- git username 52 | gitUser: raghulkrishna 53 | # -- git password secret name 54 | gitPasswordSecretName: "github-secret" 55 | # -- git password secret key 56 | gitPasswordSecretKey: "githubtoken" 57 | # git branch to be tracked 58 | BRANCH: main 59 | 60 | serviceAccount: 61 | # -- Specifies whether a service account should be created 62 | create: true 63 | # -- Annotations to add to the service account 64 | annotations: {} 65 | # -- The name of the service account to use. 
66 | # -- If not set and create is true, a name is generated using the fullname template 67 | name: "" 68 | # -- pod annotations 69 | podAnnotations: {} 70 | # pod security context 71 | podSecurityContext: {} 72 | # fsGroup: 2000 73 | # -- additional ENV from secret 74 | envFromSecret: "" 75 | # -- security context 76 | securityContext: {} 77 | # capabilities: 78 | # drop: 79 | # - ALL 80 | # readOnlyRootFilesystem: true 81 | # runAsNonRoot: true 82 | # runAsUser: 1000 83 | service: 84 | type: ClusterIP 85 | port: 8000 86 | 87 | ingress: 88 | enabled: true 89 | className: "nginx" 90 | annotations: {} 91 | # kubernetes.io/ingress.class: nginx 92 | # kubernetes.io/tls-acme: "true" 93 | hosts: 94 | - host: mdcv2.mlops.wianai.com 95 | paths: 96 | - path: / 97 | pathType: ImplementationSpecific 98 | tls: 99 | - secretName: mdcv2.mlops.wianai.com 100 | hosts: 101 | - aui-secret 102 | 103 | # -- cpu memory resource config 104 | resources: {} 105 | # We usually recommend not to specify default resources and to leave this as a conscious 106 | # choice for the user. This also increases chances charts run on environments with little 107 | # resources, such as Minikube. If you do want to specify resources, uncomment the following 108 | # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
109 | # limits: 110 | # cpu: 100m 111 | # memory: 128Mi 112 | # requests: 113 | # cpu: 100m 114 | # memory: 128Mi 115 | autoscaling: 116 | enabled: false 117 | minReplicas: 1 118 | maxReplicas: 100 119 | targetCPUUtilizationPercentage: 80 120 | # targetMemoryUtilizationPercentage: 80 121 | 122 | # -- node selector 123 | nodeSelector: {} 124 | # -- tolerations 125 | tolerations: [] 126 | # -- affinity 127 | affinity: {} 128 | -------------------------------------------------------------------------------- /doc/Mlflow Deployment controller.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /doc/doc.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /doc/gitops.md: -------------------------------------------------------------------------------- 1 | ## Gitops based deployment controller 2 | 3 | The GitOps-based deployment controller helps to version control Seldon deployments, as well as the models in ML registries, in an automated way. 4 | The controller expects a templated variable in place of the modelUri in the deployment files, which the controller replaces with the latest version 5 | available from the registry for a given stage. For example, if a controller is set up for the production namespace and the Production stage in MLflow and watches the git repository 6 | under the folder production, it will get the manifests from that folder of the git repo and the latest version from MLflow and deploy the model servers. 
7 | 8 | Screenshot 2022-12-17 at 6 33 32 PM 9 | 10 | Create a new repository for the deployment controller and add a Seldon manifest to it. In place of the modelUri, use a template such as '{{ mlflow.blob["iris demo1"] }}' 11 | to specify the model metadata. The syntax of the template is {{ registry.backend["MODEL NAME IN REGISTRY"] }} 12 | 13 | Example deployment file deploying multiple models in seldon-core 14 |
15 | Expand me 16 | 17 | ``` 18 | apiVersion: machinelearning.seldon.io/v1 19 | kind: SeldonDeployment 20 | metadata: 21 | name: mlflow-var 22 | spec: 23 | name: iris 24 | predictors: 25 | - graph: 26 | children: 27 | - name: step-one 28 | modelUri: '{{ mlflow.blob["iris demo1"] }}' 29 | envSecretRefName: seldon-rclone-secret 30 | implementation: MLFLOW_SERVER 31 | type: MODEL 32 | children: 33 | - name: step-two 34 | modelUri: '{{ mlflow.blob["iris demo2"] }}' 35 | envSecretRefName: seldon-rclone-secret 36 | implementation: MLFLOW_SERVER 37 | type: MODEL 38 | children: [] 39 | - name: step-three 40 | implementation: MLFLOW_SERVER 41 | modelUri: '{{ mlflow.blob["iris demo3"] }}' 42 | envSecretRefName: seldon-rclone-secret 43 | type: MODEL 44 | children: [] 45 | implementation: MLFLOW_SERVER 46 | modelUri: '{{ mlflow.blob["iris demo4"] }}' 47 | envSecretRefName: seldon-rclone-secret 48 | logger: 49 | url: http://broker-ingress.knative-eventing.svc.cluster.local/demo/default 50 | mode: all 51 | name: classifier 52 | name: default 53 | replicas: 1 54 | ``` 55 |
56 | 57 | 58 | The template values are replaced by the controller with the latest model version from the registry, as shown below, and the result is submitted to the Kubernetes API 59 | 60 |
61 | Expand me 62 | 63 | ``` 64 | apiVersion: machinelearning.seldon.io/v1 65 | kind: SeldonDeployment 66 | metadata: 67 | name: mlflow-var 68 | namespace: staging 69 | spec: 70 | name: iris 71 | predictors: 72 | - graph: 73 | children: 74 | - children: 75 | - children: [] 76 | envSecretRefName: seldon-rclone-secret 77 | implementation: MLFLOW_SERVER 78 | modelUri: '{{ mlflow.blob["iris demo2"] }}' 79 | name: step-two 80 | type: MODEL 81 | envSecretRefName: seldon-rclone-secret 82 | implementation: MLFLOW_SERVER 83 | modelUri: '{{ mlflow.blob["iris demo1"] }}' 84 | name: step-one 85 | type: MODEL 86 | - children: [] 87 | envSecretRefName: seldon-rclone-secret 88 | implementation: MLFLOW_SERVER 89 | modelUri: >- 90 | wasbs://artifacts/mlflow/10/262bee84b7dd4b039973084383880b57/artifacts/model 91 | name: step-three 92 | type: MODEL 93 | envSecretRefName: seldon-rclone-secret 94 | implementation: MLFLOW_SERVER 95 | logger: 96 | mode: all 97 | url: >- 98 | http://broker-ingress.knative-eventing.svc.cluster.local/demo/default 99 | modelUri: '{{ mlflow.blob["iris demo4"] }}' 100 | name: classifier 101 | name: default 102 | ``` 103 |
104 | 105 | 106 | To enable gitops in the controller 107 | 108 | ``` 109 | ! git clone -b gitops-enable https://github.com/rocket9-code/mlflow-deployment-controller 110 | 111 | ! helm install mlflow-controller mlflow-deployment-controller/charts/mlflow-controller -n mlflow --set gitops.enabled=true 112 | ``` 113 | Supported values 114 | registries: mlflow 115 | backend: blob , gcs , s3 116 | 117 | Future releases may add support for azureml registries and databricks mlflow 118 | 119 | Support matrix 120 | | Ml endpoints | Seldon core | Kserve | Databricks | Azure ml | Vertex AI | SageMaker | 121 | |-----|---------|---------|---------|---------|---------|---------| 122 | | Registries | | | | | | | 123 | | mlflow oss gcs | :white_check_mark: | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | 124 | | mlflow oss blob | :white_check_mark: | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | 125 | | mlflow oss s3 | :white_check_mark: | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | 126 | | databricks mlflow| ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | 127 | | databricks azureml | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | 128 | | vertexai registry | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | ✖️ (in roadmap) | 129 | 130 | 131 | ## To Setup Deployment controller in different environments with Gitops Enabled 132 | 133 | ### For Staging environment 134 | 135 | Deployment controller will look for yaml files in staging folder and model in Mlflow Staging Environment and deploys the model in staging Namespace 136 | 137 | ```bash 138 | $ helm repo add f9n-code https://f9n-code.github.io/mlflow-deployment-controller/ 139 | 140 | $ helm install mlflow-controller-deployment-staging 
f9n-code/mlflow-controller-deployment --set gitops.enabled=true \ 141 | --set gitops.repository=github.com/rocket9-code/model-deployments \ 142 | --set gitops.deploymentLocation=staging --set mlflow.stage=Staging \ 143 | --set mlflow.namespace=staging 144 | 145 | ``` 146 | 147 | ### For Production environment 148 | 149 | Deployment controller will look for yaml files in production folder and model in Mlflow Production Environment and deploys the model in production Namespace 150 | 151 | ```bash 152 | $ helm repo add f9n-code https://f9n-code.github.io/helm-charts 153 | 154 | $ helm install mlflow-controller-deployment-production f9n-code/mlflow-controller-deployment --set gitops.enabled=true \ 155 | --set gitops.repository=github.com/rocket9-code/model-deployments \ 156 | --set gitops.deploymentLocation=production --set mlflow.stage=Production \ 157 | --set mlflow.namespace=production 158 | 159 | ``` 160 | -------------------------------------------------------------------------------- /examples/argo-manifest/mlflow-controller-production.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Application 3 | metadata: 4 | name: mlflow-deployment-controller-production 5 | namespace: argocd 6 | finalizers: 7 | - resources-finalizer.argocd.argoproj.io 8 | spec: 9 | project: default 10 | source: 11 | repoURL: https://github.com/wianai/mlflow-deployment-controller 12 | path: charts/mlflow-controller 13 | targetRevision: main 14 | helm: 15 | releaseName: mlflow-deployment-controller-production 16 | parameters: 17 | - name: "mlflow.stage" 18 | value: "Production" 19 | - name: "mlflow.namespace" 20 | value: "production" 21 | syncPolicy: 22 | automated: 23 | prune: true 24 | allowEmpty: true 25 | selfHeal: true 26 | destination: 27 | server: "https://kubernetes.default.svc" 28 | namespace: mlflow -------------------------------------------------------------------------------- 
/examples/argo-manifest/mlflow-controller.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Application 3 | metadata: 4 | name: mlflow-deployment-controller-staging 5 | namespace: argocd 6 | finalizers: 7 | - resources-finalizer.argocd.argoproj.io 8 | spec: 9 | project: default 10 | source: 11 | repoURL: https://github.com/wianai/mlflow-deployment-controller 12 | path: charts/mlflow-controller 13 | targetRevision: main 14 | helm: 15 | releaseName: mlflow-deployment-controller-staging 16 | parameters: 17 | - name: "mlflow.stage" 18 | value: "Staging" 19 | - name: "mlflow.namespace" 20 | value: "staging" 21 | syncPolicy: 22 | automated: 23 | prune: true 24 | allowEmpty: true 25 | selfHeal: true 26 | destination: 27 | server: "https://kubernetes.default.svc" 28 | namespace: mlflow -------------------------------------------------------------------------------- /examples/argo-manifest/mlflow.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Application 3 | metadata: 4 | name: mlflow 5 | namespace: argocd 6 | finalizers: 7 | - resources-finalizer.argocd.argoproj.io 8 | spec: 9 | project: default 10 | source: 11 | repoURL: https://github.com/wianai/hello-mlflow 12 | path: charts/mlflow 13 | targetRevision: main 14 | helm: 15 | releaseName: mlflow 16 | parameters: 17 | - name: "artifact.ArtifactRoot" 18 | value: "gs://wian-ai-lab-mlflow/mlflow_artifacts/" 19 | syncPolicy: 20 | automated: 21 | prune: true 22 | allowEmpty: true 23 | selfHeal: true 24 | destination: 25 | server: "https://kubernetes.default.svc" 26 | namespace: mlflow -------------------------------------------------------------------------------- /examples/argo-manifest/seldon-core.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Application 3 | metadata: 4 | 
name: seldon-core-operator 5 | namespace: argocd 6 | finalizers: 7 | - resources-finalizer.argocd.argoproj.io 8 | spec: 9 | project: default 10 | source: 11 | repoURL: https://storage.googleapis.com/seldon-charts 12 | chart: seldon-core-operator 13 | targetRevision: 1.14.0 14 | helm: 15 | releaseName: seldon-core-operator 16 | parameters: 17 | - name: "usageMetrics.enabled" 18 | value: "false" 19 | - name: "istio.enabled" 20 | value: "true" 21 | syncPolicy: 22 | syncOptions: 23 | - CreateNamespace=true 24 | automated: 25 | prune: true 26 | allowEmpty: true 27 | selfHeal: true 28 | destination: 29 | server: "https://kubernetes.default.svc" 30 | namespace: seldon-system 31 | -------------------------------------------------------------------------------- /examples/gitops/gitops.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Gitops example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Install deployment controller with gitops enabled" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 8, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "NAME: mlflow-controller-staging\n", 27 | "LAST DEPLOYED: Mon Dec 19 14:29:32 2022\n", 28 | "NAMESPACE: mlflow\n", 29 | "STATUS: deployed\n", 30 | "REVISION: 1\n", 31 | "TEST SUITE: None\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "! 
helm install mlflow-controller-staging ../../charts/mlflow-controller -n mlflow --set image.tag=f20fd19f28f1f39ced794e0a2f7736f403447d91 --set gitops.enabled=true --set mlflow.backend=blob --set gitops.repository=github.com/rocket9-code/model-deployments --set gitops.deploymentLocation=staging --set mlflow.stage=Staging \\--set mlflow.namespace=staging" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 11, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "pod/mlflow-controller-staging-787fd66687-gxl8z condition met\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "! kubectl wait --for=condition=ready pod -l 'app.kubernetes.io/instance in (mlflow-controller-staging)' --timeout=180s -n mlflow" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "! kubectl port-forward -n mlflow svc/mlflow-service 5000:5000 " 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "# Register Mlflow models" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 12, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n", 82 | "0 5.1 3.5 1.4 0.2 \n", 83 | "1 4.9 3.0 1.4 0.2 \n", 84 | "2 4.7 3.2 1.3 0.2 \n", 85 | "3 4.6 3.1 1.5 0.2 \n", 86 | "4 5.0 3.6 1.4 0.2 \n", 87 | "\n", 88 | " target \n", 89 | "0 0 \n", 90 | "1 0 \n", 91 | "2 0 \n", 92 | "3 0 \n", 93 | "4 0 \n", 94 | "IRIS train df shape\n", 95 | "(105, 4)\n", 96 | "(105,)\n", 97 | "IRIS test df shape\n", 98 | "(45, 4)\n", 99 | "(45,)\n" 100 | ] 101 | }, 102 | { 103 | "name": "stderr", 104 | "output_type": "stream", 105 | "text": [ 106 | "Registered model 'iris demo0' already exists. 
Creating a new version of this model...\n", 107 | "2022/12/19 14:32:24 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: iris demo0, version 2\n", 108 | "Created version '2' of model 'iris demo0'.\n" 109 | ] 110 | }, 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n", 116 | "0 5.1 3.5 1.4 0.2 \n", 117 | "1 4.9 3.0 1.4 0.2 \n", 118 | "2 4.7 3.2 1.3 0.2 \n", 119 | "3 4.6 3.1 1.5 0.2 \n", 120 | "4 5.0 3.6 1.4 0.2 \n", 121 | "\n", 122 | " target \n", 123 | "0 0 \n", 124 | "1 0 \n", 125 | "2 0 \n", 126 | "3 0 \n", 127 | "4 0 \n", 128 | "IRIS train df shape\n", 129 | "(105, 4)\n", 130 | "(105,)\n", 131 | "IRIS test df shape\n", 132 | "(45, 4)\n", 133 | "(45,)\n" 134 | ] 135 | }, 136 | { 137 | "name": "stderr", 138 | "output_type": "stream", 139 | "text": [ 140 | "Registered model 'iris demo1' already exists. Creating a new version of this model...\n", 141 | "2022/12/19 14:32:44 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: iris demo1, version 2\n", 142 | "Created version '2' of model 'iris demo1'.\n" 143 | ] 144 | }, 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n", 150 | "0 5.1 3.5 1.4 0.2 \n", 151 | "1 4.9 3.0 1.4 0.2 \n", 152 | "2 4.7 3.2 1.3 0.2 \n", 153 | "3 4.6 3.1 1.5 0.2 \n", 154 | "4 5.0 3.6 1.4 0.2 \n", 155 | "\n", 156 | " target \n", 157 | "0 0 \n", 158 | "1 0 \n", 159 | "2 0 \n", 160 | "3 0 \n", 161 | "4 0 \n", 162 | "IRIS train df shape\n", 163 | "(105, 4)\n", 164 | "(105,)\n", 165 | "IRIS test df shape\n", 166 | "(45, 4)\n", 167 | "(45,)\n" 168 | ] 169 | }, 170 | { 171 | "name": "stderr", 172 | "output_type": "stream", 173 | "text": [ 174 | "Registered model 'iris demo2' already exists. 
Creating a new version of this model...\n", 175 | "2022/12/19 14:33:02 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: iris demo2, version 2\n", 176 | "Created version '2' of model 'iris demo2'.\n" 177 | ] 178 | }, 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n", 184 | "0 5.1 3.5 1.4 0.2 \n", 185 | "1 4.9 3.0 1.4 0.2 \n", 186 | "2 4.7 3.2 1.3 0.2 \n", 187 | "3 4.6 3.1 1.5 0.2 \n", 188 | "4 5.0 3.6 1.4 0.2 \n", 189 | "\n", 190 | " target \n", 191 | "0 0 \n", 192 | "1 0 \n", 193 | "2 0 \n", 194 | "3 0 \n", 195 | "4 0 \n", 196 | "IRIS train df shape\n", 197 | "(105, 4)\n", 198 | "(105,)\n", 199 | "IRIS test df shape\n", 200 | "(45, 4)\n", 201 | "(45,)\n" 202 | ] 203 | }, 204 | { 205 | "name": "stderr", 206 | "output_type": "stream", 207 | "text": [ 208 | "Registered model 'iris demo3' already exists. Creating a new version of this model...\n", 209 | "2022/12/19 14:33:18 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: iris demo3, version 2\n", 210 | "Created version '2' of model 'iris demo3'.\n" 211 | ] 212 | }, 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n", 218 | "0 5.1 3.5 1.4 0.2 \n", 219 | "1 4.9 3.0 1.4 0.2 \n", 220 | "2 4.7 3.2 1.3 0.2 \n", 221 | "3 4.6 3.1 1.5 0.2 \n", 222 | "4 5.0 3.6 1.4 0.2 \n", 223 | "\n", 224 | " target \n", 225 | "0 0 \n", 226 | "1 0 \n", 227 | "2 0 \n", 228 | "3 0 \n", 229 | "4 0 \n", 230 | "IRIS train df shape\n", 231 | "(105, 4)\n", 232 | "(105,)\n", 233 | "IRIS test df shape\n", 234 | "(45, 4)\n", 235 | "(45,)\n" 236 | ] 237 | }, 238 | { 239 | "name": "stderr", 240 | "output_type": "stream", 241 | "text": [ 242 | "Registered model 'iris demo4' already exists. 
Creating a new version of this model...\n", 243 | "2022/12/19 14:33:35 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: iris demo4, version 2\n", 244 | "Created version '2' of model 'iris demo4'.\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "import os\n", 250 | "\n", 251 | "import mlflow\n", 252 | "import mlflow.sklearn\n", 253 | "import pandas as pd\n", 254 | "from minio import Minio\n", 255 | "from mlflow.tracking import MlflowClient\n", 256 | "from sklearn import datasets\n", 257 | "from sklearn.ensemble import RandomForestClassifier\n", 258 | "from sklearn.metrics import roc_auc_score\n", 259 | "from sklearn.model_selection import train_test_split\n", 260 | "\n", 261 | "\n", 262 | "os.environ[\"MLFLOW_TRACKING_URI\"] = \"http://localhost:5000\"\n", 263 | "os.environ[\"AZURE_STORAGE_ACCESS_KEY\"] = \"\"\n", 264 | "os.environ[\"AZURE_STORAGE_CONNECTION_STRING\"] = \"\"\n", 265 | "\n", 266 | "\n", 267 | "def main(MODEL_NAME=\"iris gitops\", stage=\"Staging\"):\n", 268 | " iris = datasets.load_iris()\n", 269 | " iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)\n", 270 | " y = iris.target\n", 271 | " iris_df[\"target\"] = y\n", 272 | "\n", 273 | " print(iris_df.head())\n", 274 | "\n", 275 | " train_df, test_df = train_test_split(\n", 276 | " iris_df, test_size=0.3, random_state=42, stratify=iris_df[\"target\"]\n", 277 | " )\n", 278 | " X_train = train_df[\n", 279 | " [\n", 280 | " \"sepal length (cm)\",\n", 281 | " \"sepal width (cm)\",\n", 282 | " \"petal length (cm)\",\n", 283 | " \"petal width (cm)\",\n", 284 | " ]\n", 285 | " ]\n", 286 | " y_train = train_df[\"target\"]\n", 287 | "\n", 288 | " X_test = test_df[\n", 289 | " [\n", 290 | " \"sepal length (cm)\",\n", 291 | " \"sepal width (cm)\",\n", 292 | " \"petal length (cm)\",\n", 293 | " \"petal width (cm)\",\n", 294 | " ]\n", 295 | " ]\n", 296 | " y_test = test_df[\"target\"]\n", 297 | "\n", 298 | " EXPERIMENT_NAME = 
MODEL_NAME\n", 299 | "\n", 300 | " print(\"IRIS train df shape\")\n", 301 | " print(X_train.shape)\n", 302 | " print(y_train.shape)\n", 303 | "\n", 304 | " print(\"IRIS test df shape\")\n", 305 | " print(X_test.shape)\n", 306 | " print(y_test.shape)\n", 307 | "\n", 308 | " mlflow_client = MlflowClient()\n", 309 | "\n", 310 | " # Create an MLFlow experiment, if not already exists\n", 311 | " experiment_details = mlflow_client.get_experiment_by_name(EXPERIMENT_NAME)\n", 312 | "\n", 313 | " if experiment_details is not None:\n", 314 | " experiment_id = experiment_details.experiment_id\n", 315 | " else:\n", 316 | " experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)\n", 317 | "\n", 318 | " # Start an MLFlow experiment run\n", 319 | " with mlflow.start_run(\n", 320 | " experiment_id=experiment_id, run_name=\"iris dataset rf run\"\n", 321 | " ) as run:\n", 322 | " # Log parameters\n", 323 | "\n", 324 | " mlflow.log_param(\"max_depth\", 10)\n", 325 | " mlflow.log_param(\"random_state\", 0)\n", 326 | " mlflow.log_param(\"n_estimators\", 100)\n", 327 | " clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)\n", 328 | " clf.fit(X_train, y_train)\n", 329 | " iris_predict_y = clf.predict(X_test)\n", 330 | "\n", 331 | " roc_auc_score_val = roc_auc_score(\n", 332 | " y_test, clf.predict_proba(X_test), multi_class=\"ovr\"\n", 333 | " )\n", 334 | " mlflow.log_metric(\"test roc_auc_score\", roc_auc_score_val)\n", 335 | "\n", 336 | " # Log model\n", 337 | " result = mlflow.sklearn.log_model(clf, artifact_path=\"model\")\n", 338 | "\n", 339 | " # Register a new version\n", 340 | " result = mlflow.register_model(result.model_uri, MODEL_NAME)\n", 341 | "\n", 342 | " client = MlflowClient()\n", 343 | " client.transition_model_version_stage(\n", 344 | " name=MODEL_NAME, version=result.version, stage=stage\n", 345 | " )\n", 346 | "\n", 347 | "\n", 348 | "for i in range(5):\n", 349 | " main(MODEL_NAME=f\"iris demo{i}\")" 350 | ] 351 | }, 352 | { 353 | 
"cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "# write deployment file and commit to git repository" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "! git clone https://github.com/rocket9-code/model-deployments" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "dep_yaml = \"\"\"apiVersion: machinelearning.seldon.io/v1\n", 375 | "kind: SeldonDeployment\n", 376 | "metadata:\n", 377 | " name: mlflow-var-test1\n", 378 | "spec:\n", 379 | " name: iris\n", 380 | " predictors:\n", 381 | " - graph:\n", 382 | " children:\n", 383 | " - name: step-one\n", 384 | " modelUri: '{{ mlflow.blob[\"iris demo1\"] }}'\n", 385 | " envSecretRefName: seldon-rclone-secret\n", 386 | " implementation: MLFLOW_SERVER\n", 387 | " type: MODEL\n", 388 | " children: \n", 389 | " - name: step-two\n", 390 | " modelUri: '{{ mlflow.blob[\"iris demo2\"] }}'\n", 391 | " envSecretRefName: seldon-rclone-secret\n", 392 | " implementation: MLFLOW_SERVER\n", 393 | " type: MODEL\n", 394 | " children: []\n", 395 | " - name: step-three\n", 396 | " implementation: MLFLOW_SERVER\n", 397 | " modelUri: '{{ mlflow.blob[\"iris demo3\"] }}'\n", 398 | " envSecretRefName: seldon-rclone-secret\n", 399 | " type: MODEL\n", 400 | " children: []\n", 401 | " implementation: MLFLOW_SERVER\n", 402 | " modelUri: '{{ mlflow.blob[\"iris demo4\"] }}'\n", 403 | " envSecretRefName: seldon-rclone-secret\n", 404 | " logger:\n", 405 | " url: http://broker-ingress.knative-eventing.svc.cluster.local/demo/default\n", 406 | " mode: all\n", 407 | " name: classifier\n", 408 | " name: default\n", 409 | " replicas: 1\"\"\"\n", 410 | "with open(\"model-deployments/staging/seldon-deploy-test1.yaml\", \"x\") as f:\n", 411 | " f.write(dep_yaml)" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 
null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "! cd model-deployments && git add staging/seldon-deploy-test1.yaml" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "! cd model-deployments && git commit -m \"test deploy yaml\" \n", 430 | "! cd model-deployments && git push" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "# Wait for the controller to pick up the changes and create a new deployment YAML" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 13, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "import time\n", 447 | "\n", 448 | "from kubernetes import client as KubeClient\n", 449 | "from kubernetes import config\n", 450 | "\n", 451 | "try:\n", 452 | " config.load_kube_config()\n", 453 | "except config.ConfigException:\n", 454 | " config.load_incluster_config()\n", 455 | "kube_client = KubeClient.CustomObjectsApi()" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "You can see that the controller updated the model URIs with the latest model versions." 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 15, 468 | "metadata": {}, 469 | "outputs": [ 470 | { 471 | "name": "stdout", 472 | "output_type": "stream", 473 | "text": [ 474 | "wasbs://artifacts/mlflow/8/4083c71c946e47e19422218b69a5d67c/artifacts/model wasbs://artifacts/mlflow/9/10e8b48f3cfc451da361fabccb6e1c08/artifacts/model wasbs://artifacts/mlflow/10/262bee84b7dd4b039973084383880b57/artifacts/model wasbs://artifacts/mlflow/11/0dd0c915e3e0446d9139fb81b0b6ad83/artifacts/model\n" 475 | ] 476 | } 477 | ], 478 | "source": [ 479 | "manifest = kube_client.get_namespaced_custom_object(\n", 480 | " group=\"machinelearning.seldon.io\",\n", 481 | " version=\"v1\",\n", 482 | " plural=\"seldondeployments\",\n", 483 | "
namespace=\"staging\",\n", 484 | " name=\"mlflow-var\",\n", 485 | ")\n", 486 | "demo1 = manifest[\"spec\"][\"predictors\"][0][\"graph\"][\"children\"][0][\"modelUri\"]\n", 487 | "demo2 = manifest[\"spec\"][\"predictors\"][0][\"graph\"][\"children\"][0][\"children\"][0][\n", 488 | " \"modelUri\"\n", 489 | "]\n", 490 | "demo3 = manifest[\"spec\"][\"predictors\"][0][\"graph\"][\"children\"][1][\"modelUri\"]\n", 491 | "demo4 = manifest[\"spec\"][\"predictors\"][0][\"graph\"][\"modelUri\"]\n", 492 | "\n", 493 | "print(demo1, demo2, demo3, demo4)" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 16, 499 | "metadata": {}, 500 | "outputs": [ 501 | { 502 | "name": "stdout", 503 | "output_type": "stream", 504 | "text": [ 505 | "release \"mlflow-controller-staging\" uninstalled\n" 506 | ] 507 | } 508 | ], 509 | "source": [ 510 | "! helm delete mlflow-controller-staging -n mlflow" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": null, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [] 519 | } 520 | ], 521 | "metadata": { 522 | "kernelspec": { 523 | "display_name": "Python 3", 524 | "language": "python", 525 | "name": "python3" 526 | }, 527 | "language_info": { 528 | "codemirror_mode": { 529 | "name": "ipython", 530 | "version": 3 531 | }, 532 | "file_extension": ".py", 533 | "mimetype": "text/x-python", 534 | "name": "python", 535 | "nbconvert_exporter": "python", 536 | "pygments_lexer": "ipython3", 537 | "version": "3.7.7" 538 | } 539 | }, 540 | "nbformat": 4, 541 | "nbformat_minor": 4 542 | } 543 | -------------------------------------------------------------------------------- /examples/notebook/deploy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: machinelearning.seldon.io/v1 2 | kind: SeldonDeployment 3 | metadata: 4 | name: mlflow 5 | labels: 6 | app.kubernetes.io/managed-by: mlflow-seldon 7 | app.kubernetes.io/name: mlflow 8 | spec: 9 | name: iris 
10 | predictors: 11 | - componentSpecs: 12 | - spec: 13 | # We are setting high failureThreshold as installing conda dependencies 14 | # can take long time and we want to avoid k8s killing the container prematurely 15 | containers: 16 | # - image: seldonio/mlflowserver:1.14.0-dev 17 | # imagePullPolicy: IfNotPresent 18 | # name: classifier 19 | - name: classifier 20 | livenessProbe: 21 | initialDelaySeconds: 800 22 | failureThreshold: 20000000 23 | periodSeconds: 25 24 | successThreshold: 1 25 | httpGet: 26 | path: /health/ping 27 | port: http 28 | scheme: HTTP 29 | readinessProbe: 30 | initialDelaySeconds: 800 31 | failureThreshold: 2000000 32 | periodSeconds: 25 33 | successThreshold: 1 34 | httpGet: 35 | path: /health/ping 36 | port: http 37 | scheme: HTTP 38 | 39 | graph: 40 | children: [] 41 | implementation: MLFLOW_SERVER 42 | modelUri: gs://hellomlops-mlflow/mlflow_artifacts/1/6887f98225b9419f9681d68e7cdd9335/artifacts/random-forest-model 43 | logger: 44 | url: http://broker-ingress.knative-eventing.svc.cluster.local/demo/default 45 | mode: all 46 | name: classifier 47 | name: default 48 | replicas: 1 -------------------------------------------------------------------------------- /examples/notebook/mlflow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "3bab55b3-a167-48c1-b3b2-0ca66f4c7c21", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n", 14 | "0 5.1 3.5 1.4 0.2 \n", 15 | "1 4.9 3.0 1.4 0.2 \n", 16 | "2 4.7 3.2 1.3 0.2 \n", 17 | "3 4.6 3.1 1.5 0.2 \n", 18 | "4 5.0 3.6 1.4 0.2 \n", 19 | "\n", 20 | " target \n", 21 | "0 0 \n", 22 | "1 0 \n", 23 | "2 0 \n", 24 | "3 0 \n", 25 | "4 0 \n", 26 | "IRIS train df shape\n", 27 | "(105, 4)\n", 28 | "(105,)\n", 29 | "IRIS test df shape\n", 30 | "(45, 
4)\n", 31 | "(45,)\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "import pandas as pd\n", 37 | "from sklearn import datasets\n", 38 | "from sklearn.ensemble import RandomForestClassifier\n", 39 | "import mlflow, os\n", 40 | "import mlflow.sklearn\n", 41 | "from mlflow.tracking import MlflowClient\n", 42 | "from sklearn.metrics import roc_auc_score, accuracy_score\n", 43 | "from sklearn.model_selection import train_test_split\n", 44 | "\n", 45 | "os.environ[\"MLFLOW_TRACKING_URI\"] = \"http://localhost:5000\"\n", 46 | "iris = datasets.load_iris()\n", 47 | "iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)\n", 48 | "y = iris.target\n", 49 | "iris_df[\"target\"] = y\n", 50 | "\n", 51 | "print(iris_df.head())\n", 52 | "\n", 53 | "\n", 54 | "train_df, test_df = train_test_split(\n", 55 | " iris_df, test_size=0.3, random_state=42, stratify=iris_df[\"target\"]\n", 56 | ")\n", 57 | "X_train = train_df[\n", 58 | " [\"sepal length (cm)\", \"sepal width (cm)\", \"petal length (cm)\", \"petal width (cm)\"]\n", 59 | "]\n", 60 | "y_train = train_df[\"target\"]\n", 61 | "\n", 62 | "X_test = test_df[\n", 63 | " [\"sepal length (cm)\", \"sepal width (cm)\", \"petal length (cm)\", \"petal width (cm)\"]\n", 64 | "]\n", 65 | "y_test = test_df[\"target\"]\n", 66 | "# print(iris)\n", 67 | "# print(iris_df.head())\n", 68 | "\n", 69 | "\n", 70 | "EXPERIMENT_NAME = \"IRIS dataset classification\"\n", 71 | "\n", 72 | "\n", 73 | "print(\"IRIS train df shape\")\n", 74 | "print(X_train.shape)\n", 75 | "print(y_train.shape)\n", 76 | "\n", 77 | "print(\"IRIS test df shape\")\n", 78 | "print(X_test.shape)\n", 79 | "print(y_test.shape)\n", 80 | "\n", 81 | "mlflow_client = MlflowClient()\n", 82 | "\n", 83 | "# Create an MLFlow experiment, if not already exists\n", 84 | "experiment_details = mlflow_client.get_experiment_by_name(EXPERIMENT_NAME)\n", 85 | "\n", 86 | "if experiment_details is not None:\n", 87 | " experiment_id = experiment_details.experiment_id\n", 88 | "else:\n", 89 | " 
experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)\n", 90 | "\n", 91 | "# Start an MLFlow experiment run\n", 92 | "with mlflow.start_run(\n", 93 | " experiment_id=experiment_id, run_name=\"iris dataset rf run\"\n", 94 | ") as run:\n", 95 | " # Log parameters\n", 96 | "\n", 97 | " mlflow.log_param(\"max_depth\", 10)\n", 98 | " mlflow.log_param(\"random_state\", 0)\n", 99 | " mlflow.log_param(\"n_estimators\", 100)\n", 100 | " clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)\n", 101 | " clf.fit(X_train, y_train)\n", 102 | " iris_predict_y = clf.predict(X_test)\n", 103 | "\n", 104 | " roc_auc_score_val = roc_auc_score(\n", 105 | " y_test, clf.predict_proba(X_test), multi_class=\"ovr\"\n", 106 | " )\n", 107 | " mlflow.log_metric(\"test roc_auc_score\", roc_auc_score_val)\n", 108 | "\n", 109 | " accuracy_score = accuracy_score(y_test, iris_predict_y)\n", 110 | " mlflow.log_metric(\"test accuracy_score\", accuracy_score)\n", 111 | " mlflow.log_artifact(\"deploy.yaml\")\n", 112 | "\n", 113 | " # Log model\n", 114 | " mlflow.sklearn.log_model(clf, artifact_path=\"model\")" 115 | ] 116 | } 117 | ], 118 | "metadata": { 119 | "interpreter": { 120 | "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" 121 | }, 122 | "kernelspec": { 123 | "display_name": "Python 3", 124 | "language": "python", 125 | "name": "python3" 126 | }, 127 | "language_info": { 128 | "codemirror_mode": { 129 | "name": "ipython", 130 | "version": 3 131 | }, 132 | "file_extension": ".py", 133 | "mimetype": "text/x-python", 134 | "name": "python", 135 | "nbconvert_exporter": "python", 136 | "pygments_lexer": "ipython3", 137 | "version": "3.9.12" 138 | } 139 | }, 140 | "nbformat": 4, 141 | "nbformat_minor": 5 142 | } 143 | -------------------------------------------------------------------------------- /examples/readme.md: -------------------------------------------------------------------------------- 1 | Example Deployment using argocd 2 | --- 3 | 4 | 
"""
__author__ = "Raghul Krishna"
__copyright__ = ""
__credits__ = ""
__license__ = ""
__version__ = ""
__maintainer__ = "raghul Krishna"
__email__ = "rrkraghulkrishna@gmail.com"

Entry point of the MLflow deployment controller.

Starts a background scheduler that periodically runs the GitOps
reconciliation job when the ``GITOPS_ENABLED`` environment variable is set to
a truthy value, then keeps the main thread alive.
"""
import logging
import os
from time import sleep

from apscheduler.schedulers.background import BackgroundScheduler

from mlflow_controller.gitops import GitopsMDC
from mlflow_controller.mlflow_direct import DeployConroller

logging.getLogger("apscheduler").setLevel(logging.ERROR)

# Spellings of GITOPS_ENABLED (compared case-insensitively) that enable the
# GitOps job. Anything else, including "False" and the empty string, disables
# it.
_TRUTHY_VALUES = frozenset({"1", "true", "yes", "on"})

if __name__ == "__main__":
    scheduler = BackgroundScheduler()
    # NOTE: the direct (non-GitOps) controller is instantiated but its job is
    # currently not scheduled; only the GitOps job below can run.
    controller = DeployConroller()
    giopsmdc = GitopsMDC()

    # os.getenv returns a string, and any non-empty string (even "False") is
    # truthy, so the flag has to be parsed explicitly.
    gitops_enabled = (
        os.getenv("GITOPS_ENABLED", "False").strip().lower() in _TRUTHY_VALUES
    )
    if gitops_enabled:
        scheduler.add_job(
            id="gitopsmdc",
            func=giopsmdc.gitops_mlflow_controller,
            trigger="interval",
            seconds=15,
        )
    scheduler.start()
    try:
        # The scheduler runs in a daemon thread; block here so the process
        # stays alive.
        while True:
            sleep(1)
    except (KeyboardInterrupt, SystemExit):
        # Stop the scheduler thread cleanly when the process is interrupted.
        scheduler.shutdown()
class DeployConroller:
    """
    Keep the SeldonDeployments of one namespace in sync with MLflow.

    For every latest model version in the configured stage whose run carries
    a ``deploy.yaml`` artifact, the manifest is loaded from object storage,
    pointed at the model source and created (or patched) in the cluster.
    Deployments carrying the controller label that no longer correspond to
    such a model version are deleted.

    Configuration is read from the ``stage``, ``namespace`` and ``cloud``
    environment variables; a missing variable raises ``KeyError`` in
    ``__init__``.

    Methods
    -------
    deploy_controller():
        Manages the deployments from Mlflow
    state_manager():
        Deletes labelled deployments that were not produced in this pass
    """

    def __init__(self):
        self.mlflow_client = MlflowClient()
        logger.info("Mlflow client initialized")
        self.object_init = mlflow_controller.storage.Artifact()
        try:
            config.load_kube_config()
        except config.ConfigException:
            # No local kubeconfig: fall back to the in-cluster configuration.
            config.load_incluster_config()
        self.kube_client = KubeClient.CustomObjectsApi()
        logger.info("KubeClient initialized")
        self.mlflow_deploy_config = "deploy.yaml"
        self.stage = os.environ["stage"]
        # One dict per deployment applied in the current pass; consumed and
        # cleared by state_manager().
        self.model_details = []
        self.Namespace = os.environ["namespace"]
        self.cloud = os.environ["cloud"]
        self.label = "app.kubernetes.io/managed-by=mdc-mlflow"

    def __str__(self):
        return self.__class__.__name__

    def state_manager(self):
        """Delete labelled deployments that were not applied in this pass."""
        manifests = self.kube_client.list_namespaced_custom_object(
            group="machinelearning.seldon.io",
            version="v1",
            plural="seldondeployments",
            namespace=self.Namespace,
            label_selector=self.label,
        )
        # (deployment name, namespace) pairs that deploy_controller() applied.
        expected = {
            (item["deploy_name"], item["Namespace"]) for item in self.model_details
        }
        for manifest in manifests["items"]:
            manifest_name = manifest["metadata"]["name"]
            manifest_namespace = manifest["metadata"]["namespace"]
            logger.debug(
                "Tracked %s, checking %s in %s",
                self.model_details,
                manifest_name,
                manifest_namespace,
            )
            if (manifest_name, manifest_namespace) in expected:
                logger.info(
                    "Model %s Namespace %s in Sync ",
                    manifest_name,
                    manifest_namespace,
                )
            else:
                logger.info(
                    "Deleting a Deployment %s Namespace %s",
                    manifest_name,
                    manifest_namespace,
                )
                self.kube_client.delete_namespaced_custom_object(
                    group="machinelearning.seldon.io",
                    version="v1",
                    plural="seldondeployments",
                    name=manifest_name,
                    namespace=manifest_namespace,
                )
        self.model_details = []

    def deploy_controller(self):
        """
        Manages the deployments from Mlflow
        """
        # Start from a clean slate so entries left over from a pass that
        # raised half-way do not shield stale deployments from deletion.
        self.model_details = []
        model_versions = []
        for registered_model in self.mlflow_client.list_registered_models():
            model_versions.extend(registered_model.latest_versions)
        for version in model_versions:
            if version.current_stage != self.stage:
                continue
            logger.debug("Stage %s", version.current_stage)
            for file in self.mlflow_client.list_artifacts(version.run_id):
                if file.path == self.mlflow_deploy_config:
                    self._deploy_version(version)
        self.state_manager()

    def _deploy_version(self, version):
        """Build the manifest for one model version and apply it."""
        model_name = version.name.lower()
        run_details = self.mlflow_client.get_run(version.run_id)
        model_source, deploy_yaml = self._resolve_source_and_manifest(
            version, run_details.info.artifact_uri
        )
        deploy_yaml = self._prepare_manifest(
            deploy_yaml, model_name, model_source, version.version
        )
        logger.info(
            "Model Name: %s, Model Run Id: %s",
            model_name,
            version.run_id,
        )
        self.model_details.append(
            {
                "name": model_name,
                "deploy_name": deploy_yaml["metadata"]["name"],
                "Namespace": self.Namespace,
            }
        )
        self._apply(deploy_yaml, model_name)

    def _resolve_source_and_manifest(self, version, artifact_uri):
        """Return ``(model_source, deploy_yaml)`` for the configured cloud.

        Raises
        ------
        ValueError
            If ``self.cloud`` is not one of ``gcp``, ``azure_blob``, ``aws_s3``.
        """
        if self.cloud == "gcp":
            return version.source, self.object_init.gcp_bucket(artifact_uri)
        if self.cloud in ("azure_blob", "aws_s3"):
            # Drop everything from "@" up to the next "/" in the source URI.
            model_source = re.sub(r"(?=\@)(.*?)(?=\/)", "", version.source)
            # NOTE(review): the aws_s3 case also loads the manifest through
            # azure_blob(), exactly as before; this looks like a copy/paste
            # slip, but the storage module is not visible here - confirm.
            return model_source, self.object_init.azure_blob(artifact_uri)
        # A bare string cannot be raised (it would surface as TypeError).
        raise ValueError(f"unsupported Object Storage: {self.cloud!r}")

    @staticmethod
    def _prepare_manifest(deploy_yaml, model_name, model_source, model_version):
        """Fill model URI, version, name and controller label into the manifest.

        Missing (or null) ``annotations``/``labels`` mappings are created
        instead of raising ``KeyError``. The manifest is modified in place and
        returned.
        """
        predictor = deploy_yaml["spec"]["predictors"][0]
        metadata = deploy_yaml["metadata"]
        for holder, key in (
            (predictor, "annotations"),
            (metadata, "annotations"),
            (metadata, "labels"),
        ):
            if not isinstance(holder.get(key), dict):
                holder[key] = {}
        predictor["graph"]["modelUri"] = model_source
        # Kubernetes annotation values must be strings.
        predictor["annotations"]["predictor_version"] = str(model_version)
        metadata["name"] = model_name.replace(" ", "").replace("_", "-")
        metadata["labels"]["app.kubernetes.io/managed-by"] = "mdc-mlflow"
        return deploy_yaml

    def _apply(self, deploy_yaml, model_name):
        """Create the SeldonDeployment, patching it when creation fails."""
        try:
            self.kube_client.create_namespaced_custom_object(
                group="machinelearning.seldon.io",
                version="v1",
                plural="seldondeployments",
                body=deploy_yaml,
                namespace=self.Namespace,
            )
            logger.info(
                "Created a Deployment %s Namespace %s",
                model_name,
                self.Namespace,
            )
        except KubeClient.rest.ApiException:
            # Creation fails when the object already exists; update it
            # instead. Any other API failure resurfaces from the patch call.
            self.kube_client.patch_namespaced_custom_object(
                group="machinelearning.seldon.io",
                version="v1",
                plural="seldondeployments",
                body=deploy_yaml,
                name=deploy_yaml["metadata"]["name"],
                namespace=self.Namespace,
            )
            logger.info(
                "Patched a Deployment %s Namespace %s",
                model_name,
                self.Namespace,
            )
class GitopsMDC:
    """Reconcile model-server manifests kept in Git with MLflow model versions.

    Each run clones the configured repository into a throw-away directory,
    reads the YAML manifests found under ``MANIFEST_LOCATION``, keeps those
    whose API group belongs to the configured ``ML_SERVER`` and hands them,
    together with the MLflow model metadata, to that server's ``sync``.
    """

    # API group expected in a manifest's ``apiVersion`` per ML_SERVER value.
    _RESOURCE_GROUPS = {
        "seldon": "machinelearning.seldon.io",
        "kserve": "serving.kserve.io",
    }

    @staticmethod
    def _matches_server(deploy_yaml, ml_server):
        """Return True when *deploy_yaml* is a manifest for *ml_server*.

        Documents that are not mappings (e.g. an empty file parses to
        ``None``) or that lack a string ``apiVersion`` never match.
        """
        if not isinstance(deploy_yaml, dict):
            return False
        api_version = deploy_yaml.get("apiVersion")
        if not isinstance(api_version, str):
            return False
        expected_group = GitopsMDC._RESOURCE_GROUPS.get(ml_server)
        return api_version.split("/")[0] == expected_group

    def _read_manifests(self, manifest_path):
        """Load every ``*.yaml``/``*.yml`` manifest for ML_SERVER in a folder."""
        manifests = []
        manifest_files = glob.glob(f"{manifest_path}/*.yaml") + glob.glob(
            f"{manifest_path}/*.yml"
        )
        for manifest_file in manifest_files:
            with open(manifest_file, "r") as stream:
                try:
                    deploy_yaml = yaml.safe_load(stream)
                except yaml.YAMLError as exc:
                    # One unparsable file must not stop the others.
                    logger.error(exc)
                    continue
            if self._matches_server(deploy_yaml, ML_SERVER):
                manifests.append(deploy_yaml)
        return manifests

    def gitops_mlflow_controller(self):
        """Run one reconciliation pass; the clone is removed even on failure."""
        folder_name = str(uuid.uuid4())
        path = "./tmp/" + folder_name
        os.makedirs(path, exist_ok=True)
        try:
            logger.info(f"Cloning repo {GIT_REPO} with branch {BRANCH}")
            Repo.clone_from(GIT_URL, path, single_branch=True, branch=BRANCH)
            try:
                config.load_kube_config()
            except config.ConfigException:
                # No local kubeconfig: fall back to the in-cluster config.
                config.load_incluster_config()
            manifests = self._read_manifests(path + "/" + MANIFEST_LOCATION)
            mlflowcontroller = MLflowMetadata(
                tracking_uri=TRACKING_URI, stage=MLFLOW_STAGE
            )
            logger.info(f"Mlflow tracking uri {TRACKING_URI}")
            logger.info(f"Mlflow Stage {MLFLOW_STAGE}")
            logger.info(f"backend {backend}")
            mlflow_models_metadata, _ = mlflowcontroller.get_model_metadata(
                check_deploy=False, backend=backend
            )
            sync = {"seldon": seldon.sync, "kserve": kserve.sync}.get(ML_SERVER)
            if sync is not None and len(mlflow_models_metadata.keys()) > 0:
                sync(
                    manifests,
                    mlflow_models_metadata,
                    MLFLOW_STAGE,
                    GLOBAL_NAMESPACE,
                    f"mdc-gitops-{backend}-mlflow-{ML_SERVER}",
                    "mlflow",
                    backend,
                )
        finally:
            # This job runs on a short interval; without the finally every
            # failed pass would leave a full clone behind under ./tmp.
            shutil.rmtree(path, ignore_errors=True)
class DeployConroller:
    """
    Controller that deploys models straight from the MLflow registry.

    Unlike the GitOps flow, the deployment manifests are obtained from MLflow
    itself (the ``deploy.yaml`` requested from ``get_model_metadata``).

    Methods
    -------
    deploy_controller():
        Manages the deployments from Mlflow
    """

    def __init__(self):
        # Prefix of the controller label value passed to the sync functions.
        self.managed_label = "mdc-direct"

    def __str__(self):
        return type(self).__name__

    def deploy_controller(self):
        """
        Fetch model metadata from MLflow and sync it to the ML server.
        """
        registry = MLflowMetadata(tracking_uri=TRACKING_URI, stage=MLFLOW_STAGE)
        logger.info(f"Mlflow tracking uri {TRACKING_URI}")
        logger.info(f"Mlflow Stage {MLFLOW_STAGE}")
        logger.info(f"backend {backend}")
        mlflow_models_metadata, read_deploy_yaml = registry.get_model_metadata(
            check_deploy=True,
            backend=backend,
            manager_label=self.managed_label,
            mlflow_deploy_config="deploy.yaml",
        )
        # Nothing registered for this stage: nothing to sync.
        if not mlflow_models_metadata.keys():
            return
        # Only these two ML_SERVER values are handled; others are ignored.
        if ML_SERVER == "seldon":
            sync = seldon.sync
        elif ML_SERVER == "kserve":
            sync = kserve.sync
        else:
            return
        sync(
            read_deploy_yaml,
            mlflow_models_metadata,
            MLFLOW_STAGE,
            GLOBAL_NAMESPACE,
            f"{self.managed_label}-mlflow-{backend}-{ML_SERVER}",
            "mlflow",
            backend,
        )
import mlflow_model_search, update_modeluris 8 | from mlflow_controller.utils.var_extract import var_parser 9 | 10 | logger = logging.getLogger(__name__) 11 | logger.setLevel(logging.DEBUG) 12 | 13 | formatter = logging.Formatter("%(asctime)s:%(name)s:%(message)s") 14 | 15 | file_handler = logging.FileHandler("log.log") 16 | file_handler.setLevel(logging.ERROR) 17 | file_handler.setFormatter(formatter) 18 | 19 | stream_handler = logging.StreamHandler() 20 | stream_handler.setFormatter(formatter) 21 | 22 | logger.addHandler(file_handler) 23 | logger.addHandler(stream_handler) 24 | 25 | try: 26 | config.load_kube_config() 27 | except config.ConfigException: 28 | config.load_incluster_config() 29 | kube_client = KubeClient.CustomObjectsApi() 30 | 31 | 32 | class InvalidVariable(Exception): 33 | "Raised when wrong templates" 34 | 35 | 36 | def sync( 37 | deploy_yamls, 38 | model_metadata, 39 | stage, 40 | GLOBAL_NAMESPACE, 41 | controller_label_value, 42 | registry_name, 43 | backend, 44 | ): 45 | git_models = [] 46 | for deploy_yaml in deploy_yamls: 47 | logger.info(deploy_yamls) 48 | resource_group = deploy_yaml["apiVersion"].split("/")[0] 49 | logger.info(resource_group) 50 | if resource_group == "serving.kserve.io": 51 | models = list( 52 | set(mlflow_model_search("storageUri", deploy_yaml, search_result=[])) 53 | ) 54 | logger.info(f"models {models}") 55 | rep_deploy_yaml = deploy_yaml 56 | try: 57 | rep_deploy_yaml["metadata"]["annotations"] 58 | 59 | except KeyError: 60 | rep_deploy_yaml["metadata"]["annotations"] = {} 61 | try: 62 | rep_deploy_yaml["metadata"]["labels"] 63 | 64 | except KeyError: 65 | rep_deploy_yaml["metadata"]["labels"] = {} 66 | deploy = False 67 | for m in models: 68 | try: 69 | pattern = r"\{\{\s(.*)\s\}\}" 70 | model_jinja = re.findall(pattern, m)[0] 71 | model_name, bk_name, rg_name = var_parser(model_jinja) 72 | if (bk_name != backend) or (rg_name != registry_name): 73 | raise InvalidVariable 74 | model = 
model_metadata[registry_name][backend][model_name] 75 | run_id = model["run_id"] 76 | if backend == "blob": 77 | model_source = model["source"].replace("wasbs", "https") 78 | else: 79 | model_source = model["source"] 80 | rep_deploy_yaml = update_modeluris( 81 | rep_deploy_yaml, 82 | f'{{{{ {registry_name}.{backend}["{model_name}"] }}}}', 83 | model_source, 84 | ) 85 | rep_deploy_yaml["metadata"]["annotations"][ 86 | f"mdc/mlflow-{run_id}" 87 | ] = str(model) 88 | rep_deploy_yaml["metadata"]["annotations"][ 89 | "mdc/mlflow-stage" 90 | ] = stage 91 | rep_deploy_yaml["metadata"]["labels"][ 92 | "app.kubernetes.io/mdc-type" 93 | ] = controller_label_value 94 | rep_deploy_yaml["metadata"]["labels"][ 95 | "app.kubernetes.io/managed-by" 96 | ] = "mdc" 97 | deploy = True 98 | name = rep_deploy_yaml["metadata"]["name"] 99 | except InvalidVariable: 100 | deploy = False 101 | logger.error( 102 | f"Error in variable for model {m} backend {bk_name} registry {rg_name}" 103 | ) 104 | except Exception as e: 105 | deploy = False 106 | logger.error( 107 | f"Error deploying {name} Model {m} not found in mlflow {e}" 108 | ) 109 | if deploy: 110 | logger.info( 111 | f"deploying kserve deployment {name} in namespace {GLOBAL_NAMESPACE}" 112 | ) 113 | try: 114 | manifest = kube_client.get_namespaced_custom_object( 115 | group=resource_group, 116 | version="v1beta1", 117 | plural="inferenceservices", 118 | namespace=GLOBAL_NAMESPACE, 119 | name=rep_deploy_yaml["metadata"]["name"], 120 | ) 121 | resourceVersion = manifest["metadata"]["resourceVersion"] 122 | manifest["metadata"].pop("creationTimestamp") 123 | manifest["metadata"].pop("generation") 124 | manifest["metadata"].pop("managedFields") 125 | manifest["metadata"].pop("resourceVersion") 126 | manifest["metadata"].pop("uid") 127 | manifest["metadata"].pop("namespace") 128 | manifest.pop("status") 129 | _name = rep_deploy_yaml["metadata"]["name"] 130 | if rep_deploy_yaml == manifest: 131 | logger.info(f"Kserve deployment {_name} in 
def rclone_source(source, backend):
    """Return the model source URI in the form used for the Seldon modelUri.

    For every backend except ``"blob"`` the source is returned unchanged.
    For ``"blob"`` the URI is rebuilt as ``wasbs://<container>/<path>`` where
    ``<path>`` is everything after the first ``net/`` in ``source`` and
    ``<container>`` is the first run of word characters that directly
    follows a ``/``.

    Raises:
        AttributeError: if ``backend`` is ``"blob"`` and ``source`` contains
            no ``net/`` segment or no ``/`` followed by a word character.
    """
    if backend != "blob":
        return source

    # Path inside the container: the remainder of the line after "net/".
    blob_path = re.search(r"(?<=net/).*", source).group()
    # In "wasbs://container@account..." the first word after a slash is the
    # container name.
    container = re.search(r"(?<=/)\w+", source).group()
    return f"wasbs://{container}/{blob_path}"
-------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | 4 | from kubernetes import client as KubeClient 5 | from kubernetes import config 6 | 7 | from mlflow_controller.mlservers.rclone import rclone_source 8 | from mlflow_controller.mlservers.utils import mlflow_model_search, update_modeluris 9 | from mlflow_controller.utils.var_extract import var_parser 10 | 11 | logger = logging.getLogger(__name__) 12 | logger.setLevel(logging.DEBUG) 13 | 14 | formatter = logging.Formatter("%(asctime)s:%(name)s:%(message)s") 15 | 16 | file_handler = logging.FileHandler("log.log") 17 | file_handler.setLevel(logging.ERROR) 18 | file_handler.setFormatter(formatter) 19 | 20 | stream_handler = logging.StreamHandler() 21 | stream_handler.setFormatter(formatter) 22 | 23 | logger.addHandler(file_handler) 24 | logger.addHandler(stream_handler) 25 | 26 | try: 27 | config.load_kube_config() 28 | except config.ConfigException: 29 | config.load_incluster_config() 30 | kube_client = KubeClient.CustomObjectsApi() 31 | 32 | 33 | class InvalidVariable(Exception): 34 | "Raised when wrong templates" 35 | 36 | 37 | def sync( 38 | deploy_yamls, 39 | model_metadata, 40 | stage, 41 | GLOBAL_NAMESPACE, 42 | controller_label_value, 43 | registry_name, 44 | backend, 45 | ): 46 | git_models = [] 47 | for deploy_yaml in deploy_yamls: 48 | resource_group = deploy_yaml["apiVersion"].split("/")[0] 49 | if resource_group == "machinelearning.seldon.io": 50 | models = list( 51 | set(mlflow_model_search("modelUri", deploy_yaml, search_result=[])) 52 | ) 53 | logger.info(f"models {models}") 54 | rep_deploy_yaml = deploy_yaml 55 | try: 56 | rep_deploy_yaml["metadata"]["annotations"] 57 | 58 | except KeyError: 59 | rep_deploy_yaml["metadata"]["annotations"] = {} 60 | try: 61 | rep_deploy_yaml["metadata"]["labels"] 62 | 63 | except KeyError: 64 | rep_deploy_yaml["metadata"]["labels"] = {} 65 | deploy = False 66 | for m in models: 67 | try: 68 | pattern = 
r"\{\{\s(.*)\s\}\}" 69 | model_jinja = re.findall(pattern, m)[0] 70 | model_name, bk_name, rg_name = var_parser(model_jinja) 71 | if (bk_name != backend) or (rg_name != registry_name): 72 | raise InvalidVariable 73 | model = model_metadata[registry_name][backend][model_name] 74 | run_id = model["run_id"] 75 | rep_deploy_yaml = update_modeluris( 76 | rep_deploy_yaml, 77 | f'{{{{ {registry_name}.{backend}["{model_name}"] }}}}', 78 | rclone_source(model["source"], backend), 79 | ) 80 | rep_deploy_yaml["metadata"]["annotations"][ 81 | f"mdc/mlflow-{run_id}" 82 | ] = str(model) 83 | rep_deploy_yaml["metadata"]["annotations"][ 84 | "mdc/mlflow-stage" 85 | ] = stage 86 | rep_deploy_yaml["metadata"]["labels"][ 87 | "app.kubernetes.io/mdc-type" 88 | ] = controller_label_value 89 | rep_deploy_yaml["metadata"]["labels"][ 90 | "app.kubernetes.io/managed-by" 91 | ] = "mdc" 92 | deploy = True 93 | name = rep_deploy_yaml["metadata"]["name"] 94 | except InvalidVariable: 95 | deploy = False 96 | logger.error( 97 | f"Error in variable for model {m} backend {bk_name} registry {rg_name}" 98 | ) 99 | except Exception as e: 100 | deploy = False 101 | logger.error( 102 | f"Error deploying {name} Model {m} not found in mlflow {e}" 103 | ) 104 | if deploy: 105 | logger.info( 106 | f"deploying seldon deployment {name} in namespace {GLOBAL_NAMESPACE}" 107 | ) 108 | try: 109 | manifest = kube_client.get_namespaced_custom_object( 110 | group=resource_group, 111 | version="v1", 112 | plural="seldondeployments", 113 | namespace=GLOBAL_NAMESPACE, 114 | name=rep_deploy_yaml["metadata"]["name"], 115 | ) 116 | resourceVersion = manifest["metadata"]["resourceVersion"] 117 | manifest["metadata"].pop("creationTimestamp") 118 | manifest["metadata"].pop("generation") 119 | manifest["metadata"].pop("managedFields") 120 | manifest["metadata"].pop("resourceVersion") 121 | manifest["metadata"].pop("uid") 122 | manifest["metadata"].pop("namespace") 123 | manifest.pop("status") 124 | _name = 
def mlflow_model_search(lookup_key, json_dict, search_result=None):
    """Collect every value stored under ``lookup_key`` in a nested structure.

    Dicts and lists are walked recursively; any other value is ignored. A
    matching value is recorded and then searched as well, so nested matches
    are also found.

    Args:
        lookup_key: Key to look for in every dict of the structure.
        json_dict: The dict / list / scalar to search.
        search_result: Optional list to append matches to. When omitted, a
            fresh list is created for this call, so results never leak from
            one call into the next.

    Returns:
        The list of matches (the same object as ``search_result`` when one
        was supplied).
    """
    if search_result is None:
        search_result = []
    if isinstance(json_dict, dict):
        for key, value in json_dict.items():
            if key == lookup_key:
                search_result.append(value)
            mlflow_model_search(lookup_key, value, search_result)
    elif isinstance(json_dict, list):
        for element in json_dict:
            mlflow_model_search(lookup_key, element, search_result)
    return search_result


def update_modeluris(json_para, search_para, replace_para):
    """Return a copy of ``json_para`` with a placeholder value substituted.

    The input is round-tripped through JSON, so the caller's object is left
    untouched. Only dict values that are exactly equal to ``search_para`` are
    replaced; list elements and partial string matches are not.

    Args:
        json_para: JSON-serialisable structure (the deployment manifest).
        search_para: Value to look for.
        replace_para: Value to put in its place.

    Returns:
        A new structure with the substitutions applied.
    """

    def _substitute(mapping):
        # json.loads calls this hook for every decoded JSON object, innermost
        # objects first.
        for key, value in mapping.items():
            if value == search_para:
                mapping[key] = replace_para
        return mapping

    return json.loads(json.dumps(json_para), object_hook=_substitute)
registered_models: 47 | for version in registered_model.latest_versions: 48 | if version.current_stage == self.stage: 49 | model_details = dict(version) 50 | model_run_id = model_details["run_id"] 51 | run_details = dict(self.mlflow_client.get_run(model_run_id).info) 52 | name = model_details["name"] 53 | model_template = f'{{{{ mlflow.{backend}["{name}"] }}}}' 54 | artifact_uri = run_details["artifact_uri"] 55 | mlflow_models_metadata[name] = { 56 | "name": name, 57 | "run_id": model_details["run_id"], 58 | "source": model_details["source"], 59 | "status": model_details["status"], 60 | "version": model_details["version"], 61 | "artifact_uri": artifact_uri, 62 | } 63 | logger.debug(artifact_uri) 64 | if check_deploy: 65 | for file in self.mlflow_client.list_artifacts(model_run_id): 66 | if file.path == mlflow_deploy_config: 67 | if backend == "gcs": 68 | deploy_yaml = self.object_init.gcp_bucket( 69 | artifact_uri 70 | ) 71 | elif backend == "blob": 72 | deploy_yaml = self.object_init.azure_blob( 73 | artifact_uri 74 | ) 75 | elif backend == "s3": 76 | deploy_yaml = self.object_init.aws_s3(artifact_uri) 77 | else: 78 | raise ("unsupported Object Storage") 79 | deploy_yaml["spec"]["predictors"][0]["graph"][ 80 | "modelUri" 81 | ] = model_template 82 | deploy_yaml["spec"]["predictors"][0]["annotations"][ 83 | "predictor_version" 84 | ] = model_details["version"] 85 | try: 86 | deploy_yaml["metadata"]["annotations"] 87 | except KeyError: 88 | deploy_yaml["metadata"]["annotations"] = {} 89 | deploy_yaml["metadata"]["labels"][ 90 | "app.kubernetes.io/mdc-type" 91 | ] = manager_label 92 | read_deploy_yaml.append(deploy_yaml) 93 | ml_metadata = {"mlflow": {f"{backend}": mlflow_models_metadata}} 94 | return ml_metadata, read_deploy_yaml 95 | -------------------------------------------------------------------------------- /mlflow_controller/registries/mlflow_backend.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 
import os 3 | import re 4 | from io import BytesIO 5 | 6 | import boto3 7 | import yaml 8 | from azure.identity import DefaultAzureCredential 9 | from azure.storage.blob import BlobServiceClient 10 | from google.cloud.storage import Client as GoogleClient 11 | 12 | logger = logging.getLogger(__name__) 13 | logger.setLevel(logging.DEBUG) 14 | 15 | formatter = logging.Formatter("%(asctime)s:%(name)s:%(message)s") 16 | 17 | file_handler = logging.FileHandler("log.log") 18 | file_handler.setLevel(logging.ERROR) 19 | file_handler.setFormatter(formatter) 20 | 21 | stream_handler = logging.StreamHandler() 22 | stream_handler.setFormatter(formatter) 23 | 24 | logger.addHandler(file_handler) 25 | logger.addHandler(stream_handler) 26 | 27 | logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO")) 28 | 29 | 30 | class Artifact: 31 | def __init__(self): 32 | print("Class Artifact initalized") 33 | self.mlflow_deploy_config = "deploy.yaml" 34 | 35 | def gcp_bucket(self, artifact_uri): 36 | google_client = GoogleClient() 37 | bucket = artifact_uri.split("/")[2] 38 | object_name = ( 39 | "/".join(artifact_uri.split("/")[3:]) + f"/{self.mlflow_deploy_config}" 40 | ) 41 | bucket = google_client.get_bucket(bucket) 42 | blob = bucket.get_blob(object_name) 43 | downloaded_file = blob.download_as_text(encoding="utf-8") 44 | deploy_yaml = yaml.safe_load(downloaded_file) 45 | return deploy_yaml 46 | 47 | def azure_blob(self, artifact_uri): 48 | acc_name_re = r"(?<=\/\/)(.*)(?=\@)" 49 | container_re = r"(?<=\@)(.*)(?=[\.])" 50 | container = re.search(acc_name_re, artifact_uri).group(1) 51 | acc_name = re.search(container_re, artifact_uri).group(1).split(".")[0] 52 | STORAGEACCOUNTURL = f"https://{acc_name}.blob.core.windows.net" 53 | default_credential = DefaultAzureCredential() 54 | blob_service_client_instance = BlobServiceClient( 55 | account_url=STORAGEACCOUNTURL, credential=default_credential 56 | ) 57 | blob_location = ( 58 | 
def var_parser(placeholder):
    """Split a model placeholder into its model, backend and registry parts.

    A placeholder looks like ``mlflow.s3["iris"]``: the registry name, a dot,
    the storage backend, and the model name as a quoted string in brackets.

    Args:
        placeholder: The text found between ``{{ `` and `` }}`` in a manifest.

    Returns:
        A ``(model_name, vendor_name, registry_name)`` tuple, e.g.
        ``("iris", "s3", "mlflow")``.

    Raises:
        ValueError: if the placeholder has no ``[...]`` part or no
            ``.<backend>[`` part. ``ast.literal_eval`` may also raise
            ``ValueError`` / ``SyntaxError`` when the bracketed part is not a
            valid literal.
    """
    model = re.search(r"\[.*\]", placeholder)
    # Non-greedy: stop at the first "[" so brackets inside the model name
    # do not end up in the backend name.
    vendor = re.search(r"\.(.*?)\[", placeholder)
    if model is None or vendor is None:
        raise ValueError(f"Malformed model placeholder: {placeholder!r}")

    # The bracketed part is a one-element list literal, e.g. ["iris"].
    model_name = ast.literal_eval(model.group())[0]
    vendor_name = vendor.group(1).replace(".", "")
    # This pattern can match the empty string, so it always yields a match.
    registry_name = re.search(r"^[a-zA-Z0-9_]*", placeholder).group()
    return model_name, vendor_name, registry_name


def validate_variable(placeholder):
    """Check that ``placeholder`` starts with ``<word>.<word>["<text>"]``.

    Returns:
        The ``re.Match`` object when the placeholder has the expected form,
        otherwise ``None``.
    """
    return re.match(r"\w+\.\w+\[\".+\"\]", placeholder, re.IGNORECASE)
gcloud==0.18.3 5 | apscheduler 6 | azure-storage-blob==12.14.1 7 | azure-identity==1.12.0 8 | boto3==1.26.25 9 | GitPython>=3.1.30 10 | google-apitools==0.5.32 11 | google-auth==2.1.0 12 | google-auth-oauthlib==0.4.6 13 | google-cloud==0.34.0 14 | google-cloud-core==2.0.0 15 | google-cloud-storage==1.42.2 16 | google-crc32c==1.2.0 17 | google-pasta==0.2.0 18 | google-reauth==0.1.1 19 | google-resumable-media==2.0.3 20 | googleapis-common-protos==1.52.0 21 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from mlflow_controller.gitops import GitopsMDC 2 | 3 | controller = GitopsMDC() 4 | controller.gitops_mlflow_controller() 5 | 6 | # controller = DeployConroller() 7 | # controller.deploy_controller() 8 | -------------------------------------------------------------------------------- /tests/docker_build_push.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | echo "Installing build test image and push ..." 4 | docker build -t tachyongroup/mdc-test:$GITHUB_SHA . 5 | # docker push tachyongroup/mdc-test:$GITHUB_SHA 6 | kind load docker-image tachyongroup/mdc-test:$GITHUB_SHA 7 | -------------------------------------------------------------------------------- /tests/install_gitea.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | helm repo add gitea-charts https://dl.gitea.io/charts/ 5 | helm install gitea gitea-charts/gitea --set "gitea.admin.username=mdcadmin" --set "gitea.admin.password=password" --set "gitea.admin.email=mdcadmin@local.domain" 6 | sleep 30 7 | kubectl wait --for=condition=ready pod -l 'app.kubernetes.io/name in (gitea)' --timeout=180s 8 | 9 | kubectl --namespace default port-forward svc/gitea-http 3000:3000 & 10 | GITEA_PID=$! 
11 | -------------------------------------------------------------------------------- /tests/install_istio.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | echo "Installing Istio service mesh ..." 4 | helm repo add istio https://istio-release.storage.googleapis.com/charts 5 | helm repo update 6 | kubectl create namespace istio-system 7 | helm install istio-base istio/base -n istio-system 8 | helm install istiod istio/istiod -n istio-system --wait 9 | helm status istiod -n istio-system 10 | 11 | echo "Waiting for Istio service mesh to be ready ..." 12 | kubectl wait --for=condition=ready pod -l 'app in (istiod)' --timeout=180s -n istio-system -------------------------------------------------------------------------------- /tests/install_kserve.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | echo "Installing Kserve ..." 4 | curl -s "https://raw.githubusercontent.com/kserve/kserve/release-0.9/hack/quick_install.sh" | bash 5 | -------------------------------------------------------------------------------- /tests/install_kserve_deployment_controller.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | echo "Installing Kserve Deployment Controller ..." 
4 | kubectl create ns staging 5 | kubectl create ns production 6 | kubectl create secret generic github-secret -n mlflow --from-literal=githubtoken=password 7 | kubectl apply -f tests/repo-test/staging/kserve-sa.yaml -n staging 8 | 9 | helm install mdc-staging charts/mlflow-controller -n mlflow --set image.tag=$GITHUB_SHA --set image.pullPolicy=Never --set image.repository=docker.io/tachyongroup/mdc-test --set mlflow.backend=s3 --set gitops.deploymentLocation=staging/ --set mlserver=kserve --set gitops.repository=gitea-http.default.svc.cluster.local:3000/mdcadmin/repo-test --set gitops.protocol=http 10 | 11 | kubectl get deployment -n mlflow 12 | kubectl get cm -n mlflow 13 | kubectl get po -n mlflow 14 | echo "Waiting for Deployment Controller to be ready ..." 15 | export POD_NAME=$(kubectl get pods --namespace mlflow -l "app.kubernetes.io/instance=mdc-staging" -o jsonpath="{.items[0].metadata.name}") 16 | 17 | kubectl describe po $POD_NAME -n mlflow 18 | sleep 180 19 | kubectl logs deployment/mdc-staging-mlflow-controller -n mlflow 20 | #kubectl get inferenceservice --all-namespaces 21 | kubectl get inferenceservice sklearn-iris-minio -n staging -o yaml 22 | 23 | export MLFLOW_S3_ENDPOINT_URL=http://localhost:9000 24 | export AWS_ACCESS_KEY_ID=minioadmin 25 | export AWS_SECRET_ACCESS_KEY=minioadmin 26 | export MLFLOW_TRACKING_URI=http://localhost:5000 27 | python ./tests/mlflow/list_model.py $mlserver 28 | 29 | kubectl wait --for=condition=ready inferenceservice sklearn-iris-miniot -n staging --timeout=380s 30 | kubectl describe inferenceservice sklearn-iris-miniot -n staging -------------------------------------------------------------------------------- /tests/install_mlflow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | echo "Installing Mlflow ..." 
4 | kubectl create ns mlflow 5 | helm repo add minio https://charts.bitnami.com/bitnami 6 | helm install minio minio/minio -n mlflow --set auth.rootUser=minioadmin --set auth.rootPassword=minioadmin --set livenessProbe.enabled=false --set readinessProbe.enabled=false #--set mode=distributed 7 | 8 | export ROOT_USER=$(kubectl get secret --namespace mlflow minio -o jsonpath="{.data.root-user}" | base64 -d) 9 | export ROOT_PASSWORD=$(kubectl get secret --namespace mlflow minio -o jsonpath="{.data.root-password}" | base64 -d) 10 | 11 | kubectl wait --for=condition=ready pod -l 'app.kubernetes.io/name in (minio)' --timeout=380s -n mlflow 12 | kubectl apply -f tests/mlflow-cm.yaml -n mlflow 13 | helm repo add rocket9-code https://rocket9-code.github.io/hello-mlflow 14 | helm install mlflow rocket9-code/mlflow -n mlflow --set artifact.ArtifactRoot=s3://artifacts --set envFromconfigMap=minio-cm --set image.pullPolicy=Always 15 | kubectl get po -n mlflow 16 | export POD_NAME=$(kubectl get pods --namespace mlflow -l "app.kubernetes.io/name=mlflow,app.kubernetes.io/instance=mlflow" -o jsonpath="{.items[0].metadata.name}") 17 | kubectl describe po $POD_NAME -n mlflow 18 | kubectl wait --for=condition=ready pod -l 'app.kubernetes.io/name in (mlflow)' --timeout=380s -n mlflow 19 | -------------------------------------------------------------------------------- /tests/install_seldon_core.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | echo "Installing Seldon Core ..." 4 | kubectl create namespace seldon-system 5 | helm install seldon-core seldon-core-operator \ 6 | --repo https://storage.googleapis.com/seldon-charts \ 7 | --set usageMetrics.enabled=true \ 8 | --set istio.enabled=true \ 9 | --namespace seldon-system 10 | echo "Waiting for Seldon Core to be ready ..." 
11 | kubectl wait --for=condition=ready pod -l 'app in (seldon)' --timeout=180s -n seldon-system -------------------------------------------------------------------------------- /tests/install_seldon_deployment_controller.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | echo "Installing Seldon Deployment Controller ..." 4 | kubectl create ns staging 5 | kubectl create secret generic github-secret -n mlflow --from-literal=githubtoken=password 6 | 7 | helm install mdc-staging charts/mlflow-controller -n mlflow --set image.tag=$GITHUB_SHA --set image.pullPolicy=Never --set image.repository=docker.io/tachyongroup/mdc-test --set mlflow.backend=s3 --set gitops.deploymentLocation=staging/ --set mlserver=seldon --set gitops.repository=gitea-http.default.svc.cluster.local:3000/mdcadmin/repo-test --set gitops.protocol=http 8 | kubectl apply -f tests/repo-test/staging/seldon-secret.yaml -n staging 9 | kubectl get deployment -n mlflow 10 | kubectl get cm -n mlflow 11 | kubectl get po -n mlflow 12 | 13 | echo "Waiting for Deployment Controller to be ready ..." 
14 | export POD_NAME=$(kubectl get pods --namespace mlflow -l "app.kubernetes.io/instance=mdc-staging" -o jsonpath="{.items[0].metadata.name}") 15 | sleep 180 16 | kubectl describe po $POD_NAME -n mlflow 17 | kubectl wait --for=condition=ready pod -l 'app.kubernetes.io/instance in (mdc-staging)' --timeout=380s -n mlflow 18 | 19 | 20 | kubectl describe po $POD_NAME -n mlflow 21 | sleep 180 22 | kubectl logs deployment/mdc-staging-mlflow-controller -n mlflow 23 | kubectl get seldondeployment --all-namespaces 24 | kubectl get seldondeployment mlflow-var-minio -n staging -o yaml 25 | 26 | export MLFLOW_S3_ENDPOINT_URL=http://localhost:9000 27 | export AWS_ACCESS_KEY_ID=minioadmin 28 | export AWS_SECRET_ACCESS_KEY=minioadmin 29 | export MLFLOW_TRACKING_URI=http://localhost:5000 30 | python ./tests/mlflow/list_model.py $mlserver 31 | 32 | 33 | python ./tests/mlflow/test_deploy.py -------------------------------------------------------------------------------- /tests/kind-cluster-1-24.yaml: -------------------------------------------------------------------------------- 1 | # This testing option is available for testing projects that don't yet support k8s 1.25 2 | apiVersion: kind.x-k8s.io/v1alpha4 3 | kind: Cluster 4 | # Configure registry for KinD. 5 | containerdConfigPatches: 6 | - |- 7 | [plugins."io.containerd.grpc.v1.cri".registry.mirrors."$REGISTRY_NAME:$REGISTRY_PORT"] 8 | endpoint = ["http://$REGISTRY_NAME:$REGISTRY_PORT"] 9 | # This is needed in order to support projected volumes with service account tokens. 
10 | # See: https://kubernetes.slack.com/archives/CEKK1KTN2/p1600268272383600 11 | kubeadmConfigPatches: 12 | - | 13 | apiVersion: kubeadm.k8s.io/v1beta2 14 | kind: ClusterConfiguration 15 | metadata: 16 | name: config 17 | apiServer: 18 | extraArgs: 19 | "service-account-issuer": "kubernetes.default.svc" 20 | "service-account-signing-key-file": "/etc/kubernetes/pki/sa.key" 21 | nodes: 22 | - role: control-plane 23 | image: kindest/node:v1.22.15@sha256:7d9708c4b0873f0fe2e171e2b1b7f45ae89482617778c1c875f1053d4cef2e41 24 | - role: worker 25 | image: kindest/node:v1.22.15@sha256:7d9708c4b0873f0fe2e171e2b1b7f45ae89482617778c1c875f1053d4cef2e41 26 | - role: worker 27 | image: kindest/node:v1.22.15@sha256:7d9708c4b0873f0fe2e171e2b1b7f45ae89482617778c1c875f1053d4cef2e41 28 | 29 | - role: worker 30 | image: kindest/node:v1.22.15@sha256:7d9708c4b0873f0fe2e171e2b1b7f45ae89482617778c1c875f1053d4cef2e41 -------------------------------------------------------------------------------- /tests/log_mlflow_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | echo "Installing Mlflow ..." 
4 | pip install mlflow==1.25.1 5 | pip install protobuf==3.20.* 6 | pip install scikit-learn==0.23.2 7 | pip install pandas==0.23.4 8 | pip install boto3==1.22.9 9 | pip install minio 10 | pip install kubernetes 11 | pip install termcolor 12 | export MLFLOW_S3_ENDPOINT_URL=http://localhost:9000 13 | export AWS_ACCESS_KEY_ID=minioadmin 14 | export AWS_SECRET_ACCESS_KEY=minioadmin 15 | export MLFLOW_TRACKING_URI=http://localhost:5000 16 | python ./tests/mlflow/iris.py 1 staging 17 | 18 | -------------------------------------------------------------------------------- /tests/mlflow-cm.yaml: -------------------------------------------------------------------------------- 1 | kind: ConfigMap 2 | apiVersion: v1 3 | metadata: 4 | name: minio-cm 5 | namespace: mlflow 6 | data: 7 | MLFLOW_S3_ENDPOINT_URL: 'http://minio.mlflow.svc.cluster.local' 8 | AWS_ACCESS_KEY_ID: minioadmin 9 | AWS_SECRET_ACCESS_KEY: minioadmin 10 | -------------------------------------------------------------------------------- /tests/mlflow/iris.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import mlflow 4 | import mlflow.sklearn 5 | import pandas as pd 6 | from minio import Minio 7 | from mlflow.tracking import MlflowClient 8 | from sklearn import datasets 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import roc_auc_score 11 | from sklearn.model_selection import train_test_split 12 | 13 | try: 14 | client = Minio( 15 | "localhost:9000", access_key="minioadmin", secret_key="minioadmin", secure=False 16 | ) 17 | 18 | # Create bucket. 
19 | client.make_bucket("artifacts") 20 | policy = '{"Version":"2012-10-17","Statement":[{"Action":["s3:GetBucketLocation","s3:ListBucket","s3:ListBucketMultipartUploads"],"Effect":"Allow","Principal":{"AWS":["*"]},"Resource":["arn:aws:s3:::artifacts"],"Sid":""},{"Action":["s3:AbortMultipartUpload","s3:DeleteObject","s3:GetObject","s3:ListMultipartUploadParts","s3:PutObject"],"Effect":"Allow","Principal":{"AWS":["*"]},"Resource":["arn:aws:s3:::artifacts/*"],"Sid":""}]}' 21 | client.set_bucket_policy(bucket_name="artifacts", policy=policy) 22 | except Exception as e: 23 | print(e) 24 | 25 | 26 | def main(version, stage, MODEL_NAME): 27 | iris = datasets.load_iris() 28 | iris_df = pd.DataFrame(iris.data, columns=iris.feature_names) 29 | y = iris.target 30 | iris_df["target"] = y 31 | 32 | train_df, test_df = train_test_split( 33 | iris_df, test_size=0.3, random_state=42, stratify=iris_df["target"] 34 | ) 35 | X_train = train_df[ 36 | [ 37 | "sepal length (cm)", 38 | "sepal width (cm)", 39 | "petal length (cm)", 40 | "petal width (cm)", 41 | ] 42 | ] 43 | y_train = train_df["target"] 44 | 45 | X_test = test_df[ 46 | [ 47 | "sepal length (cm)", 48 | "sepal width (cm)", 49 | "petal length (cm)", 50 | "petal width (cm)", 51 | ] 52 | ] 53 | y_test = test_df["target"] 54 | 55 | EXPERIMENT_NAME = MODEL_NAME 56 | 57 | # print("IRIS train df shape") 58 | # print(X_train.shape) 59 | # print(y_train.shape) 60 | 61 | # print("IRIS test df shape") 62 | # print(X_test.shape) 63 | # print(y_test.shape) 64 | 65 | mlflow_client = MlflowClient() 66 | 67 | # Create an MLFlow experiment, if not already exists 68 | experiment_details = mlflow_client.get_experiment_by_name(EXPERIMENT_NAME) 69 | 70 | if experiment_details is not None: 71 | experiment_id = experiment_details.experiment_id 72 | else: 73 | experiment_id = mlflow.create_experiment(EXPERIMENT_NAME) 74 | 75 | # Start an MLFlow experiment run 76 | with mlflow.start_run( 77 | experiment_id=experiment_id, run_name="iris dataset rf 
def test():
    """Wait until the staging deployment serves the models registered in MLflow.

    Collects every registered model version currently in the ``Staging``
    stage, then polls the custom resource selected by ``sys.argv[1]``
    (``"seldon"`` or ``"kserve"``) until the model URIs in its spec equal the
    ``source`` of the matching MLflow model versions.

    Raises:
        ValueError: if ``sys.argv[1]`` is neither ``"seldon"`` nor ``"kserve"``.
        TimeoutError: if the module-level ``timeout`` deadline passes first.
    """
    target = sys.argv[1]
    if target not in ("seldon", "kserve"):
        raise ValueError(f"Unsupported model server: {target!r}")

    mlflow_client = MlflowClient()
    mlflow_models_metadata = {}
    for registered_model in mlflow_client.list_registered_models():
        for version in registered_model.latest_versions:
            if version.current_stage != "Staging":
                continue
            model_details = dict(version)
            run_details = dict(mlflow_client.get_run(model_details["run_id"]).info)
            name = model_details["name"]
            mlflow_models_metadata[name] = {
                "name": name,
                "run_id": model_details["run_id"],
                "source": model_details["source"],
                "status": model_details["status"],
                "version": model_details["version"],
                "artifact_uri": run_details["artifact_uri"],
            }

    manifest = None
    while True:
        if target == "seldon":
            manifest = kube_client.get_namespaced_custom_object(
                group="machinelearning.seldon.io",
                version="v1",
                plural="seldondeployments",
                namespace="staging",
                name="mlflow-var-minio",
            )
            graph = manifest["spec"]["predictors"][0]["graph"]
            demo1 = graph["children"][0]["modelUri"]
            demo2 = graph["children"][0]["children"][0]["modelUri"]
            demo3 = graph["children"][1]["modelUri"]
            demo4 = graph["modelUri"]
            # NOTE(review): demo3 is printed but never compared, as in the
            # original check -- confirm whether that omission is deliberate.
            if (
                demo1 == mlflow_models_metadata["iris demo1"]["source"]
                and demo2 == mlflow_models_metadata["iris demo2"]["source"]
                and demo4 == mlflow_models_metadata["iris demo4"]["source"]
            ):
                print(demo1, demo2, demo3, demo4)
                print("test passed", mlflow_models_metadata)
                break
        else:
            manifest = kube_client.get_namespaced_custom_object(
                group="serving.kserve.io",
                version="v1beta1",
                plural="inferenceservices",
                namespace="staging",
                name="sklearn-iris-minio",
            )
            demo2 = manifest["spec"]["predictor"]["model"]["storageUri"]
            if demo2 == mlflow_models_metadata["iris demo2"]["source"]:
                print(demo2)
                print("test passed", mlflow_models_metadata)
                break

        if time.time() > timeout:
            print(mlflow_models_metadata)
            print(manifest)
            print(target)
            # A bare string cannot be raised; use a real exception type.
            raise TimeoutError("Timeout error")
        # Pause between polls instead of hammering the API server.
        time.sleep(1)
import time

from kubernetes import client as KubeClient
from kubernetes import config

# Prefer the local kubeconfig; fall back to the in-cluster service account.
try:
    config.load_kube_config()
except config.ConfigException:
    config.load_incluster_config()
kube_client = KubeClient.CustomObjectsApi()
status = ""
timeout = time.time() + 60 * 10

# Poll the "mlflow" SeldonDeployment in "staging" until it reports
# "Available", or fail once the ten-minute deadline has passed.
while True:
    test = kube_client.get_namespaced_custom_object(
        group="machinelearning.seldon.io",
        version="v1",
        plural="seldondeployments",
        namespace="staging",
        name="mlflow",
    )
    # The status section may be missing until it has been populated.
    deployment_state = test.get("status") or {}
    status = deployment_state.get("state", "")
    print(status)
    if status == "Available":
        break

    print(deployment_state)
    time.sleep(30)
    if time.time() > timeout:
        # Dump the first backing Deployment to help diagnose the failure.
        deployment_names = list(deployment_state.get("deploymentStatus") or {})
        if deployment_names:
            apps_client = KubeClient.AppsV1Api()
            deployment = apps_client.read_namespaced_deployment(
                name=deployment_names[0], namespace="staging"
            )
            print(deployment)
        # A bare string cannot be raised; use a real exception type.
        raise TimeoutError("Timeout error")
#!/usr/bin/env bash
# Port-forward the MLflow service (5000) and MinIO (9000) from the "mlflow"
# namespace to localhost, record both background PIDs in pids.env, then
# create a "test" experiment through the MLflow REST API.
set -euo pipefail

kubectl port-forward -n mlflow svc/mlflow-service 5000:5000 &
MLFLOW_PID=$!

echo "Started mlflow port-forward, pid: $MLFLOW_PID"
echo "MLFLOW_PID=$MLFLOW_PID" >> pids.env

sleep 1


kubectl port-forward --namespace mlflow svc/minio 9000:9000 &
MINIO_PID=$!

# This message previously said "mlflow" for the MinIO forward as well.
echo "Started minio port-forward, pid: $MINIO_PID"
echo "MINIO_PID=$MINIO_PID" >> pids.env

sleep 1

curl -X POST http://localhost:5000/api/2.0/preview/mlflow/experiments/create -d '{"name":"test"}'
seldon-rclone-secret 26 | type: MODEL 27 | children: [] 28 | implementation: MLFLOW_SERVER 29 | modelUri: '{{ mlflow.s3["iris demo4"] }}' 30 | envSecretRefName: seldon-rclone-secret 31 | name: classifier 32 | name: default 33 | replicas: 1 -------------------------------------------------------------------------------- /tests/repo-test/staging/kserve-s3.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "serving.kserve.io/v1beta1" 2 | kind: "InferenceService" 3 | metadata: 4 | name: "sklearn-iris-minio" 5 | spec: 6 | predictor: 7 | serviceAccountName: sa 8 | model: 9 | modelFormat: 10 | name: mlflow 11 | storageUri: '{{ mlflow.s3["iris demo2"] }}' -------------------------------------------------------------------------------- /tests/repo-test/staging/kserve-s3t.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "serving.kserve.io/v1beta1" 2 | kind: "InferenceService" 3 | metadata: 4 | name: "sklearn-iris-miniot" 5 | spec: 6 | predictor: 7 | serviceAccountName: sa 8 | model: 9 | modelFormat: 10 | name: mlflow 11 | storageUri: '{{ mlflow.s3["iris demo2"] }}' -------------------------------------------------------------------------------- /tests/repo-test/staging/kserve-sa.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: s3creds 5 | annotations: 6 | serving.kserve.io/s3-endpoint: minio.mlflow.svc.cluster.local:9000 # replace with your s3 endpoint e.g minio-service.kubeflow:9000 7 | serving.kserve.io/s3-usehttps: "0" # by default 1, if testing with minio you can set to 0 8 | serving.kserve.io/s3-region: "us-east-2" 9 | serving.kserve.io/s3-useanoncredential: "false" # omitting this is the same as false, if true will ignore provided credential and use anonymous credentials 10 | type: Opaque 11 | stringData: # use `stringData` for raw credential string or `data` for 
base64 encoded string 12 | AWS_ACCESS_KEY_ID: minioadmin 13 | AWS_SECRET_ACCESS_KEY: minioadmin 14 | --- 15 | 16 | apiVersion: v1 17 | kind: ServiceAccount 18 | metadata: 19 | name: sa 20 | secrets: 21 | - name: s3creds -------------------------------------------------------------------------------- /tests/repo-test/staging/seldon-s3.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: machinelearning.seldon.io/v1 2 | kind: SeldonDeployment 3 | metadata: 4 | name: mlflow-var-minio 5 | spec: 6 | name: iris 7 | predictors: 8 | - graph: 9 | children: 10 | - name: step-one 11 | modelUri: '{{ mlflow.s3["iris demo1"] }}' 12 | envSecretRefName: seldon-rclone-secret 13 | implementation: MLFLOW_SERVER 14 | type: MODEL 15 | children: 16 | - name: step-two 17 | modelUri: '{{ mlflow.s3["iris demo2"] }}' 18 | envSecretRefName: seldon-rclone-secret 19 | implementation: MLFLOW_SERVER 20 | type: MODEL 21 | children: [] 22 | - name: step-three 23 | implementation: MLFLOW_SERVER 24 | modelUri: '{{ mlflow.s3["iris demo3"] }}' 25 | envSecretRefName: seldon-rclone-secret 26 | type: MODEL 27 | children: [] 28 | implementation: MLFLOW_SERVER 29 | modelUri: '{{ mlflow.s3["iris demo4"] }}' 30 | envSecretRefName: seldon-rclone-secret 31 | logger: 32 | url: http://broker-ingress.knative-eventing.svc.cluster.local/demo/default 33 | mode: all 34 | name: classifier 35 | name: default 36 | replicas: 1 -------------------------------------------------------------------------------- /tests/repo-test/staging/seldon-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: seldon-init-container-secret 5 | namespace: staging 6 | type: Opaque 7 | stringData: 8 | RCLONE_CONFIG_S3_TYPE: s3 9 | RCLONE_CONFIG_S3_PROVIDER: minio 10 | RCLONE_CONFIG_S3_ACCESS_KEY_ID: minioadmin 11 | RCLONE_CONFIG_S3_SECRET_ACCESS_KEY: minioadmin 12 | RCLONE_CONFIG_S3_ENDPOINT: 
http://minio.mlflow.svc.cluster.local:9000 13 | RCLONE_CONFIG_S3_ENV_AUTH: "false" -------------------------------------------------------------------------------- /tests/repo-test/staging/seldon-single-model.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: machinelearning.seldon.io/v1 2 | kind: SeldonDeployment 3 | metadata: 4 | name: mlflow 5 | spec: 6 | name: iris 7 | predictors: 8 | - componentSpecs: 9 | - spec: 10 | containers: 11 | - name: classifier 12 | livenessProbe: 13 | initialDelaySeconds: 80 14 | failureThreshold: 200 15 | periodSeconds: 25 16 | successThreshold: 1 17 | httpGet: 18 | path: /health/ping 19 | port: http 20 | scheme: HTTP 21 | readinessProbe: 22 | initialDelaySeconds: 80 23 | failureThreshold: 20 24 | periodSeconds: 25 25 | successThreshold: 1 26 | httpGet: 27 | path: /health/ping 28 | port: http 29 | scheme: HTTP 30 | graph: 31 | implementation: MLFLOW_SERVER 32 | modelUri: '{{ mlflow.s3["iris demo3"] }}' 33 | envSecretRefName: seldon-init-container-secret 34 | name: classifier 35 | name: default 36 | replicas: 1 -------------------------------------------------------------------------------- /tests/setup_git_repo.sh: -------------------------------------------------------------------------------- 1 | curl -X 'POST' \ 2 | 'http://localhost:3000/api/v1/user/repos' \ 3 | -H 'accept: application/json' \ 4 | -H 'authorization: Basic bWRjYWRtaW46cGFzc3dvcmQ=' \ 5 | -H 'Content-Type: application/json' \ 6 | -d '{ 7 | "auto_init": false, 8 | "default_branch": "main", 9 | "description": "demo", 10 | "name": "repo-test", 11 | "private": false, 12 | "template": false, 13 | "trust_model": "default" 14 | }' 15 | 16 | git config --global user.email "mdcadmin@example.com" 17 | git config --global user.name "mdcadmin" 18 | cd tests/repo-test 19 | git init 20 | git add . 
21 | git checkout -b main 22 | git commit -m "first commit" 23 | git remote add origin "http://mdcadmin:password@localhost:3000/mdcadmin/repo-test" 24 | git push -u origin main 25 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # it's not a bug that we aren't using all of hacking, ignore: 3 | # F812: list comprehension redefines ... 4 | # H101: Use TODO(NAME) 5 | # H202: assertRaises Exception too broad 6 | # H233: Python 3.x incompatible use of print operator 7 | # H301: one import per line 8 | # H306: imports not in alphabetical order (time, os) 9 | # H401: docstring should not start with a space 10 | # H403: multi line docstrings should end on a new line 11 | # H404: multi line docstring should start without a leading new line 12 | # H405: multi line docstring summary not separated with an empty line 13 | # H501: Do not use self.__dict__ for string formatting 14 | ignore = E501,W503 15 | -------------------------------------------------------------------------------- /ui/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3:4.11.0 2 | COPY requirements.txt requirements.txt 3 | RUN pip install -r requirements.txt 4 | WORKDIR /ui 5 | COPY . 
def dataf():
    """Build the table of SeldonDeployments managed by the controller.

    Lists every SeldonDeployment in the cluster labelled
    ``app.kubernetes.io/managed-by=mdc`` and renders name, namespace, replica
    count and state as a Bootstrap table. Each name links to ``/seldon/<name>``.

    Returns:
        The ``dbc.Table`` built from the collected rows.
    """
    v1 = kube_client.CustomObjectsApi()
    manifests = v1.list_cluster_custom_object(
        group="machinelearning.seldon.io",
        version="v1",
        plural="seldondeployments",
        label_selector="app.kubernetes.io/managed-by=mdc",
    )
    model_name = []
    namespace = []
    state = []
    replicas = []
    for item in manifests["items"]:
        metadata = item["metadata"]
        # The status section may be missing or incomplete; show placeholders
        # instead of failing the whole table refresh.
        status = item.get("status") or {}
        deployment_status = status.get("deploymentStatus") or {}
        first_deployment = next(iter(deployment_status.values()), {})

        model_name.append(metadata["name"])
        namespace.append(metadata["namespace"])
        state.append(status.get("state", "Unknown"))
        replicas.append(first_deployment.get("replicas", 0))

    df = pd.DataFrame(
        {
            "models": model_name,
            "namespace": namespace,
            "replicas": replicas,
            "state": state,
        }
    )
    df["models"] = [dcc.Link(f"{i}", href=f"/seldon/{i}") for i in df.models.values]
    return dbc.Table.from_dataframe(df, striped=True, bordered=True, hover=True)
139 | Output("collapse4", "is_open"), 140 | Output("collapse5", "is_open"), 141 | Output("collapse-button0", "n_clicks"), 142 | Output("collapse-button1", "n_clicks"), 143 | Output("collapse-button2", "n_clicks"), 144 | Output("collapse-button3", "n_clicks"), 145 | Output("collapse-button4", "n_clicks"), 146 | Output("collapse-button5", "n_clicks"), 147 | [ 148 | Input("collapse-button0", "n_clicks"), 149 | Input("collapse-button1", "n_clicks"), 150 | Input("collapse-button2", "n_clicks"), 151 | Input("collapse-button3", "n_clicks"), 152 | Input("collapse-button4", "n_clicks"), 153 | Input("collapse-button5", "n_clicks"), 154 | ], 155 | ) 156 | def toggle_collapse(n, n1, n2, n3, n4, n5): 157 | if n: 158 | return True, False, False, False, False, False, 0, 0, 0, 0, 0, 0 159 | if n1: 160 | return False, True, False, False, False, False, 0, 0, 0, 0, 0, 0 161 | if n2: 162 | return False, False, True, False, False, False, 0, 0, 0, 0, 0, 0 163 | if n3: 164 | return False, False, False, True, False, False, 0, 0, 0, 0, 0, 0 165 | if n4: 166 | return False, False, False, False, True, False, 0, 0, 0, 0, 0, 0 167 | if n5: 168 | return False, False, False, False, False, True, 0, 0, 0, 0, 0, 0 169 | return False, False, False, False, False, False, 0, 0, 0, 0, 0, 0 170 | 171 | 172 | if __name__ == "__main__": 173 | app.run_server(host="0.0.0.0", port=8000, debug=False) 174 | -------------------------------------------------------------------------------- /ui/pages/deployments.py: -------------------------------------------------------------------------------- 1 | from dash import dcc, html, register_page 2 | from kubernetes import config 3 | 4 | register_page(__name__, path="/") 5 | 6 | 7 | try: 8 | config.load_kube_config() 9 | except config.ConfigException: 10 | config.load_incluster_config() 11 | 12 | 13 | layout = html.Div( 14 | [ 15 | html.H5( 16 | "Seldon Deployments", 17 | className="mt-5", 18 | ), 19 | dcc.Interval( 20 | id="interval-component", interval=1 * 1000, 
n_intervals=0 # in milliseconds 21 | ), 22 | html.Div(id="table-deployments"), 23 | ] 24 | ) 25 | -------------------------------------------------------------------------------- /ui/pages/logs.py: -------------------------------------------------------------------------------- 1 | import dash 2 | import dash_core_components as dcc 3 | import dash_html_components as html 4 | from kubernetes import config 5 | 6 | try: 7 | config.load_kube_config() 8 | except config.ConfigException: 9 | config.load_incluster_config() 10 | 11 | 12 | def title(): 13 | return "Logs" 14 | 15 | 16 | def description(ticker=None): 17 | return "Controller Logs" 18 | 19 | 20 | dash.register_page( 21 | __name__, 22 | path_template="/logs", 23 | title=title, 24 | description=description, 25 | path="/logs", 26 | ) 27 | 28 | 29 | def layout(ticker=None, **other_unknown_query_strings): 30 | return html.Div( 31 | [ 32 | html.Div(id="live-graph"), 33 | dcc.Interval(id="graph-update", interval=1 * 10000, n_intervals=0), 34 | ] 35 | ) 36 | -------------------------------------------------------------------------------- /ui/pages/not_found_404.py: -------------------------------------------------------------------------------- 1 | import dash 2 | from dash import html 3 | 4 | dash.register_page(__name__, path="/404") 5 | 6 | 7 | layout = html.H1("404 Not found") 8 | -------------------------------------------------------------------------------- /ui/pages/seldon.py: -------------------------------------------------------------------------------- 1 | import dash 2 | from kubernetes import config 3 | from seldon_deployments.card import card_layout 4 | 5 | try: 6 | config.load_kube_config() 7 | except config.ConfigException: 8 | config.load_incluster_config() 9 | 10 | 11 | def title(ticker=None): 12 | return f"{ticker} Status" 13 | 14 | 15 | def description(ticker=None): 16 | return f"Deployment status {ticker}" 17 | 18 | 19 | dash.register_page( 20 | __name__, 21 | path_template="/seldon/", 22 | 
def _bold_label(text):
    """Return the bold caption that prefixes a list-group row."""
    return html.A(text, style={"fontWeight": "bold"})


def _text_item(label, value):
    """Return a list-group row showing ``label`` followed by ``value``."""
    return dbc.ListGroupItem([_bold_label(label), html.A(value)])


def _endpoint_item(label, element_id, url):
    """Return a list-group row with a clickable URL and a copy button."""
    return dbc.ListGroupItem(
        [
            _bold_label(label),
            html.A(id=element_id, href=url, children=url, target="_blank"),
            html.A(" "),
            dcc.Clipboard(
                target_id=element_id,
                title="copy",
                style={
                    "display": "inline-block",
                    "fontSize": 20,
                    "verticalAlign": "top",
                },
            ),
        ]
    )


def _condition_collapse(index, condition):
    """Return the button and collapsible card for one status condition."""
    color = "secondary" if condition["status"] == "False" else "success"
    condition_type = condition["type"]
    # Fall back to the condition type when no reason is given.
    reason = condition.get("reason", condition_type)
    return html.Div(
        [
            dbc.Button(
                condition_type,
                id=f"collapse-button{index}",
                className="mb-3",
                color=color,
                n_clicks=0,
            ),
            dbc.Collapse(
                dbc.Card(dbc.CardBody(reason)),
                id=f"collapse{index}",
                is_open=False,
            ),
        ]
    )


def _model_card(model):
    """Return the card describing one MLflow model used by the deployment."""
    return dbc.Card(
        [
            dbc.CardBody(
                [
                    # NOTE(review): every card reuses this id. It is kept
                    # unchanged because callbacks elsewhere may target it --
                    # confirm it behaves with more than one model.
                    html.H4(
                        model["name"], id="seldon-deployment", className="card-title"
                    ),
                    dbc.ListGroup(
                        [
                            _text_item("Run id: ", model["run_id"]),
                            _text_item("Source: ", model["source"]),
                            _text_item("Version: ", model["version"]),
                            _text_item("Artifact Uri: ", model["artifact_uri"]),
                        ]
                    ),
                ]
            ),
        ],
    )


def card_layout(deploy_name=None):
    """Build the detail page for one SeldonDeployment.

    Fetches the deployment through ``dataf`` using ``GLOBAL_NAMESPACE`` and
    ``SELDON_URL`` and returns a layout with an "Overview" tab (endpoints,
    status and one collapsible card per status condition), a "Model Details"
    tab, a "Yaml" tab with the manifest, and a "Doc" tab embedding the
    external URL when the deployment is ``Available``.

    Args:
        deploy_name: name of the SeldonDeployment to display.

    Returns:
        The ``html.Div`` holding the page, including the one-second
        ``interval-component-seldon`` timer.
    """
    (
        model_manifests,
        name,
        external_url,
        internal_url,
        status,
        status_message,
        status_reason,
        status_button,
        manifest,
    ) = dataf(name=deploy_name, namespace=GLOBAL_NAMESPACE, seldon_url=SELDON_URL)

    # The status section may be missing or have no conditions.
    conditions = (manifest.get("status") or {}).get("conditions") or []
    collapses = [
        _condition_collapse(index, condition)
        for index, condition in enumerate(conditions)
    ]

    # Round-trip through JSON to get plain containers. The previous
    # ast.literal_eval(json.dumps(...)) failed on true/false/null.
    plain_manifest = json.loads(json.dumps(manifest))
    manifest_yaml = yaml.safe_dump(plain_manifest, default_flow_style=False)
    # The newline after the info string keeps the first YAML line in the block.
    code = f"```yaml\n{manifest_yaml}```"

    model_cards = [_model_card(model) for model in model_manifests]

    overview_tab = dcc.Tab(
        label="Overview",
        children=[
            dbc.Card(
                dbc.ListGroup(
                    [
                        _endpoint_item(
                            "External Endpoint: ", "external_url", external_url
                        ),
                        _endpoint_item(
                            "Internal Endpoint: ", "internal_url", internal_url
                        ),
                        _text_item("Status Message: ", status_message),
                        _text_item("Status Reason: ", status_reason),
                        dbc.ListGroupItem([_bold_label("Status: "), status_button]),
                    ],
                    flush=True,
                ),
            )
        ]
        + collapses,
    )

    tabs = [
        overview_tab,
        dcc.Tab(label="Model Details", children=model_cards),
        dcc.Tab(label="Yaml", children=[dcc.Markdown(code)]),
    ]
    if status == "Available":
        tabs.append(
            dcc.Tab(
                label="Doc",
                children=[
                    html.Iframe(
                        src=external_url, style={"height": "1067px", "width": "100%"}
                    )
                ],
            )
        )

    return html.Div(
        [
            dash.html.H3(f"{name}"),
            dcc.Tabs(tabs),
            dcc.Interval(
                id="interval-component-seldon",
                interval=1 * 1000,  # in milliseconds
                n_intervals=0,
            ),
        ]
    )
import ast

import dash_bootstrap_components as dbc
from dash import html
from kubernetes import client as KubeClient
from kubernetes import config

# Prefer a local kubeconfig (developer machine); fall back to the in-cluster
# service-account configuration when no kubeconfig can be loaded.
try:
    config.load_kube_config()
except config.ConfigException:
    config.load_incluster_config()


def _waiting_failure(container_status, marker):
    """Return ``(reason, message)`` for a stalled container, else ``None``.

    A container counts as stalled when it is not started or not ready, it is
    in the ``waiting`` state, and its waiting message contains *marker*.
    """
    if not (container_status.started is False or container_status.ready is False):
        return None
    state = container_status.state
    # A container that is running-but-not-ready has no ``waiting`` state.
    waiting = state.waiting if state is not None else None
    if waiting is None or waiting.message is None or marker not in waiting.message:
        return None
    return waiting.reason, waiting.message


def pod_status(namespace, deploy_name):
    """Report a problem status for the pods in *namespace*.

    Args:
        namespace: Kubernetes namespace whose pods are listed.
        deploy_name: Value of the ``app`` label identifying the pods of the
            deployment of interest.

    Returns:
        * ``(pod_name, message)`` for the first pod that has neither container
          nor init-container statuses; ``message`` is the message of its first
          condition (``None`` when the pod reports no conditions).
        * ``(pod_name, "CrashLoopBackOff", message)`` for the first pod labelled
          ``app=deploy_name`` whose first container or first init container is
          waiting in ``CrashLoopBackOff``.
        * ``None`` when no pod matches either case.
    """
    v1 = KubeClient.CoreV1Api()
    api_response = v1.list_namespaced_pod(namespace)
    for pod in api_response.items:
        container_statuses = pod.status.container_statuses
        init_statuses = pod.status.init_container_statuses

        # NOTE(review): this early return is not filtered by ``deploy_name``;
        # kept as in the original -- confirm whether it should be.
        if container_statuses is None and init_statuses is None:
            conditions = pod.status.conditions or []
            message = conditions[0].message if conditions else None
            return (pod.metadata.name, message)

        # Compare the label of the pod being inspected (not of the first pod
        # in the list) and tolerate pods without an ``app`` label.
        labels = pod.metadata.labels or {}
        if labels.get("app") != deploy_name:
            continue

        status = pod.status.phase
        message = None

        if container_statuses:
            failure = _waiting_failure(container_statuses[0], "Error")
            if failure is not None:
                status, message = failure

        if init_statuses:
            failure = _waiting_failure(init_statuses[0], "failed")
            if failure is not None:
                status, message = failure
        else:
            print("No init container found")

        if status == "CrashLoopBackOff":
            # ``message`` always belongs to the container that produced
            # ``status``, so it cannot come from an unrelated container.
            return (pod.metadata.name, status, message)
    return None


def dataf(
    name="mlflow-var", namespace="staging", seldon_url="https://seldon.mlops.wianai.com"
):
    """Collect the data needed to render one SeldonDeployment.

    Args:
        name: Name of the SeldonDeployment custom object.
        namespace: Namespace the SeldonDeployment lives in.
        seldon_url: Base URL used to build the external documentation endpoint.

    Returns:
        A 9-tuple ``(model, name, external_url, internal_url, status,
        status_message, status_reason, status_button, manifest)`` where
        ``model`` is the list of parsed ``mdc`` annotations, ``status_button``
        is a disabled ``dbc.Button`` reflecting the deployment state and
        ``manifest`` is the raw custom object.

    Raises:
        KeyError: If the manifest lacks ``status.address.url`` or
            ``status.deploymentStatus``.
    """
    custom_api = KubeClient.CustomObjectsApi()
    manifest = custom_api.get_namespaced_custom_object(
        group="machinelearning.seldon.io",
        version="v1",
        plural="seldondeployments",
        namespace=namespace,
        name=name,
    )

    # A deployment without annotations simply has no model details.
    annotations = manifest["metadata"].get("annotations") or {}
    print(annotations.keys())
    # Annotation values are presumably Python-literal strings written by the
    # controller -- verify against the writer; ``literal_eval`` never executes
    # code.
    model = [
        ast.literal_eval(value)
        for key, value in annotations.items()
        if "mdc" in key and "mlflow-stage" not in key
    ]

    name = manifest["metadata"]["name"]
    external_url = f"{seldon_url}/seldon/{namespace}/{name}/api/v1.0/doc/"
    internal_url = manifest["status"]["address"]["url"]
    deploy_name = list(manifest["status"]["deploymentStatus"].keys())[0]

    apps_api = KubeClient.AppsV1Api()
    deployment = apps_api.read_namespaced_deployment(
        name=deploy_name, namespace=namespace
    )

    # Defaults so the return below is always well-defined, even when the
    # deployment reports no (matching) conditions yet.
    status = ""
    status_message = ""
    status_reason = ""
    status_button = dbc.Button(
        [html.I(className="bi bi-question-circle-fill me-2"), " Unknown"],
        color="secondary",
        disabled=True,
    )

    for condition in deployment.status.conditions or []:
        if condition.type == "Available" and condition.status == "True":
            status = "Available"
            status_button = dbc.Button(
                [html.I(className="bi bi-check-circle-fill me-2"), " Available"],
                color="success",
                disabled=True,
            )
            status_message = condition.message
            status_reason = condition.reason

        # Once the deployment is known to be available, a later
        # "Progressing" condition must not override that.
        if status != "Available":
            if condition.type == "Progressing" and condition.status == "True":
                status = "Progressing"
                status_message = condition.message
                status_reason = condition.reason
                status_button = dbc.Button(
                    [dbc.Spinner(size="sm"), " Progressing..."],
                    color="primary",
                    disabled=True,
                )
            elif condition.type == "Progressing" and condition.status == "False":
                status = condition.reason
                status_message = condition.message
                status_reason = condition.reason
                status_button = dbc.Button(
                    [html.I(className="bi bi-x-octagon-fill me-2"), " Failed"],
                    color="danger",
                    disabled=True,
                )

    return (
        model,
        name,
        external_url,
        internal_url,
        status,
        status_message,
        status_reason,
        status_button,
        manifest,
    )