├── .github ├── CODEOWNERS └── workflows │ └── stale.yml ├── LICENSE ├── README.md ├── architecture-decision-records ├── ODH-ADR-0000-template.md ├── ODH-ADR-0001-use-architecture-decision-records-for-open-data-hub.md ├── ODH-ADR-0002-data-science-pipelines-multi-user-approach.md ├── ODH-ADR-0003-use-apache-2-0-licence.md ├── ODH-ADR-0005-github-labels-standards.md ├── ODH-ADR-0006-organization-membership-automation.md ├── README.md ├── data-science-pipelines │ └── ODH-ADR-DSP-0001-data-science-pipelines-upgrade-testing-strategy.md ├── distributed-workloads │ └── ODH-ADR-DW-0001-determine-codeflare-deployment-strategy.md ├── explainability │ └── ODH-ADR-XAI-0001-trustyaiservice-database-configuration.md ├── model-serving │ ├── ODH-ADR-MS-0001-kserve-private-network-in-cluster.md │ └── images │ │ ├── ODH-ADR-MS-0001-kserve-private-network-in-cluster-img-1.png │ │ └── ODH-ADR-MS-0001-kserve-private-network-in-cluster-img-2.png └── operator │ ├── ODH-ADR-0004-odh-trusted-ca-configmap.md │ ├── ODH-ADR-Operator-0001-distributed-manifests.md │ ├── ODH-ADR-Operator-0002-operator-scope.md │ ├── ODH-ADR-Operator-0003-component-integration.md │ ├── ODH-ADR-Operator-0005-configure-resources.md │ ├── ODH-ADR-Operator-0006-internal-api.md │ ├── ODH-ADR-Operator-0007-auth-crd.md │ ├── ODH-ADR-Operator-0007-components-version-mapping.md │ ├── ODH-ADR-Operator-0008-resources-lifecycle.md │ └── assets │ └── ODH-ADR-Operator-0006 │ ├── odh-operator-current.png │ └── odh-operator-next.png └── documentation ├── README.md ├── arch-overview.md ├── components ├── dashboard │ ├── .gitkeep │ ├── README.md │ ├── assets │ │ ├── Dashboard Arch Diagrams.drawio │ │ ├── featureFlags.drawio │ │ └── featureFlags.png │ ├── configuringDashboard.md │ ├── dashboardStorage.md │ ├── features │ │ ├── README.md │ │ ├── connections.md │ │ ├── modelCatalog.md │ │ └── modelRegistry.md │ └── k8sLabelsAndAnnotations.md ├── devops │ └── .gitkeep ├── distributed-workload │ └── .gitkeep ├── edge │ └── .gitkeep ├── explainability │ ├── .gitkeep │ ├── README.md │ ├── diagram.drawio │ └── diagram.png ├── feature_store │ ├── README.md │ └── images │ │ ├── feature-store-overview.drawio │ │ └── feature-store-overview.jpg ├── model-registry │ ├── .gitkeep │ ├── README.md │ ├── images │ │ ├── model-registry-connections.png │ │ ├── model-registry-deployment-model.png │ │ ├── model-registry-logical-model.png │ │ ├── model-registry-overview.jpg │ │ └── model-registry-tenancy-model.png │ └── model-registry-tenancy.md ├── pipelines │ ├── .gitkeep │ ├── README.md │ ├── dsp v2 architecture.drawio │ ├── dsp-v2-architecture.drawio.png │ └── dsp-v2-high-level-architecture.png ├── platform │ ├── .gitkeep │ ├── Authorization in Service Mesh.png │ ├── Platform Architecture Overview.png │ └── README.md ├── serving │ ├── .gitkeep │ ├── README.md │ ├── modelserving-architecture-High-Level Components Architecture.jpg │ └── modelserving-kserve-architecture.drawio └── workbenches │ ├── .gitkeep │ ├── README.md │ ├── high-level-workbench-arch.drawio.png │ ├── notebook-controller.drawio.png │ ├── rstudio-imagestream.drawio.png │ └── workbenches-imagestreams.drawio.png ├── diagram ├── README.MD ├── RHOAI Architecture.drawio └── RHOAI_Network_Architecture.drawio ├── enhancements └── .gitkeep └── images ├── RHOAI Architecture - D1 - Operator.png ├── RHOAI Architecture - D2 - DSP.png ├── RHOAI Architecture - D3 - Workbenches.png ├── RHOAI Architecture - D4 - Dashboard.png ├── RHOAI Architecture - D5 - Distr Workloads.png ├── RHOAI Architecture - D6a - Model Serving.png 
├── RHOAI Architecture - D6b - Model Serving.png ├── RHOAI Architecture - D6c - Model Serving.png ├── RHOAI Architecture - D7 - Trusty.png ├── RHOAI Architecture - D9 - Feature Store.png ├── RHOAI Architecture-Overview.drawio.png ├── RHODS Architecture - Network Diagram.png └── network ├── Dashboard.png ├── DataScienePipelines.png ├── DistributedWorkloads_KubeFlow_Training_Operator.png ├── DistributedWorkloads_KubeRay.png ├── ModelRegistry.png ├── ModelServing.png ├── TrustyAI.png └── Workbenches.png /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Assigning ownership for specific files or directories 2 | # Syntax: 3 | 4 | # Assigning ownership for all files in a directory 5 | # Syntax: /* 6 | 7 | # Assigning ownership for all files in a repository 8 | # Syntax: * 9 | 10 | * @opendatahub-io/architects 11 | 12 | /documentation/components/dashboard/ @opendatahub-io/architects @opendatahub-io/exploring-team 13 | /documentation/components/devops/ @opendatahub-io/architects @opendatahub-io/platform 14 | /documentation/components/distributed-workloads/ @opendatahub-io/architects @opendatahub-io/training-experimentation 15 | /documentation/components/edge/ @opendatahub-io/architects @opendatahub-io/platform 16 | /documentation/components/explainability/ @opendatahub-io/architects @opendatahub-io/model-serving 17 | /documentation/components/model-registry/ @opendatahub-io/architects @opendatahub-io/model-serving 18 | /documentation/components/pipelines/ @opendatahub-io/architects @opendatahub-io/training-experimentation 19 | /documentation/components/platform/ @opendatahub-io/architects @opendatahub-io/platform 20 | /documentation/components/serving/ @opendatahub-io/architects @opendatahub-io/model-serving 21 | /documentation/components/workbenches/ @opendatahub-io/architects @opendatahub-io/exploring-team 22 | 23 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time. 2 | # 3 | # You can adjust the behavior by modifying this file. 4 | # For more information, see: 5 | # https://github.com/actions/stale 6 | name: Mark stale issues and pull requests 7 | 8 | on: 9 | schedule: 10 | - cron: '00 03 * * *' 11 | workflow_dispatch: 12 | 13 | jobs: 14 | stale: 15 | 16 | runs-on: ubuntu-latest 17 | permissions: 18 | issues: write 19 | pull-requests: write 20 | 21 | steps: 22 | - uses: actions/stale@v5 23 | with: 24 | # for now, we will only label PRs as stale, not issues 25 | days-before-issue-stale: -1 26 | days-before-pr-stale: 21 27 | days-before-pr-close: 7 28 | stale-pr-message: 'This PR is stale because it has been open 21 days with no activity. Remove stale label or comment or this will be closed in 7 days.' 29 | close-pr-message: 'This PR was closed because it has been stale for 21+7 days with no activity.' 30 | stale-pr-label: 'Stale' 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Open Data Hub Architecture Documents 2 | 3 | This repository contains all the Architecture Decision Records as well as the Architecture Documentation for Open Data Hub and OpenShift AI 4 | 5 | ## Architecture Decision Records 6 | [Architecture Decision Records](architecture-decision-records) 7 | 8 | ## Architecture Documentation 9 | [Documentation](documentation) 10 | -------------------------------------------------------------------------------- /architecture-decision-records/ODH-ADR-0000-template.md: -------------------------------------------------------------------------------- 1 | # Open Data Hub - Architecture Decision Record template 2 | 3 | 4 | 5 | 6 | 7 | | | | 8 | | -------------- | ---------- | 9 | | Date | insert data | 10 | | Scope | | 11 | | Status | Approved | 12 | | Authors | [name](@github-username) | 13 | | Supersedes | N/A | 14 | | Superseded by: | N/A | 15 | | Tickets | | 16 | | Other docs: | none | 17 | 18 | ## What 19 | 20 | A couple sentences describing what this ADR is about. 21 | ## Why 22 | 23 | A couple sentences describing why we need an ADR for this. 24 | 25 | ## Goals 26 | 27 | * Bulleted list of goals 28 | 29 | ## Non-Goals 30 | 31 | * Bulleted list of non-goals 32 | * 33 | ## How 34 | 35 | A couple sentences describing the high level approach that this ADR captures. 36 | 37 | ## Open Questions 38 | 39 | Optional section, hopefully removed before transitioning from Draft/Proposed to Accepted. 40 | 41 | ## Alternatives 42 | 43 | Carefully describe the alternatives considered, and specifically document what the tradeoffs of each approach are. 44 | 45 | ## Security and Privacy Considerations 46 | 47 | Optional section. Talk about any security and privacy concerns here. 48 | 49 | ## Risks 50 | 51 | Optional section. Talk about any risks here. 52 | 53 | ## Stakeholder Impacts 54 | 55 | | Group | Key Contacts | Date | Impacted? | 56 | | ----------------------------- | ---------------- | ---------- | --------- | 57 | | group or team name | key contact name | date | ? 
| 58 | 59 | 60 | ## References 61 | 62 | * optional bulleted list 63 | 64 | ## Reviews 65 | 66 | | Reviewed by | Date | Notes | 67 | | ----------------------------- | --------- | ------| 68 | | name | date | ? | 69 | -------------------------------------------------------------------------------- /architecture-decision-records/ODH-ADR-0001-use-architecture-decision-records-for-open-data-hub.md: -------------------------------------------------------------------------------- 1 | # Use Architecture Decision Records for Open Data Hub 2 | 3 | | | | 4 | | -------------- | ---------- | 5 | | Date | 2023-02-20 | 6 | | Scope | Open Data Hub | 7 | | Status | Draft | 8 | | Authors | [Greg Sheremeta](@gregsheremeta) | 9 | | Supersedes | N/A | 10 | | Superseded by: | N/A | 11 | | Tickets | | 12 | | Other docs: | none | 13 | 14 | # Open Data Hub Architecture Decision Records 15 | 16 | "Documenting architectural decisions helps a project succeed by helping current and future contributors understand the reasons for doing things a certain way." [1] 17 | 18 | ## What is an ADR? 19 | 20 | An architecture decision record is a short text file in a Markdown format. Each record describes a set of forces and a single decision in response to those forces. [2] 21 | 22 | An ADR is not a technical design, a team-level internal procedure, or a roadmap. An ADR does not replace detailed technical design documents or good commit messages. 23 | 24 | ## How 25 | 26 | We will keep each ADR in a short text file in Markdown format. 27 | 28 | We will keep ADRs in this repository, https://github.com/opendatahub-io/adr. 29 | 30 | ADRs will be numbered sequentially and monotonically. Numbers will not be reused. 31 | 32 | If a decision is reversed, we will keep the old one around, but mark it as superseded. (It's still relevant to know that it was the decision, but is no longer the decision.) 33 | 34 | We will use a format with just a few parts, so each document is easy to digest. 35 | 36 | ## Approval Process / Flow 37 | 38 | TBD 39 | 40 | ## Reviews 41 | 42 | | Reviewed by | Date | Notes | 43 | | ----------------------------- | --------- | ------| 44 | 45 | ## References 46 | 47 | * https://www.redhat.com/architect/architecture-decision-records 48 | * https://cognitect.com/blog/2011/11/15/documenting-architecture-decisions 49 | * https://github.com/operate-first/blueprint/tree/main/adr 50 | * https://adr.github.io/ 51 | * https://docs.aws.amazon.com/prescriptive-guidance/latest/architectural-decision-records/adr-process.html 52 | 53 | ## Citations 54 | 55 | * [1] Heiko W. 
Rupp, https://www.redhat.com/architect/architecture-decision-records 56 | * [2] Michael Nygard, https://cognitect.com/blog/2011/11/15/documenting-architecture-decisions 57 | -------------------------------------------------------------------------------- /architecture-decision-records/ODH-ADR-0002-data-science-pipelines-multi-user-approach.md: -------------------------------------------------------------------------------- 1 | # Data Science Pipelines Multi-User Approach 2 | 3 | | | | 4 | | -------------- | ---------- | 5 | | Date | 2023-02-20 | 6 | | Scope | Data Science Pipelines Project, within Open Data Hub | 7 | | Status | Draft | 8 | | Authors | [Greg Sheremeta](@gregsheremeta) | 9 | | Supersedes | N/A | 10 | | Superseded by: | N/A | 11 | | Tickets | | 12 | | Other docs: | none | 13 | 14 | ## What 15 | 16 | This decision document is proposing an approach for integrating [Data Science Pipelines](https://github.com/opendatahub-io/data-science-pipelines) (upstream = [Kubeflow Pipelines](https://github.com/kubeflow/kfp-tekton)) in a multi-user environment. 17 | 18 | ## Why 19 | 20 | Out of the box, [Kubeflow Pipelines](https://www.kubeflow.org/docs/components/pipelines/) comes with [multi-user isolation](https://www.kubeflow.org/docs/components/pipelines/v1/overview/multi-user/) that utilizes the Kubeflow concept of [Profiles](https://www.kubeflow.org/docs/components/multi-tenancy/overview/). We’re not using Profiles in Open Data Hub. Additionally, Kubeflow Pipelines [technical implementation](https://www.kubeflow.org/docs/components/multi-tenancy/design/) for multi-user isolation requires the installation of Istio and Dex. Istio in particular is a heavy dependency that we’d rather not require either. 21 | 22 | ## Goals 23 | 24 | * Time pressure – implement a working solution for multi-user isolation and have it production ready by the beginning of April 2023. 25 | * Implement a solution that takes into account supportability, operability, and SLAs in a managed services environment. We prefer a solution that will page SREs less over one that could potentially page SREs more. 26 | * Utilize well-known Kubernetes concepts (RBAC, namespaces) to implement multi-user isolation. 27 | * Ensure that our solution doesn’t make it difficult to migrate to Kubeflow Pipelines V2 when it’s released. 28 | * Ensure that our solution stays consistent enough with upstream such that we can rebase on upstream frequently and that we can eventually make meaningful contributions to upstream. 29 | 30 | ## Non-Goals 31 | 32 | * No plans to make Data Science Pipelines concepts work in a cross-namespace fashion. If a user has access to namespaces Namespace1 and Namespace2, a Component in Namespace1 cannot work with a Component in Namespace2 to comprise a Pipeline. 33 | 34 | ## How 35 | 36 | Instead of using Kubeflow Pipelines’ out of the box [Multi-user isolation](https://www.kubeflow.org/docs/components/pipelines/v1/overview/multi-user/), we propose to roll out multiple individual single-user Kubeflow Pipelines stack into multiple namespaces, one for each Data Science Project. We’ll create a new operator called [data-science-pipelines-operator](https://github.com/opendatahub-io/data-science-pipelines-operator) to roll out and maintain these multiple stacks. (The rest of the ADR will refer to this approach as the “multi-stack approach”.) 37 | 38 | ## Open Questions 39 | 40 | The Data Science Pipelines stack requires access to object storage (default is Minio) and a relational database (default is MySQL). 
For both Open Data Hub running on-prem and as a managed service, it’s unclear whether we’re better off using one central multi-tenant database and object store, or whether each stack should have its own individual single-tenant database and object store. Our chosen design allows for either approach. 41 | 42 | ## Alternatives 43 | 44 | 1. Use the Kubeflow Pipelines [multi-user isolation](https://www.kubeflow.org/docs/components/pipelines/v1/overview/multi-user/) as it exists, out of the box, including Istio and Dex and Profiles. 45 | * This is less work upfront for us to implement, but Istio is a complex and heavy dependency that we don’t want to require. It’s possible the Open Data Hub will require Istio in the future, but we don’t want to force that inclusion now if we don’t have to. We also don’t have the concept of Profiles in Open Data Hub, and it would be difficult to shoehorn that concept in. We dismissed this option early on. 46 | 2. Use the Kubeflow Pipelines [multi-user isolation](https://www.kubeflow.org/docs/components/pipelines/v1/overview/multi-user/) as it exists, out of the box, but **remove** the requirement for Istio and Dex. Replace these authz components with an oauth proxy that sets the HTTP header that the Kubeflow Pipelines components use to authorize users. 47 | * We have a [proof of concept](https://github.com/HumairAK/odh-manifests/blob/rhods-auth/data-science-pipelines/base/auth/AUTH_NOTES.md) of this working, but after weighing the tradeoffs with the multi-stack approach, we decided that the multi-stack approach fits our goals and requirements better: 48 | * supportability, operability, and SLAs in a managed services environment – while there is more surface area to monitor and potentially more things that can break with multiple stacks running, the impact of one single-tenant stack breaking is much less than the impact of the only multi-user stack breaking. In other words, we believe that the multi-stack approach will make it easier for us to meet SLOs and SLAs in a managed environment. 49 | * Ensure that our solution stays consistent with upstream – we felt that utilizing the upstream multi-user solution but modifying it by removing Istio left us farther away from upstream than our multi-stack approach. In the multi-stack approach we’re deploying multiple single-tenant stacks and not modifying them – we’re wrapping them with an operator. 50 | * Time pressure – we were originally concerned that writing a custom operator to maintain the multi-stack approach would cause a risk to our deadline, but we were able to create a working proof of concept of our operator within a week. It is true, though, that our multi-stack approach requires us to maintain custom code (an operator written in go) whereas using Alternative 2 would have merely required us to maintain custom manifests. 51 | 52 | ## Security and Privacy Considerations 53 | 54 | * Because the multi-stack approach relies on Kubernetes RBAC and namespaces, we inherit the security and privacy benefits around multi-user authorization inherent in Kubernetes itself. 55 | * We still have an open question around the object storage and relational database requirements, and our decision there has security and privacy considerations. If we use a single shared database, we would specify separate databases within the provider/database instance for each stack, so users’ Data Science Project data would not be stored alongside each other in the same tables. 
This is in contrast to the single-stack solution, which would have one provider and one shared database within the provider, so users’ data would be stored within the same tables. If we use multiple databases (one per stack), then the data is a little more separated, but we don’t see this as advantageous over multiple databases within one provider/database instance. 56 | 57 | ## Risks 58 | 59 | * The implementing team as a whole doesn’t have deep experience with building operators. 60 | 61 | ## Stakeholder Impacts 62 | 63 | | Group | Key Contact | Date | Impacted? | 64 | | ----------------------------- | --------------- | ---------- | --------- | 65 | | Open Data Hub UI Console team | Andrew Ballantyne| 2023-02-20 | ? | 66 | 67 | 68 | ## References 69 | 70 | * https://github.com/opendatahub-io/data-science-pipelines 71 | * https://www.kubeflow.org/docs/components/pipelines/v1/overview/multi-user/ 72 | * https://www.kubeflow.org/docs/components/pipelines/ 73 | 74 | ## Reviews 75 | 76 | | Reviewed by | Date | Notes | 77 | | ----------------------------- | --------- | ------| 78 | | Anish Asthana | 2023-02-13 | lgtm | 79 | | Humai Khan | 2023-02-10 | lgtm | 80 | 81 | ## Accept / Reject 82 | -------------------------------------------------------------------------------- /architecture-decision-records/ODH-ADR-0003-use-apache-2-0-licence.md: -------------------------------------------------------------------------------- 1 | # Open Data Hub - ODH-ADR-0003 - Open Data Hub default licence 2 | 3 | | | | 4 | | -------------- | ---------- | 5 | | Date | 11-April-2023 6 | | Scope | Open Data Hub default licence | 7 | | Status | Accepted | 8 | | Authors | [Greg Sheremeta](@gregsheremeta) | 9 | | Supersedes | N/A | 10 | | Superseded by: | N/A | 11 | | Tickets | N/A | 12 | | Other docs: | none | 13 | 14 | ## What 15 | 16 | This ADR captures our decision to license Open Data Hub under the Apache 2.0 license going forward. 17 | 18 | ## Why 19 | 20 | Historically, Open Data Hub had standardized on using the GPLv3 license for all new code repositories. Over time, we lost track of the original reasons for selecting GPLv3. When the original decision was made, Open Data Hub was structured differently and was more focused on providing an open source reference architecture, so the reasons for the previous decision no longer apply. 21 | 22 | The current engineers and project team members who build Open Data Hub are in the best place to select the best license for the project. 23 | 24 | While GPLv3 and Apache 2.0 are both acceptable choices for Open Data Hub, we believe that selecting the Apache 2.0 license will better align Open Data Hub with other projects in the Machine Learning domain. We also believe that selecting the Apache 2.0 license will encourage more open source contributions to Open Data Hub. 
25 | 26 | We did an inventory of the licenses in use in the ODH-included projects and also some popular peer projects, and we found that 100% of those we looked at use a permissive licence, and the vast majority of those use Apache 2.0: 27 | 28 | | | | 29 | | -------------- | ---------- | 30 | | Jupyter | permissive, 3-clause BSD | 31 | | Kubeflow | permissive, Apache 2.0 | 32 | | KFP-Tekton | permissive, Apache 2.0 | 33 | | Tekton | permissive, Apache 2.0 | 34 | | CodeFlare | permissive, Apache 2.0 | 35 | | Ray | permissive, Apache 2.0 | 36 | | Elyra | permissive, Apache 2.0 | 37 | | Modelmesh | permissive, Apache 2.0 | 38 | | Pytorch | permissive, Apache-like, mostly Apache 2.0 headers | 39 | | Tensorflow | permissive, Apache 2.0 | 40 | | Keras | permissive, Apache 2.0 | 41 | | Spark | permissive, Apache 2.0 | 42 | | AutoML | permissive, Apache 2.0 | 43 | | Scikit-image | permissive, mostly 3-clause BSD | 44 | | Scikit-learn | permissive, 3-clause BSD | 45 | | Pandas | permissive, 3-clause BSD | 46 | | MXNet | permissive, Apache 2.0 | 47 | 48 | ## Goals 49 | 50 | * Recognize that the Machine Learning community uses permissive licenses (mostly Apache 2.0) as the de facto standard, and strive to align Open Data Hub to match that de facto standard. 51 | * Capture the decision that Open Data Hub will be licensed under the Apache 2.0 license going forward. 52 | * Relicense any existing Open Data Hub-specfic repositories that are currently GPLv3 (such as Data Science Pipelines Operator) to Apache 2.0. 53 | 54 | ## Non-Goals 55 | 56 | * We are not changing the license of any repositories included in Open Data Hub that are direct copies or forks of some other repository outside of Open Data Hub. Those must retain their existing licenses. 57 | 58 | ## How 59 | 60 | * Publish this ADR as a proposed ADR. 61 | * Have a week of commentary period for the Open Data Hub Community to ask questions and provide feedback. 62 | * Assuming there is no Community dissent, we will move this ADR to accepted, change the affected repositories, and announce the change as being completed. 63 | 64 | ## Open Questions 65 | 66 | We're leaving this ADR as "proposed" for a period of time so that the Open Data Hub Community can comment. 67 | 68 | ## Alternatives 69 | 70 | The primary alternative is to not do anything and leave the default license as GPLv3. However, this has caused some confusion recently because the Machine Learning community has mostly adopted Apache 2.0 as discussed above. 71 | 72 | Another alternative is to use a permissive license other than Apache 2.0. However, also as stated above, our goal is to be consistent with the Machine Learning community, and the community has mostly adopted Apache 2.0. 73 | 74 | ## Security and Privacy Considerations 75 | 76 | n/a 77 | 78 | ## Risks 79 | 80 | n/a for technical risks. 81 | 82 | If there have been any non-trivial contributions to Open Data Hub that were made with the author's understanding that they were contributing under GPLv3, we need to get their permission to change the license on their contributions. We're not currently aware of any such contributions where the auther would not approve the relicensing to Apache 2.0. 
83 | 84 | ## Stakeholder Impacts 85 | 86 | n/a 87 | 88 | ## References 89 | 90 | * n/a 91 | 92 | ## Reviews 93 | 94 | | Reviewed by | Date | Notes | 95 | | ----------------------------- | --------- | ------| 96 | | Open Data Hub Community | 2023-04-22 | Accepted | 97 | -------------------------------------------------------------------------------- /architecture-decision-records/ODH-ADR-0005-github-labels-standards.md: -------------------------------------------------------------------------------- 1 | # GitHub Label Standard for opendatahub-io organization 2 | 3 | | | | 4 | | -------------- | ---------- | 5 | | Date | 2023-04-14 | 6 | | Scope | | 7 | | Status | Accepted | 8 | | Authors | [Landon LaSmith](@lavlas) | 9 | | Supersedes | N/A | 10 | | Superseded by: | N/A | 11 | | Tickets | | 12 | | Other docs: | none | 13 | 14 | ## What 15 | 16 | This document will set the organization standards for the core set of GitHub [Issue Labels](https://docs.github.com/en/issues/using-labels-and-milestones-to-track-work/managing-labels) that should be supported by every repository in the opendatahub-io organization. 17 | 18 | ## Why 19 | 20 | [opendatahub-io](https://github.com/opendatahub-io) needs a unified Issue workflow that can support common [queries](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) for all Issue and Pull Request metadata from any repository in the opendatahub-io organization. To support this unified workflow, the required label names must match across all repositories. 21 | 22 | ## Goals 23 | 24 | * Standardize an issue workflow that will be used to show the Issue lifecycle, ownership and category 25 | * Define the label name standard that covers the universal states and metadata that is relevant to the entire opendatahub-io organization 26 | * Type: Bug, Feature, Documentation, tracker 27 | * Status: To Do, In Progress, In Review, Closed 28 | * Priority 29 | * ODH Component: ODH Operator, Notebook Controller, Notebooks, Manifests, Data Science Pipelines, Model Serving, AI Explainability, ... 30 | * SIG: Platform, ML Ops, Developer Experience 31 | * Extra: Good First Issue 32 | * Use the `opendatahub-community` repository that will become the centralized Issue triaging location for ODH Component owners to triage new issues and/or transfer to the appropriate component repository 33 | 34 | ## Non-Goals 35 | 36 | * This ADR will only define the core set of labels that will be supported across all repositories in opendatahub-io.
37 | The labels outlined in this document will only be a subset of the available labels in any given repo and will not contain any labels that are isolatedj to an individual repostory workflow 38 | * This is not a mandate that every component or SIG must use the centralized `opendatahub-community` repository to manage the lifecyle of issues relevant to their workflow 39 | 40 | ## How 41 | 42 | Across all repositories in the opendatahub-io organization, we will create the labels below with the expectation that the workflows they outline will be universal across all opendatahub-io repositories 43 | 44 | | Label | Description | 45 | | ----------------------------- | ---------------- | 46 | | `tracker` | Non-completable ticket; used for tracking work - akin to a Jira Epic | 47 | | `untriaged` | Indicates the newly created issue has not been triaged yet | 48 | | `kind/bug` | Indicates an unexpected problem or unintended behavior | 49 | | `kind/enhancement` | New functionality request (existing augments or new additions) | 50 | | `kind/documentation` | Improvements or additions to documentation | 51 | | `kind/security` | Indicates that this is a security issue that should be addressed | 52 | | `needinfo` | Further information is requested to unblock any progress on the issue | 53 | | `priority/normal` | An issue with the product; fix when possible | 54 | | `priority/blocker` | Critical issue that needs to be fixed asap; blocks up coming releases | 55 | | `priority/low` | An issue with the product that doesn't impact the user much or not at all (ie tech debt) | 56 | | `priority/high` | Important issue that needs to be resolved asap. Releases should not have too many of these. | 57 | | `good-first-issue` | Good for newcomers | 58 | | `odh-component/*` | Name of the odh-component that owns this issue. This should be used as the indicator for which component should own this issue if located in the centralized `opendatahub-community` repository | 59 | | `sig/*` | Name of the Special Interest Group in charge of this subject matter| 60 | | `wg/*` | Name of the Working Group that should be assigned to this issue | 61 | 62 | Additional labels maybe reserved depending on certain bots or apps running in the organization. 63 | 64 | It is assumed that all new issues will have the `untriaged` label until it is reviewed. Once an issue is triaged, a `kind/*` and `priority/*` label should be added and the issue will follow the issue worklow outlined in the component repo where it is located. 65 | 66 | ## Alternatives 67 | 68 | Alternative is to allow each SIG, WG, Maintainer, ... to use their own label naming system which would complicate attempts when querying or filtering on labels that have the same purpose but different name structure: `kind/bug` vs `kind::bug` vs `bug` 69 | 70 | ## Stakeholder Impacts 71 | 72 | | Group | Key Contacts | Date | Impacted? 
| 73 | | ----------------------------- | ---------------- | ---------- | --------- | 74 | | Platform SIG | | 2023-05-31 | yes | 75 | | MLOps SIG | | 2023-05-31 | yes | 76 | | Developer Experience SIG | | 2023-05-31 | yes | 77 | 78 | ## References 79 | 80 | * Kubernetes Contributors Documentation 81 | * [Issue Triage Guidelines](https://www.kubernetes.dev/docs/guide/issue-triage/) 82 | * [Help Wanted and Good First Issue Labels](https://www.kubernetes.dev/docs/guide/help-wanted/#good-first-issue) 83 | 84 | ## Reviews 85 | GitHub Approvals will function as reviews 86 | -------------------------------------------------------------------------------- /architecture-decision-records/ODH-ADR-0006-organization-membership-automation.md: -------------------------------------------------------------------------------- 1 | # Codification of Open Data Hub GitHub organization membership 2 | 3 | 4 | 5 | 6 | 7 | | | | 8 | | -------------- | ---------- | 9 | | Date | 2024-08-12 | 10 | | Scope | | 11 | | Status | Approved | 12 | | Authors | [Alex Corvin](@accorvin) | 13 | | Supersedes | N/A | 14 | | Superseded by: | N/A | 15 | | Tickets | | 16 | | Other docs: | none | 17 | 18 | ## What 19 | 20 | We will use codify membership in the OpenDataHub-io GitHub organization using 21 | [Peribolos](https://docs.prow.k8s.io/docs/components/cli-tools/peribolos/). We will 22 | automate the process of applying this membership using GitHub actions. 23 | 24 | ## Why 25 | 26 | This change is being made in the context of broader changes to ensure that the OpenDataHub code 27 | base is secure and minimally vulnerable to malicious actors. As part of this effort, we plan to 28 | reduce the set of organization owners to a very small set of individuals. We do not want this 29 | change to result in a bottleneck for managing organization membership, and thus want to enable 30 | individual teams to manage membership themselves. 31 | 32 | ## Goals 33 | 34 | * Create an easy method to add and remove organizational members in a way that is self-service to teams 35 | * Ensure that we can have a small set of organizational owners 36 | * Implement a reliable process which won't have a high administrative burden 37 | 38 | ## Non-Goals 39 | 40 | * We will not automate GitHub team membership or permissions on individual repositories. 41 | We want this automation to be as minimal as possible so that teams can continue using the 42 | GitHub native interface for as much as possible 43 | 44 | ## How 45 | 46 | We will revive the [org-management](https://github.com/opendatahub-io/org-management) repository 47 | which will contain a streamlined configuration file listing organization owners and members. We will 48 | not use this file to list Organization teams, repositories, or teams' permissions on individual repositories. 49 | Management of these items will continue to be manual. 50 | 51 | We will create a new GitHub team in the opendatahub-io org called `Org Membership Maintainers` which 52 | will include all managers and development leads. This team will be used in a CODEOWNERS file in the 53 | org-management repo and have permission to approve and merge pull requests to the membership configuration file. 54 | 55 | With this change in place, going forward we will use a pull request flow to add and remove 56 | members to the organization. 
Individuals (either the individual requesting membership or somone acting 57 | on their behalf) will open a pull request modifying the membership, which a member of the the 58 | `Org Membership Maintainers` team will then approve. 59 | 60 | ## Open Questions 61 | 62 | N/A 63 | 64 | ## Alternatives 65 | 66 | * Continue with the current process of organization owners manually making membership changes - this 67 | is seen as not a viable alternative as our current set of org owners is seen as too large and therefore 68 | exposes us to security vulnerabilities 69 | * Implement a custom organization role that enables only management or org members - this is hypothetically 70 | possible, but we have not tested this. Custom organization roles require GitHub enterprise which we 71 | do not currently have funding to adopt. 72 | 73 | ## Security and Privacy Considerations 74 | 75 | * With this change we make the full set or organization owners and members public in the config file 76 | * We will need to use a GitHub personal access tokken for us in the automation. This will need 77 | to be periodically renewed. 78 | 79 | ## Risks 80 | 81 | * The automation will need to be maintained. 82 | * We expect the DevOps team to own this automation as part of their long term plans to automate 83 | membership in the Red-Hat-Data-Services org as required for Konflux. 84 | * When we previously attempted to use Peribolos the automation was flaky and a constant point 85 | of frustration. To mitigate this: 86 | * We will use Peribolos only for managing organization membership, not teams or repository permissions. 87 | We feel that these latter items are more impactful to individual teams, and we'll leave manual control 88 | of these with the teams. 89 | * The previous implementation of Peribolos was not clearly communicated and therefore 90 | never fully adopted. This ADR is an attempt to communicate this more fully. 91 | 92 | ## Stakeholder Impacts 93 | 94 | | Group | Key Contacts | Date | Impacted? | 95 | | -------------------------------- | ---------------------------------------- | ---------- | --------- | 96 | | Architects Team | @opendatahub-io/architects | | y | 97 | | Documentation Team | @opendatahub-io/documentation | | y | 98 | | Exploring Team | @opendatahub-io/exploring-team | | y | 99 | | Model Serving Team | @opendatahub-io/model-serving | | y | 100 | | Training & Experimentation Team | @opendatahub-io/training-experimentation | | y | 101 | | Platform Team | @opendatahub-io/platform | | y | 102 | 103 | ## References 104 | 105 | * [Peribolos](https://docs.prow.k8s.io/docs/components/cli-tools/peribolos/ 106 | 107 | ## Reviews 108 | 109 | | Reviewed by | Date | Notes | 110 | | ----------------------------- | --------- | ------| 111 | | name | date | ? | 112 | 113 | We will not use this table for reviews. Instead, approval on the pull request 114 | adding this ADR will be used as reviews. -------------------------------------------------------------------------------- /architecture-decision-records/README.md: -------------------------------------------------------------------------------- 1 | # Open Data Hub Architecture Decision Records 2 | 3 | "Documenting architectural decisions helps a project succeed by helping current and future contributors understand the reasons for doing things a certain way." [1] 4 | 5 | ## What is an ADR? 6 | 7 | An architecture decision record is a short text file in a Markdown format. Each record describes a set of forces and a single decision in response to those forces. 
[2] 8 | 9 | An ADR is not a technical design, a team-level internal procedure, or a roadmap. An ADR does not replace detailed technical design documents or good commit messages. 10 | 11 | ## How 12 | 13 | We will keep each ADR in a short text file in Markdown format. 14 | 15 | We will keep ADRs in this repository, https://github.com/opendatahub-io/architecture-decision-records . 16 | 17 | ADRs will be numbered sequentially and monotonically. Numbers will not be reused. 18 | 19 | If a decision is reversed, we will keep the old one around, but mark it as superseded. (It's still relevant to know that it was the decision, but is no longer the decision.) 20 | 21 | We will use a format with just a few parts, so each document is easy to digest. 22 | 23 | ## References 24 | 25 | * https://www.redhat.com/architect/architecture-decision-records 26 | * https://cognitect.com/blog/2011/11/15/documenting-architecture-decisions 27 | * https://github.com/operate-first/blueprint/tree/main/adr 28 | * https://adr.github.io/ 29 | * https://docs.aws.amazon.com/prescriptive-guidance/latest/architectural-decision-records/adr-process.html 30 | 31 | ## Citations 32 | 33 | * [1] Heiko W. Rupp, https://www.redhat.com/architect/architecture-decision-records 34 | * [2] Michael Nygard, https://cognitect.com/blog/2011/11/15/documenting-architecture-decisions 35 | -------------------------------------------------------------------------------- /architecture-decision-records/data-science-pipelines/ODH-ADR-DSP-0001-data-science-pipelines-upgrade-testing-strategy.md: -------------------------------------------------------------------------------- 1 | # Upgrade Testing Process for Data Science Pipelines (DSP) 2 | 3 | | | | 4 | | -------------- | -------------------------- | 5 | | Date | 2023-07-05 | 6 | | Scope | | 7 | | Status | Accepted | 8 | | Authors | [Dharmit Dalvi](@DharmitD) | 9 | | Supersedes | N/A | 10 | | Superseded by: | N/A | 11 | | Tickets | | 12 | | Other docs: | none | 13 | 14 | ## What 15 | 16 | This document outlines the upgrade testing process for Data Science Pipelines (DSP). The process involves upgrading DSP from a latest released version to a version with the newest commit in main (or a tag version). 17 | 18 | ## Why 19 | 20 | The upgrade testing process is crucial to ensure the successful upgrade of DSP versions and to identify any issues early on. By following a standardized testing approach, we can automate the process and provide continuous feedback on the stability and functionality of the upgraded DSP versions. 21 | 22 | ## Goals 23 | 24 | * Regularly test the success of DSP version upgrades using the most recent commit or tag. 25 | * Detect issues early and provide continuous feedback on the stability and functionality of the upgraded DSP versions. 26 | * Incorporate upgrade testing into the regular release strategy to ensure smooth transitions between released and unreleased DSP versions. 27 | 28 | ## Non-Goals 29 | 30 | * This ADR does not cover the implementation details such as tooling that ought to be used. Those aspects can be tailored according to specific requirements. 31 | 32 | ## How 33 | 34 | The update testing strategy should account for all pre-requisites for deploying DSP, then perform an upgrade, and any follow up tests to confirm the upgrade was successful. The following outlines the design for such a process: 35 | 36 | 1. Set up a Test Environment: 37 | - Provision a test Kubernetes cluster or an OpenShift cluster. 
38 | - Ensure cluster admin privileges are available to perform the following steps. 39 | 40 | 2. Install Open Data Hub (ODH) Operator. 41 | 42 | 3. Deploy KfDef Core for the most recently released DSPO Version: 43 | - Use the KfDef manifest to deploy the core components of the most recently released DSP version. 44 | - Deploy a DSPA instance 45 | 46 | 4. Prepare for Upgrade Testing: 47 | - Determine the candidate version for the DSP upgrade. It could be a tag, branch or a commit. 48 | - Update the KfDef manifest for the candidate DSP version by configuring DSPO to point to the latest commit or a tag for that version. 49 | 50 | 5. Deploy KfDef Core for the candidate DSP Version: 51 | - Use the updated KfDef manifest to deploy the core components for the candidate DSPO version. 52 | - Deploy a DSPA instance 53 | 54 | 6. Run Upgrade Tests: 55 | - Execute tests specific to testing the success of the DSP version upgrade. Examples of test cases could include checking if resources such as DSPO and DSPA deployments, ServiceMonitors, etc., come up correctly. 56 | - Customize the tests according to specific requirements and use cases. 57 | 58 | ## Automation Considerations 59 | 60 | - Set up a workflow to automate the upgrade testing process. 61 | - Configure the workflow to trigger every night asynchronously using the most recent commit of the DSP upgrade branch. 62 | - Within the workflow, deploy the updated KfDef Core for the candidate DSP version and run the upgrade tests. 63 | - Collect the test results, including logs, error messages, and any relevant information. 64 | 65 | ## Open Questions 66 | 67 | 1. Impact on Running Pipelines: 68 | - Consider the impact of upgrading DSP on currently running pipelines. 69 | - Assess how the upgrade process might affect pipeline execution, ongoing workflows, and pipeline outcomes. 70 | - Plan for any necessary adjustments or mitigations to ensure the smooth functioning of ongoing pipelines during the upgrade process. 71 | - Efforts to address this question to be tracked in [this issue.](https://github.com/opendatahub-io/data-science-pipelines-operator/issues/217) 72 | 73 | ## Security and Privacy Considerations 74 | 75 | No security and privacy considerations identified for the upgrade testing process. 76 | 77 | ## Risks 78 | 79 | 1. ODH Operator Redesign: 80 | - As per the input from the operator team, updating manifests will not be officially supported in the ODH Operator. 81 | - In development mode, users may have the option to update the manifest URI, which can enable testing of upgrades. However, this approach may not be officially supported and could introduce potential inconsistencies or issues. 82 | - There is a risk that relying on the manifest update capability in development mode may not align with the desired upgrade testing process or may not be reliable in a production environment. 83 | 84 | 2. The implementing team may face challenges in building and maintaining the automated upgrade testing process. 85 | 86 | ## Reviews 87 | 88 | | Reviewed by | Date | Notes | 89 | | ----------------------------- | --------------- | ------| 90 | | Achyut M. | July 19th, 2023 | -- | 91 | | Greg S. | July 20th, 2023 | -- | 92 | | Giulio F. | July 21st, 2023 | -- | 93 | | Humair K. 
| July 18th, 2023 | -- | 94 | -------------------------------------------------------------------------------- /architecture-decision-records/distributed-workloads/ODH-ADR-DW-0001-determine-codeflare-deployment-strategy.md: -------------------------------------------------------------------------------- 1 | # Open Data Hub - Determine CodeFlare Deployment Strategy 2 | 3 | | | | 4 | | -------------- | ------------------------------------------------------------------------------------ | 5 | | Date | September 22, 2023 | 6 | | Scope | Distributed Workloads | 7 | | Status | Review | 8 | | Authors | [Anish Asthana](@anishasthana) | 9 | | Supersedes | N/A | 10 | | Superseded by: | N/A | 11 | | Tickets | [Tracking Issue](https://github.com/project-codeflare/codeflare-operator/issues/314) | 12 | | Other docs: | none | 13 | 14 | ## What 15 | 16 | We will no longer use an OLM installed CodeFlare operator to provide distributed workloads capabilities in ODH. 17 | 18 | ## Why 19 | 20 | The Operator of Operators pattern followed by the ODH has ODH responsible for deploying CRDs and controllers for components under its umbrella. CodeFlare has taken a different approach, where we have: 21 | 22 | 1. A published CodeFlare Operator (CFO) is available in community operator hub 23 | 1. Users need to manually subscribe to this operator 24 | 2. ODH creates configurations for CFO as part of the Data Science Cluster custom resource 25 | 1. If the CFO does not exist on the cluster, ODH Operator will expose a status requiring users to manually subscribe to the CFO. 26 | 27 | The above flow results in confusion for users – we have seen multiple instances of folks not subscribing to the CFO and coming to community channels with “issues”. Additionally, this results in CodeFlare diverging from other ODH components. 28 | 29 | ## Goals 30 | 31 | * Simplify CodeFlare usage experience for users 32 | 33 | ## Non-Goals 34 | 35 | * Changing installation path for upstream CodeFlare project 36 | 37 | ## How 38 | 39 | 1. We will create a fork of the CodeFlare Operator repository in the ODH organization 40 | 1. This fork will be kept in sync with upstream CodeFlare. There are no plans for code to diverge 41 | 2. This repository will now serve as the home for CodeFlare Operator code, manifests, and CRDs in ODH. 42 | 1. Having a fork allows us to have an ODH-controlled repository for new image builds and manifest references, allowing us to better version the CodeFlare stack in ODH. 43 | 2. CodeFlare CRDs will initially be included alongside the CFO manifests 44 | 1. They will eventually be moved into the ODH Operator bundle. 45 | 2. An implication of this is that ODH and community olm CFO can not be installed on the same cluster. 46 | 3. This fork will be synced with upstream every time there is an upstream release 47 | 4. As we also have a fork of KubeRay in the ODH organization, we can follow a similar process for KubeRay. Given this, we can probably delete all manifests from the distributed workloads repository. 48 | 49 | This would simplify the user experience greatly as users simply need to enable CodeFlare in the DSC, with ODH taking care of the rest. The above flow also brings CodeFlare closer in line with other ODH components. From a testing perspective, all of our existing test cases will still be useful. The only ones that won’t carry over without updates are the existing olm upgrade tests in the upstream CFO repository. 
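As an illustration of the simplified flow described above, a minimal sketch of what enabling the distributed workloads stack through the DataScienceCluster resource could look like is shown below. The API version, component keys, and `managementState` values are assumptions drawn from the ODH operator's DataScienceCluster API and may differ from what a particular ODH release accepts.

```yaml
# Hypothetical sketch: enabling CodeFlare and KubeRay via the DataScienceCluster
# CR so that the ODH operator reconciles the forked CodeFlare operator manifests
# itself, with no manual OLM subscription. Field names are assumptions and may
# differ from the CRD shipped in a given ODH release.
apiVersion: datasciencecluster.opendatahub.io/v1
kind: DataScienceCluster
metadata:
  name: default-dsc
spec:
  components:
    codeflare:
      managementState: Managed   # ODH deploys and manages the CodeFlare operator
    ray:
      managementState: Managed   # KubeRay follows the same pattern
```

With a resource along these lines applied, the ODH operator, rather than the user, would be responsible for installing the CodeFlare stack, which is the user-experience simplification this ADR argues for.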
50 | 51 | Another benefit to the above approach is better controls around the versioning of CodeFlare. In the event of a CodeFlare community operator release, existing ODH users are not at risk of being auto-updated before the ODH changes are ready. 52 | 53 | ## Questions 54 | 55 | 1. Should we delete the distributed workloads repository altogether? 56 | 1. This requires us to have a landing place for distributed workloads documentation. 57 | 2. [GitHub Issue](https://github.com/red-hat-data-services/distributed-workloads/issues/25) 58 | 59 | ## Alternatives 60 | 61 | 1. Continue to use OLM for CodeFlare 62 | 1. This is not a great user experience as users need to manually subscribe to dependent operators. The ODH operator is currently not planning to include manage of subscription to dependent operators 63 | 2. This will result in us continuing to differ from other ODH components, which could result in other unforeseen issues popping up in the future. 64 | 65 | ## Stakeholder Impacts 66 | 67 | | Group | Key Contacts | Date | Impacted? | 68 | | --------------------- | -------------- | ------------- | --------- | 69 | | Distributed Workloads | Anish Asthana | Sept 22, 2023 | Yes | 70 | | ODH Operator | Vaishnavi Hire | Oct 4, 2023 | Yes | 71 | 72 | ## Reviews 73 | 74 | | Reviewed by | Date | Approval | Notes | 75 | | ------------------ | ------------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 76 | | Antonin Stefanutti | Sept 22, 2023 | Approved | | 77 | | Dimitri Saridakis | Sept 29, 2023 | Approved | | 78 | | Karel Suta | Sept 26, 2023 | Approved | | 79 | | Jessica Forrester | Oct 02, 2023 | Approved | If the team feels comfortable with the overhead of carrying the community operator and the slightly different install path downstream, then this is the right customer experience from my perspective. Whether the CRDs are included in the operator bundle needs to be settled, but I’d recommended sticking with whatever pattern the operator has already established for now. | 80 | | Daniele Zonca | Oct 02, 2023 | Approved | I think we need to revisit how we integrate components into ODH to limit the need of having a fork “just” to simplify user experience. But this requires work that is not in the scope of this ADR so I’m fine to proceed with this proposal assuming DW team is fine maintaining this “fork” | 81 | | Greg Sheremata | Sept 27, 2023 | Neutral | +1 to not making users click extra buttons in operatorhub. I’m agnostic to the implementation details described here, and thus not explicitly marking my review as an “approval”. | 82 | | Edson Tirelli | Sept 26, 2023 | Approved | Ideally this change is also coordinated with changes in the ODH operator, to support cases where the codeflare operator is pre-installed by the user (“managementState: unmanaged”). Also, please ensure the “fork” is only a snapshot for supporting/image building purposes and not an actual fork with diverging code. 
| 83 | | Vaishnavi Hire | Oct 4, 2023 | Approved | No additional notes | 84 | -------------------------------------------------------------------------------- /architecture-decision-records/explainability/ODH-ADR-XAI-0001-trustyaiservice-database-configuration.md: -------------------------------------------------------------------------------- 1 | # TrustyAI service database configuration 2 | 3 | | | | 4 | | -------------- |------------------------------------------------------------------| 5 | | Date | 8 Jun 2024 | 6 | | Scope | Explainability | 7 | | Status | Approved | 8 | | Authors | [Rui Vieira](https://github.com/ruivieira) | 9 | | Supersedes | N/A | 10 | | Superseded by: | N/A | 11 | | Tickets | [Tracking Issue](https://issues.redhat.com/browse/RHOAIENG-8178) | 12 | | Other docs: | none | 13 | 14 | 15 | ## What 16 | 17 | This document describes the decision to add configuration options to the `TrustyAIService` (TAS) Custom Resource Definition (CRD) in order to support database backends. 18 | 19 | ## Why 20 | 21 | TASs require a way to store data. Currently, TASs store data in files in a Persistent Volume Claim (PVC). This is a simple and effective way to store data, but it has limitations. 22 | Implementing a database backend will allow TASs to store data in a more efficient, secure and scalable way. 23 | 24 | In order to configure the TAS/Database communication new fields need to be added to the TAS CRD. 25 | 26 | We will distinguish the following modes throughout this document: 27 | 28 | * **PVC-mode**: the currently supported mode. Data is stored in files in a PVC 29 | * **DB-mode**: future mode where data is stored in a database. 30 | 31 | ## Goals 32 | 33 | This ADR aims to: 34 | 35 | * Define required fields to configure a database connection for TASs 36 | * Provide backwards compatibility with previous CRs (PVC-mode) 37 | * Define custom DB connections using the TAS CRD 38 | 39 | ## Non-Goals 40 | 41 | This ADR does **not** aim to: 42 | 43 | * Define database details or architecture beyond the connection level (e.g. schemas) and CRD configuration fields 44 | * Document how to deploy or manage the database backend. A running database is assumed to be available. 45 | 46 | ## How 47 | 48 | The type of database is abstracted by using Hibernate's ORM. This means that, technically, all Hibernate supported databases should also be transparently supported by the TrustyAI service (although at this stage, only the `mysql` family will be supported). We will focus on the required fields to configure the database connection. 49 | 50 | The current TAS CRD is typically defined as follows: 51 | 52 | ```yaml 53 | apiVersion: trustyai.opendatahub.io/v1alpha1 54 | kind: TrustyAIService 55 | metadata: 56 | name: trustyai-service-example 57 | spec: 58 | storage: 59 | format: "PVC" 60 | folder: "/inputs" 61 | size: "1Gi" 62 | data: 63 | filename: "data.csv" 64 | format: "CSV" 65 | metrics: 66 | schedule: "5s" 67 | batchSize: 5000 # Optional, defaults to 5000 68 | ``` 69 | 70 | The proposed configuration introduces the following fields: 71 | 72 | _TAS CR_ 73 | 74 | | Field | Mandatory | Note | 75 | |-------|-----------|------| 76 | |`databaseConfigurations`|No|Secret name with the connection credentials, service and port. 
If omitted, operator will look in `$(metadata.name)-db-credentials`| 77 | 78 | _Secret:_ 79 | 80 | | Field | Mandatory | Note | 81 | |------------------|-----------|------| 82 | | `databaseKind` |Yes|The type of the database (only `mysql` supported at this stage)| 83 | | `databaseUsername` |Yes|Username| 84 | | `databasePassword` |Yes|Password| 85 | | `databaseService` |Yes|The Kubernetes service to communicate with the database| 86 | |`databasePort`|Yes|The port over which communication is made| 87 | 88 | ### Example 89 | 90 | ```yaml 91 | apiVersion: trustyai.opendatahub.io/v1alpha1 92 | kind: TrustyAIService 93 | metadata: 94 | name: trustyai-service-example 95 | spec: 96 | storage: 97 | format: "DATABASE" 98 | databaseConfigurations: db-credentials # Optional 99 | metrics: 100 | schedule: "5s" 101 | batchSize: 5000 # Optional, defaults to 5000 102 | ``` 103 | 104 | ```yaml 105 | apiVersion: v1 106 | kind: Secret 107 | metadata: 108 | name: db-credentials 109 | type: Opaque 110 | data: 111 | databaseKind: mysql 112 | databaseUsername: foo 113 | databasePassword: bar 114 | databaseService: mariadb-service 115 | databasePort: 3306 116 | ``` 117 | 118 | From the above example, the operator will: 119 | 120 | * Read the provided `Secret` with the connection credentials 121 | * If not provided, it will look for a secret named `$(metadata.name)-db-credentials` 122 | * Use the secret’s `databaseService`, `databasePort`, `databaseKind` and credentials to connect to the database 123 | 124 | _Following the Principle Of Least Astonishment:_ 125 | 126 | * An invalid database configuration (_e.g._ invalid DB location) will not default to PVC, but instead prevent the TAS to be deployed, with respective feedback information (_e.g_ Kubernetes error Event + condition in the status of the CRD) 127 | * If databaseCredentials is omitted, the operator will look for a Kubernetes Secret in the same namespace with name _$(metadata.name)-db-credentials_. 128 | * If no such secret is found, the TAS will not be created 129 | 130 | > **Note** 131 | > _TBD: If the databaseService is of a file type DB (e.g. H2), a PVC will still be created to store the DB._ 132 | > _This means that the fields (spec.storage.folder and spec.storage.size still need to be provided._ 133 | 134 | The values in the CRD will be passed to the TAS as environment variables, which will be used to configure the Hibernate connection. 135 | 136 | ### Scenarios 137 | 138 | #### PVC-mode 139 | 140 | The PVC-mode is the only mode currently supported. As such, for backwards compatibility, with the current typical CR 141 | ```yaml 142 | apiVersion: trustyai.opendatahub.io/v1alpha1 143 | kind: TrustyAIService 144 | metadata: 145 | name: trustyai-service-example 146 | spec: 147 | storage: 148 | format: "PVC" 149 | folder: "/inputs" 150 | size: "1Gi" 151 | data: 152 | filename: "data.csv" 153 | format: "CSV" 154 | metrics: 155 | schedule: "5s" 156 | batchSize: 5000 # Optional, defaults to 5000 157 | ``` 158 | 159 | **PVC will take precedence over other types**. This means that if other DB related fields are added erroneously, the operator will ignore them and proceed with the PVC-mode. 
160 | 161 | ```yaml 162 | apiVersion: trustyai.opendatahub.io/v1alpha1 163 | kind: TrustyAIService 164 | metadata: 165 | name: trustyai-service-example 166 | spec: 167 | storage: 168 | format: "PVC" 169 | databaseConfigurations: my-secret 170 | folder: "/inputs" 171 | size: "1Gi" 172 | data: 173 | filename: "data.csv" 174 | format: "CSV" 175 | metrics: 176 | schedule: "5s" 177 | batchSize: 5000 # Optional, defaults to 5000 178 | ``` 179 | 180 | The operator will display a warning in the logs, but proceed with the PVC-mode as previously. 181 | 182 | #### DB-mode 183 | 184 | ##### Example: MariaDB 185 | 186 | ```yaml 187 | apiVersion: trustyai.opendatahub.io/v1alpha1 188 | kind: TrustyAIService 189 | metadata: 190 | name: trustyai-service-example 191 | spec: 192 | storage: 193 | format: "DATABASE" 194 | databaseConfigurations: db-credentials # Optional 195 | metrics: 196 | schedule: "5s" 197 | batchSize: 5000 # Optional, defaults to 5000 198 | ``` 199 | 200 | ```yaml 201 | apiVersion: v1 202 | kind: Secret 203 | metadata: 204 | name: db-credentials 205 | type: Opaque 206 | data: 207 | databaseKind: mysql 208 | databaseUsername: foo 209 | databasePassword: bar 210 | databaseService: mariadb-service 211 | databasePort: 3306 212 | 213 | ``` 214 | 215 | ### Migration 216 | 217 | Options for migration. 218 | 219 | #### Migration with an existing PVC-mode 220 | 221 | **If** 222 | 223 | * `storage.format` is `DATABASE` 224 | 225 | **And** 226 | 227 | * Both `storage.folder` and `data.filename` are set 228 | 229 | Then the operator will pass the information to the service, so that it migrates from the PVC files to the DB. 230 | The details of this process are outside the scope of this ADR. 231 | 232 | #### Migration direction 233 | 234 | * Migrating from PVC-mode to DB-mode will be supported. 235 | * Migrating from DB-mode to PVC will **not** be supported. 236 | 237 | ### Co-existence 238 | 239 | Different TASs can co-exist in different modes or the same mode, since they are isolated. 240 | 241 | ```mermaid 242 | flowchart TD 243 | TA[TrustyAI\nService A] --> PVCA[PVC A] 244 | TB[TrustyAI\nService B] --> PVCB[PVC B] 245 | TC[TrustyAI\nService C] --> DBA[MariaDB C] 246 | TD[TrustyAI\nService D] --> DBD[MariaDB D] 247 | ``` 248 | 249 | However, this does not prevent deployment mistakes, such as two different services sharing the same database. 250 | 251 | 252 | ## Alternatives 253 | 254 | Since the TAS configuration and deployment is managed by the operator, there are currently no alternatives to configure the database connection from the CR. 255 | 256 | An alternative would be to provide a separate ConfigMap with the database connection details, but this would require additional manual intervention and there would be no way to specify a database connection instead of a PVC directly in the CR. 257 | 258 | ## Stakeholder Impacts 259 | 260 | ## References 261 | 262 | * [Hibernate supported databases](https://github.com/hibernate/hibernate-orm/blob/main/dialects.adoc) 263 | 264 | Reviews 265 | 266 | Reviewed by 267 | Date 268 | Notes 269 | name 270 | date 271 | ? 
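As a usage note, the credentials Secret shown in the examples above can also be written with `stringData`, which lets the values be given in plain text and leaves the base64 encoding of `data` to Kubernetes. This is a minimal illustrative sketch, not an additional requirement of this ADR:

```yaml
apiVersion: v1
kind: Secret
metadata:
  name: db-credentials
type: Opaque
stringData:
  databaseKind: mysql
  databaseUsername: foo
  databasePassword: bar
  databaseService: mariadb-service
  databasePort: "3306"   # stringData values are strings; the operator is assumed to parse the port
```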
272 | 273 | 274 | 275 | -------------------------------------------------------------------------------- /architecture-decision-records/model-serving/ODH-ADR-MS-0001-kserve-private-network-in-cluster.md: -------------------------------------------------------------------------------- 1 | # Open Data Hub - KServe Enable Private Endpoint 2 | 3 | | | | 4 | | -------------- | ---------------------------------------------------------------- | 5 | | Date | May 5, 2024 | 6 | | Scope | Model Serving(KServe Serverless mode) | 7 | | Status | Review | 8 | | Authors | [Jooho Lee](@JOOHO) | 9 | | Supersedes | N/A | 10 | | Superseded by: | N/A | 11 | | Tickets | [Tracking Issue](https://issues.redhat.com/browse/RHOAIENG-7918) | 12 | | Other docs: | none | 13 | 14 | 15 | ## What 16 | This ADR addresses the issue with Kserve Serverless using Istio and mTLS, which requires client certificates between pods. Applications accessing the isvc endpoint via service hostname in the same cluster must include an Istio sidecar, or they will fail. 17 | 18 | ## Why 19 | Customers deploying ODH/RHOAI on OpenShift Cluster do not expect to include their services in the mesh to access deployed models. From their perspective, models deployed through KServe are just another service, and changing the network layer for access is burdensome. Therefore, we need to provide an out-of-the-box solution allowing access to inference services via internal hostnames, similar to other services. 20 | 21 | ## Goals 22 | - Pods without an Istio sidecar should have access to the isvc endpoint. 23 | - Ensure a seamless transition for users from ModelMesh to KServe. 24 | - Implement a stable and tested solution. 25 | 26 | ## Non-Goals 27 | - Immediate overhaul of the existing Knative infrastructure. 28 | - Long-term commitment to a specific certificate management solution. 29 | 30 | 31 | ## How 32 | It involves adding an Ingress Gateway and integrating with OpenShift Certificates services. 33 | The new Istio ingress gateway would only be used for internal requests. 34 | - KServe would no longer share the knative-local-gateway, with regards to configurations. 35 | - The new Istio ingress gateway would forward requests to knative-local-gateway 36 | - i.e. we build on top of Knative routing needs (or we chain KServe routing needs with Knative ones) 37 | - The ISTIO_MUTUAL configuration on knative-local-gateway is no longer an issue. 38 | - Since both KServe and Knative gateways belong to the mesh, they can use mTLS. 39 | - KServe gateway would be capable of a TLS-simple configuration. 40 | 41 | ![proposed architecture](./images/ODH-ADR-MS-0001-kserve-private-network-in-cluster-img-1.png) 42 | 43 | ## Alternatives 44 | Knative team members designed the solution below for consideration. The benefit of the solution being in Knative layer is sharing tests coverage within the component and have lighter the ODH/KServe layer on top of it. This alternative solution is not available in Knative yet and it requires a new component `cert-manager`. Therefore, it may take a time to be done, so it can be considerable in the future once available. In contrast, the main solution proposed in this document is for the short term in ODH releases. 45 | 46 | ![alternative architecture](./images/ODH-ADR-MS-0001-kserve-private-network-in-cluster-img-2.png) 47 | 48 | ## Security and Privacy Considerations 49 | - Ensuring that the chosen solution maintains or enhances the current security posture. 
50 | - Managing the secure transition and implementation of certificate management. 51 | 52 | ## Risks 53 | - Delays in implementation affecting user migration. 54 | - Limited testing of this method possibly leads to unforeseen issues. 55 | 56 | ## Stakeholder Impacts 57 | 58 | | Group | Key Contacts | Date | Impacted? | 59 | | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------- | ----------- | --------- | 60 | | ModelServing | [Jooho Lee](jlee@redhat.com), [Edgar Hernandez](ehernand@redhat.com), [Alessandro Lazarotti](lazarotti@redhat.com) | May 5, 2024 | Yes | 61 | | Serverless | [Reto Lehmann](rlehmann@redhat.com), [Lukas Berk](lberk@redhat.com), [Stavros Kontopoulos](skontopo@redhat.com), [Roland Huß](rhuss@redhat.com) | May 5, 2024 | Yes | 62 | | ServiceMesh | [Rob Cernich](rcernich@redhat.com), [Bartosz Majsak](bmajsak@redhat.com), [Aslak Knutsen](aknutsen@redhat.com) | May 5, 2024 | Yes | 63 | | Dashboard | [Andrew Ballantyne](aballant@redhat.com), [Lucas Fernadez aragon](lferrnan@redhat.com), [Vince Conzola](vconzola@redhat.com) | May 5, 2024 | Yes | 64 | | ODH Operator | [Vaishnavi Hire](vhire@redhat.com) | May 5, 2024 | Yes | 65 | 66 | # Reviews 67 | 68 | | Reviewed by | Date | Approval | Notes | 69 | | --------------------- | ----------- | -------- | ----- | 70 | | Jooho Lee | Jun 3, 2024 | Approved | | 71 | | Edgar Hernandez | Jun 3, 2024 | Approved | | 72 | | Alessandro Lazarotti | Jun 3, 2024 | Approved | | 73 | | Daniele Zonca | Jun 3, 2024 | Approved | | 74 | | Reto Lehmann | Jun 3, 2024 | Approved | | 75 | | Rob Cernich | Jun 18, 2024 | Approved | | 76 | | Lucas Fernadez aragon | Jun 12, 2024 | Approved | | 77 | | Wen Zhou | Jun 27, 2024 | Approved | | 78 | -------------------------------------------------------------------------------- /architecture-decision-records/model-serving/images/ODH-ADR-MS-0001-kserve-private-network-in-cluster-img-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/architecture-decision-records/model-serving/images/ODH-ADR-MS-0001-kserve-private-network-in-cluster-img-1.png -------------------------------------------------------------------------------- /architecture-decision-records/model-serving/images/ODH-ADR-MS-0001-kserve-private-network-in-cluster-img-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/architecture-decision-records/model-serving/images/ODH-ADR-MS-0001-kserve-private-network-in-cluster-img-2.png -------------------------------------------------------------------------------- /architecture-decision-records/operator/ODH-ADR-0004-odh-trusted-ca-configmap.md: -------------------------------------------------------------------------------- 1 | # Open Data Hub - Make Trusted Bundle Configmap available 2 | 3 | 4 | | | | 5 | | -------------- |-------------------------------------------------------------| 6 | | Date | 2024-02-12 | 7 | | Scope | Open Data Hub | 8 | | Status | Draft | 9 | | Authors | [Landon LaSmith](@LaVLaS), [Vaishnavi Hire](@VaishnaviHire) | 10 | | Supersedes | N/A | 11 | | Superseded by: | N/A | 12 | | Tickets | | 13 | | Other docs: | none | 14 | 15 | ## What 16 | 17 | Add trusted-cabundle configmap to 
all non-openshift namespaces on ODH operator installation. 18 | 19 | ## Why 20 | 21 | The first step to support self-signed certificates in ODH deployments is to make trusted-cabundle available in all ODH namespaces. 22 | 23 | This allows ODH components to mount the certs as part of their deployment VolumeMounts. 24 | 25 | ## Goals 26 | 27 | * Make trusted-cabundle configmap available in all non-openshift namespaces 28 | * Users can opt-out of configmap injection by explicitly adding `security.opendatahub.io/inject-trusted-ca-bundle=false` annotation to a given namespace. 29 | 30 | ## Non-Goals 31 | 32 | * Modification or management of OpenShift-specific or default namespaces concerning the trusted-cabundle configmap. 33 | * Removal of injected trusted-cabundle configmap 34 | 35 | ## How 36 | 37 | * We are introducing a controller that will be responsible for creating trusted-cabundle configmap in all new and existing non-openshift namespaces. 38 | * For trusted-cabundle configmap, we are standardizing on `odh-trusted-ca-bundle` as the configmap name with a label of `app.kubernetes.io/part-of=opendatahub-operator`. 39 | * A namespace is considered non-openshift if - 40 | * It doesn't start with `openshift-` 41 | * It doesn't start with `kube-` 42 | * It is not `openshift` 43 | * It is not `default` 44 | * The configmap injection is triggered using an api field in DSCI `.spec.trustedCABundle.managementState`. When set to a `Managed` state this will inject the cert configmap in 45 | all non-openshift namespaces. Users can opt-out of cert injection by setting the managementState to `Removed`. 46 | 47 | ## Alternatives 48 | 49 | 1. We are proposing a longer term solution that involves adding cert-cabundle configmap to only the namespaces that have 50 | ODH resources. This approach is contingent upon the successful implementation of [DataScienceProjects controller ADR](https://github.com/opendatahub-io/architecture-decision-records/pull/25). 51 | 52 | 53 | ## Stakeholder Impacts 54 | 55 | | Group | Key Contacts | Date | Impacted? | 56 | |-----------------------|-------------------| ---------- | --------- | 57 | | ODH Dashboard Team | @andrewballantyne | date | ? | 58 | | IDE Team | @harshad16 | date | ? | 59 | | DS Pipelines Team | @HumairAK | date | ? | 60 | | Serving Team | @Jooho | date | ? | 61 | | TrustyAI Team | @RobGeada | date | ? | 62 | | Docs Team | Manuela Ansaldo | date | ? | 63 | | Distributed Workloads | @anishasthana | date | ? | 64 | 65 | ## Reviews 66 | 67 | | Reviewed by | Date | Notes | 68 | |---------------|------------| ------| 69 | | Edson Tirelli | 2024-02-12 | ? | 70 | -------------------------------------------------------------------------------- /architecture-decision-records/operator/ODH-ADR-Operator-0001-distributed-manifests.md: -------------------------------------------------------------------------------- 1 | # Open Data Hub - odh-manifests git repository transition 2 | 3 | | | | 4 | | -------------- | ---------- | 5 | | Date | 2023-08-28 | 6 | | Scope | | 7 | | Status | Approved | 8 | | Authors | [Wen Zhou](@zdtsw) | 9 | | Supersedes | N/A | 10 | | Superseded by: | N/A | 11 | | Tickets | | 12 | | Other docs: | none | 13 | 14 | ## What 15 | 16 | This document outlines a solution to transform the current setup of hosting manifests within a centralized `odh-manifests` git repository into separate repositories for each individual component. 
17 | 18 | ## Why 19 | 20 | The existing structure of having a singular `odh-manifests` git repository for hosting manifests across all components presents several drawbacks: 21 | 22 | - Manifests Duplication: The presence of duplicated manifests can lead to confusion among community users. 23 | - Synchronization Challenges: Ensuring timely updates from component repositories to `odh-manifests` becomes problematic. 24 | - Scalability Concerns: Extending the scope of Open Data Hub to include new tier-0/1 components presents complexities, despite its current status as the solution for tier-0/1. 25 | 26 | ## Goals 27 | 28 | - Reduced Human Error: Streamlining the release cycle to minimize human errors. 29 | - Enhanced Product Quality: Increasing confidence in the overall product quality. 30 | - Developer Support: Assisting developers in validating and troubleshooting changes within their respective domains. 31 | - This approach aims to enhance the organization and efficiency of manifest management within Open Data Hub's ecosystem. 32 | 33 | ## Non-Goals 34 | 35 | This ADR is specifically intendeded for tier-0/1 components, which are supported across all repositories within the opendatahub-io organization. 36 | 37 | ## How 38 | 39 | To achieve this transition, a multi-step approach is proposed: 40 | 41 | - Component Repositories: Each component's git repository will independently host its manifests. Progress towards this transition is tracked in the checklist below: 42 | 43 | | Component | Default Git Repo | Default Git branch/tag| Transition finished ? | 44 | | ----------------------- | ----------------------------------------------- |----------------- | ----------- | 45 | | Platform | opendatahub-io/opendatahub-operator | main | Yes | 46 | | Dashboard | opendatahub-io/odh-dashboard | incubation | Yes | 47 | | Data Science Pipelines | opendatahub-io/data-science-pipelines-operator | main | Yes | 48 | | Kserve | opendatahub-io/kserve | release-v0.11 | No | 49 | | Modelmesh | opendatahub-io/modelmesh-serving | - | No | 50 | | Workbenches | opendatahub-io/notebooks | main | Yes | 51 | | Workbenches | opendatahub-io/kubeflow | v1.7-branch | Yes | 52 | | Ray | opendatahub-io/distributed-workloads | main | Yes | 53 | | Codeflare | opendatahub-io/distributed-workloads | main | Yes | 54 | 55 | - Operator Integration: Within the operator, a function will be introduced to fetch manifests from individual component repositories during image build processes. 56 | - Archival of odh-manifests: During the image build process, the current `odh-manifests` git repository is downloaded into a tarball file, which is subsequently decompressed into the final operator image. This approach is currently in use. However, as part of this transition, the `odh-manifests` git repository will be archived into read-only mode to align with the evolving workflow. 57 | - Development Mode API: A new API field in `DataScienceCluster` will be integrated into the operator for development mode, enabling the fetching of manifests from component repositories during runtime. 58 | 59 | ## Open Questions 60 | 61 | - Quality Assurance for Manifests: the component team is responsible for ensuring the quality of its manifests. Any changes should undergo thorough verification before merging into the default branch, as it serves as a crucial integration point. ODH nightly build will be generated and subjected to essential verification steps, enabling a rapid feedback loop. 
62 | - Flexible Component Repository Proposal: the proposal is to use the component repository within the "opendatahub-io" organization. However, if this doesn't align with most use cases, we are open to making it more configurable to accommodate different components. 63 | - Verification Process Proposal: In the current workflow, changes made to the odh-manifests Git repository undergo comprehensive testing steps as a pre-merge CI process. However, a defined process for post-transition verification remains pending. This raises the question of whether verification should be conducted within the Operator CI system or if we should rely on the ODH nightly build solely for post-merge verification purposes. Further clarification and decision-making are needed in this regard. 64 | - During the transition period, the operator continues to utilize the `odh-manifests` git repository to retrieve manifests for any components that are not yet ready to host manifests. 65 | 66 | ## Alternatives 67 | 68 | N/A 69 | 70 | ## Security and Privacy Considerations 71 | 72 | N/A 73 | 74 | ## Risks 75 | 76 | There are a couple of potential risks associated with the proposed approach: 77 | 78 | - Operator Logic Updates: The component teams will be required to update the logic within the operator to align with the newly updated manifests. This transition may demand additional effort and coordination to ensure a seamless integration. 79 | - Non-Production Manifests in Runtime: As a consequence of the new setup, there's a possibility that users might unintentionally use non-production manifests to build image or during runtime. This could potentially lead to unexpected behavior or issues. 80 | - Monitoring Maintenance: The platform team will take primary responsibility for maintaining any modifications made to downstream monitoring manifests. 81 | - ODH Release Cycle: Each component must provide a valid release tag from its git repository for the release coordinator to update the configuration in the operator. Otherwise, it falls back to using the default branch or tag as specified in the table above. 82 | 83 | Addressing these risks through careful planning and communication will be crucial to the success of the manifest repository transition. 84 | 85 | ## Stakeholder Impacts 86 | 87 | | Group | Key Contacts | Impacted? 
| 88 | | ----------------------- | ----------------------------------------- | --------- | 89 | | Platform | [Landon LaSmith](@LaVLas) | Yes | 90 | | Open Data Hub UI | [Andrew Ballantyne](@andrewballantyne) | Yes | 91 | | Data Science Pipelines | [Giulio Frasca](@gmfrasca) | Yes | 92 | | Model Serving | [Daniel Zonca](@danielezonca) | Yes | 93 | | Workbenches | [Harshad Reddy Nalla](@harshad16) | Yes | 94 | | Distributed Workloads | [Anish Asthana](@anishasthana) | Yes | 95 | 96 | ## References 97 | 98 | N/A 99 | 100 | ## Reviews 101 | 102 | | Reviewed by | Date | Notes | 103 | | ------------------------------ | ------------ | ----- | 104 | |[Humair Khan](@HumairAK) | 2023-09-08 | ----- | 105 | |[Anish Asthana](@anishasthana) | 2023-09-10 | ----- | 106 | |[Vaishnavi Hire](@VaishnaviHire)| 2023-09-11 | ----- | 107 | |[Harshad Reddy Nalla](@harshad16)|2023-09-11 | ----- | 108 | |[Joohoo Lee](@Jooho) | 2023-09-20 | ----- | 109 | |[Andrew Ballantyne](@andrewballantyne)| 2023-09-25|-----| 110 | -------------------------------------------------------------------------------- /architecture-decision-records/operator/ODH-ADR-Operator-0002-operator-scope.md: -------------------------------------------------------------------------------- 1 | # Open Data Hub - Operator Scope 2 | 3 | | | | 4 | | -------------- | ---------- | 5 | | Date | Sep 5th, 2023 | 6 | | Scope | Open Data Hub Operator | 7 | | Status | Approved | 8 | | Authors | [Edson Tirelli](@etirelli), [Vaishnavi Hire](@VaishnaviHire) | 9 | | Supersedes | N/A | 10 | | Superseded by: | N/A | 11 | | Tickets | [Tracker issue](https://github.com/opendatahub-io/opendatahub-operator/issues/158) | 12 | | Other docs: | [TODO: add link to ODH Operator Design](http://) | 13 | 14 | ## What 15 | 16 | OpenShift and Kubernetes operators can be Namespace scoped, Multi-namespace scoped or Cluster scoped. The choice of the 17 | scope has impacts both on required permissions as well as operator capabilities. 18 | 19 | The ODH Operator v2.x is a meta-operator that manages a number of resources and other operators in the OpenShift cluster. 20 | As such, the operator was implemented as a Cluster Scoped Operator. This ADR captures the reasoning and impact of the 21 | decision. 22 | 23 | ## Why 24 | 25 | The ODH operator acts like a meta operator, installing and managing other operators and resources. In particular the operator manages the following types of resources, among possibly others: 26 | 27 | - Namespaces 28 | - Deployment 29 | - Role 30 | - ClusterRole 31 | - RoleBindings 32 | - ClusterRoleBindings 33 | - ConfigMaps 34 | - Secrets 35 | - Service 36 | - NetworkPolicy 37 | - Route 38 | 39 | The operator deploys the following Custom Resource Definitions: 40 | 41 | - DSCInitialization 42 | - DataScienceCluster 43 | - FeatureTracker 44 | 45 | The operator (optionally) requires and/or manages the following dependent operators, among others: 46 | 47 | - KServe 48 | - ModelMesh 49 | - Data Science Pipeline Operator 50 | - KubeRay 51 | - Codeflare Operator 52 | - Kueue 53 | - Training Operator 54 | - TrustyAI Operator 55 | - Model Registry Operator 56 | 57 | In order to properly manage these resources, the operator requires access and permissions across namespaces. 
In particular, the operator creates and/or manages the following namespace (the namespace name can be changed via configuration): 58 | 59 | - opendatahub 60 | - odh-model-registries 61 | 62 | The operator also leverages the platform’s [Owner References](https://kubernetes.io/docs/concepts/overview/working-with-objects/owners-dependents/) capability that ensures all dependent resources are owned by the operator and their life cycles are tracked and managed cleanly. Kubernetes however restricts the use of owner references in a way that only cluster scoped resources can own other cluster scoped resources, leading to the requirement of the operator being cluster scoped. 63 | 64 | ## Goals 65 | 66 | * The operator needs to support management of resources across namespaces 67 | * The operator must track all resources it creates and manages 68 | * The operator must remove resources created by a managed custom resource when that custom resource is removed 69 | 70 | ## Non-Goals 71 | 72 | * 73 | 74 | ## How 75 | 76 | The ODH operator v2.x is set to Cluster scope. 77 | 78 | ## Alternatives 79 | 80 | We considered the use of a namespace scoped operator, but the following limitations were determinant to choose the Cluster scope instead: 81 | 82 | ### 1. Some dependent resources are cluster scoped 83 | Some of the resources/dependencies that OpenShift AI requires are cluster scoped (e.g. Serverless, ModelMesh, etc). Managing these resources/dependencies requires the operator to also be cluster scoped. 84 | The operator also introduces two new CRDs (DSCInitialization) and (DataScienceCluster) that are cluster scoped. 85 | 86 | ### 2. Tracking and management of dependent resources 87 | One of the drivers for the design and development of the ODH v2 operator was the need to increase the resilience and reliability of the tracking and management of dependent resources. In particular the operator should provide accurate, clear and reliable diagnostics in case of problems with dependencies, and be able to properly maintain a clean cluster in case of upgrades or uninstalls. 88 | The operator leverages platform capabilities for that task, like owner references. Namespaced operators cannot own resources outside their own namespace (as per Kubernetes design). Scoping the operator to a namespace would require us to stop using owner references and implement from scratch the management of resources, using a different mechanism like labels. That would be complex, brittle, and duplicate functionality available in the platform itself. 89 | 90 | ### 3. Contradictory requirements with other users and best practices 91 | The use of namespaced operator would prevent us from meeting the requirements of other customers and users (e.g. automated install and upgrades of resources across namespaces). 92 | 93 | ## Tradeoffs 94 | 95 | The main drawback of using a cluster scoped operator is the impossibility of running multiple instances and/or versions of RHOAI in the same cluster. Users that need multiple instances/versions of RHOAI are required to use one cluster for each. For cases where cost is a concern, alternatives like Hosted Control Plane can be considered. 96 | 97 | ## Stakeholder Impacts 98 | 99 | | Group | Key Contacts | Date | Impacted? | 100 | | ----------------------------- | ---------------- | ---------- | --------- | 101 | | ODH Platform Team | Vaishnavi Hire | 2023/09/05 | ? 
| 102 | 103 | ## References 104 | 105 | * ODH Operator Design document 106 | 107 | ## Reviews 108 | 109 | | Reviewed by | Date | Notes | 110 | | ----------------------------- | --------- | ------| 111 | | [Vaishnavi Hire](https://github.com/VaishnaviHire) | Sep 15, 2023 | | 112 | | [Wen Zhou](https://github.com/zdtsw) | Sep 15, 2023 | | 113 | | [Trevor Royer](https://github.com/strangiato) | Sep 15, 2023 | | 114 | 115 | -------------------------------------------------------------------------------- /architecture-decision-records/operator/ODH-ADR-Operator-0003-component-integration.md: -------------------------------------------------------------------------------- 1 | # Open Data Hub - ODH component Integration with DataScienceCluster 2 | 3 | | | | 4 | | -------------- |----------------------------------| 5 | | Date | 2023-09-18 | 6 | | Scope | | 7 | | Status | Draft | 8 | | Authors | [Vaishnavi Hire](@VaishnaviHire) | 9 | | Supersedes | N/A | 10 | | Superseded by: | N/A | 11 | | Tickets | | 12 | | Other docs: | none | 13 | 14 | ## What 15 | 16 | This document outlines design decision to integrate ODH components with DataScienceCluster CRD. The document also defines the rationale behind fostering a close-knit integration between the individual components and the operator API. 17 | 18 | ## Why 19 | 20 | The [KfDef](https://github.com/opendatahub-io/opendatahub-operator/blob/master/config/crd/bases/kfdef.apps.kubeflow.org_kfdefs.yaml) CRD, defined by the v1.x of ODH operator allowed any valid kustomize manifests to be deployed to the OpenShift cluster. 21 | However, this design introduced following drawbacks: 22 | 23 | - **Managing Resources:** Monitoring all resources for custom components became strenuous, coupled with the hindrance in utilizing owner references for a clean-up process. 24 | - **Limited Customization:** The existing structure restricted the customization of components, offering limited accessibility to component-specific fields through the API. 25 | - **Duplication of GitOps Workflow/ kustomize-build:** The KfDef CRD replicated the kustomize build functions, presenting no supplemental features post the deployment of the components. 26 | 27 | ## Goals 28 | 29 | - **Increased Component Customization:** Allowing the [DataScienceCluster](https://github.com/opendatahub-io/opendatahub-operator/blob/main/config/crd/bases/datasciencecluster.opendatahub.io_datascienceclusters.yaml) API to expose every integrated component will accord users the flexibility to directly configure component-specific fields via the CRD, thereby expanding customization scope. 30 | - **Improved Component Management:** As every component is tightly coupled with the operator, the controller is aware of the resources being deployed and only has the permissions to watch and manage those specific resources. This also allows operator to manage component lifecycle and upgrades. 31 | - **Informed Approach:** The goal is to ensure that the operator has knowledge of the components being deployed, and make intelligent decisions based on that knowledge. 32 | ## Non-Goals 33 | 34 | - This ADR will not define transition of Tier 2 components into Tier 0/1. 35 | 36 | ## How 37 | 38 | - To achieve this transition, any new component should be integrated with ODH Operator by following steps given [here](https://github.com/opendatahub-io/opendatahub-operator/blob/main/components/README.md). As a result update the DataScieceCluster API to expose new component fields. 
39 | - Ensure any components that are integrated follow the requirements for Tier 0/1 components. 40 | 41 | ## Open Questions 42 | 43 | - **Quality Assurance for Components:** The component team is responsible for ensuring unit tests are added to any new component specific code and for update operator [e2e tests](https://github.com/opendatahub-io/opendatahub-operator/blob/main/tests/e2e/helper_test.go#L55) to include testing of the 44 | new component. 45 | 46 | 47 | ## Alternatives 48 | 49 | - Any valid kustomize manifests that users want to deploy alongside ODH integrated components, can be deployed using kustomize build 50 | or GitOps workflow. 51 | 52 | ## References 53 | 54 | N/A 55 | 56 | -------------------------------------------------------------------------------- /architecture-decision-records/operator/ODH-ADR-Operator-0005-configure-resources.md: -------------------------------------------------------------------------------- 1 | # Open Data Hub - Whitelist some component fields for user customizations 2 | 3 | | | | 4 | |----------------|--------------------------------------------------------------------------| 5 | | Date | 2024-03-07 | 6 | | Scope | | 7 | | Status | Draft | 8 | | Authors | [Vaishnavi Hire](@VaishnaviHire) | 9 | | Supersedes | https://github.com/opendatahub-io/architecture-decision-records/pull/23 | 10 | | Superseded by: | N/A | 11 | | Tickets | | 12 | | Other docs: | none | 13 | 14 | ## What 15 | 16 | This document outlines design decision to introduce ability for user customizations of fields like resources and replicas. 17 | 18 | ## Why 19 | 20 | ODH component deployments lack a mechanism for customizing resource limits, requests and deployment replicas. Deployment fields are hard-coded in the manifests with no means to adjust them according to user requirements. 21 | We need a mechanism for users to configure resources when available resources are limited. 22 | 23 | ## Goals 24 | 25 | * Enable the configuration of resource limits, requests and replicas for individual components. 26 | * Introduce an internal kustomize plugin that will whitelist fields like `replicas` and `resources`. This means any changes to these 27 | fields will not be overwritten by the operator. 28 | * We will not update or expose any fields 29 | 30 | ## Non-Goals 31 | 32 | - This ADR will not define customization parameters other than `resources` and `replicas`. 33 | 34 | ## How 35 | 36 | * Implementation for this will be done in two phases 37 | * Introduce customizations only in **Kserve** component. This is to address resource utilization [issues](https://github.com/kserve/kserve/issues/3467) in kserve. 38 | * Replicate the functionality in other components that require customization. 39 | * Kustomize [plugins](https://github.com/kubernetes-sigs/kustomize/tree/master/plugin/builtin) can be used to patch resources once deployed. 40 | 41 | 42 | ## Alternatives 43 | 44 | - N/A 45 | 46 | 47 | ## Stakeholder Impacts 48 | 49 | | Group | Key Contacts | Impacted? | 50 | | ----------------------- |----------------------------------------------------------------|-----------| 51 | | Platform | [Landon LaSmith](@LaVLas), [Edson Tirelli](@etirelli) | Yes | 52 | | Model Serving | [Daniel Zonca](@danielezonca), [Edgar Hernández](@israel-hdez) | Yes | 53 | | ODH Dashboard Team | @andrewballantyne | ? | 54 | | IDE Team | @harshad16 | ? | 55 | | DS Pipelines Team | @HumairAK | ? | 56 | | Serving Team | @Jooho | ? | 57 | | TrustyAI Team | @RobGeada | ? | 58 | | Distributed Workloads | @dimakis | ? 
| 59 | 60 | ## References 61 | 62 | N/A 63 | 64 | -------------------------------------------------------------------------------- /architecture-decision-records/operator/ODH-ADR-Operator-0006-internal-api.md: -------------------------------------------------------------------------------- 1 | # Open Data Hub - Architecture Decision Record template 2 | 3 | 4 | 5 | 6 | 7 | | | | 8 | | -------------- | ---------- | 9 | | Date | Sep 13th, 2024 | 10 | | Scope | Open Data Hub Operator | 11 | | Status | Approved | 12 | | Authors | [Luca Burgazzoli](@lburgazzoli), [Vaishnavi Hire](@VaishnaviHire) | 13 | | Supersedes | N/A | 14 | | Superseded by: | N/A | 15 | | Tickets | | 16 | | Other docs: | none | 17 | 18 | ## What 19 | 20 | The Open Data Hub Operator is a meta-operator that manages a number of resources and other operators in the OpenShift cluster leveragin a number of CRDs: 21 | - **DSCInitialization (DSCI)**: a cluster scope user-facing API that the operator creates to perform initial setup common for all components. 22 | - **DataScienceCluster (DSC)**: a cluster scope user-facing API that the ODH Operator watches to enable and configure various data science components. 23 | - **FeatureTracker**: an internal API that represents a cluster-scoped resource in the Data Science Cluster specifically designed for monitoring and managing component related resources created by the operator. 24 | 25 | This document outlines design decision to introduce additional, internal only, components specific CRDs. 26 | 27 | ## Why 28 | 29 | The deployment of components is handled by a single reconcile loop that is in charge to deploy all the enabled components within the platform. 30 | This means that one centralized loop handles the configuration, updates, and error handling for all components. 31 | 32 | ![architecture](assets/ODH-ADR-Operator-0006/odh-operator-current.png) 33 | 34 | However, this design introduced following drawbacks: 35 | - **Scalability**: as the ODH Operator evolves and new features or components are added, the single reconcile loop can become a significant bottleneck. The centralized nature of the loop means that any update or change in one component triggers a reconciliation for all components, even those unaffected by the change. This can lead to delays in processing, reduced responsiveness, and a potential decrease in the overall performance of the operator. 36 | - **Granularity**: the single reconcile loop lacks granularity, meaning that any update or error in one component forces the loop to reconcile all components. This can cause unnecessary workload and resource consumption, particularly in large deployments where only a subset of components may need updating. 37 | - **Error handling and failure isolation**: One of the most significant challenges with this model is error handling and failure isolation. Since all components are managed within the same loop, a failure in one part of the reconciliation process can affect the entire loop, potentially disrupting the deployment or management of other components. 38 | 39 | ## Goals 40 | 41 | - Improve efficiency and scalability of the ODH Operator. 42 | - Improve developer productivity and ability to add more features/components/subsystems over time. 43 | - Improve visibility of the state of the platform. 44 | - Reduce resource ustilisation, having a more fine grained reconciliation and impacting only components that have changed, so less resources spent rendering not impacted resources and less call to the API server. 
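To make these goals concrete, the sketch below shows what one of the internal, component-specific resources introduced in the How section further down might look like. The group, kind, and fields are hypothetical placeholders for illustration; they are not a published or user-facing API.

```yaml
apiVersion: components.platform.opendatahub.io/v1alpha1   # hypothetical internal group/version
kind: Dashboard                                           # hypothetical per-component kind
metadata:
  name: default-dashboard                                 # created and owned by the DSC reconciler
spec: {}                                                  # configuration propagated from the DataScienceCluster
status:
  conditions:
    - type: Ready
      status: "True"
  url: https://odh-dashboard.example.com                  # component-specific detail surfaced back to the DSC status
```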
45 | 46 | ## Non-Goals 47 | 48 | * Provide additional, user facing CRDs. 49 | 50 | ## How 51 | 52 | Move from a monolithic reconciliation loop to a more modular approach that manages components individually by: 53 | 54 | - Introducing more granular reconciliation. 55 | - Introducing a set of internal APIs/CRDs. 56 | - These CRDs will be used exclusively by the operator for internal management and will not be exposed to end-users and marked as [internal objects](https://docs.openshift.com/container-platform/4.16/operators/operator_sdk/osdk-generating-csvs.html#osdk-hiding-internal-objects_osdk-generating-csvs) so the OCP console hides them in the Operator UIs. 57 | - Since CRDs are public, the ODH Operator will exclusively own them and it will revert any changes applied outside its control. 58 | - Each internal API/CRD will have its own reconciliation loop, which: 59 | - Is responsible for managing the lifecycle and state of the associated components. 60 | - Surfaces any relevant information as part of the component specific status (versions, namespace, routes, service, etc). 61 | - The DSC reconciler would be in charge: 62 | - To create/update/delete internal CRs depending on the management state and configuration of the components. 63 | - To updates status of the DSC based on individual component CR statuses. 64 | 65 | 66 | ![architecture](assets/ODH-ADR-Operator-0006/odh-operator-next.png) 67 | 68 | ## Alternatives 69 | 70 | N/A 71 | 72 | ## Stakeholder Impacts 73 | 74 | | Group | Key Contacts | Date | Impacted? | 75 | | ------------------ | ---------------- | ---------- | --------- | 76 | | ODH Platform Team | [Luca Burgazzoli](@lburgazzoli), [Vaishnavi Hire](@VaishnaviHire) | 2024/09/13 | YES | 77 | 78 | ## Reviews 79 | 80 | | Reviewed by | Date | Notes | 81 | | ----------------------------- | --------- | ------| 82 | | name | date | ? | 83 | -------------------------------------------------------------------------------- /architecture-decision-records/operator/ODH-ADR-Operator-0007-auth-crd.md: -------------------------------------------------------------------------------- 1 | # Open Data Hub - Addition of an Auth CR 2 | 3 | 4 | 5 | 6 | 7 | | | | 8 | | -------------- | ---------- | 9 | | Date | 22-10-2024 | 10 | | Scope | Open Data Hub Operator | 11 | | Status | Accepted | 12 | | Authors | [Steven Tobin](@StevenTobin) | 13 | | Supersedes | N/A | 14 | | Superseded by: | N/A | 15 | | Tickets | [RHAOIENG-14807](https://issues.redhat.com/browse/RHOAIENG-14807)| 16 | | Other docs: | none | 17 | 18 | ## What 19 | 20 | This document outlines the decision to implement an auth CRD in ODH. 21 | 22 | ## Why 23 | 24 | There is a growing need for the ODH operator to provide centralized authentication and authorization services for the platform. The near term needs are : 25 | 26 | - Add support for managing user groups. Currently this is managed by the Dashboard component and needs to be centralized. 27 | - Prepare for upcoming for changes to handle OCP Platform support for external OIDC authentication 28 | 29 | Longer term, this API and its controller will be enhanced to handle additional RBAC configuration currently handled in the Dashboard component, such as creating roles and role bindings. 30 | 31 | Centralising auth concerns will simplify the architecture of auth features, security and ease of use of these features for users. 32 | 33 | 34 | ## Goals 35 | 36 | * Define a new auth CRD. 
37 | 38 | ## Non-Goals 39 | 40 | 41 | ## How 42 | 43 | A new API is added to the operator in the form of an auth CRD in the services api group. This will initially handle a list of adminGroups and allowedGroups (migrated from the groupsConfig field of the dashboardConfig) which the operator will reconcile for access to dashboard UIs and applying requisite openshift permissions. This is intended to be the canonical place for auth configuration for future auth initiatives. 44 | 45 | The Auth CR is a singleton like the DSC and DSCi CRs. 46 | 47 | An example of the CR: 48 | ``` 49 | apiVersion: services.platform.opendatahub.io/v1alpha1 50 | kind: Auth 51 | metadata: 52 | name: odhAuth 53 | spec: 54 | adminGroups: [] 55 | allowedGroups: [] 56 | ``` 57 | 58 | ## Open Questions 59 | * Migration Path: 60 | * We will use [CEL](https://kubernetes.io/blog/2022/09/29/enforce-immutability-using-cel/#immutability-upon-object-creation) to make the groupsConfig field in the current OdhDashboardConfig CRD immutable. The operator will manage copying the content of the field over to the new CRD and the Auth CRD will be the new source of truth for adminGroups and allowedGroups. 61 | 62 | ## Alternatives 63 | 64 | * Continue to not have a central API for auth. 65 | 66 | ## Security and Privacy Considerations 67 | 68 | 69 | ## Risks 70 | 71 | 72 | ## Stakeholder Impacts 73 | 74 | | Group | Key Contacts | Date | Impacted? | 75 | | ----------------------------- | ---------------- | ---------- | --------- | 76 | | ODH platform | [Luca Burgazzoli](@lburgazzoli), [Chris Sams](@), [Lindani Phiri](@), [Steven Tobin](@StevenTobin) | 22-10-2024 | ? | 77 | 78 | 79 | ## References 80 | 81 | * optional bulleted list 82 | 83 | ## Reviews 84 | 85 | | Reviewed by | Date | Notes | 86 | | ----------------------------- | --------- | ------| 87 | | name | date | ? | 88 | -------------------------------------------------------------------------------- /architecture-decision-records/operator/ODH-ADR-Operator-0007-components-version-mapping.md: -------------------------------------------------------------------------------- 1 | # Open Data Hub - Map component upstream versions to ODH releases 2 | 3 | | | | 4 | | -------------- |----------------------------------| 5 | | Date | 2024-10-31 | 6 | | Scope | Open Data Hub Operator | 7 | | Status | Approved | 8 | | Authors | [Saravana Srinivasan](@sasriniv) | 9 | | Supersedes | N/A | 10 | | Superseded by: | N/A | 11 | | Tickets | | 12 | | Other docs: | none | 13 | 14 | ## What 15 | 16 | This document is intended to outline the design decisions made to map upstream versions of components that are supported by Data Science Cluster to ODH releases. 17 | 18 | ## Why 19 | 20 | Users are expecting to know the list of upstream components and their versions that are being shipped with the product. There were already several requests from users wanting to know the component versions during releases. 21 | 22 | ## Goals 23 | 24 | - To maintain a standard to capture the list of upstreams of the components along with their version and repository url that are supported by Data Science Cluster. 25 | - Have it displayed in the Data Science Cluster's components status. 26 | 27 | ## How 28 | 29 | - Component teams are expected to create and maintain a "component_metadata.yaml" file in their repositories, in the root of the directory from where the manifests are retrieved by the ODH operator at build time. The yaml can contain release details of the upstreams in the format specified below. 
30 | ``` 31 | releases: 32 | - name: 33 | version: 34 | repositoryurl: 35 | - name: 36 | version: 37 | repositoryurl: 38 | ``` 39 | - Develop code logic on the ODH operator to read through the component_metadata.yaml file. 40 | - Update the component status section in Data Science Cluster with the information read from the yaml file 41 | 42 | ## Open Questions 43 | 44 | N/A 45 | 46 | ## Responsibility 47 | 48 | - **Updating version details:** The component teams are responsible for creating and maintaining the "component_metadata.yaml" file containing details of the upstreams and also, to promptly update the same when there are new additions to the upstreams. 49 | 50 | 51 | ## Alternatives 52 | 53 | - Initially, it was proposed to maintain this version information in an .env file. But, with further discussions and understanding, certain components have multiple upstreams with them. Maintaining and fetching information about multiple upstream releases from an env file would be cumbersome, this pushed us to choose a yaml file where we can group details of each of the upstreams. 54 | 55 | ## Stakeholder Impacts 56 | 57 | | Group | Key Contacts | Date | Impacted? | 58 | | ------------------------- | --------------------------------------------------------------- | ---------- | --------- | 59 | | ODH Platform Team | @lburgazzoli @lphiri | | y | 60 | | Model Serving | [Daniel Zonca](@danielezonca), [Edgar Hernández](@israel-hdez) | | y | 61 | | Model Serving Runtimes | [Sean Pryor](@Xaenalt), [Vaibhav Jain](@vaibhavjainwiz) | | y | 62 | | ODH Dashboard Team | @andrewballantyne | | y | 63 | | IDE Team | @harshad16 | | y | 64 | | DS Pipelines Team | @HumairAK @gmfrasca | | y | 65 | | Serving Team | @Jooho | | y | 66 | | TrustyAI Team | @RobGeada @ruivieira | | y | 67 | | Distributed Workloads | @dimakis @amsharma3 | | y | 68 | 69 | ## References 70 | 71 | RHOAISTRAT-327 [Refinement Document](https://docs.google.com/document/d/1nbQB-uA48x79Ci3xrpMHGfdo3XjTR4xirtWk0we0kl8) 72 | -------------------------------------------------------------------------------- /architecture-decision-records/operator/ODH-ADR-Operator-0008-resources-lifecycle.md: -------------------------------------------------------------------------------- 1 | # Open Data Hub - Resource Lifecycle Management in opendatahub-operator 2 | 3 | | | | 4 | | -------------- |----------------------------------| 5 | | Date | 2025-04-05 | 6 | | Scope | Open Data Hub Operator | 7 | | Status | TBD | 8 | | Authors | [Luca Burgazzoli](@burgazzoli) | 9 | | Supersedes | N/A | 10 | | Superseded by: | N/A | 11 | | Tickets | | 12 | | Other docs: | none | 13 | 14 | ## What 15 | 16 | This ADR defines how the opendatahub-operator manages the lifecycle of the Kubernetes resources it provisions using component-defined manifests and the `opendatahub.io/managed` annotation. 17 | 18 | ## Why 19 | 20 | The operator provisions and reconciles resources defined by components. 21 | To support a variety of use cases—including user-customizable objects and create-only behavior. There needs to be a clear contract regarding how resources are managed over time, especially in the presence of manual modifications or evolving manifests. 22 | 23 | ## Goals 24 | 25 | - Define consistent resource management semantics. 26 | - Allow component developers to declare create-only resources. 27 | - Allow end users to take ownership of specific resources post-deployment. 28 | - Prevent unintended reconciliation or overwriting of user-managed resources. 
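As an illustration of the create-only goal, a component manifest could mark a resource that should be seeded once and then left to users as shown below. The annotation is the one this ADR defines; the ConfigMap name and contents are hypothetical.

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: example-component-defaults      # hypothetical resource shipped in a component's manifests
  annotations:
    opendatahub.io/managed: "false"     # create-only: applied once, then not reconciled afterwards
data:
  logLevel: info                        # users may edit this value without it being overwritten
```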
29 | 30 | ## Terminology 31 | To ensure clarity throughout this document, we define the following terms: 32 | 33 | - Kustomized Manifest: The YAML configuration that has passed through the kustomization process but has not yet been applied to the cluster. This represents the intended state of resources as defined by component/services manifests and processed by kustomize. 34 | - Kubernetes Object: The actual resource that exists in the Kubernetes cluster after manifests have been applied. These are the live entities that the operator manages and users can interact with. 35 | 36 | ## How 37 | 38 | The operator evaluates the presence and value of the `opendatahub.io/managed` annotation on both the **Kustomized Manifest** and the **Kubernetes Object**: 39 | 40 | | Annotation Location | Value | Behavior | 41 | |--------------------------|------------------------------|--------------------------------------------------------------------------| 42 | | **Kustomized Manifest** | `"false"` | Resource is created once, not reconciled afterward (create-only). | 43 | | **Kustomized Manifest** | _Missing_ or `"true"` | Operator fully manages and reconciles the resource. | 44 | | **Kubernetes Object** | `"false"` | Operator skips reconciliation and treats the object as user-owned. | 45 | | **Kubernetes Object** | _Missing_ or `"true"` | Operator enforces manifest state and overwrites any manual modifications.| 46 | 47 | ### Additional Behavior 48 | 49 | - The `opendatahub.io/managed` annotation defined in Kustomized Manifests is **not propagated** to the resulting Kubernetes Objects to avoid misleading users into thinking they can control lifecycle via the cluster object. 50 | - For Kustomized Manifests with `opendatahub.io/managed: "false"`, the operator does not set an owner reference on the created Kubernetes Objects. This means that these objects will remain in the cluster rather than being garbage-collected when the component/service the objects are part of get removed/disabled. 51 | - **In all cases**, if the Kubernetes Object is deleted from the cluster (either accidentally or manually), the operator will **recreate it** during the next reconciliation loop. This ensures declared state is always realized, regardless of whether the object is fully managed or marked as create-only. 52 | 53 | 54 | ## Open Questions 55 | 56 | N/A 57 | 58 | ## Responsibility 59 | 60 | The ODH Platform team is responsible for implementing and maintaining the behavior described in this ADR within the `opendatahub-operator`. 61 | 62 | ## Alternatives 63 | 64 | N/A 65 | 66 | ## Stakeholder Impacts 67 | 68 | | Group | Key Contacts | Date | Impacted? 
| 69 | | ------------------------- | --------------------------------------------------------------- | ---------- | --------- | 70 | | ODH Platform Team | @lphiri | | y | 71 | | Model Serving | | | y | 72 | | Model Serving Runtimes | | | y | 73 | | Model Registry | | | y | 74 | | ODH Dashboard Team | @andrewballantyne | | y | 75 | | IDE Team | | | y | 76 | | DS Pipelines Team | | | y | 77 | | Serving Team | | | y | 78 | | TrustyAI Team | | | y | 79 | | Distributed Workloads | | | y | 80 | 81 | ## References -------------------------------------------------------------------------------- /architecture-decision-records/operator/assets/ODH-ADR-Operator-0006/odh-operator-current.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/architecture-decision-records/operator/assets/ODH-ADR-Operator-0006/odh-operator-current.png -------------------------------------------------------------------------------- /architecture-decision-records/operator/assets/ODH-ADR-Operator-0006/odh-operator-next.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/architecture-decision-records/operator/assets/ODH-ADR-Operator-0006/odh-operator-next.png -------------------------------------------------------------------------------- /documentation/README.md: -------------------------------------------------------------------------------- 1 | # RHOAI Architecture 2 | 3 | ## Architecture Overview 4 | [Architecture Overview](arch-overview.md) 5 | 6 | ## Component Architecture Details 7 | [Components](components) 8 | -------------------------------------------------------------------------------- /documentation/components/dashboard/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/dashboard/.gitkeep -------------------------------------------------------------------------------- /documentation/components/dashboard/assets/featureFlags.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /documentation/components/dashboard/assets/featureFlags.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/dashboard/assets/featureFlags.png -------------------------------------------------------------------------------- /documentation/components/dashboard/configuringDashboard.md: -------------------------------------------------------------------------------- 1 | [OdhDashboardConfig]: ./README.md#odhdashboardconfig-singleton 2 | 3 | # Configuring the Dashboard 4 | 5 | * [The Big Picture](#the-big-picture) 6 | * [Configuring Features On/Off](#configuring-features-onoff) 7 | * [UI-K8s Features](#ui-k8s-feature-eg-ds-projects-feature) 8 | * [UI-Backend Component Features](#ui-backend-component-feature-eg-ds-pipelines-feature) 9 | * [Configuring Aspects of Features](#configuring-aspects-of-features) 10 | 11 | ## The Big Picture 12 | 13 | ![featureFlags.png](assets%2FfeatureFlags.png) 14 | 15 | - (1) The user accesses the dashboard from their computer 16 | - (1a) They'll access the Dashboard route url 17 | - (1b) The client is served resources specifically from one pod (irrespective of replica counts) 18 | - (1c) This pod, for all intents and purposes, is the only thing the client sees & operates with 19 | - (2) All features use our "areas" concept (explained in more detail [below](#configuring-features-onoff)) 20 | - (3) "Areas" fetches all needed settings from the cluster 21 | - (3a) From the DSC & DSCI we get only `.status` values (this is typically only for features backed by an [OpenShift AI served backend](#ui-backend-component-feature-eg-ds-pipelines-feature)) 22 | - (3b) From our own OdhDashboardConfig, we will get our feature flags 23 | - (4) Every 2 minutes, each pod refreshes the internal cached state of the DashboardConfig (this delays any changes you make in this area) 24 | 25 | ## Configuring Features On/Off 26 | 27 | The Dashboard uses a concept we call "areas" to control the flow of features being visible in the Dashboard UI. It is effectively a 2-flag system around every feature we add. 28 | 29 | 1. A flag for the API installation 30 | 2. A flag for the UI installation 31 | * You may want to disable a UI flow, and interact via an API-driven flow, so you want the backend, but not the frontend 32 | 33 | "Areas" are a combination of system settings which can be simplified to these questions: 34 | 35 | * Does the Operator, specifically the DataScienceCluster (DSC), have the particular backend component installed? 36 | * Does the [OdhDashboardConfig] have the feature flag enabled? 37 | * Are other areas relied on (as a foundation) by this feature & are they installed? 38 | 39 | If the "area" is configured with any combination of the 3 questions above, every part of that combination must be true for the feature to be visible in the UI. 40 | 41 | A couple of examples are visible below, using snippets of the Dashboard configuration. 42 | 43 | ### UI-K8s Feature (eg. DS Projects feature) 44 | 45 | Our configuration would look something like this: 46 | ```javascript 47 | const configurations = { 48 | [SupportedArea.DS_PROJECTS_VIEW]: { 49 | featureFlags: ['disableProjects'], 50 | }, 51 | [SupportedArea.DS_PROJECTS_PERMISSIONS]: { 52 | featureFlags: ['disableProjectSharing'], 53 | reliantAreas: [SupportedArea.DS_PROJECTS_VIEW], 54 | }, 55 | // ...
56 | } 57 | ``` 58 | 59 | * The block above simply says "our feature flag `disableProjects` needs to be enabled to show the DS Projects View (the navigation-based view / list page)" 60 | * The second configuration `DS_PROJECTS_PERMISSIONS` allows us to configure a sub-portion of the DS Projects feature so that you can disable our Project sharing feature (if you're an admin of a project, you can share your project with another user / group on the cluster; aka invite them to join your project) -- this feature is reliant on the Project View being visible (as it's a sub feature) -- but this allows us to toggle the sub feature independently 61 | 62 | As you can see, there are no backend components listed here. This is because this feature is built on the K8s backend itself -- we are interacting with the K8s resources without needing a dedicated OpenShift AI backend. 63 | 64 | Note: this would effectively mean there is only 1 active layer of our 2-layered flag system for features like this. 65 | 66 | > Tip: To know what features are designed this way, consider the output of each feature. eg. [Creating a DS Project is just an OpenShift Project](#projects---openshift-console-vs-data-science-differences), creating Custom Images just creates ImageStreams on the cluster, etc 67 | 68 | ### UI-Backend Component Feature (eg. DS Pipelines feature) 69 | 70 | Our configuration would look something like this: 71 | ```javascript 72 | const configurations = { 73 | [SupportedArea.DS_PIPELINES]: { 74 | featureFlags: ['disablePipelines'], 75 | requiredComponents: [StackComponent.DS_PIPELINES], 76 | }, 77 | // ... 78 | } 79 | ``` 80 | 81 | * Similar to DS Projects, we have a feature flag 82 | * But now we see `requiredComponents`; this indicates that the DSC needs to provide a successful install of this component, otherwise the Dashboard feature flag means nothing -- no backend, can't use the UI anyway 83 | 84 | > Tip: To know what features are designed this way, the simple question is... is the feature in the DSC? 85 | 86 | ## Configuring Aspects of Features 87 | 88 | Many features have various configurations that allow them to be slightly adjusted to fit the needs of the customer; these usually are small 'nudges' in the way a feature works -- dropdown values, ordering of importance, display information. These aspects are almost exclusively UI flavouring and not something for the OpenShift AI stack to consume. 89 | 90 | See the comprehensive list below for Dashboard-specific aspects. 91 | 92 | > Note: This is not a comprehensive list of what each backend feature can do, just what we configure in the Dashboard for our needs. 93 | 94 | > Note: For this list, each item will be flagged with **(UI)** or **(API)**. UI features have a UI flow and can be configured through the API as needed (aka outside of the UI). API features are not able to be configured inside the UI and can only be configured outside of the UI.
95 | 96 | * Workbench Container Sizes **(API)** 97 | > Visible during creation of a Workbench & Jupyter tile's Notebooks 98 | * Configured through the [OdhDashboardConfig] `.spec.notebookSizes` an array of resource objects (memory & cpu limits/requests) 99 | * We have a fallback default if not provided 100 | * Model Serving Container Sizes **(API)** 101 | > Visible during the creation of a Model Server (or KServe model) 102 | * Configured through the [OdhDashboardConfig] `.spec.modelServingSizes` an array of resource objects (memory & cpu limits/requests) 103 | * We have a fallback default if not provided 104 | * Jupyter Tile configurations **(UI)** 105 | * PVC Size through the [OdhDashboardConfig] `spec.notebookController.pvcSize` 106 | * Telemetry 107 | -------------------------------------------------------------------------------- /documentation/components/dashboard/dashboardStorage.md: -------------------------------------------------------------------------------- 1 | [Workbench component documentation]: ../workbenches 2 | [AcceleratorProfiles]: ./README.md#acceleratorprofiles 3 | 4 | # Dashboard Storage Mechanisms 5 | 6 | There are only two types of storages we have in the Dashboard. Local to the user's browser & on-cluster storage. 7 | 8 | * [Browser Storage](#browser-storage) 9 | * [On-Cluster Storage](#on-cluster-storage) 10 | * [Admin / Dashboard Configurations](#admin--dashboard-configurations) 11 | * [Non-Admin flows](#non-admin-flows) 12 | 13 | ## Browser Storage 14 | 15 | User-specific choices are currently stored in the browser's storages (local & session storages). 16 | 17 | Such as: 18 | * "remember my choice" settings 19 | * Stop notebook on toggle modal not showing up 20 | * Remember to open Jupyter tile Notebooks in new tab without asking 21 | * Active QuickStart 22 | * Some technical infrastructure around detecting token expiry and auto-handling an auto logout 23 | 24 | ## On-Cluster Storage 25 | 26 | ### Admin / Dashboard Configurations 27 | 28 | Features that impact all users or the Dashboard itself. These are only available to those considered admin. 29 | 30 | * Cluster Settings 31 | * Such as: 32 | * Model serving platforms 33 | * PVC size (Notebook tile only) 34 | * Notebook pod tolerations 35 | * Telemetry 36 | * are stored in the [OdhDashboardConfig] 37 | * Cluster Settings' Notebook Culler 38 | * This is configured by the Notebook Controller feature 39 | * Stored today as `notebook-controller-culler-config` ConfigMap in the deployment namespace (see [Workbench component documentation] for more information) 40 | * Accelerator profiles 41 | * These are stored as [AcceleratorProfiles] 42 | * Notebook images 43 | * These are stored as ImageStreams in the deployment namespace 44 | * Serving Runtimes 45 | * These are stored as OpenShift Templates in the deployment namespace 46 | > Note: OpenShift Templates was an idea of future feature expansion and are not executed as OpenShift Templates today. 47 | * Connection Types 48 | * These are stored as ConfigMaps in the deployment namespace 49 | 50 | ### Non-Admin Flows 51 | 52 | Flows that can be performed by any user, provided the [feature is enabled](./configuringDashboard.md#configuring-features-onoff). 53 | 54 | These are all stored as K8s resources using OCP or OpenShift AI backing CRDs. 55 | 56 | #### Connections 57 | 58 | Connections is a concept created by the Dashboard to store information and enable users to connect to various data sources. This information is stored in a K8s secret. 
The data within these secrets conform to the schema defined within connection types. Connection types are predefined OOTB and can also be defined by an admin. 59 | 60 | ```yaml 61 | kind: Secret 62 | apiVersion: v1 63 | metadata: 64 | name: aws-connection- 65 | namespace: 66 | labels: 67 | opendatahub.io/dashboard: 'true' 68 | opendatahub.io/managed: 'true' 69 | annotations: 70 | opendatahub.io/connection-type: s3 71 | openshift.io/display-name: 72 | data: 73 | AWS_ACCESS_KEY_ID: 74 | AWS_DEFAULT_REGION: 75 | AWS_S3_BUCKET: 76 | AWS_S3_ENDPOINT: 77 | AWS_SECRET_ACCESS_KEY: 78 | type: Opaque 79 | ``` 80 | 81 | See more information on the labels & annotations in the [Connection section of the K8s Labels & Annotations](./k8sLabelsAndAnnotations.md#connections) 82 | 83 | See more information on how the Connection feature works by reading more in the [Feature Connections section](./features/connections.md). 84 | -------------------------------------------------------------------------------- /documentation/components/dashboard/features/README.md: -------------------------------------------------------------------------------- 1 | # Features In-Depth 2 | 3 | This area is intended to shed some light on how Dashboard features work. We don't typically have a lot of features that do not rely on some backend team, but for those that we do, questions are often asked about "how do they work". 4 | 5 | It is worth noting, some of this information will talk about other backend features when appropriate. This will not be an in-depth knowledge on how _those_ features work. Just how Dashboard interacts with them. 6 | 7 | ## Features 8 | 9 | 1. [Data Connections - Connection Types - Connections](./connections.md) 10 | 11 | Additional specific feature details coming soon... 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /documentation/components/dashboard/features/connections.md: -------------------------------------------------------------------------------- 1 | # Connections 2 | 3 | > Introduced in 2.16 -- Encompasses & replaces Data Connections 4 | 5 | * [Introduction](#introduction) 6 | * [Connection Types & Corresponding Connections](#connection-types--corresponding-connections) 7 | * [How Editing Impacts Things](#how-editing-impacts-things) 8 | * [Out of the Box (ootb) Offerings](#out-of-the-box-ootb-offerings) 9 | * [Connectivity](#connectivity) 10 | * [Workbench Connections](#workbench-connections) 11 | * [Model Serving Connections](#model-serving-connections) 12 | 13 | ## Introduction 14 | 15 | There are three terms we are working with here: 16 | 17 | * **Data Connections** -- The "old" (no longer used) term. This indicated the S3-compatible Data Connection that we had prior to 2.16; moving forward these are _Connections_ based on the S3 _Connection Type_ (which comes [ootb](#out-of-the-box-ootb-offerings)) 18 | * **Connection Types** -- These are like templates for new _Connections_. These are crafted & managed by the RHOAI admins and stored inside the deployment namespace; some come [ootb](#out-of-the-box-ootb-offerings) 19 | * **Connections** -- These are the instances inside a Data Science Project that can be connected to Workbenches & Model Serving Models, they are always based off a _Connection Type_ 20 | 21 | ## Connection Types & Corresponding Connections 22 | 23 | Connection Types are a form-driven way of adding a structured object that details the fields and structure of a Connection interface. 
These provide the RHOAI admin with some flexibility to how to structure Connections for their users. 24 | 25 | Connection Types are a configmap in the deployment namespace. They are managed in the Admin Settings page. There is a preview button to see what the Connection will look like built into the form builder and should allow good coordination on how it will work for users. 26 | 27 | Connections are project-based and always built off of one of the Connection Types that are accessible (created and enabled) to the user at time of creation. They are saved as Secrets inside the project. 28 | 29 | Connection types are of this structure: 30 | ```typescript 31 | type ConnectionTypeConfigMap = K8sConfigMapResource & { 32 | metadata: { 33 | annotations?: DisplayNameAnnotations & { 34 | 'opendatahub.io/disabled'?: 'true' | 'false'; 35 | 'opendatahub.io/username'?: string; 36 | }; 37 | labels: DashboardLabels & { 38 | 'opendatahub.io/connection-type': 'true'; 39 | }; 40 | }; 41 | data?: { 42 | category?: string; 43 | // JSON array of ConnectionTypeFields 44 | fields?: string; 45 | }; 46 | }; 47 | ``` 48 | 49 | Each `ConnectionTypeField` is a configuration of a type of field. Read more about the [Dashboard Labels & Annotations over here](../k8sLabelsAndAnnotations.md). 50 | 51 | The supported field types today are: 52 | * BooleanField 53 | * DropdownField 54 | * FileField 55 | * HiddenField 56 | * NumericField 57 | * SectionField 58 | * ShortTextField 59 | * TextField 60 | * UriField 61 | 62 | > Note: Each of these have fields to configure read-only, required, and varying configurations based on the type. There is quite a bit of variability here, so the details can be added if that kind of granularity is needed. 63 | 64 | ### How Editing Impacts Things 65 | 66 | Editing an existing connection attempts to re-present the same look and feel at the time of creation with a few exceptions: 67 | 68 | * If the Connection Type has since been modified 69 | * & new fields added 70 | * Then the new fields will be accessible 71 | * There should be limited impact, with exceptions 72 | * If the new fields are required, they will prevent users from resaving changes in their Connections until they are updated with new values 73 | * If the new fields have defaults, it will require the users to edit and resave the Connections 74 | * & existing fields removed 75 | * If they were not used, it will be pretty seamless to the form experience 76 | * If they were used, the field will be marked with very little information [1] & will just be the environment variable name to its value 77 | * & changes an existing field's type 78 | * The field should remain as-is in the old type/value until otherwise modified 79 | * If the Connection Type has since been deleted 80 | * The Connection edit screen will have very little information [1] and will entirely just be a listing of environment variable names to their value 81 | 82 | > [1] All the metadata comes from the Connection Type not the Connection itself; metadata such as: section information, the display name of the field, defaults, readonly, required, etc 83 | 84 | It is worth noting that if the Connection Type is just disabled, we will still pull the configuration details during edits, but it cannot be used in future creations until it is re-enabled. This gives an avenue to use this functionality as a way to version existing Connection Types without encountering issues with existing Connections. 
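To make the `ConnectionTypeConfigMap` structure shown earlier in this section more concrete, a minimal Connection Type ConfigMap could look roughly like the sketch below. The resource name, namespace, username, and the exact JSON schema inside `data.fields` are illustrative assumptions; the TypeScript definition above remains the authoritative shape.

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: example-connection-type          # hypothetical name
  namespace: opendatahub                 # the deployment namespace
  labels:
    opendatahub.io/dashboard: 'true'
    opendatahub.io/connection-type: 'true'
  annotations:
    openshift.io/display-name: Example connection type
    opendatahub.io/disabled: 'false'
    opendatahub.io/username: admin-user  # who created the connection type
data:
  category: example
  # JSON array of ConnectionTypeFields (abbreviated and illustrative only)
  fields: '[{"type": "short-text", "name": "Endpoint", "envVar": "EXAMPLE_ENDPOINT", "required": true}]'
```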
85 | 86 | ## Out of the Box (ootb) Offerings 87 | 88 | Admins can disable this offering and/or duplicate it to provide defaults or read-only aspects to help their users with some information. 89 | 90 | > Note: It is important to note that since the Connection Types are stored as ConfigMaps, passwords and other credential information are exposed in plain text if stored in the Connection Type. We do not recommend storing this kind of information in Connection Types at this time. 91 | 92 | #### S3 compatible object storage - v1 93 | To help with existing usages before the upgrade 2.16, we naturally continue to have support for S3. 94 | 95 | #### URI - v1 96 | We also provide a URI ootb variant to help with connecting public [2] URL models to model serving. 97 | 98 | > [2] At this time, private connections are not supported 99 | 100 | #### OCI compliant registry - v1 101 | With the OCI compliant registry ootb connection type, users are able to connect to a private container registry by providing a pull secret. Due to the presence of the `.dockerconfigjson` env variable, the created connection becomes a secret of type `kubernetes.io/dockerconfigjson` which can be used the same way traditional pull secrets are used in kubernetes, with the additional fields from the connection type. 102 | 103 | To connect to a public container registry, a user can use the "URI - v1" connection type and provide the URI to the image tag and prepending it with `oci://` 104 | 105 | ## Connectivity 106 | 107 | Connections don't do a lot by themselves; they effectively store configurations about how to connect to another source [3]. They reside inside projects and can connect to a few other resources that share the same project. Each interacts slightly different, so lets cover what those scenario details are. 108 | 109 | > [3] Technically speaking, a Connection Type can template any type of information which does not need to reflect relationships with another storage. They can store reusable variable values so you can share them with multiple Workbenches -- [more details below](#workbench-connections). 110 | 111 | ### Workbench Connections 112 | 113 | Workbenches are by far the most flexible of Connection consumers. All Connections are connected via `envFrom` (see below for an example), which injects all the keys of the Secret as environment variables. Consumption of the data can be done through the Workbenches' standard access to the environment variables (in Python that's `os.environ["ENV_NAME_HERE"]`). 114 | 115 | Under the hood -- the connectivity between a Connection and a Workbench exists as such: 116 | 117 | ```yaml 118 | apiVersion: kubeflow.org/v1 119 | kind: Notebook 120 | metadata: 121 | name: example-workbench 122 | # ...other properties 123 | spec: 124 | template: 125 | spec: 126 | # ...other properties 127 | containers: 128 | - name: the-notebook-container 129 | # ...other properties 130 | envFrom: 131 | - secretRef: 132 | name: my-s3-connection 133 | - secretRef: 134 | name: my-uri-connection 135 | ``` 136 | 137 | The `my-s3-connection` (using the ootb S3 Connection Type) & `my-uri-connection` (using the ootb URI Connection Type) are connected via the `envFrom` section on the notebook container. Since all Connections are secrets & are injected the same way as environment variables, it will always be mounted from `envFrom.secretRef.name` for each Connection irrespective of their structure. 
138 | 139 | > Note: It is important to note that since they are injected as environment variables, two Connections sharing the same variable will clobber each other and the "last one" wins. The UI will note this concern when you have two Connections overlapping. 140 | 141 | ### Model Serving Connections 142 | 143 | > Note: Due to the complexities of how Connections integrate with Model Serving, limited use-cases are available to Model Serving. 144 | 145 | Essentially we only have support for these types: 146 | * [S3-compatible](#s3-compatible-connection) 147 | * [URI](#uri-connection) 148 | * [OCI model cars](#oci-model-cars-connection) 149 | 150 | > Note: At this time there is not much else that can be done as it requires specific integration logic in order to connect a specific set of fields from the Connection to align it with the implementation of the Serving feature (KServe, Model Mesh, etc). 151 | 152 | #### S3-compatible Connection 153 | 154 | > Pulling a model from an S3-compatible Bucket 155 | 156 | Like _Data Connections_ in the previous world, these operate identically through the storage property. 157 | 158 | ```yaml 159 | apiVersion: serving.kserve.io/v1beta1 160 | kind: InferenceService 161 | metadata: 162 | name: model-example-using-s3 163 | # ...other properties 164 | spec: 165 | predictor: 166 | # ...other properties 167 | model: 168 | # ...other properties 169 | storage: 170 | key: my-s3-connection 171 | path: the/path/in/my/bucket 172 | ``` 173 | 174 | The `storage.key` is the Connection Secret. Note the `path` value is still used to qualify where in your S3 Connection bucket the model will be. 175 | 176 | #### URI Connection 177 | 178 | > Pulling a model from a public URI 179 | 180 | A new feature with the initial release of the Connection Types. 181 | 182 | ```yaml 183 | apiVersion: serving.kserve.io/v1beta1 184 | kind: InferenceService 185 | metadata: 186 | name: model-example-using-uri 187 | # ...other properties 188 | spec: 189 | predictor: 190 | # ...other properties 191 | model: 192 | # ...other properties 193 | storageUri: 'https://the-url-to-my-model.com/path' 194 | ``` 195 | 196 | The `storageUri` path is queried for the model and installed into the pod that is associated to your deployment. 197 | 198 | > Note: The `storageUri` field is an overloaded one in the KServe documentation and can have wider implications for usage. Anything that 199 | 200 | #### OCI Model Cars Connection 201 | 202 | > Pulling a model from an authenticated OCI container registry 203 | 204 | OCI is only supported on KServe single model serving deployments. Additionally, the image must be in a Modelcar[^Modelcar] format specified by KServe. 205 | 206 | ```yaml 207 | apiVersion: serving.kserve.io/v1beta1 208 | kind: InferenceService 209 | metadata: 210 | name: model-example-using-oci 211 | # ...other properties 212 | spec: 213 | predictor: 214 | imagePullSecrets: 215 | - name: oci-connection 216 | # ...other properties 217 | model: 218 | # ...other properties 219 | storageUri: 'oci://quay.io/someregistry/image:tag' 220 | ``` 221 | 222 | The `imagePullSecrets` points to the OCI connection. 223 | 224 | The `storageUri` path starts with `oci://` and points to an image. 
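For reference, the `oci-connection` Secret referenced above is an ordinary Connection Secret. A hedged sketch is shown below; the names, namespace placeholder, and the `connection-type-ref` value are illustrative, while the `kubernetes.io/dockerconfigjson` type follows from the `.dockerconfigjson` field described in the OCI ootb section.

```yaml
apiVersion: v1
kind: Secret
metadata:
  name: oci-connection
  namespace: <project-namespace>                 # same project as the InferenceService
  labels:
    opendatahub.io/dashboard: 'true'
  annotations:
    openshift.io/display-name: My OCI registry
    opendatahub.io/connection-type-ref: oci-v1   # illustrative reference to the ootb OCI connection type
type: kubernetes.io/dockerconfigjson             # because the connection carries a .dockerconfigjson field
data:
  .dockerconfigjson: <base64-encoded pull secret>
```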
225 | 226 | [^Modelcar]: https://kserve.github.io/website/latest/modelserving/storage/oci/#prepare-an-oci-image-with-model-data -------------------------------------------------------------------------------- /documentation/components/dashboard/k8sLabelsAndAnnotations.md: -------------------------------------------------------------------------------- 1 | [AcceleratorProfile]: ./README.md#acceleratorprofiles 2 | 3 | [`openshift.io/display-name`]: #openshiftiodisplay-name 4 | [`openshift.io/description`]: #openshiftiodescription 5 | [`opendatahub.io/recommended-accelerators`]: #opendatahubiorecommended-accelerators 6 | [`opendatahub.io/accelerator-name`]: #opendatahubioaccelerator-name 7 | [`opendatahub.io/sc-config`]: #opendatahubiosc-config 8 | 9 | # Dashboard K8s Labels & Annotations 10 | 11 | Dashboard has a reputation of using a lot of annotations and labels on various resources. This document should help serve to explain the use-cases behind each. 12 | 13 | > Note: Not all resources shown in the Dashboard are K8s driven resources. For those that are not, this page does not have any impact on them. 14 | 15 | > Note: This is not a comprehensive list of all labels & annotations used in OpenShift AI, just the ones managed and created by the Dashboard. Specific components may have ever-changing needs, so you should seek out those component's documentation for more information. 16 | 17 | * [Labels](#common-labels) 18 | * [`opendatahub.io/dashboard`](#opendatahubiodashboard) 19 | * [Annotations](#common-annotations) 20 | * [`openshift.io/display-name`] 21 | * [`openshift.io/description`] 22 | * [`opendatahub.io/recommended-accelerators`] 23 | * [`opendatahub.io/accelerator-name`] 24 | * [`opendatahub.io/sc-config`] 25 | * [Specific Use-Cases](#specific-use-cases) 26 | * [DS Projects](#data-science-projects) 27 | * [Connection Types](#connection-types) 28 | * [Connections](#connections) 29 | * [ImageStreams](#imagestreams) 30 | * [Notebooks](#notebooks) 31 | * [ServingRuntime Templates](#servingruntime-templates) 32 | * [Storage Classes](#storage-classes) 33 | * [Model Registry](#model-registry) 34 | 35 | ## Common Labels 36 | 37 | Common reused labels in the Dashboard. Key features of labels: 38 | 39 | * Is able to be used as a filter in a k8s request 40 | * Must be a restrictive k8s naming structure 41 | 42 | ### opendatahub.io/dashboard 43 | 44 | The most common dashboard label. The initial goal here was to mark all things created by the Dashboard, so we could reverse lookup said resources. This has proven to be a bit over aggressive, adding friction in customers making use of external of the Dashboard flows work with Dashboard flows (eg. gitops). 45 | 46 | This is a highly contentious label and will be seeing changes in the near future. 47 | 48 | > Note: This concept is deprecated for DS Projects, it is adding no value and is adding confusion to the concept of ["what is a DS Project?"](./README.md#projects---openshift-console-vs-data-science-differences) 49 | 50 | > Note: This concept is not entirely deprecated for some resources that have multiple uses, including those outside of OpenShift AI. But all OpenShift AI CRDs should not need this soon. 51 | 52 | ## Common Annotations 53 | 54 | Common reused annotations in the Dashboard. 
Key features of annotations 55 | 56 | * Can be a flexible field to be used for complex metadata or flexible usage of characters that are not K8s-safe (vs Labels) 57 | 58 | ### openshift.io/display-name 59 | 60 | Used heavily by Dashboard UI flows to allow OpenShift AI users to craft a readable & flexible name. 61 | 62 | > Note: This is optional, so we fall back on the resource's k8s name. 63 | 64 | ### openshift.io/description 65 | 66 | Used almost as heavily as the display-name annotation. This allows for a description of the resource the user is creating. This usually is shown next to the display-name once the resource is created. 67 | 68 | > Note: Some resources do not have the use for this, but that's more of an oversight than an intent. 69 | 70 | > Note: This annotation is not required, nor tied to the use of the display-name annotation. 71 | 72 | ### opendatahub.io/recommended-accelerators 73 | 74 | > Type: string array of [AcceleratorProfile] k8s names. 75 | 76 | This annotation is what we use to suggest a recommended connection between a resource's usage by the user & an accelerator profile created on the cluster. This appears as a tag next to the accelerator dropdown item in the UI. 77 | 78 | ### opendatahub.io/accelerator-name 79 | 80 | > Type: a string value of the [AcceleratorProfile] k8s name. 81 | 82 | This annotation is what we use to relate back to an accelerator profile. This is metadata to help with reselection of the right accelerator profile in read & edit modes. This is needed as a way to convert back from the resource values used to the proper profile the user selected. We have a fallback for legacy support for Nvidia GPU, but everything else will fail to locate a profile and show an intermediary custom profile that cannot be mutated in edit modes. 83 | 84 | If no accelerator was selected, this value should not appear. 85 | 86 | ### opendatahub.io/sc-config 87 | 88 | > Type: object 89 | ```js 90 | { 91 | displayName: string; 92 | isEnabled: boolean; 93 | isDefault: boolean; 94 | lastModified: string; 95 | description?: string; 96 | } 97 | ``` 98 | 99 | This annotation is used as internal Dashboard metadata to describe, enable, and set which storage class is the default. This annotation does not affect Openshift default storage classes. 100 | 101 | ## Specific Use-Cases 102 | 103 | ### Data Science Projects 104 | 105 | * Labels 106 | * `modelmesh-enabled` - required by Model Mesh to say the project is using model mesh configurations 107 | > Note: When this is `true`, the project is Model Mesh. When this is `false`, we key off it to say this project is KServe 108 | * Annotations 109 | * [`openshift.io/display-name`] 110 | * [`openshift.io/description`] 111 | 112 | For the Project Sharing feature specifically: 113 | * Label `opendatahub.io/project-sharing` is used to denote permissions crafted by Dashboard flows & thus show up in the Dashboard UI 114 | 115 | ### Connection Types 116 | 117 | * Labels 118 | * `opendatahub.io/connection-type` - a value of `true` indicates that the `ConfigMap` represents a connection type 119 | * Annotations 120 | * [`openshift.io/display-name`] 121 | * [`openshift.io/description`] 122 | * `opendatahub.io/disabled` - a `true` or `false` value indicates whether the connection type is disabled 123 | * `opendatahub.io/username` - the name of the user who created the connection type 124 | 125 | ### Connections 126 | 127 | * Labels 128 | * `opendatahub.io/managed` - Legacy value. 
Identifies data connections which are watched by the model mesh controller for the purpose of populating the model serving `storage-config` 129 | * Annotations 130 | * [`openshift.io/display-name`] 131 | * [`openshift.io/description`] 132 | * `opendatahub.io/connection-type` - Legacy value. Used to identify S3-compatible data connections; `s3` is the only supported value 133 | * `opendatahub.io/connection-type-ref` - a reference to the connection type that is used to create the connection 134 | 135 | ### ImageStreams 136 | 137 | > Note: Out-of-the-box variants of ImageStreams is a Workbench backed feature. 138 | 139 | These are configured by the admin in the UI and are provided as out-of-the-box examples. 140 | 141 | * General Annotations 142 | * [`opendatahub.io/recommended-accelerators`] 143 | * `opendatahub.io/notebook-python-dependencies` - the python dependencies that are included in the image to list to the user 144 | * `opendatahub.io/notebook-software` - the software that is included in the image to list to the user 145 | * Annotations used primarily by the out-of-the-box images provided by the workbench component 146 | * (tag) `opendatahub.io/image-tag-outdated` - a `true` or `false` value to say if the image is present for lookup, but not intended for selection 147 | * (tag) `opendatahub.io/workbench-image-recommended` - the recommended tag to suggest to the user 148 | * `opendatahub.io/notebook-image-order` - a weighed value to help with organization of the images in display lists for the user 149 | * Annotations used primarily by the Admin UI when created custom Notebook Images 150 | * `opendatahub.io/notebook-image-desc` - description provided by the user 151 | * `opendatahub.io/notebook-image-name` - a display name provided by the user 152 | * `opendatahub.io/notebook-image-url` - the original image value from the user before it's processed for the ImageStream 153 | 154 | ### Notebooks 155 | 156 | > Note: This is a Workbench backed feature. 157 | 158 | * Labels 159 | * `opendatahub.io/odh-managed` - (unknown, potential legacy without value) 160 | * `opendatahub.io/user` - a translated username; the Dashboard k8s-ifies the user's username so we can compare or look up by user in the future 161 | * Annotations 162 | * [`openshift.io/display-name`] 163 | * [`openshift.io/description`] 164 | * `opendatahub.io/username` - the actual username (related to the Label `opendatahub.io/user`) 165 | * [`opendatahub.io/accelerator-name`] 166 | * `opendatahub.io/workbench-image-namespace` - This annotation is used to indicate the scope of a workbench image. If the workbench image is project-scoped, this annotation is added with the workbench image’s namespace. If it’s global-scoped, the annotation is omitted. 167 | * `opendatahub.io/hardware-profile-namespace` - This annotation is used to indicate the scope of a hardware profile. If the hardware profile is project-scoped, this annotation is added with the hardware profile’s namespace. If it’s global-scoped, the annotation is omitted. 168 | * `opendatahub.io/accelerator-profile-namespace` - This annotation is used to indicate the scope of a accelerator profile. If the accelerator profile is project-scoped, this annotation is added with the accelerator profile’s namespace. If it’s global-scoped, the annotation is omitted. 169 | 170 | ### ServingRuntime Templates 171 | 172 | > Note: This is a Serving backed feature. 173 | 174 | These are configured by the admin in the UI and are provided as out-of-the-box examples. 
These are stored as OpenShift Templates under the hood, but the admin only ever sees a ServingRuntime when configuring. 175 | 176 | * Annotations (when configuring in the admin page) 177 | * [`openshift.io/display-name`] 178 | * `opendatahub.io/modelServingSupport` - (managed by the UI) an JSON Array of supported platforms; options: 'single', 'multi' 179 | * `opendatahub.io/apiProtocol` - (managed by the UI) the api protocols available; options (one of): 'REST', 'gRPC' 180 | * `opendatahub.io/disable-gpu` - (optional, typed in) if the ServingRuntime should not be used with GPUs (aka accelerators) 181 | * [`opendatahub.io/recommended-accelerators`] - (optional, typed in) 182 | 183 | * Annotations (when deploying in projects) 184 | * [`opendatahub.io/accelerator-name`] 185 | * `opendatahub.io/template-name` - the runtime used 186 | * `opendatahub.io/template-display-name` - the display name shown for the runtime 187 | * `opendatahub.io/serving-runtime-scope` - This annotation is used to identify whether a serving runtime template is project-scoped or global-scoped. 188 | * `opendatahub.io/hardware-profile-namespace` - This annotation is used to indicate the scope of a hardware profile. If the hardware profile is project-scoped, this annotation is added with the hardware profile’s namespace. If it’s global-scoped, the annotation is omitted. 189 | * `opendatahub.io/accelerator-profile-namespace` - This annotation is used to indicate the scope of a accelerator profile. If the accelerator profile is project-scoped, this annotation is added with the accelerator profile’s namespace. If it’s global-scoped, the annotation is omitted. 190 | 191 | ### Storage Classes 192 | 193 | * Annotations 194 | * [`opendatahub.io/sc-config`] - (managed by the UI) a JSON Blob of storage class metadata 195 | 196 | ### Model Registry 197 | 198 | * Labels 199 | * `opendatahub.io/rb-project-subject` - This label is used to distinguish RoleBindings with the group subject `system:serviceaccounts:{projectName}`, identifying them as specific to project service accounts. This allows us to use group RoleBindings separately for groups and projects, making sure they always appear in the view where they were created without relying on filtering by a string prefix. 200 | 201 | * `modelregistry.opendatahub.io/registered-model-id` and `modelregistry.opendatahub.io/model-version-id` - These labels identify InferenceServices deployed via the model registry UI and get the Model Registry Controller to sync the deployment. They are also used to filter InferenceServices when viewing the list of deployments for a specific model version. 202 | 203 | * `modelregistry.opendatahub.io/name` - This label provides a unique reference to InferenceServices deployed via a model registry. It ensures that models will be listed in the deployments tab of that specific registry, preventing incorrect listing across multiple registries with overlapping model IDs. 
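As a closing illustration of how several of the labels and annotations above come together on a single resource, a Workbench `Notebook` created through the Dashboard might carry metadata roughly like the sketch below. All values are hypothetical; refer to the Notebooks section above for what each key means.

```yaml
apiVersion: kubeflow.org/v1
kind: Notebook
metadata:
  name: my-workbench
  namespace: my-data-science-project
  labels:
    opendatahub.io/dashboard: 'true'                # legacy "created by the Dashboard" marker
    opendatahub.io/user: jane-doe                   # k8s-safe translation of the username
  annotations:
    openshift.io/display-name: My Workbench
    openshift.io/description: Experimentation workbench for the fraud model
    opendatahub.io/username: jane.doe@example.com   # the actual username
    opendatahub.io/accelerator-name: example-gpu-profile   # only present if an accelerator profile was selected
```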
204 | -------------------------------------------------------------------------------- /documentation/components/devops/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/devops/.gitkeep -------------------------------------------------------------------------------- /documentation/components/distributed-workload/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/distributed-workload/.gitkeep -------------------------------------------------------------------------------- /documentation/components/edge/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/edge/.gitkeep -------------------------------------------------------------------------------- /documentation/components/explainability/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/explainability/.gitkeep -------------------------------------------------------------------------------- /documentation/components/explainability/README.md: -------------------------------------------------------------------------------- 1 | # Model Explainability Architecture 2 | 3 | ![Model Explainability Diagram](diagram.png) 4 | 5 | The TrustyAI operator[^operator] is responsible for managing the lifecycle of `TrustyAIService` (TAS) Custom Resources (CR). 6 | 7 | ## `TrustyAIService` 8 | 9 | TrustyAI is designed to support a single `TrustyAIService` per namespace/project. Although multiple TASs can be created in the same namespace, and indeed work, due to the architecture this will not bring any additional benefit, and will only duplicate the computations performed by a single TAS. 10 | 11 | In the following sections, we will always assume a single TAS per namespace. 12 | 13 | The general syntax of the `TrustyAIService` CR is as follows: 14 | 15 | ```yaml 16 | apiVersion: trustyai.opendatahub.io/v1alpha1 17 | kind: TrustyAIService 18 | metadata: 19 | name: trustyai-service 20 | spec: 21 | storage: 22 | format: "PVC" 23 | folder: "/inputs" 24 | size: "1Gi" 25 | data: 26 | filename: "data.csv" 27 | format: "CSV" 28 | metrics: 29 | schedule: "5s" 30 | ``` 31 | 32 | - `metadata.name` specifies the name of the `TrustyAIService`. 33 | - `spec.storage.format` specifies the storage format. Currently, only `PVC` is supported. 34 | - `spec.storage.folder` specifies the folder where the input data is stored. 35 | - `spec.storage.size` specifies the size of the PVC to be used for storage. 36 | - `spec.data.filename` specifies the suffix of the storage file. 37 | - `spec.data.format` specifies the format of the data file (only `CSV` supported at the moment). 38 | - `metrics.schedule` specifies the interval at which the metrics are calculated, when a a calculation request is register with the service. 
39 | 40 | The default behaviour when installing a CR in a namespace is for the operator to provision the following resources: 41 | 42 | | Type | Name | Description | 43 | |-----------------------|--------------------------|----------------------------------------------------| 44 | | Deployment | `$(metadata.name)` | Deploys a pod with two containers (service and OAuth). | 45 | | PersistentVolumeClaim | `$(metadata.name)-pvc` | Claims a volume for the storage of the inference data. | 46 | | Service | `$(metadata.name)-service`| Internal service to the TrustyAI REST server. | 47 | | Service | `$(metadata.name)-tls` | Service to expose the TrustyAI OAuth server. | 48 | | Route | `$(metadata.name)` | Route exposing the `$(metadata.name)-tls`. | 49 | 50 | ## Payload consumption 51 | 52 | When a `InferenceService` (IS), either ModelMesh or KServe is detected by the operator in the same namespace as a `TrustyAIService`, the operator will try to configure the `InferenceService` to send the inference data to the `TrustyAIService` for processing. 53 | 54 | ### ModelMesh 55 | 56 | When a ModelMesh IS is detected, the operator will set the `PAYLOAD_PROCESSOR` environment to the internal `$(metadata.name)-service`. `PAYLOAD_PROCESSOR` is interpreted by ModelMesh as a space-delimited list of endpoints. If additional endpoints are present, the operator will append the `$(metadata.name)-service` to the list. If the processor is already present, the operator will not modify the list. 57 | 58 | ### KServe 59 | 60 | In the case of KServe, the operator will either add (if not present) or replace the `spec.logger` field with the internal `$(metadata.name)-service`. As example, the final[^kserveis] IS will look similar to: 61 | 62 | ```yaml 63 | apiVersion: serving.kserve.io/v1beta1 64 | kind: InferenceService 65 | metadata: 66 | name: sklearn-iris 67 | spec: 68 | predictor: 69 | logger: # Added by the TrustyAI operator 70 | mode: all 71 | url: http://$(metadata.name)-service.$namespace.svc.cluster 72 | model: 73 | modelFormat: 74 | name: sklearn 75 | storageUri: gs://kfserving-examples/models/sklearn/1.0/model 76 | ``` 77 | 78 | ## Authentication 79 | 80 | Each TAS will have two associated `Services`: 81 | 82 | - `$(metadata.name)-service` (1) 83 | - `$(metadata.name)-tls` (2) 84 | 85 | (1) will have no route associated with it, and will be used for internal communication between the IS and the TAS. This service does not support authentication or TLS at the moment (2) will be exposed via a `Route`, supports TLS and will be used for OAuth authentication. 86 | 87 | Request to (2) will be authenticated using a bearer token in the request header, `Authorization: Bearer `. These requests will be forwarded to the OAuth container (running `oauth-proxy`[^oauth-proxy]) for authentication. If the token is valid, the request will be forwarded to the TrustyAI service container. 88 | 89 | Requests to (1) will not be authenticated, and will be forwarded directly to the TrustyAI service container. 90 | 91 | 92 | 93 | [^operator]: [TrustyAI Operator repository](https://github.com/trustyai-explainability/trustyai-service-operator). 94 | [^kserveis]: Example IS taken from [KServe's documentation](https://kserve.github.io/website/0.11/modelserving/logger/logger/#create-message-dumper). 
95 | [^oauth-proxy]: [OAuth Proxy repository](https://github.com/openshift/oauth-proxy) 96 | -------------------------------------------------------------------------------- /documentation/components/explainability/diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/explainability/diagram.png -------------------------------------------------------------------------------- /documentation/components/feature_store/README.md: -------------------------------------------------------------------------------- 1 | # Feast Feature Store Architecture 2 | 3 | 7 | 8 | ## Introduction 9 | 10 | Feast (Feature Store) is an open-source feature store that helps teams operate production ML systems at scale by allowing them to define, manage, validate, and serve features for production AI/ML. 11 | 12 | ## Feast Feature Store High Level Architecture 13 | 14 | The architecture diagram presents a high level overview of OpenShift AI environment that integrates Feast for feature store, and other component like Data Science Pipelines / Notebooks for Model training, a Model Registry for using models, and Model Serving for model inference/serving. 15 | 16 | ![feature-store-overview](images/feature-store-overview.jpg) 17 | 18 | ## Goals 19 | 20 | - Integrate Feast Feature Store as a centralized repository for feature management within OpenShift AI. 21 | - Provide a unified view of features used during model training and serving stages. 22 | 23 | 24 | ## Components 25 | 26 | Feast's feature store is composed of below foundational components: 27 | 31 | 32 | 1. [**Offline Store**](https://docs.feast.dev/getting-started/components/offline-store) : Used for historical feature extraction used in model training. 33 | 2. [**Online Store**](https://docs.feast.dev/getting-started/components/online-store): Used for serving features at low-latency for inference requests in production. 34 | 3. [**Registry**](https://docs.feast.dev/getting-started/components/registry): Used to keep track of feature store definitions and state (optionally backed by GCS or S3 based persistent storage). The Feast Registry acts as the backbone for storing feature definitions, which are typically written in Python, stored in feature repositories, and shared across environments such as staging and production. 35 | 4. [**Feast Python SDK/CLI**](https://docs.feast.dev/reference/feast-cli-commands): The primary user facing SDK 36 | - Manage version controlled feature definitions. 37 | - Materialize (load) feature values into the online store. 38 | - Build and retrieve training datasets from the offline store. 39 | - Retrieve online features. 40 | 5. [**Batch Materialization Engine**](https://docs.feast.dev/getting-started/components/batch-materialization-engine) : A batch materialization engine is a component of Feast that's responsible for moving data from the offline store into the online store. 41 | 6. [**Feature Repository**](https://docs.feast.dev/reference/feature-repository/feature-store-yaml) : Contains Feature definitions files written in Python , and the `feature_store.yaml` file to configure the feature store, including data sources with Feast project. 42 | 7. 
[**Feature Server**](https://docs.feast.dev/reference/feature-servers/python-feature-server): The Feature Server is a core architectural component in Feast, designed to provide low-latency feature retrieval and updates for machine learning applications. 43 | It is a REST API server built using [FastAPI](https://fastapi.tiangolo.com/) and exposes a limited set of endpoints to serve features, push data, and support materialization operations. The server is scalable, flexible, and designed to work seamlessly with various deployment environments, including local setups and cloud-based systems. 44 | 8. [**Feature Store Controller/Operator**](https://github.com/feast-dev/feast/tree/master/infra/feast-operator): The Feature Store Controller/Operator is responsible for the deployment and management of the Feast servers ([Offline Server](https://docs.feast.dev/reference/feature-servers/offline-feature-server), [Online Server](https://docs.feast.dev/reference/feature-servers/python-feature-server), [Registry Server](https://github.com/feast-dev/feast/blob/master/docs/reference/feature-servers/registry-server.md)) in Kubernetes/OpenShift environments: 45 | 46 | ## Feature Store Flow with OpenShift AI. 47 | 48 | **1. Feature Store Initialization** 49 | The UI Dashboard initializes/Creates the FeatureStore CR and sets the config for feature services used for data ingestion, transformation, and storage. 50 | 51 | **2. Data Ingestion into Feature Store** 52 | Data can be sent to Feast either pre-processed (e.g., via batch or streaming data pipelines) or raw and transformed by the Feast feature server during data ingestion (i.e., transformed prior to being written to the online store). Feast simplifies the integration with various data sources by providing an opinionated yet flexible API. 53 | 54 | **3. Data Storage in Feature Store** 55 | 56 | **Offline Store**: A lower-cost, persistent storage system (e.g., data warehouse) optimized for storing large volumes of historical feature data used in model training and batch scoring. It prioritizes storage efficiency over low-latency access. 57 | **Online Store**: A higher-cost, low-latency storage system (e.g., in-memory database or cache) designed to provide rapid access to frequently used features during real-time inference, optimizing for speed and responsiveness. 58 | **Feature Registry:** Metadata storage to track feature definitions, feature transformations, and feature metadata. 59 | 60 | **4. Data Retrieval for Model Training** 61 | Data Science Pipelines retrieve historical features from the Offline Store. Feature views in the Feature Registry define how features are joined and retrieved. 62 | 63 | **5. Model Training** 64 | The retrieved data is split into train, test, and hold-out sets. The model is iteratively trained and evaluated on these datasets. 65 | 66 | **6. Model Evaluation** 67 | The trained model is validated on a hold-out dataset to assess its accuracy, generalizability, and impact to its domain. 68 | **Workflow:** Data retrieved → Train/Test split → Train Model → Evaluate Performance. 69 | 70 | **7. Model Registration** 71 | The trained model is registered in the Model Registry. 72 | 73 | **8. Model Deployment** 74 | The registered model is deployed to the Model Server. 75 | 76 | **9. Inference Request** 77 | There are three ways to orchestrate inference requests. 78 | 1. 
The application sends an inference request to the Model Server and the Model Server retrieves real-time features from the Feature Server and generates predictions using the retrieved features and deployed model. 79 | 2. The application sends an inference request to the Feature Server and the Feature Server sends the features to the Model Server and generates predictions using the deployed model. 80 | 3. The application sends a request to the Feature Server and sends the features to the Model Server which generates predictions using the deployed model. 81 | 82 | 83 | ```mermaid 84 | sequenceDiagram 85 | actor U as UI Dashboard 86 | participant NB as Notebook Controller 87 | box Feature Store 88 | participant FS as Feature Store 89 | participant OS as Offline Store 90 | participant IS as Online Store 91 | participant REG as Feature Registry 92 | end 93 | participant DS as Data Science Pipelines 94 | participant MR as Model Registry 95 | participant MS as Model Serving 96 | participant K as Kubernetes/OpenShift 97 | 98 | %% Step 1: Notebook Controller Initialization 99 | U->>+NB: Launch Notebook Controller 100 | 101 | %% Step 2: Feature Store Creation 102 | U->>+FS: Create Feature Store 103 | FS-->>U: Feature Store ready 104 | 105 | %% Step 3: Data Ingestion into Feature Store 106 | U->>+FS: Send pre-processed data (Batch/Streaming) 107 | FS->>REG: Store/apply feature metadata 108 | FS->>OS: Store historical features (Offline Store) 109 | FS->>IS: Store real-time features (Online Store) 110 | Note right of FS: Features are stored in Offline/Online Stores and
Metadata about the features is registered in the Feature Registry. 111 | 112 | %% Step 4: Model Training 113 | U->>+DS: Data Science Pipeline 114 | DS->>+FS: Retrieve features for model training 115 | FS->>OS: Pull historical features 116 | OS-->>DS: Return historical features 117 | DS->>DS: Train and evaluate ML model 118 | 119 | %% Step 5: Model Registration 120 | DS->>+MR: Register trained model with metadata 121 | 122 | %% Step 6: Model Deployment 123 | DS->>+K: Deploy model to OpenShift AI (KServe) 124 | K->>+MS: Create InferenceService (ISVC) 125 | MS-->>K: ISVC deployed successfully 126 | 127 | %% Step 7: Real-Time Inference 128 | U->>+MS: Send inference request with new data 129 | MS->>+FS: Fetch real-time features (Online Store) 130 | FS->>IS: Retrieve real-time features 131 | IS-->>MS: Return real-time features 132 | MS-->>U: Return predictions 133 | 134 | ``` 135 | 136 | -------------------------------------------------------------------------------- /documentation/components/feature_store/images/feature-store-overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/feature_store/images/feature-store-overview.jpg -------------------------------------------------------------------------------- /documentation/components/model-registry/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/model-registry/.gitkeep -------------------------------------------------------------------------------- /documentation/components/model-registry/README.md: -------------------------------------------------------------------------------- 1 | # Model Registry architecture 2 | 3 | 7 | 8 | ## Introduction 9 | 10 | A model registry plays a pivotal role in the lifecycle of AI/ML models, serving as the central repository holding metadata pertaining to machine learning models from inception to deployment. This encompasses both high-level details like deployment environment and project origins, as well as intricate information like training hyperparameters, performance metrics, and deployment events. Acting as a bridge between model experimentation and serving, it offers a secure, collaborative interface of a metadata store for stakeholders involved in the ML lifecycle. 11 | 12 | ## Model Registry High Level Architecture 13 | ![Model Registry High Level Architecture](./images/model-registry-overview.jpg) 14 | 15 | > [!NOTE] 16 | > The Model Registry is a passive repository for metadata and is not meant to be a Control Plane. It does not perform any orchestration or expose APIs to perform actions on underlying OpenShift AI components. 17 | 18 | The model registry is a backing store for various stages of MLOps that can log user flow of a model development and deployment. The model registry meets a data scientist's need to be able to visualize a model’s lineage and trace back the training executions, parameters, metrics, etc. It also help deployment engineers visualize model pipeline events, actions, progress through deployment stages, etc. 
19 | 20 | ## Goals 21 | - Associate metadata from training, experimentation, studies and their metrics, with a model 22 | - Build a catalog of models and manage model versions for deployment 23 | - Manage models for multiple deployment environments 24 | - Build a Kube Native solution 25 | 26 | ## Architecture 27 | 28 | The Google community project [ML-Metadata](https://github.com/google/ml-metadata) is used as the core component to build the Model Registry. ML-Metadata provides a very extensible schema that is generic, similar to a key-value store, but also allows for the creation of logical schemas that can be queried as if they were physical schemas. Those can be manipulated using their bindings in the Python library. We use this model to extend and provide metadata storage services for model serving, also known as Model Registry. 29 | 30 | The model registry uses the ml-metadata project’s C++ server as-is to handle the storing of the metadata, while domain-specific Model Registry features are added as extensions (aka microservices). As part of these extensions, Model Registry provides: 31 | - Python/Go extensions to support the Model Registry interaction 32 | - an OpenAPI interface to expose the Model Registry API to the clients 33 | 34 | ![Model Registry Connections](./images/model-registry-connections.png) 35 | 36 | Enforcement of RBAC policies can be handled at the REST API layer using service accounts with Authorino; details are described in [RBAC and Tenancy](model-registry-tenancy.md). 37 | 38 | ## Components 39 | - *[MLMD C++ Server](https://github.com/google/ml-metadata)* 40 | - This is the metadata server from Google's ml-metadata project. This component is hosted to communicate with a backend relational database that stores the actual metadata about the models. This server exposes a “gRPC” interface for its clients to communicate with. It provides a very flexible schema model that can be used to define logical data models fitting the needs of different MLOps operations, for example metadata captured during training and experimentation, metadata about metrics, or model versioning. 41 | 42 | - *[OpenAPI/REST Server](https://github.com/kubeflow/model-registry)* 43 | - This component exposes a higher-level REST API for the Model Registry. Whereas the MLMD server exposes a lower-level, generic API over gRPC, this REST server exposes a higher-level API that is much closer to the domain model of the Model Registry, with operations like: 44 | - Register a Model 45 | - Version a Model 46 | - Get a catalog of models 47 | - Manage the deployment statuses of a model 48 | 49 | - The REST API server converts its requests into one or more underlying gRPC requests on the MLMD Server. This layer is mainly designed to be used with the UI. 50 | 51 | - *[Model Registry Controller](https://github.com/opendatahub-io/model-registry-operator)* 52 | - The Model Registry controller is also called the Model Registry Operator. The main purpose of this component is to install/deploy the components of the Model Registry stack on OpenShift. Once the components are installed, the reconciler in the controller will continuously run and monitor these components to keep them healthy and alive. 53 | 54 | - *[CLI (Python client, SDK)](https://github.com/kubeflow/model-registry/tree/main/clients/python)* 55 | - The CLI, also called the MR Python client/SDK, is a command line tool for interacting with the Model Registry. 
This tool can be used to execute operations such as retrieving registered models, getting a model’s deployment status, listing a model’s versions, etc. 56 | 57 | - The model registry provides logical mappings from the high level [logical model](https://github.com/kubeflow/model-registry/blob/main/docs/logical_model.md) available through the OpenAPI/REST Server, to the underlying ml-metadata entities. 58 | 59 | ## Integration with Model Serving Components 60 | 61 | 66 | 67 | In a typical ML workflow, an ML model is registered on the Model Registry as a `RegisteredModel` logical entity, along with its versions and its associated `ModelArtifacts` resources. 68 | 69 | The model serving controller then advertises itself to the Model Registry by creating a `ServingEnvironment` entity. 70 | 71 | Next, the Model Controller reconciler monitors `InferenceService` CRs having pre-defined `labels`, and based on those `labels` it syncs the model registry, keeping track of every deployment that occurs in the cluster. 72 | Finally, the Model Controller reconciler updates the `InferenceService` CR by linking it to the Model Registry logical entity using a specific `label`. 73 | 74 | ```mermaid 75 | sequenceDiagram 76 | actor U as UI Dashboard 77 | participant K as Kubernetes 78 | participant MC as ODH Model Controller 79 | participant MR as Model Registry 80 | U->>+MR: Retrieve indexed model version 81 | MR-->>-U: Indexed model version 82 | U->>K: Create InferenceService (ISVC) 83 | Note right of U: Annotate/Label the ISVC with indexed
model information, like RegisteredModel and
ModelVersion IDs. 84 | Note right of K: Here all operators/controllers in charge of
deploying the model take their actions,
e.g., KServe or ModelMesh. 85 | loop Every ISVC creation/deletion/update 86 | K-->>+MC: Send notification 87 | MC->>+K: Retrieve affected ISVC in the cluster 88 | K-->>-MC: ISVC resource 89 | MC->>+MR: Create/Update InferenceService in Model Registry 90 | Note left of MR: InferenceService records in Model Registry
are used to keep track of every deployment that
occurred in the monitored Kubernetes cluster. 91 | MR-->>-MC: InferenceService record 92 | MC-->>-K: Update ISVC with Model Registry record ID 93 | end 94 | ``` 95 | 96 | In this way, the Model Controller reconciler syncs those occurrences into the Model Registry to keep track of every deployment that occurred in the cluster for indexed models. 97 | 98 | -------------------------------------------------------------------------------- /documentation/components/model-registry/images/model-registry-connections.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/model-registry/images/model-registry-connections.png -------------------------------------------------------------------------------- /documentation/components/model-registry/images/model-registry-deployment-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/model-registry/images/model-registry-deployment-model.png -------------------------------------------------------------------------------- /documentation/components/model-registry/images/model-registry-logical-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/model-registry/images/model-registry-logical-model.png -------------------------------------------------------------------------------- /documentation/components/model-registry/images/model-registry-overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/model-registry/images/model-registry-overview.jpg -------------------------------------------------------------------------------- /documentation/components/model-registry/images/model-registry-tenancy-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/model-registry/images/model-registry-tenancy-model.png -------------------------------------------------------------------------------- /documentation/components/model-registry/model-registry-tenancy.md: -------------------------------------------------------------------------------- 1 | # Model Registry Tenancy 2 | 3 | OpenShift AI deploys components per Red Hat OpenShift AI Project. In this model, all OpenShift AI components are deployed per Project (Kubernetes namespace) (except for Dashboard, which is per-cluster). 4 | 5 | Model Registry needs to support sharing ML Model metadata across multiple projects/environments, etc. The current RHOAI per-namespace deployment model uses “namespaces” as the tenant, i.e., any users or service accounts that are members of that namespace share all data as a single tenant. Ideally, there should be a _logical_ tenancy model that allows users to group ML Model development based on teams, groups, or organizations, independent of the underlying Kubernetes deployment architecture. 
6 | 7 | 8 | This document lays out the tenancy architecture behind the Model Registry, which gives cluster administrators complete control over the kind of deployment model they choose. 9 | 10 | 11 | ## Model Registry Logical Architecture 12 | 13 | The diagram below shows the core components at a logical level without going into authentication and authorization implementation details. 14 | 15 | 16 | ![alt_text](images/model-registry-logical-model.png "image_tooltip") 17 | 18 | 19 | At a high level, the Model Registry architecture consists of the following: 20 | 21 | * Clients, which can be users or service accounts for client processes 22 | * A Kubernetes Deployment that provides a Model Registry service. This service exposes the following API ports: 23 | * An MLMD CPP Server that implements a gRPC API 24 | * A Golang Server that implements an HTTP REST API 25 | * An external, user-provided database service to store ML model metadata 26 | 27 | Clients can be either human users who have authenticated with their user credentials and make API calls through the dashboard or CLI, or other pods/clients using service accounts or non-human credentials such as secrets and tokens. 28 | 29 | 30 | ## Deployment Architecture - Use Service Mesh and Authorino for RBAC 31 | 32 | This architecture calls for deploying the Model Registry deployment(s) in a known namespace such as “odh-model-registries”, where OpenShift AI administrators can provision any number of Model Registries, each with a unique name. For example, the administrator can install a single Model Registry called “public” and configure its permissions such that any user authenticated to the Kubernetes cluster has access to its APIs. Similarly, another instance can be configured to be accessible to a set of known users/groups. 33 | 34 | The namespace “odh-model-registries” will be automatically enrolled as a member of the service mesh, and all the model registry deployments will be configured to run an Envoy proxy as a sidecar component. 35 | 36 | For all inter-service communication, mTLS will be configured using a “DestinationRule”. For exposing the model registry’s service endpoints, a “_Virtual Service_” will be configured. A “_Gateway_” for each “_Virtual Service_” is configured if the service needs to be exposed externally or to a non-mesh-member component. 37 | 38 | The Service Mesh will also be configured with a CUSTOM “AuthorizationPolicy” to delegate authorization decisions to Authorino. An “_AuthPolicy_” to support Authorino will be configured to enforce the “_Role_”-based RBAC rules for accessing the Model Registry instances that the administrator has created. 39 | 40 | **Note:** This proposal follows the [Principle of Least Privilege](https://en.wikipedia.org/wiki/Principle_of_least_privilege) for RBAC rules and resources. This ensures that any access granted is to a very specific resource to avoid accidentally granting access to other resources. 41 | 42 | The diagram below shows the RBAC proxy deployment model: 43 | 44 | 45 | ![alt_text](images/model-registry-deployment-model.png "image_tooltip") 46 | 47 | The Model Registry Operator will create the following: 48 | 49 | * Kubernetes Role `registry-user-` - allows the verb `GET` on the Kubernetes service created for `registry-name`. This role will make it convenient for users, groups, and service accounts to be granted access to a specific Model Registry service. 
50 | * OpenShift User Group `-users` - with role binding to role `registry-user-` to help registry administrators easily add users to this group and grant them access to the registry. 51 | 52 | Using the above two resources to handle the access permissions, Model Registry creates the tenancy model. 53 | 54 | ### Multiple “Model Registry” deployments in OpenShift AI 55 | 56 | This is the prescribed deployment model for multiple Model Registries in the MVP release with tenancy model. The tenancy is enforced with RBAC access to given instance of the Model Registry. 57 | 58 | ![alt_text](images/model-registry-tenancy-model.png "image_tooltip") 59 | 60 | 61 | -------------------------------------------------------------------------------- /documentation/components/pipelines/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/pipelines/.gitkeep -------------------------------------------------------------------------------- /documentation/components/pipelines/README.md: -------------------------------------------------------------------------------- 1 | # Data Science Pipelines 2 | 3 | Data Science Pipelines is a platform for building and deploying portable, scalable machine learning (ML) workflows based on containers. It is based on Kubeflow Pipelines and relies on Argo Workflows to run the pipelines. Additionally, Data Science Pipelines includes a custom "control plane" on top of Kubeflow Pipelines -- an operator we refer to as Data Science Pipelines Operator (DSPO). DSPO manages the "data planes", the individual "Data Science Pipelines Applications" (aka "stacks") that are deployed in each Data Science Project (kubernetes namespace). 4 | 5 | ## Data Science Pipelines Operator APIs 6 | 7 | ### DataSciencePipelinesApplication (DSPA) 8 | 9 | * [API Reference](https://github.com/opendatahub-io/data-science-pipelines-operator/blob/main/api/v1alpha1/dspipeline_types.go) 10 | * This CRD is responsible for defining the configuration of the Data Science Pipelines stack. 11 | 12 | ## DSP High Level Architecture 13 | ![DSP High Level Architecture](./dsp-v2-high-level-architecture.png) 14 | 15 | ## DSP Detailed Architecture 16 | ![DSP Detailed Architecture](./dsp-v2-architecture.drawio.png) 17 | 18 | ## Kubeflow Pipelines Architecture references 19 | 20 | Note: you must join https://groups.google.com/g/kubeflow-discuss to access these documents 21 | 22 | [Kubeflow Pipelines v2 System Design](https://docs.google.com/document/d/1fHU29oScMEKPttDA1Th1ibImAKsFVVt2Ynr4ZME05i0/edit) -- goes into deeper detail for the data plane design. The Orchestration section in that document is particularly helpful. 
23 | 24 | [KFP v2 control flow](https://docs.google.com/document/d/1TZeZtxwPzAImIu8Jk_e-4otSx467Ckf0smNe7JbPReE/edit) 25 | -------------------------------------------------------------------------------- /documentation/components/pipelines/dsp-v2-architecture.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/pipelines/dsp-v2-architecture.drawio.png -------------------------------------------------------------------------------- /documentation/components/pipelines/dsp-v2-high-level-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/pipelines/dsp-v2-high-level-architecture.png -------------------------------------------------------------------------------- /documentation/components/platform/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/platform/.gitkeep -------------------------------------------------------------------------------- /documentation/components/platform/Authorization in Service Mesh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/platform/Authorization in Service Mesh.png -------------------------------------------------------------------------------- /documentation/components/platform/Platform Architecture Overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/platform/Platform Architecture Overview.png -------------------------------------------------------------------------------- /documentation/components/platform/README.md: -------------------------------------------------------------------------------- 1 | # Platform Architecture 2 | 3 | Platform component is responsible for maintaining the core ODH Operator and establishing standards for component 4 | deployments, monitoring, security and ecosystem integration. 5 | 6 | ## ODH Operator APIs 7 | 8 | ### DSCInitialization API 9 | 10 | * [API Reference](https://github.com/opendatahub-io/opendatahub-operator/blob/incubation/docs/api-overview.md#dscinitializationopendatahubiov1) 11 | * This CRD is responsible for defining config required by the ODH platform before the applications are deployed. 12 | * This includes creation of applications and monitoring namespaces, component wide configurations like Authorization, 13 | monitoring etc 14 | 15 | 16 | ### DataScienceCluster API 17 | 18 | * [API Reference](https://github.com/opendatahub-io/opendatahub-operator/blob/incubation/docs/api-overview.md#datascienceclusteropendatahubiov1) 19 | * This CRD will be created by the end user to enable various data science components. 
20 | * It is responsible for enabling support for Notebooks, DataSciencePipelinesApplication, InferenceService etc based on 21 | the configuration 22 | 23 | 24 | ## Platform Architecture Overview 25 | ![Platform Architecture Overview](./Platform%20Architecture%20Overview.png) 26 | 27 | ## Authorization in ServiceMesh 28 | ![Authorization in ServiceMesh](./Authorization%20in%20Service%20Mesh.png) 29 | -------------------------------------------------------------------------------- /documentation/components/serving/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/serving/.gitkeep -------------------------------------------------------------------------------- /documentation/components/serving/README.md: -------------------------------------------------------------------------------- 1 | # ModelServing architecture 2 | 3 | ## Components 4 | - *[KSERVE](https://github.com/opendatahub-io/kserve)* 5 | - This supports a single model serving platform. For deploying large models such as large language models (LLMs), OpenShift AI includes a single model serving platform that is based on the KServe component. Because each model is deployed from its own model server, the single model serving platform helps you deploy, monitor, scale, and maintain large models that require increased resources. 6 | - *[MODEL MESH](https://github.com/opendatahub-io/modelmesh-serving)* 7 | - This supports a multi-model serving platform. For deploying small and medium-sized models, OpenShift AI includes a multi-model serving platform that is based on the ModelMesh component. On the multi-model serving platform, you can deploy multiple models on the same model server. Each of the deployed models shares the server resources. This approach can be advantageous on OpenShift clusters that have finite compute resources or pods. 8 | - *[ODH-MODEL-CONTROLLER](https://github.com/opendatahub-io/odh-model-controller)* 9 | - This component facilitates seamless integration between RHOAI's various components and model serving components, enhancing the interoperability and synergy within the RHOAI ecosystem. 
It streamlines the integration process, enabling smoother communication and interaction between different modules and services, thereby optimizing the overall performance and functionality of the RHOAI platform. 10 | 11 | 12 | ## ModelServing Components Architecture Diagram 13 | ![ModelServing Components Architecture Diagram](./modelserving-architecture-High-Level%20Components%20Architecture.jpg) 14 | -------------------------------------------------------------------------------- /documentation/components/serving/modelserving-architecture-High-Level Components Architecture.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/serving/modelserving-architecture-High-Level Components Architecture.jpg -------------------------------------------------------------------------------- /documentation/components/workbenches/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/workbenches/.gitkeep -------------------------------------------------------------------------------- /documentation/components/workbenches/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Workbenches architecture 3 | 4 | 8 | 9 | The Workbenches component provides a platform to run web-based development environments inside the OpenShift cluster. In the ML lifecycle, workbenches are utilized as the platform for the Model Development stage, providing an avenue for data scientists to explore and experiment during model development. 10 | 11 | Key features include: 12 | 13 | - Native support for [JupyterLab](https://github.com/jupyterlab/jupyterlab), [RStudio](https://github.com/rstudio/rstudio), and [code-server](https://github.com/coder/code-server). 14 | - Tailored integrated environments equipped with the latest tools and libraries. 15 | - Users can create notebook containers directly in the cluster. 16 | - Admins can provide standard notebook images for their organization with required packages pre-installed. 17 | - Access control is managed by Admins, enabling easier notebook management in the organization. 18 | 19 | Components: 20 | 21 | - *[Notebooks/workbenches](https://github.com/opendatahub-io/notebooks/wiki/Workbenches)* 22 | - A collection of notebooks tailored for data analysis, machine learning, research, and coding within the OpenShift ecosystem. Designed to streamline data science workflows, these notebooks offer an integrated environment equipped with the latest tools and libraries. These notebooks were created to be used on OpenShift with the Notebook Controller as the launcher. 
The following are the out-of-the-box notebook images, supported with a one-year cadence: 23 | 24 | - Minimal (includes: jupyterlab) 25 | - Data-science (includes: jupyterlab, numpy, scipy, pandas, etc.) 26 | - PyTorch (includes: jupyterlab, torch, etc.) 27 | - TensorFlow (includes: jupyterlab, tensorflow, etc.) 28 | - TrustyAI (includes: jupyterlab, trustyai, etc.) 29 | - GPU support: Nvidia (CUDA drivers), Intel (Habana-Gaudi drivers) 30 | 31 | - *[Notebook Controller](https://github.com/opendatahub-io/kubeflow/tree/v1.7-branch/components/odh-notebook-controller)* 32 | - The combination of two controllers that act as the backend for this component. It is based on the upstream Kubeflow notebook controller and is responsible for watching Notebook custom resource events to start the notebook environment, along with the following capabilities: 33 | - OpenShift ingress controller integration. 34 | - OpenShift OAuth sidecar injection. 35 | - OpenShift certs injection 36 | 37 | 43 | 44 | 45 | ## High Level architecture 46 | 47 | ![Workbenches High level Architecture Diagram](./high-level-workbench-arch.drawio.png) 48 | 49 | ## Workbenches 50 | 51 | ### Architecture 52 | 53 | The structure of the notebook's build chain is derived from the parent image. To better comprehend this concept, refer to the following graph. 54 | 55 | ![workbenches Architecture](./workbenches-imagestreams.drawio.png) 56 | 57 | Each notebook inherits the properties of its parent. For instance, the TrustyAI notebook inherits all the installed packages from the Standard Data Science notebook, which in turn inherits the characteristics from its parent, the Minimal notebook. 58 | 59 | The RStudio architecture is a little different from the others: the component is not shipped as an image 60 | but as a BuildConfig, so users can build RStudio on their cluster as per their needs, 61 | because RStudio is in Dev Preview. 62 | 63 | ![rstudio Architecture](./rstudio-imagestream.drawio.png) 64 | 65 | 66 | ## Notebook Controller 67 | 68 | ### Architecture 69 | 70 | ![Notebook Controller](./notebook-controller.drawio.png) 71 | 72 | ### Spec 73 | 74 | The user needs to specify the PodSpec for the Workbenches. Based on the selection made by the user, the Dashboard component submits the Custom Resource to the cluster. 75 | For example: 76 | 77 | ```yaml 78 | apiVersion: kubeflow.org/v1 79 | kind: Notebook 80 | metadata: 81 | name: my-notebook 82 | spec: 83 | template: 84 | spec: 85 | containers: 86 | - name: my-notebook 87 | image: standard-data-science:ubi9-python3.9 88 | args: 89 | [ 90 | "start.sh", 91 | "lab", 92 | "--LabApp.token=''", 93 | "--LabApp.allow_remote_access='True'", 94 | "--LabApp.allow_root='True'", 95 | "--LabApp.ip='*'", 96 | "--LabApp.base_url=/test/my-notebook/", 97 | "--port=8888", 98 | "--no-browser", 99 | ] 100 | ``` 101 | 102 | The required fields are `containers[0].image` and (`containers[0].command` and/or `containers[0].args`). 103 | That is, the user should specify what and how to run. 104 | 105 | All other fields will be filled in with default values if not specified. 106 | 107 | By default, when the ODH notebook controller is deployed along with the 108 | Kubeflow notebook controller, it will expose the notebook in the OpenShift 109 | ingress by creating a TLS `Route` object. 
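As a rough illustration of that default behaviour, a Route for the `my-notebook` example above might look something like the sketch below. The service name, target port, and TLS termination shown are assumptions for the sketch, not the controller's literal output.

```yaml
# Hypothetical sketch of the Route the controller could create for the example Notebook.
# Service name, target port, and TLS termination are assumptions, not the exact values
# generated by the ODH notebook controller.
apiVersion: route.openshift.io/v1
kind: Route
metadata:
  name: my-notebook
  namespace: test
spec:
  to:
    kind: Service
    name: my-notebook          # service fronting the notebook pod (assumed name)
  port:
    targetPort: 8888           # matches the --port argument in the example spec
  tls:
    termination: edge          # a TLS-terminated Route, as described above
```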
110 | 111 | If the notebook annotation `notebooks.opendatahub.io/inject-oauth` is set to 112 | true, the OAuth proxy will be injected as a sidecar proxy in the notebook 113 | deployment to provide authN and authZ capabilities: 114 | 115 | ```yaml 116 | apiVersion: kubeflow.org/v1 117 | kind: Notebook 118 | metadata: 119 | name: example 120 | annotations: 121 | notebooks.opendatahub.io/inject-oauth: "true" 122 | ``` 123 | 124 | A [mutating webhook](./controllers/notebook_webhook.go) is part of the ODH 125 | notebook controller; it will add the sidecar to the notebook deployment. The 126 | controller will create all the objects needed by the proxy as explained in the 127 | architecture diagram. 128 | 129 | When accessing the notebook, you will have to authenticate with your OpenShift 130 | user, and you will only be able to access it if you have the necessary 131 | permissions. 132 | 133 | The authorization is delegated to OpenShift RBAC through the `--openshift-sar` 134 | flag in the OAuth proxy: 135 | 136 | ```json 137 | --openshift-sar= 138 | { 139 | "verb":"get", 140 | "resource":"notebooks", 141 | "resourceAPIGroup":"kubeflow.org", 142 | "resourceName":"example", 143 | "namespace":"opendatahub" 144 | } 145 | ``` 146 | 147 | That is, you will only be able to access the notebook if you can perform a `GET` 148 | notebook operation on the cluster: 149 | 150 | ```shell 151 | oc get notebook example -n 152 | ``` 153 | 154 | -------------------------------------------------------------------------------- /documentation/components/workbenches/high-level-workbench-arch.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/workbenches/high-level-workbench-arch.drawio.png -------------------------------------------------------------------------------- /documentation/components/workbenches/notebook-controller.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/workbenches/notebook-controller.drawio.png -------------------------------------------------------------------------------- /documentation/components/workbenches/rstudio-imagestream.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/workbenches/rstudio-imagestream.drawio.png -------------------------------------------------------------------------------- /documentation/components/workbenches/workbenches-imagestreams.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/components/workbenches/workbenches-imagestreams.drawio.png -------------------------------------------------------------------------------- /documentation/diagram/README.MD: -------------------------------------------------------------------------------- 1 | Folder with architecture diagrams 2 | Use https://www.drawio.com/ (client) to open and edit it --------------------------------------------------------------------------------
/documentation/enhancements/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/enhancements/.gitkeep -------------------------------------------------------------------------------- /documentation/images/RHOAI Architecture - D1 - Operator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/RHOAI Architecture - D1 - Operator.png -------------------------------------------------------------------------------- /documentation/images/RHOAI Architecture - D2 - DSP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/RHOAI Architecture - D2 - DSP.png -------------------------------------------------------------------------------- /documentation/images/RHOAI Architecture - D3 - Workbenches.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/RHOAI Architecture - D3 - Workbenches.png -------------------------------------------------------------------------------- /documentation/images/RHOAI Architecture - D4 - Dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/RHOAI Architecture - D4 - Dashboard.png -------------------------------------------------------------------------------- /documentation/images/RHOAI Architecture - D5 - Distr Workloads.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/RHOAI Architecture - D5 - Distr Workloads.png -------------------------------------------------------------------------------- /documentation/images/RHOAI Architecture - D6a - Model Serving.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/RHOAI Architecture - D6a - Model Serving.png -------------------------------------------------------------------------------- /documentation/images/RHOAI Architecture - D6b - Model Serving.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/RHOAI Architecture - D6b - Model Serving.png -------------------------------------------------------------------------------- /documentation/images/RHOAI Architecture - D6c - Model Serving.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/RHOAI Architecture - D6c - Model Serving.png 
-------------------------------------------------------------------------------- /documentation/images/RHOAI Architecture - D7 - Trusty.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/RHOAI Architecture - D7 - Trusty.png -------------------------------------------------------------------------------- /documentation/images/RHOAI Architecture - D9 - Feature Store.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/RHOAI Architecture - D9 - Feature Store.png -------------------------------------------------------------------------------- /documentation/images/RHOAI Architecture-Overview.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/RHOAI Architecture-Overview.drawio.png -------------------------------------------------------------------------------- /documentation/images/RHODS Architecture - Network Diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/RHODS Architecture - Network Diagram.png -------------------------------------------------------------------------------- /documentation/images/network/Dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/network/Dashboard.png -------------------------------------------------------------------------------- /documentation/images/network/DataScienePipelines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/network/DataScienePipelines.png -------------------------------------------------------------------------------- /documentation/images/network/DistributedWorkloads_KubeFlow_Training_Operator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/network/DistributedWorkloads_KubeFlow_Training_Operator.png -------------------------------------------------------------------------------- /documentation/images/network/DistributedWorkloads_KubeRay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/network/DistributedWorkloads_KubeRay.png -------------------------------------------------------------------------------- /documentation/images/network/ModelRegistry.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/network/ModelRegistry.png -------------------------------------------------------------------------------- /documentation/images/network/ModelServing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/network/ModelServing.png -------------------------------------------------------------------------------- /documentation/images/network/TrustyAI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/network/TrustyAI.png -------------------------------------------------------------------------------- /documentation/images/network/Workbenches.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opendatahub-io/architecture-decision-records/8063037c513ce9d51bc9f6f2d4a637a46ab536b9/documentation/images/network/Workbenches.png --------------------------------------------------------------------------------