├── .gitignore ├── GETTING_STARTED_README.md ├── INSTALLING_ONTO_EXISTING_CLUSTER_README.md ├── LICENSE.txt ├── README.md ├── RELEASE_NOTES.md ├── cluster_creation_terraform ├── CLUSTER_CREATION_STACK_VERSION ├── OCI_AI_BLUEPRINTS_LINK ├── oke-datasources.tf ├── oke-network.tf ├── oke-security-lists.tf ├── oke.tf ├── outputs.tf ├── providers.tf ├── schema.yaml ├── variables.tf └── versions.tf ├── docs ├── about.md ├── api_documentation.md ├── common_workflows │ ├── deploying_blueprints_onto_specific_nodes │ │ └── README.md │ └── working_with_large_models │ │ └── README.md ├── custom_blueprints │ ├── README.md │ └── blueprint_json_schema.json ├── iam_policies.md ├── images │ └── install.svg ├── installing_new_updates.md ├── known_issues.md ├── sample_blueprints │ ├── README.md │ ├── auto_scaling │ │ ├── README.md │ │ └── autoscaling_blueprint.json │ ├── cpu-inference │ │ ├── README.md │ │ ├── cpu-inference-gemma.json │ │ ├── cpu-inference-mistral-bm.json │ │ └── cpu-inference-mistral-vm.json │ ├── exisiting_cluster_installation │ │ ├── README.md │ │ └── add_node_to_control_plane.json │ ├── gpu-health-check │ │ ├── README.md │ │ ├── healthcheck_fp16_a10.json │ │ ├── healthcheck_fp16_h100.json │ │ └── healthcheck_fp32_a10.json │ ├── llm_inference_with_vllm │ │ ├── README.md │ │ ├── vllm-closed-hf-model.json │ │ ├── vllm-model-from-obj-storage.json │ │ ├── vllm-open-hf-model-api-key-functionality.json │ │ └── vllm-open-hf-model.json │ ├── lora-benchmarking │ │ ├── README.md │ │ └── mlcommons_lora_finetune_nvidia_sample_recipe.json │ ├── lora-fine-tuning │ │ ├── README.md │ │ ├── bucket_checkpoint_bucket_model_open_dataset.backend.json │ │ ├── bucket_model_open_dataset.backend.json │ │ ├── bucket_par_open_dataset.backend.json │ │ ├── closed_model_open_dataset_hf.backend.json │ │ └── open_model_open_dataset_hf.backend.json │ ├── mig_multi_instance_gpu │ │ ├── README.md │ │ ├── mig_enabled_shared_node_pool.json │ │ ├── mig_inference_multiple_replicas.json │ │ ├── mig_inference_single_replica.json │ │ ├── mig_inference_single_replica_10gb.json │ │ ├── mig_slices.png │ │ ├── mig_update_node_with_node_name.json │ │ └── mig_update_shared_pool_with_node_pool_name.json │ ├── model_storage │ │ ├── README.md │ │ ├── download_closed_hf_model_to_object_storage.json │ │ └── download_open_hf_model_to_object_storage.json │ ├── multi-node-inference │ │ ├── README.md │ │ ├── multinode_inference_BM_A10.json │ │ └── multinode_inference_VM_A10.json │ ├── shared_node_pools │ │ ├── README.md │ │ ├── shared_node_pool_A10_BM.json │ │ ├── shared_node_pool_A10_VM.json │ │ └── vllm_inference_sample_shared_pool_blueprint.json │ ├── startup_liveness_readiness_probes │ │ ├── README.md │ │ └── autoscale_with_fss.json │ ├── teams │ │ ├── README.md │ │ ├── create_job_with_team.json │ │ └── create_team.json │ └── using_rdma_enabled_node_pools │ │ ├── README.md │ │ ├── rdma_distributed_inference.json │ │ ├── rdma_shared_node_pool.json │ │ └── rdma_update_nodes.json ├── usage_guide.md ├── versions │ ├── ControlPlaneVersions.md │ ├── PortalVersions.md │ ├── QuickStartVersions.md │ └── README.md └── whisper_transcription │ ├── README.md │ ├── docs │ └── Whisper_Architecture.pdf │ ├── examples │ ├── test1 │ │ ├── test.wav │ │ ├── test_all_transcripts_20250601_201349.txt │ │ └── transcription_log_20250601_201340.log │ ├── test2 │ │ ├── transcription_log_20250601_203611.log │ │ ├── video1591686795.mp4 │ │ ├── video1591686795_all_transcripts_20250601_203730.json │ │ └── video1591686795_all_transcripts_20250601_203730.txt │ └── test3 │ 
│ ├── audio1788670787.m4a │ │ ├── audio1788670787_all_transcripts_20250601_191710.json │ │ ├── audio1788670787_all_transcripts_20250601_191710.txt │ │ └── transcription_log_20250601_191325.log │ ├── whisper-transcription-A10.json │ ├── whisper-transcription-A100.json │ └── whisper-transcription-H100.json └── oci_ai_blueprints_terraform ├── OCI_AI_BLUEPRINTS_STACK_VERSION ├── app-api.tf ├── app-background.tf ├── app-blueprint-portal.tf ├── app-configmap.tf ├── app-migration.tf ├── app-registration.tf ├── app-user.tf ├── data.tf ├── database.tf ├── helm.tf ├── ingress.tf ├── later.tf ├── locals.tf ├── lws ├── Chart.yaml ├── README.md ├── templates │ ├── _helpers.tpl │ ├── certmanager │ │ └── certificate.yaml │ ├── crds │ │ └── leaderworkerset.x-k8s.io_leaderworkersets.yaml │ ├── manager │ │ ├── deployment.yaml │ │ └── service.yaml │ ├── prometheus │ │ ├── monitor.yaml │ │ └── role.yaml │ ├── rbac │ │ ├── clusterrole.yaml │ │ ├── clusterrolebinding.yaml │ │ ├── role.yaml │ │ ├── rolebinding.yaml │ │ └── serviceaccount.yaml │ └── webhook │ │ ├── secret.yaml │ │ ├── service.yaml │ │ └── webhook.yaml └── values.yaml ├── modules └── corrino │ ├── .gitignore │ ├── LICENSE │ ├── NOTICE │ ├── cluster-tools.tf │ ├── datasources.tf │ ├── defaults.tf │ ├── main.tf │ ├── modules │ ├── cluster-tools │ │ ├── cert-manager.tf │ │ ├── cluster-tools.tf │ │ ├── dashboards │ │ │ └── vllm-dashboard.json │ │ ├── grafana.tf │ │ ├── ingress-nginx.tf │ │ ├── jaeger.tf │ │ ├── keycloak.tf │ │ ├── metrics-server.tf │ │ ├── modules │ │ │ ├── cert-manager │ │ │ │ ├── issuers │ │ │ │ │ ├── .helmignore │ │ │ │ │ ├── Chart.yaml │ │ │ │ │ ├── templates │ │ │ │ │ │ ├── NOTES.txt │ │ │ │ │ │ ├── _helpers.tpl │ │ │ │ │ │ └── clusterissuers.yaml │ │ │ │ │ └── values.yaml │ │ │ │ ├── main.tf │ │ │ │ ├── providers.tf │ │ │ │ ├── test.yaml │ │ │ │ └── variables.tf │ │ │ └── verrazzano │ │ │ │ ├── main.tf │ │ │ │ ├── providers.tf │ │ │ │ └── variables.tf │ │ ├── outputs.tf │ │ ├── postgresql.tf │ │ ├── prometheus.tf │ │ ├── providers.tf │ │ └── redis.tf │ ├── oci-networking │ │ └── README.md │ ├── oci-policies │ │ ├── main.tf │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── versions.tf │ ├── oci-vault-kms │ │ ├── main.tf │ │ ├── outputs.tf │ │ ├── policies.tf │ │ ├── providers.tf │ │ └── variables.tf │ ├── oke-cluster-autoscaler │ │ ├── datasources.tf │ │ ├── main.tf │ │ ├── variables.tf │ │ └── versions.tf │ ├── oke-node-pool │ │ ├── datasources.tf │ │ ├── main.tf │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── versions.tf │ └── oke │ │ ├── LICENSE │ │ ├── README.md │ │ ├── datasources.tf │ │ ├── main.tf │ │ ├── oke-orm-private-endpoint.tf │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── versions.tf │ ├── oci-networking.tf │ ├── outputs.tf │ ├── policies.tf │ ├── schema.org.yaml │ ├── variables.tf │ └── versions.tf ├── oke.tf ├── outputs.tf ├── policies.tf ├── providers.tf ├── random.tf ├── rbac.tf ├── schema.yaml ├── variables.tf └── versions.tf /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | _site 3 | .DS_Store 4 | .sw[a-z] 5 | .idea/ 6 | *.iml 7 | venv/ 8 | archive/ 9 | test_data/ 10 | HOLD* 11 | *.base64 12 | variables.json 13 | *pyc 14 | *.env 15 | util_setup_local_env.sh 16 | terraform.tfvars 17 | .terraform/ 18 | .terraform.lock.hcl 19 | terraform.tfstate 20 | terraform.tfstate.backup 21 | test*json 22 | generated/ 23 | *.zip -------------------------------------------------------------------------------- /LICENSE.txt: 
-------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Oracle and/or its affiliates. All rights reserved. 2 | 3 | The Universal Permissive License (UPL), Version 1.0 4 | 5 | Subject to the condition set forth below, permission is hereby granted to any person obtaining a copy of this 6 | software, associated documentation and/or data (collectively the "Software"), free of charge and under any and 7 | all copyright rights in the Software, and any and all patent rights owned or freely licensable by each licensor 8 | hereunder covering either (i) the unmodified Software as contributed to or provided by such licensor, or 9 | (ii) the Larger Works (as defined below), to deal in both 10 | 11 | (a) the Software, and 12 | (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if one is included with the Software 13 | (each a “Larger Work” to which the Software is contributed by such licensors), 14 | 15 | without restriction, including without limitation the rights to copy, create derivative works of, display, 16 | perform, and distribute the Software and make, use, sell, offer for sale, import, export, have made, and have 17 | sold the Software and the Larger Work(s), and to sublicense the foregoing rights on either these or other terms. 18 | 19 | This license is subject to the following condition: 20 | The above copyright notice and either this complete permission notice or at a minimum a reference to the UPL must 21 | be included in all copies or substantial portions of the Software. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 24 | THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 25 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 26 | CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 27 | IN THE SOFTWARE. 28 | 29 | -------------------------------------------------------------------------------- /RELEASE_NOTES.md: -------------------------------------------------------------------------------- 1 | # Release Notes 2 | 3 | The following document contains release notes. Each section will detail added features, what has changed, and what has been fixed. Release notes for the previous 5 releases will be maintained in this document. Click the dropdown next to a release to see its associated notes. 4 | 5 | TODO (This file is intended to serve as a template for now): 6 | 7 |
8 | 1.0.0 9 | 10 | ### Added Features 11 | 12 | - Multinode inference 13 | - description one 14 | - description two 15 | - Blueprints can utilize RDMA connectivity between nodes 16 | - my description one 17 | - my description two 18 | 19 | ### Changed 20 | 21 | - Kuberay replaced by LeaderWorkerSet 22 | - MLFlow, Prometheus, and Grafana now use persistent volume claims instead of local storage 23 | - Anchored all versions of helm installs to specific versions which can be found [here](docs/versions/QuickStartVersions.md). 24 | 25 | ### Fixed 26 | 27 | - Fixed an issue with mlflow deployments where all mlflow experiments would fail because "Experiment 1" did not exist - bug in mlflow and using :memory: as the runs database. 28 |
29 | -------------------------------------------------------------------------------- /cluster_creation_terraform/CLUSTER_CREATION_STACK_VERSION: -------------------------------------------------------------------------------- 1 | v1.0.2 -------------------------------------------------------------------------------- /cluster_creation_terraform/OCI_AI_BLUEPRINTS_LINK: -------------------------------------------------------------------------------- 1 | "https://cloud.oracle.com/resourcemanager/stacks/create?zipUrl=https://github.com/oracle-quickstart/oci-ai-blueprints/releases/download/v1.0.2/v1.0.2_app.zip" 2 | -------------------------------------------------------------------------------- /cluster_creation_terraform/oke-datasources.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020-2022 Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | 6 | # Gets a list of supported images based on the shape, operating_system and operating_system_version provided 7 | data "oci_core_images" "shape_specific_images" { 8 | compartment_id = local.oke_compartment_ocid 9 | shape = var.node_pool_instance_shape.instanceShape 10 | } 11 | 12 | data "oci_containerengine_node_pool_option" "cluster_node_pool_option" { 13 | #Required 14 | node_pool_option_id = oci_containerengine_cluster.oke_cluster[0].id 15 | 16 | depends_on = [oci_containerengine_cluster.oke_cluster] 17 | } 18 | 19 | data "oci_containerengine_cluster_option" "oke" { 20 | cluster_option_id = "all" 21 | } 22 | data "oci_containerengine_node_pool_option" "oke" { 23 | node_pool_option_id = "all" 24 | } 25 | data "oci_containerengine_clusters" "oke" { 26 | compartment_id = local.oke_compartment_ocid 27 | } 28 | 29 | # Gets a list of Availability Domains 30 | data "oci_identity_availability_domains" "ADs" { 31 | compartment_id = var.tenancy_ocid 32 | } 33 | 34 | # Gets home and current regions 35 | data "oci_identity_tenancy" "tenant_details" { 36 | tenancy_id = var.tenancy_ocid 37 | 38 | provider = oci.current_region 39 | } 40 | 41 | data "oci_identity_regions" "home_region" { 42 | filter { 43 | name = "key" 44 | values = [data.oci_identity_tenancy.tenant_details.home_region_key] 45 | } 46 | 47 | provider = oci.current_region 48 | } 49 | 50 | # Gets kubeconfig 51 | data "oci_containerengine_cluster_kube_config" "oke" { 52 | cluster_id = oci_containerengine_cluster.oke_cluster[0].id 53 | 54 | depends_on = [oci_containerengine_node_pool.oke_node_pool] 55 | } 56 | 57 | # OCI Services 58 | ## Available Services 59 | data "oci_core_services" "all_services" { 60 | filter { 61 | name = "name" 62 | values = ["All .* Services In Oracle Services Network"] 63 | regex = true 64 | } 65 | } 66 | 67 | ## Object Storage 68 | data "oci_objectstorage_namespace" "ns" { 69 | compartment_id = local.oke_compartment_ocid 70 | } 71 | 72 | # Randoms 73 | resource "random_string" "deploy_id" { 74 | length = 4 75 | special = false 76 | } 77 | 78 | resource "random_string" "app_name_autogen" { 79 | length = 6 80 | special = false 81 | } 82 | 83 | locals { 84 | 85 | all_shape_compatible_images = data.oci_core_images.shape_specific_images.images 86 | all_cluster_compatible_images = data.oci_containerengine_node_pool_option.cluster_node_pool_option.sources 87 | 88 | all_shape_compatible_image_ids = [for image in local.all_shape_compatible_images : image.id] 89 | 90 | all_cluster_compatible_image_ids = [for 
source in local.all_cluster_compatible_images : source.image_id] 91 | 92 | first_compatible_image_id = tolist(setintersection(toset(local.all_shape_compatible_image_ids), toset(local.all_cluster_compatible_image_ids)))[0] 93 | 94 | } 95 | 96 | -------------------------------------------------------------------------------- /cluster_creation_terraform/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | output "cluster_creation_stack_version" { 6 | value = file("${path.module}/CLUSTER_CREATION_STACK_VERSION") 7 | } 8 | 9 | output "oke_cluster_name" { 10 | value = oci_containerengine_cluster.oke_cluster[0].name 11 | } 12 | 13 | output "oke_cluster_id" { 14 | value = oci_containerengine_cluster.oke_cluster[0].id 15 | } 16 | 17 | output "oci_ai_blueprints_link_for_button" { 18 | value = local.oci_ai_blueprints_link 19 | } 20 | 21 | output "oci_ai_blueprints_link_for_section" { 22 | value = local.oci_ai_blueprints_link 23 | } 24 | 25 | output "vcn_name" { 26 | value = oci_core_virtual_network.oke_vcn[0].display_name 27 | } 28 | 29 | output "vcn_id" { 30 | value = oci_core_virtual_network.oke_vcn[0].id 31 | } 32 | 33 | output "node_subnet_name" { 34 | value = oci_core_subnet.oke_nodes_subnet[0].display_name 35 | } 36 | 37 | output "node_subnet_id" { 38 | value = oci_core_subnet.oke_nodes_subnet[0].id 39 | } 40 | 41 | output "lb_subnet_name" { 42 | value = oci_core_subnet.oke_lb_subnet[0].display_name 43 | } 44 | 45 | output "lb_subnet_id" { 46 | value = oci_core_subnet.oke_lb_subnet[0].id 47 | } 48 | 49 | output "endpoint_subnet_name" { 50 | value = oci_core_subnet.oke_k8s_endpoint_subnet[0].display_name 51 | } 52 | 53 | output "endpoint_subnet_id" { 54 | value = oci_core_subnet.oke_k8s_endpoint_subnet[0].id 55 | } 56 | 57 | -------------------------------------------------------------------------------- /cluster_creation_terraform/providers.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020-2024 Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | provider "oci" { 6 | tenancy_ocid = var.tenancy_ocid 7 | region = var.region 8 | } 9 | 10 | provider "oci" { 11 | alias = "home_region" 12 | tenancy_ocid = var.tenancy_ocid 13 | region = lookup(data.oci_identity_regions.home_region.regions[0], "name") 14 | 15 | user_ocid = var.user_ocid 16 | } 17 | 18 | provider "oci" { 19 | alias = "current_region" 20 | tenancy_ocid = var.tenancy_ocid 21 | region = var.region 22 | 23 | user_ocid = var.user_ocid 24 | } 25 | 26 | # New configuration to avoid Terraform Kubernetes provider interpolation. 
https://registry.terraform.io/providers/hashicorp/kubernetes/2.2.0/docs#stacking-with-managed-kubernetes-cluster-resources 27 | # Currently need to uncheck to refresh (--refresh=false) when destroying or else the terraform destroy will fail 28 | 29 | # https://docs.cloud.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengdownloadkubeconfigfile.htm#notes 30 | provider "kubernetes" { 31 | host = local.cluster_endpoint 32 | cluster_ca_certificate = local.cluster_ca_certificate 33 | exec { 34 | api_version = "client.authentication.k8s.io/v1beta1" 35 | args = ["ce", "cluster", "generate-token", "--cluster-id", local.cluster_id, "--region", local.cluster_region] 36 | command = "oci" 37 | } 38 | } 39 | 40 | # https://docs.cloud.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengdownloadkubeconfigfile.htm#notes 41 | provider "helm" { 42 | kubernetes { 43 | host = local.cluster_endpoint 44 | cluster_ca_certificate = local.cluster_ca_certificate 45 | exec { 46 | api_version = "client.authentication.k8s.io/v1beta1" 47 | args = ["ce", "cluster", "generate-token", "--cluster-id", local.cluster_id, "--region", local.cluster_region] 48 | command = "oci" 49 | } 50 | } 51 | } 52 | 53 | locals { 54 | cluster_endpoint = yamldecode(data.oci_containerengine_cluster_kube_config.oke.content)["clusters"][0]["cluster"]["server"] 55 | cluster_ca_certificate = base64decode(yamldecode(data.oci_containerengine_cluster_kube_config.oke.content)["clusters"][0]["cluster"]["certificate-authority-data"]) 56 | cluster_id = yamldecode(data.oci_containerengine_cluster_kube_config.oke.content)["users"][0]["user"]["exec"]["args"][4] 57 | cluster_region = yamldecode(data.oci_containerengine_cluster_kube_config.oke.content)["users"][0]["user"]["exec"]["args"][6] 58 | } -------------------------------------------------------------------------------- /cluster_creation_terraform/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | # OKE Variables 6 | ## OKE Cluster Details 7 | variable "cluster_options_add_ons_is_kubernetes_dashboard_enabled" { 8 | default = false 9 | } 10 | 11 | ## OKE Visibility (Workers and Endpoint) 12 | 13 | variable "cluster_workers_visibility" { 14 | default = "Private" 15 | description = "The Kubernetes worker nodes that are created will be hosted in public or private subnet(s)" 16 | 17 | validation { 18 | condition = var.cluster_workers_visibility == "Private" || var.cluster_workers_visibility == "Public" 19 | error_message = "Sorry, but cluster visibility can only be Private or Public." 20 | } 21 | } 22 | 23 | variable "cluster_endpoint_visibility" { 24 | default = "Public" 25 | description = "The Kubernetes cluster that is created will be hosted on a public subnet with a public IP address auto-assigned or on a private subnet. If Private, additional configuration will be necessary to run kubectl commands" 26 | 27 | validation { 28 | condition = var.cluster_endpoint_visibility == "Private" || var.cluster_endpoint_visibility == "Public" 29 | error_message = "Sorry, but cluster endpoint visibility can only be Private or Public." 
30 | } 31 | } 32 | 33 | 34 | ## OKE Node Pool Details 35 | variable "node_pool_name" { 36 | default = "pool1" 37 | description = "Name of the node pool" 38 | } 39 | variable "k8s_version" { 40 | default = "v1.31.1" 41 | description = "Kubernetes version installed on your master and worker nodes" 42 | } 43 | variable "num_pool_workers" { 44 | default = 6 45 | description = "The number of worker nodes in the node pool. If select Cluster Autoscaler, will assume the minimum number of nodes configured" 46 | } 47 | 48 | variable "node_pool_instance_shape" { 49 | type = map(any) 50 | default = { 51 | "instanceShape" = "VM.Standard.E3.Flex" 52 | "ocpus" = 6 53 | "memory" = 64 54 | } 55 | description = "A shape is a template that determines the number of OCPUs, amount of memory, and other resources allocated to a newly created instance for the Worker Node. Select at least 2 OCPUs and 16GB of memory if using Flex shapes" 56 | } 57 | variable "node_pool_boot_volume_size_in_gbs" { 58 | default = "60" 59 | description = "Specify a custom boot volume size (in GB)" 60 | } 61 | 62 | # Network Details 63 | ## CIDRs 64 | variable "network_cidrs" { 65 | type = map(string) 66 | 67 | default = { 68 | VCN-CIDR = "10.0.0.0/16" 69 | SUBNET-REGIONAL-CIDR = "10.0.64.0/20" 70 | LB-SUBNET-REGIONAL-CIDR = "10.0.96.0/20" 71 | ENDPOINT-SUBNET-REGIONAL-CIDR = "10.0.128.0/20" 72 | ALL-CIDR = "0.0.0.0/0" 73 | PODS-CIDR = "10.244.0.0/16" 74 | KUBERNETES-SERVICE-CIDR = "10.96.0.0/16" 75 | } 76 | } 77 | 78 | # OCI Provider 79 | variable "tenancy_ocid" {} 80 | variable "compartment_ocid" {} 81 | variable "region" {} 82 | variable "user_ocid" { 83 | default = "" 84 | } 85 | 86 | # ORM Schema visual control variables 87 | variable "show_advanced" { 88 | default = false 89 | } 90 | 91 | # App Name Locals 92 | locals { 93 | app_name = random_string.app_name_autogen.result 94 | app_name_normalized = random_string.app_name_autogen.result 95 | oci_ai_blueprints_link = file("${path.module}/OCI_AI_BLUEPRINTS_LINK") 96 | } 97 | 98 | # Dictionary Locals 99 | locals { 100 | compute_flexible_shapes = [ 101 | "VM.Standard.E3.Flex", 102 | "VM.Standard.E4.Flex", 103 | "VM.Standard.A1.Flex" 104 | ] 105 | } -------------------------------------------------------------------------------- /cluster_creation_terraform/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 
3 | # 4 | 5 | terraform { 6 | required_version = ">= 1.5" #>= 1.6 when using OpenTofu 7 | required_providers { 8 | oci = { 9 | source = "oracle/oci" 10 | version = ">= 5" 11 | # https://registry.terraform.io/providers/oracle/oci/ 12 | } 13 | kubernetes = { 14 | source = "hashicorp/kubernetes" 15 | version = ">= 2.27" 16 | # https://registry.terraform.io/providers/hashicorp/kubernetes/ 17 | } 18 | helm = { 19 | source = "hashicorp/helm" 20 | version = ">= 2.12" 21 | # https://registry.terraform.io/providers/hashicorp/helm/ 22 | } 23 | tls = { 24 | source = "hashicorp/tls" 25 | version = ">= 4" 26 | # https://registry.terraform.io/providers/hashicorp/tls/ 27 | } 28 | local = { 29 | source = "hashicorp/local" 30 | version = ">= 2.5" 31 | # https://registry.terraform.io/providers/hashicorp/local/ 32 | } 33 | random = { 34 | source = "hashicorp/random" 35 | version = ">= 3.6" 36 | # https://registry.terraform.io/providers/hashicorp/random/ 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /docs/common_workflows/deploying_blueprints_onto_specific_nodes/README.md: -------------------------------------------------------------------------------- 1 | # Deploying Blueprints Onto Specific Nodes 2 | 3 | **Note:** A basic understanding of how to use Kubernetes is required for this task 4 | 5 | Assumption: the node exists and you are installing OCI AI Blueprints alongside this pre-existing node (i.e. the node is in the same cluster as the OCI AI Blueprints application) 6 | 7 | ## Label Nodes 8 | 9 | If you have existing node pools in your original OKE cluster that you'd like Blueprints to be able to use, follow these steps after the stack is finished: 10 | 11 | 1. Find the private IP address of the node you'd like to add. 12 | - Console: 13 | - Go to the OKE cluster in the console like you did above 14 | - Click on "Node pools" 15 | - Click on the pool with the node you want to add 16 | - Identify the private ip address of the node under "Nodes" in the page. 17 | - Command line with `kubectl` (assumes cluster access is setup): 18 | - run `kubectl get nodes` 19 | - run `kubectl describe node ` on each node until you find the node you want to add 20 | - The private ip appears under the `Name` field of the output of `kubectl get nodes`. 21 | 2. Go to the stack and click "Application information". Click the API Url. 22 | 3. Login with the `Admin Username` and `Admin Password` in the Application information tab. 23 | 4. Click the link next to "deployment" which will take you to a page with "Deployment List", and a content box. 24 | 5. Paste in the sample blueprint json found [here](../../sample_blueprints/exisiting_cluster_installation/add_node_to_control_plane.json). 25 | 6. Modify the "recipe_node_name" field to the private IP address you found in step 1 above. 26 | 7. Click "POST". This is a fast operation. 27 | 8. Wait about 20 seconds and refresh the page. It should look like: 28 | 29 | ```json 30 | [ 31 | { 32 | "mode": "update", 33 | "recipe_id": null, 34 | "creation_date": "2025-03-28 11:12 AM UTC", 35 | "deployment_uuid": "750a________cc0bfd", 36 | "deployment_name": "startupaddnode", 37 | "deployment_status": "completed", 38 | "deployment_directive": "commission" 39 | } 40 | ] 41 | ``` 42 | 43 | ### Adding additional labels 44 | 45 | To add any additional labels to nodes that you may wish to use later to specify deployment targets, this field (`recipe_node_labels`) can take any arbitrary number of labels to apply to a given node. 
For example, in the blueprint JSON, you could add the following: 46 | 47 | ```json 48 | "recipe_node_labels": { 49 | "key1": "value1", 50 | "key2": "value2", 51 | "key3": "value3" 52 | } 53 | ``` 54 | 55 | ## Deploy a blueprint 56 | 57 | Now that you have artificially created a shared node pool using the node labels above, you can deploy a recipe to that node pool. 58 | 59 | ```json 60 | { 61 | "recipe_id": "example", 62 | "recipe_mode": "service", 63 | "deployment_name": "a10 deployment", 64 | "recipe_use_shared_node_pool": true, 65 | "recipe_image_uri": "hashicorp/http-echo", 66 | "recipe_container_command_args": ["-text=corrino"], 67 | "recipe_container_port": "5678", 68 | "recipe_node_shape": "BM.GPU.A10.4", 69 | "recipe_replica_count": 1, 70 | "recipe_nvidia_gpu_count": 4, 71 | "shared_node_pool_custom_node_selectors": [ 72 | { 73 | "key": "corrino", 74 | "value": "a10pool" 75 | } 76 | ] 77 | } 78 | ``` 79 | 80 | Note: In the example above, we specified `recipe_nvidia_gpu_count` as 4, which means we want to use 4 of the GPUs on the node. 81 | 82 | Note: We set `shared_node_pool_custom_node_selectors` to the `corrino: a10pool` label to match the shared node pool we created with the existing node. Here, we could also reference any additional labels we added to target specific nodes for work. 83 | 84 | Note: We set `recipe_use_shared_node_pool` to true so that the blueprint (previously called a recipe) uses the shared node pool behavior. 85 | 86 | ## Complete 87 | 88 | At this point, you have successfully deployed a blueprint to an existing node and utilized a portion of that node by specifying the exact number of GPUs you wish to use for the blueprint. 89 | -------------------------------------------------------------------------------- /docs/installing_new_updates.md: -------------------------------------------------------------------------------- 1 | # Installing New Updates 2 | 3 | ## Overview 4 | 5 | The OCI AI Blueprints team regularly publishes **full-stack release packages** (control plane, frontend, blueprints, Terraform). 6 | To upgrade your existing deployment, replace your stack’s source zip with the **latest package** from GitHub Releases and re-apply the stack in **OCI Resource Manager**. 7 | 8 | --- 9 | 10 | ## Upgrade Steps 11 | 12 | 1. **Download the latest release package** 13 | 14 | - Go to **GitHub → Releases** for OCI AI Blueprints. 15 | 16 | - Download the file that ends with `_app.zip` (for example `vX.Y.Z_app.zip`). 17 | 18 | 2. Open **OCI Console → Resource Manager → Stacks**. 19 | 20 | 3. Select the stack you originally used to deploy **OCI AI Blueprints**. 21 | 22 | 4. Click **Edit → Edit Stack**. 23 | 24 | 5. **Upload** the package (the `.zip` downloaded in Step 1). 25 | 26 | > _Tip: the file name should match the release you just downloaded._ 27 | 28 | 6. Click **Next → Next → Confirm** to save the new source. 29 | 30 | 7. Press **Apply** (top-right). A new job starts automatically. 31 | 32 | 8. Wait until the job’s **State** is **Succeeded** — your entire stack is now updated. 33 | 34 | --- 35 | 36 | ## Technical Background 37 | 38 | Updating the stack zip prompts **Resource Manager** to pull the newest Terraform code and container images. 39 | During _Apply_, OKE deployments roll automatically, so no manual pod restarts are needed.
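If you want to confirm the rollout from the command line, a quick spot check with `kubectl` might look like the sketch below. This assumes you have `kubectl` access to the cluster; the namespace and deployment names are placeholders, so substitute the ones used by your OCI AI Blueprints installation.

```bash
# List the Blueprints deployments and the image each one is currently running.
# "corrino" is a placeholder namespace - adjust it to your installation.
kubectl get deployments -n corrino -o wide

# Watch a single deployment finish rolling out after the stack apply.
# "corrino-api" is an illustrative deployment name, not a guaranteed one.
kubectl rollout status deployment/corrino-api -n corrino
```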
40 | 41 | --- 42 | 43 | ## Error Handling 44 | 45 | If a job fails or you see errors in the console, please contact: 46 | 47 | - Vishnu Kammari — 48 | - Grant Neuman — 49 | 50 | Include the full set of logs when reaching out for fastest assistance. 51 | -------------------------------------------------------------------------------- /docs/known_issues.md: -------------------------------------------------------------------------------- 1 | # Known Issues & Solutions 2 | 3 | A place to record issues that arise and their corresponding workarounds. 4 | 5 | ## 500 Errors When Connecting to API 6 | 7 | 1. Check your permissions and verify that they match exactly as shown here: [IAM Policies](../docs/iam_policies.md) 8 | 2. Did you choose `*.nip.io` as your domain name when setting up OCI AI Blueprints? If so, this is an untrusted domain and will be blocked when behind VPN. Either deploy OCI AI Blueprints with a custom domain or access your `*.nip.io` OCI AI Blueprints domain outside of VPN. 9 | 10 | ## Shape BM.GPU4.8 Cannot Schedule Blueprints 11 | 12 | Currently, there is an Oracle Kubernetes Engine (OKE) bug with the `BM.GPU4.8` shape. Since the toolkit runs on top of an OKE cluster, this shape cannot be used with the toolkit until the issue is resolved by OKE. We have diagnosed and reported the issue, and are following up with the OKE team for resolution. The error for this issue presents as shown in the pod events below. 13 | 14 | The following `kubectl` commands can be used to diagnose pods in this state: 15 | 16 | ```bash 17 | kubectl get pods # to find the name of the pod 18 | kubectl describe pod <pod-name> # replace <pod-name> with the name found above 19 | ``` 20 | 21 | This will output all information about the pod. In the `Events:` section (at the very bottom) you will see information like this: 22 | 23 | ``` 24 | Pod info: nvidia-dcgm-node-feature-discovery-worker always gets stuck in container creating with warning / error like: 25 | Warning FailedCreatePodSandBox 12s kubelet Failed to create pod sandbox: rpc error: code = Unknown desc = failed to create pod network sandbox k8s_gpu-operator-1738967226-node-feature-discovery-worker-dzwht_gpu-operator_06605d81-8dc8-48db-a9a9-b393e8bcd068_0 26 | ``` 27 | 28 | Here, the nvidia-dcgm-node-feature-discovery-worker pod gets stuck indefinitely in a "ContainerCreating" / "CrashLoopBackoff" cycle. 29 | 30 | ## Issues Connecting to APIs via Postman or Curl 31 | 32 | Make sure to append a slash ('/') to the end of the URL, such as `https://api.<your_domain>/deployment/` instead of `https://api.<your_domain>/deployment`. 33 | This is especially important for all POST requests.
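For example, a minimal `curl` check might look like this (the hostname is a placeholder for your own Blueprints domain, and authentication details depend on your setup):

```bash
# GET with the trailing slash - this is the form the API expects.
curl -s https://api.<your_domain>/deployment/

# POST a blueprint JSON; again, note the trailing slash.
curl -s -X POST https://api.<your_domain>/deployment/ \
  -H "Content-Type: application/json" \
  -d @blueprint.json
```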
34 | -------------------------------------------------------------------------------- /docs/sample_blueprints/auto_scaling/autoscaling_blueprint.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "llm_inference_nvidia", 3 | "recipe_mode": "service", 4 | "deployment_name": "autoscale_vllm_example", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.5", 6 | "recipe_node_shape": "VM.GPU.A10.2", 7 | "input_object_storage": [ 8 | { 9 | "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/qFv5XzocpOoEXjlxL7Q3ZrrCFkx9GkA1fpg97zmnaNEX9WB_WMXLz2rykGuU1hqQ/n/iduyx1qnmway/b/metallama321binstruct/o/", 10 | "mount_location": "/models", 11 | "volume_size_in_gbs": 100 12 | } 13 | ], 14 | "recipe_container_env": [ 15 | { 16 | "key": "tensor_parallel_size", 17 | "value": "1" 18 | }, 19 | { 20 | "key": "model_name", 21 | "value": "" 22 | }, 23 | { 24 | "key": "Model_Path", 25 | "value": "/models" 26 | } 27 | ], 28 | "recipe_replica_count": 1, 29 | "recipe_container_port": "8000", 30 | "recipe_nvidia_gpu_count": 1, 31 | "recipe_container_command_args": [ 32 | "--model", 33 | "$(Model_Path)", 34 | "--tensor-parallel-size", 35 | "$(tensor_parallel_size)", 36 | "--gpu-memory-utilization", 37 | "0.99", 38 | "--max-model-len", 39 | "1024" 40 | ], 41 | "recipe_ephemeral_storage_size": 200, 42 | "recipe_node_boot_volume_size_in_gbs": 300, 43 | "recipe_node_pool_size": 1, 44 | "recipe_shared_memory_volume_size_limit_in_mb": 200, 45 | "recipe_startup_probe_params": { 46 | "failure_threshold": 30, 47 | "endpoint_path": "/health", 48 | "port": 8000, 49 | "scheme": "HTTP", 50 | "initial_delay_seconds": 60, 51 | "period_seconds": 2, 52 | "success_threshold": 1, 53 | "timeout_seconds": 10 54 | }, 55 | "recipe_liveness_probe_params": { 56 | "failure_threshold": 3, 57 | "endpoint_path": "/health", 58 | "port": 8000, 59 | "scheme": "HTTP", 60 | "initial_delay_seconds": 65, 61 | "period_seconds": 600, 62 | "success_threshold": 1, 63 | "timeout_seconds": 10 64 | }, 65 | "recipe_node_autoscaling_params": { 66 | "min_nodes": 1, 67 | "max_nodes": 2 68 | }, 69 | "recipe_pod_autoscaling_params": { 70 | "min_replicas": 1, 71 | "max_replicas": 4 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /docs/sample_blueprints/cpu-inference/cpu-inference-gemma.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "cpu_inference", 3 | "recipe_mode": "service", 4 | "deployment_name": "cpu Inference gemma BME5", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:cpu_inference_service_v0.2", 6 | "recipe_node_shape": "BM.Standard.E5.192", 7 | "input_object_storage": [ 8 | { 9 | "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/0LYMMBRGg_OEm_hzp9BG8BlQx7Ccpy3gY-gRzjQQFZRU6peG0pXyHTRHUGZLp82E/n/iduyx1qnmway/b/ollama-models/o/", 10 | "mount_location": "/models", 11 | "volume_size_in_gbs": 20 12 | } 13 | ], 14 | "recipe_container_env": [ 15 | { 16 | "key": "MODEL_NAME", 17 | "value": "gemma" 18 | }, 19 | { 20 | "key": "PROMPT", 21 | "value": "What is the capital of Germany?" 
22 | } 23 | ], 24 | "recipe_replica_count": 1, 25 | "recipe_container_port": "11434", 26 | "recipe_node_pool_size": 1, 27 | "recipe_node_boot_volume_size_in_gbs": 200, 28 | "recipe_container_command_args": [ 29 | "--input_directory", 30 | "/models", 31 | "--model_name", 32 | "gemma" 33 | ], 34 | "recipe_ephemeral_storage_size": 100 35 | } 36 | -------------------------------------------------------------------------------- /docs/sample_blueprints/cpu-inference/cpu-inference-mistral-bm.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "cpu_inference", 3 | "recipe_mode": "service", 4 | "deployment_name": "cpu Inference mistral BME4", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:cpu_inference_service_v0.2", 6 | "recipe_node_shape": "BM.Standard.E4.128", 7 | "input_object_storage": [ 8 | { 9 | "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/0LYMMBRGg_OEm_hzp9BG8BlQx7Ccpy3gY-gRzjQQFZRU6peG0pXyHTRHUGZLp82E/n/iduyx1qnmway/b/ollama-models/o/", 10 | "mount_location": "/models", 11 | "volume_size_in_gbs": 20 12 | } 13 | ], 14 | "recipe_container_env": [ 15 | { 16 | "key": "MODEL_NAME", 17 | "value": "mistral" 18 | }, 19 | { 20 | "key": "PROMPT", 21 | "value": "What is the capital of France?" 22 | } 23 | ], 24 | "recipe_replica_count": 1, 25 | "recipe_container_port": "11434", 26 | "recipe_node_pool_size": 1, 27 | "recipe_node_boot_volume_size_in_gbs": 200, 28 | "recipe_container_command_args": [ 29 | "--input_directory", 30 | "/models", 31 | "--model_name", 32 | "mistral" 33 | ], 34 | "recipe_ephemeral_storage_size": 100 35 | } 36 | -------------------------------------------------------------------------------- /docs/sample_blueprints/cpu-inference/cpu-inference-mistral-vm.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "cpu_inference", 3 | "recipe_mode": "service", 4 | "deployment_name": "cpu Inference mistral E4Flex", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:cpu_inference_service_v0.2", 6 | "recipe_node_shape": "VM.Standard.E4.Flex", 7 | "recipe_flex_shape_ocpu_count": 4, 8 | "recipe_flex_shape_memory_size_in_gbs": 64, 9 | "input_object_storage": [ 10 | { 11 | "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/0LYMMBRGg_OEm_hzp9BG8BlQx7Ccpy3gY-gRzjQQFZRU6peG0pXyHTRHUGZLp82E/n/iduyx1qnmway/b/ollama-models/o/", 12 | "mount_location": "/models", 13 | "volume_size_in_gbs": 20 14 | } 15 | ], 16 | "recipe_container_env": [ 17 | { 18 | "key": "MODEL_NAME", 19 | "value": "mistral" 20 | }, 21 | { 22 | "key": "PROMPT", 23 | "value": "What is the capital of Spain?" 
24 | } 25 | ], 26 | "recipe_replica_count": 1, 27 | "recipe_container_port": "11434", 28 | "recipe_node_pool_size": 1, 29 | "recipe_node_boot_volume_size_in_gbs": 200, 30 | "recipe_container_command_args": [ 31 | "--input_directory", 32 | "/models", 33 | "--model_name", 34 | "mistral" 35 | ], 36 | "recipe_ephemeral_storage_size": 100 37 | } 38 | -------------------------------------------------------------------------------- /docs/sample_blueprints/exisiting_cluster_installation/README.md: -------------------------------------------------------------------------------- 1 | # Install OCI AI Blueprints onto an Existing OKE Cluster 2 | 3 | #### Deploy OCI AI Blueprints on your existing OKE cluster without creating new infrastructure 4 | 5 | This guide helps you install and use **OCI AI Blueprints** on an existing OKE cluster that was created outside of blueprints and already has workflows running on it. Rather than installing blueprints onto a new cluster, you can leverage an existing cluster with node pools and tools already installed. 6 | 7 | The installation process involves ensuring you have the correct IAM policies in place, retrieving existing cluster OKE and VCN information from the console, deploying the OCI AI Blueprints application onto the existing cluster, and optionally adding existing nodes to be used by blueprints. You can then deploy sample recipes to test functionality. 8 | 9 | Key considerations include managing existing tooling like Prometheus, Grafana, or the GPU operator that may already be installed on your cluster. The blueprint installation process can detect and work around these existing components. Additionally, if you have the nvidia-gpu-operator installed and plan to use Multi-Instance GPUs with H100 nodes, special configuration steps are available. 10 | 11 | This approach allows you to: 12 | 13 | - Leverage existing cluster resources and configurations 14 | - Add blueprints capabilities without disrupting current workloads 15 | - Utilize existing node pools for blueprint deployments 16 | - Maintain compatibility with pre-installed cluster tools 17 | 18 | ## Pre-Filled Samples 19 | 20 | | Feature Showcase | Title | Description | Blueprint File | 21 | | --------------------------------------------------------------------------------------------- | ---------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------- | 22 | | Add existing cluster nodes to OCI AI Blueprints control plane for shared resource utilization | Add Existing Node to Control Plane | Configures an existing cluster node to be managed by OCI AI Blueprints, enabling shared node pool functionality and resource optimization across existing infrastructure. | [add_node_to_control_plane.json](add_node_to_control_plane.json) | 23 | 24 | For complete step-by-step instructions, see the [full installation guide](../../../INSTALLING_ONTO_EXISTING_CLUSTER_README.md). 
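After posting the sample blueprint, you can optionally confirm from the command line that the node was labeled as expected. This is a sketch that assumes `kubectl` access to the cluster; the label keys and the node name mirror the sample `add_node_to_control_plane.json` and may differ in your environment.

```bash
# Show nodes with the label columns used by the sample blueprint.
kubectl get nodes -L corrino -L corrino/pool-shared-any

# Inspect the full label set on the node you added (the node name is its private IP).
kubectl describe node 10.0.10.164 | grep -A 10 "Labels:"
```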
25 | -------------------------------------------------------------------------------- /docs/sample_blueprints/exisiting_cluster_installation/add_node_to_control_plane.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_mode": "update", 3 | "deployment_name": "startupaddnode", 4 | "recipe_node_name": "10.0.10.164", 5 | "recipe_node_labels": { 6 | "corrino": "a10pool", 7 | "corrino/pool-shared-any": "true" 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /docs/sample_blueprints/gpu-health-check/README.md: -------------------------------------------------------------------------------- 1 | # Health Check 2 | 3 | #### Comprehensive GPU health validation and diagnostics for production readiness 4 | 5 | This repository offers a robust, pre-check recipe for thorough GPU health validation prior to deploying production or research workloads. Designed to operate seamlessly across both single-node and multi-node environments, this diagnostic toolset enables you to verify that your GPU infrastructure is primed for high-demand experiments. By systematically assessing key performance metrics—such as thermal behavior, power stability, and overall hardware reliability—you can proactively detect and address issues like thermal throttling, power irregularities, and GPU instability. This early-warning system minimizes the risk of unexpected downtime and performance degradation, ensuring that your system consistently operates at peak efficiency and reliability during critical computational tasks. 6 | 7 | ## Pre-Filled Samples 8 | 9 | | Feature Showcase | Title | Description | Blueprint File | 10 | | ------------------------------------------------------------------------------------------------------------- | ------------------------- | ----------------------------------------------------------------- | -------------------------------------------------------- | 11 | | Validate A10 GPU performance and stability using 16-bit floating point precision for memory-efficient testing | 2 A10 GPUs with dtype 16 | Deploys 2 A10 GPUs with dtype 16 on VM.GPU.A10.2 with 2 GPU(s). | [healthcheck_fp16_a10.json](healthcheck_fp16_a10.json) | 12 | | Validate A10 GPU performance and stability using 32-bit floating point precision for comprehensive testing | 2 A10 GPUs with dtype 32 | Deploys 2 A10 GPUs with dtype 32 on VM.GPU.A10.2 with 2 GPU(s). | [healthcheck_fp32_a10.json](healthcheck_fp32_a10.json) | 13 | | Validate H100 GPU cluster performance and stability using 16-bit precision for high-scale workloads | 8 H100 GPUs with dtype 16 | Deploys 8 H100 GPUs with dtype 16 on BM.GPU.H100.8 with 8 GPU(s). 
| [healthcheck_fp16_h100.json](healthcheck_fp16_h100.json) | 14 | -------------------------------------------------------------------------------- /docs/sample_blueprints/gpu-health-check/healthcheck_fp16_a10.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "healthcheck", 3 | "recipe_mode": "job", 4 | "deployment_name": "healthcheck_fp16_a10", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:healthcheck_v0.3", 6 | "recipe_node_shape": "VM.GPU.A10.2", 7 | "output_object_storage": [ 8 | { 9 | "bucket_name": "healthcheck2", 10 | "mount_location": "/healthcheck_results", 11 | "volume_size_in_gbs": 20 12 | } 13 | ], 14 | "recipe_container_command_args": [ 15 | "--dtype", 16 | "float16", 17 | "--output_dir", 18 | "/healthcheck_results", 19 | "--expected_gpus", 20 | "A10:2,A100:0,H100:0" 21 | ], 22 | "recipe_replica_count": 1, 23 | "recipe_nvidia_gpu_count": 2, 24 | "recipe_node_pool_size": 1, 25 | "recipe_node_boot_volume_size_in_gbs": 200, 26 | "recipe_ephemeral_storage_size": 100, 27 | "recipe_shared_memory_volume_size_limit_in_mb": 1000 28 | } 29 | -------------------------------------------------------------------------------- /docs/sample_blueprints/gpu-health-check/healthcheck_fp16_h100.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "healthcheck", 3 | "recipe_mode": "job", 4 | "deployment_name": "healthcheck_fp16_h100", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:healthcheck_v0.3", 6 | "recipe_node_shape": "BM.GPU.H100.8", 7 | "output_object_storage": [ 8 | { 9 | "bucket_name": "healthcheck2", 10 | "mount_location": "/healthcheck_results", 11 | "volume_size_in_gbs": 20 12 | } 13 | ], 14 | "recipe_container_command_args": [ 15 | "--dtype", 16 | "float16", 17 | "--output_dir", 18 | "/healthcheck_results", 19 | "--expected_gpus", 20 | "A10:0,A100:0,H100:8" 21 | ], 22 | "recipe_replica_count": 1, 23 | "recipe_nvidia_gpu_count": 8, 24 | "recipe_node_pool_size": 1, 25 | "recipe_node_boot_volume_size_in_gbs": 200, 26 | "recipe_ephemeral_storage_size": 100, 27 | "recipe_shared_memory_volume_size_limit_in_mb": 1000 28 | } 29 | -------------------------------------------------------------------------------- /docs/sample_blueprints/gpu-health-check/healthcheck_fp32_a10.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "healthcheck", 3 | "recipe_mode": "job", 4 | "deployment_name": "healthcheck_fp32_a10", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:healthcheck_v0.3", 6 | "recipe_node_shape": "VM.GPU.A10.2", 7 | "output_object_storage": [ 8 | { 9 | "bucket_name": "healthcheck2", 10 | "mount_location": "/healthcheck_results", 11 | "volume_size_in_gbs": 20 12 | } 13 | ], 14 | "recipe_container_command_args": [ 15 | "--dtype", 16 | "float32", 17 | "--output_dir", 18 | "/healthcheck_results", 19 | "--expected_gpus", 20 | "A10:2,A100:0,H100:0" 21 | ], 22 | "recipe_replica_count": 1, 23 | "recipe_nvidia_gpu_count": 2, 24 | "recipe_node_pool_size": 1, 25 | "recipe_node_boot_volume_size_in_gbs": 200, 26 | "recipe_ephemeral_storage_size": 100, 27 | "recipe_shared_memory_volume_size_limit_in_mb": 1000 28 | } 29 | -------------------------------------------------------------------------------- /docs/sample_blueprints/llm_inference_with_vllm/README.md: -------------------------------------------------------------------------------- 1 | # LLM 
Inference with vLLM 2 | 3 | #### Deploy open-source LLMs to GPUs for inference with vLLM. 4 | 5 | This blueprint simplifies the deployment of LLMs using an open-source inference engine called vLLM. You can deploy a custom model or select from a variety of open-source models on Hugging Face. 6 | 7 | The blueprint deploys the model from an object storage bucket to a GPU node in an OKE cluster in your tenancy. Once deployed, you receive a ready-to-use API endpoint to start generating responses from the model. For mission-critical workloads, you can also configure auto-scaling driven by application metrics like inference latency. To summarize, this blueprint streamlines inference deployment, making it easy to scale and integrate into your applications without deep, technical expertise. 8 | 9 | ## Pre-Filled Samples 10 | 11 | | Feature Showcase | Title | Description | Blueprint File | 12 | | ------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------- | 13 | | Deploy models from OCI Object Storage using pre-authenticated requests (PARs) for faster model loading | Meta-Llama-3.1-8B-Instruct from OCI Object Storage on VM.GPU.A10.2 with vLLM | Deploys Meta-Llama-3.1-8B-Instruct from OCI Object Storage on VM.GPU.A10.2 with vLLM on VM.GPU.A10.2 with 2 GPU(s). | [vllm-model-from-obj-storage.json](vllm-model-from-obj-storage.json) | 14 | | Use vLLM with a gated HuggingFace model which requires pre-authentication and passing an authentication token | meta-llama/Llama-3.2-11B-Vision (Closed Model) from Hugging Face on VM.GPU.A10.2 with vLLM | Deploys meta-llama/Llama-3.2-11B-Vision (Closed Model) from Hugging Face on VM.GPU.A10.2 with vLLM on VM.GPU.A10.2 with 2 GPU(s). | [vllm-closed-hf-model.json](vllm-closed-hf-model.json) | 15 | | Deploy open-source models from HuggingFace and have them downloaded directly on the node | NousResearch/Meta-Llama-3-8B-Instruct (Open Model) from Hugging Face on VM.GPU.A10.2 with vLLM | Deploys NousResearch/Meta-Llama-3-8B-Instruct (Open Model) from Hugging Face on VM.GPU.A10.2 with vLLM on VM.GPU.A10.2 with 2 GPU(s). | [vllm-open-hf-model.json](vllm-open-hf-model.json) | 16 | | Secure vLLM endpoint with API key authentication to control access to the inference service | NousResearch/Meta-Llama-3-8B-Instruct (Open Model) from Hugging Face on VM.GPU.A10.2 with vLLM and Endpoint API Key | Deploys NousResearch/Meta-Llama-3-8B-Instruct (Open Model) from Hugging Face on VM.GPU.A10.2 with vLLM and Endpoint API Key on VM.GPU.A10.2 with 2 GPU(s). 
| [vllm-open-hf-model-api-key-functionality.json](vllm-open-hf-model-api-key-functionality.json) | 17 | -------------------------------------------------------------------------------- /docs/sample_blueprints/llm_inference_with_vllm/vllm-closed-hf-model.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "llm_inference_nvidia", 3 | "recipe_mode": "service", 4 | "deployment_name": "vllm-closed-hf-model", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.2", 6 | "recipe_node_shape": "VM.GPU.A10.2", 7 | "recipe_container_env": [ 8 | { 9 | "key": "HF_TOKEN", 10 | "value": "" 11 | } 12 | ], 13 | "recipe_replica_count": 1, 14 | "recipe_container_port": "8000", 15 | "recipe_nvidia_gpu_count": 2, 16 | "recipe_node_pool_size": 1, 17 | "recipe_node_boot_volume_size_in_gbs": 200, 18 | "recipe_container_command_args": [ 19 | "--model", 20 | "meta-llama/Llama-3.2-11B-Vision", 21 | "--tensor-parallel-size", 22 | "2" 23 | ], 24 | "recipe_ephemeral_storage_size": 100, 25 | "recipe_shared_memory_volume_size_limit_in_mb": 200 26 | } 27 | -------------------------------------------------------------------------------- /docs/sample_blueprints/llm_inference_with_vllm/vllm-model-from-obj-storage.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "llm_inference_nvidia", 3 | "recipe_mode": "service", 4 | "deployment_name": "vllm-model-from-obj-storage", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.6.pos1", 6 | "recipe_node_shape": "VM.GPU.A10.2", 7 | "input_object_storage": [ 8 | { 9 | "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", 10 | "mount_location": "/models", 11 | "volume_size_in_gbs": 500, 12 | "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"] 13 | } 14 | ], 15 | "recipe_container_env": [ 16 | { 17 | "key": "tensor_parallel_size", 18 | "value": "2" 19 | }, 20 | { 21 | "key": "model_name", 22 | "value": "NousResearch/Meta-Llama-3.1-8B-Instruct" 23 | }, 24 | { 25 | "key": "Model_Path", 26 | "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct" 27 | } 28 | ], 29 | "recipe_replica_count": 1, 30 | "recipe_container_port": "8000", 31 | "recipe_nvidia_gpu_count": 2, 32 | "recipe_node_pool_size": 1, 33 | "recipe_node_boot_volume_size_in_gbs": 200, 34 | "recipe_container_command_args": [ 35 | "--model", 36 | "$(Model_Path)", 37 | "--tensor-parallel-size", 38 | "$(tensor_parallel_size)" 39 | ], 40 | "recipe_ephemeral_storage_size": 100, 41 | "recipe_shared_memory_volume_size_limit_in_mb": 200 42 | } 43 | -------------------------------------------------------------------------------- /docs/sample_blueprints/llm_inference_with_vllm/vllm-open-hf-model-api-key-functionality.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "llm_inference_nvidia", 3 | "recipe_mode": "service", 4 | "deployment_name": "vllm-open-hf-model-api-key-functionality", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.2", 6 | "recipe_node_shape": "VM.GPU.A10.2", 7 | "recipe_container_env": [ 8 | { 9 | "key": "VLLM_API_KEY", 10 | "value": "" 11 | } 12 | ], 13 | "recipe_replica_count": 1, 14 | "recipe_container_port": "8000", 15 | "recipe_nvidia_gpu_count": 2, 16 | "recipe_node_pool_size": 1, 17 | "recipe_node_boot_volume_size_in_gbs": 
200, 18 | "recipe_container_command_args": [ 19 | "--model", 20 | "NousResearch/Meta-Llama-3-8B-Instruct", 21 | "--tensor-parallel-size", 22 | "2" 23 | ], 24 | "recipe_ephemeral_storage_size": 100, 25 | "recipe_shared_memory_volume_size_limit_in_mb": 200 26 | } 27 | -------------------------------------------------------------------------------- /docs/sample_blueprints/llm_inference_with_vllm/vllm-open-hf-model.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "llm_inference_nvidia", 3 | "recipe_mode": "service", 4 | "deployment_name": "vllm-open-hf-model", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.2", 6 | "recipe_node_shape": "VM.GPU.A10.2", 7 | "recipe_container_env": [ 8 | { 9 | "key": "tensor_parallel_size", 10 | "value": "2" 11 | }, 12 | { 13 | "key": "model_name", 14 | "value": "NousResearch/Meta-Llama-3-8B-Instruct" 15 | } 16 | ], 17 | "recipe_replica_count": 1, 18 | "recipe_container_port": "8000", 19 | "recipe_nvidia_gpu_count": 2, 20 | "recipe_node_pool_size": 1, 21 | "recipe_node_boot_volume_size_in_gbs": 200, 22 | "recipe_container_command_args": [ 23 | "--model", 24 | "$(model_name)", 25 | "--tensor-parallel-size", 26 | "$(tensor_parallel_size)" 27 | ], 28 | "recipe_ephemeral_storage_size": 100, 29 | "recipe_shared_memory_volume_size_limit_in_mb": 200 30 | } 31 | -------------------------------------------------------------------------------- /docs/sample_blueprints/lora-benchmarking/README.md: -------------------------------------------------------------------------------- 1 | # Fine-Tuning Benchmarking 2 | 3 | #### Fine-tune quantized Llama-2-70B model using MLCommons methodology for infrastructure benchmarking 4 | 5 | The fine-tuning benchmarking blueprint streamlines infrastructure benchmarking for fine-tuning using the MLCommons methodology. It fine-tunes a quantized Llama-2-70B model and a standard dataset. 6 | 7 | Once complete, benchmarking results, such as training time and resource utilization, are available in MLFlow and Grafana for easy tracking. This blueprint enables data-driven infrastructure decisions for your fine-tuning jobs. 8 | 9 | ## Pre-Filled Samples 10 | 11 | | Feature Showcase | Title | Description | Blueprint File | 12 | | ------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | 13 | | Benchmark LoRA fine-tuning performance using MLCommons methodology with quantized large language models | LoRA fine-tuning of quantitized Llama-2-70B model on A100 node using MLCommons methodology | Deploys LoRA fine-tuning of quantitized Llama-2-70B model on A100 node using MLCommons methodology on BM.GPU.A100.8 with 8 GPU(s). 
| [mlcommons_lora_finetune_nvidia_sample_recipe.json](mlcommons_lora_finetune_nvidia_sample_recipe.json) | 14 | -------------------------------------------------------------------------------- /docs/sample_blueprints/lora-benchmarking/mlcommons_lora_finetune_nvidia_sample_recipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "mlcommons_lora_finetune_nvidia", 3 | "deployment_name": "MLCommons Finetune LORA/PEFT", 4 | "recipe_mode": "job", 5 | "recipe_node_shape": "BM.GPU.A100.8", 6 | "recipe_use_shared_node_pool": false, 7 | "recipe_nvidia_gpu_count": 8, 8 | "recipe_ephemeral_storage_size": 50, 9 | "recipe_replica_count": 1, 10 | "recipe_node_pool_size": 1, 11 | "recipe_node_boot_volume_size_in_gbs": 200, 12 | "recipe_shared_memory_volume_size_limit_in_mb": 100, 13 | "input_object_storage": [ 14 | { 15 | "bucket_name": "corrino_mlcommons_llama2_70b_qkv", 16 | "mount_location": "/models", 17 | "volume_size_in_gbs": 500 18 | }, 19 | { 20 | "bucket_name": "corrino_ml_commons_scrolls_dataset", 21 | "mount_location": "/dataset", 22 | "volume_size_in_gbs": 100 23 | } 24 | ], 25 | "output_object_storage": [ 26 | { 27 | "bucket_name": "corrino_ml_commons_output", 28 | "mount_location": "/mlcommons_output", 29 | "volume_size_in_gbs": 200 30 | } 31 | ], 32 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:corrino-recipe-mlcommons", 33 | "recipe_container_env": [ 34 | { 35 | "key": "model_name", 36 | "value": "regisss/llama2-70b-fused-qkv-mlperf" 37 | }, 38 | { 39 | "key": "Model_Path", 40 | "value": "/models" 41 | }, 42 | { 43 | "key": "Dataset_Path", 44 | "value": "/dataset" 45 | }, 46 | { 47 | "key": "Lora_R", 48 | "value": "16" 49 | }, 50 | { 51 | "key": "Lora_Alpha", 52 | "value": "32" 53 | }, 54 | { 55 | "key": "Lora_Dropout", 56 | "value": "0.1" 57 | }, 58 | { 59 | "key": "Max_Seq_Len", 60 | "value": "8192" 61 | }, 62 | { 63 | "key": "bf16", 64 | "value": "true" 65 | }, 66 | { 67 | "key": "Logging_Steps", 68 | "value": "24" 69 | }, 70 | { 71 | "key": "Eval_Steps", 72 | "value": "48" 73 | }, 74 | { 75 | "key": "Per_Device_Train_Batch_Size", 76 | "value": "1" 77 | }, 78 | { 79 | "key": "Gradient_Accumulation_Steps", 80 | "value": "1" 81 | }, 82 | { 83 | "key": "Lr_Scheduler_Type", 84 | "value": "cosine" 85 | }, 86 | { 87 | "key": "Learning_Rate", 88 | "value": "0.0004" 89 | }, 90 | { 91 | "key": "Weight_Decay", 92 | "value": "0.0001" 93 | }, 94 | { 95 | "key": "Warmup_Ratio", 96 | "value": "0" 97 | }, 98 | { 99 | "key": "Max_Grad_Norm", 100 | "value": "0.3" 101 | }, 102 | { 103 | "key": "Use_Gradient_Checkpointing", 104 | "value": "true" 105 | }, 106 | { 107 | "key": "Target_Eval_Loss", 108 | "value": "0.925" 109 | }, 110 | { 111 | "key": "Use_Peft_Lora", 112 | "value": "true" 113 | }, 114 | { 115 | "key": "Max_Steps", 116 | "value": "1024" 117 | }, 118 | { 119 | "key": "Use_Flash_Attn", 120 | "value": "true" 121 | }, 122 | { 123 | "key": "Seed", 124 | "value": "1234" 125 | }, 126 | { 127 | "key": "Lora_Target_Modules", 128 | "value": "qkv_proj,o_proj" 129 | }, 130 | { 131 | "key": "Mlflow_Exp_Name", 132 | "value": "oci_ai_blueprints_nvidia_recipe" 133 | }, 134 | { 135 | "key": "Output_Dir", 136 | "value": "/mlcommons_output" 137 | } 138 | ] 139 | } 140 | -------------------------------------------------------------------------------- /docs/sample_blueprints/lora-fine-tuning/closed_model_open_dataset_hf.backend.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"recipe_id": "lora_finetune_nvidia", 3 | "deployment_name": "dk_closed_model_open_dataset", 4 | "recipe_mode": "job", 5 | "recipe_node_shape": "VM.GPU.A10.2", 6 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:finetune_lora_dev", 7 | "recipe_nvidia_gpu_count": 2, 8 | "recipe_ephemeral_storage_size": 300, 9 | "recipe_replica_count": 1, 10 | "recipe_node_pool_size": 1, 11 | "recipe_node_boot_volume_size_in_gbs": 500, 12 | "recipe_shared_memory_volume_size_limit_in_mb": 100, 13 | "recipe_container_env": [ 14 | { 15 | "key": "Mlflow_Endpoint", 16 | "value": "http://mlflow.cluster-tools.svc.cluster.local:5000" 17 | }, 18 | { 19 | "key": "Mlflow_Exp_Name", 20 | "value": "oci_ai_blueprints_nvidia_recipe" 21 | }, 22 | { 23 | "key": "Mlflow_Run_Name", 24 | "value": "llama-3.2-1B-Instruct-scrolls-gov_report" 25 | }, 26 | { 27 | "key": "Hf_Token", 28 | "value": "" 29 | }, 30 | { 31 | "key": "Download_Dataset_From_Hf", 32 | "value": "true" 33 | }, 34 | { 35 | "key": "Dataset_Name", 36 | "value": "tau/scrolls" 37 | }, 38 | { 39 | "key": "Dataset_Sub_Name", 40 | "value": "gov_report" 41 | }, 42 | { 43 | "key": "Dataset_Column_To_Use", 44 | "value": "None" 45 | }, 46 | { 47 | "key": "Dataset_Path", 48 | "value": "/workspace/datasets" 49 | }, 50 | { 51 | "key": "Download_Model_From_Hf", 52 | "value": "true" 53 | }, 54 | { 55 | "key": "Model_Name", 56 | "value": "meta-llama/Llama-3.2-1B-Instruct" 57 | }, 58 | { 59 | "key": "Model_Path", 60 | "value": "/workspace/models" 61 | }, 62 | { 63 | "key": "Max_Model_Length", 64 | "value": "8192" 65 | }, 66 | { 67 | "key": "Resume_From_Checkpoint", 68 | "value": "false" 69 | }, 70 | { 71 | "key": "Checkpoint_Path", 72 | "value": "/checkpoint" 73 | }, 74 | { 75 | "key": "Lora_R", 76 | "value": "8" 77 | }, 78 | { 79 | "key": "Lora_Alpha", 80 | "value": "32" 81 | }, 82 | { 83 | "key": "Lora_Dropout", 84 | "value": "0.1" 85 | }, 86 | { 87 | "key": "Lora_Target_Modules", 88 | "value": "q_proj,up_proj,o_proj,k_proj,down_proj,gate_proj,v_proj" 89 | }, 90 | { 91 | "key": "Bias", 92 | "value": "none" 93 | }, 94 | { 95 | "key": "Task_Type", 96 | "value": "CAUSAL_LM" 97 | }, 98 | { 99 | "key": "Per_Device_Train_Batch_Size", 100 | "value": "1" 101 | }, 102 | { 103 | "key": "Gradient_Accumulation_Steps", 104 | "value": "1" 105 | }, 106 | { 107 | "key": "Warmup_Steps", 108 | "value": "2" 109 | }, 110 | { 111 | "key": "Save_Steps", 112 | "value": "100" 113 | }, 114 | { 115 | "key": "Learning_Rate", 116 | "value": "0.0002" 117 | }, 118 | { 119 | "key": "Fp16", 120 | "value": "true" 121 | }, 122 | { 123 | "key": "Logging_Steps", 124 | "value": "1" 125 | }, 126 | { 127 | "key": "Output_Dir", 128 | "value": "/tunedmodels/Llama-3.1-8B-english_quotes" 129 | }, 130 | { 131 | "key": "Optim", 132 | "value": "paged_adamw_8bit" 133 | }, 134 | { 135 | "key": "Number_of_Training_Epochs", 136 | "value": "2" 137 | }, 138 | { 139 | "key": "Require_Persistent_Output_Dir", 140 | "value": "true" 141 | } 142 | ], 143 | "output_object_storage": [ 144 | { 145 | "bucket_name": "corrino_tuned_hf_oss_models", 146 | "mount_location": "/tunedmodels", 147 | "volume_size_in_gbs": 500 148 | } 149 | ] 150 | } 151 | -------------------------------------------------------------------------------- /docs/sample_blueprints/lora-fine-tuning/open_model_open_dataset_hf.backend.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "lora_finetune_nvidia", 3 | "deployment_name": "dk_open_model_open_dataset", 4 | "recipe_mode": "job", 5 | 
"recipe_node_shape": "VM.GPU.A10.2", 6 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:finetune_lora_dev", 7 | "recipe_nvidia_gpu_count": 2, 8 | "recipe_ephemeral_storage_size": 300, 9 | "recipe_replica_count": 1, 10 | "recipe_node_pool_size": 1, 11 | "recipe_node_boot_volume_size_in_gbs": 500, 12 | "recipe_shared_memory_volume_size_limit_in_mb": 100, 13 | "recipe_container_env": [ 14 | { 15 | "key": "Mlflow_Endpoint", 16 | "value": "http://mlflow.cluster-tools.svc.cluster.local:5000" 17 | }, 18 | { 19 | "key": "Mlflow_Exp_Name", 20 | "value": "oci_ai_blueprints_nvidia_recipe" 21 | }, 22 | { 23 | "key": "Mlflow_Run_Name", 24 | "value": "oci_ai_blueprints_run" 25 | }, 26 | { 27 | "key": "Hf_Token", 28 | "value": "None" 29 | }, 30 | { 31 | "key": "Download_Dataset_From_Hf", 32 | "value": "true" 33 | }, 34 | { 35 | "key": "Dataset_Name", 36 | "value": "Abirate/english_quotes" 37 | }, 38 | { 39 | "key": "Dataset_Sub_Name", 40 | "value": "None" 41 | }, 42 | { 43 | "key": "Dataset_Column_To_Use", 44 | "value": "None" 45 | }, 46 | { 47 | "key": "Dataset_Path", 48 | "value": "/workspace/datasets" 49 | }, 50 | { 51 | "key": "Download_Model_From_Hf", 52 | "value": "true" 53 | }, 54 | { 55 | "key": "Model_Name", 56 | "value": "NousResearch/Meta-Llama-3.1-8B" 57 | }, 58 | { 59 | "key": "Model_Path", 60 | "value": "/workspace/models" 61 | }, 62 | { 63 | "key": "Max_Model_Length", 64 | "value": "8192" 65 | }, 66 | { 67 | "key": "Resume_From_Checkpoint", 68 | "value": "false" 69 | }, 70 | { 71 | "key": "Checkpoint_Path", 72 | "value": "/checkpoint" 73 | }, 74 | { 75 | "key": "Lora_R", 76 | "value": "8" 77 | }, 78 | { 79 | "key": "Lora_Alpha", 80 | "value": "32" 81 | }, 82 | { 83 | "key": "Lora_Dropout", 84 | "value": "0.1" 85 | }, 86 | { 87 | "key": "Lora_Target_Modules", 88 | "value": "q_proj,up_proj,o_proj,k_proj,down_proj,gate_proj,v_proj" 89 | }, 90 | { 91 | "key": "Bias", 92 | "value": "none" 93 | }, 94 | { 95 | "key": "Task_Type", 96 | "value": "CAUSAL_LM" 97 | }, 98 | { 99 | "key": "Per_Device_Train_Batch_Size", 100 | "value": "1" 101 | }, 102 | { 103 | "key": "Gradient_Accumulation_Steps", 104 | "value": "1" 105 | }, 106 | { 107 | "key": "Warmup_Steps", 108 | "value": "2" 109 | }, 110 | { 111 | "key": "Save_Steps", 112 | "value": "100" 113 | }, 114 | { 115 | "key": "Learning_Rate", 116 | "value": "0.0002" 117 | }, 118 | { 119 | "key": "Fp16", 120 | "value": "true" 121 | }, 122 | { 123 | "key": "Logging_Steps", 124 | "value": "1" 125 | }, 126 | { 127 | "key": "Output_Dir", 128 | "value": "/tunedmodels/Llama-3.1-8B-english_quotes" 129 | }, 130 | { 131 | "key": "Optim", 132 | "value": "paged_adamw_8bit" 133 | }, 134 | { 135 | "key": "Number_of_Training_Epochs", 136 | "value": "2" 137 | }, 138 | { 139 | "key": "Require_Persistent_Output_Dir", 140 | "value": "true" 141 | } 142 | ], 143 | "output_object_storage": [ 144 | { 145 | "bucket_name": "corrino_tuned_hf_oss_models", 146 | "mount_location": "/tunedmodels", 147 | "volume_size_in_gbs": 500 148 | } 149 | ] 150 | } 151 | -------------------------------------------------------------------------------- /docs/sample_blueprints/mig_multi_instance_gpu/mig_enabled_shared_node_pool.json: -------------------------------------------------------------------------------- 1 | { 2 | "deployment_name": "H100_pool_mig", 3 | "recipe_mode": "shared_node_pool", 4 | "shared_node_pool_size": 1, 5 | "shared_node_pool_shape": "BM.GPU.H100.8", 6 | "shared_node_pool_boot_volume_size_in_gbs": 1000, 7 | "shared_node_pool_mig_config": "all-1g.20gb" 8 
| } 9 | -------------------------------------------------------------------------------- /docs/sample_blueprints/mig_multi_instance_gpu/mig_inference_multiple_replicas.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "llm_inference_nvidia", 3 | "recipe_mode": "service", 4 | "deployment_name": "autoscale_mig", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.5", 6 | "recipe_node_shape": "BM.GPU.H100.8", 7 | "recipe_container_env": [ 8 | { 9 | "key": "tensor_parallel_size", 10 | "value": "1" 11 | }, 12 | { 13 | "key": "HF_TOKEN", 14 | "value": "" 15 | } 16 | ], 17 | "recipe_replica_count": 5, 18 | "recipe_container_port": "8000", 19 | "recipe_nvidia_gpu_count": 1, 20 | "recipe_use_shared_node_pool": true, 21 | "mig_resource_request": "1g.10gb", 22 | "recipe_container_command_args": [ 23 | "--model", 24 | "meta-llama/Llama-3.2-3B-Instruct", 25 | "--dtype", 26 | "bfloat16", 27 | "--tensor-parallel-size", 28 | "$(tensor_parallel_size)", 29 | "--gpu-memory-utilization", 30 | "0.99", 31 | "--max-model-len", 32 | "16384" 33 | ], 34 | "recipe_ephemeral_storage_size": 30, 35 | "recipe_node_boot_volume_size_in_gbs": 300, 36 | "recipe_shared_memory_volume_size_limit_in_mb": 1000, 37 | "recipe_startup_probe_params": { 38 | "failure_threshold": 30, 39 | "endpoint_path": "/health", 40 | "port": 8000, 41 | "scheme": "HTTP", 42 | "initial_delay_seconds": 10, 43 | "period_seconds": 2, 44 | "success_threshold": 1, 45 | "timeout_seconds": 1 46 | }, 47 | "recipe_liveness_probe_params": { 48 | "failure_threshold": 3, 49 | "endpoint_path": "/health", 50 | "port": 8000, 51 | "scheme": "HTTP", 52 | "initial_delay_seconds": 65, 53 | "period_seconds": 600, 54 | "success_threshold": 1, 55 | "timeout_seconds": 1 56 | }, 57 | "recipe_pod_autoscaling_params": { 58 | "min_replicas": 5, 59 | "max_replicas": 10 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /docs/sample_blueprints/mig_multi_instance_gpu/mig_inference_single_replica.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "llm_inference_nvidia", 3 | "recipe_mode": "service", 4 | "deployment_name": "autoscale_mig", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.5", 6 | "recipe_node_shape": "BM.GPU.H100.8", 7 | "recipe_container_env": [ 8 | { 9 | "key": "tensor_parallel_size", 10 | "value": "1" 11 | }, 12 | { 13 | "key": "HF_TOKEN", 14 | "value": "" 15 | } 16 | ], 17 | "recipe_replica_count": 1, 18 | "recipe_container_port": "8000", 19 | "recipe_nvidia_gpu_count": 1, 20 | "recipe_use_shared_node_pool": true, 21 | "mig_resource_request": "1g.20gb", 22 | "recipe_container_command_args": [ 23 | "--model", 24 | "meta-llama/Llama-3.2-3B-Instruct", 25 | "--dtype", 26 | "bfloat16", 27 | "--tensor-parallel-size", 28 | "$(tensor_parallel_size)", 29 | "--gpu-memory-utilization", 30 | "0.99", 31 | "--max-model-len", 32 | "16384" 33 | ], 34 | "recipe_ephemeral_storage_size": 30, 35 | "recipe_node_boot_volume_size_in_gbs": 300, 36 | "recipe_shared_memory_volume_size_limit_in_mb": 1000, 37 | "recipe_startup_probe_params": { 38 | "failure_threshold": 30, 39 | "endpoint_path": "/health", 40 | "port": 8000, 41 | "scheme": "HTTP", 42 | "initial_delay_seconds": 10, 43 | "period_seconds": 2, 44 | "success_threshold": 1, 45 | "timeout_seconds": 1 46 | }, 47 | "recipe_liveness_probe_params": { 48 | "failure_threshold": 3, 49 | "endpoint_path": 
"/health", 50 | "port": 8000, 51 | "scheme": "HTTP", 52 | "initial_delay_seconds": 65, 53 | "period_seconds": 600, 54 | "success_threshold": 1, 55 | "timeout_seconds": 1 56 | }, 57 | "recipe_pod_autoscaling_params": { 58 | "min_replicas": 1, 59 | "max_replicas": 50 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /docs/sample_blueprints/mig_multi_instance_gpu/mig_inference_single_replica_10gb.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "llm_inference_nvidia", 3 | "recipe_mode": "service", 4 | "deployment_name": "autoscale_mig", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.5", 6 | "recipe_node_shape": "BM.GPU.H100.8", 7 | "recipe_container_env": [ 8 | { 9 | "key": "tensor_parallel_size", 10 | "value": "1" 11 | }, 12 | { 13 | "key": "HF_TOKEN", 14 | "value": "" 15 | } 16 | ], 17 | "recipe_replica_count": 1, 18 | "recipe_container_port": "8000", 19 | "recipe_nvidia_gpu_count": 1, 20 | "recipe_use_shared_node_pool": true, 21 | "mig_resource_request": "1g.10gb", 22 | "recipe_container_command_args": [ 23 | "--model", 24 | "meta-llama/Llama-3.2-3B-Instruct", 25 | "--dtype", 26 | "bfloat16", 27 | "--tensor-parallel-size", 28 | "$(tensor_parallel_size)", 29 | "--gpu-memory-utilization", 30 | "0.99", 31 | "--max-model-len", 32 | "16384" 33 | ], 34 | "recipe_ephemeral_storage_size": 30, 35 | "recipe_node_boot_volume_size_in_gbs": 300, 36 | "recipe_shared_memory_volume_size_limit_in_mb": 1000, 37 | "recipe_startup_probe_params": { 38 | "failure_threshold": 30, 39 | "endpoint_path": "/health", 40 | "port": 8000, 41 | "scheme": "HTTP", 42 | "initial_delay_seconds": 10, 43 | "period_seconds": 2, 44 | "success_threshold": 1, 45 | "timeout_seconds": 1 46 | }, 47 | "recipe_liveness_probe_params": { 48 | "failure_threshold": 3, 49 | "endpoint_path": "/health", 50 | "port": 8000, 51 | "scheme": "HTTP", 52 | "initial_delay_seconds": 65, 53 | "period_seconds": 600, 54 | "success_threshold": 1, 55 | "timeout_seconds": 1 56 | }, 57 | "recipe_pod_autoscaling_params": { 58 | "min_replicas": 1, 59 | "max_replicas": 50 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /docs/sample_blueprints/mig_multi_instance_gpu/mig_slices.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-ai-blueprints/31c64899055982e73cb2c1b95345faafe6d5c7ff/docs/sample_blueprints/mig_multi_instance_gpu/mig_slices.png -------------------------------------------------------------------------------- /docs/sample_blueprints/mig_multi_instance_gpu/mig_update_node_with_node_name.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_mode": "update", 3 | "deployment_name": "all-1g10gb", 4 | "recipe_node_name": "10.0.10.138", 5 | "shared_node_pool_mig_config": "all-1g.10gb" 6 | } 7 | -------------------------------------------------------------------------------- /docs/sample_blueprints/mig_multi_instance_gpu/mig_update_shared_pool_with_node_pool_name.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_mode": "update", 3 | "deployment_name": "all-2g-20gb", 4 | "recipe_node_pool_name": "h100migpool", 5 | "shared_node_pool_mig_config": "all-2g.20gb" 6 | } 7 | -------------------------------------------------------------------------------- 
/docs/sample_blueprints/model_storage/download_closed_hf_model_to_object_storage.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "example", 3 | "recipe_mode": "job", 4 | "deployment_name": "model_to_object", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:hf_downloader_v1", 6 | "recipe_container_command_args": [ 7 | "meta-llama/Llama-3.2-90B-Vision-Instruct", 8 | "--local-dir", 9 | "/models", 10 | "--max-workers", 11 | "4", 12 | "--token", 13 | "" 14 | ], 15 | "recipe_container_port": "5678", 16 | "recipe_node_shape": "VM.Standard.E4.Flex", 17 | "recipe_node_pool_size": 1, 18 | "recipe_flex_shape_ocpu_count": 4, 19 | "recipe_flex_shape_memory_size_in_gbs": 64, 20 | "recipe_node_boot_volume_size_in_gbs": 500, 21 | "recipe_ephemeral_storage_size": 450, 22 | "output_object_storage": [ 23 | { 24 | "bucket_name": "llama3290Bvisioninstruct", 25 | "mount_location": "/models", 26 | "volume_size_in_gbs": 450 27 | } 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /docs/sample_blueprints/model_storage/download_open_hf_model_to_object_storage.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "example", 3 | "recipe_mode": "job", 4 | "deployment_name": "model_to_object", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:hf_downloader_v1", 6 | "recipe_container_command_args": [ 7 | "NousResearch/Meta-Llama-3.1-405B-FP8", 8 | "--local-dir", 9 | "/models", 10 | "--max-workers", 11 | "16" 12 | ], 13 | "recipe_container_port": "5678", 14 | "recipe_node_shape": "VM.Standard.E4.Flex", 15 | "recipe_node_pool_size": 1, 16 | "recipe_flex_shape_ocpu_count": 16, 17 | "recipe_flex_shape_memory_size_in_gbs": 256, 18 | "recipe_node_boot_volume_size_in_gbs": 1000, 19 | "recipe_ephemeral_storage_size": 900, 20 | "output_object_storage": [ 21 | { 22 | "bucket_name": "nousllama31405bfp8", 23 | "mount_location": "/models", 24 | "volume_size_in_gbs": 800 25 | } 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /docs/sample_blueprints/multi-node-inference/multinode_inference_BM_A10.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "vllm_multinode_inference", 3 | "recipe_mode": "service", 4 | "deployment_name": "multinode_inference", 5 | "recipe_node_shape": "BM.GPU.A10.4", 6 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:ray2430_vllmv083", 7 | "input_object_storage": [ 8 | { 9 | "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", 10 | "mount_location": "/models", 11 | "volume_size_in_gbs": 500, 12 | "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"] 13 | } 14 | ], 15 | "recipe_replica_count": 1, 16 | "recipe_nvidia_gpu_count": 4, 17 | "recipe_ephemeral_storage_size": 150, 18 | "recipe_shared_memory_volume_size_limit_in_mb": 10000, 19 | "recipe_container_port": "8000", 20 | "recipe_use_shared_node_pool": true, 21 | "multinode_num_nodes_to_use_from_shared_pool": 2, 22 | "recipe_container_command_args": [ 23 | "--port", 24 | "8000", 25 | "--model", 26 | "/models", 27 | "--tensor-parallel-size", 28 | "4", 29 | "--pipeline-parallel-size", 30 | "2", 31 | "--gpu-memory-utilization", 32 | "0.90", 33 | "--distributed-executor-backend", 34 | "ray" 35 | ], 
36 | "recipe_readiness_probe_params": { 37 | "endpoint_path": "/health", 38 | "port": 8000, 39 | "initial_delay_seconds": 20, 40 | "period_seconds": 10 41 | } 42 | } 43 | 44 | -------------------------------------------------------------------------------- /docs/sample_blueprints/multi-node-inference/multinode_inference_VM_A10.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "vllm_multinode_inference", 3 | "recipe_mode": "service", 4 | "deployment_name": "multinode_inference", 5 | "recipe_node_shape": "VM.GPU.A10.2", 6 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:ray2430_vllmv083", 7 | "input_object_storage": [ 8 | { 9 | "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", 10 | "mount_location": "/models", 11 | "volume_size_in_gbs": 500, 12 | "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"] 13 | } 14 | ], 15 | "recipe_replica_count": 1, 16 | "recipe_nvidia_gpu_count": 2, 17 | "recipe_ephemeral_storage_size": 150, 18 | "recipe_shared_memory_volume_size_limit_in_mb": 10000, 19 | "recipe_container_port": "8000", 20 | "recipe_use_shared_node_pool": true, 21 | "multinode_num_nodes_to_use_from_shared_pool": 2, 22 | "recipe_container_command_args": [ 23 | "--port", 24 | "8000", 25 | "--model", 26 | "/models", 27 | "--tensor-parallel-size", 28 | "2", 29 | "--pipeline-parallel-size", 30 | "2", 31 | "--gpu-memory-utilization", 32 | "0.90", 33 | "--distributed-executor-backend", 34 | "ray" 35 | ], 36 | "recipe_readiness_probe_params": { 37 | "endpoint_path": "/health", 38 | "port": 8000, 39 | "initial_delay_seconds": 20, 40 | "period_seconds": 10 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /docs/sample_blueprints/shared_node_pools/shared_node_pool_A10_BM.json: -------------------------------------------------------------------------------- 1 | { 2 | "deployment_name": "BM.GPU.A10.4 shared pool", 3 | "recipe_mode": "shared_node_pool", 4 | "shared_node_pool_size": 2, 5 | "shared_node_pool_shape": "BM.GPU.A10.4", 6 | "shared_node_pool_boot_volume_size_in_gbs": 500 7 | } 8 | -------------------------------------------------------------------------------- /docs/sample_blueprints/shared_node_pools/shared_node_pool_A10_VM.json: -------------------------------------------------------------------------------- 1 | { 2 | "deployment_name": "VM.GPU.A10.2 shared pool", 3 | "recipe_mode": "shared_node_pool", 4 | "shared_node_pool_size": 2, 5 | "shared_node_pool_shape": "VM.GPU.A10.2", 6 | "shared_node_pool_boot_volume_size_in_gbs": 500 7 | } 8 | -------------------------------------------------------------------------------- /docs/sample_blueprints/shared_node_pools/vllm_inference_sample_shared_pool_blueprint.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "llm_inference_nvidia", 3 | "recipe_mode": "service", 4 | "deployment_name": "vLLM Inference Deployment", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.6.pos1", 6 | "recipe_node_shape": "BM.GPU.A10.4", 7 | "input_object_storage": [ 8 | { 9 | "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", 10 | "mount_location": "/models", 11 | "volume_size_in_gbs": 500, 12 | "include": 
["NousResearch/Meta-Llama-3.1-8B-Instruct"] 13 | } 14 | ], 15 | "recipe_container_env": [ 16 | { 17 | "key": "tensor_parallel_size", 18 | "value": "2" 19 | }, 20 | { 21 | "key": "model_name", 22 | "value": "NousResearch/Meta-Llama-3.1-8B-Instruct" 23 | }, 24 | { 25 | "key": "Model_Path", 26 | "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct" 27 | } 28 | ], 29 | "recipe_replica_count": 1, 30 | "recipe_container_port": "8000", 31 | "recipe_nvidia_gpu_count": 2, 32 | "recipe_use_shared_node_pool": true, 33 | "recipe_node_boot_volume_size_in_gbs": 200, 34 | "recipe_container_command_args": [ 35 | "--model", 36 | "$(Model_Path)", 37 | "--tensor-parallel-size", 38 | "$(tensor_parallel_size)" 39 | ], 40 | "recipe_ephemeral_storage_size": 100, 41 | "recipe_shared_memory_volume_size_limit_in_mb": 1000 42 | } 43 | -------------------------------------------------------------------------------- /docs/sample_blueprints/startup_liveness_readiness_probes/autoscale_with_fss.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "llm_inference_nvidia", 3 | "recipe_mode": "service", 4 | "deployment_name": "autoscale_with_fss", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.5", 6 | "recipe_node_shape": "VM.GPU.A10.2", 7 | "recipe_container_env": [ 8 | { 9 | "key": "tensor_parallel_size", 10 | "value": "1" 11 | }, 12 | { 13 | "key": "Model_Path", 14 | "value": "/models/models/meta-llama/Llama-3.2-1B-Instruct" 15 | } 16 | ], 17 | "recipe_replica_count": 1, 18 | "recipe_container_port": "8000", 19 | "recipe_nvidia_gpu_count": 1, 20 | "recipe_container_command_args": [ 21 | "--model", 22 | "$(Model_Path)", 23 | "--tensor-parallel-size", 24 | "$(tensor_parallel_size)", 25 | "--gpu-memory-utilization", 26 | "0.99", 27 | "--max-model-len", 28 | "1024" 29 | ], 30 | "recipe_ephemeral_storage_size": 200, 31 | "recipe_node_boot_volume_size_in_gbs": 300, 32 | "recipe_node_pool_size": 1, 33 | "recipe_shared_memory_volume_size_limit_in_mb": 200, 34 | "recipe_startup_probe_params": { 35 | "failure_threshold": 30, 36 | "endpoint_path": "/health", 37 | "port": 8000, 38 | "scheme": "HTTP", 39 | "initial_delay_seconds": 10, 40 | "period_seconds": 2, 41 | "success_threshold": 1, 42 | "timeout_seconds": 1 43 | }, 44 | "recipe_liveness_probe_params": { 45 | "failure_threshold": 3, 46 | "endpoint_path": "/health", 47 | "port": 8000, 48 | "scheme": "HTTP", 49 | "initial_delay_seconds": 65, 50 | "period_seconds": 600, 51 | "success_threshold": 1, 52 | "timeout_seconds": 1 53 | }, 54 | "recipe_pod_autoscaling_params": { 55 | "min_replicas": 1, 56 | "max_replicas": 4 57 | }, 58 | "recipe_node_autoscaling_params": { 59 | "min_nodes": 1, 60 | "max_nodes": 2 61 | }, 62 | "input_file_system": [ 63 | { 64 | "file_system_ocid": "ocid1.filesystem.oc1.iad.aaaaaaaaaaklirslnfqwillqojxwiotjmfsc2ylefuzqaaaa", 65 | "mount_target_ocid": "ocid1.mounttarget.oc1.iad.aaaaacvipp3o7rlwnfqwillqojxwiotjmfsc2ylefuzqaaaa", 66 | "mount_location": "/models", 67 | "volume_size_in_gbs": 50 68 | } 69 | ] 70 | } 71 | -------------------------------------------------------------------------------- /docs/sample_blueprints/teams/create_job_with_team.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "healthcheck", 3 | "recipe_mode": "job", 4 | "deployment_name": "create_job_with_team", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:healthcheck_v0.3", 6 | "recipe_node_shape": 
"VM.GPU.A10.2", 7 | "recipe_use_shared_node_pool": true, 8 | "recipe_team_info": { 9 | "team_name": "randomteam" 10 | }, 11 | "output_object_storage": [ 12 | { 13 | "bucket_name": "healthcheck2", 14 | "mount_location": "/healthcheck_results", 15 | "volume_size_in_gbs": 20 16 | } 17 | ], 18 | "recipe_container_command_args": [ 19 | "--dtype", 20 | "float16", 21 | "--output_dir", 22 | "/healthcheck_results", 23 | "--expected_gpus", 24 | "A10:2,A100:0,H100:0" 25 | ], 26 | "recipe_replica_count": 1, 27 | "recipe_nvidia_gpu_count": 2, 28 | "recipe_node_pool_size": 1, 29 | "recipe_node_boot_volume_size_in_gbs": 200, 30 | "recipe_ephemeral_storage_size": 100, 31 | "recipe_shared_memory_volume_size_limit_in_mb": 1000, 32 | "recipe_container_cpu_count": 4, 33 | "recipe_container_memory_size": 20 34 | } 35 | -------------------------------------------------------------------------------- /docs/sample_blueprints/teams/create_team.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_mode": "team", 3 | "deployment_name": "create_team", 4 | "team": { 5 | "team_name": "randomteam", 6 | "priority_threshold": 100, 7 | "quotas": [ 8 | { 9 | "shape_name": "BM.GPU.H100.8", 10 | "cpu_nominal_quota": "10", 11 | "cpu_borrowing_limit": "4", 12 | "cpu_lending_limit": "4", 13 | "mem_nominal_quota": "10", 14 | "mem_borrowing_limit": "4", 15 | "mem_lending_limit": "4", 16 | "gpu_nominal_quota": "10", 17 | "gpu_borrowing_limit": "4", 18 | "gpu_lending_limit": "4" 19 | }, 20 | { 21 | "shape_name": "VM.GPU.A10.2", 22 | "cpu_nominal_quota": "10", 23 | "cpu_borrowing_limit": "4", 24 | "cpu_lending_limit": "4", 25 | "mem_nominal_quota": "10", 26 | "mem_borrowing_limit": "4", 27 | "mem_lending_limit": "4", 28 | "gpu_nominal_quota": "10", 29 | "gpu_borrowing_limit": "4", 30 | "gpu_lending_limit": "4" 31 | } 32 | ] 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /docs/sample_blueprints/using_rdma_enabled_node_pools/rdma_distributed_inference.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "llm_inference_nvidia", 3 | "recipe_mode": "service", 4 | "deployment_name": "405b", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:ray2430_vllmv083", 6 | "recipe_node_shape": "BM.GPU.H100.8", 7 | "recipe_replica_count": 1, 8 | "recipe_container_port": "8000", 9 | "recipe_nvidia_gpu_count": 8, 10 | "recipe_use_shared_node_pool": true, 11 | "multinode_rdma_enabled_in_shared_pool": true, 12 | "multinode_num_nodes_to_use_from_shared_pool": 2, 13 | "input_object_storage": [ 14 | { 15 | "par": "https://iduyx1qnmway.objectstorage.eu-frankfurt-1.oci.customer-oci.com/p/7N2O5JFirNX_CG70t-HPILzHvlTMP4FC9f_eauJVECosqNafIYxwcDwhItQHvaDK/n/iduyx1qnmway/b/llama31405binstruct/o/", 16 | "mount_location": "/models", 17 | "volume_size_in_gbs": 500 18 | } 19 | ], 20 | "recipe_container_env": [ 21 | {"key": "NCCL_DEBUG", "value": "INFO"}, 22 | {"key": "NCCL_DEBUG_SUBSYS", "value": "INIT,NET,ENV"} 23 | ], 24 | "recipe_readiness_probe_params": { 25 | "endpoint_path": "/health", 26 | "port": 8000, 27 | "initial_delay_seconds": 20, 28 | "period_seconds": 10 29 | }, 30 | "recipe_container_command_args": [ 31 | "--port", 32 | "8000", 33 | "--model", 34 | "/models", 35 | "--tensor-parallel-size", 36 | "8", 37 | "--gpu-memory-utilization", 38 | "0.90", 39 | "--pipeline-parallel-size", 40 | "2", 41 | "--distributed-executor-backend", 42 | "ray" 43 | ], 44 | 
"recipe_ephemeral_storage_size": 100, 45 | "recipe_shared_memory_volume_size_limit_in_mb": 10000 46 | } 47 | -------------------------------------------------------------------------------- /docs/sample_blueprints/using_rdma_enabled_node_pools/rdma_shared_node_pool.json: -------------------------------------------------------------------------------- 1 | { 2 | "deployment_name": "H100_rdma_pool", 3 | "recipe_mode": "shared_node_pool", 4 | "shared_node_pool_size": 2, 5 | "shared_node_pool_shape": "BM.GPU.H100.8", 6 | "shared_node_pool_boot_volume_size_in_gbs": 1000, 7 | "recipe_availability_domain": "TrcQ:EU-FRANKFURT-1-AD-3", 8 | "recipe_node_image_ocid": "ocid1.image.oc1.eu-frankfurt-1.aaaaaaaakhpy5kt3p6gjmeqbasnndemp6aetlnbkm57hohrkgksuh4476llq", 9 | "multinode_rdma_enabled_in_shared_pool": true 10 | } 11 | -------------------------------------------------------------------------------- /docs/sample_blueprints/using_rdma_enabled_node_pools/rdma_update_nodes.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_mode": "update", 3 | "deployment_name": "startupaddnode1", 4 | "recipe_node_name": "10.0.10.164", 5 | "recipe_node_labels": { 6 | "corrino": "h100pool", 7 | "corrino/pool-shared-any": "true", 8 | "corrino/rdma": "true" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /docs/usage_guide.md: -------------------------------------------------------------------------------- 1 | # Ways to Access OCI AI Blueprints 2 | 3 | Once you've installed OCI AI Blueprints into your tenancy (see [here](../INSTALLING_ONTO_EXISTING_CLUSTER_README.md) for the steps to install OCI AI Blueprints), you can work with OCI AI Blueprints three ways: 4 | 5 | ## **Option #1: OCI AI Blueprints UI Portal:** 6 | 7 | 1. Inside the OCI Console > Resource Manager, select the stack you created for OCI AI Blueprints 8 | 9 | 2. Go to the "Application Information" tab under Stack Details. 10 | 11 | 3. Copy the "Portal URL" into your browser 12 | 13 | 4. Upon first access, you must login - providing the "Admin Username" and "Admin Password" from the "Application Information" tab under Stack Details 14 | 15 | ## **Option #2: OCI AI Blueprints APIs via Web:** 16 | 17 | OCI AI Blueprint's APIs are accessible via web interface. The APIs are shown as human-friendly HTML output for each OCI AI Blueprints API resource . These pages allow for easy browsing of resources, as well as forms for submitting data to the resources using `POST`, `PUT`, and `DELETE`. 18 | 19 | 1. Inside the OCI Console > Resource Manager, select the stack you created for OCI AI Blueprints 20 | 21 | 2. Go to the "Application Information" tab under Stack Details. 22 | 23 | 3. Copy the "OCI AI Blueprints API URL" into your browser 24 | 25 | 4. Upon first access, you must login - providing the "Admin Username" and "Admin Password" from the "Application Information" tab under Stack Details 26 | 27 | 5. Now, you can view and access all API endpoints for your instance of OCI AI Blueprints 28 | 29 | ## **Option #3: OCI AI Blueprints APIs via Curl/Postman** 30 | 31 | You can interact with the APIs locally using Postman, curl or any API platform by doing the following: 32 | 33 | 1. Get your `OCI AI Blueprints API URL` (will reference this as **API URL** going forward), `Admin Usernmae` (will reference this as **username** going forward) and `Admin Password` (will reference this as **password** going forward) by following steps 1 - 3 above in Option #1 34 | 2. 
Once you have your username, password, and API URL, make a `POST` request to the `/login` API to get your auth token: 35 | 36 | ``` 37 | curl --location --request POST '<API URL>/login/' \ 38 | --header 'Authorization: Token ' \ 39 | --form 'username="<username>"' \ 40 | --form 'password="<password>"' 41 | ``` 42 | 43 | The return JSON will be in the following format: 44 | 45 | ``` 46 | { 47 | 48 | "token": "<token>", 49 | 50 | "is_new": true 51 | 52 | } 53 | ``` 54 | 55 | 3. Copy the `token` from the response 56 | 4. Now you can access any OCI AI Blueprints API by passing in this `token` for Authorization 57 | 58 | ### Curl Example 59 | 60 | ``` 61 | curl --location --request GET '<API URL>/oci_shapes/' \ 62 | --header 'Authorization: Token <token>' 63 | ``` 64 | 65 | ### Postman 66 | 67 | 1. Click on the Authorization tab for the request 68 | 2. Select Auth Type = OAuth 2.0 69 | 3. Paste your token value 70 | 4. Leave Header Prefix as "Token" 71 | 72 | ## **API Reference Documentation** 73 | 74 | [API Reference Documentation](./api_documentation.md) 75 | -------------------------------------------------------------------------------- /docs/versions/PortalVersions.md: -------------------------------------------------------------------------------- 1 | # Portal Versions 2 | 3 |
4 | v5.0.0 5 | 6 | **React & Framework** 7 | 8 | - react: ^19.0.0 9 | - react-dom: ^19.0.0 10 | - next: 15.2.0-canary.74 11 | - next-themes: ^0.4.4 12 | 13 | **UI Primitives (@radix-ui)** 14 | 15 | - @radix-ui/react-alert-dialog: ^1.1.11 16 | - @radix-ui/react-dialog: ^1.1.6 17 | - @radix-ui/react-label: ^2.1.2 18 | - @radix-ui/react-popover: ^1.1.6 19 | - @radix-ui/react-scroll-area: ^1.2.4 20 | - @radix-ui/react-slot: ^1.2.0 21 | - @radix-ui/react-tabs: ^1.1.3 22 | - @radix-ui/react-tooltip: ^1.1.8 23 | 24 | **Components & Utilities** 25 | 26 | - class-variance-authority: ^0.7.1 27 | - clsx: ^2.1.1 28 | - js-cookie: ^3.0.5 29 | - lucide-react: ^0.476.0 30 | - react-markdown: ^10.1.0 31 | - sonner: ^2.0.1 32 | - tailwind-merge: ^3.0.2 33 | - tailwindcss-animate: ^1.0.7 34 | 35 | **OCI SDK** 36 | 37 | - oci-common: ^2.104.0 38 | - oci-core: ^2.104.0 39 | - oci-identity: ^2.104.0 40 | - oci-objectstorage: ^2.104.0 41 | 42 | --- 43 | 44 | **DevDependencies** 45 | 46 | **Build & Lint** 47 | 48 | - typescript: ^5 49 | - tailwindcss: ^4 50 | - @tailwindcss/postcss: ^4 51 | - @tailwindcss/typography: ^0.5.16 52 | - eslint: ^9 53 | - @eslint/eslintrc: ^3 54 | - eslint-config-next: 15.2.0-canary.74 55 | 56 | **Type Definitions** 57 | 58 | - @types/node: ^20 59 | - @types/react: ^19 60 | - @types/react-dom: ^19 61 | - @types/js-cookie: ^3.0.6 62 | 63 |
64 | 65 |
66 | v1.0.1 67 | 68 | - react: ^19.0.0 69 | - react-dom: ^19.0.0 70 | - next: 15.2.0-canary.74 71 | - @radix-ui/react-alert-dialog: ^1.1.11 72 | - @radix-ui/react-dialog: ^1.1.6 73 | - @radix-ui/react-label: ^2.1.2 74 | - @radix-ui/react-popover: ^1.1.6 75 | - @radix-ui/react-scroll-area: ^1.2.4 76 | - @radix-ui/react-slot: ^1.2.0 77 | - @radix-ui/react-tabs: ^1.1.3 78 | - @radix-ui/react-tooltip: ^1.1.8 79 | - class-variance-authority: ^0.7.1 80 | - clsx: ^2.1.1 81 | - js-cookie: ^3.0.5 82 | - lucide-react: ^0.476.0 83 | - next-themes: ^0.4.4 84 | - oci-common: ^2.104.0 85 | - oci-core: ^2.104.0 86 | - oci-identity: ^2.104.0 87 | - oci-objectstorage: ^2.104.0 88 | - react-markdown: ^10.1.0 89 | - sonner: ^2.0.1 90 | - tailwind-merge: ^3.0.2 91 | - tailwindcss-animate: ^1.0.7 92 | - @eslint/eslintrc: ^3 93 | - @tailwindcss/postcss: ^4 94 | - @tailwindcss/typography: ^0.5.16 95 | - @types/js-cookie: ^3.0.6 96 | - @types/node: ^20 97 | - @types/react: ^19 98 | - @types/react-dom: ^19 99 | - eslint: ^9 100 | - eslint-config-next: 15.2.0-canary.74 101 | - tailwindcss: ^4 102 | - typescript: ^5 103 | 104 |
105 | -------------------------------------------------------------------------------- /docs/versions/README.md: -------------------------------------------------------------------------------- 1 | # Software Versions 2 | 3 | Each link provides software versions for tools utilized in the various components of the software managed by Blueprints: 4 | 5 | - [OCI AI Blueprints Quickstart Software Versions](./QuickStartVersions.md) 6 | - [Blueprints Control Plane Software Versions](./ControlPlaneVersions.md) 7 | - [Blueprints Portal Software Versions](./PortalVersions.md) 8 | -------------------------------------------------------------------------------- /docs/whisper_transcription/docs/Whisper_Architecture.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-ai-blueprints/31c64899055982e73cb2c1b95345faafe6d5c7ff/docs/whisper_transcription/docs/Whisper_Architecture.pdf -------------------------------------------------------------------------------- /docs/whisper_transcription/examples/test1/test.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-ai-blueprints/31c64899055982e73cb2c1b95345faafe6d5c7ff/docs/whisper_transcription/examples/test1/test.wav -------------------------------------------------------------------------------- /docs/whisper_transcription/examples/test1/test_all_transcripts_20250601_201349.txt: -------------------------------------------------------------------------------- 1 | [2025-06-01 20:13:40] Speaker 2: So, Aaron, in your email you said you wanted to talk about the exam. 2 | [2025-06-01 20:13:40] Speaker 1: Yeah, um, I've just never taken a class with so many different readings. 3 | [2025-06-01 20:13:40] Speaker 1: I've managed to keep up with all the assignments, but I'm not sure how to... 4 | [2025-06-01 20:13:45] Speaker 1: How to... 5 | [2025-06-01 20:13:40] Speaker 2: How to review everything. 6 | [2025-06-01 20:13:40] Speaker 1: Yeah, in other classes I've had, there's usually just one book to review, not three different books, plus all those other text excerpts and video... 7 | 8 | ====== Summary ====== 9 | 10 | Key points: 11 | 12 | * Speaker 1 is struggling to keep up with the readings in a class with multiple books and other materials. 13 | * Speaker 2 suggests reviewing everything to prepare for the exam. 14 | 15 | Decisions: 16 | 17 | * None made during the meeting. 18 | 19 | Action items: 20 | 21 | * Speaker 1 needs to find a strategy for reviewing all the materials and preparing for the exam. 
-------------------------------------------------------------------------------- /docs/whisper_transcription/examples/test2/video1591686795.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-ai-blueprints/31c64899055982e73cb2c1b95345faafe6d5c7ff/docs/whisper_transcription/examples/test2/video1591686795.mp4 -------------------------------------------------------------------------------- /docs/whisper_transcription/examples/test3/audio1788670787.m4a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-ai-blueprints/31c64899055982e73cb2c1b95345faafe6d5c7ff/docs/whisper_transcription/examples/test3/audio1788670787.m4a -------------------------------------------------------------------------------- /docs/whisper_transcription/whisper-transcription-A10.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "whisper transcription", 3 | "recipe_mode": "service", 4 | "deployment_name": "whisper-transcription-a10", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:whisper_transcription_v8", 6 | "recipe_node_shape": "VM.GPU.A10.2", 7 | "recipe_replica_count": 1, 8 | "recipe_container_port": "8000", 9 | "recipe_nvidia_gpu_count": 2, 10 | "recipe_node_pool_size": 1, 11 | "recipe_node_boot_volume_size_in_gbs": 200, 12 | "recipe_ephemeral_storage_size": 100, 13 | "recipe_shared_memory_volume_size_limit_in_mb": 1000 14 | } -------------------------------------------------------------------------------- /docs/whisper_transcription/whisper-transcription-A100.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "whisper transcription", 3 | "recipe_mode": "service", 4 | "deployment_name": "whisper-transcription-a100", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:whisper_transcription_v8", 6 | "recipe_node_shape": "BM.GPU.A100.8", 7 | "recipe_replica_count": 1, 8 | "recipe_container_port": "8000", 9 | "recipe_nvidia_gpu_count": 8, 10 | "recipe_node_pool_size": 1, 11 | "recipe_node_boot_volume_size_in_gbs": 200, 12 | "recipe_ephemeral_storage_size": 100, 13 | "recipe_shared_memory_volume_size_limit_in_mb": 1000 14 | } -------------------------------------------------------------------------------- /docs/whisper_transcription/whisper-transcription-H100.json: -------------------------------------------------------------------------------- 1 | { 2 | "recipe_id": "whisper transcription", 3 | "recipe_mode": "service", 4 | "deployment_name": "whisper-transcription-h100", 5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:whisper_transcription_v8", 6 | "recipe_node_shape": "BM.GPU.H100.8", 7 | "recipe_replica_count": 1, 8 | "recipe_container_port": "8000", 9 | "recipe_nvidia_gpu_count": 8, 10 | "recipe_node_pool_size": 1, 11 | "recipe_node_boot_volume_size_in_gbs": 200, 12 | "recipe_ephemeral_storage_size": 100, 13 | "recipe_shared_memory_volume_size_limit_in_mb": 1000 14 | } -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/OCI_AI_BLUEPRINTS_STACK_VERSION: -------------------------------------------------------------------------------- 1 | v1.0.2 -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/app-api.tf: 
-------------------------------------------------------------------------------- 1 | resource "kubernetes_service" "corrino_cp_service" { 2 | metadata { 3 | name = "corrino-cp" 4 | annotations = { 5 | "oci.oraclecloud.com/load-balancer-type" = "lb" 6 | "service.beta.kubernetes.io/oci-load-balancer-shape"= "flexible" 7 | } 8 | } 9 | spec { 10 | selector = { 11 | app = "corrino-cp" 12 | } 13 | port { 14 | port = 80 15 | target_port = 5000 16 | } 17 | } 18 | depends_on = [kubernetes_deployment.corrino_cp_deployment] 19 | } 20 | 21 | resource "kubernetes_deployment" "corrino_cp_deployment" { 22 | metadata { 23 | name = "corrino-cp" 24 | labels = { 25 | app = "corrino-cp" 26 | } 27 | } 28 | spec { 29 | replicas = 1 30 | 31 | strategy { 32 | type = "Recreate" 33 | } 34 | 35 | selector { 36 | match_labels = { 37 | app = "corrino-cp" 38 | } 39 | } 40 | template { 41 | metadata { 42 | labels = { 43 | app = "corrino-cp" 44 | } 45 | } 46 | spec { 47 | container { 48 | name = "corrino-cp" 49 | image = local.app.backend_image_uri 50 | image_pull_policy = "Always" 51 | 52 | dynamic "env" { 53 | for_each = local.env_universal 54 | content { 55 | name = env.value.name 56 | value = env.value.value 57 | } 58 | } 59 | 60 | dynamic "env" { 61 | for_each = local.env_app_api 62 | content { 63 | name = env.value.name 64 | value = env.value.value 65 | } 66 | } 67 | 68 | dynamic "env" { 69 | for_each = local.env_app_configmap 70 | content { 71 | name = env.value.name 72 | value_from { 73 | config_map_key_ref { 74 | name = env.value.config_map_name 75 | key = env.value.config_map_key 76 | } 77 | } 78 | } 79 | } 80 | 81 | dynamic "env" { 82 | for_each = local.env_adb_access 83 | content { 84 | name = env.value.name 85 | value = env.value.value 86 | } 87 | } 88 | 89 | dynamic "env" { 90 | for_each = local.env_adb_access_secrets 91 | content { 92 | name = env.value.name 93 | value_from { 94 | secret_key_ref { 95 | name = env.value.secret_name 96 | key = env.value.secret_key 97 | } 98 | } 99 | } 100 | } 101 | 102 | volume_mount { 103 | name = "adb-wallet-volume" 104 | mount_path = "/app/wallet" 105 | read_only = true 106 | } 107 | } 108 | volume { 109 | name = "adb-wallet-volume" 110 | secret { 111 | secret_name = "oadb-wallet" 112 | } 113 | } 114 | } 115 | } 116 | } 117 | depends_on = [kubernetes_job.corrino_migration_job] 118 | } 119 | 120 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/app-background.tf: -------------------------------------------------------------------------------- 1 | 2 | resource "kubernetes_deployment" "corrino_cp_background_deployment" { 3 | metadata { 4 | name = "corrino-cp-background" 5 | labels = { 6 | app = "corrino-cp-background" 7 | } 8 | } 9 | spec { 10 | replicas = 1 11 | selector { 12 | match_labels = { 13 | app = "corrino-cp-background" 14 | } 15 | } 16 | template { 17 | metadata { 18 | labels = { 19 | app = "corrino-cp-background" 20 | } 21 | } 22 | spec { 23 | container { 24 | name = "corrino-cp-background" 25 | image = local.app.backend_image_uri 26 | image_pull_policy = "Always" 27 | command = ["/bin/sh", "-c"] 28 | args = ["python3 manage.py runserver"] 29 | dynamic "env" { 30 | for_each = local.env_universal 31 | content { 32 | name = env.value.name 33 | value = env.value.value 34 | } 35 | } 36 | 37 | dynamic "env" { 38 | for_each = local.env_app_api_background 39 | content { 40 | name = env.value.name 41 | value = env.value.value 42 | } 43 | } 44 | 45 | dynamic "env" { 46 | for_each = local.env_app_configmap 47 | 
content { 48 | name = env.value.name 49 | value_from { 50 | config_map_key_ref { 51 | name = env.value.config_map_name 52 | key = env.value.config_map_key 53 | } 54 | } 55 | } 56 | } 57 | 58 | dynamic "env" { 59 | for_each = local.env_adb_access 60 | content { 61 | name = env.value.name 62 | value = env.value.value 63 | } 64 | } 65 | 66 | dynamic "env" { 67 | for_each = local.env_adb_access_secrets 68 | content { 69 | name = env.value.name 70 | value_from { 71 | secret_key_ref { 72 | name = env.value.secret_name 73 | key = env.value.secret_key 74 | } 75 | } 76 | } 77 | } 78 | 79 | volume_mount { 80 | name = "adb-wallet-volume" 81 | mount_path = "/app/wallet" 82 | read_only = true 83 | } 84 | } 85 | volume { 86 | name = "adb-wallet-volume" 87 | secret { 88 | secret_name = "oadb-wallet" 89 | } 90 | } 91 | } 92 | } 93 | } 94 | depends_on = [kubernetes_job.corrino_migration_job] 95 | } 96 | 97 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/app-blueprint-portal.tf: -------------------------------------------------------------------------------- 1 | resource "kubernetes_service" "oci_ai_blueprints_portal_service" { 2 | metadata { 3 | name = "oci-ai-blueprints-portal" 4 | annotations = { 5 | "oci.oraclecloud.com/load-balancer-type" = "lb" 6 | "service.beta.kubernetes.io/oci-load-balancer-shape" = "flexible" 7 | } 8 | } 9 | spec { 10 | selector = { 11 | app = "oci-ai-blueprints-portal" 12 | } 13 | port { 14 | port = 80 15 | target_port = 3000 16 | } 17 | } 18 | depends_on = [kubernetes_deployment.oci_ai_blueprints_portal_deployment] 19 | } 20 | 21 | resource "kubernetes_deployment" "oci_ai_blueprints_portal_deployment" { 22 | metadata { 23 | name = "oci-ai-blueprints-portal" 24 | labels = { 25 | app = "oci-ai-blueprints-portal" 26 | } 27 | } 28 | spec { 29 | replicas = 1 30 | selector { 31 | match_labels = { 32 | app = "oci-ai-blueprints-portal" 33 | } 34 | } 35 | template { 36 | metadata { 37 | labels = { 38 | app = "oci-ai-blueprints-portal" 39 | } 40 | } 41 | spec { 42 | container { 43 | name = "oci-ai-blueprints-portal" 44 | image = local.app.blueprint_portal_image_uri 45 | image_pull_policy = "Always" 46 | 47 | dynamic "env" { 48 | for_each = local.env_universal 49 | content { 50 | name = env.value.name 51 | value = env.value.value 52 | } 53 | } 54 | 55 | dynamic "env" { 56 | for_each = local.env_app_configmap 57 | content { 58 | name = env.value.name 59 | value_from { 60 | config_map_key_ref { 61 | name = env.value.config_map_name 62 | key = env.value.config_map_key 63 | } 64 | } 65 | } 66 | } 67 | } 68 | } 69 | } 70 | } 71 | depends_on = [kubernetes_deployment.corrino_cp_deployment] 72 | } 73 | 74 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/app-configmap.tf: -------------------------------------------------------------------------------- 1 | resource "kubernetes_config_map" "corrino-configmap" { 2 | metadata { 3 | name = "corrino-configmap" 4 | } 5 | 6 | 7 | data = { 8 | APP_IMAGE_URI = local.app.backend_image_uri 9 | ADDON_GRAFANA_TOKEN = local.addon.grafana_token 10 | ADDON_GRAFANA_USER = local.addon.grafana_user 11 | BACKEND_SERVICE_NAME = local.app.backend_service_name 12 | COMPARTMENT_ID = local.oci.compartment_id 13 | CONTROL_PLANE_VERSION = var.stack_version 14 | RELEASE_VERSION = var.stack_version 15 | DJANGO_ALLOWED_HOSTS = local.django.allowed_hosts 16 | DJANGO_CSRF_TRUSTED_ORIGINS = local.django.csrf_trusted_origins 17 | DJANGO_SECRET = 
local.django.secret 18 | FRONTEND_HTTPS_FLAG = local.app.https_flag 19 | IMAGE_REGISTRY_BASE_URI = local.ocir.base_uri 20 | LOGGING_LEVEL = local.django.logging_level 21 | NAMESPACE_NAME = local.oci.namespace_name 22 | OKE_CLUSTER_ID = local.oci.oke_cluster_id 23 | OKE_NODE_SUBNET_ID = local.network.oke_node_subnet_id 24 | PUBLIC_ENDPOINT_BASE = local.fqdn.name 25 | RECIPE_BUCKET_NAME = local.app.recipe_bucket_name 26 | RECIPE_VALIDATION_ENABLED = local.app.recipe_validation_enabled 27 | RECIPE_VALIDATION_SHAPE_AVAILABILITY_ENABLED = local.app.recipe_validation_shape_availability_enabled 28 | REGION_NAME = local.oci.region_name 29 | TENANCY_ID = local.oci.tenancy_id 30 | TENANCY_NAMESPACE = local.oci.tenancy_namespace 31 | DATA_UPLOAD_PATH = var.share_data_with_corrino_team_enabled ? local.registration.upload_path : "" 32 | DEPLOYMENT_UUID = random_uuid.registration_id.result 33 | DATA_SHARING_ENABLED = var.share_data_with_corrino_team_enabled ? "True" : "False" 34 | BLUEPRINTS_OBJECT_STORAGE_URL = local.app.blueprints_object_storage_url 35 | PORTAL_DEMO_FLAG = local.app.portal_demo_flag 36 | SHARED_NODE_POOL_BLUEPRINTS_OBJECT_STORAGE_URL = local.app.shared_node_pool_blueprints_object_storage_url 37 | SHARED_NODE_POOL_DOCUMENTATION_URL = local.app.shared_node_pool_documentation_url 38 | BLUEPRINT_DOCUMENTATION_URL = local.app.blueprint_documentation_url 39 | PROMETHEUS_NAMESPACE = local.third_party_namespaces.prometheus_namespace 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/app-migration.tf: -------------------------------------------------------------------------------- 1 | resource "kubernetes_job" "corrino_migration_job" { 2 | metadata { 3 | name = "corrino-migration-job" 4 | } 5 | spec { 6 | template { 7 | metadata {} 8 | spec { 9 | 10 | container { 11 | name = "corrino-migration-job" 12 | image = local.app.backend_image_uri 13 | image_pull_policy = "Always" 14 | command = ["/bin/sh", "-c"] 15 | args = [ 16 | "pwd; ls -al; uname -a; whoami; python3 manage.py print_settings; python3 manage.py makemigrations; python3 manage.py migrate" 17 | ] 18 | 19 | dynamic "env" { 20 | for_each = local.env_universal 21 | content { 22 | name = env.value.name 23 | value = env.value.value 24 | } 25 | } 26 | 27 | dynamic "env" { 28 | for_each = local.env_app_jobs 29 | content { 30 | name = env.value.name 31 | value = env.value.value 32 | } 33 | } 34 | 35 | dynamic "env" { 36 | for_each = local.env_app_configmap 37 | content { 38 | name = env.value.name 39 | value_from { 40 | config_map_key_ref { 41 | name = env.value.config_map_name 42 | key = env.value.config_map_key 43 | } 44 | } 45 | } 46 | } 47 | 48 | dynamic "env" { 49 | for_each = local.env_adb_access 50 | content { 51 | name = env.value.name 52 | value = env.value.value 53 | } 54 | } 55 | 56 | dynamic "env" { 57 | for_each = local.env_adb_access_secrets 58 | content { 59 | name = env.value.name 60 | value_from { 61 | secret_key_ref { 62 | name = env.value.secret_name 63 | key = env.value.secret_key 64 | } 65 | } 66 | } 67 | } 68 | 69 | volume_mount { 70 | name = "adb-wallet-volume" 71 | mount_path = "/app/wallet" 72 | read_only = true 73 | } 74 | } 75 | 76 | volume { 77 | name = "adb-wallet-volume" 78 | secret { 79 | secret_name = "oadb-wallet" 80 | } 81 | } 82 | 83 | restart_policy = "Never" 84 | } 85 | } 86 | backoff_limit = 0 87 | ttl_seconds_after_finished = 120 88 | } 89 | wait_for_completion = true 90 | timeouts { 91 | create = "10m" 92 | update = "10m" 93 | 
} 94 | 95 | depends_on = [kubernetes_job.wallet_extractor_job, kubernetes_config_map.corrino-configmap] 96 | 97 | # count = var.mushop_mock_mode_all ? 0 : 1 98 | count = 1 99 | } -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/app-registration.tf: -------------------------------------------------------------------------------- 1 | 2 | resource "local_file" "registration" { 3 | content = local.registration.object_content 4 | filename = local.registration.object_filepath 5 | } 6 | 7 | # curl -X PUT --data-binary '@local_filename' unique_PAR_URL 8 | 9 | resource "null_resource" "registration" { 10 | depends_on = [kubernetes_deployment.corrino_cp_deployment, local_file.registration] 11 | triggers = { 12 | always_run = timestamp() 13 | } 14 | provisioner "local-exec" { 15 | command = <<-EOT 16 | if [ "${var.share_data_with_corrino_team_enabled}" = "true" ]; then 17 | curl -X PUT --data-binary '@${local.registration.object_filepath}' ${local.registration.upload_path}${local.registration.object_filename} 18 | else 19 | echo "1" > /tmp/opted_out && curl -X PUT --data-binary '@/tmp/opted_out' ${local.registration.upload_path}opted_out 20 | fi 21 | EOT 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/app-user.tf: -------------------------------------------------------------------------------- 1 | resource "kubernetes_job" "corrino_user_job" { 2 | metadata { 3 | name = "corrino-user-job" 4 | } 5 | spec { 6 | template { 7 | metadata {} 8 | spec { 9 | 10 | container { 11 | name = "corrino-user-job" 12 | image = local.app.backend_image_uri 13 | image_pull_policy = "Always" 14 | command = ["/bin/sh", "-c"] 15 | args = ["python3 manage.py create_superuser_if_needed"] 16 | 17 | dynamic "env" { 18 | for_each = local.env_universal 19 | content { 20 | name = env.value.name 21 | value = env.value.value 22 | } 23 | } 24 | 25 | dynamic "env" { 26 | for_each = local.env_app_user 27 | content { 28 | name = env.value.name 29 | value = env.value.value 30 | } 31 | } 32 | 33 | dynamic "env" { 34 | for_each = local.env_app_jobs 35 | content { 36 | name = env.value.name 37 | value = env.value.value 38 | } 39 | } 40 | 41 | dynamic "env" { 42 | for_each = local.env_app_configmap 43 | content { 44 | name = env.value.name 45 | value_from { 46 | config_map_key_ref { 47 | name = env.value.config_map_name 48 | key = env.value.config_map_key 49 | } 50 | } 51 | } 52 | } 53 | 54 | dynamic "env" { 55 | for_each = local.env_adb_access 56 | content { 57 | name = env.value.name 58 | value = env.value.value 59 | } 60 | } 61 | 62 | dynamic "env" { 63 | for_each = local.env_adb_access_secrets 64 | content { 65 | name = env.value.name 66 | value_from { 67 | secret_key_ref { 68 | name = env.value.secret_name 69 | key = env.value.secret_key 70 | } 71 | } 72 | } 73 | } 74 | 75 | volume_mount { 76 | name = "adb-wallet-volume" 77 | mount_path = "/app/wallet" 78 | read_only = true 79 | } 80 | } 81 | 82 | volume { 83 | name = "adb-wallet-volume" 84 | secret { 85 | secret_name = "oadb-wallet" 86 | } 87 | } 88 | 89 | restart_policy = "Never" 90 | } 91 | } 92 | backoff_limit = 0 93 | ttl_seconds_after_finished = 120 94 | } 95 | 96 | wait_for_completion = true 97 | timeouts { 98 | create = "10m" 99 | update = "10m" 100 | } 101 | 102 | depends_on = [kubernetes_job.corrino_migration_job] 103 | 104 | # count = var.mushop_mock_mode_all ? 
0 : 1 105 | } -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/data.tf: -------------------------------------------------------------------------------- 1 | 2 | 3 | data "oci_objectstorage_namespace" "ns" { 4 | compartment_id = var.compartment_ocid 5 | } 6 | 7 | data "oci_containerengine_cluster_kube_config" "oke_special" { 8 | cluster_id = var.existent_oke_cluster_id 9 | } 10 | 11 | #data "kubernetes_ingress" "corrino_cp_ingress" { 12 | # metadata { 13 | # name = local.app.backend_service_name_ingress 14 | # namespace = "default" 15 | # } 16 | # 17 | # depends_on = [module.oke-quickstart.helm_release_ingress_nginx] 18 | # count = var.ingress_nginx_enabled ? 1 : 0 19 | #} 20 | 21 | #data "kubernetes_service" "corrino_cp_service" { 22 | # metadata { 23 | # name = local.app.backend_service_name 24 | # namespace = "default" 25 | # } 26 | # depends_on = [module.oke-quickstart.helm_release_ingress_nginx] 27 | # count = var.ingress_nginx_enabled ? 1 : 0 28 | #} 29 | 30 | data "kubernetes_service" "ingress_nginx_controller_service" { 31 | metadata { 32 | name = "ingress-nginx-controller" 33 | namespace = "cluster-tools" 34 | } 35 | depends_on = [module.oke-quickstart.helm_release_ingress_nginx] 36 | count = var.ingress_nginx_enabled ? 1 : 0 37 | } 38 | 39 | data "kubernetes_secret" "grafana_password" { 40 | metadata { 41 | name = "grafana" 42 | namespace = "cluster-tools" 43 | } 44 | depends_on = [module.oke-quickstart.helm_release_grafana] 45 | count = var.grafana_enabled ? 1 : 0 46 | } 47 | 48 | data "kubernetes_namespace" "cluster_tools_namespace" { 49 | metadata { 50 | name = "cluster-tools" 51 | } 52 | depends_on = [module.oke-quickstart.cluster_tools_namespace] 53 | count = var.bring_your_own_prometheus ? 0 : 1 54 | } 55 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/helm.tf: -------------------------------------------------------------------------------- 1 | resource "helm_release" "mlflow" { 2 | name = "mlflow" 3 | repository = "https://community-charts.github.io/helm-charts" 4 | chart = "mlflow" 5 | namespace = "cluster-tools" 6 | wait = false 7 | version = "0.16.5" 8 | 9 | values = [ 10 | < __Warning__: Moved to [oracle-quickstart/terraform-oci-networking](https://github.com/oracle-quickstart/terraform-oci-networking). Sub modules with specific OCI Networking resource also available. example: [Virtual Cloud Network](https://github.com/oracle-quickstart/terraform-oci-networking/tree/main/modules/vcn). 4 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oci-policies/main.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 
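# Side note on the corrino jobs earlier in this extract (app-migration.tf and
# app-user.tf): their dynamic "env" blocks iterate over lists of objects kept in
# locals. A rough sketch of the shapes those blocks expect, using made-up names
# and values; the real lists live in the stack's locals definitions, which are
# not reproduced in this extract.
locals {
  example_env_universal = [
    { name = "EXAMPLE_PLAIN_VAR", value = "example-value" } # consumed by env { name / value }
  ]
  example_env_app_configmap = [
    { name = "EXAMPLE_CM_VAR", config_map_name = "example-configmap", config_map_key = "EXAMPLE_CM_VAR" } # consumed via config_map_key_ref
  ]
  example_env_adb_access_secrets = [
    { name = "EXAMPLE_SECRET_VAR", secret_name = "example-secret", secret_key = "example-key" } # consumed via secret_key_ref
  ]
}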
3 | # 4 | 5 | resource "oci_identity_dynamic_group" "for_policies" { 6 | name = "${local.app_name_normalized}-${local.dynamic_group_name_normalized}-${local.deploy_id}" 7 | description = "${local.app_name} ${var.dynamic_group_name} (${local.deploy_id})" 8 | compartment_id = var.tenancy_ocid 9 | matching_rule = "${var.dynamic_group_main_condition} {${join(",", var.dynamic_group_matching_rules)}}" 10 | freeform_tags = var.oci_tag_values.freeformTags 11 | defined_tags = var.oci_tag_values.definedTags 12 | 13 | provider = oci.home_region 14 | 15 | count = var.create_dynamic_group ? 1 : 0 16 | } 17 | 18 | resource "oci_identity_policy" "policies" { 19 | name = "${local.app_name_normalized}-${local.policy_name_normalized}-${local.deploy_id}" 20 | description = "${local.app_name} ${var.policy_name} (${local.deploy_id})" 21 | compartment_id = local.policy_compartment_ocid 22 | statements = var.policy_statements 23 | freeform_tags = var.oci_tag_values.freeformTags 24 | defined_tags = var.oci_tag_values.definedTags 25 | 26 | depends_on = [oci_identity_dynamic_group.for_policies] 27 | 28 | provider = oci.home_region 29 | 30 | count = var.create_policy ? 1 : 0 31 | } 32 | 33 | locals { 34 | policy_compartment_ocid = var.compartment_ocid != "" ? var.compartment_ocid : var.tenancy_ocid 35 | } -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oci-policies/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | output "dynamic_group_id" { 6 | value = try(oci_identity_dynamic_group.for_policies.0.id, null) 7 | } 8 | output "dynamic_group_name" { 9 | value = try(oci_identity_dynamic_group.for_policies.0.name, null) 10 | } 11 | output "compartment_policy_id" { 12 | value = try(oci_identity_policy.policies.0.id, null) 13 | } -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oci-policies/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | # Create Dynamic Group and Policies 6 | variable "create_dynamic_group" { 7 | default = false 8 | description = "Creates dynamic group to use with policies. Note: You need to have proper rights on the Tenancy. If you only have rights in a compartment, uncheck and ask you administrator to create the Dynamic Group for you" 9 | } 10 | variable "dynamic_group_name" { 11 | default = "Dynamic Group" 12 | description = "Name of the dynamic group. e.g.: OKE Cluster Dynamic Group => -oke-cluster-dynamic-group-" 13 | } 14 | ## Dynamic Group Matching Rules 15 | variable "dynamic_group_matching_rules" { 16 | type = list(string) 17 | default = [] 18 | description = "List of matching rules for the dynamic group. e.g.: [\"ALL {instance.compartment.id = 'ocid1.compartment.oc1..aaaaaaaaxxxxxxxxxxxxxxxx'}\", \"ALL {instance.id = 'ocid1.instance.oc1.phx.xxxxxxxx'}\"]" 19 | } 20 | variable "dynamic_group_main_condition" { 21 | default = "ANY" 22 | description = "Main condition for the dynamic group. 
e.g.: ALL, ANY" 23 | 24 | validation { 25 | condition = var.dynamic_group_main_condition == "ALL" || var.dynamic_group_main_condition == "ANY" 26 | error_message = "Sorry, but cluster visibility can only be ALL or ANY." 27 | } 28 | } 29 | # Policy 30 | variable "create_policy" { 31 | default = false 32 | description = "Creates policy. e.g.: Compartment Policies to support Cluster Autoscaler, OCI Logging datasource on Grafana; Tenancy Policies to support OCI Metrics datasource on Grafana" 33 | } 34 | variable "policy_name" { 35 | default = "Policies" 36 | description = "Name of the policy. e.g.: Compartment Policies => -compartment-policies-" 37 | } 38 | # variable "create_tenancy_policies" { 39 | # default = false 40 | # description = "Creates policies that need to reside on the tenancy. e.g.: Policies to support OCI Metrics datasource on Grafana" 41 | # } 42 | variable "compartment_ocid" { 43 | default = "" 44 | description = "Compartment OCID where the policies will be created. If not specified, the policies will be created on the Tenancy OCID" 45 | } 46 | 47 | # Compartment Policies Statements 48 | variable "policy_statements" { 49 | type = list(string) 50 | default = [] 51 | description = "List of statements for the compartment policy. e.g.: [\"Allow dynamic-group to manage instances in compartment \", \"Allow dynamic-group to use instances in compartment where ALL {instance.compartment.id = 'ocid1.compartment.oc1..aaaaaaaaxxxxxxxxxxxxxxxx', instance.id = 'ocid1.instance.oc1.phx.xxxxxxxx'}\"]" 52 | } 53 | 54 | # Deployment Details + Freeform Tags 55 | variable "oci_tag_values" { 56 | description = "Tags to be added to the resources" 57 | } 58 | 59 | # OCI Provider 60 | variable "tenancy_ocid" {} 61 | # variable "region" {} 62 | # variable "user_ocid" { default = "" } 63 | # variable "fingerprint" { default = "" } 64 | # variable "private_key_path" { default = "" } 65 | 66 | locals { 67 | app_name_normalized = substr(replace(lower(var.oci_tag_values.freeformTags.AppName), " ", "-"), 0, 6) 68 | app_name = var.oci_tag_values.freeformTags.AppName 69 | deploy_id = var.oci_tag_values.freeformTags.DeploymentID 70 | policy_compartment_OCID = var.compartment_ocid == "" ? var.tenancy_ocid : var.compartment_ocid 71 | dynamic_group_name_normalized = substr(replace(lower(var.dynamic_group_name), " ", "-"), 0, 80) 72 | policy_name_normalized = substr(replace(lower(var.policy_name), " ", "-"), 0, 80) 73 | } -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oci-policies/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 
3 | # 4 | 5 | terraform { 6 | required_version = ">= 1.1" 7 | required_providers { 8 | oci = { 9 | source = "oracle/oci" 10 | version = "~> 4, < 5" 11 | # https://registry.terraform.io/providers/oracle/oci/ 12 | configuration_aliases = [oci.home_region] 13 | } 14 | local = { 15 | source = "hashicorp/local" 16 | version = "~> 2" 17 | # https://registry.terraform.io/providers/hashicorp/local/ 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oci-vault-kms/main.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, 2022 Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | ##************************************************************************** 6 | ## OCI KMS Vault 7 | ##************************************************************************** 8 | 9 | ### OCI Vault vault 10 | resource "oci_kms_vault" "oke_vault" { 11 | compartment_id = var.oke_cluster_compartment_ocid 12 | display_name = "${local.vault_display_name} - ${local.deploy_id}" 13 | vault_type = local.vault_type[0] 14 | freeform_tags = var.oci_tag_values.freeformTags 15 | defined_tags = var.oci_tag_values.definedTags 16 | 17 | # depends_on = [oci_identity_policy.kms_user_group_compartment_policies] 18 | 19 | count = var.use_encryption_from_oci_vault ? (var.create_new_encryption_key ? 1 : 0) : 0 20 | } 21 | ### OCI Vault key 22 | resource "oci_kms_key" "oke_key" { 23 | compartment_id = var.oke_cluster_compartment_ocid 24 | display_name = "${local.vault_key_display_name} - ${local.deploy_id}" 25 | management_endpoint = oci_kms_vault.oke_vault[0].management_endpoint 26 | protection_mode = local.vault_key_protection_mode 27 | freeform_tags = var.oci_tag_values.freeformTags 28 | defined_tags = var.oci_tag_values.definedTags 29 | 30 | key_shape { 31 | algorithm = local.vault_key_key_shape_algorithm 32 | length = local.vault_key_key_shape_length 33 | } 34 | 35 | count = var.use_encryption_from_oci_vault ? (var.create_new_encryption_key ? 1 : 0) : 0 36 | } 37 | 38 | ### Vault and Key definitions 39 | locals { 40 | vault_display_name = "OKE Vault" 41 | vault_key_display_name = "OKE Key" 42 | vault_key_key_shape_algorithm = "AES" 43 | vault_key_key_shape_length = 32 44 | vault_type = ["DEFAULT", "VIRTUAL_PRIVATE"] 45 | vault_key_protection_mode = "SOFTWARE" # HSM or SOFTWARE 46 | oci_vault_key_id = var.use_encryption_from_oci_vault ? (var.create_new_encryption_key ? oci_kms_key.oke_key[0].id : var.existent_encryption_key_id) : "void" 47 | } -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oci-vault-kms/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | output "oci_vault_key_id" { 6 | value = var.use_encryption_from_oci_vault ? (var.create_new_encryption_key ? 
oci_kms_key.oke_key[0].id : var.existent_encryption_key_id) : null 7 | } -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oci-vault-kms/providers.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | terraform { 6 | required_version = ">= 1.1" 7 | required_providers { 8 | oci = { 9 | source = "oracle/oci" 10 | version = "~> 4" 11 | # https://registry.terraform.io/providers/oracle/oci/ 12 | configuration_aliases = [oci.home_region] 13 | } 14 | local = { 15 | source = "hashicorp/local" 16 | version = "~> 2" 17 | # https://registry.terraform.io/providers/hashicorp/local/ 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oci-vault-kms/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, 2022, Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | # OKE Encryption details 6 | variable "use_encryption_from_oci_vault" { 7 | default = false 8 | description = "By default, Oracle manages the keys that encrypts Kubernetes Secrets at Rest in Etcd, but you can choose a key from a vault that you have access to, if you want greater control over the key's lifecycle and how it's used" 9 | } 10 | variable "create_new_encryption_key" { 11 | default = false 12 | description = "Creates new vault and key on OCI Vault/Key Management/KMS and assign to boot volume of the worker nodes" 13 | } 14 | variable "existent_encryption_key_id" { 15 | default = "" 16 | description = "Use an existent master encryption key to encrypt boot volume and object storage bucket. NOTE: If the key resides in a different compartment or in a different tenancy, make sure you have the proper policies to access, or the provision of the worker nodes will fail" 17 | } 18 | 19 | # Deployment Details + Freeform Tags 20 | variable "oci_tag_values" { 21 | description = "Tags to be added to the resources" 22 | } 23 | 24 | # OKE Variables 25 | variable "oke_cluster_compartment_ocid" { 26 | description = "Compartment OCID used by the OKE Cluster" 27 | type = string 28 | } 29 | 30 | # Policies variables 31 | variable "create_vault_policies_for_group" { 32 | default = false 33 | description = "Creates policies to allow the user applying the stack to manage vault and keys. If you are on the Administrators group or already have the policies for a compartment, this policy is not needed. If you do not have access to allow the policy, ask your administrator to include it for you" 34 | } 35 | variable "user_admin_group_for_vault_policy" { 36 | default = "Administrators" 37 | description = "User Identity Group to allow manage vault and keys. The user running the Terraform scripts or Applying the ORM Stack need to be on this group" 38 | } 39 | ## Create Dynamic Group and Policies 40 | variable "create_dynamic_group_for_nodes_in_compartment" { 41 | default = false 42 | description = "Creates dynamic group of Nodes in the compartment. Note: You need to have proper rights on the Tenancy. 
If you only have rights in a compartment, uncheck and ask you administrator to create the Dynamic Group for you" 43 | } 44 | variable "create_compartment_policies" { 45 | default = false 46 | description = "Creates policies for KMS that will reside on the compartment." 47 | } 48 | 49 | # OCI Provider 50 | variable "tenancy_ocid" {} 51 | 52 | # Conditional locals 53 | locals { 54 | app_dynamic_group = (var.use_encryption_from_oci_vault && var.create_dynamic_group_for_nodes_in_compartment) ? oci_identity_dynamic_group.app_dynamic_group.0.name : "void" 55 | app_name_normalized = substr(replace(lower(var.oci_tag_values.freeformTags.AppName), " ", "-"), 0, 6) 56 | app_name = var.oci_tag_values.freeformTags.AppName 57 | deploy_id = var.oci_tag_values.freeformTags.DeploymentID 58 | } -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oke-cluster-autoscaler/datasources.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | # Gets supported Kubernetes versions for node pools 6 | # data "oci_containerengine_node_pool_option" "node_pool" { 7 | # node_pool_option_id = "all" 8 | # } 9 | # data "oci_containerengine_node_pool_option" "node_pool" { 10 | # node_pool_option_id = var.existent_oke_cluster_id 11 | # } 12 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oke-cluster-autoscaler/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | # OKE Variables 6 | ## OKE Autoscaler 7 | # variable "cluster_autoscaler_enabled" { 8 | # default = true 9 | # description = "Enables OKE cluster autoscaler. Node pools will auto scale based on the resources usage" 10 | # } 11 | variable "cluster_autoscaler_supported_k8s_versions" { 12 | type = map(string) 13 | 14 | default = { "1.22" = "1.22.2-4", "1.23" = "1.23.0-4", "1.24" = "1.24.0-5", "1.25" = "1.25.0-6" } # There's no API to get that list. Need to be updated manually 15 | description = "Supported Kubernetes versions for OKE cluster autoscaler" 16 | } 17 | variable "custom_cluster_autoscaler_image" { 18 | default = "" 19 | description = "Custom Image for OKE cluster autoscaler" 20 | } 21 | variable "cluster_autoscaler_log_level_verbosity" { 22 | default = 4 23 | description = "Log level verbosity for OKE cluster autoscaler" 24 | } 25 | variable "cluster_autoscaler_max_node_provision_time" { 26 | default = "25m" 27 | description = "Maximum time in minutes for a node to be provisioned. 
If the node is not ready after this time, it will be deleted and recreated" 28 | } 29 | variable "cluster_autoscaler_scale_down_delay_after_add" { 30 | default = "10m" 31 | description = "Time to wait after scale up before attempting to scale down" 32 | } 33 | variable "cluster_autoscaler_scale_down_unneeded_time" { 34 | default = "10m" 35 | description = "Time after which a node should be deleted after it has been unneeded for this long" 36 | } 37 | variable "cluster_autoscaler_unremovable_node_recheck_timeout" { 38 | default = "5m" 39 | description = "Time after which a node which failed to be removed is retried" 40 | } 41 | variable "cluster_autoscaler_num_of_replicas" { 42 | default = 3 43 | description = "Number of replicas for OKE cluster autoscaler" 44 | } 45 | variable "cluster_autoscaler_extra_args" { 46 | default = [] 47 | description = "Extra arguments to pass to OKE cluster autoscaler" 48 | } 49 | 50 | ## OKE Node Pool Details 51 | variable "oke_node_pools" { 52 | type = list(any) 53 | 54 | default = [] 55 | description = "Node pools (id, min_nodes, max_nodes, k8s_version) to use with Cluster Autoscaler" 56 | } 57 | 58 | # OCI Provider 59 | variable "region" {} 60 | 61 | # Get OKE options 62 | # locals { 63 | # node_pool_k8s_latest_version = reverse(sort(data.oci_containerengine_node_pool_option.node_pool.kubernetes_versions))[0] 64 | # } -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oke-cluster-autoscaler/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | terraform { 6 | required_version = ">= 1.1" 7 | required_providers { 8 | oci = { 9 | source = "oracle/oci" 10 | version = "~> 4, < 5" 11 | # https://registry.terraform.io/providers/oracle/oci/ 12 | } 13 | kubernetes = { 14 | source = "hashicorp/kubernetes" 15 | version = "~> 2" 16 | # https://registry.terraform.io/providers/hashicorp/kubernetes/ 17 | } 18 | local = { 19 | source = "hashicorp/local" 20 | version = "~> 2" 21 | # https://registry.terraform.io/providers/hashicorp/local/ 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oke-node-pool/datasources.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 
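# Illustrative sketch only: the oke_node_pools list consumed by the cluster
# autoscaler module above holds one object per node pool. In this stack the entries
# appear to mirror the oke-node-pool module outputs shown later in this extract
# (node_pool_id, node_pool_min_nodes, node_pool_max_nodes, node_k8s_version, ...);
# a hand-built equivalent could look roughly like this, with a placeholder OCID
# and version:
locals {
  example_autoscaler_node_pools = [
    {
      node_pool_id                 = "ocid1.nodepool.oc1.iad.aaaaaaaaexampleonly"
      node_pool_min_nodes          = 1
      node_pool_max_nodes          = 10
      node_k8s_version             = "v1.25.4"
      node_pool_autoscaler_enabled = true
    }
  ]
}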
3 | # 4 | # Gets supported Kubernetes versions for node pools 5 | data "oci_containerengine_node_pool_option" "node_pool" { 6 | node_pool_option_id = var.existent_oke_cluster_id 7 | } 8 | 9 | # Gets a list of supported images based on the shape, operating_system and operating_system_version provided 10 | data "oci_core_images" "node_pool_images" { 11 | compartment_id = var.oke_cluster_compartment_ocid 12 | operating_system = var.image_operating_system 13 | operating_system_version = var.image_operating_system_version 14 | shape = var.node_pool_shape 15 | sort_by = "TIMECREATED" 16 | sort_order = "DESC" 17 | } 18 | 19 | # Gets a list of Availability Domains 20 | data "oci_identity_availability_domains" "ADs" { 21 | compartment_id = var.oke_cluster_compartment_ocid 22 | } 23 | 24 | # Gets a specific Availability Domain 25 | data "oci_identity_availability_domain" "specfic" { 26 | compartment_id = var.oke_cluster_compartment_ocid 27 | ad_number = var.node_pool_shape_specific_ad 28 | 29 | count = (var.node_pool_shape_specific_ad > 0) ? 1 : 0 30 | } 31 | 32 | # Prepare Cloud Init for Node Pool nodes 33 | data "cloudinit_config" "nodes" { 34 | gzip = true 35 | base64_encode = true 36 | 37 | part { 38 | content_type = "text/x-shellscript" 39 | content = <<EOF 40 | #!/bin/bash 41 | curl --fail -H "Authorization: Bearer Oracle" -L0 http://169.254.169.254/opc/v2/instance/metadata/oke_init_script | base64 --decode >/var/run/oke-init.sh 42 | bash /var/run/oke-init.sh ${var.node_pool_oke_init_params} 43 | EOF 44 | } 45 | 46 | dynamic "part" { 47 | for_each = var.node_pool_cloud_init_parts 48 | content { 49 | content_type = part.value["content_type"] 50 | content = part.value["content"] 51 | filename = part.value["filename"] 52 | } 53 | } 54 | } -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oke-node-pool/main.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | # File Version: 0.7.1 6 | 7 | resource "oci_containerengine_node_pool" "oke_node_pool" { 8 | cluster_id = var.oke_cluster_ocid 9 | compartment_id = var.oke_cluster_compartment_ocid 10 | kubernetes_version = local.node_k8s_version 11 | name = var.node_pool_name 12 | node_shape = var.node_pool_shape 13 | ssh_public_key = var.public_ssh_key 14 | freeform_tags = var.node_pools_tags.freeformTags 15 | defined_tags = var.node_pools_tags.definedTags 16 | 17 | node_config_details { 18 | dynamic "placement_configs" { 19 | for_each = local.node_pool_ads # data.oci_identity_availability_domains.ADs.availability_domains 20 | 21 | content { 22 | availability_domain = placement_configs.value.name 23 | subnet_id = var.nodes_subnet_id 24 | } 25 | } 26 | node_pool_pod_network_option_details { 27 | cni_type = var.cni_type 28 | max_pods_per_node = 31 29 | pod_nsg_ids = [] 30 | pod_subnet_ids = [var.vcn_native_pod_networking_subnet_ocid] 31 | } 32 | # nsg_ids = [] 33 | size = var.node_pool_min_nodes 34 | kms_key_id = var.oci_vault_key_id_oke_node_boot_volume != "" ? var.oci_vault_key_id_oke_node_boot_volume : null 35 | freeform_tags = var.worker_nodes_tags.freeformTags 36 | defined_tags = var.worker_nodes_tags.definedTags 37 | } 38 | 39 | dynamic "node_shape_config" { 40 | for_each = local.is_flexible_node_shape ?
[1] : [] 41 | content { 42 | ocpus = var.node_pool_node_shape_config_ocpus 43 | memory_in_gbs = var.node_pool_node_shape_config_memory_in_gbs 44 | } 45 | } 46 | 47 | node_source_details { 48 | source_type = "IMAGE" 49 | image_id = lookup(data.oci_core_images.node_pool_images.images[0], "id") 50 | boot_volume_size_in_gbs = var.node_pool_boot_volume_size_in_gbs 51 | } 52 | # node_eviction_node_pool_settings { 53 | # eviction_grace_duration = "PT1H" 54 | # is_force_delete_after_grace_duration = false 55 | # } 56 | node_metadata = { 57 | user_data = anytrue([var.node_pool_oke_init_params != "", var.node_pool_cloud_init_parts != []]) ? data.cloudinit_config.nodes.rendered : null 58 | } 59 | 60 | initial_node_labels { 61 | key = "name" 62 | value = var.node_pool_name 63 | } 64 | 65 | dynamic "initial_node_labels" { 66 | for_each = var.extra_initial_node_labels 67 | 68 | content { 69 | key = initial_node_labels.value.key 70 | value = initial_node_labels.value.value 71 | } 72 | } 73 | 74 | lifecycle { 75 | ignore_changes = [ 76 | node_config_details.0.size 77 | ] 78 | } 79 | 80 | count = var.create_new_node_pool ? 1 : 0 81 | } 82 | 83 | locals { 84 | # Checks if is using Flexible Compute Shapes 85 | is_flexible_node_shape = contains(split(".", var.node_pool_shape), "Flex") 86 | 87 | # Gets the latest Kubernetes version supported by the node pool 88 | node_pool_k8s_latest_version = reverse(sort(data.oci_containerengine_node_pool_option.node_pool.kubernetes_versions))[0] 89 | node_k8s_version = (var.node_k8s_version == "Latest") ? local.node_pool_k8s_latest_version : var.node_k8s_version 90 | 91 | # Get ADs for the shape to be used on the node pool 92 | node_pool_ads = (var.node_pool_shape_specific_ad > 0) ? data.oci_identity_availability_domain.specfic : data.oci_identity_availability_domains.ADs.availability_domains 93 | } 94 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oke-node-pool/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | output "node_pool_name" { 6 | value = var.create_new_node_pool ? oci_containerengine_node_pool.oke_node_pool.0.name : var.existent_oke_nodepool_id_for_autoscaler 7 | } 8 | output "node_pool_min_nodes" { 9 | value = var.node_pool_min_nodes 10 | } 11 | output "node_pool_max_nodes" { 12 | value = var.node_pool_max_nodes 13 | } 14 | output "node_pool_id" { 15 | value = var.create_new_node_pool ? oci_containerengine_node_pool.oke_node_pool.0.id : var.existent_oke_nodepool_id_for_autoscaler 16 | } 17 | output "node_k8s_version" { 18 | value = local.node_k8s_version 19 | } 20 | output "node_pool_autoscaler_enabled" { 21 | value = var.node_pool_autoscaler_enabled 22 | } -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oke-node-pool/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 
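# Illustrative sketch only: the node_pool_cloud_init_parts variable consumed by the
# cloudinit_config data source in datasources.tf is a list of objects whose keys
# match the dynamic "part" block there (content_type, content, filename). An extra
# part could be supplied roughly like this; the filename and script body are
# made-up examples.
locals {
  example_extra_cloud_init_parts = [
    {
      content_type = "text/x-shellscript"
      filename     = "10-example-node-setup.sh"
      content      = "#!/bin/bash\necho 'extra node setup step goes here'"
    }
  ]
}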
3 | # 4 | 5 | terraform { 6 | required_version = ">= 1.1" 7 | required_providers { 8 | oci = { 9 | source = "oracle/oci" 10 | version = "~> 4, < 5" 11 | # https://registry.terraform.io/providers/oracle/oci/ 12 | } 13 | local = { 14 | source = "hashicorp/local" 15 | version = "~> 2" 16 | # https://registry.terraform.io/providers/hashicorp/local/ 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oke/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 Oracle and/or its affiliates. All rights reserved. 2 | 3 | The Universal Permissive License (UPL), Version 1.0 4 | 5 | Subject to the condition set forth below, permission is hereby granted to any person obtaining a copy of this 6 | software, associated documentation and/or data (collectively the "Software"), free of charge and under any and 7 | all copyright rights in the Software, and any and all patent rights owned or freely licensable by each licensor 8 | hereunder covering either (i) the unmodified Software as contributed to or provided by such licensor, or 9 | (ii) the Larger Works (as defined below), to deal in both 10 | 11 | (a) the Software, and 12 | (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if one is included with the Software 13 | (each a “Larger Work” to which the Software is contributed by such licensors), 14 | 15 | without restriction, including without limitation the rights to copy, create derivative works of, display, 16 | perform, and distribute the Software and make, use, sell, offer for sale, import, export, have made, and have 17 | sold the Software and the Larger Work(s), and to sublicense the foregoing rights on either these or other terms. 18 | 19 | This license is subject to the following condition: 20 | The above copyright notice and either this complete permission notice or at a minimum a reference to the UPL must 21 | be included in all copies or substantial portions of the Software. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 24 | THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 25 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 26 | CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 27 | IN THE SOFTWARE. 28 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oke/README.md: -------------------------------------------------------------------------------- 1 | # Terraform OKE Submodule 2 | 3 | This module deploys an OKE Kubernetes cluster. 
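The module also exposes `kubeconfig` and `kubeconfig_for_kubectl` outputs (see `outputs.tf`). A minimal sketch of surfacing the kubeconfig from a calling configuration, assuming the module is instantiated as `module "oke"` as in the usage example below; the output name here is illustrative:

```hcl
output "oke_kubeconfig" {
  value     = module.oke.kubeconfig
  sensitive = true
}
```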
4 | 5 | ## Usage 6 | 7 | ```hcl 8 | module "oke" { 9 | source = "./modules/oke" 10 | 11 | providers = { 12 | oci = oci 13 | oci.home_region = oci.home_region 14 | } 15 | 16 | # Oracle Cloud Infrastructure Tenancy and Compartment OCID 17 | tenancy_ocid = var.tenancy_ocid 18 | compartment_ocid = local.oke_compartment_ocid 19 | region = var.region 20 | 21 | # Deployment Tags + Freeform Tags + Defined Tags 22 | cluster_tags = local.oci_tag_values 23 | load_balancers_tags = local.oci_tag_values 24 | block_volumes_tags = local.oci_tag_values 25 | 26 | # OKE Cluster 27 | ## create_new_oke_cluster 28 | create_new_oke_cluster = var.create_new_oke_cluster 29 | existent_oke_cluster_id = var.existent_oke_cluster_id 30 | 31 | ## Network Details 32 | vcn_id = module.vcn.vcn_id 33 | network_cidrs = local.network_cidrs 34 | k8s_endpoint_subnet_id = local.create_subnets ? module.subnets["oke_k8s_endpoint_subnet"].subnet_id : var.existent_oke_k8s_endpoint_subnet_ocid 35 | lb_subnet_id = local.create_subnets ? module.subnets["oke_lb_subnet"].subnet_id : var.existent_oke_load_balancer_subnet_ocid 36 | cni_type = local.cni_type 37 | ### Cluster Workers visibility 38 | cluster_workers_visibility = var.cluster_workers_visibility 39 | ### Cluster API Endpoint visibility 40 | cluster_endpoint_visibility = var.cluster_endpoint_visibility 41 | 42 | ## Control Plane Kubernetes Version 43 | k8s_version = var.k8s_version 44 | 45 | ## Create Dynamic group and Policies for Autoscaler and OCI Metrics and Logging 46 | create_dynamic_group_for_nodes_in_compartment = var.create_dynamic_group_for_nodes_in_compartment 47 | create_compartment_policies = var.create_compartment_policies 48 | 49 | ## Encryption (OCI Vault/Key Management/KMS) 50 | oci_vault_key_id_oke_secrets = module.vault.oci_vault_key_id 51 | oci_vault_key_id_oke_image_policy = module.vault.oci_vault_key_id 52 | } 53 | ``` 54 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oke/datasources.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2022 Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | data "oci_containerengine_cluster_option" "oke" { 6 | cluster_option_id = "all" 7 | } 8 | data "oci_containerengine_clusters" "oke" { 9 | compartment_id = local.oke_compartment_ocid 10 | } 11 | 12 | # Gets a list of Availability Domains 13 | data "oci_identity_availability_domains" "ADs" { 14 | compartment_id = local.oke_compartment_ocid 15 | } 16 | 17 | # Gets kubeconfig 18 | data "oci_containerengine_cluster_kube_config" "oke" { 19 | cluster_id = var.create_new_oke_cluster ? oci_containerengine_cluster.oke_cluster[0].id : var.existent_oke_cluster_id 20 | } 21 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oke/main.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, 2022, Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | resource "oci_containerengine_cluster" "oke_cluster" { 6 | compartment_id = local.oke_compartment_ocid 7 | kubernetes_version = (var.k8s_version == "Latest") ? 
local.cluster_k8s_latest_version : var.k8s_version 8 | name = "${local.app_name} (${local.deploy_id})" 9 | vcn_id = var.vcn_id 10 | kms_key_id = var.oci_vault_key_id_oke_secrets != "" ? var.oci_vault_key_id_oke_secrets : null 11 | # type = var.cluster_type 12 | freeform_tags = var.cluster_tags.freeformTags 13 | defined_tags = var.cluster_tags.definedTags 14 | 15 | endpoint_config { 16 | is_public_ip_enabled = (var.cluster_endpoint_visibility == "Private") ? false : true 17 | subnet_id = var.k8s_endpoint_subnet_id 18 | nsg_ids = [] 19 | } 20 | options { 21 | service_lb_subnet_ids = [var.lb_subnet_id] 22 | add_ons { 23 | is_kubernetes_dashboard_enabled = var.cluster_options_add_ons_is_kubernetes_dashboard_enabled 24 | is_tiller_enabled = false # Default is false, left here for reference 25 | } 26 | admission_controller_options { 27 | is_pod_security_policy_enabled = var.cluster_options_admission_controller_options_is_pod_security_policy_enabled 28 | } 29 | kubernetes_network_config { 30 | services_cidr = lookup(var.network_cidrs, "KUBERNETES-SERVICE-CIDR") 31 | pods_cidr = lookup(var.network_cidrs, "PODS-CIDR") 32 | } 33 | persistent_volume_config { 34 | freeform_tags = var.block_volumes_tags.freeformTags 35 | # defined_tags = var.block_volumes_tags.definedTags 36 | } 37 | service_lb_config { 38 | freeform_tags = var.load_balancers_tags.freeformTags 39 | # defined_tags = var.load_balancers_tags.definedTags 40 | } 41 | } 42 | image_policy_config { 43 | is_policy_enabled = false 44 | # key_details { 45 | # # kms_key_id = var.oci_vault_key_id_oke_image_policy != "" ? var.oci_vault_key_id_oke_image_policy : null 46 | # } 47 | } 48 | cluster_pod_network_options { 49 | cni_type = var.cni_type 50 | } 51 | 52 | lifecycle { 53 | ignore_changes = [freeform_tags, defined_tags, kubernetes_version, id] 54 | } 55 | 56 | count = var.create_new_oke_cluster ? 1 : 0 57 | } 58 | 59 | # Local kubeconfig for when using Terraform locally. Not used by Oracle Resource Manager 60 | resource "local_file" "oke_kubeconfig" { 61 | content = data.oci_containerengine_cluster_kube_config.oke.content 62 | filename = "${path.root}/generated/kubeconfig" 63 | file_permission = "0644" 64 | } 65 | 66 | # Get OKE options 67 | locals { 68 | cluster_k8s_latest_version = reverse(sort(data.oci_containerengine_cluster_option.oke.kubernetes_versions))[0] 69 | deployed_k8s_version = var.create_new_oke_cluster ? ((var.k8s_version == "Latest") ? local.cluster_k8s_latest_version : var.k8s_version) : [ 70 | for x in data.oci_containerengine_clusters.oke.clusters : x.kubernetes_version if x.id == var.existent_oke_cluster_id][0] 71 | } 72 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oke/oke-orm-private-endpoint.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | ### Important Notice ### 6 | # OCI Resource Manager Private Endpoint is only available when using Resource Manager. 7 | # If you use local Terraform, you will need to setup an OCI Bastion for connectivity to the Private OKE. 8 | # If using OCI CloudShell, you need to activate the OCI Private Endpoint for OCI CLoud Shell. 
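# Side note (illustrative only) on the "Latest" handling in main.tf above: versions
# are compared as plain strings, so the lexicographically greatest entry wins.
# The values below are made up for the example.
locals {
  example_k8s_versions       = ["v1.26.7", "v1.27.2", "v1.28.2"]
  example_latest_k8s_version = reverse(sort(local.example_k8s_versions))[0] # => "v1.28.2"
}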
9 | 10 | resource "oci_resourcemanager_private_endpoint" "private_kubernetes_endpoint" { 11 | compartment_id = local.oke_compartment_ocid 12 | display_name = "Private Endpoint for OKE ${local.app_name} - ${local.deploy_id}" 13 | description = "Resource Manager Private Endpoint for OKE for the ${local.app_name} - ${local.deploy_id}" 14 | vcn_id = var.vcn_id 15 | subnet_id = var.k8s_endpoint_subnet_id 16 | freeform_tags = var.cluster_tags.freeformTags 17 | defined_tags = var.cluster_tags.definedTags 18 | 19 | count = var.create_new_oke_cluster ? ((var.cluster_endpoint_visibility == "Private") ? 1 : 0) : 0 20 | } 21 | 22 | # Resolves the private IP of the customer's private endpoint to a NAT IP. 23 | data "oci_resourcemanager_private_endpoint_reachable_ip" "private_kubernetes_endpoint" { 24 | private_endpoint_id = var.create_new_oke_cluster ? oci_resourcemanager_private_endpoint.private_kubernetes_endpoint[0].id : var.existent_oke_cluster_private_endpoint 25 | private_ip = trimsuffix(oci_containerengine_cluster.oke_cluster[0].endpoints.0.private_endpoint, ":6443") # TODO: Pending rule when has existent cluster 26 | 27 | count = (var.cluster_endpoint_visibility == "Private") ? 1 : 0 28 | } -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oke/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | output "comments" { 6 | value = "The application URL will be unavailable for a few minutes after provisioning while the application is configured and deployed to Kubernetes" 7 | } 8 | output "deployed_oke_kubernetes_version" { 9 | value = local.deployed_k8s_version 10 | } 11 | output "deployed_to_region" { 12 | value = var.region 13 | } 14 | output "dev" { 15 | value = "Made with \u2764 by Oracle Developers" 16 | } 17 | output "kubeconfig" { 18 | value = data.oci_containerengine_cluster_kube_config.oke.content 19 | } 20 | output "kubeconfig_for_kubectl" { 21 | value = "export KUBECONFIG=${path.root}/generated/kubeconfig" 22 | description = "If using Terraform locally, this command set KUBECONFIG environment variable to run kubectl locally" 23 | } 24 | output "orm_private_endpoint_oke_api_ip_address" { 25 | value = (var.cluster_endpoint_visibility == "Private") ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_kubernetes_endpoint.0.ip_address : "" 26 | description = "OCI Resource Manager Private Endpoint ip address for OKE Kubernetes API Private Endpoint" 27 | 28 | depends_on = [ 29 | oci_resourcemanager_private_endpoint.private_kubernetes_endpoint 30 | ] 31 | } 32 | 33 | # OKE info 34 | output "oke_cluster_ocid" { 35 | value = var.create_new_oke_cluster ? oci_containerengine_cluster.oke_cluster[0].id : "" 36 | description = "OKE Cluster OCID" 37 | } 38 | output "oke_cluster_compartment_ocid" { 39 | value = local.oke_compartment_ocid 40 | description = "Compartment OCID used by the OKE Cluster" 41 | } 42 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/modules/oke/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. 
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | terraform { 6 | required_version = ">= 1.1" 7 | required_providers { 8 | oci = { 9 | source = "oracle/oci" 10 | version = "~> 4, < 5" 11 | # https://registry.terraform.io/providers/oracle/oci/ 12 | } 13 | local = { 14 | source = "hashicorp/local" 15 | version = "~> 2" 16 | # https://registry.terraform.io/providers/hashicorp/local/ 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | # Deployment outputs 6 | 7 | output "deploy_id" { 8 | value = local.deploy_id 9 | } 10 | 11 | # OKE Outputs 12 | output "comments" { 13 | value = module.oke.comments 14 | } 15 | output "deployed_oke_kubernetes_version" { 16 | value = module.oke.deployed_oke_kubernetes_version 17 | } 18 | output "deployed_to_region" { 19 | value = module.oke.deployed_to_region 20 | } 21 | output "kubeconfig" { 22 | value = module.oke.kubeconfig 23 | sensitive = true 24 | } 25 | output "kubeconfig_for_kubectl" { 26 | value = module.oke.kubeconfig_for_kubectl 27 | description = "If using Terraform locally, this command set KUBECONFIG environment variable to run kubectl locally" 28 | } 29 | output "oke_cluster_ocid" { 30 | value = module.oke.oke_cluster_ocid 31 | } 32 | output "oke_node_pools" { 33 | value = module.oke_node_pools 34 | } 35 | output "subnets" { 36 | value = module.subnets 37 | } 38 | 39 | output "dev" { 40 | value = module.oke.dev 41 | } 42 | ### Important Security Notice ### 43 | # The private key generated by this resource will be stored unencrypted in your Terraform state file. 44 | # Use of this resource for production deployments is not recommended. 45 | # Instead, generate a private key file outside of Terraform and distribute it securely to the system where Terraform will be run. 46 | output "generated_private_key_pem" { 47 | value = var.generate_public_ssh_key ? tls_private_key.oke_worker_node_ssh_key.private_key_pem : "No Keys Auto Generated" 48 | sensitive = true 49 | } 50 | 51 | output "cluster_tools_namespace" { 52 | value = module.cluster-tools.cluster_tools_namespace 53 | } 54 | 55 | output "helm_release_ingress_nginx" { 56 | value = module.cluster-tools.helm_release_ingress_nginx 57 | } 58 | 59 | output "helm_release_grafana" { 60 | value = module.cluster-tools.helm_release_grafana 61 | } -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/policies.tf: -------------------------------------------------------------------------------- 1 | ## Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. 2 | ## Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 
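## Illustrative sketch only (not part of this module): a root configuration that
## calls the corrino module could persist the auto-generated worker SSH key surfaced
## by the generated_private_key_pem output above. The resource and filename below
## are assumptions (the root module block is named "oke-quickstart" in
## oci_ai_blueprints_terraform/oke.tf); note the output's own security warning that
## the key also lives unencrypted in Terraform state.
resource "local_sensitive_file" "example_worker_ssh_key" {
  content         = module.oke-quickstart.generated_private_key_pem
  filename        = "${path.root}/generated/example_oke_worker_key.pem"
  file_permission = "0600"
}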
3 | ## 4 | # 5 | #module "cluster-dynamic-group" { 6 | # source = "./modules/oci-policies" 7 | # 8 | # providers = { 9 | # oci = oci 10 | # oci.home_region = oci.home_region 11 | # } 12 | # 13 | # # Oracle Cloud Infrastructure Tenancy 14 | # tenancy_ocid = var.tenancy_ocid 15 | # 16 | # # Deployment Tags + Freeform Tags + Defined Tags 17 | # oci_tag_values = local.oci_tag_values 18 | # 19 | # create_dynamic_group = true 20 | # dynamic_group_name = "OKE Cluster Nodes" 21 | # dynamic_group_matching_rules = [ 22 | # "ALL {instance.compartment.id = '${local.oke_compartment_ocid}'}", 23 | # "ALL {resource.type = 'cluster', resource.compartment.id = '${local.oke_compartment_ocid}'}" 24 | # ] 25 | # 26 | # count = var.create_dynamic_group_for_nodes_in_compartment ? 1 : 0 27 | #} 28 | # 29 | #module "cluster-compartment-policies" { 30 | # source = "./modules/oci-policies" 31 | # 32 | # providers = { 33 | # oci = oci 34 | # oci.home_region = oci.home_region 35 | # } 36 | # 37 | # # Oracle Cloud Infrastructure Tenancy and Compartment OCID 38 | # tenancy_ocid = var.tenancy_ocid 39 | # compartment_ocid = local.oke_compartment_ocid 40 | # 41 | # oci_tag_values = local.oci_tag_values 42 | # 43 | # create_policy = true 44 | # policy_name = "OKE Cluster Compartment Policies" 45 | # policy_statements = [ 46 | # "Allow dynamic-group ${local.dynamic_group_name} to manage cluster-node-pools in compartment id ${local.oke_compartment_ocid}", 47 | # "Allow dynamic-group ${local.dynamic_group_name} to manage instance-family in compartment id ${local.oke_compartment_ocid}", 48 | # "Allow dynamic-group ${local.dynamic_group_name} to use subnets in compartment id ${local.oke_compartment_ocid}", 49 | # "Allow dynamic-group ${local.dynamic_group_name} to read virtual-network-family in compartment id ${local.oke_compartment_ocid}", 50 | # "Allow dynamic-group ${local.dynamic_group_name} to use vnics in compartment id ${local.oke_compartment_ocid}", 51 | # "Allow dynamic-group ${local.dynamic_group_name} to inspect compartments in compartment id ${local.oke_compartment_ocid}", 52 | # "Allow dynamic-group ${local.dynamic_group_name} to use network-security-groups in compartment id ${local.oke_compartment_ocid}", 53 | # "Allow dynamic-group ${local.dynamic_group_name} to use private-ips in compartment id ${local.oke_compartment_ocid}", 54 | # "Allow dynamic-group ${local.dynamic_group_name} to manage public-ips in compartment id ${local.oke_compartment_ocid}" 55 | # ] 56 | # 57 | # count = var.create_compartment_policies ? 1 : 0 58 | #} 59 | # 60 | #locals { 61 | # dynamic_group_name = var.create_dynamic_group_for_nodes_in_compartment ? module.cluster-dynamic-group.0.dynamic_group_name : var.existent_dynamic_group_for_nodes_in_compartment 62 | #} -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/schema.org.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oracle-quickstart/oci-ai-blueprints/31c64899055982e73cb2c1b95345faafe6d5c7ff/oci_ai_blueprints_terraform/modules/corrino/schema.org.yaml -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/modules/corrino/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved. 
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | terraform { 6 | required_version = ">= 1.1" 7 | required_providers { 8 | oci = { 9 | source = "oracle/oci" 10 | version = "~> 4, < 5" 11 | # https://registry.terraform.io/providers/oracle/oci/ 12 | configuration_aliases = [oci.home_region] 13 | } 14 | kubernetes = { 15 | source = "hashicorp/kubernetes" 16 | version = "~> 2" 17 | # https://registry.terraform.io/providers/hashicorp/kubernetes/ 18 | } 19 | helm = { 20 | source = "hashicorp/helm" 21 | version = "~> 2" 22 | # https://registry.terraform.io/providers/hashicorp/helm/ 23 | } 24 | tls = { 25 | source = "hashicorp/tls" 26 | version = "~> 4" 27 | # https://registry.terraform.io/providers/hashicorp/tls/ 28 | } 29 | local = { 30 | source = "hashicorp/local" 31 | version = "~> 2" 32 | # https://registry.terraform.io/providers/hashicorp/local/ 33 | } 34 | random = { 35 | source = "hashicorp/random" 36 | version = "~> 3" 37 | # https://registry.terraform.io/providers/hashicorp/random/ 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/oke.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 3 | # 4 | 5 | module "oke-quickstart" { 6 | # source = "github.com/oracle-quickstart/terraform-oci-corrino?ref=0.9.0" 7 | source = "./modules/corrino" 8 | 9 | providers = { 10 | oci = oci 11 | oci.home_region = oci.home_region 12 | } 13 | 14 | # Oracle Cloud Infrastructure Tenancy and Compartment OCID 15 | tenancy_ocid = var.tenancy_ocid 16 | compartment_ocid = var.compartment_ocid 17 | region = var.region 18 | 19 | # Note: Just a few arguments are shown here to simplify the basic example. All other arguments use default values. 20 | # App Name to identify deployment. Used for naming resources. 21 | app_name = local.app_name 22 | deploy_id = local.deploy_id 23 | 24 | # Freeform Tags + Defined Tags. Tags are applied to all resources. 25 | tag_values = { "freeformTags" = { "Environment" = "Development", "DeploymentType" = "basic", "QuickstartExample" = "basic-cluster" }, "definedTags" = {} } 26 | 27 | # OKE Node Pool 1 arguments 28 | node_pool_cni_type_1 = "FLANNEL_OVERLAY" # Use "OCI_VCN_IP_NATIVE" for VCN-native pod networking. If node pool 1 uses OCI_VCN_IP_NATIVE, the cluster will also be configured with the same CNI 29 | node_pool_autoscaler_enabled_1 = true 30 | node_pool_initial_num_worker_nodes_1 = 1 # Minimum number of nodes in the node pool 31 | node_pool_max_num_worker_nodes_1 = 10 # Maximum number of nodes in the node pool 32 | node_pool_instance_shape_1 = { "instanceShape" = "VM.Standard.E4.Flex", "ocpus" = 2, "memory" = 64 } # If not using a Flex shape, ocpus and memory are ignored 33 | node_pool_boot_volume_size_in_gbs_1 = 60 34 | 35 | # VCN for OKE arguments 36 | vcn_cidr_blocks = "10.22.0.0/16" 37 | 38 | ingress_nginx_enabled = var.ingress_nginx_enabled 39 | cert_manager_enabled = var.cert_manager_enabled 40 | # Inverse - we only want to install these if the user is NOT bringing their own.
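  # For example, with bring_your_own_prometheus = true the bundled Prometheus install
  # is skipped (prometheus_enabled below becomes false) and existent_prometheus_namespace
  # is expected to point at the namespace of the existing installation.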
41 | metrics_server_enabled = !var.bring_your_own_metrics_server 42 | prometheus_enabled = !var.bring_your_own_prometheus 43 | grafana_enabled = !var.bring_your_own_grafana 44 | existent_prometheus_namespace = var.existent_prometheus_namespace 45 | 46 | create_new_oke_cluster = false 47 | existent_oke_cluster_id = var.existent_oke_cluster_id 48 | 49 | create_new_vcn = false 50 | existent_vcn_ocid = var.existent_vcn_ocid 51 | 52 | create_new_compartment_for_oke = false 53 | existent_vcn_compartment_ocid = var.compartment_ocid 54 | 55 | create_vault_policies_for_group = false 56 | 57 | create_subnets = false 58 | existent_oke_k8s_endpoint_subnet_ocid = var.existent_oke_k8s_endpoint_subnet_ocid 59 | existent_oke_nodes_subnet_ocid = var.existent_oke_nodes_subnet_ocid 60 | existent_oke_load_balancer_subnet_ocid = var.existent_oke_load_balancer_subnet_ocid 61 | # existent_oke_vcn_native_pod_networking_subnet_ocid = "" # Optional. Existent VCN Native POD Networking subnet if the CNI Type is "OCI_VCN_IP_NATIVE" 62 | 63 | } 64 | 65 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/policies.tf: -------------------------------------------------------------------------------- 1 | # Get compartment name for policy 2 | data "oci_identity_compartment" "oci_compartment" { 3 | id = var.compartment_ocid 4 | } 5 | 6 | # Define the dynamic group 7 | resource "oci_identity_dynamic_group" "dyn_group" { 8 | provider = oci.home_region 9 | name = "${local.app_name}-instance-dg" 10 | description = "Dynamic group for OKE instances across the tenancy" 11 | compartment_id = var.tenancy_ocid 12 | matching_rule = "ALL {instance.compartment.id = '${var.compartment_ocid}'}" 13 | freeform_tags = local.corrino_tags 14 | count = var.policy_creation_enabled ? 1 : 0 15 | } 16 | 17 | # Define the IAM policy 18 | resource "oci_identity_policy" "oke_instances_tenancy_policy" { 19 | provider = oci.home_region 20 | name = "${local.app_name}-dg-inst-policy" 21 | description = "Tenancy-level policy to grant needed permissions to the dynamic group" 22 | compartment_id = var.tenancy_ocid 23 | 24 | statements = [ 25 | "Allow dynamic-group 'Default'/'${oci_identity_dynamic_group.dyn_group[0].name}' to manage all-resources in compartment ${data.oci_identity_compartment.oci_compartment.name}", 26 | "Allow dynamic-group 'Default'/'${oci_identity_dynamic_group.dyn_group[0].name}' to use all-resources in tenancy", 27 | "Allow dynamic-group 'Default'/'${oci_identity_dynamic_group.dyn_group[0].name}' to {CLUSTER_JOIN} in compartment ${data.oci_identity_compartment.oci_compartment.name}", 28 | "Allow dynamic-group 'Default'/'${oci_identity_dynamic_group.dyn_group[0].name}' to manage volumes in TENANCY where request.principal.type = 'cluster'", 29 | "Allow dynamic-group 'Default'/'${oci_identity_dynamic_group.dyn_group[0].name}' to manage volume-attachments in TENANCY where request.principal.type = 'cluster'" 30 | ] 31 | freeform_tags = local.corrino_tags 32 | count = var.policy_creation_enabled ? 
1 : 0 33 | depends_on = [oci_identity_dynamic_group.dyn_group] 34 | } 35 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/random.tf: -------------------------------------------------------------------------------- 1 | resource "random_string" "generated_workspace_name" { 2 | length = 6 3 | special = false 4 | min_upper = 3 5 | min_lower = 3 6 | } 7 | 8 | resource "random_string" "generated_deployment_name" { 9 | length = 6 10 | special = false 11 | min_upper = 3 12 | min_lower = 3 13 | } 14 | 15 | resource "random_string" "corrino_django_secret" { 16 | length = 32 17 | special = true 18 | min_upper = 3 19 | min_lower = 3 20 | min_numeric = 3 21 | min_special = 3 22 | override_special = "{}#^*<>[]%~" 23 | } 24 | 25 | resource "random_string" "autonomous_database_wallet_password" { 26 | length = 16 27 | special = true 28 | min_upper = 3 29 | min_lower = 3 30 | min_numeric = 3 31 | min_special = 3 32 | override_special = "{}#^*<>[]%~" 33 | } 34 | 35 | resource "random_string" "autonomous_database_admin_password" { 36 | length = 16 37 | special = true 38 | min_upper = 3 39 | min_lower = 3 40 | min_numeric = 3 41 | min_special = 3 42 | override_special = "{}#^*<>[]%~" 43 | } 44 | 45 | resource "random_string" "subdomain" { 46 | length = 6 47 | special = false 48 | upper = false 49 | } 50 | 51 | resource "random_uuid" "registration_id" { 52 | } 53 | 54 | #resource "random_string" "registration_id" { 55 | # length = 8 56 | # special = false 57 | # upper = false 58 | #} -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/rbac.tf: -------------------------------------------------------------------------------- 1 | resource "kubernetes_cluster_role" "corrino_cluster_role" { 2 | metadata { 3 | name = "corrino-rbac" 4 | } 5 | rule { 6 | api_groups = [""] 7 | resources = ["*"] 8 | verbs = ["*"] 9 | } 10 | 11 | count = 1 12 | } 13 | 14 | resource "kubernetes_cluster_role_binding" "corrino_cluster_role_binding" { 15 | metadata { 16 | name = "corrino-rbac" 17 | } 18 | subject { 19 | kind = "ServiceAccount" 20 | name = "default" 21 | namespace = "default" 22 | } 23 | role_ref { 24 | kind = "ClusterRole" 25 | name = "cluster-admin" 26 | api_group = "rbac.authorization.k8s.io" 27 | } 28 | 29 | count = 1 30 | } 31 | -------------------------------------------------------------------------------- /oci_ai_blueprints_terraform/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved. 2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. 
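# Note on rbac.tf above: the kubernetes_cluster_role_binding there binds the built-in
# cluster-admin ClusterRole to the "default" ServiceAccount in the "default" namespace;
# the "corrino-rbac" ClusterRole defined alongside it is not referenced by that binding.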
3 | # 4 | 5 | terraform { 6 | required_version = ">= 1.1" 7 | required_providers { 8 | oci = { 9 | source = "oracle/oci" 10 | version = "~> 4, < 5" 11 | # https://registry.terraform.io/providers/oracle/oci/ 12 | configuration_aliases = [oci.home_region] 13 | } 14 | kubernetes = { 15 | source = "hashicorp/kubernetes" 16 | version = "~> 2" 17 | # https://registry.terraform.io/providers/hashicorp/kubernetes/ 18 | } 19 | helm = { 20 | source = "hashicorp/helm" 21 | version = "~> 2" 22 | # https://registry.terraform.io/providers/hashicorp/helm/ 23 | } 24 | tls = { 25 | source = "hashicorp/tls" 26 | version = "~> 4" 27 | # https://registry.terraform.io/providers/hashicorp/tls/ 28 | } 29 | local = { 30 | source = "hashicorp/local" 31 | version = "~> 2" 32 | # https://registry.terraform.io/providers/hashicorp/local/ 33 | } 34 | random = { 35 | source = "hashicorp/random" 36 | version = "~> 3" 37 | # https://registry.terraform.io/providers/hashicorp/random/ 38 | } 39 | } 40 | } 41 | --------------------------------------------------------------------------------
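A minimal sketch of root-level inputs for the `oci_ai_blueprints_terraform` stack when installing onto an existing OKE cluster, using the variable names referenced above in `oke.tf`, `policies.tf`, and `app-registration.tf`. All OCIDs are placeholders, and the authoritative list of inputs lives in the stack's `variables.tf`/`schema.yaml`, which are not reproduced in this extract:

```hcl
# terraform.tfvars (illustrative values only)
tenancy_ocid     = "ocid1.tenancy.oc1..aaaaaaaaexampleonly"
compartment_ocid = "ocid1.compartment.oc1..aaaaaaaaexampleonly"
region           = "us-ashburn-1"

# Existing OKE cluster and networking (oke.tf sets create_new_oke_cluster = false)
existent_oke_cluster_id                = "ocid1.cluster.oc1.iad.aaaaaaaaexampleonly"
existent_vcn_ocid                      = "ocid1.vcn.oc1.iad.aaaaaaaaexampleonly"
existent_oke_k8s_endpoint_subnet_ocid  = "ocid1.subnet.oc1.iad.aaaaaaaaexampleonly"
existent_oke_nodes_subnet_ocid         = "ocid1.subnet.oc1.iad.aaaaaaaaexampleonly"
existent_oke_load_balancer_subnet_ocid = "ocid1.subnet.oc1.iad.aaaaaaaaexampleonly"

# Cluster add-on toggles passed through in oke.tf
ingress_nginx_enabled         = true
cert_manager_enabled          = true
bring_your_own_metrics_server = false
bring_your_own_prometheus     = false
bring_your_own_grafana        = false

# IAM and telemetry toggles (policies.tf and app-registration.tf)
policy_creation_enabled              = true
share_data_with_corrino_team_enabled = false
```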