├── .gitignore
├── GETTING_STARTED_README.md
├── INSTALLING_ONTO_EXISTING_CLUSTER_README.md
├── LICENSE.txt
├── README.md
├── RELEASE_NOTES.md
├── cluster_creation_terraform
├── CLUSTER_CREATION_STACK_VERSION
├── OCI_AI_BLUEPRINTS_LINK
├── oke-datasources.tf
├── oke-network.tf
├── oke-security-lists.tf
├── oke.tf
├── outputs.tf
├── providers.tf
├── schema.yaml
├── variables.tf
└── versions.tf
├── docs
├── about.md
├── api_documentation.md
├── common_workflows
│ ├── deploying_blueprints_onto_specific_nodes
│ │ └── README.md
│ └── working_with_large_models
│ │ └── README.md
├── custom_blueprints
│ ├── README.md
│ └── blueprint_json_schema.json
├── iam_policies.md
├── images
│ └── install.svg
├── installing_new_updates.md
├── known_issues.md
├── sample_blueprints
│ ├── README.md
│ ├── auto_scaling
│ │ ├── README.md
│ │ └── autoscaling_blueprint.json
│ ├── cpu-inference
│ │ ├── README.md
│ │ ├── cpu-inference-gemma.json
│ │ ├── cpu-inference-mistral-bm.json
│ │ └── cpu-inference-mistral-vm.json
│ ├── exisiting_cluster_installation
│ │ ├── README.md
│ │ └── add_node_to_control_plane.json
│ ├── gpu-health-check
│ │ ├── README.md
│ │ ├── healthcheck_fp16_a10.json
│ │ ├── healthcheck_fp16_h100.json
│ │ └── healthcheck_fp32_a10.json
│ ├── llm_inference_with_vllm
│ │ ├── README.md
│ │ ├── vllm-closed-hf-model.json
│ │ ├── vllm-model-from-obj-storage.json
│ │ ├── vllm-open-hf-model-api-key-functionality.json
│ │ └── vllm-open-hf-model.json
│ ├── lora-benchmarking
│ │ ├── README.md
│ │ └── mlcommons_lora_finetune_nvidia_sample_recipe.json
│ ├── lora-fine-tuning
│ │ ├── README.md
│ │ ├── bucket_checkpoint_bucket_model_open_dataset.backend.json
│ │ ├── bucket_model_open_dataset.backend.json
│ │ ├── bucket_par_open_dataset.backend.json
│ │ ├── closed_model_open_dataset_hf.backend.json
│ │ └── open_model_open_dataset_hf.backend.json
│ ├── mig_multi_instance_gpu
│ │ ├── README.md
│ │ ├── mig_enabled_shared_node_pool.json
│ │ ├── mig_inference_multiple_replicas.json
│ │ ├── mig_inference_single_replica.json
│ │ ├── mig_inference_single_replica_10gb.json
│ │ ├── mig_slices.png
│ │ ├── mig_update_node_with_node_name.json
│ │ └── mig_update_shared_pool_with_node_pool_name.json
│ ├── model_storage
│ │ ├── README.md
│ │ ├── download_closed_hf_model_to_object_storage.json
│ │ └── download_open_hf_model_to_object_storage.json
│ ├── multi-node-inference
│ │ ├── README.md
│ │ ├── multinode_inference_BM_A10.json
│ │ └── multinode_inference_VM_A10.json
│ ├── shared_node_pools
│ │ ├── README.md
│ │ ├── shared_node_pool_A10_BM.json
│ │ ├── shared_node_pool_A10_VM.json
│ │ └── vllm_inference_sample_shared_pool_blueprint.json
│ ├── startup_liveness_readiness_probes
│ │ ├── README.md
│ │ └── autoscale_with_fss.json
│ ├── teams
│ │ ├── README.md
│ │ ├── create_job_with_team.json
│ │ └── create_team.json
│ └── using_rdma_enabled_node_pools
│ │ ├── README.md
│ │ ├── rdma_distributed_inference.json
│ │ ├── rdma_shared_node_pool.json
│ │ └── rdma_update_nodes.json
├── usage_guide.md
├── versions
│ ├── ControlPlaneVersions.md
│ ├── PortalVersions.md
│ ├── QuickStartVersions.md
│ └── README.md
└── whisper_transcription
│ ├── README.md
│ ├── docs
│ └── Whisper_Architecture.pdf
│ ├── examples
│ ├── test1
│ │ ├── test.wav
│ │ ├── test_all_transcripts_20250601_201349.txt
│ │ └── transcription_log_20250601_201340.log
│ ├── test2
│ │ ├── transcription_log_20250601_203611.log
│ │ ├── video1591686795.mp4
│ │ ├── video1591686795_all_transcripts_20250601_203730.json
│ │ └── video1591686795_all_transcripts_20250601_203730.txt
│ └── test3
│ │ ├── audio1788670787.m4a
│ │ ├── audio1788670787_all_transcripts_20250601_191710.json
│ │ ├── audio1788670787_all_transcripts_20250601_191710.txt
│ │ └── transcription_log_20250601_191325.log
│ ├── whisper-transcription-A10.json
│ ├── whisper-transcription-A100.json
│ └── whisper-transcription-H100.json
└── oci_ai_blueprints_terraform
├── OCI_AI_BLUEPRINTS_STACK_VERSION
├── app-api.tf
├── app-background.tf
├── app-blueprint-portal.tf
├── app-configmap.tf
├── app-migration.tf
├── app-registration.tf
├── app-user.tf
├── data.tf
├── database.tf
├── helm.tf
├── ingress.tf
├── later.tf
├── locals.tf
├── lws
├── Chart.yaml
├── README.md
├── templates
│ ├── _helpers.tpl
│ ├── certmanager
│ │ └── certificate.yaml
│ ├── crds
│ │ └── leaderworkerset.x-k8s.io_leaderworkersets.yaml
│ ├── manager
│ │ ├── deployment.yaml
│ │ └── service.yaml
│ ├── prometheus
│ │ ├── monitor.yaml
│ │ └── role.yaml
│ ├── rbac
│ │ ├── clusterrole.yaml
│ │ ├── clusterrolebinding.yaml
│ │ ├── role.yaml
│ │ ├── rolebinding.yaml
│ │ └── serviceaccount.yaml
│ └── webhook
│ │ ├── secret.yaml
│ │ ├── service.yaml
│ │ └── webhook.yaml
└── values.yaml
├── modules
└── corrino
│ ├── .gitignore
│ ├── LICENSE
│ ├── NOTICE
│ ├── cluster-tools.tf
│ ├── datasources.tf
│ ├── defaults.tf
│ ├── main.tf
│ ├── modules
│ ├── cluster-tools
│ │ ├── cert-manager.tf
│ │ ├── cluster-tools.tf
│ │ ├── dashboards
│ │ │ └── vllm-dashboard.json
│ │ ├── grafana.tf
│ │ ├── ingress-nginx.tf
│ │ ├── jaeger.tf
│ │ ├── keycloak.tf
│ │ ├── metrics-server.tf
│ │ ├── modules
│ │ │ ├── cert-manager
│ │ │ │ ├── issuers
│ │ │ │ │ ├── .helmignore
│ │ │ │ │ ├── Chart.yaml
│ │ │ │ │ ├── templates
│ │ │ │ │ │ ├── NOTES.txt
│ │ │ │ │ │ ├── _helpers.tpl
│ │ │ │ │ │ └── clusterissuers.yaml
│ │ │ │ │ └── values.yaml
│ │ │ │ ├── main.tf
│ │ │ │ ├── providers.tf
│ │ │ │ ├── test.yaml
│ │ │ │ └── variables.tf
│ │ │ └── verrazzano
│ │ │ │ ├── main.tf
│ │ │ │ ├── providers.tf
│ │ │ │ └── variables.tf
│ │ ├── outputs.tf
│ │ ├── postgresql.tf
│ │ ├── prometheus.tf
│ │ ├── providers.tf
│ │ └── redis.tf
│ ├── oci-networking
│ │ └── README.md
│ ├── oci-policies
│ │ ├── main.tf
│ │ ├── outputs.tf
│ │ ├── variables.tf
│ │ └── versions.tf
│ ├── oci-vault-kms
│ │ ├── main.tf
│ │ ├── outputs.tf
│ │ ├── policies.tf
│ │ ├── providers.tf
│ │ └── variables.tf
│ ├── oke-cluster-autoscaler
│ │ ├── datasources.tf
│ │ ├── main.tf
│ │ ├── variables.tf
│ │ └── versions.tf
│ ├── oke-node-pool
│ │ ├── datasources.tf
│ │ ├── main.tf
│ │ ├── outputs.tf
│ │ ├── variables.tf
│ │ └── versions.tf
│ └── oke
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── datasources.tf
│ │ ├── main.tf
│ │ ├── oke-orm-private-endpoint.tf
│ │ ├── outputs.tf
│ │ ├── variables.tf
│ │ └── versions.tf
│ ├── oci-networking.tf
│ ├── outputs.tf
│ ├── policies.tf
│ ├── schema.org.yaml
│ ├── variables.tf
│ └── versions.tf
├── oke.tf
├── outputs.tf
├── policies.tf
├── providers.tf
├── random.tf
├── rbac.tf
├── schema.yaml
├── variables.tf
└── versions.tf
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | _site
3 | .DS_Store
4 | .sw[a-z]
5 | .idea/
6 | *.iml
7 | venv/
8 | archive/
9 | test_data/
10 | HOLD*
11 | *.base64
12 | variables.json
13 | *pyc
14 | *.env
15 | util_setup_local_env.sh
16 | terraform.tfvars
17 | .terraform/
18 | .terraform.lock.hcl
19 | terraform.tfstate
20 | terraform.tfstate.backup
21 | test*json
22 | generated/
23 | *.zip
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2024 Oracle and/or its affiliates. All rights reserved.
2 |
3 | The Universal Permissive License (UPL), Version 1.0
4 |
5 | Subject to the condition set forth below, permission is hereby granted to any person obtaining a copy of this
6 | software, associated documentation and/or data (collectively the "Software"), free of charge and under any and
7 | all copyright rights in the Software, and any and all patent rights owned or freely licensable by each licensor
8 | hereunder covering either (i) the unmodified Software as contributed to or provided by such licensor, or
9 | (ii) the Larger Works (as defined below), to deal in both
10 |
11 | (a) the Software, and
12 | (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if one is included with the Software
13 | (each a “Larger Work” to which the Software is contributed by such licensors),
14 |
15 | without restriction, including without limitation the rights to copy, create derivative works of, display,
16 | perform, and distribute the Software and make, use, sell, offer for sale, import, export, have made, and have
17 | sold the Software and the Larger Work(s), and to sublicense the foregoing rights on either these or other terms.
18 |
19 | This license is subject to the following condition:
20 | The above copyright notice and either this complete permission notice or at a minimum a reference to the UPL must
21 | be included in all copies or substantial portions of the Software.
22 |
23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
24 | THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
26 | CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
27 | IN THE SOFTWARE.
28 |
29 |
--------------------------------------------------------------------------------
/RELEASE_NOTES.md:
--------------------------------------------------------------------------------
1 | # Release Notes
2 |
3 | The following document contains release notes. Each section will detail added features, what has changed, and what has been fixed. Release notes for the previous 5 releases will be maintained in this document. Click the dropdown next to a release to see its associated notes.
4 |
5 | TODO (This file is intended to serve as a template for now):
6 |
7 |
8 | 1.0.0
9 |
10 | ### Added Features
11 |
12 | - Multinode inference
13 | - description one
14 | - description two
15 | - Blueprints can utilize RDMA connectivity between nodes
16 | - my description one
17 | - my description two
18 |
19 | ### Changed
20 |
21 | - Kuberay replaced by LeaderWorkerSet
22 | - MLFlow, Prometheus, and Grafana now use persistent volume claims instead of local storage
23 | - Anchored all versions of helm installs to specific versions which can be found [here](docs/versions/QuickStartVersions.md).
24 |
25 | ### Fixed
26 |
27 | - Fixed an issue with MLflow deployments where all MLflow experiments would fail because "Experiment 1" did not exist - caused by a bug in MLflow when using :memory: as the runs database.
28 |
29 |
--------------------------------------------------------------------------------
/cluster_creation_terraform/CLUSTER_CREATION_STACK_VERSION:
--------------------------------------------------------------------------------
1 | v1.0.2
--------------------------------------------------------------------------------
/cluster_creation_terraform/OCI_AI_BLUEPRINTS_LINK:
--------------------------------------------------------------------------------
1 | "https://cloud.oracle.com/resourcemanager/stacks/create?zipUrl=https://github.com/oracle-quickstart/oci-ai-blueprints/releases/download/v1.0.2/v1.0.2_app.zip"
2 |
--------------------------------------------------------------------------------
/cluster_creation_terraform/oke-datasources.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020-2022 Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 |
6 | # Gets a list of supported images based on the shape, operating_system and operating_system_version provided
7 | data "oci_core_images" "shape_specific_images" {
8 | compartment_id = local.oke_compartment_ocid
9 | shape = var.node_pool_instance_shape.instanceShape
10 | }
11 |
12 | data "oci_containerengine_node_pool_option" "cluster_node_pool_option" {
13 | #Required
14 | node_pool_option_id = oci_containerengine_cluster.oke_cluster[0].id
15 |
16 | depends_on = [oci_containerengine_cluster.oke_cluster]
17 | }
18 |
19 | data "oci_containerengine_cluster_option" "oke" {
20 | cluster_option_id = "all"
21 | }
22 | data "oci_containerengine_node_pool_option" "oke" {
23 | node_pool_option_id = "all"
24 | }
25 | data "oci_containerengine_clusters" "oke" {
26 | compartment_id = local.oke_compartment_ocid
27 | }
28 |
29 | # Gets a list of Availability Domains
30 | data "oci_identity_availability_domains" "ADs" {
31 | compartment_id = var.tenancy_ocid
32 | }
33 |
34 | # Gets home and current regions
35 | data "oci_identity_tenancy" "tenant_details" {
36 | tenancy_id = var.tenancy_ocid
37 |
38 | provider = oci.current_region
39 | }
40 |
41 | data "oci_identity_regions" "home_region" {
42 | filter {
43 | name = "key"
44 | values = [data.oci_identity_tenancy.tenant_details.home_region_key]
45 | }
46 |
47 | provider = oci.current_region
48 | }
49 |
50 | # Gets kubeconfig
51 | data "oci_containerengine_cluster_kube_config" "oke" {
52 | cluster_id = oci_containerengine_cluster.oke_cluster[0].id
53 |
54 | depends_on = [oci_containerengine_node_pool.oke_node_pool]
55 | }
56 |
57 | # OCI Services
58 | ## Available Services
59 | data "oci_core_services" "all_services" {
60 | filter {
61 | name = "name"
62 | values = ["All .* Services In Oracle Services Network"]
63 | regex = true
64 | }
65 | }
66 |
67 | ## Object Storage
68 | data "oci_objectstorage_namespace" "ns" {
69 | compartment_id = local.oke_compartment_ocid
70 | }
71 |
72 | # Randoms
73 | resource "random_string" "deploy_id" {
74 | length = 4
75 | special = false
76 | }
77 |
78 | resource "random_string" "app_name_autogen" {
79 | length = 6
80 | special = false
81 | }
82 |
83 | locals {
84 |
85 | all_shape_compatible_images = data.oci_core_images.shape_specific_images.images
86 | all_cluster_compatible_images = data.oci_containerengine_node_pool_option.cluster_node_pool_option.sources
87 |
88 | all_shape_compatible_image_ids = [for image in local.all_shape_compatible_images : image.id]
89 |
90 | all_cluster_compatible_image_ids = [for source in local.all_cluster_compatible_images : source.image_id]
91 |
92 | first_compatible_image_id = tolist(setintersection(toset(local.all_shape_compatible_image_ids), toset(local.all_cluster_compatible_image_ids)))[0]
93 |
94 | }
95 |
96 |
--------------------------------------------------------------------------------
/cluster_creation_terraform/outputs.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | output "cluster_creation_stack_version" {
6 | value = file("${path.module}/CLUSTER_CREATION_STACK_VERSION")
7 | }
8 |
9 | output "oke_cluster_name" {
10 | value = oci_containerengine_cluster.oke_cluster[0].name
11 | }
12 |
13 | output "oke_cluster_id" {
14 | value = oci_containerengine_cluster.oke_cluster[0].id
15 | }
16 |
17 | output "oci_ai_blueprints_link_for_button" {
18 | value = local.oci_ai_blueprints_link
19 | }
20 |
21 | output "oci_ai_blueprints_link_for_section" {
22 | value = local.oci_ai_blueprints_link
23 | }
24 |
25 | output "vcn_name" {
26 | value = oci_core_virtual_network.oke_vcn[0].display_name
27 | }
28 |
29 | output "vcn_id" {
30 | value = oci_core_virtual_network.oke_vcn[0].id
31 | }
32 |
33 | output "node_subnet_name" {
34 | value = oci_core_subnet.oke_nodes_subnet[0].display_name
35 | }
36 |
37 | output "node_subnet_id" {
38 | value = oci_core_subnet.oke_nodes_subnet[0].id
39 | }
40 |
41 | output "lb_subnet_name" {
42 | value = oci_core_subnet.oke_lb_subnet[0].display_name
43 | }
44 |
45 | output "lb_subnet_id" {
46 | value = oci_core_subnet.oke_lb_subnet[0].id
47 | }
48 |
49 | output "endpoint_subnet_name" {
50 | value = oci_core_subnet.oke_k8s_endpoint_subnet[0].display_name
51 | }
52 |
53 | output "endpoint_subnet_id" {
54 | value = oci_core_subnet.oke_k8s_endpoint_subnet[0].id
55 | }
56 |
57 |
--------------------------------------------------------------------------------
/cluster_creation_terraform/providers.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020-2024 Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | provider "oci" {
6 | tenancy_ocid = var.tenancy_ocid
7 | region = var.region
8 | }
9 |
10 | provider "oci" {
11 | alias = "home_region"
12 | tenancy_ocid = var.tenancy_ocid
13 | region = lookup(data.oci_identity_regions.home_region.regions[0], "name")
14 |
15 | user_ocid = var.user_ocid
16 | }
17 |
18 | provider "oci" {
19 | alias = "current_region"
20 | tenancy_ocid = var.tenancy_ocid
21 | region = var.region
22 |
23 | user_ocid = var.user_ocid
24 | }
25 |
26 | # New configuration to avoid Terraform Kubernetes provider interpolation. https://registry.terraform.io/providers/hashicorp/kubernetes/2.2.0/docs#stacking-with-managed-kubernetes-cluster-resources
28 | # Currently need to disable refresh (--refresh=false) when destroying, or else the terraform destroy will fail
28 |
29 | # https://docs.cloud.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengdownloadkubeconfigfile.htm#notes
30 | provider "kubernetes" {
31 | host = local.cluster_endpoint
32 | cluster_ca_certificate = local.cluster_ca_certificate
33 | exec {
34 | api_version = "client.authentication.k8s.io/v1beta1"
35 | args = ["ce", "cluster", "generate-token", "--cluster-id", local.cluster_id, "--region", local.cluster_region]
36 | command = "oci"
37 | }
38 | }
39 |
40 | # https://docs.cloud.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengdownloadkubeconfigfile.htm#notes
41 | provider "helm" {
42 | kubernetes {
43 | host = local.cluster_endpoint
44 | cluster_ca_certificate = local.cluster_ca_certificate
45 | exec {
46 | api_version = "client.authentication.k8s.io/v1beta1"
47 | args = ["ce", "cluster", "generate-token", "--cluster-id", local.cluster_id, "--region", local.cluster_region]
48 | command = "oci"
49 | }
50 | }
51 | }
52 |
53 | locals {
54 | cluster_endpoint = yamldecode(data.oci_containerengine_cluster_kube_config.oke.content)["clusters"][0]["cluster"]["server"]
55 | cluster_ca_certificate = base64decode(yamldecode(data.oci_containerengine_cluster_kube_config.oke.content)["clusters"][0]["cluster"]["certificate-authority-data"])
56 | cluster_id = yamldecode(data.oci_containerengine_cluster_kube_config.oke.content)["users"][0]["user"]["exec"]["args"][4]
57 | cluster_region = yamldecode(data.oci_containerengine_cluster_kube_config.oke.content)["users"][0]["user"]["exec"]["args"][6]
58 | }
--------------------------------------------------------------------------------
/cluster_creation_terraform/variables.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | # OKE Variables
6 | ## OKE Cluster Details
7 | variable "cluster_options_add_ons_is_kubernetes_dashboard_enabled" {
8 | default = false
9 | }
10 |
11 | ## OKE Visibility (Workers and Endpoint)
12 |
13 | variable "cluster_workers_visibility" {
14 | default = "Private"
15 | description = "The Kubernetes worker nodes that are created will be hosted in public or private subnet(s)"
16 |
17 | validation {
18 | condition = var.cluster_workers_visibility == "Private" || var.cluster_workers_visibility == "Public"
19 | error_message = "Sorry, but cluster visibility can only be Private or Public."
20 | }
21 | }
22 |
23 | variable "cluster_endpoint_visibility" {
24 | default = "Public"
25 | description = "The Kubernetes cluster that is created will be hosted on a public subnet with a public IP address auto-assigned or on a private subnet. If Private, additional configuration will be necessary to run kubectl commands"
26 |
27 | validation {
28 | condition = var.cluster_endpoint_visibility == "Private" || var.cluster_endpoint_visibility == "Public"
29 | error_message = "Sorry, but cluster endpoint visibility can only be Private or Public."
30 | }
31 | }
32 |
33 |
34 | ## OKE Node Pool Details
35 | variable "node_pool_name" {
36 | default = "pool1"
37 | description = "Name of the node pool"
38 | }
39 | variable "k8s_version" {
40 | default = "v1.31.1"
41 | description = "Kubernetes version installed on your master and worker nodes"
42 | }
43 | variable "num_pool_workers" {
44 | default = 6
45 | description = "The number of worker nodes in the node pool. If Cluster Autoscaler is selected, this is treated as the minimum number of nodes configured"
46 | }
47 |
48 | variable "node_pool_instance_shape" {
49 | type = map(any)
50 | default = {
51 | "instanceShape" = "VM.Standard.E3.Flex"
52 | "ocpus" = 6
53 | "memory" = 64
54 | }
55 | description = "A shape is a template that determines the number of OCPUs, amount of memory, and other resources allocated to a newly created instance for the Worker Node. Select at least 2 OCPUs and 16GB of memory if using Flex shapes"
56 | }
57 | variable "node_pool_boot_volume_size_in_gbs" {
58 | default = "60"
59 | description = "Specify a custom boot volume size (in GB)"
60 | }
61 |
62 | # Network Details
63 | ## CIDRs
64 | variable "network_cidrs" {
65 | type = map(string)
66 |
67 | default = {
68 | VCN-CIDR = "10.0.0.0/16"
69 | SUBNET-REGIONAL-CIDR = "10.0.64.0/20"
70 | LB-SUBNET-REGIONAL-CIDR = "10.0.96.0/20"
71 | ENDPOINT-SUBNET-REGIONAL-CIDR = "10.0.128.0/20"
72 | ALL-CIDR = "0.0.0.0/0"
73 | PODS-CIDR = "10.244.0.0/16"
74 | KUBERNETES-SERVICE-CIDR = "10.96.0.0/16"
75 | }
76 | }
77 |
78 | # OCI Provider
79 | variable "tenancy_ocid" {}
80 | variable "compartment_ocid" {}
81 | variable "region" {}
82 | variable "user_ocid" {
83 | default = ""
84 | }
85 |
86 | # ORM Schema visual control variables
87 | variable "show_advanced" {
88 | default = false
89 | }
90 |
91 | # App Name Locals
92 | locals {
93 | app_name = random_string.app_name_autogen.result
94 | app_name_normalized = random_string.app_name_autogen.result
95 | oci_ai_blueprints_link = file("${path.module}/OCI_AI_BLUEPRINTS_LINK")
96 | }
97 |
98 | # Dictionary Locals
99 | locals {
100 | compute_flexible_shapes = [
101 | "VM.Standard.E3.Flex",
102 | "VM.Standard.E4.Flex",
103 | "VM.Standard.A1.Flex"
104 | ]
105 | }
--------------------------------------------------------------------------------
/cluster_creation_terraform/versions.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | terraform {
6 | required_version = ">= 1.5" #>= 1.6 when using OpenTofu
7 | required_providers {
8 | oci = {
9 | source = "oracle/oci"
10 | version = ">= 5"
11 | # https://registry.terraform.io/providers/oracle/oci/
12 | }
13 | kubernetes = {
14 | source = "hashicorp/kubernetes"
15 | version = ">= 2.27"
16 | # https://registry.terraform.io/providers/hashicorp/kubernetes/
17 | }
18 | helm = {
19 | source = "hashicorp/helm"
20 | version = ">= 2.12"
21 | # https://registry.terraform.io/providers/hashicorp/helm/
22 | }
23 | tls = {
24 | source = "hashicorp/tls"
25 | version = ">= 4"
26 | # https://registry.terraform.io/providers/hashicorp/tls/
27 | }
28 | local = {
29 | source = "hashicorp/local"
30 | version = ">= 2.5"
31 | # https://registry.terraform.io/providers/hashicorp/local/
32 | }
33 | random = {
34 | source = "hashicorp/random"
35 | version = ">= 3.6"
36 | # https://registry.terraform.io/providers/hashicorp/random/
37 | }
38 | }
39 | }
--------------------------------------------------------------------------------
/docs/common_workflows/deploying_blueprints_onto_specific_nodes/README.md:
--------------------------------------------------------------------------------
1 | # Deploying Blueprints Onto Specific Nodes
2 |
3 | **Note:** A basic understanding of how to use Kubernetes is required for this task
4 |
5 | Assumption: the node exists and you are installing OCI AI Blueprints alongside this pre-existing node (i.e. the node is in the same cluster as the OCI AI Blueprints application)
6 |
7 | ## Label Nodes
8 |
9 | If you have existing node pools in your original OKE cluster that you'd like Blueprints to be able to use, follow these steps after the stack is finished:
10 |
11 | 1. Find the private IP address of the node you'd like to add.
12 | - Console:
13 | - Go to the OKE cluster in the console like you did above
14 | - Click on "Node pools"
15 | - Click on the pool with the node you want to add
16 | - Identify the private IP address of the node under "Nodes" on the page.
17 | - Command line with `kubectl` (assumes cluster access is set up):
18 | - run `kubectl get nodes`
19 | - run `kubectl describe node <node-name>` on each node until you find the node you want to add
20 | - The private IP appears under the `Name` field of the output of `kubectl get nodes`.
21 | 2. Go to the stack and click "Application information". Click the API Url.
22 | 3. Login with the `Admin Username` and `Admin Password` in the Application information tab.
23 | 4. Click the link next to "deployment" which will take you to a page with "Deployment List", and a content box.
24 | 5. Paste in the sample blueprint json found [here](../../sample_blueprints/exisiting_cluster_installation/add_node_to_control_plane.json).
25 | 6. Modify the "recipe_node_name" field to the private IP address you found in step 1 above.
26 | 7. Click "POST". This is a fast operation.
27 | 8. Wait about 20 seconds and refresh the page. It should look like:
28 |
29 | ```json
30 | [
31 | {
32 | "mode": "update",
33 | "recipe_id": null,
34 | "creation_date": "2025-03-28 11:12 AM UTC",
35 | "deployment_uuid": "750a________cc0bfd",
36 | "deployment_name": "startupaddnode",
37 | "deployment_status": "completed",
38 | "deployment_directive": "commission"
39 | }
40 | ]
41 | ```
42 |
43 | ### Adding additional labels
44 |
45 | To add additional labels to nodes that you may wish to use later to specify deployment targets, the `recipe_node_labels` field can take an arbitrary number of labels to apply to a given node. For example, in the blueprint JSON, you could add the following:
46 |
47 | ```json
48 | "recipe_node_labels": {
49 | "key1": "value1",
50 | "key2": "value2",
51 | "key3": "value3"
52 | }
53 | ```
54 |
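
If you want to confirm the labels landed on the node before deploying anything, a quick check with `kubectl` works. This is a minimal sketch, assuming cluster access is still configured and using the sample label key (`corrino`) and node IP from the blueprint above:

```bash
# List nodes with their labels and look for the keys applied by the update blueprint.
kubectl get nodes --show-labels | grep corrino

# Or inspect a single node (replace 10.0.10.164 with the private IP / node name from step 1).
kubectl describe node 10.0.10.164 | grep -i -A 10 "Labels:"
```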
55 | ## Deploy a blueprint
56 |
57 | Now that you have artificially created a shared node pool using the node labels above, you can deploy a recipe to that node pool.
58 |
59 | ```json
60 | {
61 | "recipe_id": "example",
62 | "recipe_mode": "service",
63 | "deployment_name": "a10 deployment",
64 | "recipe_use_shared_node_pool": true,
65 | "recipe_image_uri": "hashicorp/http-echo",
66 | "recipe_container_command_args": ["-text=corrino"],
67 | "recipe_container_port": "5678",
68 | "recipe_node_shape": "BM.GPU.A10.4",
69 | "recipe_replica_count": 1,
70 | "recipe_nvidia_gpu_count": 4,
71 | "shared_node_pool_custom_node_selectors": [
72 | {
73 | "key": "corrino",
74 | "value": "a10pool"
75 | }
76 | ]
77 | }
78 | ```
79 |
80 | Note: In the example above, we specified `recipe_nvidia_gpu_count` as 4, which means we want to use 4 of the GPUs on the node.
81 |
82 | Note: We set `shared_node_pool_custom_node_selectors` to "a10pool" to match the name of the shared node pool we created with the existing node. Here, we could also add any additional labels applied earlier to target specific nodes for work.
83 |
84 | Note: We set `recipe_use_shared_node_pool` to true so that we use the shared node pool behavior for the blueprint (previously called recipe).
85 |
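
You can paste this JSON into the browsable API page exactly as in the labeling steps above. If you prefer the command line, the snippet below is a rough sketch only: the API URL, credentials, file name, and the basic-auth flag are assumptions to adapt to your deployment.

```bash
# Hypothetical values -- substitute your own API URL, admin credentials, and blueprint file.
API_URL="https://api.<your-domain>/deployment/"   # keep the trailing slash
curl -sS -u "$ADMIN_USERNAME:$ADMIN_PASSWORD" \
  -H "Content-Type: application/json" \
  -X POST "$API_URL" \
  -d @a10_shared_pool_blueprint.json
```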
86 | ## Complete
87 |
88 | At this point, you have successfully deployed a blueprint to an existing node and utilized a portion of that node by specifying the number of GPUs you wish to use for the blueprint.
89 |
--------------------------------------------------------------------------------
/docs/installing_new_updates.md:
--------------------------------------------------------------------------------
1 | # Installing New Updates
2 |
3 | ## Overview
4 |
5 | The OCI AI Blueprints team regularly publishes **full-stack release packages** (control plane, frontend, blueprints, Terraform).
6 | To upgrade your existing deployment, replace your stack’s source zip with the **latest package** from GitHub Releases and re-apply the stack in **OCI Resource Manager**.
7 |
8 | ---
9 |
10 | ## Upgrade Steps
11 |
12 | 1. **Download and unzip the latest release package**
13 |
14 | - Go to **GitHub → Releases** for OCI AI Blueprints
15 |
16 | - Download the file that ends with `_app.zip` (for example `vX.Y.Z_app.zip`) and unzip it.
17 |
18 | 2. Open **OCI Console → Resource Manager → Stacks**.
19 |
20 | 3. Select the stack you originally used to deploy **OCI AI Blueprints**.
21 |
22 | 4. Click **Edit → Edit Stack**.
23 |
24 | 5. **Upload** the release package you downloaded in Step 1 (the file ending in `_app.zip`).
25 |
26 | > _Tip: the file name should match the release you just downloaded._
27 |
28 | 6. Click **Next → Next → Confirm** to save the new source.
29 |
30 | 7. Press **Apply** (top-right). A new job starts automatically.
31 |
32 | 8. Wait until the job’s **State** is **Succeeded** — your entire stack is now updated.
33 |
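
If you prefer to script the upgrade, the same flow can be driven from the OCI CLI. This is a sketch only: the stack OCID and file name below are placeholders, and it assumes the `resource-manager` commands available in current OCI CLI releases.

```bash
# Point the existing stack at the new release package (placeholder OCID and file name).
oci resource-manager stack update \
  --stack-id ocid1.ormstack.oc1..exampleuniqueid \
  --config-source ./vX.Y.Z_app.zip

# Kick off an apply job, equivalent to pressing "Apply" in the console.
oci resource-manager job create-apply-job \
  --stack-id ocid1.ormstack.oc1..exampleuniqueid \
  --execution-plan-strategy AUTO_APPROVED
```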
34 | ---
35 |
36 | ## Technical Background
37 |
38 | Updating the stack zip prompts **Resource Manager** to pull the newest Terraform code and container images.
39 | During _Apply_, OKE deployments roll automatically, so no manual pod restarts are needed.
40 |
41 | ---
42 |
43 | ## Error Handling
44 |
45 | If a job fails or you see errors in the console, please contact:
46 |
47 | - Vishnu Kammari —
48 | - Grant Neuman —
49 |
50 | Include the full set of logs when reaching out for fastest assistance.
51 |
--------------------------------------------------------------------------------
/docs/known_issues.md:
--------------------------------------------------------------------------------
1 | # Known Issues & Solutions
2 |
3 | A place to record issues that arise and their corresponding workarounds.
4 |
5 | ## 500 Errors When Connecting to API
6 |
7 | 1. Check your permissions and verify that they match exactly as shown here: [IAM Policies](../docs/iam_policies.md)
8 | 2. Did you choose `*.nip.io` as your domain name when setting up OCI AI Blueprints? If so, this is an untrusted domain and will be blocked when you are behind a VPN. Either deploy OCI AI Blueprints with a custom domain or access your `*.nip.io` OCI AI Blueprints domain while off the VPN.
9 |
10 | ## Shape BM.GPU4.8 Cannot Schedule Blueprints
11 |
12 | Currently, there is an Oracle Kubernetes Engine (OKE) bug with the `BM.GPU4.8` shape. Since the toolkit runs on top of an OKE cluster, this shape cannot be used with the toolkit until the issue is resolved by OKE. We have diagnosed and reported the issue, and are following up with the OKE team for resolution. An example of the error is shown below.
13 |
14 | The following `kubectl` commands can be used to diagnose pods in this state:
15 |
16 | ```bash
17 | kubectl get pods # to find the name of the pod
18 | kubectl describe pod <pod-name>
19 | ```
20 |
21 | This will output all information about the pod. In the `Events:` section (at the very bottom) you will see information like this:
22 |
23 | ```
24 | Pod info: nvidia-dcgm-node-feature-discovery-worker always gets stuck in container creating with warning / error like:
25 | Warning FailedCreatePodSandBox 12s kubelet Failed to create pod sandbox: rpc error: code = Unknown desc = failed to create pod network sandbox k8s_gpu-operator-1738967226-node-feature-discovery-worker-dzwht_gpu-operator_06605d81-8dc8-48db-a9a9-b393e8bcd068_0
26 | ```
27 |
28 | Here, the nvidia-dcgm-node-feature-discovery-worker pod gets stuck indefinitely in a "ContainerCreating" / "CrashLoopBackoff" cycle.
29 |
30 | ## Issues Connecting to APIs via Postman or Curl
31 |
32 | Make sure to append a slash ('/') to the end of the URL such as `https://api.<your-domain>/deployment/` instead of `https://api.<your-domain>/deployment`.
33 | This is especially important for all POST requests.
34 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/auto_scaling/autoscaling_blueprint.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "llm_inference_nvidia",
3 | "recipe_mode": "service",
4 | "deployment_name": "autoscale_vllm_example",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.5",
6 | "recipe_node_shape": "VM.GPU.A10.2",
7 | "input_object_storage": [
8 | {
9 | "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/qFv5XzocpOoEXjlxL7Q3ZrrCFkx9GkA1fpg97zmnaNEX9WB_WMXLz2rykGuU1hqQ/n/iduyx1qnmway/b/metallama321binstruct/o/",
10 | "mount_location": "/models",
11 | "volume_size_in_gbs": 100
12 | }
13 | ],
14 | "recipe_container_env": [
15 | {
16 | "key": "tensor_parallel_size",
17 | "value": "1"
18 | },
19 | {
20 | "key": "model_name",
21 | "value": ""
22 | },
23 | {
24 | "key": "Model_Path",
25 | "value": "/models"
26 | }
27 | ],
28 | "recipe_replica_count": 1,
29 | "recipe_container_port": "8000",
30 | "recipe_nvidia_gpu_count": 1,
31 | "recipe_container_command_args": [
32 | "--model",
33 | "$(Model_Path)",
34 | "--tensor-parallel-size",
35 | "$(tensor_parallel_size)",
36 | "--gpu-memory-utilization",
37 | "0.99",
38 | "--max-model-len",
39 | "1024"
40 | ],
41 | "recipe_ephemeral_storage_size": 200,
42 | "recipe_node_boot_volume_size_in_gbs": 300,
43 | "recipe_node_pool_size": 1,
44 | "recipe_shared_memory_volume_size_limit_in_mb": 200,
45 | "recipe_startup_probe_params": {
46 | "failure_threshold": 30,
47 | "endpoint_path": "/health",
48 | "port": 8000,
49 | "scheme": "HTTP",
50 | "initial_delay_seconds": 60,
51 | "period_seconds": 2,
52 | "success_threshold": 1,
53 | "timeout_seconds": 10
54 | },
55 | "recipe_liveness_probe_params": {
56 | "failure_threshold": 3,
57 | "endpoint_path": "/health",
58 | "port": 8000,
59 | "scheme": "HTTP",
60 | "initial_delay_seconds": 65,
61 | "period_seconds": 600,
62 | "success_threshold": 1,
63 | "timeout_seconds": 10
64 | },
65 | "recipe_node_autoscaling_params": {
66 | "min_nodes": 1,
67 | "max_nodes": 2
68 | },
69 | "recipe_pod_autoscaling_params": {
70 | "min_replicas": 1,
71 | "max_replicas": 4
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/cpu-inference/cpu-inference-gemma.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "cpu_inference",
3 | "recipe_mode": "service",
4 | "deployment_name": "cpu Inference gemma BME5",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:cpu_inference_service_v0.2",
6 | "recipe_node_shape": "BM.Standard.E5.192",
7 | "input_object_storage": [
8 | {
9 | "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/0LYMMBRGg_OEm_hzp9BG8BlQx7Ccpy3gY-gRzjQQFZRU6peG0pXyHTRHUGZLp82E/n/iduyx1qnmway/b/ollama-models/o/",
10 | "mount_location": "/models",
11 | "volume_size_in_gbs": 20
12 | }
13 | ],
14 | "recipe_container_env": [
15 | {
16 | "key": "MODEL_NAME",
17 | "value": "gemma"
18 | },
19 | {
20 | "key": "PROMPT",
21 | "value": "What is the capital of Germany?"
22 | }
23 | ],
24 | "recipe_replica_count": 1,
25 | "recipe_container_port": "11434",
26 | "recipe_node_pool_size": 1,
27 | "recipe_node_boot_volume_size_in_gbs": 200,
28 | "recipe_container_command_args": [
29 | "--input_directory",
30 | "/models",
31 | "--model_name",
32 | "gemma"
33 | ],
34 | "recipe_ephemeral_storage_size": 100
35 | }
36 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/cpu-inference/cpu-inference-mistral-bm.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "cpu_inference",
3 | "recipe_mode": "service",
4 | "deployment_name": "cpu Inference mistral BME4",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:cpu_inference_service_v0.2",
6 | "recipe_node_shape": "BM.Standard.E4.128",
7 | "input_object_storage": [
8 | {
9 | "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/0LYMMBRGg_OEm_hzp9BG8BlQx7Ccpy3gY-gRzjQQFZRU6peG0pXyHTRHUGZLp82E/n/iduyx1qnmway/b/ollama-models/o/",
10 | "mount_location": "/models",
11 | "volume_size_in_gbs": 20
12 | }
13 | ],
14 | "recipe_container_env": [
15 | {
16 | "key": "MODEL_NAME",
17 | "value": "mistral"
18 | },
19 | {
20 | "key": "PROMPT",
21 | "value": "What is the capital of France?"
22 | }
23 | ],
24 | "recipe_replica_count": 1,
25 | "recipe_container_port": "11434",
26 | "recipe_node_pool_size": 1,
27 | "recipe_node_boot_volume_size_in_gbs": 200,
28 | "recipe_container_command_args": [
29 | "--input_directory",
30 | "/models",
31 | "--model_name",
32 | "mistral"
33 | ],
34 | "recipe_ephemeral_storage_size": 100
35 | }
36 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/cpu-inference/cpu-inference-mistral-vm.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "cpu_inference",
3 | "recipe_mode": "service",
4 | "deployment_name": "cpu Inference mistral E4Flex",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:cpu_inference_service_v0.2",
6 | "recipe_node_shape": "VM.Standard.E4.Flex",
7 | "recipe_flex_shape_ocpu_count": 4,
8 | "recipe_flex_shape_memory_size_in_gbs": 64,
9 | "input_object_storage": [
10 | {
11 | "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/0LYMMBRGg_OEm_hzp9BG8BlQx7Ccpy3gY-gRzjQQFZRU6peG0pXyHTRHUGZLp82E/n/iduyx1qnmway/b/ollama-models/o/",
12 | "mount_location": "/models",
13 | "volume_size_in_gbs": 20
14 | }
15 | ],
16 | "recipe_container_env": [
17 | {
18 | "key": "MODEL_NAME",
19 | "value": "mistral"
20 | },
21 | {
22 | "key": "PROMPT",
23 | "value": "What is the capital of Spain?"
24 | }
25 | ],
26 | "recipe_replica_count": 1,
27 | "recipe_container_port": "11434",
28 | "recipe_node_pool_size": 1,
29 | "recipe_node_boot_volume_size_in_gbs": 200,
30 | "recipe_container_command_args": [
31 | "--input_directory",
32 | "/models",
33 | "--model_name",
34 | "mistral"
35 | ],
36 | "recipe_ephemeral_storage_size": 100
37 | }
38 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/exisiting_cluster_installation/README.md:
--------------------------------------------------------------------------------
1 | # Install OCI AI Blueprints onto an Existing OKE Cluster
2 |
3 | #### Deploy OCI AI Blueprints on your existing OKE cluster without creating new infrastructure
4 |
5 | This guide helps you install and use **OCI AI Blueprints** on an existing OKE cluster that was created outside of blueprints and already has workflows running on it. Rather than installing blueprints onto a new cluster, you can leverage an existing cluster with node pools and tools already installed.
6 |
7 | The installation process involves ensuring you have the correct IAM policies in place, retrieving the existing OKE cluster and VCN information from the console, deploying the OCI AI Blueprints application onto the existing cluster, and optionally adding existing nodes to be used by blueprints. You can then deploy sample recipes to test functionality.
8 |
9 | Key considerations include managing existing tooling like Prometheus, Grafana, or the GPU operator that may already be installed on your cluster. The blueprint installation process can detect and work around these existing components. Additionally, if you have the nvidia-gpu-operator installed and plan to use Multi-Instance GPUs with H100 nodes, special configuration steps are available.
10 |
11 | This approach allows you to:
12 |
13 | - Leverage existing cluster resources and configurations
14 | - Add blueprints capabilities without disrupting current workloads
15 | - Utilize existing node pools for blueprint deployments
16 | - Maintain compatibility with pre-installed cluster tools
17 |
18 | ## Pre-Filled Samples
19 |
20 | | Feature Showcase | Title | Description | Blueprint File |
21 | | --------------------------------------------------------------------------------------------- | ---------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------- |
22 | | Add existing cluster nodes to OCI AI Blueprints control plane for shared resource utilization | Add Existing Node to Control Plane | Configures an existing cluster node to be managed by OCI AI Blueprints, enabling shared node pool functionality and resource optimization across existing infrastructure. | [add_node_to_control_plane.json](add_node_to_control_plane.json) |
23 |
24 | For complete step-by-step instructions, see the [full installation guide](../../../INSTALLING_ONTO_EXISTING_CLUSTER_README.md).
25 |
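
If you would rather gather the cluster and VCN details from a terminal instead of the console, a rough sketch with the OCI CLI (the compartment and VCN OCIDs are placeholders) is:

```bash
# List OKE clusters in the compartment; note the cluster OCID and the "vcn-id" in the output.
oci ce cluster list --compartment-id ocid1.compartment.oc1..exampleuniqueid

# Then list the VCN's subnets to identify the node, load balancer, and endpoint subnets.
oci network subnet list \
  --compartment-id ocid1.compartment.oc1..exampleuniqueid \
  --vcn-id ocid1.vcn.oc1..exampleuniqueid
```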
--------------------------------------------------------------------------------
/docs/sample_blueprints/exisiting_cluster_installation/add_node_to_control_plane.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_mode": "update",
3 | "deployment_name": "startupaddnode",
4 | "recipe_node_name": "10.0.10.164",
5 | "recipe_node_labels": {
6 | "corrino": "a10pool",
7 | "corrino/pool-shared-any": "true"
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/gpu-health-check/README.md:
--------------------------------------------------------------------------------
1 | # Health Check
2 |
3 | #### Comprehensive GPU health validation and diagnostics for production readiness
4 |
5 | This repository offers a robust, pre-check recipe for thorough GPU health validation prior to deploying production or research workloads. Designed to operate seamlessly across both single-node and multi-node environments, this diagnostic toolset enables you to verify that your GPU infrastructure is primed for high-demand experiments. By systematically assessing key performance metrics—such as thermal behavior, power stability, and overall hardware reliability—you can proactively detect and address issues like thermal throttling, power irregularities, and GPU instability. This early-warning system minimizes the risk of unexpected downtime and performance degradation, ensuring that your system consistently operates at peak efficiency and reliability during critical computational tasks.
6 |
7 | ## Pre-Filled Samples
8 |
9 | | Feature Showcase | Title | Description | Blueprint File |
10 | | ------------------------------------------------------------------------------------------------------------- | ------------------------- | ----------------------------------------------------------------- | -------------------------------------------------------- |
11 | | Validate A10 GPU performance and stability using 16-bit floating point precision for memory-efficient testing | 2 A10 GPUs with dtype 16 | Deploys 2 A10 GPUs with dtype 16 on VM.GPU.A10.2 with 2 GPU(s). | [healthcheck_fp16_a10.json](healthcheck_fp16_a10.json) |
12 | | Validate A10 GPU performance and stability using 32-bit floating point precision for comprehensive testing | 2 A10 GPUs with dtype 32 | Deploys 2 A10 GPUs with dtype 32 on VM.GPU.A10.2 with 2 GPU(s). | [healthcheck_fp32_a10.json](healthcheck_fp32_a10.json) |
13 | | Validate H100 GPU cluster performance and stability using 16-bit precision for high-scale workloads | 8 H100 GPUs with dtype 16 | Deploys 8 H100 GPUs with dtype 16 on BM.GPU.H100.8 with 8 GPU(s). | [healthcheck_fp16_h100.json](healthcheck_fp16_h100.json) |
14 |
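
The sample blueprints above write their reports to the `output_object_storage` bucket configured in each JSON (`healthcheck2` in the samples). Once a job completes, one way to pull the results down, assuming the OCI CLI is configured for your tenancy, is:

```bash
# List and download the health check reports from the output bucket used by the samples.
oci os object list --bucket-name healthcheck2
oci os object bulk-download --bucket-name healthcheck2 --download-dir ./healthcheck_results
```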
--------------------------------------------------------------------------------
/docs/sample_blueprints/gpu-health-check/healthcheck_fp16_a10.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "healthcheck",
3 | "recipe_mode": "job",
4 | "deployment_name": "healthcheck_fp16_a10",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:healthcheck_v0.3",
6 | "recipe_node_shape": "VM.GPU.A10.2",
7 | "output_object_storage": [
8 | {
9 | "bucket_name": "healthcheck2",
10 | "mount_location": "/healthcheck_results",
11 | "volume_size_in_gbs": 20
12 | }
13 | ],
14 | "recipe_container_command_args": [
15 | "--dtype",
16 | "float16",
17 | "--output_dir",
18 | "/healthcheck_results",
19 | "--expected_gpus",
20 | "A10:2,A100:0,H100:0"
21 | ],
22 | "recipe_replica_count": 1,
23 | "recipe_nvidia_gpu_count": 2,
24 | "recipe_node_pool_size": 1,
25 | "recipe_node_boot_volume_size_in_gbs": 200,
26 | "recipe_ephemeral_storage_size": 100,
27 | "recipe_shared_memory_volume_size_limit_in_mb": 1000
28 | }
29 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/gpu-health-check/healthcheck_fp16_h100.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "healthcheck",
3 | "recipe_mode": "job",
4 | "deployment_name": "healthcheck_fp16_h100",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:healthcheck_v0.3",
6 | "recipe_node_shape": "BM.GPU.H100.8",
7 | "output_object_storage": [
8 | {
9 | "bucket_name": "healthcheck2",
10 | "mount_location": "/healthcheck_results",
11 | "volume_size_in_gbs": 20
12 | }
13 | ],
14 | "recipe_container_command_args": [
15 | "--dtype",
16 | "float16",
17 | "--output_dir",
18 | "/healthcheck_results",
19 | "--expected_gpus",
20 | "A10:0,A100:0,H100:8"
21 | ],
22 | "recipe_replica_count": 1,
23 | "recipe_nvidia_gpu_count": 8,
24 | "recipe_node_pool_size": 1,
25 | "recipe_node_boot_volume_size_in_gbs": 200,
26 | "recipe_ephemeral_storage_size": 100,
27 | "recipe_shared_memory_volume_size_limit_in_mb": 1000
28 | }
29 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/gpu-health-check/healthcheck_fp32_a10.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "healthcheck",
3 | "recipe_mode": "job",
4 | "deployment_name": "healthcheck_fp32_a10",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:healthcheck_v0.3",
6 | "recipe_node_shape": "VM.GPU.A10.2",
7 | "output_object_storage": [
8 | {
9 | "bucket_name": "healthcheck2",
10 | "mount_location": "/healthcheck_results",
11 | "volume_size_in_gbs": 20
12 | }
13 | ],
14 | "recipe_container_command_args": [
15 | "--dtype",
16 | "float32",
17 | "--output_dir",
18 | "/healthcheck_results",
19 | "--expected_gpus",
20 | "A10:2,A100:0,H100:0"
21 | ],
22 | "recipe_replica_count": 1,
23 | "recipe_nvidia_gpu_count": 2,
24 | "recipe_node_pool_size": 1,
25 | "recipe_node_boot_volume_size_in_gbs": 200,
26 | "recipe_ephemeral_storage_size": 100,
27 | "recipe_shared_memory_volume_size_limit_in_mb": 1000
28 | }
29 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/llm_inference_with_vllm/README.md:
--------------------------------------------------------------------------------
1 | # LLM Inference with vLLM
2 |
3 | #### Deploy open-source LLMs to GPUs for inference with vLLM.
4 |
5 | This blueprint simplifies the deployment of LLMs using an open-source inference engine called vLLM. You can deploy a custom model or select from a variety of open-source models on Hugging Face.
6 |
7 | The blueprint deploys the model from an object storage bucket to a GPU node in an OKE cluster in your tenancy. Once deployed, you receive a ready-to-use API endpoint to start generating responses from the model. For mission-critical workloads, you can also configure auto-scaling driven by application metrics like inference latency. To summarize, this blueprint streamlines inference deployment, making it easy to scale and integrate into your applications without deep technical expertise.
8 |
9 | ## Pre-Filled Samples
10 |
11 | | Feature Showcase | Title | Description | Blueprint File |
12 | | ------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------- |
13 | | Deploy models from OCI Object Storage using pre-authenticated requests (PARs) for faster model loading | Meta-Llama-3.1-8B-Instruct from OCI Object Storage on VM.GPU.A10.2 with vLLM | Deploys Meta-Llama-3.1-8B-Instruct from OCI Object Storage on VM.GPU.A10.2 with vLLM on VM.GPU.A10.2 with 2 GPU(s). | [vllm-model-from-obj-storage.json](vllm-model-from-obj-storage.json) |
14 | | Use vLLM with a gated HuggingFace model which requires pre-authentication and passing an authentication token | meta-llama/Llama-3.2-11B-Vision (Closed Model) from Hugging Face on VM.GPU.A10.2 with vLLM | Deploys meta-llama/Llama-3.2-11B-Vision (Closed Model) from Hugging Face on VM.GPU.A10.2 with vLLM on VM.GPU.A10.2 with 2 GPU(s). | [vllm-closed-hf-model.json](vllm-closed-hf-model.json) |
15 | | Deploy open-source models from HuggingFace and have them downloaded directly on the node | NousResearch/Meta-Llama-3-8B-Instruct (Open Model) from Hugging Face on VM.GPU.A10.2 with vLLM | Deploys NousResearch/Meta-Llama-3-8B-Instruct (Open Model) from Hugging Face on VM.GPU.A10.2 with vLLM on VM.GPU.A10.2 with 2 GPU(s). | [vllm-open-hf-model.json](vllm-open-hf-model.json) |
16 | | Secure vLLM endpoint with API key authentication to control access to the inference service | NousResearch/Meta-Llama-3-8B-Instruct (Open Model) from Hugging Face on VM.GPU.A10.2 with vLLM and Endpoint API Key | Deploys NousResearch/Meta-Llama-3-8B-Instruct (Open Model) from Hugging Face on VM.GPU.A10.2 with vLLM and Endpoint API Key on VM.GPU.A10.2 with 2 GPU(s). | [vllm-open-hf-model-api-key-functionality.json](vllm-open-hf-model-api-key-functionality.json) |
17 |
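
After a deployment is ready, the blueprint exposes vLLM's OpenAI-compatible HTTP API. The request below is a minimal smoke test, assuming a hypothetical endpoint URL of the form shown and the open Meta-Llama-3-8B-Instruct sample; add an `Authorization: Bearer <key>` header if you deployed the API-key variant.

```bash
# Query the OpenAI-compatible chat completions endpoint served by vLLM (URL is a placeholder).
curl -sS https://vllm-open-hf-model.<your-domain>/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "NousResearch/Meta-Llama-3-8B-Instruct",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 64
      }'
```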
--------------------------------------------------------------------------------
/docs/sample_blueprints/llm_inference_with_vllm/vllm-closed-hf-model.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "llm_inference_nvidia",
3 | "recipe_mode": "service",
4 | "deployment_name": "vllm-closed-hf-model",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.2",
6 | "recipe_node_shape": "VM.GPU.A10.2",
7 | "recipe_container_env": [
8 | {
9 | "key": "HF_TOKEN",
10 | "value": ""
11 | }
12 | ],
13 | "recipe_replica_count": 1,
14 | "recipe_container_port": "8000",
15 | "recipe_nvidia_gpu_count": 2,
16 | "recipe_node_pool_size": 1,
17 | "recipe_node_boot_volume_size_in_gbs": 200,
18 | "recipe_container_command_args": [
19 | "--model",
20 | "meta-llama/Llama-3.2-11B-Vision",
21 | "--tensor-parallel-size",
22 | "2"
23 | ],
24 | "recipe_ephemeral_storage_size": 100,
25 | "recipe_shared_memory_volume_size_limit_in_mb": 200
26 | }
27 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/llm_inference_with_vllm/vllm-model-from-obj-storage.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "llm_inference_nvidia",
3 | "recipe_mode": "service",
4 | "deployment_name": "vllm-model-from-obj-storage",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.6.pos1",
6 | "recipe_node_shape": "VM.GPU.A10.2",
7 | "input_object_storage": [
8 | {
9 | "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/",
10 | "mount_location": "/models",
11 | "volume_size_in_gbs": 500,
12 | "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"]
13 | }
14 | ],
15 | "recipe_container_env": [
16 | {
17 | "key": "tensor_parallel_size",
18 | "value": "2"
19 | },
20 | {
21 | "key": "model_name",
22 | "value": "NousResearch/Meta-Llama-3.1-8B-Instruct"
23 | },
24 | {
25 | "key": "Model_Path",
26 | "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"
27 | }
28 | ],
29 | "recipe_replica_count": 1,
30 | "recipe_container_port": "8000",
31 | "recipe_nvidia_gpu_count": 2,
32 | "recipe_node_pool_size": 1,
33 | "recipe_node_boot_volume_size_in_gbs": 200,
34 | "recipe_container_command_args": [
35 | "--model",
36 | "$(Model_Path)",
37 | "--tensor-parallel-size",
38 | "$(tensor_parallel_size)"
39 | ],
40 | "recipe_ephemeral_storage_size": 100,
41 | "recipe_shared_memory_volume_size_limit_in_mb": 200
42 | }
43 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/llm_inference_with_vllm/vllm-open-hf-model-api-key-functionality.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "llm_inference_nvidia",
3 | "recipe_mode": "service",
4 | "deployment_name": "vllm-open-hf-model-api-key-functionality",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.2",
6 | "recipe_node_shape": "VM.GPU.A10.2",
7 | "recipe_container_env": [
8 | {
9 | "key": "VLLM_API_KEY",
10 | "value": ""
11 | }
12 | ],
13 | "recipe_replica_count": 1,
14 | "recipe_container_port": "8000",
15 | "recipe_nvidia_gpu_count": 2,
16 | "recipe_node_pool_size": 1,
17 | "recipe_node_boot_volume_size_in_gbs": 200,
18 | "recipe_container_command_args": [
19 | "--model",
20 | "NousResearch/Meta-Llama-3-8B-Instruct",
21 | "--tensor-parallel-size",
22 | "2"
23 | ],
24 | "recipe_ephemeral_storage_size": 100,
25 | "recipe_shared_memory_volume_size_limit_in_mb": 200
26 | }
27 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/llm_inference_with_vllm/vllm-open-hf-model.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "llm_inference_nvidia",
3 | "recipe_mode": "service",
4 | "deployment_name": "vllm-open-hf-model",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.2",
6 | "recipe_node_shape": "VM.GPU.A10.2",
7 | "recipe_container_env": [
8 | {
9 | "key": "tensor_parallel_size",
10 | "value": "2"
11 | },
12 | {
13 | "key": "model_name",
14 | "value": "NousResearch/Meta-Llama-3-8B-Instruct"
15 | }
16 | ],
17 | "recipe_replica_count": 1,
18 | "recipe_container_port": "8000",
19 | "recipe_nvidia_gpu_count": 2,
20 | "recipe_node_pool_size": 1,
21 | "recipe_node_boot_volume_size_in_gbs": 200,
22 | "recipe_container_command_args": [
23 | "--model",
24 | "$(model_name)",
25 | "--tensor-parallel-size",
26 | "$(tensor_parallel_size)"
27 | ],
28 | "recipe_ephemeral_storage_size": 100,
29 | "recipe_shared_memory_volume_size_limit_in_mb": 200
30 | }
31 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/lora-benchmarking/README.md:
--------------------------------------------------------------------------------
1 | # Fine-Tuning Benchmarking
2 |
3 | #### Fine-tune quantized Llama-2-70B model using MLCommons methodology for infrastructure benchmarking
4 |
5 | The fine-tuning benchmarking blueprint streamlines infrastructure benchmarking for fine-tuning using the MLCommons methodology. It fine-tunes a quantized Llama-2-70B model on a standard dataset.
6 |
7 | Once complete, benchmarking results, such as training time and resource utilization, are available in MLFlow and Grafana for easy tracking. This blueprint enables data-driven infrastructure decisions for your fine-tuning jobs.
8 |
9 | ## Pre-Filled Samples
10 |
11 | | Feature Showcase | Title | Description | Blueprint File |
12 | | ------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------ |
13 | | Benchmark LoRA fine-tuning performance using MLCommons methodology with quantized large language models | LoRA fine-tuning of quantized Llama-2-70B model on A100 node using MLCommons methodology | Deploys LoRA fine-tuning of quantized Llama-2-70B model on A100 node using MLCommons methodology on BM.GPU.A100.8 with 8 GPU(s). | [mlcommons_lora_finetune_nvidia_sample_recipe.json](mlcommons_lora_finetune_nvidia_sample_recipe.json) |
14 |
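Once your OCI AI Blueprints instance is running, one way to launch this benchmark is to POST the pre-filled sample JSON to the Blueprints API with your auth token (see the usage guide for obtaining the token). The sketch below is illustrative only: `<API_URL>` and `<token>` are placeholders from your own stack, and the `/deployment/` path is an assumption that should be checked against the API reference for your installation.

```
# Hedged sketch: submit the sample blueprint to the Blueprints API.
# <API_URL> and <token> come from your stack; the /deployment/ path is an assumption.
curl --location --request POST '<API_URL>/deployment/' \
  --header 'Authorization: Token <token>' \
  --header 'Content-Type: application/json' \
  --data-binary '@mlcommons_lora_finetune_nvidia_sample_recipe.json'
```

After the job finishes, the training time and resource utilization metrics appear in MLFlow and Grafana as described above.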
--------------------------------------------------------------------------------
/docs/sample_blueprints/lora-benchmarking/mlcommons_lora_finetune_nvidia_sample_recipe.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "mlcommons_lora_finetune_nvidia",
3 | "deployment_name": "MLCommons Finetune LORA/PEFT",
4 | "recipe_mode": "job",
5 | "recipe_node_shape": "BM.GPU.A100.8",
6 | "recipe_use_shared_node_pool": false,
7 | "recipe_nvidia_gpu_count": 8,
8 | "recipe_ephemeral_storage_size": 50,
9 | "recipe_replica_count": 1,
10 | "recipe_node_pool_size": 1,
11 | "recipe_node_boot_volume_size_in_gbs": 200,
12 | "recipe_shared_memory_volume_size_limit_in_mb": 100,
13 | "input_object_storage": [
14 | {
15 | "bucket_name": "corrino_mlcommons_llama2_70b_qkv",
16 | "mount_location": "/models",
17 | "volume_size_in_gbs": 500
18 | },
19 | {
20 | "bucket_name": "corrino_ml_commons_scrolls_dataset",
21 | "mount_location": "/dataset",
22 | "volume_size_in_gbs": 100
23 | }
24 | ],
25 | "output_object_storage": [
26 | {
27 | "bucket_name": "corrino_ml_commons_output",
28 | "mount_location": "/mlcommons_output",
29 | "volume_size_in_gbs": 200
30 | }
31 | ],
32 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:corrino-recipe-mlcommons",
33 | "recipe_container_env": [
34 | {
35 | "key": "model_name",
36 | "value": "regisss/llama2-70b-fused-qkv-mlperf"
37 | },
38 | {
39 | "key": "Model_Path",
40 | "value": "/models"
41 | },
42 | {
43 | "key": "Dataset_Path",
44 | "value": "/dataset"
45 | },
46 | {
47 | "key": "Lora_R",
48 | "value": "16"
49 | },
50 | {
51 | "key": "Lora_Alpha",
52 | "value": "32"
53 | },
54 | {
55 | "key": "Lora_Dropout",
56 | "value": "0.1"
57 | },
58 | {
59 | "key": "Max_Seq_Len",
60 | "value": "8192"
61 | },
62 | {
63 | "key": "bf16",
64 | "value": "true"
65 | },
66 | {
67 | "key": "Logging_Steps",
68 | "value": "24"
69 | },
70 | {
71 | "key": "Eval_Steps",
72 | "value": "48"
73 | },
74 | {
75 | "key": "Per_Device_Train_Batch_Size",
76 | "value": "1"
77 | },
78 | {
79 | "key": "Gradient_Accumulation_Steps",
80 | "value": "1"
81 | },
82 | {
83 | "key": "Lr_Scheduler_Type",
84 | "value": "cosine"
85 | },
86 | {
87 | "key": "Learning_Rate",
88 | "value": "0.0004"
89 | },
90 | {
91 | "key": "Weight_Decay",
92 | "value": "0.0001"
93 | },
94 | {
95 | "key": "Warmup_Ratio",
96 | "value": "0"
97 | },
98 | {
99 | "key": "Max_Grad_Norm",
100 | "value": "0.3"
101 | },
102 | {
103 | "key": "Use_Gradient_Checkpointing",
104 | "value": "true"
105 | },
106 | {
107 | "key": "Target_Eval_Loss",
108 | "value": "0.925"
109 | },
110 | {
111 | "key": "Use_Peft_Lora",
112 | "value": "true"
113 | },
114 | {
115 | "key": "Max_Steps",
116 | "value": "1024"
117 | },
118 | {
119 | "key": "Use_Flash_Attn",
120 | "value": "true"
121 | },
122 | {
123 | "key": "Seed",
124 | "value": "1234"
125 | },
126 | {
127 | "key": "Lora_Target_Modules",
128 | "value": "qkv_proj,o_proj"
129 | },
130 | {
131 | "key": "Mlflow_Exp_Name",
132 | "value": "oci_ai_blueprints_nvidia_recipe"
133 | },
134 | {
135 | "key": "Output_Dir",
136 | "value": "/mlcommons_output"
137 | }
138 | ]
139 | }
140 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/lora-fine-tuning/closed_model_open_dataset_hf.backend.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "lora_finetune_nvidia",
3 | "deployment_name": "dk_closed_model_open_dataset",
4 | "recipe_mode": "job",
5 | "recipe_node_shape": "VM.GPU.A10.2",
6 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:finetune_lora_dev",
7 | "recipe_nvidia_gpu_count": 2,
8 | "recipe_ephemeral_storage_size": 300,
9 | "recipe_replica_count": 1,
10 | "recipe_node_pool_size": 1,
11 | "recipe_node_boot_volume_size_in_gbs": 500,
12 | "recipe_shared_memory_volume_size_limit_in_mb": 100,
13 | "recipe_container_env": [
14 | {
15 | "key": "Mlflow_Endpoint",
16 | "value": "http://mlflow.cluster-tools.svc.cluster.local:5000"
17 | },
18 | {
19 | "key": "Mlflow_Exp_Name",
20 | "value": "oci_ai_blueprints_nvidia_recipe"
21 | },
22 | {
23 | "key": "Mlflow_Run_Name",
24 | "value": "llama-3.2-1B-Instruct-scrolls-gov_report"
25 | },
26 | {
27 | "key": "Hf_Token",
28 | "value": ""
29 | },
30 | {
31 | "key": "Download_Dataset_From_Hf",
32 | "value": "true"
33 | },
34 | {
35 | "key": "Dataset_Name",
36 | "value": "tau/scrolls"
37 | },
38 | {
39 | "key": "Dataset_Sub_Name",
40 | "value": "gov_report"
41 | },
42 | {
43 | "key": "Dataset_Column_To_Use",
44 | "value": "None"
45 | },
46 | {
47 | "key": "Dataset_Path",
48 | "value": "/workspace/datasets"
49 | },
50 | {
51 | "key": "Download_Model_From_Hf",
52 | "value": "true"
53 | },
54 | {
55 | "key": "Model_Name",
56 | "value": "meta-llama/Llama-3.2-1B-Instruct"
57 | },
58 | {
59 | "key": "Model_Path",
60 | "value": "/workspace/models"
61 | },
62 | {
63 | "key": "Max_Model_Length",
64 | "value": "8192"
65 | },
66 | {
67 | "key": "Resume_From_Checkpoint",
68 | "value": "false"
69 | },
70 | {
71 | "key": "Checkpoint_Path",
72 | "value": "/checkpoint"
73 | },
74 | {
75 | "key": "Lora_R",
76 | "value": "8"
77 | },
78 | {
79 | "key": "Lora_Alpha",
80 | "value": "32"
81 | },
82 | {
83 | "key": "Lora_Dropout",
84 | "value": "0.1"
85 | },
86 | {
87 | "key": "Lora_Target_Modules",
88 | "value": "q_proj,up_proj,o_proj,k_proj,down_proj,gate_proj,v_proj"
89 | },
90 | {
91 | "key": "Bias",
92 | "value": "none"
93 | },
94 | {
95 | "key": "Task_Type",
96 | "value": "CAUSAL_LM"
97 | },
98 | {
99 | "key": "Per_Device_Train_Batch_Size",
100 | "value": "1"
101 | },
102 | {
103 | "key": "Gradient_Accumulation_Steps",
104 | "value": "1"
105 | },
106 | {
107 | "key": "Warmup_Steps",
108 | "value": "2"
109 | },
110 | {
111 | "key": "Save_Steps",
112 | "value": "100"
113 | },
114 | {
115 | "key": "Learning_Rate",
116 | "value": "0.0002"
117 | },
118 | {
119 | "key": "Fp16",
120 | "value": "true"
121 | },
122 | {
123 | "key": "Logging_Steps",
124 | "value": "1"
125 | },
126 | {
127 | "key": "Output_Dir",
128 | "value": "/tunedmodels/Llama-3.1-8B-english_quotes"
129 | },
130 | {
131 | "key": "Optim",
132 | "value": "paged_adamw_8bit"
133 | },
134 | {
135 | "key": "Number_of_Training_Epochs",
136 | "value": "2"
137 | },
138 | {
139 | "key": "Require_Persistent_Output_Dir",
140 | "value": "true"
141 | }
142 | ],
143 | "output_object_storage": [
144 | {
145 | "bucket_name": "corrino_tuned_hf_oss_models",
146 | "mount_location": "/tunedmodels",
147 | "volume_size_in_gbs": 500
148 | }
149 | ]
150 | }
151 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/lora-fine-tuning/open_model_open_dataset_hf.backend.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "lora_finetune_nvidia",
3 | "deployment_name": "dk_open_model_open_dataset",
4 | "recipe_mode": "job",
5 | "recipe_node_shape": "VM.GPU.A10.2",
6 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:finetune_lora_dev",
7 | "recipe_nvidia_gpu_count": 2,
8 | "recipe_ephemeral_storage_size": 300,
9 | "recipe_replica_count": 1,
10 | "recipe_node_pool_size": 1,
11 | "recipe_node_boot_volume_size_in_gbs": 500,
12 | "recipe_shared_memory_volume_size_limit_in_mb": 100,
13 | "recipe_container_env": [
14 | {
15 | "key": "Mlflow_Endpoint",
16 | "value": "http://mlflow.cluster-tools.svc.cluster.local:5000"
17 | },
18 | {
19 | "key": "Mlflow_Exp_Name",
20 | "value": "oci_ai_blueprints_nvidia_recipe"
21 | },
22 | {
23 | "key": "Mlflow_Run_Name",
24 | "value": "oci_ai_blueprints_run"
25 | },
26 | {
27 | "key": "Hf_Token",
28 | "value": "None"
29 | },
30 | {
31 | "key": "Download_Dataset_From_Hf",
32 | "value": "true"
33 | },
34 | {
35 | "key": "Dataset_Name",
36 | "value": "Abirate/english_quotes"
37 | },
38 | {
39 | "key": "Dataset_Sub_Name",
40 | "value": "None"
41 | },
42 | {
43 | "key": "Dataset_Column_To_Use",
44 | "value": "None"
45 | },
46 | {
47 | "key": "Dataset_Path",
48 | "value": "/workspace/datasets"
49 | },
50 | {
51 | "key": "Download_Model_From_Hf",
52 | "value": "true"
53 | },
54 | {
55 | "key": "Model_Name",
56 | "value": "NousResearch/Meta-Llama-3.1-8B"
57 | },
58 | {
59 | "key": "Model_Path",
60 | "value": "/workspace/models"
61 | },
62 | {
63 | "key": "Max_Model_Length",
64 | "value": "8192"
65 | },
66 | {
67 | "key": "Resume_From_Checkpoint",
68 | "value": "false"
69 | },
70 | {
71 | "key": "Checkpoint_Path",
72 | "value": "/checkpoint"
73 | },
74 | {
75 | "key": "Lora_R",
76 | "value": "8"
77 | },
78 | {
79 | "key": "Lora_Alpha",
80 | "value": "32"
81 | },
82 | {
83 | "key": "Lora_Dropout",
84 | "value": "0.1"
85 | },
86 | {
87 | "key": "Lora_Target_Modules",
88 | "value": "q_proj,up_proj,o_proj,k_proj,down_proj,gate_proj,v_proj"
89 | },
90 | {
91 | "key": "Bias",
92 | "value": "none"
93 | },
94 | {
95 | "key": "Task_Type",
96 | "value": "CAUSAL_LM"
97 | },
98 | {
99 | "key": "Per_Device_Train_Batch_Size",
100 | "value": "1"
101 | },
102 | {
103 | "key": "Gradient_Accumulation_Steps",
104 | "value": "1"
105 | },
106 | {
107 | "key": "Warmup_Steps",
108 | "value": "2"
109 | },
110 | {
111 | "key": "Save_Steps",
112 | "value": "100"
113 | },
114 | {
115 | "key": "Learning_Rate",
116 | "value": "0.0002"
117 | },
118 | {
119 | "key": "Fp16",
120 | "value": "true"
121 | },
122 | {
123 | "key": "Logging_Steps",
124 | "value": "1"
125 | },
126 | {
127 | "key": "Output_Dir",
128 | "value": "/tunedmodels/Llama-3.1-8B-english_quotes"
129 | },
130 | {
131 | "key": "Optim",
132 | "value": "paged_adamw_8bit"
133 | },
134 | {
135 | "key": "Number_of_Training_Epochs",
136 | "value": "2"
137 | },
138 | {
139 | "key": "Require_Persistent_Output_Dir",
140 | "value": "true"
141 | }
142 | ],
143 | "output_object_storage": [
144 | {
145 | "bucket_name": "corrino_tuned_hf_oss_models",
146 | "mount_location": "/tunedmodels",
147 | "volume_size_in_gbs": 500
148 | }
149 | ]
150 | }
151 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/mig_multi_instance_gpu/mig_enabled_shared_node_pool.json:
--------------------------------------------------------------------------------
1 | {
2 | "deployment_name": "H100_pool_mig",
3 | "recipe_mode": "shared_node_pool",
4 | "shared_node_pool_size": 1,
5 | "shared_node_pool_shape": "BM.GPU.H100.8",
6 | "shared_node_pool_boot_volume_size_in_gbs": 1000,
7 | "shared_node_pool_mig_config": "all-1g.20gb"
8 | }
9 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/mig_multi_instance_gpu/mig_inference_multiple_replicas.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "llm_inference_nvidia",
3 | "recipe_mode": "service",
4 | "deployment_name": "autoscale_mig",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.5",
6 | "recipe_node_shape": "BM.GPU.H100.8",
7 | "recipe_container_env": [
8 | {
9 | "key": "tensor_parallel_size",
10 | "value": "1"
11 | },
12 | {
13 | "key": "HF_TOKEN",
14 | "value": ""
15 | }
16 | ],
17 | "recipe_replica_count": 5,
18 | "recipe_container_port": "8000",
19 | "recipe_nvidia_gpu_count": 1,
20 | "recipe_use_shared_node_pool": true,
21 | "mig_resource_request": "1g.10gb",
22 | "recipe_container_command_args": [
23 | "--model",
24 | "meta-llama/Llama-3.2-3B-Instruct",
25 | "--dtype",
26 | "bfloat16",
27 | "--tensor-parallel-size",
28 | "$(tensor_parallel_size)",
29 | "--gpu-memory-utilization",
30 | "0.99",
31 | "--max-model-len",
32 | "16384"
33 | ],
34 | "recipe_ephemeral_storage_size": 30,
35 | "recipe_node_boot_volume_size_in_gbs": 300,
36 | "recipe_shared_memory_volume_size_limit_in_mb": 1000,
37 | "recipe_startup_probe_params": {
38 | "failure_threshold": 30,
39 | "endpoint_path": "/health",
40 | "port": 8000,
41 | "scheme": "HTTP",
42 | "initial_delay_seconds": 10,
43 | "period_seconds": 2,
44 | "success_threshold": 1,
45 | "timeout_seconds": 1
46 | },
47 | "recipe_liveness_probe_params": {
48 | "failure_threshold": 3,
49 | "endpoint_path": "/health",
50 | "port": 8000,
51 | "scheme": "HTTP",
52 | "initial_delay_seconds": 65,
53 | "period_seconds": 600,
54 | "success_threshold": 1,
55 | "timeout_seconds": 1
56 | },
57 | "recipe_pod_autoscaling_params": {
58 | "min_replicas": 5,
59 | "max_replicas": 10
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/mig_multi_instance_gpu/mig_inference_single_replica.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "llm_inference_nvidia",
3 | "recipe_mode": "service",
4 | "deployment_name": "autoscale_mig",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.5",
6 | "recipe_node_shape": "BM.GPU.H100.8",
7 | "recipe_container_env": [
8 | {
9 | "key": "tensor_parallel_size",
10 | "value": "1"
11 | },
12 | {
13 | "key": "HF_TOKEN",
14 | "value": ""
15 | }
16 | ],
17 | "recipe_replica_count": 1,
18 | "recipe_container_port": "8000",
19 | "recipe_nvidia_gpu_count": 1,
20 | "recipe_use_shared_node_pool": true,
21 | "mig_resource_request": "1g.20gb",
22 | "recipe_container_command_args": [
23 | "--model",
24 | "meta-llama/Llama-3.2-3B-Instruct",
25 | "--dtype",
26 | "bfloat16",
27 | "--tensor-parallel-size",
28 | "$(tensor_parallel_size)",
29 | "--gpu-memory-utilization",
30 | "0.99",
31 | "--max-model-len",
32 | "16384"
33 | ],
34 | "recipe_ephemeral_storage_size": 30,
35 | "recipe_node_boot_volume_size_in_gbs": 300,
36 | "recipe_shared_memory_volume_size_limit_in_mb": 1000,
37 | "recipe_startup_probe_params": {
38 | "failure_threshold": 30,
39 | "endpoint_path": "/health",
40 | "port": 8000,
41 | "scheme": "HTTP",
42 | "initial_delay_seconds": 10,
43 | "period_seconds": 2,
44 | "success_threshold": 1,
45 | "timeout_seconds": 1
46 | },
47 | "recipe_liveness_probe_params": {
48 | "failure_threshold": 3,
49 | "endpoint_path": "/health",
50 | "port": 8000,
51 | "scheme": "HTTP",
52 | "initial_delay_seconds": 65,
53 | "period_seconds": 600,
54 | "success_threshold": 1,
55 | "timeout_seconds": 1
56 | },
57 | "recipe_pod_autoscaling_params": {
58 | "min_replicas": 1,
59 | "max_replicas": 50
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/mig_multi_instance_gpu/mig_inference_single_replica_10gb.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "llm_inference_nvidia",
3 | "recipe_mode": "service",
4 | "deployment_name": "autoscale_mig",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.5",
6 | "recipe_node_shape": "BM.GPU.H100.8",
7 | "recipe_container_env": [
8 | {
9 | "key": "tensor_parallel_size",
10 | "value": "1"
11 | },
12 | {
13 | "key": "HF_TOKEN",
14 | "value": ""
15 | }
16 | ],
17 | "recipe_replica_count": 1,
18 | "recipe_container_port": "8000",
19 | "recipe_nvidia_gpu_count": 1,
20 | "recipe_use_shared_node_pool": true,
21 | "mig_resource_request": "1g.10gb",
22 | "recipe_container_command_args": [
23 | "--model",
24 | "meta-llama/Llama-3.2-3B-Instruct",
25 | "--dtype",
26 | "bfloat16",
27 | "--tensor-parallel-size",
28 | "$(tensor_parallel_size)",
29 | "--gpu-memory-utilization",
30 | "0.99",
31 | "--max-model-len",
32 | "16384"
33 | ],
34 | "recipe_ephemeral_storage_size": 30,
35 | "recipe_node_boot_volume_size_in_gbs": 300,
36 | "recipe_shared_memory_volume_size_limit_in_mb": 1000,
37 | "recipe_startup_probe_params": {
38 | "failure_threshold": 30,
39 | "endpoint_path": "/health",
40 | "port": 8000,
41 | "scheme": "HTTP",
42 | "initial_delay_seconds": 10,
43 | "period_seconds": 2,
44 | "success_threshold": 1,
45 | "timeout_seconds": 1
46 | },
47 | "recipe_liveness_probe_params": {
48 | "failure_threshold": 3,
49 | "endpoint_path": "/health",
50 | "port": 8000,
51 | "scheme": "HTTP",
52 | "initial_delay_seconds": 65,
53 | "period_seconds": 600,
54 | "success_threshold": 1,
55 | "timeout_seconds": 1
56 | },
57 | "recipe_pod_autoscaling_params": {
58 | "min_replicas": 1,
59 | "max_replicas": 50
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/mig_multi_instance_gpu/mig_slices.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oracle-quickstart/oci-ai-blueprints/31c64899055982e73cb2c1b95345faafe6d5c7ff/docs/sample_blueprints/mig_multi_instance_gpu/mig_slices.png
--------------------------------------------------------------------------------
/docs/sample_blueprints/mig_multi_instance_gpu/mig_update_node_with_node_name.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_mode": "update",
3 | "deployment_name": "all-1g10gb",
4 | "recipe_node_name": "10.0.10.138",
5 | "shared_node_pool_mig_config": "all-1g.10gb"
6 | }
7 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/mig_multi_instance_gpu/mig_update_shared_pool_with_node_pool_name.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_mode": "update",
3 | "deployment_name": "all-2g-20gb",
4 | "recipe_node_pool_name": "h100migpool",
5 | "shared_node_pool_mig_config": "all-2g.20gb"
6 | }
7 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/model_storage/download_closed_hf_model_to_object_storage.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "example",
3 | "recipe_mode": "job",
4 | "deployment_name": "model_to_object",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:hf_downloader_v1",
6 | "recipe_container_command_args": [
7 | "meta-llama/Llama-3.2-90B-Vision-Instruct",
8 | "--local-dir",
9 | "/models",
10 | "--max-workers",
11 | "4",
12 | "--token",
13 | ""
14 | ],
15 | "recipe_container_port": "5678",
16 | "recipe_node_shape": "VM.Standard.E4.Flex",
17 | "recipe_node_pool_size": 1,
18 | "recipe_flex_shape_ocpu_count": 4,
19 | "recipe_flex_shape_memory_size_in_gbs": 64,
20 | "recipe_node_boot_volume_size_in_gbs": 500,
21 | "recipe_ephemeral_storage_size": 450,
22 | "output_object_storage": [
23 | {
24 | "bucket_name": "llama3290Bvisioninstruct",
25 | "mount_location": "/models",
26 | "volume_size_in_gbs": 450
27 | }
28 | ]
29 | }
30 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/model_storage/download_open_hf_model_to_object_storage.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "example",
3 | "recipe_mode": "job",
4 | "deployment_name": "model_to_object",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:hf_downloader_v1",
6 | "recipe_container_command_args": [
7 | "NousResearch/Meta-Llama-3.1-405B-FP8",
8 | "--local-dir",
9 | "/models",
10 | "--max-workers",
11 | "16"
12 | ],
13 | "recipe_container_port": "5678",
14 | "recipe_node_shape": "VM.Standard.E4.Flex",
15 | "recipe_node_pool_size": 1,
16 | "recipe_flex_shape_ocpu_count": 16,
17 | "recipe_flex_shape_memory_size_in_gbs": 256,
18 | "recipe_node_boot_volume_size_in_gbs": 1000,
19 | "recipe_ephemeral_storage_size": 900,
20 | "output_object_storage": [
21 | {
22 | "bucket_name": "nousllama31405bfp8",
23 | "mount_location": "/models",
24 | "volume_size_in_gbs": 800
25 | }
26 | ]
27 | }
28 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/multi-node-inference/multinode_inference_BM_A10.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "vllm_multinode_inference",
3 | "recipe_mode": "service",
4 | "deployment_name": "multinode_inference",
5 | "recipe_node_shape": "BM.GPU.A10.4",
6 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:ray2430_vllmv083",
7 | "input_object_storage": [
8 | {
9 | "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/",
10 | "mount_location": "/models",
11 | "volume_size_in_gbs": 500,
12 | "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"]
13 | }
14 | ],
15 | "recipe_replica_count": 1,
16 | "recipe_nvidia_gpu_count": 4,
17 | "recipe_ephemeral_storage_size": 150,
18 | "recipe_shared_memory_volume_size_limit_in_mb": 10000,
19 | "recipe_container_port": "8000",
20 | "recipe_use_shared_node_pool": true,
21 | "multinode_num_nodes_to_use_from_shared_pool": 2,
22 | "recipe_container_command_args": [
23 | "--port",
24 | "8000",
25 | "--model",
26 | "/models",
27 | "--tensor-parallel-size",
28 | "4",
29 | "--pipeline-parallel-size",
30 | "2",
31 | "--gpu-memory-utilization",
32 | "0.90",
33 | "--distributed-executor-backend",
34 | "ray"
35 | ],
36 | "recipe_readiness_probe_params": {
37 | "endpoint_path": "/health",
38 | "port": 8000,
39 | "initial_delay_seconds": 20,
40 | "period_seconds": 10
41 | }
42 | }
43 |
44 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/multi-node-inference/multinode_inference_VM_A10.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "vllm_multinode_inference",
3 | "recipe_mode": "service",
4 | "deployment_name": "multinode_inference",
5 | "recipe_node_shape": "VM.GPU.A10.2",
6 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:ray2430_vllmv083",
7 | "input_object_storage": [
8 | {
9 | "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/",
10 | "mount_location": "/models",
11 | "volume_size_in_gbs": 500,
12 | "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"]
13 | }
14 | ],
15 | "recipe_replica_count": 1,
16 | "recipe_nvidia_gpu_count": 2,
17 | "recipe_ephemeral_storage_size": 150,
18 | "recipe_shared_memory_volume_size_limit_in_mb": 10000,
19 | "recipe_container_port": "8000",
20 | "recipe_use_shared_node_pool": true,
21 | "multinode_num_nodes_to_use_from_shared_pool": 2,
22 | "recipe_container_command_args": [
23 | "--port",
24 | "8000",
25 | "--model",
26 | "/models",
27 | "--tensor-parallel-size",
28 | "2",
29 | "--pipeline-parallel-size",
30 | "2",
31 | "--gpu-memory-utilization",
32 | "0.90",
33 | "--distributed-executor-backend",
34 | "ray"
35 | ],
36 | "recipe_readiness_probe_params": {
37 | "endpoint_path": "/health",
38 | "port": 8000,
39 | "initial_delay_seconds": 20,
40 | "period_seconds": 10
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/shared_node_pools/shared_node_pool_A10_BM.json:
--------------------------------------------------------------------------------
1 | {
2 | "deployment_name": "BM.GPU.A10.4 shared pool",
3 | "recipe_mode": "shared_node_pool",
4 | "shared_node_pool_size": 2,
5 | "shared_node_pool_shape": "BM.GPU.A10.4",
6 | "shared_node_pool_boot_volume_size_in_gbs": 500
7 | }
8 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/shared_node_pools/shared_node_pool_A10_VM.json:
--------------------------------------------------------------------------------
1 | {
2 | "deployment_name": "VM.GPU.A10.2 shared pool",
3 | "recipe_mode": "shared_node_pool",
4 | "shared_node_pool_size": 2,
5 | "shared_node_pool_shape": "VM.GPU.A10.2",
6 | "shared_node_pool_boot_volume_size_in_gbs": 500
7 | }
8 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/shared_node_pools/vllm_inference_sample_shared_pool_blueprint.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "llm_inference_nvidia",
3 | "recipe_mode": "service",
4 | "deployment_name": "vLLM Inference Deployment",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.6.pos1",
6 | "recipe_node_shape": "BM.GPU.A10.4",
7 | "input_object_storage": [
8 | {
9 | "par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/",
10 | "mount_location": "/models",
11 | "volume_size_in_gbs": 500,
12 | "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"]
13 | }
14 | ],
15 | "recipe_container_env": [
16 | {
17 | "key": "tensor_parallel_size",
18 | "value": "2"
19 | },
20 | {
21 | "key": "model_name",
22 | "value": "NousResearch/Meta-Llama-3.1-8B-Instruct"
23 | },
24 | {
25 | "key": "Model_Path",
26 | "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"
27 | }
28 | ],
29 | "recipe_replica_count": 1,
30 | "recipe_container_port": "8000",
31 | "recipe_nvidia_gpu_count": 2,
32 | "recipe_use_shared_node_pool": true,
33 | "recipe_node_boot_volume_size_in_gbs": 200,
34 | "recipe_container_command_args": [
35 | "--model",
36 | "$(Model_Path)",
37 | "--tensor-parallel-size",
38 | "$(tensor_parallel_size)"
39 | ],
40 | "recipe_ephemeral_storage_size": 100,
41 | "recipe_shared_memory_volume_size_limit_in_mb": 1000
42 | }
43 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/startup_liveness_readiness_probes/autoscale_with_fss.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "llm_inference_nvidia",
3 | "recipe_mode": "service",
4 | "deployment_name": "autoscale_with_fss",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.5",
6 | "recipe_node_shape": "VM.GPU.A10.2",
7 | "recipe_container_env": [
8 | {
9 | "key": "tensor_parallel_size",
10 | "value": "1"
11 | },
12 | {
13 | "key": "Model_Path",
14 | "value": "/models/models/meta-llama/Llama-3.2-1B-Instruct"
15 | }
16 | ],
17 | "recipe_replica_count": 1,
18 | "recipe_container_port": "8000",
19 | "recipe_nvidia_gpu_count": 1,
20 | "recipe_container_command_args": [
21 | "--model",
22 | "$(Model_Path)",
23 | "--tensor-parallel-size",
24 | "$(tensor_parallel_size)",
25 | "--gpu-memory-utilization",
26 | "0.99",
27 | "--max-model-len",
28 | "1024"
29 | ],
30 | "recipe_ephemeral_storage_size": 200,
31 | "recipe_node_boot_volume_size_in_gbs": 300,
32 | "recipe_node_pool_size": 1,
33 | "recipe_shared_memory_volume_size_limit_in_mb": 200,
34 | "recipe_startup_probe_params": {
35 | "failure_threshold": 30,
36 | "endpoint_path": "/health",
37 | "port": 8000,
38 | "scheme": "HTTP",
39 | "initial_delay_seconds": 10,
40 | "period_seconds": 2,
41 | "success_threshold": 1,
42 | "timeout_seconds": 1
43 | },
44 | "recipe_liveness_probe_params": {
45 | "failure_threshold": 3,
46 | "endpoint_path": "/health",
47 | "port": 8000,
48 | "scheme": "HTTP",
49 | "initial_delay_seconds": 65,
50 | "period_seconds": 600,
51 | "success_threshold": 1,
52 | "timeout_seconds": 1
53 | },
54 | "recipe_pod_autoscaling_params": {
55 | "min_replicas": 1,
56 | "max_replicas": 4
57 | },
58 | "recipe_node_autoscaling_params": {
59 | "min_nodes": 1,
60 | "max_nodes": 2
61 | },
62 | "input_file_system": [
63 | {
64 | "file_system_ocid": "ocid1.filesystem.oc1.iad.aaaaaaaaaaklirslnfqwillqojxwiotjmfsc2ylefuzqaaaa",
65 | "mount_target_ocid": "ocid1.mounttarget.oc1.iad.aaaaacvipp3o7rlwnfqwillqojxwiotjmfsc2ylefuzqaaaa",
66 | "mount_location": "/models",
67 | "volume_size_in_gbs": 50
68 | }
69 | ]
70 | }
71 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/teams/create_job_with_team.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "healthcheck",
3 | "recipe_mode": "job",
4 | "deployment_name": "create_job_with_team",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:healthcheck_v0.3",
6 | "recipe_node_shape": "VM.GPU.A10.2",
7 | "recipe_use_shared_node_pool": true,
8 | "recipe_team_info": {
9 | "team_name": "randomteam"
10 | },
11 | "output_object_storage": [
12 | {
13 | "bucket_name": "healthcheck2",
14 | "mount_location": "/healthcheck_results",
15 | "volume_size_in_gbs": 20
16 | }
17 | ],
18 | "recipe_container_command_args": [
19 | "--dtype",
20 | "float16",
21 | "--output_dir",
22 | "/healthcheck_results",
23 | "--expected_gpus",
24 | "A10:2,A100:0,H100:0"
25 | ],
26 | "recipe_replica_count": 1,
27 | "recipe_nvidia_gpu_count": 2,
28 | "recipe_node_pool_size": 1,
29 | "recipe_node_boot_volume_size_in_gbs": 200,
30 | "recipe_ephemeral_storage_size": 100,
31 | "recipe_shared_memory_volume_size_limit_in_mb": 1000,
32 | "recipe_container_cpu_count": 4,
33 | "recipe_container_memory_size": 20
34 | }
35 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/teams/create_team.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_mode": "team",
3 | "deployment_name": "create_team",
4 | "team": {
5 | "team_name": "randomteam",
6 | "priority_threshold": 100,
7 | "quotas": [
8 | {
9 | "shape_name": "BM.GPU.H100.8",
10 | "cpu_nominal_quota": "10",
11 | "cpu_borrowing_limit": "4",
12 | "cpu_lending_limit": "4",
13 | "mem_nominal_quota": "10",
14 | "mem_borrowing_limit": "4",
15 | "mem_lending_limit": "4",
16 | "gpu_nominal_quota": "10",
17 | "gpu_borrowing_limit": "4",
18 | "gpu_lending_limit": "4"
19 | },
20 | {
21 | "shape_name": "VM.GPU.A10.2",
22 | "cpu_nominal_quota": "10",
23 | "cpu_borrowing_limit": "4",
24 | "cpu_lending_limit": "4",
25 | "mem_nominal_quota": "10",
26 | "mem_borrowing_limit": "4",
27 | "mem_lending_limit": "4",
28 | "gpu_nominal_quota": "10",
29 | "gpu_borrowing_limit": "4",
30 | "gpu_lending_limit": "4"
31 | }
32 | ]
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/using_rdma_enabled_node_pools/rdma_distributed_inference.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "llm_inference_nvidia",
3 | "recipe_mode": "service",
4 | "deployment_name": "405b",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:ray2430_vllmv083",
6 | "recipe_node_shape": "BM.GPU.H100.8",
7 | "recipe_replica_count": 1,
8 | "recipe_container_port": "8000",
9 | "recipe_nvidia_gpu_count": 8,
10 | "recipe_use_shared_node_pool": true,
11 | "multinode_rdma_enabled_in_shared_pool": true,
12 | "multinode_num_nodes_to_use_from_shared_pool": 2,
13 | "input_object_storage": [
14 | {
15 | "par": "https://iduyx1qnmway.objectstorage.eu-frankfurt-1.oci.customer-oci.com/p/7N2O5JFirNX_CG70t-HPILzHvlTMP4FC9f_eauJVECosqNafIYxwcDwhItQHvaDK/n/iduyx1qnmway/b/llama31405binstruct/o/",
16 | "mount_location": "/models",
17 | "volume_size_in_gbs": 500
18 | }
19 | ],
20 | "recipe_container_env": [
21 | {"key": "NCCL_DEBUG", "value": "INFO"},
22 | {"key": "NCCL_DEBUG_SUBSYS", "value": "INIT,NET,ENV"}
23 | ],
24 | "recipe_readiness_probe_params": {
25 | "endpoint_path": "/health",
26 | "port": 8000,
27 | "initial_delay_seconds": 20,
28 | "period_seconds": 10
29 | },
30 | "recipe_container_command_args": [
31 | "--port",
32 | "8000",
33 | "--model",
34 | "/models",
35 | "--tensor-parallel-size",
36 | "8",
37 | "--gpu-memory-utilization",
38 | "0.90",
39 | "--pipeline-parallel-size",
40 | "2",
41 | "--distributed-executor-backend",
42 | "ray"
43 | ],
44 | "recipe_ephemeral_storage_size": 100,
45 | "recipe_shared_memory_volume_size_limit_in_mb": 10000
46 | }
47 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/using_rdma_enabled_node_pools/rdma_shared_node_pool.json:
--------------------------------------------------------------------------------
1 | {
2 | "deployment_name": "H100_rdma_pool",
3 | "recipe_mode": "shared_node_pool",
4 | "shared_node_pool_size": 2,
5 | "shared_node_pool_shape": "BM.GPU.H100.8",
6 | "shared_node_pool_boot_volume_size_in_gbs": 1000,
7 | "recipe_availability_domain": "TrcQ:EU-FRANKFURT-1-AD-3",
8 | "recipe_node_image_ocid": "ocid1.image.oc1.eu-frankfurt-1.aaaaaaaakhpy5kt3p6gjmeqbasnndemp6aetlnbkm57hohrkgksuh4476llq",
9 | "multinode_rdma_enabled_in_shared_pool": true
10 | }
11 |
--------------------------------------------------------------------------------
/docs/sample_blueprints/using_rdma_enabled_node_pools/rdma_update_nodes.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_mode": "update",
3 | "deployment_name": "startupaddnode1",
4 | "recipe_node_name": "10.0.10.164",
5 | "recipe_node_labels": {
6 | "corrino": "h100pool",
7 | "corrino/pool-shared-any": "true",
8 | "corrino/rdma": "true"
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/docs/usage_guide.md:
--------------------------------------------------------------------------------
1 | # Ways to Access OCI AI Blueprints
2 |
3 | Once you've installed OCI AI Blueprints into your tenancy (see [here](../INSTALLING_ONTO_EXISTING_CLUSTER_README.md) for the installation steps), you can work with OCI AI Blueprints in three ways:
4 |
5 | ## **Option #1: OCI AI Blueprints UI Portal:**
6 |
7 | 1. Inside the OCI Console > Resource Manager, select the stack you created for OCI AI Blueprints
8 |
9 | 2. Go to the "Application Information" tab under Stack Details.
10 |
11 | 3. Copy the "Portal URL" into your browser
12 |
13 | 4. Upon first access, you must log in, providing the "Admin Username" and "Admin Password" from the "Application Information" tab under Stack Details
14 |
15 | ## **Option #2: OCI AI Blueprints APIs via Web:**
16 |
17 | The OCI AI Blueprints APIs are accessible via a web interface. Each OCI AI Blueprints API resource is rendered as a human-friendly HTML page. These pages allow for easy browsing of resources, as well as forms for submitting data to the resources using `POST`, `PUT`, and `DELETE`.
18 |
19 | 1. Inside the OCI Console > Resource Manager, select the stack you created for OCI AI Blueprints
20 |
21 | 2. Go to the "Application Information" tab under Stack Details.
22 |
23 | 3. Copy the "OCI AI Blueprints API URL" into your browser
24 |
25 | 4. Upon first access, you must log in, providing the "Admin Username" and "Admin Password" from the "Application Information" tab under Stack Details
26 |
27 | 5. Now, you can view and access all API endpoints for your instance of OCI AI Blueprints
28 |
29 | ## **Option #3: OCI AI Blueprints APIs via Curl/Postman**
30 |
31 | You can interact with the APIs locally using Postman, curl or any API platform by doing the following:
32 |
33 | 1. Get your `OCI AI Blueprints API URL` (referenced as **API URL** going forward), `Admin Username` (referenced as **username** going forward), and `Admin Password` (referenced as **password** going forward) by following steps 1 - 3 above in Option #2
34 | 2. Once you have your username, password, and API URL, make a POST request to the `/login/` API to get your auth token:
35 |
36 | ```
37 | # <API_URL>, <username>, and <password> come from the "Application Information" tab
38 | curl --location --request POST '<API_URL>/login/' \
39 | --form 'username="<username>"' \
40 | --form 'password="<password>"'
41 | ```
42 |
43 | The return JSON will be in the following format:
44 |
45 | ```
46 | {
47 |   "token": "<token>",
48 |   "is_new": true
49 | }
53 | ```
54 |
55 | 3. Copy the `token` from the response
56 | 4. Now you can access any OCI AI Blueprints API by passing in this `token` for Authorization
57 |
58 | ### Curl Example
59 |
60 | ```
61 | curl --location --request GET '<API_URL>/oci_shapes/' \
62 | --header 'Authorization: Token <token>'
63 | ```
64 |
65 | ### Postman
66 |
67 | 1. Click on the Authorization Tab for the request
68 | 2. Select Auth Type = OAuth 2.0
69 | 3. Paste your token value
70 | 4. Leave Header Prefix as "Token"
71 |
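Putting Option #3 together, here is a minimal end-to-end shell sketch that logs in, extracts the token, and makes an authorized request. It uses only the `/login/` and `/oci_shapes/` endpoints shown above; the placeholder values come from your stack's "Application Information" tab, and the use of `jq` to parse the JSON response is an assumption (any JSON parser will do).

```
# Minimal sketch of Option #3: log in once, then reuse the token.
API_URL="<API_URL>"   # your "OCI AI Blueprints API URL"

# POST to /login/ and extract the token (jq is assumed; substitute your preferred JSON parser).
TOKEN=$(curl -s --location --request POST "$API_URL/login/" \
  --form 'username="<username>"' \
  --form 'password="<password>"' | jq -r '.token')

# Reuse the token on any endpoint, e.g. listing available shapes.
curl -s --location --request GET "$API_URL/oci_shapes/" \
  --header "Authorization: Token $TOKEN"
```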
72 | ## **API Reference Documentation**
73 |
74 | [API Reference Documentation](./api_documentation.md)
75 |
--------------------------------------------------------------------------------
/docs/versions/PortalVersions.md:
--------------------------------------------------------------------------------
1 | # Portal Versions
2 |
3 |
4 | v5.0.0
5 |
6 | **React & Framework**
7 |
8 | - react: ^19.0.0
9 | - react-dom: ^19.0.0
10 | - next: 15.2.0-canary.74
11 | - next-themes: ^0.4.4
12 |
13 | **UI Primitives (@radix-ui)**
14 |
15 | - @radix-ui/react-alert-dialog: ^1.1.11
16 | - @radix-ui/react-dialog: ^1.1.6
17 | - @radix-ui/react-label: ^2.1.2
18 | - @radix-ui/react-popover: ^1.1.6
19 | - @radix-ui/react-scroll-area: ^1.2.4
20 | - @radix-ui/react-slot: ^1.2.0
21 | - @radix-ui/react-tabs: ^1.1.3
22 | - @radix-ui/react-tooltip: ^1.1.8
23 |
24 | **Components & Utilities**
25 |
26 | - class-variance-authority: ^0.7.1
27 | - clsx: ^2.1.1
28 | - js-cookie: ^3.0.5
29 | - lucide-react: ^0.476.0
30 | - react-markdown: ^10.1.0
31 | - sonner: ^2.0.1
32 | - tailwind-merge: ^3.0.2
33 | - tailwindcss-animate: ^1.0.7
34 |
35 | **OCI SDK**
36 |
37 | - oci-common: ^2.104.0
38 | - oci-core: ^2.104.0
39 | - oci-identity: ^2.104.0
40 | - oci-objectstorage: ^2.104.0
41 |
42 | ---
43 |
44 | **DevDependencies**
45 |
46 | **Build & Lint**
47 |
48 | - typescript: ^5
49 | - tailwindcss: ^4
50 | - @tailwindcss/postcss: ^4
51 | - @tailwindcss/typography: ^0.5.16
52 | - eslint: ^9
53 | - @eslint/eslintrc: ^3
54 | - eslint-config-next: 15.2.0-canary.74
55 |
56 | **Type Definitions**
57 |
58 | - @types/node: ^20
59 | - @types/react: ^19
60 | - @types/react-dom: ^19
61 | - @types/js-cookie: ^3.0.6
62 |
63 |
64 |
65 |
66 | v1.0.1
67 |
68 | - react: ^19.0.0
69 | - react-dom: ^19.0.0
70 | - next: 15.2.0-canary.74
71 | - @radix-ui/react-alert-dialog: ^1.1.11
72 | - @radix-ui/react-dialog: ^1.1.6
73 | - @radix-ui/react-label: ^2.1.2
74 | - @radix-ui/react-popover: ^1.1.6
75 | - @radix-ui/react-scroll-area: ^1.2.4
76 | - @radix-ui/react-slot: ^1.2.0
77 | - @radix-ui/react-tabs: ^1.1.3
78 | - @radix-ui/react-tooltip: ^1.1.8
79 | - class-variance-authority: ^0.7.1
80 | - clsx: ^2.1.1
81 | - js-cookie: ^3.0.5
82 | - lucide-react: ^0.476.0
83 | - next-themes: ^0.4.4
84 | - oci-common: ^2.104.0
85 | - oci-core: ^2.104.0
86 | - oci-identity: ^2.104.0
87 | - oci-objectstorage: ^2.104.0
88 | - react-markdown: ^10.1.0
89 | - sonner: ^2.0.1
90 | - tailwind-merge: ^3.0.2
91 | - tailwindcss-animate: ^1.0.7
92 | - @eslint/eslintrc: ^3
93 | - @tailwindcss/postcss: ^4
94 | - @tailwindcss/typography: ^0.5.16
95 | - @types/js-cookie: ^3.0.6
96 | - @types/node: ^20
97 | - @types/react: ^19
98 | - @types/react-dom: ^19
99 | - eslint: ^9
100 | - eslint-config-next: 15.2.0-canary.74
101 | - tailwindcss: ^4
102 | - typescript: ^5
103 |
104 |
105 |
--------------------------------------------------------------------------------
/docs/versions/README.md:
--------------------------------------------------------------------------------
1 | # Software Versions
2 |
3 | Each link below lists the software versions of the tools used in the corresponding component managed by Blueprints:
4 |
5 | - [OCI AI Blueprints Quickstart Software Versions](./QuickStartVersions.md)
6 | - [Blueprints Control Plane Software Versions](./ControlPlaneVersions.md)
7 | - [Blueprints Portal Software Versions](./PortalVersions.md)
8 |
--------------------------------------------------------------------------------
/docs/whisper_transcription/docs/Whisper_Architecture.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oracle-quickstart/oci-ai-blueprints/31c64899055982e73cb2c1b95345faafe6d5c7ff/docs/whisper_transcription/docs/Whisper_Architecture.pdf
--------------------------------------------------------------------------------
/docs/whisper_transcription/examples/test1/test.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oracle-quickstart/oci-ai-blueprints/31c64899055982e73cb2c1b95345faafe6d5c7ff/docs/whisper_transcription/examples/test1/test.wav
--------------------------------------------------------------------------------
/docs/whisper_transcription/examples/test1/test_all_transcripts_20250601_201349.txt:
--------------------------------------------------------------------------------
1 | [2025-06-01 20:13:40] Speaker 2: So, Aaron, in your email you said you wanted to talk about the exam.
2 | [2025-06-01 20:13:40] Speaker 1: Yeah, um, I've just never taken a class with so many different readings.
3 | [2025-06-01 20:13:40] Speaker 1: I've managed to keep up with all the assignments, but I'm not sure how to...
4 | [2025-06-01 20:13:45] Speaker 1: How to...
5 | [2025-06-01 20:13:40] Speaker 2: How to review everything.
6 | [2025-06-01 20:13:40] Speaker 1: Yeah, in other classes I've had, there's usually just one book to review, not three different books, plus all those other text excerpts and video...
7 |
8 | ====== Summary ======
9 |
10 | Key points:
11 |
12 | * Speaker 1 is struggling to keep up with the readings in a class with multiple books and other materials.
13 | * Speaker 2 suggests reviewing everything to prepare for the exam.
14 |
15 | Decisions:
16 |
17 | * None made during the meeting.
18 |
19 | Action items:
20 |
21 | * Speaker 1 needs to find a strategy for reviewing all the materials and preparing for the exam.
--------------------------------------------------------------------------------
/docs/whisper_transcription/examples/test2/video1591686795.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oracle-quickstart/oci-ai-blueprints/31c64899055982e73cb2c1b95345faafe6d5c7ff/docs/whisper_transcription/examples/test2/video1591686795.mp4
--------------------------------------------------------------------------------
/docs/whisper_transcription/examples/test3/audio1788670787.m4a:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oracle-quickstart/oci-ai-blueprints/31c64899055982e73cb2c1b95345faafe6d5c7ff/docs/whisper_transcription/examples/test3/audio1788670787.m4a
--------------------------------------------------------------------------------
/docs/whisper_transcription/whisper-transcription-A10.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "whisper transcription",
3 | "recipe_mode": "service",
4 | "deployment_name": "whisper-transcription-a10",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:whisper_transcription_v8",
6 | "recipe_node_shape": "VM.GPU.A10.2",
7 | "recipe_replica_count": 1,
8 | "recipe_container_port": "8000",
9 | "recipe_nvidia_gpu_count": 2,
10 | "recipe_node_pool_size": 1,
11 | "recipe_node_boot_volume_size_in_gbs": 200,
12 | "recipe_ephemeral_storage_size": 100,
13 | "recipe_shared_memory_volume_size_limit_in_mb": 1000
14 | }
--------------------------------------------------------------------------------
/docs/whisper_transcription/whisper-transcription-A100.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "whisper transcription",
3 | "recipe_mode": "service",
4 | "deployment_name": "whisper-transcription-a100",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:whisper_transcription_v8",
6 | "recipe_node_shape": "BM.GPU.A100.8",
7 | "recipe_replica_count": 1,
8 | "recipe_container_port": "8000",
9 | "recipe_nvidia_gpu_count": 8,
10 | "recipe_node_pool_size": 1,
11 | "recipe_node_boot_volume_size_in_gbs": 200,
12 | "recipe_ephemeral_storage_size": 100,
13 | "recipe_shared_memory_volume_size_limit_in_mb": 1000
14 | }
--------------------------------------------------------------------------------
/docs/whisper_transcription/whisper-transcription-H100.json:
--------------------------------------------------------------------------------
1 | {
2 | "recipe_id": "whisper transcription",
3 | "recipe_mode": "service",
4 | "deployment_name": "whisper-transcription-h100",
5 | "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:whisper_transcription_v8",
6 | "recipe_node_shape": "BM.GPU.H100.8",
7 | "recipe_replica_count": 1,
8 | "recipe_container_port": "8000",
9 | "recipe_nvidia_gpu_count": 8,
10 | "recipe_node_pool_size": 1,
11 | "recipe_node_boot_volume_size_in_gbs": 200,
12 | "recipe_ephemeral_storage_size": 100,
13 | "recipe_shared_memory_volume_size_limit_in_mb": 1000
14 | }
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/OCI_AI_BLUEPRINTS_STACK_VERSION:
--------------------------------------------------------------------------------
1 | v1.0.2
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/app-api.tf:
--------------------------------------------------------------------------------
1 | resource "kubernetes_service" "corrino_cp_service" {
2 | metadata {
3 | name = "corrino-cp"
4 | annotations = {
5 | "oci.oraclecloud.com/load-balancer-type" = "lb"
6 |       "service.beta.kubernetes.io/oci-load-balancer-shape" = "flexible"
7 | }
8 | }
9 | spec {
10 | selector = {
11 | app = "corrino-cp"
12 | }
13 | port {
14 | port = 80
15 | target_port = 5000
16 | }
17 | }
18 | depends_on = [kubernetes_deployment.corrino_cp_deployment]
19 | }
20 |
21 | resource "kubernetes_deployment" "corrino_cp_deployment" {
22 | metadata {
23 | name = "corrino-cp"
24 | labels = {
25 | app = "corrino-cp"
26 | }
27 | }
28 | spec {
29 | replicas = 1
30 |
31 | strategy {
32 | type = "Recreate"
33 | }
34 |
35 | selector {
36 | match_labels = {
37 | app = "corrino-cp"
38 | }
39 | }
40 | template {
41 | metadata {
42 | labels = {
43 | app = "corrino-cp"
44 | }
45 | }
46 | spec {
47 | container {
48 | name = "corrino-cp"
49 | image = local.app.backend_image_uri
50 | image_pull_policy = "Always"
51 |
52 | dynamic "env" {
53 | for_each = local.env_universal
54 | content {
55 | name = env.value.name
56 | value = env.value.value
57 | }
58 | }
59 |
60 | dynamic "env" {
61 | for_each = local.env_app_api
62 | content {
63 | name = env.value.name
64 | value = env.value.value
65 | }
66 | }
67 |
68 | dynamic "env" {
69 | for_each = local.env_app_configmap
70 | content {
71 | name = env.value.name
72 | value_from {
73 | config_map_key_ref {
74 | name = env.value.config_map_name
75 | key = env.value.config_map_key
76 | }
77 | }
78 | }
79 | }
80 |
81 | dynamic "env" {
82 | for_each = local.env_adb_access
83 | content {
84 | name = env.value.name
85 | value = env.value.value
86 | }
87 | }
88 |
89 | dynamic "env" {
90 | for_each = local.env_adb_access_secrets
91 | content {
92 | name = env.value.name
93 | value_from {
94 | secret_key_ref {
95 | name = env.value.secret_name
96 | key = env.value.secret_key
97 | }
98 | }
99 | }
100 | }
101 |
102 | volume_mount {
103 | name = "adb-wallet-volume"
104 | mount_path = "/app/wallet"
105 | read_only = true
106 | }
107 | }
108 | volume {
109 | name = "adb-wallet-volume"
110 | secret {
111 | secret_name = "oadb-wallet"
112 | }
113 | }
114 | }
115 | }
116 | }
117 | depends_on = [kubernetes_job.corrino_migration_job]
118 | }
119 |
120 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/app-background.tf:
--------------------------------------------------------------------------------
1 |
2 | resource "kubernetes_deployment" "corrino_cp_background_deployment" {
3 | metadata {
4 | name = "corrino-cp-background"
5 | labels = {
6 | app = "corrino-cp-background"
7 | }
8 | }
9 | spec {
10 | replicas = 1
11 | selector {
12 | match_labels = {
13 | app = "corrino-cp-background"
14 | }
15 | }
16 | template {
17 | metadata {
18 | labels = {
19 | app = "corrino-cp-background"
20 | }
21 | }
22 | spec {
23 | container {
24 | name = "corrino-cp-background"
25 | image = local.app.backend_image_uri
26 | image_pull_policy = "Always"
27 | command = ["/bin/sh", "-c"]
28 | args = ["python3 manage.py runserver"]
29 | dynamic "env" {
30 | for_each = local.env_universal
31 | content {
32 | name = env.value.name
33 | value = env.value.value
34 | }
35 | }
36 |
37 | dynamic "env" {
38 | for_each = local.env_app_api_background
39 | content {
40 | name = env.value.name
41 | value = env.value.value
42 | }
43 | }
44 |
45 | dynamic "env" {
46 | for_each = local.env_app_configmap
47 | content {
48 | name = env.value.name
49 | value_from {
50 | config_map_key_ref {
51 | name = env.value.config_map_name
52 | key = env.value.config_map_key
53 | }
54 | }
55 | }
56 | }
57 |
58 | dynamic "env" {
59 | for_each = local.env_adb_access
60 | content {
61 | name = env.value.name
62 | value = env.value.value
63 | }
64 | }
65 |
66 | dynamic "env" {
67 | for_each = local.env_adb_access_secrets
68 | content {
69 | name = env.value.name
70 | value_from {
71 | secret_key_ref {
72 | name = env.value.secret_name
73 | key = env.value.secret_key
74 | }
75 | }
76 | }
77 | }
78 |
79 | volume_mount {
80 | name = "adb-wallet-volume"
81 | mount_path = "/app/wallet"
82 | read_only = true
83 | }
84 | }
85 | volume {
86 | name = "adb-wallet-volume"
87 | secret {
88 | secret_name = "oadb-wallet"
89 | }
90 | }
91 | }
92 | }
93 | }
94 | depends_on = [kubernetes_job.corrino_migration_job]
95 | }
96 |
97 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/app-blueprint-portal.tf:
--------------------------------------------------------------------------------
1 | resource "kubernetes_service" "oci_ai_blueprints_portal_service" {
2 | metadata {
3 | name = "oci-ai-blueprints-portal"
4 | annotations = {
5 | "oci.oraclecloud.com/load-balancer-type" = "lb"
6 | "service.beta.kubernetes.io/oci-load-balancer-shape" = "flexible"
7 | }
8 | }
9 | spec {
10 | selector = {
11 | app = "oci-ai-blueprints-portal"
12 | }
13 | port {
14 | port = 80
15 | target_port = 3000
16 | }
17 | }
18 | depends_on = [kubernetes_deployment.oci_ai_blueprints_portal_deployment]
19 | }
20 |
21 | resource "kubernetes_deployment" "oci_ai_blueprints_portal_deployment" {
22 | metadata {
23 | name = "oci-ai-blueprints-portal"
24 | labels = {
25 | app = "oci-ai-blueprints-portal"
26 | }
27 | }
28 | spec {
29 | replicas = 1
30 | selector {
31 | match_labels = {
32 | app = "oci-ai-blueprints-portal"
33 | }
34 | }
35 | template {
36 | metadata {
37 | labels = {
38 | app = "oci-ai-blueprints-portal"
39 | }
40 | }
41 | spec {
42 | container {
43 | name = "oci-ai-blueprints-portal"
44 | image = local.app.blueprint_portal_image_uri
45 | image_pull_policy = "Always"
46 |
47 | dynamic "env" {
48 | for_each = local.env_universal
49 | content {
50 | name = env.value.name
51 | value = env.value.value
52 | }
53 | }
54 |
55 | dynamic "env" {
56 | for_each = local.env_app_configmap
57 | content {
58 | name = env.value.name
59 | value_from {
60 | config_map_key_ref {
61 | name = env.value.config_map_name
62 | key = env.value.config_map_key
63 | }
64 | }
65 | }
66 | }
67 | }
68 | }
69 | }
70 | }
71 | depends_on = [kubernetes_deployment.corrino_cp_deployment]
72 | }
73 |
74 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/app-configmap.tf:
--------------------------------------------------------------------------------
1 | resource "kubernetes_config_map" "corrino-configmap" {
2 | metadata {
3 | name = "corrino-configmap"
4 | }
5 |
6 |
7 | data = {
8 | APP_IMAGE_URI = local.app.backend_image_uri
9 | ADDON_GRAFANA_TOKEN = local.addon.grafana_token
10 | ADDON_GRAFANA_USER = local.addon.grafana_user
11 | BACKEND_SERVICE_NAME = local.app.backend_service_name
12 | COMPARTMENT_ID = local.oci.compartment_id
13 | CONTROL_PLANE_VERSION = var.stack_version
14 | RELEASE_VERSION = var.stack_version
15 | DJANGO_ALLOWED_HOSTS = local.django.allowed_hosts
16 | DJANGO_CSRF_TRUSTED_ORIGINS = local.django.csrf_trusted_origins
17 | DJANGO_SECRET = local.django.secret
18 | FRONTEND_HTTPS_FLAG = local.app.https_flag
19 | IMAGE_REGISTRY_BASE_URI = local.ocir.base_uri
20 | LOGGING_LEVEL = local.django.logging_level
21 | NAMESPACE_NAME = local.oci.namespace_name
22 | OKE_CLUSTER_ID = local.oci.oke_cluster_id
23 | OKE_NODE_SUBNET_ID = local.network.oke_node_subnet_id
24 | PUBLIC_ENDPOINT_BASE = local.fqdn.name
25 | RECIPE_BUCKET_NAME = local.app.recipe_bucket_name
26 | RECIPE_VALIDATION_ENABLED = local.app.recipe_validation_enabled
27 | RECIPE_VALIDATION_SHAPE_AVAILABILITY_ENABLED = local.app.recipe_validation_shape_availability_enabled
28 | REGION_NAME = local.oci.region_name
29 | TENANCY_ID = local.oci.tenancy_id
30 | TENANCY_NAMESPACE = local.oci.tenancy_namespace
31 | DATA_UPLOAD_PATH = var.share_data_with_corrino_team_enabled ? local.registration.upload_path : ""
32 | DEPLOYMENT_UUID = random_uuid.registration_id.result
33 | DATA_SHARING_ENABLED = var.share_data_with_corrino_team_enabled ? "True" : "False"
34 | BLUEPRINTS_OBJECT_STORAGE_URL = local.app.blueprints_object_storage_url
35 | PORTAL_DEMO_FLAG = local.app.portal_demo_flag
36 | SHARED_NODE_POOL_BLUEPRINTS_OBJECT_STORAGE_URL = local.app.shared_node_pool_blueprints_object_storage_url
37 | SHARED_NODE_POOL_DOCUMENTATION_URL = local.app.shared_node_pool_documentation_url
38 | BLUEPRINT_DOCUMENTATION_URL = local.app.blueprint_documentation_url
39 | PROMETHEUS_NAMESPACE = local.third_party_namespaces.prometheus_namespace
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/app-migration.tf:
--------------------------------------------------------------------------------
1 | resource "kubernetes_job" "corrino_migration_job" {
2 | metadata {
3 | name = "corrino-migration-job"
4 | }
5 | spec {
6 | template {
7 | metadata {}
8 | spec {
9 |
10 | container {
11 | name = "corrino-migration-job"
12 | image = local.app.backend_image_uri
13 | image_pull_policy = "Always"
14 | command = ["/bin/sh", "-c"]
15 | args = [
16 | "pwd; ls -al; uname -a; whoami; python3 manage.py print_settings; python3 manage.py makemigrations; python3 manage.py migrate"
17 | ]
18 |
19 | dynamic "env" {
20 | for_each = local.env_universal
21 | content {
22 | name = env.value.name
23 | value = env.value.value
24 | }
25 | }
26 |
27 | dynamic "env" {
28 | for_each = local.env_app_jobs
29 | content {
30 | name = env.value.name
31 | value = env.value.value
32 | }
33 | }
34 |
35 | dynamic "env" {
36 | for_each = local.env_app_configmap
37 | content {
38 | name = env.value.name
39 | value_from {
40 | config_map_key_ref {
41 | name = env.value.config_map_name
42 | key = env.value.config_map_key
43 | }
44 | }
45 | }
46 | }
47 |
48 | dynamic "env" {
49 | for_each = local.env_adb_access
50 | content {
51 | name = env.value.name
52 | value = env.value.value
53 | }
54 | }
55 |
56 | dynamic "env" {
57 | for_each = local.env_adb_access_secrets
58 | content {
59 | name = env.value.name
60 | value_from {
61 | secret_key_ref {
62 | name = env.value.secret_name
63 | key = env.value.secret_key
64 | }
65 | }
66 | }
67 | }
68 |
69 | volume_mount {
70 | name = "adb-wallet-volume"
71 | mount_path = "/app/wallet"
72 | read_only = true
73 | }
74 | }
75 |
76 | volume {
77 | name = "adb-wallet-volume"
78 | secret {
79 | secret_name = "oadb-wallet"
80 | }
81 | }
82 |
83 | restart_policy = "Never"
84 | }
85 | }
86 | backoff_limit = 0
87 | ttl_seconds_after_finished = 120
88 | }
89 | wait_for_completion = true
90 | timeouts {
91 | create = "10m"
92 | update = "10m"
93 | }
94 |
95 | depends_on = [kubernetes_job.wallet_extractor_job, kubernetes_config_map.corrino-configmap]
96 |
97 | # count = var.mushop_mock_mode_all ? 0 : 1
98 | count = 1
99 | }
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/app-registration.tf:
--------------------------------------------------------------------------------
1 |
2 | resource "local_file" "registration" {
3 | content = local.registration.object_content
4 | filename = local.registration.object_filepath
5 | }
6 |
7 | # curl -X PUT --data-binary '@local_filename' unique_PAR_URL
8 |
9 | resource "null_resource" "registration" {
10 | depends_on = [kubernetes_deployment.corrino_cp_deployment, local_file.registration]
11 | triggers = {
12 | always_run = timestamp()
13 | }
14 | provisioner "local-exec" {
15 | command = <<-EOT
16 | if [ "${var.share_data_with_corrino_team_enabled}" = "true" ]; then
17 | curl -X PUT --data-binary '@${local.registration.object_filepath}' ${local.registration.upload_path}${local.registration.object_filename}
18 | else
19 | echo "1" > /tmp/opted_out && curl -X PUT --data-binary '@/tmp/opted_out' ${local.registration.upload_path}opted_out
20 | fi
21 | EOT
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/app-user.tf:
--------------------------------------------------------------------------------
1 | resource "kubernetes_job" "corrino_user_job" {
2 | metadata {
3 | name = "corrino-user-job"
4 | }
5 | spec {
6 | template {
7 | metadata {}
8 | spec {
9 |
10 | container {
11 | name = "corrino-user-job"
12 | image = local.app.backend_image_uri
13 | image_pull_policy = "Always"
14 | command = ["/bin/sh", "-c"]
15 | args = ["python3 manage.py create_superuser_if_needed"]
16 |
17 | dynamic "env" {
18 | for_each = local.env_universal
19 | content {
20 | name = env.value.name
21 | value = env.value.value
22 | }
23 | }
24 |
25 | dynamic "env" {
26 | for_each = local.env_app_user
27 | content {
28 | name = env.value.name
29 | value = env.value.value
30 | }
31 | }
32 |
33 | dynamic "env" {
34 | for_each = local.env_app_jobs
35 | content {
36 | name = env.value.name
37 | value = env.value.value
38 | }
39 | }
40 |
41 | dynamic "env" {
42 | for_each = local.env_app_configmap
43 | content {
44 | name = env.value.name
45 | value_from {
46 | config_map_key_ref {
47 | name = env.value.config_map_name
48 | key = env.value.config_map_key
49 | }
50 | }
51 | }
52 | }
53 |
54 | dynamic "env" {
55 | for_each = local.env_adb_access
56 | content {
57 | name = env.value.name
58 | value = env.value.value
59 | }
60 | }
61 |
62 | dynamic "env" {
63 | for_each = local.env_adb_access_secrets
64 | content {
65 | name = env.value.name
66 | value_from {
67 | secret_key_ref {
68 | name = env.value.secret_name
69 | key = env.value.secret_key
70 | }
71 | }
72 | }
73 | }
74 |
75 | volume_mount {
76 | name = "adb-wallet-volume"
77 | mount_path = "/app/wallet"
78 | read_only = true
79 | }
80 | }
81 |
82 | volume {
83 | name = "adb-wallet-volume"
84 | secret {
85 | secret_name = "oadb-wallet"
86 | }
87 | }
88 |
89 | restart_policy = "Never"
90 | }
91 | }
92 | backoff_limit = 0
93 | ttl_seconds_after_finished = 120
94 | }
95 |
96 | wait_for_completion = true
97 | timeouts {
98 | create = "10m"
99 | update = "10m"
100 | }
101 |
102 | depends_on = [kubernetes_job.corrino_migration_job]
103 |
104 | # count = var.mushop_mock_mode_all ? 0 : 1
105 | }
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/data.tf:
--------------------------------------------------------------------------------
1 |
2 |
3 | data "oci_objectstorage_namespace" "ns" {
4 | compartment_id = var.compartment_ocid
5 | }
6 |
7 | data "oci_containerengine_cluster_kube_config" "oke_special" {
8 | cluster_id = var.existent_oke_cluster_id
9 | }
10 |
11 | #data "kubernetes_ingress" "corrino_cp_ingress" {
12 | # metadata {
13 | # name = local.app.backend_service_name_ingress
14 | # namespace = "default"
15 | # }
16 | #
17 | # depends_on = [module.oke-quickstart.helm_release_ingress_nginx]
18 | # count = var.ingress_nginx_enabled ? 1 : 0
19 | #}
20 |
21 | #data "kubernetes_service" "corrino_cp_service" {
22 | # metadata {
23 | # name = local.app.backend_service_name
24 | # namespace = "default"
25 | # }
26 | # depends_on = [module.oke-quickstart.helm_release_ingress_nginx]
27 | # count = var.ingress_nginx_enabled ? 1 : 0
28 | #}
29 |
30 | data "kubernetes_service" "ingress_nginx_controller_service" {
31 | metadata {
32 | name = "ingress-nginx-controller"
33 | namespace = "cluster-tools"
34 | }
35 | depends_on = [module.oke-quickstart.helm_release_ingress_nginx]
36 | count = var.ingress_nginx_enabled ? 1 : 0
37 | }
38 |
39 | data "kubernetes_secret" "grafana_password" {
40 | metadata {
41 | name = "grafana"
42 | namespace = "cluster-tools"
43 | }
44 | depends_on = [module.oke-quickstart.helm_release_grafana]
45 | count = var.grafana_enabled ? 1 : 0
46 | }
47 |
48 | data "kubernetes_namespace" "cluster_tools_namespace" {
49 | metadata {
50 | name = "cluster-tools"
51 | }
52 | depends_on = [module.oke-quickstart.cluster_tools_namespace]
53 | count = var.bring_your_own_prometheus ? 0 : 1
54 | }
55 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/helm.tf:
--------------------------------------------------------------------------------
1 | resource "helm_release" "mlflow" {
2 | name = "mlflow"
3 | repository = "https://community-charts.github.io/helm-charts"
4 | chart = "mlflow"
5 | namespace = "cluster-tools"
6 | wait = false
7 | version = "0.16.5"
8 |
9 | values = [
--------------------------------------------------------------------------------
3 | > __Warning__: Moved to [oracle-quickstart/terraform-oci-networking](https://github.com/oracle-quickstart/terraform-oci-networking). Sub modules for specific OCI Networking resources are also available, for example: [Virtual Cloud Network](https://github.com/oracle-quickstart/terraform-oci-networking/tree/main/modules/vcn).
4 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oci-policies/main.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | resource "oci_identity_dynamic_group" "for_policies" {
6 | name = "${local.app_name_normalized}-${local.dynamic_group_name_normalized}-${local.deploy_id}"
7 | description = "${local.app_name} ${var.dynamic_group_name} (${local.deploy_id})"
8 | compartment_id = var.tenancy_ocid
9 | matching_rule = "${var.dynamic_group_main_condition} {${join(",", var.dynamic_group_matching_rules)}}"
10 | freeform_tags = var.oci_tag_values.freeformTags
11 | defined_tags = var.oci_tag_values.definedTags
12 |
13 | provider = oci.home_region
14 |
15 | count = var.create_dynamic_group ? 1 : 0
16 | }
17 |
18 | resource "oci_identity_policy" "policies" {
19 | name = "${local.app_name_normalized}-${local.policy_name_normalized}-${local.deploy_id}"
20 | description = "${local.app_name} ${var.policy_name} (${local.deploy_id})"
21 | compartment_id = local.policy_compartment_ocid
22 | statements = var.policy_statements
23 | freeform_tags = var.oci_tag_values.freeformTags
24 | defined_tags = var.oci_tag_values.definedTags
25 |
26 | depends_on = [oci_identity_dynamic_group.for_policies]
27 |
28 | provider = oci.home_region
29 |
30 | count = var.create_policy ? 1 : 0
31 | }
32 |
33 | locals {
34 | policy_compartment_ocid = var.compartment_ocid != "" ? var.compartment_ocid : var.tenancy_ocid
35 | }
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oci-policies/outputs.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | output "dynamic_group_id" {
6 | value = try(oci_identity_dynamic_group.for_policies.0.id, null)
7 | }
8 | output "dynamic_group_name" {
9 | value = try(oci_identity_dynamic_group.for_policies.0.name, null)
10 | }
11 | output "compartment_policy_id" {
12 | value = try(oci_identity_policy.policies.0.id, null)
13 | }
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oci-policies/variables.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | # Create Dynamic Group and Policies
6 | variable "create_dynamic_group" {
7 | default = false
8 | description = "Creates dynamic group to use with policies. Note: You need to have proper rights on the Tenancy. If you only have rights in a compartment, uncheck and ask your administrator to create the Dynamic Group for you"
9 | }
10 | variable "dynamic_group_name" {
11 | default = "Dynamic Group"
12 | description = "Name of the dynamic group. e.g.: OKE Cluster Dynamic Group => <app_name>-oke-cluster-dynamic-group-<deploy_id>"
13 | }
14 | ## Dynamic Group Matching Rules
15 | variable "dynamic_group_matching_rules" {
16 | type = list(string)
17 | default = []
18 | description = "List of matching rules for the dynamic group. e.g.: [\"ALL {instance.compartment.id = 'ocid1.compartment.oc1..aaaaaaaaxxxxxxxxxxxxxxxx'}\", \"ALL {instance.id = 'ocid1.instance.oc1.phx.xxxxxxxx'}\"]"
19 | }
20 | variable "dynamic_group_main_condition" {
21 | default = "ANY"
22 | description = "Main condition for the dynamic group. e.g.: ALL, ANY"
23 |
24 | validation {
25 | condition = var.dynamic_group_main_condition == "ALL" || var.dynamic_group_main_condition == "ANY"
26 | error_message = "Sorry, but the dynamic group main condition can only be ALL or ANY."
27 | }
28 | }
29 | # Policy
30 | variable "create_policy" {
31 | default = false
32 | description = "Creates policy. e.g.: Compartment Policies to support Cluster Autoscaler, OCI Logging datasource on Grafana; Tenancy Policies to support OCI Metrics datasource on Grafana"
33 | }
34 | variable "policy_name" {
35 | default = "Policies"
36 | description = "Name of the policy. e.g.: Compartment Policies => <app_name>-compartment-policies-<deploy_id>"
37 | }
38 | # variable "create_tenancy_policies" {
39 | # default = false
40 | # description = "Creates policies that need to reside on the tenancy. e.g.: Policies to support OCI Metrics datasource on Grafana"
41 | # }
42 | variable "compartment_ocid" {
43 | default = ""
44 | description = "Compartment OCID where the policies will be created. If not specified, the policies will be created on the Tenancy OCID"
45 | }
46 |
47 | # Compartment Policies Statements
48 | variable "policy_statements" {
49 | type = list(string)
50 | default = []
51 | description = "List of statements for the compartment policy. e.g.: [\"Allow dynamic-group to manage instances in compartment \", \"Allow dynamic-group to use instances in compartment where ALL {instance.compartment.id = 'ocid1.compartment.oc1..aaaaaaaaxxxxxxxxxxxxxxxx', instance.id = 'ocid1.instance.oc1.phx.xxxxxxxx'}\"]"
52 | }
53 |
54 | # Deployment Details + Freeform Tags
55 | variable "oci_tag_values" {
56 | description = "Tags to be added to the resources"
57 | }
58 |
59 | # OCI Provider
60 | variable "tenancy_ocid" {}
61 | # variable "region" {}
62 | # variable "user_ocid" { default = "" }
63 | # variable "fingerprint" { default = "" }
64 | # variable "private_key_path" { default = "" }
65 |
66 | locals {
67 | app_name_normalized = substr(replace(lower(var.oci_tag_values.freeformTags.AppName), " ", "-"), 0, 6)
68 | app_name = var.oci_tag_values.freeformTags.AppName
69 | deploy_id = var.oci_tag_values.freeformTags.DeploymentID
70 | policy_compartment_OCID = var.compartment_ocid == "" ? var.tenancy_ocid : var.compartment_ocid
71 | dynamic_group_name_normalized = substr(replace(lower(var.dynamic_group_name), " ", "-"), 0, 80)
72 | policy_name_normalized = substr(replace(lower(var.policy_name), " ", "-"), 0, 80)
73 | }
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oci-policies/versions.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | terraform {
6 | required_version = ">= 1.1"
7 | required_providers {
8 | oci = {
9 | source = "oracle/oci"
10 | version = "~> 4, < 5"
11 | # https://registry.terraform.io/providers/oracle/oci/
12 | configuration_aliases = [oci.home_region]
13 | }
14 | local = {
15 | source = "hashicorp/local"
16 | version = "~> 2"
17 | # https://registry.terraform.io/providers/hashicorp/local/
18 | }
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oci-vault-kms/main.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021, 2022 Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | ##**************************************************************************
6 | ## OCI KMS Vault
7 | ##**************************************************************************
8 |
9 | ### OCI Vault vault
10 | resource "oci_kms_vault" "oke_vault" {
11 | compartment_id = var.oke_cluster_compartment_ocid
12 | display_name = "${local.vault_display_name} - ${local.deploy_id}"
13 | vault_type = local.vault_type[0]
14 | freeform_tags = var.oci_tag_values.freeformTags
15 | defined_tags = var.oci_tag_values.definedTags
16 |
17 | # depends_on = [oci_identity_policy.kms_user_group_compartment_policies]
18 |
19 | count = var.use_encryption_from_oci_vault ? (var.create_new_encryption_key ? 1 : 0) : 0
20 | }
21 | ### OCI Vault key
22 | resource "oci_kms_key" "oke_key" {
23 | compartment_id = var.oke_cluster_compartment_ocid
24 | display_name = "${local.vault_key_display_name} - ${local.deploy_id}"
25 | management_endpoint = oci_kms_vault.oke_vault[0].management_endpoint
26 | protection_mode = local.vault_key_protection_mode
27 | freeform_tags = var.oci_tag_values.freeformTags
28 | defined_tags = var.oci_tag_values.definedTags
29 |
30 | key_shape {
31 | algorithm = local.vault_key_key_shape_algorithm
32 | length = local.vault_key_key_shape_length
33 | }
34 |
35 | count = var.use_encryption_from_oci_vault ? (var.create_new_encryption_key ? 1 : 0) : 0
36 | }
37 |
38 | ### Vault and Key definitions
39 | locals {
40 | vault_display_name = "OKE Vault"
41 | vault_key_display_name = "OKE Key"
42 | vault_key_key_shape_algorithm = "AES"
43 | vault_key_key_shape_length = 32
44 | vault_type = ["DEFAULT", "VIRTUAL_PRIVATE"]
45 | vault_key_protection_mode = "SOFTWARE" # HSM or SOFTWARE
46 | oci_vault_key_id = var.use_encryption_from_oci_vault ? (var.create_new_encryption_key ? oci_kms_key.oke_key[0].id : var.existent_encryption_key_id) : "void"
47 | }
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oci-vault-kms/outputs.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | output "oci_vault_key_id" {
6 | value = var.use_encryption_from_oci_vault ? (var.create_new_encryption_key ? oci_kms_key.oke_key[0].id : var.existent_encryption_key_id) : null
7 | }
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oci-vault-kms/providers.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | terraform {
6 | required_version = ">= 1.1"
7 | required_providers {
8 | oci = {
9 | source = "oracle/oci"
10 | version = "~> 4"
11 | # https://registry.terraform.io/providers/oracle/oci/
12 | configuration_aliases = [oci.home_region]
13 | }
14 | local = {
15 | source = "hashicorp/local"
16 | version = "~> 2"
17 | # https://registry.terraform.io/providers/hashicorp/local/
18 | }
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oci-vault-kms/variables.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021, 2022, Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | # OKE Encryption details
6 | variable "use_encryption_from_oci_vault" {
7 | default = false
8 | description = "By default, Oracle manages the keys that encrypt Kubernetes Secrets at rest in etcd, but you can choose a key from a vault that you have access to if you want greater control over the key's lifecycle and how it's used"
9 | }
10 | variable "create_new_encryption_key" {
11 | default = false
12 | description = "Creates a new vault and key in OCI Vault/Key Management/KMS and assigns the key to the boot volume of the worker nodes"
13 | }
14 | variable "existent_encryption_key_id" {
15 | default = ""
16 | description = "Use an existent master encryption key to encrypt the boot volume and object storage bucket. NOTE: If the key resides in a different compartment or in a different tenancy, make sure you have the proper policies to access it, or provisioning of the worker nodes will fail"
17 | }
18 |
19 | # Deployment Details + Freeform Tags
20 | variable "oci_tag_values" {
21 | description = "Tags to be added to the resources"
22 | }
23 |
24 | # OKE Variables
25 | variable "oke_cluster_compartment_ocid" {
26 | description = "Compartment OCID used by the OKE Cluster"
27 | type = string
28 | }
29 |
30 | # Policies variables
31 | variable "create_vault_policies_for_group" {
32 | default = false
33 | description = "Creates policies to allow the user applying the stack to manage the vault and keys. If you are in the Administrators group or already have these policies for the compartment, this policy is not needed. If you do not have permission to create the policy, ask your administrator to add it for you"
34 | }
35 | variable "user_admin_group_for_vault_policy" {
36 | default = "Administrators"
37 | description = "User Identity Group allowed to manage the vault and keys. The user running the Terraform scripts or applying the ORM Stack needs to be in this group"
38 | }
39 | ## Create Dynamic Group and Policies
40 | variable "create_dynamic_group_for_nodes_in_compartment" {
41 | default = false
42 | description = "Creates dynamic group of Nodes in the compartment. Note: You need to have proper rights on the Tenancy. If you only have rights in a compartment, uncheck and ask your administrator to create the Dynamic Group for you"
43 | }
44 | variable "create_compartment_policies" {
45 | default = false
46 | description = "Creates policies for KMS that will reside on the compartment."
47 | }
48 |
49 | # OCI Provider
50 | variable "tenancy_ocid" {}
51 |
52 | # Conditional locals
53 | locals {
54 | app_dynamic_group = (var.use_encryption_from_oci_vault && var.create_dynamic_group_for_nodes_in_compartment) ? oci_identity_dynamic_group.app_dynamic_group.0.name : "void"
55 | app_name_normalized = substr(replace(lower(var.oci_tag_values.freeformTags.AppName), " ", "-"), 0, 6)
56 | app_name = var.oci_tag_values.freeformTags.AppName
57 | deploy_id = var.oci_tag_values.freeformTags.DeploymentID
58 | }
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oke-cluster-autoscaler/datasources.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | # Gets supported Kubernetes versions for node pools
6 | # data "oci_containerengine_node_pool_option" "node_pool" {
7 | # node_pool_option_id = "all"
8 | # }
9 | # data "oci_containerengine_node_pool_option" "node_pool" {
10 | # node_pool_option_id = var.existent_oke_cluster_id
11 | # }
12 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oke-cluster-autoscaler/variables.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | # OKE Variables
6 | ## OKE Autoscaler
7 | # variable "cluster_autoscaler_enabled" {
8 | # default = true
9 | # description = "Enables OKE cluster autoscaler. Node pools will auto scale based on the resources usage"
10 | # }
11 | variable "cluster_autoscaler_supported_k8s_versions" {
12 | type = map(string)
13 |
14 | default = { "1.22" = "1.22.2-4", "1.23" = "1.23.0-4", "1.24" = "1.24.0-5", "1.25" = "1.25.0-6" } # There's no API to get that list. Needs to be updated manually
15 | description = "Supported Kubernetes versions for OKE cluster autoscaler"
16 | }
17 | variable "custom_cluster_autoscaler_image" {
18 | default = ""
19 | description = "Custom Image for OKE cluster autoscaler"
20 | }
21 | variable "cluster_autoscaler_log_level_verbosity" {
22 | default = 4
23 | description = "Log level verbosity for OKE cluster autoscaler"
24 | }
25 | variable "cluster_autoscaler_max_node_provision_time" {
26 | default = "25m"
27 | description = "Maximum time in minutes for a node to be provisioned. If the node is not ready after this time, it will be deleted and recreated"
28 | }
29 | variable "cluster_autoscaler_scale_down_delay_after_add" {
30 | default = "10m"
31 | description = "Time to wait after scale up before attempting to scale down"
32 | }
33 | variable "cluster_autoscaler_scale_down_unneeded_time" {
34 | default = "10m"
35 | description = "How long a node must be unneeded before it is eligible to be scaled down"
36 | }
37 | variable "cluster_autoscaler_unremovable_node_recheck_timeout" {
38 | default = "5m"
39 | description = "Time to wait before retrying removal of a node that previously failed to be removed"
40 | }
41 | variable "cluster_autoscaler_num_of_replicas" {
42 | default = 3
43 | description = "Number of replicas for OKE cluster autoscaler"
44 | }
45 | variable "cluster_autoscaler_extra_args" {
46 | default = []
47 | description = "Extra arguments to pass to OKE cluster autoscaler"
48 | }
49 |
50 | ## OKE Node Pool Details
51 | variable "oke_node_pools" {
52 | type = list(any)
53 |
54 | default = []
55 | description = "Node pools (id, min_nodes, max_nodes, k8s_version) to use with Cluster Autoscaler"
56 | }
57 |
58 | # OCI Provider
59 | variable "region" {}
60 |
61 | # Get OKE options
62 | # locals {
63 | # node_pool_k8s_latest_version = reverse(sort(data.oci_containerengine_node_pool_option.node_pool.kubernetes_versions))[0]
64 | # }
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oke-cluster-autoscaler/versions.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | terraform {
6 | required_version = ">= 1.1"
7 | required_providers {
8 | oci = {
9 | source = "oracle/oci"
10 | version = "~> 4, < 5"
11 | # https://registry.terraform.io/providers/oracle/oci/
12 | }
13 | kubernetes = {
14 | source = "hashicorp/kubernetes"
15 | version = "~> 2"
16 | # https://registry.terraform.io/providers/hashicorp/kubernetes/
17 | }
18 | local = {
19 | source = "hashicorp/local"
20 | version = "~> 2"
21 | # https://registry.terraform.io/providers/hashicorp/local/
22 | }
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oke-node-pool/datasources.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 | # Gets supported Kubernetes versions for node pools
5 | data "oci_containerengine_node_pool_option" "node_pool" {
6 | node_pool_option_id = var.existent_oke_cluster_id
7 | }
8 |
9 | # Gets a list of supported images based on the shape, operating_system and operating_system_version provided
10 | data "oci_core_images" "node_pool_images" {
11 | compartment_id = var.oke_cluster_compartment_ocid
12 | operating_system = var.image_operating_system
13 | operating_system_version = var.image_operating_system_version
14 | shape = var.node_pool_shape
15 | sort_by = "TIMECREATED"
16 | sort_order = "DESC"
17 | }
18 |
19 | # Gets a list of Availability Domains
20 | data "oci_identity_availability_domains" "ADs" {
21 | compartment_id = var.oke_cluster_compartment_ocid
22 | }
23 |
24 | # Gets a specific Availability Domain
25 | data "oci_identity_availability_domain" "specfic" {
26 | compartment_id = var.oke_cluster_compartment_ocid
27 | ad_number = var.node_pool_shape_specific_ad
28 |
29 | count = (var.node_pool_shape_specific_ad > 0) ? 1 : 0
30 | }
31 |
32 | # Prepare Cloud Init for Node Pool nodes
33 | data "cloudinit_config" "nodes" {
34 | gzip = true
35 | base64_encode = true
36 |
37 | part {
38 | content_type = "text/x-shellscript"
39 | content = <<EOF
40 | #!/bin/bash
41 | curl --fail -H "Authorization: Bearer Oracle" -L0 http://169.254.169.254/opc/v2/instance/metadata/oke_init_script | base64 --decode >/var/run/oke-init.sh
42 | bash /var/run/oke-init.sh ${var.node_pool_oke_init_params}
43 | EOF
44 | }
45 |
46 | dynamic "part" {
47 | for_each = var.node_pool_cloud_init_parts
48 | content {
49 | content_type = part.value["content_type"]
50 | content = part.value["content"]
51 | filename = part.value["filename"]
52 | }
53 | }
54 | }
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oke-node-pool/main.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | # File Version: 0.7.1
6 |
7 | resource "oci_containerengine_node_pool" "oke_node_pool" {
8 | cluster_id = var.oke_cluster_ocid
9 | compartment_id = var.oke_cluster_compartment_ocid
10 | kubernetes_version = local.node_k8s_version
11 | name = var.node_pool_name
12 | node_shape = var.node_pool_shape
13 | ssh_public_key = var.public_ssh_key
14 | freeform_tags = var.node_pools_tags.freeformTags
15 | defined_tags = var.node_pools_tags.definedTags
16 |
17 | node_config_details {
18 | dynamic "placement_configs" {
19 | for_each = local.node_pool_ads # data.oci_identity_availability_domains.ADs.availability_domains
20 |
21 | content {
22 | availability_domain = placement_configs.value.name
23 | subnet_id = var.nodes_subnet_id
24 | }
25 | }
26 | node_pool_pod_network_option_details {
27 | cni_type = var.cni_type
28 | max_pods_per_node = 31
29 | pod_nsg_ids = []
30 | pod_subnet_ids = [var.vcn_native_pod_networking_subnet_ocid]
31 | }
32 | # nsg_ids = []
33 | size = var.node_pool_min_nodes
34 | kms_key_id = var.oci_vault_key_id_oke_node_boot_volume != "" ? var.oci_vault_key_id_oke_node_boot_volume : null
35 | freeform_tags = var.worker_nodes_tags.freeformTags
36 | defined_tags = var.worker_nodes_tags.definedTags
37 | }
38 |
39 | dynamic "node_shape_config" {
40 | for_each = local.is_flexible_node_shape ? [1] : []
41 | content {
42 | ocpus = var.node_pool_node_shape_config_ocpus
43 | memory_in_gbs = var.node_pool_node_shape_config_memory_in_gbs
44 | }
45 | }
46 |
47 | node_source_details {
48 | source_type = "IMAGE"
49 | image_id = lookup(data.oci_core_images.node_pool_images.images[0], "id")
50 | boot_volume_size_in_gbs = var.node_pool_boot_volume_size_in_gbs
51 | }
52 | # node_eviction_node_pool_settings {
53 | # eviction_grace_duration = "PT1H"
54 | # is_force_delete_after_grace_duration = false
55 | # }
56 | node_metadata = {
57 | user_data = anytrue([var.node_pool_oke_init_params != "", var.node_pool_cloud_init_parts != []]) ? data.cloudinit_config.nodes.rendered : null
58 | }
59 |
60 | initial_node_labels {
61 | key = "name"
62 | value = var.node_pool_name
63 | }
64 |
65 | dynamic "initial_node_labels" {
66 | for_each = var.extra_initial_node_labels
67 |
68 | content {
69 | key = initial_node_labels.value.key
70 | value = initial_node_labels.value.value
71 | }
72 | }
73 |
74 | lifecycle {
75 | ignore_changes = [
76 | node_config_details.0.size
77 | ]
78 | }
79 |
80 | count = var.create_new_node_pool ? 1 : 0
81 | }
82 |
83 | locals {
84 | # Checks if is using Flexible Compute Shapes
85 | is_flexible_node_shape = contains(split(".", var.node_pool_shape), "Flex")
86 |
87 | # Gets the latest Kubernetes version supported by the node pool
88 | node_pool_k8s_latest_version = reverse(sort(data.oci_containerengine_node_pool_option.node_pool.kubernetes_versions))[0]
89 | node_k8s_version = (var.node_k8s_version == "Latest") ? local.node_pool_k8s_latest_version : var.node_k8s_version
90 |
91 | # Get ADs for the shape to be used on the node pool
92 | node_pool_ads = (var.node_pool_shape_specific_ad > 0) ? data.oci_identity_availability_domain.specfic : data.oci_identity_availability_domains.ADs.availability_domains
93 | }
94 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oke-node-pool/outputs.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | output "node_pool_name" {
6 | value = var.create_new_node_pool ? oci_containerengine_node_pool.oke_node_pool.0.name : var.existent_oke_nodepool_id_for_autoscaler
7 | }
8 | output "node_pool_min_nodes" {
9 | value = var.node_pool_min_nodes
10 | }
11 | output "node_pool_max_nodes" {
12 | value = var.node_pool_max_nodes
13 | }
14 | output "node_pool_id" {
15 | value = var.create_new_node_pool ? oci_containerengine_node_pool.oke_node_pool.0.id : var.existent_oke_nodepool_id_for_autoscaler
16 | }
17 | output "node_k8s_version" {
18 | value = local.node_k8s_version
19 | }
20 | output "node_pool_autoscaler_enabled" {
21 | value = var.node_pool_autoscaler_enabled
22 | }
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oke-node-pool/versions.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | terraform {
6 | required_version = ">= 1.1"
7 | required_providers {
8 | oci = {
9 | source = "oracle/oci"
10 | version = "~> 4, < 5"
11 | # https://registry.terraform.io/providers/oracle/oci/
12 | }
13 | local = {
14 | source = "hashicorp/local"
15 | version = "~> 2"
16 | # https://registry.terraform.io/providers/hashicorp/local/
17 | }
18 | }
19 | }
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oke/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2022 Oracle and/or its affiliates. All rights reserved.
2 |
3 | The Universal Permissive License (UPL), Version 1.0
4 |
5 | Subject to the condition set forth below, permission is hereby granted to any person obtaining a copy of this
6 | software, associated documentation and/or data (collectively the "Software"), free of charge and under any and
7 | all copyright rights in the Software, and any and all patent rights owned or freely licensable by each licensor
8 | hereunder covering either (i) the unmodified Software as contributed to or provided by such licensor, or
9 | (ii) the Larger Works (as defined below), to deal in both
10 |
11 | (a) the Software, and
12 | (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if one is included with the Software
13 | (each a “Larger Work” to which the Software is contributed by such licensors),
14 |
15 | without restriction, including without limitation the rights to copy, create derivative works of, display,
16 | perform, and distribute the Software and make, use, sell, offer for sale, import, export, have made, and have
17 | sold the Software and the Larger Work(s), and to sublicense the foregoing rights on either these or other terms.
18 |
19 | This license is subject to the following condition:
20 | The above copyright notice and either this complete permission notice or at a minimum a reference to the UPL must
21 | be included in all copies or substantial portions of the Software.
22 |
23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
24 | THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
26 | CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
27 | IN THE SOFTWARE.
28 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oke/README.md:
--------------------------------------------------------------------------------
1 | # Terraform OKE Submodule
2 |
3 | This module deploys an OKE Kubernetes cluster.
4 |
5 | ## Usage
6 |
7 | ```hcl
8 | module "oke" {
9 | source = "./modules/oke"
10 |
11 | providers = {
12 | oci = oci
13 | oci.home_region = oci.home_region
14 | }
15 |
16 | # Oracle Cloud Infrastructure Tenancy and Compartment OCID
17 | tenancy_ocid = var.tenancy_ocid
18 | compartment_ocid = local.oke_compartment_ocid
19 | region = var.region
20 |
21 | # Deployment Tags + Freeform Tags + Defined Tags
22 | cluster_tags = local.oci_tag_values
23 | load_balancers_tags = local.oci_tag_values
24 | block_volumes_tags = local.oci_tag_values
25 |
26 | # OKE Cluster
27 | ## create_new_oke_cluster
28 | create_new_oke_cluster = var.create_new_oke_cluster
29 | existent_oke_cluster_id = var.existent_oke_cluster_id
30 |
31 | ## Network Details
32 | vcn_id = module.vcn.vcn_id
33 | network_cidrs = local.network_cidrs
34 | k8s_endpoint_subnet_id = local.create_subnets ? module.subnets["oke_k8s_endpoint_subnet"].subnet_id : var.existent_oke_k8s_endpoint_subnet_ocid
35 | lb_subnet_id = local.create_subnets ? module.subnets["oke_lb_subnet"].subnet_id : var.existent_oke_load_balancer_subnet_ocid
36 | cni_type = local.cni_type
37 | ### Cluster Workers visibility
38 | cluster_workers_visibility = var.cluster_workers_visibility
39 | ### Cluster API Endpoint visibility
40 | cluster_endpoint_visibility = var.cluster_endpoint_visibility
41 |
42 | ## Control Plane Kubernetes Version
43 | k8s_version = var.k8s_version
44 |
45 | ## Create Dynamic group and Policies for Autoscaler and OCI Metrics and Logging
46 | create_dynamic_group_for_nodes_in_compartment = var.create_dynamic_group_for_nodes_in_compartment
47 | create_compartment_policies = var.create_compartment_policies
48 |
49 | ## Encryption (OCI Vault/Key Management/KMS)
50 | oci_vault_key_id_oke_secrets = module.vault.oci_vault_key_id
51 | oci_vault_key_id_oke_image_policy = module.vault.oci_vault_key_id
52 | }
53 | ```
54 |
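55 | The `providers` block in the example above passes both a default `oci` provider and an `oci.home_region` alias, so the calling configuration must define them. Below is a minimal sketch of those provider blocks, assuming the root module exposes `var.region` and `var.home_region` (the variable names here are illustrative, not part of this module):
56 |
57 | ```hcl
58 | # Default OCI provider used for regional resources (sketch only)
59 | provider "oci" {
60 |   region = var.region
61 | }
62 |
63 | # Alias used for IAM resources that must be created in the tenancy home region
64 | provider "oci" {
65 |   alias  = "home_region"
66 |   region = var.home_region
67 | }
68 | ```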
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oke/datasources.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021-2022 Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | data "oci_containerengine_cluster_option" "oke" {
6 | cluster_option_id = "all"
7 | }
8 | data "oci_containerengine_clusters" "oke" {
9 | compartment_id = local.oke_compartment_ocid
10 | }
11 |
12 | # Gets a list of Availability Domains
13 | data "oci_identity_availability_domains" "ADs" {
14 | compartment_id = local.oke_compartment_ocid
15 | }
16 |
17 | # Gets kubeconfig
18 | data "oci_containerengine_cluster_kube_config" "oke" {
19 | cluster_id = var.create_new_oke_cluster ? oci_containerengine_cluster.oke_cluster[0].id : var.existent_oke_cluster_id
20 | }
21 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oke/main.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021, 2022, Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | resource "oci_containerengine_cluster" "oke_cluster" {
6 | compartment_id = local.oke_compartment_ocid
7 | kubernetes_version = (var.k8s_version == "Latest") ? local.cluster_k8s_latest_version : var.k8s_version
8 | name = "${local.app_name} (${local.deploy_id})"
9 | vcn_id = var.vcn_id
10 | kms_key_id = var.oci_vault_key_id_oke_secrets != "" ? var.oci_vault_key_id_oke_secrets : null
11 | # type = var.cluster_type
12 | freeform_tags = var.cluster_tags.freeformTags
13 | defined_tags = var.cluster_tags.definedTags
14 |
15 | endpoint_config {
16 | is_public_ip_enabled = (var.cluster_endpoint_visibility == "Private") ? false : true
17 | subnet_id = var.k8s_endpoint_subnet_id
18 | nsg_ids = []
19 | }
20 | options {
21 | service_lb_subnet_ids = [var.lb_subnet_id]
22 | add_ons {
23 | is_kubernetes_dashboard_enabled = var.cluster_options_add_ons_is_kubernetes_dashboard_enabled
24 | is_tiller_enabled = false # Default is false, left here for reference
25 | }
26 | admission_controller_options {
27 | is_pod_security_policy_enabled = var.cluster_options_admission_controller_options_is_pod_security_policy_enabled
28 | }
29 | kubernetes_network_config {
30 | services_cidr = lookup(var.network_cidrs, "KUBERNETES-SERVICE-CIDR")
31 | pods_cidr = lookup(var.network_cidrs, "PODS-CIDR")
32 | }
33 | persistent_volume_config {
34 | freeform_tags = var.block_volumes_tags.freeformTags
35 | # defined_tags = var.block_volumes_tags.definedTags
36 | }
37 | service_lb_config {
38 | freeform_tags = var.load_balancers_tags.freeformTags
39 | # defined_tags = var.load_balancers_tags.definedTags
40 | }
41 | }
42 | image_policy_config {
43 | is_policy_enabled = false
44 | # key_details {
45 | # # kms_key_id = var.oci_vault_key_id_oke_image_policy != "" ? var.oci_vault_key_id_oke_image_policy : null
46 | # }
47 | }
48 | cluster_pod_network_options {
49 | cni_type = var.cni_type
50 | }
51 |
52 | lifecycle {
53 | ignore_changes = [freeform_tags, defined_tags, kubernetes_version, id]
54 | }
55 |
56 | count = var.create_new_oke_cluster ? 1 : 0
57 | }
58 |
59 | # Local kubeconfig for when using Terraform locally. Not used by Oracle Resource Manager
60 | resource "local_file" "oke_kubeconfig" {
61 | content = data.oci_containerengine_cluster_kube_config.oke.content
62 | filename = "${path.root}/generated/kubeconfig"
63 | file_permission = "0644"
64 | }
65 |
66 | # Get OKE options
67 | locals {
68 | cluster_k8s_latest_version = reverse(sort(data.oci_containerengine_cluster_option.oke.kubernetes_versions))[0]
69 | deployed_k8s_version = var.create_new_oke_cluster ? ((var.k8s_version == "Latest") ? local.cluster_k8s_latest_version : var.k8s_version) : [
70 | for x in data.oci_containerengine_clusters.oke.clusters : x.kubernetes_version if x.id == var.existent_oke_cluster_id][0]
71 | }
72 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oke/oke-orm-private-endpoint.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | ### Important Notice ###
6 | # OCI Resource Manager Private Endpoint is only available when using Resource Manager.
7 | # If you use local Terraform, you will need to set up an OCI Bastion for connectivity to the private OKE cluster.
8 | # If using OCI Cloud Shell, you need to activate the OCI Private Endpoint for OCI Cloud Shell.
9 |
10 | resource "oci_resourcemanager_private_endpoint" "private_kubernetes_endpoint" {
11 | compartment_id = local.oke_compartment_ocid
12 | display_name = "Private Endpoint for OKE ${local.app_name} - ${local.deploy_id}"
13 | description = "Resource Manager Private Endpoint for OKE for the ${local.app_name} - ${local.deploy_id}"
14 | vcn_id = var.vcn_id
15 | subnet_id = var.k8s_endpoint_subnet_id
16 | freeform_tags = var.cluster_tags.freeformTags
17 | defined_tags = var.cluster_tags.definedTags
18 |
19 | count = var.create_new_oke_cluster ? ((var.cluster_endpoint_visibility == "Private") ? 1 : 0) : 0
20 | }
21 |
22 | # Resolves the private IP of the customer's private endpoint to a NAT IP.
23 | data "oci_resourcemanager_private_endpoint_reachable_ip" "private_kubernetes_endpoint" {
24 | private_endpoint_id = var.create_new_oke_cluster ? oci_resourcemanager_private_endpoint.private_kubernetes_endpoint[0].id : var.existent_oke_cluster_private_endpoint
25 | private_ip = trimsuffix(oci_containerengine_cluster.oke_cluster[0].endpoints.0.private_endpoint, ":6443") # TODO: Pending rule for when an existent cluster is used
26 |
27 | count = (var.cluster_endpoint_visibility == "Private") ? 1 : 0
28 | }
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oke/outputs.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | output "comments" {
6 | value = "The application URL will be unavailable for a few minutes after provisioning while the application is configured and deployed to Kubernetes"
7 | }
8 | output "deployed_oke_kubernetes_version" {
9 | value = local.deployed_k8s_version
10 | }
11 | output "deployed_to_region" {
12 | value = var.region
13 | }
14 | output "dev" {
15 | value = "Made with \u2764 by Oracle Developers"
16 | }
17 | output "kubeconfig" {
18 | value = data.oci_containerengine_cluster_kube_config.oke.content
19 | }
20 | output "kubeconfig_for_kubectl" {
21 | value = "export KUBECONFIG=${path.root}/generated/kubeconfig"
22 | description = "If using Terraform locally, this command sets the KUBECONFIG environment variable so you can run kubectl locally"
23 | }
24 | output "orm_private_endpoint_oke_api_ip_address" {
25 | value = (var.cluster_endpoint_visibility == "Private") ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_kubernetes_endpoint.0.ip_address : ""
26 | description = "OCI Resource Manager Private Endpoint ip address for OKE Kubernetes API Private Endpoint"
27 |
28 | depends_on = [
29 | oci_resourcemanager_private_endpoint.private_kubernetes_endpoint
30 | ]
31 | }
32 |
33 | # OKE info
34 | output "oke_cluster_ocid" {
35 | value = var.create_new_oke_cluster ? oci_containerengine_cluster.oke_cluster[0].id : ""
36 | description = "OKE Cluster OCID"
37 | }
38 | output "oke_cluster_compartment_ocid" {
39 | value = local.oke_compartment_ocid
40 | description = "Compartment OCID used by the OKE Cluster"
41 | }
42 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/modules/oke/versions.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | terraform {
6 | required_version = ">= 1.1"
7 | required_providers {
8 | oci = {
9 | source = "oracle/oci"
10 | version = "~> 4, < 5"
11 | # https://registry.terraform.io/providers/oracle/oci/
12 | }
13 | local = {
14 | source = "hashicorp/local"
15 | version = "~> 2"
16 | # https://registry.terraform.io/providers/hashicorp/local/
17 | }
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/outputs.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | # Deployment outputs
6 |
7 | output "deploy_id" {
8 | value = local.deploy_id
9 | }
10 |
11 | # OKE Outputs
12 | output "comments" {
13 | value = module.oke.comments
14 | }
15 | output "deployed_oke_kubernetes_version" {
16 | value = module.oke.deployed_oke_kubernetes_version
17 | }
18 | output "deployed_to_region" {
19 | value = module.oke.deployed_to_region
20 | }
21 | output "kubeconfig" {
22 | value = module.oke.kubeconfig
23 | sensitive = true
24 | }
25 | output "kubeconfig_for_kubectl" {
26 | value = module.oke.kubeconfig_for_kubectl
27 | description = "If using Terraform locally, this command sets the KUBECONFIG environment variable so you can run kubectl locally"
28 | }
29 | output "oke_cluster_ocid" {
30 | value = module.oke.oke_cluster_ocid
31 | }
32 | output "oke_node_pools" {
33 | value = module.oke_node_pools
34 | }
35 | output "subnets" {
36 | value = module.subnets
37 | }
38 |
39 | output "dev" {
40 | value = module.oke.dev
41 | }
42 | ### Important Security Notice ###
43 | # The private key generated by this resource will be stored unencrypted in your Terraform state file.
44 | # Use of this resource for production deployments is not recommended.
45 | # Instead, generate a private key file outside of Terraform and distribute it securely to the system where Terraform will be run.
46 | output "generated_private_key_pem" {
47 | value = var.generate_public_ssh_key ? tls_private_key.oke_worker_node_ssh_key.private_key_pem : "No Keys Auto Generated"
48 | sensitive = true
49 | }
50 |
51 | output "cluster_tools_namespace" {
52 | value = module.cluster-tools.cluster_tools_namespace
53 | }
54 |
55 | output "helm_release_ingress_nginx" {
56 | value = module.cluster-tools.helm_release_ingress_nginx
57 | }
58 |
59 | output "helm_release_grafana" {
60 | value = module.cluster-tools.helm_release_grafana
61 | }
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/policies.tf:
--------------------------------------------------------------------------------
1 | ## Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
2 | ## Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | ##
4 | #
5 | #module "cluster-dynamic-group" {
6 | # source = "./modules/oci-policies"
7 | #
8 | # providers = {
9 | # oci = oci
10 | # oci.home_region = oci.home_region
11 | # }
12 | #
13 | # # Oracle Cloud Infrastructure Tenancy
14 | # tenancy_ocid = var.tenancy_ocid
15 | #
16 | # # Deployment Tags + Freeform Tags + Defined Tags
17 | # oci_tag_values = local.oci_tag_values
18 | #
19 | # create_dynamic_group = true
20 | # dynamic_group_name = "OKE Cluster Nodes"
21 | # dynamic_group_matching_rules = [
22 | # "ALL {instance.compartment.id = '${local.oke_compartment_ocid}'}",
23 | # "ALL {resource.type = 'cluster', resource.compartment.id = '${local.oke_compartment_ocid}'}"
24 | # ]
25 | #
26 | # count = var.create_dynamic_group_for_nodes_in_compartment ? 1 : 0
27 | #}
28 | #
29 | #module "cluster-compartment-policies" {
30 | # source = "./modules/oci-policies"
31 | #
32 | # providers = {
33 | # oci = oci
34 | # oci.home_region = oci.home_region
35 | # }
36 | #
37 | # # Oracle Cloud Infrastructure Tenancy and Compartment OCID
38 | # tenancy_ocid = var.tenancy_ocid
39 | # compartment_ocid = local.oke_compartment_ocid
40 | #
41 | # oci_tag_values = local.oci_tag_values
42 | #
43 | # create_policy = true
44 | # policy_name = "OKE Cluster Compartment Policies"
45 | # policy_statements = [
46 | # "Allow dynamic-group ${local.dynamic_group_name} to manage cluster-node-pools in compartment id ${local.oke_compartment_ocid}",
47 | # "Allow dynamic-group ${local.dynamic_group_name} to manage instance-family in compartment id ${local.oke_compartment_ocid}",
48 | # "Allow dynamic-group ${local.dynamic_group_name} to use subnets in compartment id ${local.oke_compartment_ocid}",
49 | # "Allow dynamic-group ${local.dynamic_group_name} to read virtual-network-family in compartment id ${local.oke_compartment_ocid}",
50 | # "Allow dynamic-group ${local.dynamic_group_name} to use vnics in compartment id ${local.oke_compartment_ocid}",
51 | # "Allow dynamic-group ${local.dynamic_group_name} to inspect compartments in compartment id ${local.oke_compartment_ocid}",
52 | # "Allow dynamic-group ${local.dynamic_group_name} to use network-security-groups in compartment id ${local.oke_compartment_ocid}",
53 | # "Allow dynamic-group ${local.dynamic_group_name} to use private-ips in compartment id ${local.oke_compartment_ocid}",
54 | # "Allow dynamic-group ${local.dynamic_group_name} to manage public-ips in compartment id ${local.oke_compartment_ocid}"
55 | # ]
56 | #
57 | # count = var.create_compartment_policies ? 1 : 0
58 | #}
59 | #
60 | #locals {
61 | # dynamic_group_name = var.create_dynamic_group_for_nodes_in_compartment ? module.cluster-dynamic-group.0.dynamic_group_name : var.existent_dynamic_group_for_nodes_in_compartment
62 | #}
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/schema.org.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oracle-quickstart/oci-ai-blueprints/31c64899055982e73cb2c1b95345faafe6d5c7ff/oci_ai_blueprints_terraform/modules/corrino/schema.org.yaml
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/modules/corrino/versions.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | terraform {
6 | required_version = ">= 1.1"
7 | required_providers {
8 | oci = {
9 | source = "oracle/oci"
10 | version = "~> 4, < 5"
11 | # https://registry.terraform.io/providers/oracle/oci/
12 | configuration_aliases = [oci.home_region]
13 | }
14 | kubernetes = {
15 | source = "hashicorp/kubernetes"
16 | version = "~> 2"
17 | # https://registry.terraform.io/providers/hashicorp/kubernetes/
18 | }
19 | helm = {
20 | source = "hashicorp/helm"
21 | version = "~> 2"
22 | # https://registry.terraform.io/providers/hashicorp/helm/
23 | }
24 | tls = {
25 | source = "hashicorp/tls"
26 | version = "~> 4"
27 | # https://registry.terraform.io/providers/hashicorp/tls/
28 | }
29 | local = {
30 | source = "hashicorp/local"
31 | version = "~> 2"
32 | # https://registry.terraform.io/providers/hashicorp/local/
33 | }
34 | random = {
35 | source = "hashicorp/random"
36 | version = "~> 3"
37 | # https://registry.terraform.io/providers/hashicorp/random/
38 | }
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/oke.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | module "oke-quickstart" {
6 | # source = "github.com/oracle-quickstart/terraform-oci-corrino?ref=0.9.0"
7 | source = "./modules/corrino"
8 |
9 | providers = {
10 | oci = oci
11 | oci.home_region = oci.home_region
12 | }
13 |
14 | # Oracle Cloud Infrastructure Tenancy and Compartment OCID
15 | tenancy_ocid = var.tenancy_ocid
16 | compartment_ocid = var.compartment_ocid
17 | region = var.region
18 |
19 | # Note: Only a few arguments are shown here to keep this basic example simple. All other arguments use their default values.
20 | # App Name to identify deployment. Used for naming resources.
21 | app_name = local.app_name
22 | deploy_id = local.deploy_id
23 |
24 | # Freeform Tags + Defined Tags. Tags are applied to all resources.
25 | tag_values = { "freeformTags" = { "Environment" = "Development", "DeploymentType" = "basic", "QuickstartExample" = "basic-cluster" }, "definedTags" = {} }
26 |
27 | # OKE Node Pool 1 arguments
28 | node_pool_cni_type_1 = "FLANNEL_OVERLAY" # Use "OCI_VCN_IP_NATIVE" for VCN-native pod networking. If node pool 1 uses OCI_VCN_IP_NATIVE, the cluster will also be configured with the same CNI
29 | node_pool_autoscaler_enabled_1 = true
30 | node_pool_initial_num_worker_nodes_1 = 1 # Minimum number of nodes in the node pool
31 | node_pool_max_num_worker_nodes_1 = 10 # Maximum number of nodes in the node pool
32 | node_pool_instance_shape_1 = { "instanceShape" = "VM.Standard.E4.Flex", "ocpus" = 2, "memory" = 64 } # If not using a Flex shape, ocpus and memory are ignored
33 | node_pool_boot_volume_size_in_gbs_1 = 60
34 |
35 | # VCN for OKE arguments
36 | vcn_cidr_blocks = "10.22.0.0/16"
37 |
38 | ingress_nginx_enabled = var.ingress_nginx_enabled
39 | cert_manager_enabled = var.cert_manager_enabled
40 | # Inverse - we only want to install these if the user is NOT bringing their own.
41 | metrics_server_enabled = !var.bring_your_own_metrics_server
42 | prometheus_enabled = !var.bring_your_own_prometheus
43 | grafana_enabled = !var.bring_your_own_grafana
44 | existent_prometheus_namespace = var.existent_prometheus_namespace
45 |
46 | create_new_oke_cluster = false
47 | existent_oke_cluster_id = var.existent_oke_cluster_id
48 |
49 | create_new_vcn = false
50 | existent_vcn_ocid = var.existent_vcn_ocid
51 |
52 | create_new_compartment_for_oke = false
53 | existent_vcn_compartment_ocid = var.compartment_ocid
54 |
55 | create_vault_policies_for_group = false
56 |
57 | create_subnets = false
58 | existent_oke_k8s_endpoint_subnet_ocid = var.existent_oke_k8s_endpoint_subnet_ocid
59 | existent_oke_nodes_subnet_ocid = var.existent_oke_nodes_subnet_ocid
60 | existent_oke_load_balancer_subnet_ocid = var.existent_oke_load_balancer_subnet_ocid
61 | # existent_oke_vcn_native_pod_networking_subnet_ocid = "" # Optional. Existent VCN Native POD Networking subnet if the CNI Type is "OCI_VCN_IP_NATIVE"
62 |
63 | }
64 |
65 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/policies.tf:
--------------------------------------------------------------------------------
1 | # Get compartment name for policy
2 | data "oci_identity_compartment" "oci_compartment" {
3 | id = var.compartment_ocid
4 | }
5 |
6 | # Define the dynamic group
7 | resource "oci_identity_dynamic_group" "dyn_group" {
8 | provider = oci.home_region
9 | name = "${local.app_name}-instance-dg"
10 | description = "Dynamic group for OKE instances across the tenancy"
11 | compartment_id = var.tenancy_ocid
12 | matching_rule = "ALL {instance.compartment.id = '${var.compartment_ocid}'}"
13 | freeform_tags = local.corrino_tags
14 | count = var.policy_creation_enabled ? 1 : 0
15 | }
16 |
17 | # Define the IAM policy
18 | resource "oci_identity_policy" "oke_instances_tenancy_policy" {
19 | provider = oci.home_region
20 | name = "${local.app_name}-dg-inst-policy"
21 | description = "Tenancy-level policy to grant needed permissions to the dynamic group"
22 | compartment_id = var.tenancy_ocid
23 |
24 | statements = [
25 | "Allow dynamic-group 'Default'/'${oci_identity_dynamic_group.dyn_group[0].name}' to manage all-resources in compartment ${data.oci_identity_compartment.oci_compartment.name}",
26 | "Allow dynamic-group 'Default'/'${oci_identity_dynamic_group.dyn_group[0].name}' to use all-resources in tenancy",
27 | "Allow dynamic-group 'Default'/'${oci_identity_dynamic_group.dyn_group[0].name}' to {CLUSTER_JOIN} in compartment ${data.oci_identity_compartment.oci_compartment.name}",
28 | "Allow dynamic-group 'Default'/'${oci_identity_dynamic_group.dyn_group[0].name}' to manage volumes in TENANCY where request.principal.type = 'cluster'",
29 | "Allow dynamic-group 'Default'/'${oci_identity_dynamic_group.dyn_group[0].name}' to manage volume-attachments in TENANCY where request.principal.type = 'cluster'"
30 | ]
31 | freeform_tags = local.corrino_tags
32 | count = var.policy_creation_enabled ? 1 : 0
33 | depends_on = [oci_identity_dynamic_group.dyn_group]
34 | }
35 |
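For a concrete reading of the interpolated statements: assuming local.app_name resolves to "corrino" and the target compartment is named "ai-blueprints" (both names are hypothetical), the first statement would render roughly as:

# Rendered form of the first policy statement (hypothetical names)
#   Allow dynamic-group 'Default'/'corrino-instance-dg' to manage all-resources in compartment ai-blueprints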
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/random.tf:
--------------------------------------------------------------------------------
1 | resource "random_string" "generated_workspace_name" {
2 | length = 6
3 | special = false
4 | min_upper = 3
5 | min_lower = 3
6 | }
7 |
8 | resource "random_string" "generated_deployment_name" {
9 | length = 6
10 | special = false
11 | min_upper = 3
12 | min_lower = 3
13 | }
14 |
15 | resource "random_string" "corrino_django_secret" {
16 | length = 32
17 | special = true
18 | min_upper = 3
19 | min_lower = 3
20 | min_numeric = 3
21 | min_special = 3
22 | override_special = "{}#^*<>[]%~"
23 | }
24 |
25 | resource "random_string" "autonomous_database_wallet_password" {
26 | length = 16
27 | special = true
28 | min_upper = 3
29 | min_lower = 3
30 | min_numeric = 3
31 | min_special = 3
32 | override_special = "{}#^*<>[]%~"
33 | }
34 |
35 | resource "random_string" "autonomous_database_admin_password" {
36 | length = 16
37 | special = true
38 | min_upper = 3
39 | min_lower = 3
40 | min_numeric = 3
41 | min_special = 3
42 | override_special = "{}#^*<>[]%~"
43 | }
44 |
45 | resource "random_string" "subdomain" {
46 | length = 6
47 | special = false
48 | upper = false
49 | }
50 |
51 | resource "random_uuid" "registration_id" {
52 | }
53 |
54 | #resource "random_string" "registration_id" {
55 | # length = 8
56 | # special = false
57 | # upper = false
58 | #}
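Elsewhere in the stack these generated values are consumed through the random provider's result attribute. A minimal sketch, with hypothetical local value names that are not taken from the repository:

locals {
  # Hypothetical references to the generated values above
  corrino_django_secret_value = random_string.corrino_django_secret.result
  adb_admin_password_value    = random_string.autonomous_database_admin_password.result
  corrino_registration_id     = random_uuid.registration_id.result
}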
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/rbac.tf:
--------------------------------------------------------------------------------
1 | resource "kubernetes_cluster_role" "corrino_cluster_role" {
2 | metadata {
3 | name = "corrino-rbac"
4 | }
5 | rule {
6 | api_groups = [""]
7 | resources = ["*"]
8 | verbs = ["*"]
9 | }
10 |
11 | count = 1
12 | }
13 |
14 | resource "kubernetes_cluster_role_binding" "corrino_cluster_role_binding" {
15 | metadata {
16 | name = "corrino-rbac"
17 | }
18 | subject {
19 | kind = "ServiceAccount"
20 | name = "default"
21 | namespace = "default"
22 | }
23 | role_ref {
24 | kind = "ClusterRole"
25 | name = "cluster-admin" # Binds the default ServiceAccount to the built-in cluster-admin role; the corrino-rbac ClusterRole defined above is not referenced here
26 | api_group = "rbac.authorization.k8s.io"
27 | }
28 |
29 | count = 1
30 | }
31 |
--------------------------------------------------------------------------------
/oci_ai_blueprints_terraform/versions.tf:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
2 | # Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.
3 | #
4 |
5 | terraform {
6 | required_version = ">= 1.1"
7 | required_providers {
8 | oci = {
9 | source = "oracle/oci"
10 | version = "~> 4, < 5"
11 | # https://registry.terraform.io/providers/oracle/oci/
12 | configuration_aliases = [oci.home_region]
13 | }
14 | kubernetes = {
15 | source = "hashicorp/kubernetes"
16 | version = "~> 2"
17 | # https://registry.terraform.io/providers/hashicorp/kubernetes/
18 | }
19 | helm = {
20 | source = "hashicorp/helm"
21 | version = "~> 2"
22 | # https://registry.terraform.io/providers/hashicorp/helm/
23 | }
24 | tls = {
25 | source = "hashicorp/tls"
26 | version = "~> 4"
27 | # https://registry.terraform.io/providers/hashicorp/tls/
28 | }
29 | local = {
30 | source = "hashicorp/local"
31 | version = "~> 2"
32 | # https://registry.terraform.io/providers/hashicorp/local/
33 | }
34 | random = {
35 | source = "hashicorp/random"
36 | version = "~> 3"
37 | # https://registry.terraform.io/providers/hashicorp/random/
38 | }
39 | }
40 | }
41 |
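The configuration_aliases entry for the oci provider signals that this configuration references an aliased provider named oci.home_region (policies.tf uses it so the IAM resources are created in the tenancy's home region). A minimal sketch of the matching provider block, with a placeholder region and omitting whatever authentication arguments the stack actually uses:

provider "oci" {
  alias  = "home_region"
  region = "us-ashburn-1" # placeholder: replace with the tenancy's home region
  # authentication settings (tenancy/user OCIDs, API key, etc.) as configured for the stack
}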
--------------------------------------------------------------------------------