├── .gitignore ├── 00-env-setup ├── README.md └── terraform │ ├── archive │ ├── artifact_registry.tf │ └── service_accounts.tf │ ├── caip-notebook.tf │ ├── gcs-bucket.tf │ ├── main.tf │ ├── services.tf │ ├── terraform.tfvars │ ├── variables.tf │ └── vertex-tensorboard.tf ├── 01-automl-tabular └── 01-vertex-automl-tabular-training-prediction.ipynb ├── 02-custom-job-tabular ├── 02-vertex-custom-job-tabular-training-prediction.ipynb ├── instances.json ├── model.png ├── predictor │ ├── Dockerfile │ └── model │ │ ├── saved_model.pb │ │ └── variables │ │ ├── variables.data-00000-of-00001 │ │ └── variables.index └── trainer │ └── Dockerfile ├── 03-distributed-training-text ├── 03-distributed-training-vertex-ai-bert-finetuning.ipynb └── scripts │ ├── Dockerfile │ └── trainer │ ├── __init__.py │ └── task.py ├── 04-vertex-pipeline-and-airflow ├── data_orchestration_bq_example_dag.py ├── get_composer_config.py ├── images │ ├── airflow_dag.png │ ├── airflow_dag_run.png │ ├── airflow_webserver_with_dag.png │ ├── pipeline_run.png │ └── trigger-airflow-dag-on-cloud-composer-from-vertex-pipeline.png └── vertex-pipeline-airflow.ipynb ├── 05-vertex-event-based-model-deploy ├── 05-event-based-vertex-model-deployment.ipynb └── images │ └── event_based_model_deployment.png ├── 06-vertex-train-deploy-r-model ├── images │ └── serving-with-custom-containers-on-vertex-predictions.png ├── vertex-train-deploy-r-model.ipynb └── vertex_ai_pipelines_r_model.ipynb ├── 07-vertex-train-deploy-lightgbm ├── images │ ├── serving-with-custom-containers-on-vertex-predictions.png │ └── training-with-custom-containers-on-vertex-training.png └── vertex-train-deploy-lightgbm-model.ipynb ├── 08-pytorch-distributed ├── README.md ├── pytorch_cifar10_distributed_reduction_server_sdk_custom.ipynb └── pytorch_cifar10_distributed_sdk_custom_batch_predictions.ipynb ├── 09-distributed-xgboost-dask ├── Dockerfile ├── README.md ├── train.py └── train.sh ├── 10-serving-ensemble-triton ├── models │ ├── combine │ │ ├── 1 │ │ │ └── model.py │ │ └── config.pbtxt │ ├── ensemble │ │ ├── 1 │ │ │ └── empty │ │ └── config.pbtxt │ ├── mux │ │ ├── 1 │ │ │ └── model.py │ │ └── config.pbtxt │ ├── sci_1 │ │ ├── 1 │ │ │ ├── checkpoint.tl │ │ │ └── sci_1.pkl │ │ └── config.pbtxt │ ├── sci_2 │ │ ├── 1 │ │ │ ├── checkpoint.tl │ │ │ └── sci_2.pkl │ │ └── config.pbtxt │ ├── tf │ │ ├── 1 │ │ │ └── model.savedmodel │ │ │ │ ├── saved_model.pb │ │ │ │ └── variables │ │ │ │ ├── variables.data-00000-of-00001 │ │ │ │ └── variables.index │ │ └── config.pbtxt │ └── xgb │ │ ├── 1 │ │ └── xgboost.json │ │ └── config.pbtxt ├── notebooks │ └── ensemble-nvidia-triton-custom-container-prediction.ipynb └── src │ ├── combine │ └── model.py │ ├── generate │ └── generate.py │ ├── mux │ └── model.py │ ├── sci_1 │ └── sci_1.py │ ├── sci_2 │ └── sci_2.py │ ├── test │ ├── combine_01.py │ ├── ensemble_01.py │ ├── mux.py │ ├── sci_1.py │ ├── sci_2.py │ ├── tf_01.py │ └── xgb_01.py │ ├── tf │ ├── tf.py │ └── tf.py_0 │ └── xgb │ └── xgb.py ├── 11-pytorch-on-tpu-vertex-ai └── pytorch-on-vertex-ai-tpu-train-mnist.ipynb ├── LICENSE ├── README.md └── images ├── automl.png ├── custom-tabular.png ├── custom-training-on-vertex-ai.png ├── pipeline.png ├── serving-with-custom-containers-on-vertex-predictions.png ├── training-with-custom-containers-on-vertex-training.png └── vertex-ai-labs-focus.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C 
extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /00-env-setup/README.md: -------------------------------------------------------------------------------- 1 | # Creating a Vertex environment 2 | 3 | You can use the [Terraform](https://www.terraform.io/) scripts in the `terraform` folder to automatically provision the environment required by the samples. 4 | 5 | The scripts perform the following actions: 6 | 1. Enable the required Cloud APIs 7 | 2. Create a regional GCS bucket 8 | 3. Create an instance of Vertex Notebooks 9 | 4. Create service accounts for Vertex Training and Vertex Pipelines 10 | 5. 
Create an instance of Vertex Tensorboard 11 | 12 | ## Provision Environment 13 | 14 | To provision the environment: 15 | 16 | - Open Cloud Shell 17 | 18 | [![Open in Cloud Shell](https://gstatic.com/cloudssh/images/open-btn.svg)](https://ssh.cloud.google.com/cloudshell/editor?cloudshell_git_repo=https://github.com/RajeshThallam/vertex-ai-labs.git) 19 | 20 | 21 | - Navigate to `~/vertex-ai-labs/00-env-setup/terraform` 22 | ``` 23 | LOCAL_DIR=~/vertex-ai-labs 24 | cd $LOCAL_DIR/00-env-setup/terraform 25 | ``` 26 | 27 | - Update the `terraform.tfvars` file with values reflecting your environment (refer to [Customize Configuration](#customize-configuration) for details on each variable). Alternatively, you can provide the values using the Terraform CLI `-var` options when you execute `terraform apply` in the next step. 28 | 29 | - Execute the following commands: 30 | ``` 31 | terraform init 32 | terraform apply 33 | ``` 34 | 35 | - To destroy the environment, execute: 36 | ``` 37 | terraform destroy 38 | ``` 39 | 40 | ## Customize Configuration 41 | 42 | You can customize your configuration using the following variables in `terraform.tfvars`: 43 | 44 | |Variable|Required|Default|Description| 45 | |--------|--------|-------|-----------| 46 | |name_prefix|Yes||Prefix added to the names of provisioned resources. **The prefix should start with a letter and include letters and digits only**.| 47 | |project_id|Yes||GCP project ID| 48 | |network_name|No|default|Name of the network for the Notebook instance. The network must already exist.| 49 | |subnet_name|No|default|Name of the subnet for the Notebook instance. The subnet must already exist.| 50 | |subnet_region|No|us-central1|Region where the subnet was created.| 51 | |zone|Yes||GCP zone for the Notebook instance. The zone must be in the region defined in the `subnet_region` variable.| 52 | |machine_type|No|n1-standard-4|Machine type of the Notebook instance| 53 | |boot_disk_size|No|200GB|Size of the Notebook instance's boot disk| 54 | |image_family|No|tf-2-4-cpu|Image family for the Notebook instance| 55 | |gpu_type|No|null|GPU type of the Notebook instance. By default, the Notebook instance will be provisioned without a GPU| 56 | |gpu_count|No|null|GPU count of the Notebook instance| 57 | |install_gpu_driver|No|false|Whether to install a GPU driver| 58 | |region|No|Set to subnet_region.|GCP region for the GCS bucket and Artifact Registry. It is recommended to use the same region for the bucket, the registry, and the Notebook instance. If not provided, `region` defaults to `subnet_region`.| 59 | |force_destroy|No|false|Whether to force the removal of the bucket on `terraform destroy`. **Note that by default the bucket will not be destroyed**.| -------------------------------------------------------------------------------- /00-env-setup/terraform/archive/artifact_registry.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | resource "google_artifact_registry_repository" "docker-repo" { 17 | provider = google-beta 18 | project = module.project-services.project_id 19 | location = local.region 20 | repository_id = "${var.name_prefix}-registry" 21 | description = "Docker repository" 22 | format = "DOCKER" 23 | } -------------------------------------------------------------------------------- /00-env-setup/terraform/archive/service_accounts.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Create Vertex Training service account 16 | resource "google_service_account" "training_sa" { 17 | project = module.project-services.project_id 18 | account_id = var.training_sa_name 19 | display_name = "Vertex Training service account" 20 | } 21 | 22 | # Create Vertex Training SA role bindings 23 | resource "google_project_iam_member" "training_sa_role_bindings" { 24 | project = module.project-services.project_id 25 | for_each = toset(var.training_sa_roles) 26 | member = "serviceAccount:${google_service_account.training_sa.email}" 27 | role = "roles/${each.value}" 28 | } 29 | 30 | # Create Vertex Pipelines service account 31 | resource "google_service_account" "pipelines_sa" { 32 | project = module.project-services.project_id 33 | account_id = var.pipelines_sa_name 34 | display_name = "Vertex Pipelines account name" 35 | } 36 | 37 | # Create Vertex Pipelines SA role bindings 38 | resource "google_project_iam_member" "role_bindings" { 39 | project = module.project-services.project_id 40 | for_each = toset(var.pipelines_sa_roles) 41 | member = "serviceAccount:${google_service_account.pipelines_sa.email}" 42 | role = "roles/${each.value}" 43 | } 44 | -------------------------------------------------------------------------------- /00-env-setup/terraform/caip-notebook.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | locals { 16 | image_project = "deeplearning-platform-release" 17 | } 18 | 19 | data "google_compute_network" "vm_network" { 20 | project = module.project-services.project_id 21 | name = var.network_name 22 | 23 | depends_on = [ 24 | module.project-services 25 | ] 26 | } 27 | 28 | data "google_compute_subnetwork" "vm_subnetwork" { 29 | project = module.project-services.project_id 30 | name = var.subnet_name 31 | region = var.subnet_region 32 | 33 | depends_on = [ 34 | module.project-services 35 | ] 36 | } 37 | 38 | resource "google_notebooks_instance" "notebook_instance" { 39 | project = module.project-services.project_id 40 | name = "${var.name_prefix}-notebook" 41 | machine_type = var.machine_type 42 | location = var.zone 43 | 44 | network = data.google_compute_network.vm_network.id 45 | subnet = data.google_compute_subnetwork.vm_subnetwork.id 46 | 47 | vm_image { 48 | project = local.image_project 49 | image_family = var.image_family 50 | } 51 | 52 | dynamic accelerator_config { 53 | for_each = var.gpu_type != null ? [1] : [] 54 | content { 55 | type = var.gpu_type 56 | core_count = var.gpu_count 57 | } 58 | } 59 | 60 | install_gpu_driver = var.install_gpu_driver 61 | 62 | boot_disk_size_gb = var.boot_disk_size 63 | } 64 | -------------------------------------------------------------------------------- /00-env-setup/terraform/gcs-bucket.tf: -------------------------------------------------------------------------------- 1 | 2 | # Copyright 2021 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | resource "google_storage_bucket" "artifact_repo" { 18 | project = module.project-services.project_id 19 | name = "${var.name_prefix}-${var.project_id}-bucket" 20 | location = local.region 21 | storage_class = local.bucket_type 22 | force_destroy = var.force_destroy 23 | } -------------------------------------------------------------------------------- /00-env-setup/terraform/main.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | terraform { 16 | required_version = ">= 0.14" 17 | required_providers { 18 | google = "~> 3.6" 19 | } 20 | 21 | # backend "gcs" { 22 | # bucket = "jk-terraform-state" 23 | # prefix = "vertex-ai-env" 24 | # } 25 | } 26 | 27 | provider "google" { 28 | project = var.project_id 29 | } 30 | 31 | 32 | data "google_project" "project" { 33 | project_id = var.project_id 34 | } 35 | 36 | locals { 37 | bucket_type = "REGIONAL" 38 | region = var.region == null ? var.subnet_region : var.region 39 | } 40 | 41 | 42 | -------------------------------------------------------------------------------- /00-env-setup/terraform/services.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | module "project-services" { 17 | source = "terraform-google-modules/project-factory/google//modules/project_services" 18 | 19 | project_id = data.google_project.project.project_id 20 | 21 | disable_services_on_destroy = false 22 | activate_apis = [ 23 | "compute.googleapis.com", 24 | "iam.googleapis.com", 25 | "container.googleapis.com", 26 | "artifactregistry.googleapis.com", 27 | "cloudresourcemanager.googleapis.com", 28 | "cloudtrace.googleapis.com", 29 | "iamcredentials.googleapis.com", 30 | "monitoring.googleapis.com", 31 | "logging.googleapis.com", 32 | "notebooks.googleapis.com", 33 | "aiplatform.googleapis.com", 34 | "dataflow.googleapis.com", 35 | "bigquery.googleapis.com", 36 | "cloudbuild.googleapis.com", 37 | "bigquerydatatransfer.googleapis.com", 38 | ] 39 | } 40 | 41 | output "api_activated" { 42 | value = true 43 | } -------------------------------------------------------------------------------- /00-env-setup/terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | project_id = "CHANGE THIS WITH YOUR PROJECT ID e.g. from Qwiklabs console" 2 | subnet_region = "us-central1" 3 | zone = "us-central1-a" 4 | name_prefix = "CHANGE THIS WITH YOUR STRING e.g. your initials" 5 | machine_type = "n1-standard-8" 6 | #gpu_type = "NVIDIA_TESLA_T4" 7 | #gpu_count = 1 8 | #install_gpu_driver = true 9 | #image_family = "tf-2-4-gpu" 10 | 11 | 12 | -------------------------------------------------------------------------------- /00-env-setup/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | 2 | # Copyright 2021 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | variable "project_id" { 18 | description = "The GCP project ID" 19 | type = string 20 | } 21 | 22 | variable "region" { 23 | description = "The region for the GCS bucket and Artifact Registry" 24 | type = string 25 | default = null 26 | } 27 | 28 | variable "zone" { 29 | description = "The zone for a Vertex Notebook instance" 30 | type = string 31 | } 32 | 33 | variable "name_prefix" { 34 | description = "The name prefix to add to the resource names" 35 | type = string 36 | } 37 | 38 | variable "machine_type" { 39 | description = "The Notebook instance's machine type" 40 | type = string 41 | } 42 | 43 | variable "network_name" { 44 | description = "The network name for the Notebook instance" 45 | type = string 46 | default = "default" 47 | } 48 | 49 | variable "subnet_name" { 50 | description = "The subnet name for the Notebook instance" 51 | type = string 52 | default = "default" 53 | } 54 | 55 | variable "subnet_region" { 56 | description = "The region for the Notebook subnet" 57 | type = string 58 | default = "us-central1" 59 | } 60 | 61 | variable "boot_disk_size" { 62 | description = "The size of the boot disk" 63 | default = 200 64 | } 65 | 66 | variable "image_family" { 67 | description = "A Deep Learning image family for the Notebook instance" 68 | type = string 69 | default = "tf-2-4-cpu" 70 | } 71 | 72 | variable "gpu_type" { 73 | description = "A GPU type for the Notebook instance" 74 | type = string 75 | default = null 76 | } 77 | 78 | variable "gpu_count" { 79 | description = "A GPU count for the Notebook instance" 80 | type = string 81 | default = null 82 | } 83 | 84 | variable "install_gpu_driver" { 85 | description = "Whether to install GPU driver" 86 | type = bool 87 | default = false 88 | } 89 | 90 | variable "force_destroy" { 91 | description = "Whether to remove the bucket on destroy" 92 | type = bool 93 | default = false 94 | } 95 | 96 | variable "training_sa_roles" { 97 | description = "The roles to assign to the Vertex Training service account" 98 | default = [ 99 | "storage.admin", 100 | "aiplatform.user", 101 | "bigquery.admin" 102 | ] 103 | } 104 | 105 | variable "pipelines_sa_roles" { 106 | description = "The roles to assign to the Vertex Pipelines service account" 107 | default = [ 108 | "storage.admin", 109 | "bigquery.admin", 110 | "aiplatform.user" 111 | ] 112 | } 113 | 114 | variable "training_sa_name" { 115 | description = "Vertex training service account name." 116 | default = "training-sa" 117 | } 118 | 119 | variable "pipelines_sa_name" { 120 | description = "Vertex pipelines service account name." 121 | default = "pipelines-sa" 122 | } 123 | -------------------------------------------------------------------------------- /00-env-setup/terraform/vertex-tensorboard.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | module tensorboard { 16 | source = "terraform-google-modules/gcloud/google" 17 | version = "~> 2.0" 18 | 19 | platform = "linux" 20 | 21 | create_cmd_entrypoint = "printf 'yes' | gcloud" 22 | create_cmd_body = "beta ai tensorboards create --display-name ${var.name_prefix}-${var.subnet_region}-tensorboard --project ${var.project_id} --region ${var.subnet_region}" 23 | destroy_cmd_entrypoint = "printf 'yes' | gcloud" 24 | destroy_cmd_body = "beta ai tensorboards delete $(gcloud beta ai tensorboards list --region ${var.subnet_region} --filter='displayName:${var.name_prefix}-${var.subnet_region}-tensorboard' --format='value(name)' --project ${var.project_id})" 25 | 26 | depends_on = [module.project-services.api_activated] 27 | } -------------------------------------------------------------------------------- /01-automl-tabular/01-vertex-automl-tabular-training-prediction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Copyright 2020 Google LLC\n", 10 | "#\n", 11 | "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", 12 | "# you may not use this file except in compliance with the License.\n", 13 | "# You may obtain a copy of the License at\n", 14 | "#\n", 15 | "# https://www.apache.org/licenses/LICENSE-2.0\n", 16 | "#\n", 17 | "# Unless required by applicable law or agreed to in writing, software\n", 18 | "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", 19 | "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 20 | "# See the License for the specific language governing permissions and\n", 21 | "# limitations under the License." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Training and deploying a tabular model using Vertex AutoML.\n", 29 | "\n", 30 | "![Training pipeline](../images/automl.png)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Install required packages" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# Get the site-packages directory so we can remove invalid packages.\n", 47 | "import site\n", 48 | "sp = site.getsitepackages()[0]\n", 49 | "print(sp)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "%%bash -s \"$sp\"\n", 59 | "# Remove the invalide site-packages\n", 60 | "echo $1\n", 61 | "sudo rm -rf $1/~*" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "%%bash\n", 71 | "pip install --user google-cloud-aiplatform --upgrade\n", 72 | "pip install --user kfp --upgrade\n", 73 | "pip install --user google-cloud-pipeline-components --upgrade\n", 74 | "pip install --user google-cloud-bigquery-datatransfer --upgrade" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "\n", 82 | "### Restart the kernel\n", 83 | "Once you've installed the required packages, you need to restart the notebook kernel so it can find the packages." 
84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# Automatically restart kernel after installs\n", 93 | "import IPython\n", 94 | "app = IPython.Application.instance()\n", 95 | "app.kernel.do_shutdown(True)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## Import the required packages" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "import os\n", 112 | "import pprint\n", 113 | "import pandas as pd\n", 114 | "import tensorflow as tf\n", 115 | "import time\n", 116 | "from datetime import datetime\n", 117 | "import csv\n", 118 | "\n", 119 | "import matplotlib.pyplot as plt\n", 120 | "\n", 121 | "import google.auth\n", 122 | "\n", 123 | "from google.cloud import aiplatform as vertex_ai\n", 124 | "from google.cloud.aiplatform_v1beta1 import types\n", 125 | "from google.cloud import bigquery\n", 126 | "from google.cloud import exceptions\n", 127 | "\n", 128 | "from tensorflow.keras import layers\n", 129 | "from tensorflow.keras.layers.experimental import preprocessing\n", 130 | "\n", 131 | "from tensorflow_io import bigquery as tfio_bq\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "## Configure GCP settings\n", 139 | "\n", 140 | "*Before running the notebook make sure to follow the repo's README file to install the pre-requisites and configure GCP authentication.*" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "creds, PROJECT = google.auth.default()\n", 150 | "print(creds)\n", 151 | "REGION = 'us-central1'\n", 152 | "\n", 153 | "STAGING_BUCKET = f'gs://{PROJECT}-labs'\n", 154 | "\n", 155 | "# Get the configured service account this notebook is running as\n", 156 | "bash_output = !gcloud config list account --format \"value(core.account)\" 2> /dev/null\n", 157 | "VERTEX_SA = bash_output[0]\n", 158 | "\n", 159 | "print(f\"PROJECT = {PROJECT}\")\n", 160 | "print(f\"STAGING_BUCKET = {STAGING_BUCKET}\")\n", 161 | "print(f\"VERTEX_SA = {VERTEX_SA}\")\n", 162 | "\n", 163 | "# Create the bucket. 
Ignore error if it already exists.\n", 164 | "!gsutil mb -l $REGION $STAGING_BUCKET" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "## Preparing training data in BigQuery" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "### Explore Chicago Taxi dataset" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "%%bigquery data\n", 188 | "\n", 189 | "SELECT \n", 190 | " *\n", 191 | "FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`\n", 192 | "LIMIT 3" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "data.head().T" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "%%bigquery data\n", 211 | "\n", 212 | "SELECT \n", 213 | " CAST(EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS string) AS trip_dayofweek, \n", 214 | " FORMAT_DATE('%A',cast(trip_start_timestamp as date)) AS trip_dayname,\n", 215 | " COUNT(*) as trip_count,\n", 216 | "FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`\n", 217 | "WHERE\n", 218 | " EXTRACT(YEAR FROM trip_start_timestamp) = 2020 \n", 219 | "GROUP BY\n", 220 | " trip_dayofweek,\n", 221 | " trip_dayname\n", 222 | "ORDER BY\n", 223 | " trip_dayofweek" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "data" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "data.plot(kind='bar', x='trip_dayname', y='trip_count')" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "### Create data splits" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "BQ_DATASET_NAME = f'vertex_lab01' \n", 258 | "BQ_TABLE_NAME = 'features'\n", 259 | "BQ_LOCATION = 'US'\n", 260 | "SAMPLE_SIZE = 500000\n", 261 | "YEAR = 2020" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "#### Create a BQ dataset to host the splits" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "client = bigquery.Client()\n", 278 | "\n", 279 | "dataset_id = f'{PROJECT}.{BQ_DATASET_NAME}'\n", 280 | "dataset = bigquery.Dataset(dataset_id)\n", 281 | "dataset.location = BQ_LOCATION\n", 282 | "\n", 283 | "try:\n", 284 | " dataset = client.create_dataset(dataset, timeout=30)\n", 285 | " print('Created dataset: ', dataset_id)\n", 286 | "except exceptions.Conflict:\n", 287 | " print('Dataset {} already exists'.format(dataset_id))" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "#### Create a table with training features" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "sample_size = 1000000\n", 304 | "year = 2020\n", 305 | "\n", 306 | "sql_script_template = '''\n", 307 | "CREATE OR REPLACE TABLE `@PROJECT.@DATASET.@TABLE` \n", 308 | "AS (\n", 309 | " WITH\n", 310 | " taxitrips 
AS (\n", 311 | " SELECT\n", 312 | " FORMAT_DATETIME('%Y-%d-%m', trip_start_timestamp) AS date,\n", 313 | " trip_start_timestamp,\n", 314 | " trip_seconds,\n", 315 | " trip_miles,\n", 316 | " payment_type,\n", 317 | " pickup_longitude,\n", 318 | " pickup_latitude,\n", 319 | " dropoff_longitude,\n", 320 | " dropoff_latitude,\n", 321 | " tips,\n", 322 | " fare\n", 323 | " FROM\n", 324 | " `bigquery-public-data.chicago_taxi_trips.taxi_trips`\n", 325 | " WHERE 1=1 \n", 326 | " AND pickup_longitude IS NOT NULL\n", 327 | " AND pickup_latitude IS NOT NULL\n", 328 | " AND dropoff_longitude IS NOT NULL\n", 329 | " AND dropoff_latitude IS NOT NULL\n", 330 | " AND trip_miles > 0\n", 331 | " AND trip_seconds > 0\n", 332 | " AND fare > 0\n", 333 | " AND EXTRACT(YEAR FROM trip_start_timestamp) = @YEAR\n", 334 | " )\n", 335 | "\n", 336 | " SELECT\n", 337 | " trip_start_timestamp,\n", 338 | " EXTRACT(MONTH from trip_start_timestamp) as trip_month,\n", 339 | " EXTRACT(DAY from trip_start_timestamp) as trip_day,\n", 340 | " EXTRACT(DAYOFWEEK from trip_start_timestamp) as trip_day_of_week,\n", 341 | " EXTRACT(HOUR from trip_start_timestamp) as trip_hour,\n", 342 | " trip_seconds,\n", 343 | " trip_miles,\n", 344 | " payment_type,\n", 345 | " ST_AsText(\n", 346 | " ST_SnapToGrid(ST_GeogPoint(pickup_longitude, pickup_latitude), 0.1)\n", 347 | " ) AS pickup_grid,\n", 348 | " ST_AsText(\n", 349 | " ST_SnapToGrid(ST_GeogPoint(dropoff_longitude, dropoff_latitude), 0.1)\n", 350 | " ) AS dropoff_grid,\n", 351 | " ST_Distance(\n", 352 | " ST_GeogPoint(pickup_longitude, pickup_latitude), \n", 353 | " ST_GeogPoint(dropoff_longitude, dropoff_latitude)\n", 354 | " ) AS euclidean,\n", 355 | " IF((tips/fare >= 0.2), 1, 0) AS tip_bin,\n", 356 | " CASE (ABS(MOD(FARM_FINGERPRINT(date),10))) \n", 357 | " WHEN 9 THEN 'TEST'\n", 358 | " WHEN 8 THEN 'VALIDATE'\n", 359 | " ELSE 'TRAIN' END AS data_split\n", 360 | " FROM\n", 361 | " taxitrips\n", 362 | " LIMIT @LIMIT\n", 363 | ")\n", 364 | "'''\n", 365 | "\n", 366 | "sql_script = sql_script_template.replace(\n", 367 | " '@PROJECT', PROJECT).replace(\n", 368 | " '@DATASET', BQ_DATASET_NAME).replace(\n", 369 | " '@TABLE', BQ_TABLE_NAME).replace(\n", 370 | " '@YEAR', str(year)).replace(\n", 371 | " '@LIMIT', str(sample_size))\n", 372 | "\n", 373 | "job = client.query(sql_script)\n", 374 | "job.result()" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "#### Review the created features" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "sql_script = f'''\n", 391 | "SELECT * EXCEPT (trip_start_timestamp)\n", 392 | "FROM `{PROJECT}.{BQ_DATASET_NAME}.{BQ_TABLE_NAME}`\n", 393 | "'''\n", 394 | "df = client.query(sql_script).result().to_dataframe()" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "df.head().T" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "## Creating a tabular dataset in Vertex" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "### Initialize Vertex AI SDK" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "vertex_ai.init(\n", 427 | " project=PROJECT,\n", 428 | " location=REGION,\n", 429 | " 
staging_bucket=STAGING_BUCKET\n", 430 | ")" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "### Create a dataset and import data" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "display_name = 'Chicago taxi trips'\n", 447 | "bq_source_uri = f'bq://{PROJECT}.{BQ_DATASET_NAME}.{BQ_TABLE_NAME}'\n", 448 | "\n", 449 | "filter = f'display_name=\"{display_name}\"'\n", 450 | "\n", 451 | "dataset = vertex_ai.TabularDataset.list(filter=filter)\n", 452 | "if not dataset:\n", 453 | " print(\"Creating a new dataset.\")\n", 454 | " dataset = vertex_ai.TabularDataset.create(\n", 455 | " display_name=display_name, bq_source=bq_source_uri,\n", 456 | " )\n", 457 | "\n", 458 | " dataset.wait()\n", 459 | "else:\n", 460 | " print(\"Using existing dataset: \", dataset[0].resource_name)\n", 461 | " dataset = vertex_ai.TabularDataset(dataset_name=dataset[0].resource_name)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": {}, 467 | "source": [ 468 | "## Launching an AutoML training job" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "display_name = 'Chicago Taxi classifier training'\n", 478 | "model_display_name = 'Chicago Taxi classifier'\n", 479 | "target_column = 'tip_bin'\n", 480 | "optimization_prediction_type = 'classification'\n", 481 | "optimization_objective = 'maximize-recall-at-precision'\n", 482 | "optimization_objective_precision_value = 0.7\n", 483 | "split_column = 'data_split'\n", 484 | "budget_milli_node_hours = 1000\n", 485 | "\n", 486 | "column_transformations = [\n", 487 | " {'categorical': {'column_name': 'trip_month'}},\n", 488 | " {'categorical': {'column_name': 'trip_day'}},\n", 489 | " {'categorical': {'column_name': 'trip_day_of_week'}},\n", 490 | " {'categorical': {'column_name': 'trip_hour'}},\n", 491 | " {'categorical': {'column_name': 'payment_type'}},\n", 492 | " {'categorical': {'column_name': 'pickup_grid'}},\n", 493 | " {'categorical': {'column_name': 'dropoff_grid'}},\n", 494 | " {'numeric': {'column_name': 'trip_seconds'}},\n", 495 | " {'numeric': {'column_name': 'euclidean'}},\n", 496 | " {'numeric': {'column_name': 'trip_miles'}},\n", 497 | "]\n", 498 | "\n", 499 | "job = vertex_ai.AutoMLTabularTrainingJob(\n", 500 | " display_name=display_name,\n", 501 | " optimization_prediction_type=optimization_prediction_type,\n", 502 | " optimization_objective=optimization_objective,\n", 503 | " optimization_objective_precision_value=optimization_objective_precision_value,\n", 504 | " column_transformations=column_transformations,\n", 505 | ")\n", 506 | "\n", 507 | "model = job.run(\n", 508 | " dataset=dataset,\n", 509 | " target_column=target_column,\n", 510 | " budget_milli_node_hours=budget_milli_node_hours,\n", 511 | " model_display_name=model_display_name,\n", 512 | " predefined_split_column_name=split_column,\n", 513 | " sync=False\n", 514 | ")" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [ 523 | "print(f\"Job Name: {job.display_name}\")\n", 524 | "print(f\"Job Resource Name: {job.resource_name}\\n\")\n", 525 | "print(f\"Check training progress at {job._dashboard_uri()}\")" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "metadata": {}, 532 | "outputs": [], 533 | 
"source": [ 534 | "# This blocks until the model is finished training.\n", 535 | "# This takes about 70min, given 1000 milli_node_hours budget (1 hour, paid for),\n", 536 | "# plus 10min startup times (free).\n", 537 | "\n", 538 | "model.wait()\n", 539 | "print(f\"Job Name: {model.display_name}\")" 540 | ] 541 | }, 542 | { 543 | "cell_type": "markdown", 544 | "metadata": {}, 545 | "source": [ 546 | "## Deploy Model\n" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": null, 552 | "metadata": {}, 553 | "outputs": [], 554 | "source": [ 555 | "endpoint = model.deploy(machine_type=\"n1-standard-4\", sync=False)" 556 | ] 557 | }, 558 | { 559 | "cell_type": "markdown", 560 | "metadata": {}, 561 | "source": [ 562 | "## Model Deployment\n", 563 | "\n", 564 | "Now deploy the trained Vertex Model resource for batch and online prediction.\n", 565 | "\n", 566 | "For online prediction, you:\n", 567 | "\n", 568 | "- Create an Endpoint resource for deploying the Model resource to.\n", 569 | "- Deploy the Model resource to the Endpoint resource.\n", 570 | "- Make online prediction requests to the Endpoint resource.\n", 571 | "\n", 572 | "For batch-prediction, you:\n", 573 | "\n", 574 | "- Create a batch prediction job.\n", 575 | "- The job service will provision resources for the batch prediction request.\n", 576 | "- The results of the batch prediction request are returned to the caller.\n", 577 | "- The job service will unprovision the resoures for the batch prediction request." 578 | ] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "metadata": {}, 583 | "source": [ 584 | "### Predict on Endpoint - Online Prediction" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "job.state" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": null, 599 | "metadata": {}, 600 | "outputs": [], 601 | "source": [ 602 | "# Block until the endpoint is deployed, which takes about 12min.\n", 603 | "endpoint.wait()" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "metadata": {}, 610 | "outputs": [], 611 | "source": [ 612 | "test_instances = [ \n", 613 | " \n", 614 | " {\n", 615 | " \"dropoff_grid\": \"POINT(-87.6 41.9)\",\n", 616 | " \"euclidean\": 2064.2696,\n", 617 | " \"payment_type\": \"Credit Card\",\n", 618 | " \"pickup_grid\": \"POINT(-87.6 41.9)\",\n", 619 | " \"trip_miles\": 1.37,\n", 620 | " \"trip_day\": \"12\",\n", 621 | " \"trip_hour\": \"16\",\n", 622 | " \"trip_month\": \"2\",\n", 623 | " \"trip_day_of_week\": \"4\",\n", 624 | " \"trip_seconds\": \"555\"\n", 625 | " }\n", 626 | "]\n", 627 | "\n", 628 | "predictions = endpoint.predict(instances=test_instances)" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": null, 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [ 637 | "predictions" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": {}, 644 | "outputs": [], 645 | "source": [ 646 | "predictions = endpoint.predict(instances=test_instances)\n", 647 | "predictions" 648 | ] 649 | }, 650 | { 651 | "cell_type": "markdown", 652 | "metadata": {}, 653 | "source": [ 654 | "### Batch Prediction Job\n", 655 | "\n", 656 | "Now do a batch prediction to your deployed model." 
657 | ] 658 | }, 659 | { 660 | "cell_type": "markdown", 661 | "metadata": {}, 662 | "source": [ 663 | "#### Make test items" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": null, 669 | "metadata": {}, 670 | "outputs": [], 671 | "source": [ 672 | "sql_script = f'''\n", 673 | "SELECT trip_month, trip_day, trip_day_of_week, trip_hour, payment_type, pickup_grid, dropoff_grid, trip_seconds, euclidean, trip_miles\n", 674 | "FROM `{PROJECT}.{BQ_DATASET_NAME}.{BQ_TABLE_NAME}`\n", 675 | "LIMIT 1000\n", 676 | "'''\n", 677 | "\n", 678 | "dtypes = {\n", 679 | " 'dropoff_grid': str,\n", 680 | " 'euclidean': 'float64',\n", 681 | " 'trip_month': str,\n", 682 | " 'trip_day': str,\n", 683 | " 'trip_day_of_week': str,\n", 684 | " 'trip_hour': str,\n", 685 | " 'payment_type': str,\n", 686 | " 'pickup_grid': str,\n", 687 | " 'trip_seconds': str,\n", 688 | " 'trip_miles': 'float64'\n", 689 | "}\n", 690 | "\n", 691 | "df_test_batch = client.query(sql_script).result().to_dataframe(dtypes=dtypes)" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": null, 697 | "metadata": {}, 698 | "outputs": [], 699 | "source": [ 700 | "df_test_batch.head()" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": null, 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [ 709 | "df_test_batch.dtypes" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": null, 715 | "metadata": {}, 716 | "outputs": [], 717 | "source": [ 718 | "out_file_name = \"bq_export_features_test.csv\"\n", 719 | "gcs_batch_request_csv = f'{STAGING_BUCKET}/test/batch/{out_file_name}'\n", 720 | "df_test_batch.to_csv(f'{STAGING_BUCKET}/test/batch/bq_export_features_test.csv',\n", 721 | " header=True, \n", 722 | " index=False,\n", 723 | " quoting=csv.QUOTE_NONNUMERIC,\n", 724 | " escapechar=\"\\\\\",\n", 725 | " doublequote=False\n", 726 | " )" 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "execution_count": null, 732 | "metadata": {}, 733 | "outputs": [], 734 | "source": [ 735 | "!gsutil cat $gcs_batch_request_csv | head" 736 | ] 737 | }, 738 | { 739 | "cell_type": "markdown", 740 | "metadata": {}, 741 | "source": [ 742 | "#### Make the batch prediction request\n", 743 | "\n", 744 | "Now that your Model resource is trained, you can make a batch prediction by invoking the batch_request() method, with the following parameters:\n", 745 | "\n", 746 | "- `job_display_name`: The human readable name for the batch prediction job.\n", 747 | "- `gcs_source`: A list of one or more batch request input files.\n", 748 | "- `gcs_destination_prefix`: The Cloud Storage location for storing the batch prediction resuls.\n", 749 | "- `sync`: If set to True, the call will block while waiting for the asynchronous batch job to complete." 
750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": null, 755 | "metadata": {}, 756 | "outputs": [], 757 | "source": [ 758 | "TIMESTAMP = datetime.now().strftime(\"%Y%m%d%H%M%S\")" 759 | ] 760 | }, 761 | { 762 | "cell_type": "code", 763 | "execution_count": null, 764 | "metadata": {}, 765 | "outputs": [], 766 | "source": [ 767 | "batch_predict_job = model.batch_predict(\n", 768 | " job_display_name=f\"{model_display_name}-batch-{TIMESTAMP}\",\n", 769 | " gcs_source=gcs_batch_request_csv,\n", 770 | " instances_format=\"csv\",\n", 771 | " gcs_destination_prefix=f'{STAGING_BUCKET}/test/batch_results/',\n", 772 | " predictions_format=\"csv\",\n", 773 | " sync=False\n", 774 | ")\n", 775 | "\n", 776 | "print(batch_predict_job)" 777 | ] 778 | }, 779 | { 780 | "cell_type": "markdown", 781 | "metadata": {}, 782 | "source": [ 783 | "#### Wait for completion of batch prediction job\n", 784 | "Next, wait for the batch job to complete." 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": null, 790 | "metadata": {}, 791 | "outputs": [], 792 | "source": [ 793 | "# Blocks while the batch prediction job is running, which takes about 18min.\n", 794 | "batch_predict_job.wait()" 795 | ] 796 | }, 797 | { 798 | "cell_type": "markdown", 799 | "metadata": {}, 800 | "source": [ 801 | "#### Get the predictions\n", 802 | "Next, get the results from the completed batch prediction job.\n", 803 | "\n", 804 | "The results are written to the Cloud Storage output bucket you specified in the batch prediction request. You call the method iter_outputs() to get a list of each Cloud Storage file generated with the results. Each file contains one or more prediction requests in a JSON format:\n", 805 | "\n", 806 | "- `content`: The prediction request.\n", 807 | "- `prediction`: The prediction response.\n", 808 | " - `ids`: The internal assigned unique identifiers for each prediction request.\n", 809 | " - `displayNames`: The class names for each class label.\n", 810 | " - `confidences`: The predicted confidence, between 0 and 1, per class label." 811 | ] 812 | }, 813 | { 814 | "cell_type": "markdown", 815 | "metadata": {}, 816 | "source": [ 817 | "---\n", 818 | "\n", 819 | "**NOTE: There is issue with batch prediction job where input data types are not matching with model inputs. 
Skip the section below if you hit into issues**\n", 820 | "\n", 821 | "---" 822 | ] 823 | }, 824 | { 825 | "cell_type": "code", 826 | "execution_count": null, 827 | "metadata": {}, 828 | "outputs": [], 829 | "source": [ 830 | "bp_iter_outputs = batch_predict_job.iter_outputs()\n", 831 | "\n", 832 | "prediction_results = list()\n", 833 | "for blob in bp_iter_outputs:\n", 834 | " if blob.name.split(\"/\")[-1].startswith(\"prediction\"):\n", 835 | " prediction_results.append(blob.name)" 836 | ] 837 | }, 838 | { 839 | "cell_type": "code", 840 | "execution_count": null, 841 | "metadata": {}, 842 | "outputs": [], 843 | "source": [ 844 | "tags = list()\n", 845 | "for prediction_result in prediction_results:\n", 846 | " gfile_name = f\"gs://{bp_iter_outputs.bucket.name}/{prediction_result}\"\n", 847 | " with tf.io.gfile.GFile(name=gfile_name, mode=\"r\") as gfile:\n", 848 | " for line in gfile.readlines():\n", 849 | " print(line)" 850 | ] 851 | }, 852 | { 853 | "cell_type": "markdown", 854 | "metadata": {}, 855 | "source": [ 856 | "## Clean up" 857 | ] 858 | }, 859 | { 860 | "cell_type": "markdown", 861 | "metadata": {}, 862 | "source": [ 863 | "### Undeploy Models\n", 864 | "When you are done doing predictions, you undeploy the Model resource from the Endpoint resouce. This deprovisions all compute resources and ends billing for the deployed model." 865 | ] 866 | }, 867 | { 868 | "cell_type": "code", 869 | "execution_count": null, 870 | "metadata": {}, 871 | "outputs": [], 872 | "source": [ 873 | "endpoint.list_models()" 874 | ] 875 | }, 876 | { 877 | "cell_type": "code", 878 | "execution_count": null, 879 | "metadata": {}, 880 | "outputs": [], 881 | "source": [ 882 | "endpoint.undeploy_all()" 883 | ] 884 | }, 885 | { 886 | "cell_type": "markdown", 887 | "metadata": {}, 888 | "source": [ 889 | "### Delete Endpoint" 890 | ] 891 | }, 892 | { 893 | "cell_type": "code", 894 | "execution_count": null, 895 | "metadata": {}, 896 | "outputs": [], 897 | "source": [ 898 | "endpoint.delete()" 899 | ] 900 | }, 901 | { 902 | "cell_type": "markdown", 903 | "metadata": {}, 904 | "source": [ 905 | "### Delete Model" 906 | ] 907 | }, 908 | { 909 | "cell_type": "code", 910 | "execution_count": null, 911 | "metadata": {}, 912 | "outputs": [], 913 | "source": [ 914 | "model.delete()" 915 | ] 916 | } 917 | ], 918 | "metadata": { 919 | "colab": { 920 | "collapsed_sections": [], 921 | "name": "AI_Platform_(Unified)_SDK_AutoML_Image_Classification_Training.ipynb", 922 | "toc_visible": true 923 | }, 924 | "environment": { 925 | "name": "tf2-gpu.2-5.m74", 926 | "type": "gcloud", 927 | "uri": "gcr.io/deeplearning-platform-release/tf2-gpu.2-5:m74" 928 | }, 929 | "kernelspec": { 930 | "display_name": "Python [conda env:root] *", 931 | "language": "python", 932 | "name": "conda-root-py" 933 | }, 934 | "language_info": { 935 | "codemirror_mode": { 936 | "name": "ipython", 937 | "version": 3 938 | }, 939 | "file_extension": ".py", 940 | "mimetype": "text/x-python", 941 | "name": "python", 942 | "nbconvert_exporter": "python", 943 | "pygments_lexer": "ipython3", 944 | "version": "3.7.10" 945 | } 946 | }, 947 | "nbformat": 4, 948 | "nbformat_minor": 4 949 | } 950 | -------------------------------------------------------------------------------- /02-custom-job-tabular/instances.json: -------------------------------------------------------------------------------- 1 | {"instances" : [ 2 | { 3 | "dropoff_grid": ["POINT(-87.6 41.9)"], 4 | "euclidean": [2064.2696], 5 | "payment_type": ["Credit Card"], 6 | "pickup_grid": ["POINT(-87.6 
41.9)"], 7 | "trip_miles": [1.37], 8 | "trip_day": [12], 9 | "trip_hour": [16], 10 | "trip_month": [2], 11 | "trip_day_of_week": [4], 12 | "trip_seconds": [555] 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /02-custom-job-tabular/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/02-custom-job-tabular/model.png -------------------------------------------------------------------------------- /02-custom-job-tabular/predictor/Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM tensorflow/serving:2.4.0 3 | 4 | # Set where models should be stored in the container 5 | ENV MODEL_BASE_PATH=/models 6 | ENV MODEL_NAME=model 7 | 8 | RUN mkdir -p ${MODEL_BASE_PATH}/${MODEL_NAME}/1 9 | 10 | # copy the model file 11 | COPY model ${MODEL_BASE_PATH}/${MODEL_NAME}/1/ 12 | 13 | # Create a script that runs the model server so we can use environment variables 14 | # while also passing in arguments from the docker command line 15 | RUN echo '#!/bin/bash \n\n\ 16 | tensorflow_model_server --port=5000 --rest_api_port=8080 \ 17 | --model_name=${MODEL_NAME} --model_base_path=${MODEL_BASE_PATH}/${MODEL_NAME} \ 18 | "$@"' > /usr/bin/predictor.sh \ 19 | && chmod +x /usr/bin/predictor.sh 20 | 21 | EXPOSE 5000 22 | EXPOSE 8080 23 | 24 | # Remove entrypoint from parent image 25 | ENTRYPOINT [] 26 | 27 | CMD ["/usr/bin/predictor.sh"] 28 | -------------------------------------------------------------------------------- /02-custom-job-tabular/predictor/model/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/02-custom-job-tabular/predictor/model/saved_model.pb -------------------------------------------------------------------------------- /02-custom-job-tabular/predictor/model/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/02-custom-job-tabular/predictor/model/variables/variables.data-00000-of-00001 -------------------------------------------------------------------------------- /02-custom-job-tabular/predictor/model/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/02-custom-job-tabular/predictor/model/variables/variables.index -------------------------------------------------------------------------------- /02-custom-job-tabular/trainer/Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-4 3 | 4 | WORKDIR /trainer 5 | RUN pip install cloudml-hypertune 6 | 7 | # Copies the trainer code to the docker image. 8 | COPY train.py . 
9 | 10 | ENTRYPOINT ["python", "train.py"] 11 | -------------------------------------------------------------------------------- /03-distributed-training-text/scripts/Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-4 3 | 4 | RUN pip install tf-models-official==2.4.0 tensorflow-text==2.4.1 5 | 6 | WORKDIR / 7 | 8 | # Copies the trainer code to the docker image. 9 | COPY trainer /trainer 10 | 11 | # Sets up the entry point to invoke the trainer. 12 | ENTRYPOINT ["python", "-m", "trainer.task"] 13 | -------------------------------------------------------------------------------- /03-distributed-training-text/scripts/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/03-distributed-training-text/scripts/trainer/__init__.py -------------------------------------------------------------------------------- /03-distributed-training-text/scripts/trainer/task.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright 2021 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | 16 | import os 17 | import tensorflow as tf 18 | import tensorflow_hub as hub 19 | import tensorflow_text as text 20 | 21 | from absl import app 22 | from absl import flags 23 | from absl import logging 24 | from official.nlp import optimization 25 | 26 | 27 | TFHUB_HANDLE_ENCODER = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3' 28 | TFHUB_HANDLE_PREPROCESS = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3' 29 | LOCAL_TB_FOLDER = '/tmp/logs' 30 | LOCAL_SAVED_MODEL_DIR = '/tmp/saved_model' 31 | 32 | FLAGS = flags.FLAGS 33 | flags.DEFINE_integer('steps_per_epoch', 625, 'Steps per training epoch') 34 | flags.DEFINE_integer('eval_steps', 150, 'Evaluation steps') 35 | flags.DEFINE_integer('epochs', 2, 'Number of epochs') 36 | flags.DEFINE_integer('per_replica_batch_size', 32, 'Per replica batch size') 37 | flags.DEFINE_string('training_data_path', f'/bert-finetuning/imdb/tfrecords/train', 'Training data GCS path') 38 | flags.DEFINE_string('validation_data_path', f'/bert-finetuning/imdb/tfrecords/valid', 'Validation data GCS path') 39 | flags.DEFINE_string('testing_data_path', f'/bert-finetuning/imdb/tfrecords/test', 'Testing data GCS path') 40 | 41 | flags.DEFINE_string('job_dir', f'/jobs', 'A base GCS path for jobs') 42 | flags.DEFINE_enum('strategy', 'multiworker', ['mirrored', 'multiworker'], 'Distribution strategy') 43 | flags.DEFINE_enum('auto_shard_policy', 'auto', ['auto', 'data', 'file', 'off'], 'Dataset sharding strategy') 44 | 45 | 46 | 47 | auto_shard_policy = { 48 | 'auto': tf.data.experimental.AutoShardPolicy.AUTO, 49 | 'data': tf.data.experimental.AutoShardPolicy.DATA, 50 | 'file': tf.data.experimental.AutoShardPolicy.FILE, 51 | 'off': 
tf.data.experimental.AutoShardPolicy.OFF, 52 | } 53 | 54 | 55 | def create_unbatched_dataset(tfrecords_folder): 56 | """Creates an unbatched dataset in the format required by the 57 | sentiment analysis model from the folder with TFrecords files.""" 58 | 59 | feature_description = { 60 | 'text_fragment': tf.io.FixedLenFeature([], tf.string, default_value=''), 61 | 'label': tf.io.FixedLenFeature([], tf.int64, default_value=0), 62 | } 63 | 64 | def _parse_function(example_proto): 65 | parsed_example = tf.io.parse_single_example(example_proto, feature_description) 66 | return parsed_example['text_fragment'], parsed_example['label'] 67 | 68 | file_paths = [f'{tfrecords_folder}/{file_path}' for file_path in tf.io.gfile.listdir(tfrecords_folder)] 69 | dataset = tf.data.TFRecordDataset(file_paths) 70 | dataset = dataset.map(_parse_function) 71 | 72 | return dataset 73 | 74 | 75 | def configure_dataset(ds, auto_shard_policy): 76 | """ 77 | Optimizes the performance of a dataset. 78 | """ 79 | 80 | options = tf.data.Options() 81 | options.experimental_distribute.auto_shard_policy = ( 82 | auto_shard_policy 83 | ) 84 | 85 | ds = ds.repeat(-1).cache() 86 | ds = ds.prefetch(buffer_size=tf.data.AUTOTUNE) 87 | ds = ds.with_options(options) 88 | return ds 89 | 90 | 91 | def create_input_pipelines(train_dir, valid_dir, test_dir, batch_size, auto_shard_policy): 92 | """Creates input pipelines from Imdb dataset.""" 93 | 94 | train_ds = create_unbatched_dataset(train_dir) 95 | train_ds = train_ds.batch(batch_size) 96 | train_ds = configure_dataset(train_ds, auto_shard_policy) 97 | 98 | valid_ds = create_unbatched_dataset(valid_dir) 99 | valid_ds = valid_ds.batch(batch_size) 100 | valid_ds = configure_dataset(valid_ds, auto_shard_policy) 101 | 102 | test_ds = create_unbatched_dataset(test_dir) 103 | test_ds = test_ds.batch(batch_size) 104 | test_ds = configure_dataset(test_ds, auto_shard_policy) 105 | 106 | return train_ds, valid_ds, test_ds 107 | 108 | 109 | def build_classifier_model(tfhub_handle_preprocess, tfhub_handle_encoder): 110 | """Builds a simple binary classification model with BERT trunk.""" 111 | 112 | text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text') 113 | preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing') 114 | encoder_inputs = preprocessing_layer(text_input) 115 | encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder') 116 | outputs = encoder(encoder_inputs) 117 | net = outputs['pooled_output'] 118 | net = tf.keras.layers.Dropout(0.1)(net) 119 | net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net) 120 | 121 | return tf.keras.Model(text_input, net) 122 | 123 | 124 | def copy_tensorboard_logs(local_path: str, gcs_path: str): 125 | """Copies Tensorboard logs from a local dir to a GCS location. 126 | 127 | After training, batch copy Tensorboard logs locally to a GCS location. This can result 128 | in faster pipeline runtimes over streaming logs per batch to GCS that can get bottlenecked 129 | when streaming large volumes. 130 | 131 | Args: 132 | local_path: local filesystem directory uri. 133 | gcs_path: cloud filesystem directory uri. 134 | Returns: 135 | None. 
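    Example (illustrative paths only):
        copy_tensorboard_logs('/tmp/logs', 'gs://my-bucket/jobs/run-1/tb_logs')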
136 | """ 137 | pattern = '{}/*/events.out.tfevents.*'.format(local_path) 138 | local_files = tf.io.gfile.glob(pattern) 139 | gcs_log_files = [local_file.replace(local_path, gcs_path) for local_file in local_files] 140 | for local_file, gcs_file in zip(local_files, gcs_log_files): 141 | tf.io.gfile.copy(local_file, gcs_file) 142 | 143 | 144 | def main(argv): 145 | del argv 146 | 147 | def _is_chief(task_type, task_id): 148 | return ((task_type == 'chief' or task_type == 'worker') and task_id == 0) or task_type is None 149 | 150 | 151 | logging.info('Setting up training.') 152 | logging.info(' epochs: {}'.format(FLAGS.epochs)) 153 | logging.info(' steps_per_epoch: {}'.format(FLAGS.steps_per_epoch)) 154 | logging.info(' eval_steps: {}'.format(FLAGS.eval_steps)) 155 | logging.info(' strategy: {}'.format(FLAGS.strategy)) 156 | 157 | tb_dir = os.getenv('AIP_TENSORBOARD_LOG_DIR', LOCAL_TB_FOLDER) 158 | model_dir = os.getenv('AIP_MODEL_DIR', LOCAL_MODEL_DIR) 159 | 160 | if FLAGS.strategy == 'mirrored': 161 | strategy = tf.distribute.MirroredStrategy() 162 | else: 163 | strategy = tf.distribute.MultiWorkerMirroredStrategy() 164 | 165 | if strategy.cluster_resolver: 166 | task_type, task_id = (strategy.cluster_resolver.task_type, 167 | strategy.cluster_resolver.task_id) 168 | else: 169 | task_type, task_id =(None, None) 170 | 171 | 172 | global_batch_size = (strategy.num_replicas_in_sync * 173 | FLAGS.per_replica_batch_size) 174 | 175 | 176 | train_ds, valid_ds, test_ds = create_input_pipelines( 177 | FLAGS.training_data_path, 178 | FLAGS.validation_data_path, 179 | FLAGS.testing_data_path, 180 | global_batch_size, 181 | auto_shard_policy[FLAGS.auto_shard_policy]) 182 | 183 | num_train_steps = FLAGS.steps_per_epoch * FLAGS.epochs 184 | num_warmup_steps = int(0.1*num_train_steps) 185 | init_lr = 3e-5 186 | 187 | with strategy.scope(): 188 | model = build_classifier_model(TFHUB_HANDLE_PREPROCESS, TFHUB_HANDLE_ENCODER) 189 | loss = tf.keras.losses.BinaryCrossentropy(from_logits=True) 190 | metrics = tf.metrics.BinaryAccuracy() 191 | optimizer = optimization.create_optimizer( 192 | init_lr=init_lr, 193 | num_train_steps=num_train_steps, 194 | num_warmup_steps=num_warmup_steps, 195 | optimizer_type='adamw') 196 | 197 | model.compile(optimizer=optimizer, 198 | loss=loss, 199 | metrics=metrics) 200 | 201 | # Configure BackupAndRestore callback 202 | backup_dir = '{}/backupandrestore'.format(FLAGS.job_dir) 203 | callbacks = [tf.keras.callbacks.experimental.BackupAndRestore(backup_dir=backup_dir)] 204 | 205 | # Configure TensorBoard callback on Chief 206 | if _is_chief(task_type, task_id): 207 | callbacks.append(tf.keras.callbacks.TensorBoard( 208 | log_dir=tb_dir, update_freq='batch')) 209 | 210 | logging.info('Starting training ...') 211 | 212 | history = model.fit(x=train_ds, 213 | validation_data=valid_ds, 214 | steps_per_epoch=FLAGS.steps_per_epoch, 215 | validation_steps=FLAGS.eval_steps, 216 | epochs=FLAGS.epochs, 217 | callbacks=callbacks) 218 | 219 | if _is_chief(task_type, task_id): 220 | # Copy tensorboard logs to GCS 221 | # tb_logs = '{}/tb_logs'.format(FLAGS.job_dir) 222 | # logging.info('Copying TensorBoard logs to: {}'.format(tb_logs)) 223 | # copy_tensorboard_logs(LOCAL_TB_FOLDER, tb_logs) 224 | saved_model_dir = '{}/saved_model'.format(model_dir) 225 | else: 226 | saved_model_dir = model_dir 227 | 228 | # Save trained model 229 | saved_model_dir = '{}/saved_model'.format(model_dir) 230 | logging.info('Training completed. 
Saving the trained model to: {}'.format(saved_model_dir)) 231 | model.save(saved_model_dir) 232 | #tf.saved_model.save(model, saved_model_dir) 233 | 234 | 235 | if __name__ == '__main__': 236 | logging.set_verbosity(logging.INFO) 237 | app.run(main) 238 | -------------------------------------------------------------------------------- /04-vertex-pipeline-and-airflow/data_orchestration_bq_example_dag.py: -------------------------------------------------------------------------------- 1 | 2 | """An example Composer workflow integrating GCS and BigQuery. 3 | 4 | A .csv is read from a GCS bucket to a BigQuery table; a query is made, and the 5 | result is written back to a different BigQuery table within a new dataset. 6 | """ 7 | 8 | from datetime import datetime, timedelta 9 | from airflow import DAG 10 | from airflow.contrib.operators.bigquery_operator import BigQueryOperator 11 | from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator 12 | from airflow.operators.bash_operator import BashOperator 13 | 14 | YESTERDAY = datetime.combine( 15 | datetime.today() - timedelta(days=1), datetime.min.time()) 16 | BQ_DATASET_NAME = 'bq_demos' 17 | 18 | default_args = { 19 | 'owner': 'airflow', 20 | 'depends_on_past': False, 21 | 'start_date': YESTERDAY, 22 | 'email_on_failure': False, 23 | 'email_on_retry': False, 24 | 'retries': 1, 25 | 'retry_delay': timedelta(minutes=5), 26 | } 27 | 28 | # Solution: pass a schedule_interval argument to DAG instantiation. 29 | with DAG('dag_gcs_to_bq_orch', default_args=default_args, 30 | schedule_interval=None) as dag: 31 | create_bq_dataset_if_not_exist = """ 32 | bq ls {0} 33 | if [ $? -ne 0 ]; then 34 | bq mk {0} 35 | fi 36 | """.format(BQ_DATASET_NAME) 37 | 38 | # Create destination dataset. 39 | t1 = BashOperator( 40 | task_id='create_destination_dataset', 41 | bash_command=create_bq_dataset_if_not_exist, 42 | dag=dag) 43 | 44 | # Create a bigquery table from a .csv file located in a GCS bucket 45 | # (gs://example-datasets/game_data_condensed.csv). 46 | # Store it in our dataset. 47 | t2 = GoogleCloudStorageToBigQueryOperator( 48 | task_id='gcs_to_bq', 49 | bucket='example-datasets', 50 | source_objects=['game_data_condensed.csv'], 51 | destination_project_dataset_table='{0}.composer_game_data_table' 52 | .format(BQ_DATASET_NAME), 53 | schema_fields=[ 54 | {'name': 'name', 'type': 'string', 'mode': 'nullable'}, 55 | {'name': 'team', 'type': 'string', 'mode': 'nullable'}, 56 | {'name': 'total_score', 'type': 'integer', 'mode': 'nullable'}, 57 | {'name': 'timestamp', 'type': 'integer', 'mode': 'nullable'}, 58 | {'name': 'window_start', 'type': 'string', 'mode': 'nullable'}, 59 | ], 60 | write_disposition='WRITE_TRUNCATE') 61 | 62 | # Run example query (http://shortn/_BdF1UTEYOb) and save result to the 63 | # destination table. 
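    # Note: `bql` below is the legacy BigQueryOperator parameter; newer Airflow releases
    # rename it to `sql`, and `use_legacy_sql=False` would be needed to run standard SQL.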
64 | t3 = BigQueryOperator( 65 | task_id='bq_example_query', 66 | bql=f""" 67 | SELECT 68 | name, team, total_score 69 | FROM 70 | {BQ_DATASET_NAME}.composer_game_data_table 71 | WHERE total_score > 15 72 | LIMIT 100; 73 | """, 74 | destination_dataset_table='{0}.gcp_example_query_result' 75 | .format(BQ_DATASET_NAME), 76 | write_disposition='WRITE_TRUNCATE') 77 | 78 | t1 >> t2 >> t3 79 | -------------------------------------------------------------------------------- /04-vertex-pipeline-and-airflow/get_composer_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Get the client ID associated with a Cloud Composer environment.""" 16 | 17 | import argparse 18 | 19 | 20 | def get_client_id(project_id, location, composer_environment): 21 | # [START composer_get_environment_client_id] 22 | import google.auth 23 | import google.auth.transport.requests 24 | import requests 25 | import six.moves.urllib.parse 26 | 27 | # Authenticate with Google Cloud. 28 | # See: https://cloud.google.com/docs/authentication/getting-started 29 | credentials, _ = google.auth.default( 30 | scopes=['https://www.googleapis.com/auth/cloud-platform']) 31 | authed_session = google.auth.transport.requests.AuthorizedSession( 32 | credentials) 33 | 34 | # project_id = 'YOUR_PROJECT_ID' 35 | # location = 'us-central1' 36 | # composer_environment = 'YOUR_COMPOSER_ENVIRONMENT_NAME' 37 | 38 | environment_url = ( 39 | 'https://composer.googleapis.com/v1beta1/projects/{}/locations/{}' 40 | '/environments/{}').format(project_id, location, composer_environment) 41 | composer_response = authed_session.request('GET', environment_url) 42 | environment_data = composer_response.json() 43 | airflow_uri = environment_data['config']['airflowUri'] 44 | print(airflow_uri) 45 | dag_gcs_prefix = environment_data['config']['dagGcsPrefix'] 46 | print(dag_gcs_prefix) 47 | 48 | # The Composer environment response does not include the IAP client ID. 49 | # Make a second, unauthenticated HTTP request to the web server to get the 50 | # redirect URI. 51 | redirect_response = requests.get(airflow_uri, allow_redirects=False) 52 | redirect_location = redirect_response.headers['location'] 53 | 54 | # Extract the client_id query parameter from the redirect. 
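    # The unauthenticated request gets redirected to the IAP sign-in flow, so the
    # Location header looks roughly like this (illustrative value only):
    #   https://accounts.google.com/o/oauth2/v2/auth?client_id=1234567890-abc.apps.googleusercontent.com&...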
55 | parsed = six.moves.urllib.parse.urlparse(redirect_location) 56 | query_string = six.moves.urllib.parse.parse_qs(parsed.query) 57 | print(query_string['client_id'][0]) 58 | # [END composer_get_environment_client_id] 59 | 60 | 61 | # Usage: python get_client_id.py your_project_id your_region your_environment_name 62 | if __name__ == '__main__': 63 | parser = argparse.ArgumentParser( 64 | description=__doc__, 65 | formatter_class=argparse.RawDescriptionHelpFormatter) 66 | parser.add_argument('project_id', help='Your Project ID.') 67 | parser.add_argument( 68 | 'location', help='Region of the Cloud Composer environment.') 69 | parser.add_argument( 70 | 'composer_environment', help='Name of the Cloud Composer environment.') 71 | 72 | args = parser.parse_args() 73 | get_client_id( 74 | args.project_id, args.location, args.composer_environment) 75 | -------------------------------------------------------------------------------- /04-vertex-pipeline-and-airflow/images/airflow_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/04-vertex-pipeline-and-airflow/images/airflow_dag.png -------------------------------------------------------------------------------- /04-vertex-pipeline-and-airflow/images/airflow_dag_run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/04-vertex-pipeline-and-airflow/images/airflow_dag_run.png -------------------------------------------------------------------------------- /04-vertex-pipeline-and-airflow/images/airflow_webserver_with_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/04-vertex-pipeline-and-airflow/images/airflow_webserver_with_dag.png -------------------------------------------------------------------------------- /04-vertex-pipeline-and-airflow/images/pipeline_run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/04-vertex-pipeline-and-airflow/images/pipeline_run.png -------------------------------------------------------------------------------- /04-vertex-pipeline-and-airflow/images/trigger-airflow-dag-on-cloud-composer-from-vertex-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/04-vertex-pipeline-and-airflow/images/trigger-airflow-dag-on-cloud-composer-from-vertex-pipeline.png -------------------------------------------------------------------------------- /04-vertex-pipeline-and-airflow/vertex-pipeline-airflow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Copyright 2021 Google LLC\n", 10 | "\n", 11 | "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", 12 | "# you may not use this file except in compliance with the License.\n", 13 | "# You may obtain a copy of the License at\n", 14 | "\n", 15 | "# 
https://www.apache.org/licenses/LICENSE-2.0\n", 16 | "\n", 17 | "# Unless required by applicable law or agreed to in writing, software\n", 18 | "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", 19 | "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 20 | "# See the License for the specific language governing permissions and\n", 21 | "# limitations under the License." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Trigger Airflow DAG in Cloud Composer from a Vertex Pipeline\n", 29 | "\n", 30 | "Apache Airflow is most popular choice for data pipelining in general. However, arguably not a good choice to run Machine learning pipelines due to lack of ML metadata tracking, artifact lineage, tracking ML metrics across metrics etc. [Vertex Pipelines](https://cloud.google.com/vertex-ai/docs/pipelines/introduction) solves this problem and automates, monitors, and governs your ML systems by orchestrating your ML workflow in a serverless manner, and storing your workflow's artifacts using Vertex ML Metadata.\n", 31 | "\n", 32 | "In this notebook, we will show you how you can trigger a data pipeline i.e. Airflow DAG on Cloud Composer from a ML pipeline running on Vertex Pipelines.\n", 33 | "\n", 34 | "![Trigger Airflow DAG on Cloud Composer from Vertex Pipeline](images/trigger-airflow-dag-on-cloud-composer-from-vertex-pipeline.png)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "Following are high level steps:\n", 42 | "\n", 43 | "1. Create Cloud Composer environment\n", 44 | "2. Upload Airflow DAG to Composer environment that performs data processing\n", 45 | "3. Create a Vertex Pipeline that triggers the Airflow DAG" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Installing packages\n", 53 | "\n", 54 | "Start with installing KFP SDK and Google Cloud Pipeline components in the environment" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "USER_FLAG = \"--user\"" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "!pip3 install {USER_FLAG} google-cloud-aiplatform==1.0.0 --upgrade\n", 73 | "!pip3 install {USER_FLAG} kfp google-cloud-pipeline-components==0.1.1 --upgrade" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "After installing these packages you'll need to restart the kernel:" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "import os\n", 90 | "\n", 91 | "if not os.getenv(\"IS_TESTING\"):\n", 92 | " # Automatically restart kernel after installs\n", 93 | " import IPython\n", 94 | "\n", 95 | " app = IPython.Application.instance()\n", 96 | " app.kernel.do_shutdown(True)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "Finally, check that you have correctly installed the packages. 
The KFP SDK version should be >=1.6:" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "!python3 -c \"import kfp; print('KFP SDK version: {}'.format(kfp.__version__))\"\n", 113 | "!python3 -c \"import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))\"" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "### Set your project ID and bucket" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Throughout this notebook you'll reference your Cloud project ID and the bucket you created earlier. Next we'll create variables for each of those.\n", 128 | "\n", 129 | "If you don't know your project ID, you may be able to get it by running the following:" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "import google.auth\n", 139 | "\n", 140 | "creds, PROJECT_ID = google.auth.default()\n", 141 | "REGION = 'us-central1'" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "Otherwise, set it here:" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "if PROJECT_ID == \"\" or PROJECT_ID is None:\n", 158 | " PROJECT_ID = \"your-project-id\" # @param {type:\"string\"}" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "print(f\"PROJECT_ID = {PROJECT_ID}\")\n", 168 | "print(f\"REGION = {REGION}\")" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "Then create a variable to store your bucket name, and create the bucket if it does not exist already." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "BUCKET_NAME = \"gs://\" + \"cloud-ai-platform-2f444b6a-a742-444b-b91a-c7519f51bd77\"" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "# run only if the bucket does not exist already\n", 194 | "!gsutil mb -l $REGION $BUCKET_NAME" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "## Create Composer Environment\n", 202 | "\n", 203 | "Please follow the instructions in the [document](https://cloud.google.com/composer/docs/how-to/managing/creating#) to create a Composer environment with the configuration you need. For this sample demonstration, we create a bare-minimum Composer environment. \n", 204 | "\n", 205 | "To trigger an Airflow DAG from a Vertex Pipeline, we will use the Airflow web server REST API. By default, the API authentication feature is disabled in Airflow 1.10.11 and above, which denies all requests made to the Airflow web server. To allow the pipeline to trigger the DAG, we enable this feature by overriding the `auth_backend` configuration of the Composer environment to `airflow.api.auth.backend.default`.\n", 206 | "\n", 207 | "**NOTE:** Cloud Composer environment creation may take up to 30 min. Grab your favorite beverage until then."
208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "COMPOSER_ENV_NAME = \"test-composer-env\"\n", 217 | "ZONE = \"us-central1-f\"" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "!gcloud beta composer environments create $COMPOSER_ENV_NAME \\\n", 227 | " --location $REGION \\\n", 228 | " --zone $ZONE\\\n", 229 | " --machine-type n1-standard-2 \\\n", 230 | " --image-version composer-latest-airflow-1.10.15 \\\n", 231 | " --airflow-configs=api-auth_backend=airflow.api.auth.backend.default" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "### Get Composer Environment configuration\n", 239 | "\n", 240 | "We will get Composer environment configuration such as webserver URL and client ID to use in the Vertex Pipeline using the script `get_composer_client_id.py`" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "# This code is modified version of https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/composer/rest/get_client_id.py\n", 250 | "\n", 251 | "shell_output=!python3 get_composer_config.py $PROJECT_ID $REGION $COMPOSER_ENV_NAME\n", 252 | "COMPOSER_WEB_URI = shell_output[0]\n", 253 | "COMPOSER_DAG_GCS = shell_output[1]\n", 254 | "COMPOSER_CLIENT_ID = shell_output[2]\n", 255 | "\n", 256 | "print(f\"COMPOSER_WEB_URI = {COMPOSER_WEB_URI}\")\n", 257 | "print(f\"COMPOSER_DAG_GCS = {COMPOSER_DAG_GCS}\")\n", 258 | "print(f\"COMPOSER_CLIENT_ID = {COMPOSER_CLIENT_ID}\")" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "You can navigate to Airflow webserver by going to this URL" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "COMPOSER_WEB_URI" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "## Upload DAG to Cloud Composer environment\n", 282 | "\n", 283 | "We have a sample data processing DAG `data_orchestration_bq_example_dag.py` that reads a CSV file from GCS bucket and writes to BigQuery. We will add this file to the GCS bucket configure for the Composer environment that Airflow watches." 
284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "COMPOSER_DAG_NAME = \"dag_gcs_to_bq_orch\"\n", 293 | "COMPOSER_DAG_FILENAME = \"data_orchestration_bq_example_dag.py\"" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "%%writefile $COMPOSER_DAG_FILENAME\n", 303 | "\n", 304 | "\"\"\"An example Composer workflow integrating GCS and BigQuery.\n", 305 | "\n", 306 | "A .csv is read from a GCS bucket to a BigQuery table; a query is made, and the\n", 307 | "result is written back to a different BigQuery table within a new dataset.\n", 308 | "\"\"\"\n", 309 | "\n", 310 | "from datetime import datetime, timedelta\n", 311 | "from airflow import DAG\n", 312 | "from airflow.contrib.operators.bigquery_operator import BigQueryOperator\n", 313 | "from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator\n", 314 | "from airflow.operators.bash_operator import BashOperator\n", 315 | "\n", 316 | "YESTERDAY = datetime.combine(\n", 317 | " datetime.today() - timedelta(days=1), datetime.min.time())\n", 318 | "BQ_DATASET_NAME = 'bq_demos'\n", 319 | "\n", 320 | "default_args = {\n", 321 | " 'owner': 'airflow',\n", 322 | " 'depends_on_past': False,\n", 323 | " 'start_date': YESTERDAY,\n", 324 | " 'email_on_failure': False,\n", 325 | " 'email_on_retry': False,\n", 326 | " 'retries': 1,\n", 327 | " 'retry_delay': timedelta(minutes=5),\n", 328 | "}\n", 329 | "\n", 330 | "# Solution: pass a schedule_interval argument to DAG instantiation.\n", 331 | "with DAG('dag_gcs_to_bq_orch', default_args=default_args,\n", 332 | " schedule_interval=None) as dag:\n", 333 | " create_bq_dataset_if_not_exist = \"\"\"\n", 334 | " bq ls {0}\n", 335 | " if [ $? 
-ne 0 ]; then\n", 336 | " bq mk {0}\n", 337 | " fi\n", 338 | " \"\"\".format(BQ_DATASET_NAME)\n", 339 | "\n", 340 | " # Create destination dataset.\n", 341 | " t1 = BashOperator(\n", 342 | " task_id='create_destination_dataset',\n", 343 | " bash_command=create_bq_dataset_if_not_exist,\n", 344 | " dag=dag)\n", 345 | "\n", 346 | " # Create a bigquery table from a .csv file located in a GCS bucket\n", 347 | " # (gs://example-datasets/game_data_condensed.csv).\n", 348 | " # Store it in our dataset.\n", 349 | " t2 = GoogleCloudStorageToBigQueryOperator(\n", 350 | " task_id='gcs_to_bq',\n", 351 | " bucket='example-datasets',\n", 352 | " source_objects=['game_data_condensed.csv'],\n", 353 | " destination_project_dataset_table='{0}.composer_game_data_table'\n", 354 | " .format(BQ_DATASET_NAME),\n", 355 | " schema_fields=[\n", 356 | " {'name': 'name', 'type': 'string', 'mode': 'nullable'},\n", 357 | " {'name': 'team', 'type': 'string', 'mode': 'nullable'},\n", 358 | " {'name': 'total_score', 'type': 'integer', 'mode': 'nullable'},\n", 359 | " {'name': 'timestamp', 'type': 'integer', 'mode': 'nullable'},\n", 360 | " {'name': 'window_start', 'type': 'string', 'mode': 'nullable'},\n", 361 | " ],\n", 362 | " write_disposition='WRITE_TRUNCATE')\n", 363 | "\n", 364 | " # Run example query (http://shortn/_BdF1UTEYOb) and save result to the\n", 365 | " # destination table.\n", 366 | " t3 = BigQueryOperator(\n", 367 | " task_id='bq_example_query',\n", 368 | " bql=f\"\"\"\n", 369 | " SELECT\n", 370 | " name, team, total_score\n", 371 | " FROM\n", 372 | " {BQ_DATASET_NAME}.composer_game_data_table\n", 373 | " WHERE total_score > 15\n", 374 | " LIMIT 100;\n", 375 | " \"\"\",\n", 376 | " destination_dataset_table='{0}.gcp_example_query_result'\n", 377 | " .format(BQ_DATASET_NAME),\n", 378 | " write_disposition='WRITE_TRUNCATE')\n", 379 | "\n", 380 | " t1 >> t2 >> t3" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "!gsutil cp $COMPOSER_DAG_FILENAME $COMPOSER_DAG_GCS/" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "!gsutil ls -l $COMPOSER_DAG_GCS/$COMPOSER_DAG_FILENAME" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "You should the DAG in your Airflow webserver\n", 406 | "\n", 407 | "![](images/airflow_webserver_with_dag.png)\n", 408 | "\n", 409 | "![](images/airflow_dag.png)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "## Vertex Pipelines setup" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "### Import libraries\n", 424 | "\n", 425 | "Add the following to import the libraries we'll be using throughout this codelab:" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "from typing import NamedTuple\n", 435 | "import re\n", 436 | "\n", 437 | "import kfp\n", 438 | "from kfp import dsl\n", 439 | "from kfp.v2 import compiler\n", 440 | "from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,\n", 441 | " OutputPath, ClassificationMetrics, Metrics, component)\n", 442 | "from kfp.v2.google.client import AIPlatformClient\n", 443 | "\n", 444 | "from google.cloud import aiplatform\n", 445 | "from google_cloud_pipeline_components import 
aiplatform as gcc_aip" 446 | ] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "metadata": {}, 451 | "source": [ 452 | "### Define constants\n", 453 | "\n", 454 | "Before building the pipeline define some constant variables:\n", 455 | "\n", 456 | "- `PIPELINE_ROOT` is the Cloud Storage path where the artifacts created by the pipeline will be written. We're using us-central1 as the region here, but if you used a different region when you created your bucket, update the REGION variable in the code above" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "PATH=%env PATH\n", 466 | "%env PATH={PATH}:/home/jupyter/.local/bin\n", 467 | "\n", 468 | "PIPELINE_ROOT = f\"{BUCKET_NAME}/pipeline_root/\"\n", 469 | "PIPELINE_ROOT" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "After running the code above, you should see the root directory for your pipeline printed. This is the Cloud Storage location where the artifacts from your pipeline will be written. It will be in the format of `gs://BUCKET_NAME/pipeline_root/`" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "### Create a Python function based component to trigger Airflow DAG\n", 484 | "\n", 485 | "Using the KFP SDK, we can create components based on Python functions. The component takes Airflow DAG name `dag_name` a string as input and returns response from Airflow web server as an `Artifact` that contains Airflow DAG run information. The component makes a request to Airflow REST API of your Cloud Composer environment. Airflow processes this request and runs a DAG. The DAG outputs information about the change that is logged as artifact (you can output as string as well." 
486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": null, 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "@component(\n", 495 | " base_image=\"gcr.io/ml-pipeline/google-cloud-pipeline-components:0.1.3\",\n", 496 | " output_component_file=\"composer-trigger-dag-component.yaml\",\n", 497 | " packages_to_install=[\"requests\"],\n", 498 | ")\n", 499 | "def trigger_airflow_dag(\n", 500 | " dag_name: str,\n", 501 | " composer_client_id: str,\n", 502 | " composer_webserver_id: str,\n", 503 | " response: Output[Artifact]\n", 504 | "):\n", 505 | " # [START composer_trigger]\n", 506 | "\n", 507 | " from google.auth.transport.requests import Request\n", 508 | " from google.oauth2 import id_token\n", 509 | " import requests\n", 510 | " import json\n", 511 | " import os\n", 512 | "\n", 513 | "\n", 514 | " IAM_SCOPE = 'https://www.googleapis.com/auth/iam'\n", 515 | " OAUTH_TOKEN_URI = 'https://www.googleapis.com/oauth2/v4/token'\n", 516 | " \n", 517 | " data = '{\"replace_microseconds\":\"false\"}'\n", 518 | " context = None\n", 519 | "\n", 520 | " \"\"\"Makes a POST request to the Composer DAG Trigger API\n", 521 | "\n", 522 | " When called via Google Cloud Functions (GCF),\n", 523 | " data and context are Background function parameters.\n", 524 | "\n", 525 | " For more info, refer to\n", 526 | " https://cloud.google.com/functions/docs/writing/background#functions_background_parameters-python\n", 527 | "\n", 528 | " To call this function from a Python script, omit the ``context`` argument\n", 529 | " and pass in a non-null value for the ``data`` argument.\n", 530 | " \"\"\"\n", 531 | "\n", 532 | " # Form webserver URL to make REST API calls\n", 533 | " webserver_url = f'{composer_webserver_id}/api/experimental/dags/{dag_name}/dag_runs'\n", 534 | " # print(webserver_url)\n", 535 | "\n", 536 | " # This code is copied from\n", 537 | " # https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/iap/make_iap_request.py\n", 538 | " # START COPIED IAP CODE\n", 539 | " def make_iap_request(url, client_id, method='GET', **kwargs):\n", 540 | " \"\"\"Makes a request to an application protected by Identity-Aware Proxy.\n", 541 | " Args:\n", 542 | " url: The Identity-Aware Proxy-protected URL to fetch.\n", 543 | " client_id: The client ID used by Identity-Aware Proxy.\n", 544 | " method: The request method to use\n", 545 | " ('GET', 'OPTIONS', 'HEAD', 'POST', 'PUT', 'PATCH', 'DELETE')\n", 546 | " **kwargs: Any of the parameters defined for the request function:\n", 547 | " https://github.com/requests/requests/blob/master/requests/api.py\n", 548 | " If no timeout is provided, it is set to 90 by default.\n", 549 | " Returns:\n", 550 | " The page body, or raises an exception if the page couldn't be retrieved.\n", 551 | " \"\"\"\n", 552 | " # Set the default timeout, if missing\n", 553 | " if 'timeout' not in kwargs:\n", 554 | " kwargs['timeout'] = 90\n", 555 | "\n", 556 | " # Obtain an OpenID Connect (OIDC) token from metadata server or using service\n", 557 | " # account.\n", 558 | " google_open_id_connect_token = id_token.fetch_id_token(Request(), client_id)\n", 559 | "\n", 560 | " # Fetch the Identity-Aware Proxy-protected URL, including an\n", 561 | " # Authorization header containing \"Bearer \" followed by a\n", 562 | " # Google-issued OpenID Connect token for the service account.\n", 563 | " resp = requests.request(\n", 564 | " method, url,\n", 565 | " headers={'Authorization': 'Bearer {}'.format(\n", 566 | " google_open_id_connect_token)}, **kwargs)\n", 
567 | " if resp.status_code == 403:\n", 568 | " raise Exception('Service account does not have permission to '\n", 569 | " 'access the IAP-protected application.')\n", 570 | " elif resp.status_code != 200:\n", 571 | " raise Exception(\n", 572 | " 'Bad response from application: {!r} / {!r} / {!r}'.format(\n", 573 | " resp.status_code, resp.headers, resp.text))\n", 574 | " else:\n", 575 | " print(f\"response = {resp.text}\")\n", 576 | " file_path = os.path.join(response.path)\n", 577 | " os.makedirs(file_path)\n", 578 | " with open(os.path.join(file_path, \"airflow_response.json\"), 'w') as f:\n", 579 | " json.dump(resp.text, f)\n", 580 | "\n", 581 | " # END COPIED IAP CODE\n", 582 | "\n", 583 | " \n", 584 | " # Make a POST request to IAP which then Triggers the DAG\n", 585 | " make_iap_request(\n", 586 | " webserver_url, composer_client_id, method='POST', json={\"conf\": data, \"replace_microseconds\": 'false'})\n", 587 | " \n", 588 | " # [END composer_trigger]" 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "metadata": {}, 594 | "source": [ 595 | "Understanding the component structure\n", 596 | "- The **`@component`** decorator compiles this function to a component when the pipeline is run. You'll use this anytime you write a custom component.\n", 597 | "- The **`base_image parameter`** specifies the container image this component will use.\n", 598 | "- The **`output_component_file`** parameter is optional, and specifies the yaml file to write the compiled component to.\n", 599 | "- The **`packages_to_install`** parameter installs required python packages in the container to run the component" 600 | ] 601 | }, 602 | { 603 | "cell_type": "markdown", 604 | "metadata": {}, 605 | "source": [ 606 | "### Test Triggering Airflow DAG from Notebook" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": null, 612 | "metadata": {}, 613 | "outputs": [], 614 | "source": [ 615 | "# before running comment out @component annotation in the cell above\n", 616 | "trigger_airflow_dag(\n", 617 | " dag_name=COMPOSER_DAG_NAME,\n", 618 | " composer_client_id=COMPOSER_CLIENT_ID,\n", 619 | " composer_webserver_id=COMPOSER_WEB_URI,\n", 620 | " response=None\n", 621 | ")" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": null, 627 | "metadata": {}, 628 | "outputs": [], 629 | "source": [ 630 | "COMPOSER_WEB_URI" 631 | ] 632 | }, 633 | { 634 | "cell_type": "markdown", 635 | "metadata": {}, 636 | "source": [ 637 | "### Adding the components to a pipeline" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": {}, 644 | "outputs": [], 645 | "source": [ 646 | "PIPELINE_NAME = \"pipeline-trigger-airflow-dag\"" 647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": null, 652 | "metadata": {}, 653 | "outputs": [], 654 | "source": [ 655 | "@dsl.pipeline(\n", 656 | " name=PIPELINE_NAME,\n", 657 | " description=\"Trigger Airflow DAG from Vertex Pipelines\",\n", 658 | " pipeline_root=PIPELINE_ROOT,\n", 659 | ")\n", 660 | "\n", 661 | "# You can change the `text` and `emoji_str` parameters here to update the pipeline output\n", 662 | "def pipeline():\n", 663 | " data_processing_task_dag_name = COMPOSER_DAG_NAME\n", 664 | " data_processing_task = trigger_airflow_dag(\n", 665 | " dag_name=data_processing_task_dag_name,\n", 666 | " composer_client_id=COMPOSER_CLIENT_ID,\n", 667 | " composer_webserver_id=COMPOSER_WEB_URI\n", 668 | " )" 669 | ] 670 | }, 671 | { 672 | "cell_type": "markdown", 673 | 
"metadata": {}, 674 | "source": [ 675 | "### Compile and run the pipeline" 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | "source": [ 682 | "With your pipeline defined, you're ready to compile it. The following will generate a JSON file that you'll use to run the pipeline:" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": null, 688 | "metadata": {}, 689 | "outputs": [], 690 | "source": [ 691 | "compiler.Compiler().compile(\n", 692 | " pipeline_func=pipeline, package_path=f\"{PIPELINE_NAME}.json\"\n", 693 | ")" 694 | ] 695 | }, 696 | { 697 | "cell_type": "markdown", 698 | "metadata": {}, 699 | "source": [ 700 | "Next, instantiate an API client:" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": null, 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [ 709 | "api_client = AIPlatformClient(\n", 710 | " project_id=PROJECT_ID,\n", 711 | " region=REGION,\n", 712 | ")" 713 | ] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "metadata": {}, 718 | "source": [ 719 | "Finally, run the pipeline:" 720 | ] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "execution_count": null, 725 | "metadata": {}, 726 | "outputs": [], 727 | "source": [ 728 | "response = api_client.create_run_from_job_spec(\n", 729 | " job_spec_path=f\"{PIPELINE_NAME}.json\",\n", 730 | " # pipeline_root=PIPELINE_ROOT # this argument is necessary if you did not specify PIPELINE_ROOT as part of the pipeline definition.\n", 731 | ")" 732 | ] 733 | }, 734 | { 735 | "cell_type": "markdown", 736 | "metadata": {}, 737 | "source": [ 738 | "### Monitor Vertex Pipeline status\n", 739 | "\n", 740 | "From Cloud Console, you can monitor the pipeline run status and view the output artifact\n", 741 | "\n", 742 | "![](images/pipeline_run.png)" 743 | ] 744 | }, 745 | { 746 | "cell_type": "markdown", 747 | "metadata": {}, 748 | "source": [ 749 | "You can also API client to get pipeline status and artifact information." 
750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": null, 755 | "metadata": {}, 756 | "outputs": [], 757 | "source": [ 758 | "def get_job_id(job_name):\n", 759 | " \"\"\"Get the job ID from a pipeline job resource name.\"\"\"\n", 760 | " p = re.compile('projects/(?P<project>.*)/locations/(?P<location>.*)/pipelineJobs/(?P<job_id>.*)')\n", 761 | " result = p.search(job_name)\n", 762 | " return result.group('job_id') if result else None" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": null, 768 | "metadata": {}, 769 | "outputs": [], 770 | "source": [ 771 | "job_status = api_client.get_job(get_job_id(response['name']))\n", 772 | "print(f\"JOB STATUS: {job_status['state']}\")" 773 | ] 774 | }, 775 | { 776 | "cell_type": "markdown", 777 | "metadata": {}, 778 | "source": [ 779 | "Get the Airflow DAG run information from the output artifact" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": null, 785 | "metadata": {}, 786 | "outputs": [], 787 | "source": [ 788 | "airflow_response_uri = [task['outputs']['response']['artifacts'][0]['uri'] for task in job_status['jobDetail']['taskDetails'] if task['taskName']=='trigger-airflow-dag'][0]\n", 789 | "airflow_response_uri" 790 | ] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "execution_count": null, 795 | "metadata": {}, 796 | "outputs": [], 797 | "source": [ 798 | "!gsutil ls $airflow_response_uri/" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": null, 804 | "metadata": {}, 805 | "outputs": [], 806 | "source": [ 807 | "!gsutil cat $airflow_response_uri/airflow_response.json" 808 | ] 809 | }, 810 | { 811 | "cell_type": "markdown", 812 | "metadata": {}, 813 | "source": [ 814 | "### Monitor Airflow DAG run\n", 815 | "\n", 816 | "Go to the Airflow webserver and monitor the status of the data processing DAG. The 
Airflow webserver URL is" 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": null, 822 | "metadata": {}, 823 | "outputs": [], 824 | "source": [ 825 | "COMPOSER_WEB_URI" 826 | ] 827 | }, 828 | { 829 | "cell_type": "markdown", 830 | "metadata": {}, 831 | "source": [ 832 | "![](images/airflow_dag_run.png)" 833 | ] 834 | }, 835 | { 836 | "cell_type": "markdown", 837 | "metadata": {}, 838 | "source": [ 839 | "## Clean Up" 840 | ] 841 | }, 842 | { 843 | "cell_type": "markdown", 844 | "metadata": {}, 845 | "source": [ 846 | "- Delete Cloud Storage bucket\n", 847 | "- Delete Cloud Composer environment" 848 | ] 849 | } 850 | ], 851 | "metadata": { 852 | "environment": { 853 | "name": "tf2-gpu.2-4.m65", 854 | "type": "gcloud", 855 | "uri": "gcr.io/deeplearning-platform-release/tf2-gpu.2-4:m65" 856 | }, 857 | "kernelspec": { 858 | "display_name": "vertex", 859 | "language": "python", 860 | "name": "vertex" 861 | }, 862 | "language_info": { 863 | "codemirror_mode": { 864 | "name": "ipython", 865 | "version": 3 866 | }, 867 | "file_extension": ".py", 868 | "mimetype": "text/x-python", 869 | "name": "python", 870 | "nbconvert_exporter": "python", 871 | "pygments_lexer": "ipython3", 872 | "version": "3.7.10" 873 | } 874 | }, 875 | "nbformat": 4, 876 | "nbformat_minor": 4 877 | } 878 | -------------------------------------------------------------------------------- /05-vertex-event-based-model-deploy/images/event_based_model_deployment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/05-vertex-event-based-model-deploy/images/event_based_model_deployment.png -------------------------------------------------------------------------------- /06-vertex-train-deploy-r-model/images/serving-with-custom-containers-on-vertex-predictions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/06-vertex-train-deploy-r-model/images/serving-with-custom-containers-on-vertex-predictions.png -------------------------------------------------------------------------------- /07-vertex-train-deploy-lightgbm/images/serving-with-custom-containers-on-vertex-predictions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/07-vertex-train-deploy-lightgbm/images/serving-with-custom-containers-on-vertex-predictions.png -------------------------------------------------------------------------------- /07-vertex-train-deploy-lightgbm/images/training-with-custom-containers-on-vertex-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/07-vertex-train-deploy-lightgbm/images/training-with-custom-containers-on-vertex-training.png -------------------------------------------------------------------------------- /08-pytorch-distributed/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Distributed Training 2 | 3 | Work in progress 4 | -------------------------------------------------------------------------------- /09-distributed-xgboost-dask/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/rapidsai/rapidsai:22.04-cuda11.2-base-ubuntu18.04 2 | 3 | RUN . /opt/conda/etc/profile.d/conda.sh \ 4 | && conda activate rapids \ 5 | && pip install -U gcsfs 6 | 7 | RUN mkdir /train 8 | WORKDIR /train 9 | 10 | ADD train.py /train 11 | ADD train.sh /train 12 | 13 | ENTRYPOINT ["bash", "train.sh"] -------------------------------------------------------------------------------- /09-distributed-xgboost-dask/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | # set variables 3 | PROJECT_ID=$(gcloud config list --format 'value(core.project)') 4 | REGION=us-central1 5 | TRAIN_IMAGE_URI=${REGION}-docker.pkg.dev/${PROJECT_ID}/vertex-rapidsai/distributed-xgboost-dask 6 | 7 | STAGING_BUCKET_NAME=cloud-ai-platform-2f444b6a-a742-444b-b91a-c7519f51bd77 8 | TRAIN_FILES=gs://rthallam-demo-project/rapids-on-gcp/data/latest/a/higgs_00.csv 9 | 10 | # create artifact registry repository 11 | gcloud artifacts repositories create vertex-rapidsai \ 12 | --repository-format=docker \ 13 | --location=${REGION} \ 14 | --description="Vertex AI RAPIDS" 15 | 16 | # build training image and push to Artifact Registry 17 | gcloud builds submit --tag $TRAIN_IMAGE_URI --timeout=3600 . 18 | 19 | # create training job config for multi-node multi-gpu dask job 20 | date_now=$(date "+%Y%m%d-%H%M%S") 21 | 22 | cat << EOF > ./dask-xgb-multi-node.yml 23 | 24 | baseOutputDirectory: 25 | outputUriPrefix: gs://${STAGING_BUCKET_NAME}/rapidsai/distributed-xgboost-dask/${date_now}/ 26 | workerPoolSpecs: 27 | - 28 | machineSpec: 29 | machineType: n1-highmem-4 30 | acceleratorType: NVIDIA_TESLA_T4 31 | acceleratorCount: 1 32 | replicaCount: 1 33 | containerSpec: 34 | imageUri: ${TRAIN_IMAGE_URI} 35 | args: 36 | - --train-files=${TRAIN_FILES} 37 | - --rmm-pool-size=4G 38 | - --num-workers=4 39 | - --nthreads=4 40 | - 41 | machineSpec: 42 | machineType: n1-highmem-4 43 | acceleratorType: NVIDIA_TESLA_T4 44 | acceleratorCount: 2 45 | replicaCount: 4 46 | containerSpec: 47 | imageUri: ${TRAIN_IMAGE_URI} 48 | args: 49 | - --train-files=${TRAIN_FILES} 50 | - --rmm-pool-size=4G 51 | - --num-workers=4 52 | - --nthreads=4 53 | EOF 54 | 55 | # submit vertex ai custom training job 56 | gcloud beta ai custom-jobs create \ 57 | --display-name=rapids-dstrbtd-xgb-dask-multi-node \ 58 | --region=$REGION \ 59 | --project=$PROJECT_ID \ 60 | --config=dask-xgb-multi-node.yml 61 | ``` -------------------------------------------------------------------------------- /09-distributed-xgboost-dask/train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import argparse 5 | import subprocess 6 | import logging 7 | from pathlib import Path 8 | import json 9 | import asyncio 10 | import socket 11 | 12 | import xgboost as xgb 13 | from xgboost.dask import DaskDMatrix, DaskDeviceQuantileDMatrix 14 | 15 | import dask 16 | import dask.dataframe as dd 17 | import dask_cudf as cudf 18 | from dask.distributed import Client, wait 19 | 20 | dask.config.set({"distributed.comm.timeouts.connect": "60s"}) 21 | 22 | def get_args(): 23 | """Define the task arguments with the default values. 
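    Example invocation (illustrative values only):
        python train.py --train-files=gs://my-bucket/higgs/*.csv \
            --num-workers=8 --rmm-pool-size=4G --nthreads=4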
24 | Returns: 25 | experiment parameters 26 | """ 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument( 29 | '--model-dir', 30 | default=os.getenv('AIP_MODEL_DIR'), 31 | type=str, 32 | help='Cloud Storage URI of a directory for saving model artifacts') 33 | parser.add_argument( 34 | '--train-files', 35 | type=str, 36 | help='Training files local or GCS', 37 | required=True) 38 | parser.add_argument( 39 | '--scheduler-ip-file', 40 | type=str, 41 | help='Scratch temp file to store scheduler ip in GCS', 42 | required=False) 43 | parser.add_argument( 44 | '--num-workers', 45 | type=int, 46 | help='num of workers for rabit') 47 | parser.add_argument( 48 | '--rmm-pool-size', 49 | type=str, 50 | help='RMM pool size', 51 | default='8G') 52 | parser.add_argument( 53 | '--nthreads', 54 | type=str, 55 | help='nthreads for master and worker', 56 | default='4') 57 | parser.add_argument( 58 | '--parquet', 59 | action='store_false', 60 | help='parquet files are used') 61 | 62 | return parser.parse_args() 63 | 64 | async def start_client( 65 | scheduler_addr, 66 | train_dir, 67 | num_workers, 68 | gpu_mode=True, 69 | do_wait=False, 70 | parquet=False): 71 | """ 72 | """ 73 | async with Client(scheduler_addr, asynchronous=True) as client: 74 | # wait until all workers are up and running 75 | dask.config.set({'distributed.scheduler.work-stealing': False}) 76 | dask.config.set({'distributed.scheduler.bandwidth': 1}) 77 | logging.info(f'distributed.scheduler.work-stealing={dask.config.get("distributed.scheduler.work-stealing")}') 78 | logging.info(f'distributed.scheduler.bandwidth={dask.config.get("distributed.scheduler.bandwidth")}') 79 | await client.wait_for_workers(num_workers) 80 | 81 | # read dataframe 82 | colnames = ['label'] + ['feature-%02d' % i for i in range(1, 29)] 83 | 84 | # read as csv or parquet 85 | if parquet is True: 86 | if gpu_mode: 87 | df = cudf.read_parquet(train_dir, columns=colnames) 88 | else: 89 | df = dd.read_parquet(train_dir, columns=colnames) 90 | else: 91 | if gpu_mode: 92 | df = cudf.read_csv(train_dir, header=None, names=colnames, chunksize=None) 93 | else: 94 | df = dd.read_csv(train_dir, header=None, names=colnames, chunksize=None) 95 | 96 | # get features and target label 97 | X = df[df.columns.difference(['label'])] 98 | y = df['label'] 99 | 100 | # wait for fully computing results 101 | if do_wait is True: 102 | df = df.persist() 103 | X = X.persist() 104 | wait(df) 105 | wait(X) 106 | logging.info("[debug:leader]: ------ Long waited but the data is ready now") 107 | 108 | # compute DMatrix for training xgboost 109 | # for GPU compute DaskDeviceQuantileDMatrix 110 | start_time = time.time() 111 | if gpu_mode: 112 | dtrain = await DaskDeviceQuantileDMatrix(client, X, y) 113 | else: 114 | dtrain = DaskDMatrix(client, X, y) 115 | logging.info("[debug:leader]: ------ QuantileDMatrix is formed in {} seconds ---".format((time.time() - start_time))) 116 | 117 | # remove data from distributed RAM by removing the collection from local process 118 | del df 119 | del X 120 | del y 121 | 122 | # start training 123 | logging.info("[debug:leader]: ------ training started") 124 | start_time = time.time() 125 | xgb_params = { 126 | 'verbosity': 2, 127 | 'learning_rate': 0.1, 128 | 'max_depth': 8, 129 | 'objective': 'reg:squarederror', 130 | 'subsample': 0.6, 131 | 'gamma': 1, 132 | 'verbose_eval': True, 133 | 'tree_method': 'gpu_hist' if gpu_mode else 'hist', 134 | 'nthread': 1 135 | } 136 | output = await xgb.dask.train( 137 | client, 138 | xgb_params, 139 | dtrain, 140 | 
num_boost_round=100, 141 | evals=[(dtrain, 'train')]) 142 | logging.info("[debug:leader]: ------ training finished") 143 | 144 | # evaluation history 145 | history = output['history'] 146 | logging.info('[debug:leader]: ------ Training evaluation history:', history) 147 | 148 | # save model 149 | model_file = f"{model_dir}/model/xgboost.model" 150 | output['booster'].save_model(model_file) 151 | logging.info(f"[debug:leader]: ------model saved {model_file}") 152 | 153 | logging.info("[debug:leader]: ------ %s seconds ---" % (time.time() - start_time)) 154 | 155 | # wait for client to shutdown 156 | await client.shutdown() 157 | 158 | def launch_dask(cmd, is_shell): 159 | """ launch dask scheduler 160 | """ 161 | return subprocess.Popen(cmd, stdout=None, stderr=None, shell=is_shell) 162 | 163 | def launch_worker(cmd): 164 | """ launch dask workers 165 | """ 166 | return subprocess.check_call(cmd, stdout=sys.stdout, stderr=sys.stderr) 167 | 168 | def get_scheduler_ip(scheduler_ip_file): 169 | with open(scheduler_ip_file, 'r') as f: 170 | scheduler_ip = f.read().rstrip("\n") 171 | return scheduler_ip 172 | 173 | if __name__=='__main__': 174 | logging.basicConfig(format='%(message)s') 175 | logging.getLogger().setLevel(logging.INFO) 176 | 177 | # get program args 178 | args = get_args() 179 | 180 | # set and create local directories if does not exists 181 | local_tmp_dir = os.path.join(os.getcwd(), "tmp") 182 | Path(local_tmp_dir).mkdir(parents=True, exist_ok=True) 183 | local_model_dir = os.path.join(local_tmp_dir, 'model') 184 | Path(local_model_dir).mkdir(parents=True, exist_ok=True) 185 | 186 | # define variables 187 | gs_prefix = 'gs://' 188 | gcsfuse_prefix = '/gcs/' 189 | 190 | logging.info(f'[INFO]: args.model_dir = {args.model_dir}') 191 | 192 | model_dir = args.model_dir or local_model_dir 193 | if model_dir and model_dir.startswith(gs_prefix): 194 | model_dir = model_dir.replace(gs_prefix, gcsfuse_prefix) 195 | Path(model_dir).mkdir(parents=True, exist_ok=True) 196 | 197 | tmp_dir = model_dir or local_tmp_dir 198 | if not tmp_dir.startswith(gs_prefix): 199 | Path(tmp_dir).mkdir(parents=True, exist_ok=True) 200 | 201 | scheduler_ip_file = f"{tmp_dir}dask_scheduler.txt" if tmp_dir[-1] == "/" else f"{tmp_dir}/dask_scheduler.txt" 202 | 203 | logging.info(f'[INFO]: model_dir = {model_dir}') 204 | logging.info(f'[INFO]: tmp_dir = {tmp_dir}') 205 | logging.info(f'[INFO]: scheduler_ip_file = {scheduler_ip_file}') 206 | 207 | # read worker pool config and launch dask scheduler and workers 208 | TF_CONFIG = os.environ.get('TF_CONFIG') 209 | 210 | if TF_CONFIG: 211 | TF_CONFIG = json.loads(TF_CONFIG) 212 | logging.info(TF_CONFIG) 213 | task_name = TF_CONFIG.get('task', {}).get('type') 214 | else: 215 | logging.info(f'Running locally') 216 | task_name = 'chief' 217 | 218 | scheduler_port = '8786' 219 | 220 | if task_name == 'chief': 221 | host_name = socket.gethostname() 222 | host_ip = socket.gethostbyname(host_name) 223 | 224 | with open(scheduler_ip_file, 'w') as f: 225 | f.write(host_ip) 226 | 227 | scheduler_addr = f'{host_ip}:{scheduler_port}' 228 | logging.info('[INFO]: The scheduler IP is %s', scheduler_addr) 229 | proc_scheduler = launch_dask(f'dask-scheduler --protocol tcp > {tmp_dir}/scheduler.log 2>&1 &', True) 230 | logging.info('[debug:leader]: ------ start scheduler') 231 | 232 | proc_worker = launch_dask([ 233 | 'dask-cuda-worker', 234 | '--rmm-pool-size', args.rmm_pool_size, 235 | '--nthreads', args.nthreads, 236 | scheduler_addr], 237 | False) 238 | 
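        # The chief hosts the scheduler plus one local dask-cuda worker; start_client()
        # below blocks in wait_for_workers() until --num-workers workers have registered
        # with the scheduler before building the DMatrix and training.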
logging.info('[debug:leader]: ------ start worker') 239 | asyncio.get_event_loop().run_until_complete( 240 | start_client( 241 | scheduler_addr, 242 | args.train_files, 243 | args.num_workers, 244 | parquet=False)) 245 | 246 | # launch dask worker, redirect output to sys stdout/err 247 | elif task_name == 'worker': 248 | while not os.path.isfile(scheduler_ip_file): 249 | time.sleep(1) 250 | 251 | # with open(scheduler_ip_file, 'r') as f: 252 | # scheduler_ip = f.read().rstrip("\n") 253 | scheduler_ip = get_scheduler_ip(scheduler_ip_file) 254 | while not scheduler_ip: 255 | time.sleep(1) 256 | scheduler_ip = get_scheduler_ip(scheduler_ip_file) 257 | 258 | scheduler_ip = get_scheduler_ip(scheduler_ip_file) 259 | logging.info(f'[debug:scheduler_ip]: ------ {scheduler_ip}') 260 | scheduler_addr = f'{scheduler_ip}:{scheduler_port}' 261 | 262 | proc_worker = launch_worker([ 263 | 'dask-cuda-worker', 264 | '--rmm-pool-size', args.rmm_pool_size, 265 | '--nthreads' , args.nthreads, 266 | scheduler_addr]) -------------------------------------------------------------------------------- /09-distributed-xgboost-dask/train.sh: -------------------------------------------------------------------------------- 1 | source /conda/etc/profile.d/conda.sh 2 | conda activate rapids 3 | 4 | echo "Running: train.py $@" 5 | python train.py $@ 6 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/combine/1/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import json 4 | 5 | import triton_python_backend_utils as pb_utils 6 | import transformers 7 | 8 | class TritonPythonModel: 9 | 10 | def initialize(self, args): 11 | self.log = open("/tmp/combine.loq", "w") 12 | self.log.write("DEBUG: ------------------------ hello world init combine/model.py------------------------------------\n") 13 | 14 | self.model_config = model_config = json.loads(args['model_config']) 15 | output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") 16 | self.output_dtype = pb_utils.triton_string_to_numpy(output_config['data_type']) 17 | 18 | def execute(self, requests): 19 | self.log.write("DEBUG: ------------------------hello world execute combine/model.py\n") 20 | 21 | output_dtype = self.output_dtype 22 | responses = [] 23 | out_tensor = [] 24 | self.log.write("DEBUG: ------------------------requests: combine/model.py " + str(requests) + "\n") 25 | for request in requests: 26 | xgb_class = pb_utils.get_input_tensor_by_name(request, "xgb_class") 27 | tf_class = pb_utils.get_input_tensor_by_name(request, "tf_class") 28 | sci_1_class = pb_utils.get_input_tensor_by_name(request, "sci_1_class") 29 | sci_2_class = pb_utils.get_input_tensor_by_name(request, "sci_2_class") 30 | 31 | self.log.write("DEBUG: ------------------------ xgb_class tf_class sci_1_class sci_2_class \n" 32 | + str(xgb_class.as_numpy()) + '\n' 33 | + str(tf_class.as_numpy()) + '\n' 34 | + str(sci_1_class.as_numpy()) + '\n' 35 | + str(sci_2_class.as_numpy()) + '\n' ) 36 | 37 | out_tensor.append(pb_utils.Tensor("OUTPUT0", 38 | (xgb_class.as_numpy() 39 | + tf_class.as_numpy() 40 | + sci_1_class.as_numpy() 41 | + sci_2_class.as_numpy()) / 4.0)) 42 | 43 | inference_response = pb_utils.InferenceResponse(output_tensors = out_tensor) 44 | responses.append(inference_response) 45 | 46 | self.log.flush() 47 | return responses 48 | 49 | def finalize(self): 50 | self.log.write("DEBUG: ------------------------ hello world 
finalize combine/model.py------------------------------------\n") 51 | self.log.write('Cleaning up - custom model combine') 52 | self.log.close() 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/combine/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "combine" 2 | backend: "python" 3 | max_batch_size: 0 4 | input [ 5 | { 6 | name: "xgb_class" 7 | data_type: TYPE_FP32 8 | dims: [ 1 ] 9 | }, 10 | { 11 | name: "tf_class" 12 | data_type: TYPE_FP32 13 | dims: [ -1, 1 ] 14 | }, 15 | { 16 | name: "sci_1_class" 17 | data_type: TYPE_FP32 18 | dims: [ 1 ] 19 | }, 20 | { 21 | name: "sci_2_class" 22 | data_type: TYPE_FP32 23 | dims: [ 1 ] 24 | } 25 | ] 26 | output [ 27 | { 28 | name: "OUTPUT0" 29 | data_type: TYPE_FP32 30 | dims: [ -1, 1 ] 31 | } 32 | ] 33 | parameters [ 34 | { 35 | key: "output_class" 36 | value: { string_value: "true" } 37 | }, 38 | { 39 | key: "threshold" 40 | value: { string_value: "0.5" } 41 | } 42 | ] 43 | 44 | instance_group[ { kind: KIND_CPU } ] 45 | 46 | 47 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/ensemble/1/empty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/10-serving-ensemble-triton/models/ensemble/1/empty -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/ensemble/config.pbtxt: -------------------------------------------------------------------------------- 1 | platform: "ensemble" 2 | max_batch_size: 0 3 | input [ 4 | { 5 | name: "INPUT0" 6 | data_type: TYPE_FP32 7 | dims: [ -1, 4 ] 8 | } 9 | ] 10 | output [ 11 | { 12 | name: "OUTPUT0" 13 | data_type: TYPE_FP32 14 | dims: [ -1, 1 ] 15 | } 16 | ] 17 | ensemble_scheduling { 18 | step [ 19 | { 20 | model_name: "mux" 21 | model_version: -1 22 | input_map { 23 | key: "mux_in" 24 | value: "INPUT0" 25 | } 26 | output_map { 27 | key: "mux_xgb_out" 28 | value: "mux_xgb_out" 29 | } 30 | output_map { 31 | key: "mux_tf_out" 32 | value: "mux_tf_out" 33 | } 34 | output_map { 35 | key: "mux_sci_1_out" 36 | value: "mux_sci_1_out" 37 | } 38 | output_map { 39 | key: "mux_sci_2_out" 40 | value: "mux_sci_2_out" 41 | } 42 | }, 43 | { 44 | model_name: "xgb" 45 | model_version: -1 46 | input_map { 47 | key: "input__0" 48 | value: "mux_xgb_out" 49 | } 50 | output_map { 51 | key: "output__0" 52 | value: "xgb_class" 53 | } 54 | }, 55 | { 56 | model_name: "tf" 57 | model_version: -1 58 | input_map { 59 | key: "dense_input" 60 | value: "mux_tf_out" 61 | } 62 | output_map { 63 | key: "round" 64 | value: "tf_class" 65 | } 66 | }, 67 | { 68 | model_name: "sci_1" 69 | model_version: -1 70 | input_map { 71 | key: "input__0" 72 | value: "mux_sci_1_out" 73 | } 74 | output_map { 75 | key: "output__0" 76 | value: "sci_1_class" 77 | } 78 | }, 79 | { 80 | model_name: "sci_2" 81 | model_version: -1 82 | input_map { 83 | key: "input__0" 84 | value: "mux_sci_2_out" 85 | } 86 | output_map { 87 | key: "output__0" 88 | value: "sci_2_class" 89 | } 90 | }, 91 | { 92 | model_name: "combine" 93 | model_version: -1 94 | input_map { 95 | key: "xgb_class" 96 | value: "xgb_class" 97 | } 98 | input_map { 99 | key: "tf_class" 100 | value: "tf_class" 101 | } 102 | input_map { 103 | key: "sci_1_class" 104 | value: "sci_1_class" 105 | } 106 | input_map { 
107 | key: "sci_2_class" 108 | value: "sci_2_class" 109 | } 110 | output_map { 111 | key: "OUTPUT0" 112 | value: "OUTPUT0" 113 | } 114 | } 115 | ] 116 | } 117 | parameters: [ 118 | { 119 | key: "predict_proba" 120 | value: { string_value: "false" } 121 | }, 122 | { 123 | key: "output_class" 124 | value: { string_value: "false" } 125 | }, 126 | { 127 | key: "threshold" 128 | value: { string_value: "0.5" } 129 | }, 130 | { 131 | key: "algo" 132 | value: { string_value: "ALGO_AUTO" } 133 | }, 134 | { 135 | key: "storage_type" 136 | value: { string_value: "AUTO" } 137 | }, 138 | { 139 | key: "blocks_per_sm" 140 | value: { string_value: "0" } 141 | } 142 | ] 143 | 144 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/mux/1/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import json 4 | 5 | import triton_python_backend_utils as pb_utils 6 | import transformers 7 | 8 | class TritonPythonModel: 9 | 10 | def initialize(self, args): 11 | self.log = open("/tmp/mux.loq", "w") 12 | 13 | self.log.write("DEBUG: ------------------------ hello world init mux/model.py ------------------------------------\n") 14 | self.out_dtypes = {} 15 | self.model_config = model_config = json.loads(args['model_config']) 16 | 17 | mux_xgb_out_config = pb_utils.get_output_config_by_name(model_config, "mux_xgb_out") 18 | self.out_dtypes["mux_xgb_out"] = pb_utils.triton_string_to_numpy(mux_xgb_out_config["data_type"]) 19 | 20 | mux_tf_out_config = pb_utils.get_output_config_by_name(model_config, "mux_tf_out") 21 | self.out_dtypes["mux_tf_out"] = pb_utils.triton_string_to_numpy(mux_tf_out_config["data_type"]) 22 | 23 | mux_sci_1_out_config = pb_utils.get_output_config_by_name(model_config, "mux_sci_1_out") 24 | self.out_dtypes["mux_sci_1_out"] = pb_utils.triton_string_to_numpy(mux_sci_1_out_config["data_type"]) 25 | 26 | mux_sci_2_out_config = pb_utils.get_output_config_by_name(model_config, "mux_sci_2_out") 27 | self.out_dtypes["mux_sci_2_out"] = pb_utils.triton_string_to_numpy(mux_sci_1_out_config["data_type"]) 28 | 29 | 30 | def execute(self, requests): 31 | 32 | self.log.write("DEBUG: ------------------------requests: mux/model.py \n" + str(requests) + '\n') 33 | 34 | responses = [] 35 | for request in requests: 36 | 37 | mux_in = pb_utils.get_input_tensor_by_name(request, "mux_in") 38 | out_tensors = [] 39 | for model in ["mux_xgb_out", "mux_tf_out", "mux_sci_1_out", "mux_sci_2_out"]: 40 | self.log.write("DEBUG: ------------------------ model dtype out_tensor tensor.astype" + model + '\n' 41 | + str(self.out_dtypes[model]) + " " 42 | + str(mux_in.as_numpy()) + " " 43 | + str(mux_in.as_numpy().astype(self.out_dtypes[model])) + '\n') 44 | 45 | out_tensors.append(pb_utils.Tensor(model, mux_in.as_numpy().astype(self.out_dtypes[model]))) 46 | 47 | inference_response = pb_utils.InferenceResponse(output_tensors = out_tensors) 48 | responses.append(inference_response) 49 | 50 | self.log.flush() 51 | return responses 52 | 53 | def finalize(self): 54 | self.log.write("DEBUG: ------------------------ hello world finalize mux/model.py ------------------------------------ \n") 55 | 56 | self.log.write('Cleaning up - custom model combine \n') 57 | self.log.close() 58 | 59 | 60 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/mux/config.pbtxt: 
-------------------------------------------------------------------------------- 1 | name: "mux" 2 | backend: "python" 3 | max_batch_size: 0 4 | input [ 5 | { 6 | name: "mux_in" 7 | data_type: TYPE_FP32 8 | dims: [ -1, 4 ] 9 | } 10 | ] 11 | output [ 12 | { 13 | name: "mux_xgb_out" 14 | data_type: TYPE_FP32 15 | dims: [ -1, 4 ] 16 | }, 17 | { 18 | name: "mux_tf_out" 19 | data_type: TYPE_FP32 20 | dims: [ -1, 4 ] 21 | }, 22 | { 23 | name: "mux_sci_1_out" 24 | data_type: TYPE_FP32 25 | dims: [ -1, 4 ] 26 | }, 27 | { 28 | name: "mux_sci_2_out" 29 | data_type: TYPE_FP32 30 | dims: [ -1, 4 ] 31 | } 32 | ] 33 | 34 | parameters [ 35 | { 36 | key: "output_class" 37 | value: { string_value: "false" } 38 | }, 39 | { 40 | key: "threshold" 41 | value: { string_value: "0.5" } 42 | } 43 | ] 44 | 45 | instance_group[ { kind: KIND_CPU } ] 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/sci_1/1/checkpoint.tl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/10-serving-ensemble-triton/models/sci_1/1/checkpoint.tl -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/sci_1/1/sci_1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/10-serving-ensemble-triton/models/sci_1/1/sci_1.pkl -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/sci_1/config.pbtxt: -------------------------------------------------------------------------------- 1 | backend: "fil" 2 | max_batch_size: 0 3 | input [ 4 | { 5 | name: "input__0" 6 | data_type: TYPE_FP32 7 | dims: [ -1, 4 ] 8 | } 9 | ] 10 | output [ 11 | { 12 | name: "output__0" 13 | data_type: TYPE_FP32 14 | dims: [ 1 ] 15 | } 16 | ] 17 | instance_group [{ kind: KIND_GPU }] 18 | parameters [ 19 | { 20 | key: "model_type" 21 | value: { string_value: "treelite_checkpoint" } 22 | }, 23 | { 24 | key: "predict_proba" 25 | value: { string_value: "false" } 26 | }, 27 | { 28 | key: "output_class" 29 | value: { string_value: "true" } 30 | }, 31 | { 32 | key: "threshold" 33 | value: { string_value: "0.5" } 34 | }, 35 | { 36 | key: "algo" 37 | value: { string_value: "ALGO_AUTO" } 38 | }, 39 | { 40 | key: "storage_type" 41 | value: { string_value: "AUTO" } 42 | }, 43 | { 44 | key: "blocks_per_sm" 45 | value: { string_value: "0" } 46 | } 47 | ] 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/sci_2/1/checkpoint.tl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/10-serving-ensemble-triton/models/sci_2/1/checkpoint.tl -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/sci_2/1/sci_2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/10-serving-ensemble-triton/models/sci_2/1/sci_2.pkl -------------------------------------------------------------------------------- 
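Note on the FIL checkpoints: sci_1/config.pbtxt above (and sci_2/config.pbtxt below, which is identical apart from the model name) declares model_type "treelite_checkpoint", so Triton's FIL backend loads the checkpoint.tl file stored next to each pickled scikit-learn model. The repository produces those checkpoints with the FIL backend's convert_sklearn script, called from src/sci_1/sci_1.py and src/sci_2/sci_2.py further down. The sketch below is only a rough, hand-written equivalent, not part of the repository; it assumes a treelite release compatible with the Triton FIL backend is installed, and it reuses the file paths from this model repository.

# Rough sketch (assumption, not part of the repo): convert the pickled
# scikit-learn model into the treelite checkpoint that the FIL backend reads.
import pickle

import treelite  # assumed: a treelite version compatible with the Triton FIL backend

with open("models/sci_1/1/sci_1.pkl", "rb") as f:
    skl_model = pickle.load(f)

tl_model = treelite.sklearn.import_model(skl_model)  # import the RandomForest ensemble
tl_model.serialize("models/sci_1/1/checkpoint.tl")   # binary checkpoint loaded as "treelite_checkpoint"

In practice the convert_sklearn helper shipped with the FIL backend (the path the src/ scripts take) is the safer route, since it uses a treelite version known to match the Triton build.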
/10-serving-ensemble-triton/models/sci_2/config.pbtxt: -------------------------------------------------------------------------------- 1 | backend: "fil" 2 | max_batch_size: 0 3 | input [ 4 | { 5 | name: "input__0" 6 | data_type: TYPE_FP32 7 | dims: [ -1, 4 ] 8 | } 9 | ] 10 | output [ 11 | { 12 | name: "output__0" 13 | data_type: TYPE_FP32 14 | dims: [ 1 ] 15 | } 16 | ] 17 | instance_group [{ kind: KIND_GPU }] 18 | parameters [ 19 | { 20 | key: "model_type" 21 | value: { string_value: "treelite_checkpoint" } 22 | }, 23 | { 24 | key: "predict_proba" 25 | value: { string_value: "false" } 26 | }, 27 | { 28 | key: "output_class" 29 | value: { string_value: "true" } 30 | }, 31 | { 32 | key: "threshold" 33 | value: { string_value: "0.5" } 34 | }, 35 | { 36 | key: "algo" 37 | value: { string_value: "ALGO_AUTO" } 38 | }, 39 | { 40 | key: "storage_type" 41 | value: { string_value: "AUTO" } 42 | }, 43 | { 44 | key: "blocks_per_sm" 45 | value: { string_value: "0" } 46 | } 47 | ] 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/tf/1/model.savedmodel/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/10-serving-ensemble-triton/models/tf/1/model.savedmodel/saved_model.pb -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/tf/1/model.savedmodel/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/10-serving-ensemble-triton/models/tf/1/model.savedmodel/variables/variables.data-00000-of-00001 -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/tf/1/model.savedmodel/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/10-serving-ensemble-triton/models/tf/1/model.savedmodel/variables/variables.index -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/tf/config.pbtxt: -------------------------------------------------------------------------------- 1 | backend: "tensorflow" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input [ 5 | { 6 | name: "dense_input" 7 | data_type: TYPE_FP32 8 | dims: [ -1, 4 ] 9 | } 10 | ] 11 | output [ 12 | { 13 | name: "round" 14 | data_type: TYPE_FP32 15 | dims: [ -1, 1 ] 16 | } 17 | ] 18 | instance_group [{ kind: KIND_GPU }] 19 | parameters [ 20 | { 21 | key: "model_type" 22 | value: { string_value: "tensorflow_savedmodel" } 23 | }, 24 | { 25 | key: "predict_proba" 26 | value: { string_value: "false" } 27 | }, 28 | { 29 | key: "output_class" 30 | value: { string_value: "true" } 31 | }, 32 | { 33 | key: "threshold" 34 | value: { string_value: "0.5" } 35 | }, 36 | { 37 | key: "algo" 38 | value: { string_value: "ALGO_AUTO" } 39 | }, 40 | { 41 | key: "storage_type" 42 | value: { string_value: "AUTO" } 43 | }, 44 | { 45 | key: "blocks_per_sm" 46 | value: { string_value: "0" } 47 | } 48 | ] 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- 
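Note on the TensorFlow model: tf/config.pbtxt above expects the SavedModel in models/tf/1/model.savedmodel to expose an input tensor named dense_input (FP32, [-1, 4]) and an output tensor named round (FP32, [-1, 1]); those names come from the Keras layers defined in src/tf/tf.py further down. A quick sanity check before pointing Triton at the model is to inspect the serving signature, as in the small sketch below (an illustration, assuming TensorFlow is installed and the working directory is 10-serving-ensemble-triton). Running saved_model_cli show --dir models/tf/1/model.savedmodel --all prints the same information from the command line.

# Sketch (not part of the repo): print the serving signature of the exported
# SavedModel to confirm the tensor names referenced in tf/config.pbtxt.
import tensorflow as tf

loaded = tf.saved_model.load("models/tf/1/model.savedmodel")
serving_fn = loaded.signatures["serving_default"]
print(serving_fn.structured_input_signature)  # expect a 'dense_input' float32 spec of shape [None, 4]
print(serving_fn.structured_outputs)          # expect a 'round' float32 spec of shape [None, 1]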
/10-serving-ensemble-triton/models/xgb/config.pbtxt: -------------------------------------------------------------------------------- 1 | backend: "fil" 2 | max_batch_size: 0 3 | input [ 4 | { 5 | name: "input__0" 6 | data_type: TYPE_FP32 7 | dims: [ -1, 4 ] 8 | } 9 | ] 10 | output [ 11 | { 12 | name: "output__0" 13 | data_type: TYPE_FP32 14 | dims: [ 1 ] 15 | } 16 | ] 17 | instance_group [{ kind: KIND_GPU }] 18 | parameters [ 19 | { 20 | key: "model_type" 21 | value: { string_value: "xgboost_json" } 22 | }, 23 | { 24 | key: "predict_proba" 25 | value: { string_value: "false" } 26 | }, 27 | { 28 | key: "output_class" 29 | value: { string_value: "true" } 30 | }, 31 | { 32 | key: "threshold" 33 | value: { string_value: "0.5" } 34 | }, 35 | { 36 | key: "algo" 37 | value: { string_value: "ALGO_AUTO" } 38 | }, 39 | { 40 | key: "storage_type" 41 | value: { string_value: "AUTO" } 42 | }, 43 | { 44 | key: "blocks_per_sm" 45 | value: { string_value: "0" } 46 | } 47 | ] 48 | 49 | 50 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/combine/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import json 4 | import triton_python_backend_utils as pb_utils 5 | import transformers 6 | 7 | class TritonPythonModel: 8 | def initialize(self, args): 9 | self.model_config = model_config = json.loads(args['model_config']) 10 | output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") 11 | self.output_dtype = pb_utils.triton_string_to_numpy(output_config['data_type']) 12 | 13 | def execute(self, requests): 14 | output_dtype = self.output_dtype 15 | responses = [] 16 | out_tensor = [] 17 | for request in requests: 18 | xgb_class = pb_utils.get_input_tensor_by_name(request, "xgb_class") 19 | tf_class = pb_utils.get_input_tensor_by_name(request, "tf_class") 20 | sci_1_class = pb_utils.get_input_tensor_by_name(request, "sci_1_class") 21 | sci_2_class = pb_utils.get_input_tensor_by_name(request, "sci_2_class") 22 | out_tensor.append(pb_utils.Tensor("OUTPUT0", 23 | (xgb_class.as_numpy() 24 | + tf_class.as_numpy() 25 | + sci_1_class.as_numpy() 26 | + sci_2_class.as_numpy()) / 4.0)) 27 | inference_response = pb_utils.InferenceResponse(output_tensors = out_tensor) 28 | responses.append(inference_response) 29 | return responses 30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/generate/generate.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import os 4 | 5 | features = 4 6 | samples = 1000 7 | 8 | data_path = os.environ['HOME'] + "/Triton/ensemble/data/" 9 | X_data = data_path + 'X.data.npy' 10 | Y_data = data_path + 'Y.data.npy' 11 | 12 | if (os.path.exists(X_data)): 13 | print("Data already exists at {}".format(X_data)) 14 | else: 15 | print("Generating data at {}".format(data_path)) 16 | X = np.random.rand(samples, features).astype('float32') 17 | Y = np.random.randint(2, size=samples) 18 | np.save(X_data, X) 19 | np.save(Y_data, Y) 20 | 21 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/mux/model.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import sys 4 | import json 5 | import triton_python_backend_utils as pb_utils 6 | import transformers 7 | 8 | class 
TritonPythonModel: 9 | 10 | def initialize(self, args): 11 | self.out_dtypes = {} 12 | self.model_config = model_config = json.loads(args['model_config']) 13 | mux_xgb_out_config = pb_utils.get_output_config_by_name(model_config, "mux_xgb_out") 14 | self.out_dtypes["mux_xgb_out"] = pb_utils.triton_string_to_numpy(mux_xgb_out_config["data_type"]) 15 | mux_tf_out_config = pb_utils.get_output_config_by_name(model_config, "mux_tf_out") 16 | self.out_dtypes["mux_tf_out"] = pb_utils.triton_string_to_numpy(mux_tf_out_config["data_type"]) 17 | mux_sci_1_out_config = pb_utils.get_output_config_by_name(model_config, "mux_sci_1_out") 18 | self.out_dtypes["mux_sci_1_out"] = pb_utils.triton_string_to_numpy(mux_sci_1_out_config["data_type"]) 19 | mux_sci_2_out_config = pb_utils.get_output_config_by_name(model_config, "mux_sci_2_out") 20 | self.out_dtypes["mux_sci_2_out"] = pb_utils.triton_string_to_numpy(mux_sci_1_out_config["data_type"]) 21 | 22 | 23 | def execute(self, requests): 24 | responses = [] 25 | for request in requests: 26 | mux_in = pb_utils.get_input_tensor_by_name(request, "mux_in") 27 | out_tensors = [] 28 | for model in ["mux_xgb_out", "mux_tf_out", "mux_sci_1_out", "mux_sci_2_out"]: 29 | out_tensors.append(pb_utils.Tensor(model, mux_in.as_numpy().astype(self.out_dtypes[model]))) 30 | inference_response = pb_utils.InferenceResponse(output_tensors = out_tensors) 31 | responses.append(inference_response) 32 | return responses 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/sci_1/sci_1.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import os 5 | 6 | from sklearn.ensemble import RandomForestClassifier 7 | 8 | from numpy import mean 9 | from numpy import std 10 | from sklearn.datasets import make_classification 11 | from sklearn.model_selection import cross_val_score 12 | from sklearn.model_selection import RepeatedStratifiedKFold 13 | from sklearn.ensemble import RandomForestClassifier 14 | from sklearn.model_selection import train_test_split 15 | 16 | import pickle 17 | 18 | import subprocess 19 | 20 | 21 | seed = 7 22 | features = 4 23 | samples = 1000 24 | 25 | data_path = os.environ['HOME'] + "/Triton/ensemble/data/" 26 | X_data = data_path + 'X.data.npy' 27 | Y_data = data_path + 'Y.data.npy' 28 | 29 | if (not os.path.exists(X_data)): 30 | print("Please run src/generate.py to create dummy data for modes") 31 | else: 32 | X = np.load(X_data) 33 | Y = np.load(Y_data) 34 | 35 | print("shape X " + str(X.shape)) 36 | print("shape Y " + str(Y.shape)) 37 | 38 | model_path = os.environ['HOME'] + "/Triton/ensemble/models/sci_1/1" 39 | 40 | test_size = 0.33 41 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 42 | 43 | 44 | model = RandomForestClassifier(max_depth=2, random_state=0) 45 | model.fit(X_train, y_train) 46 | ### print(model.predict([[0, 0, 0, 0]])) 47 | 48 | cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1) 49 | n_scores = cross_val_score(model, X_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise') 50 | 51 | print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores))) 52 | 53 | 54 | pickle.dump(model, open(model_path + "/sci_1.pkl", 'wb')) 55 | 56 | subprocess.run(["{}/Triton/ensemble/fil_backend/scripts/convert_sklearn".format(os.environ['HOME']), model_path + "/sci_1.pkl"]) 57 | 58 | 
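The model-building scripts under src/ assume that src/generate/generate.py has already created the shared dummy dataset, and each script writes its artifact into the matching models/<name>/1/ directory through the $HOME/Triton/ensemble paths hard-coded above (sci_2.py, tf.py and xgb.py follow below). A hypothetical driver that runs them in that order is sketched here; the script list and the use of sys.executable are assumptions for illustration, not part of the repository.

# Hypothetical driver (not part of the repo): populate every models/<name>/1/
# directory before starting the Triton server with this model repository.
import subprocess
import sys

steps = [
    "src/generate/generate.py",  # writes X.data.npy / Y.data.npy
    "src/sci_1/sci_1.py",        # RandomForestClassifier -> sci_1.pkl + checkpoint.tl
    "src/sci_2/sci_2.py",        # ExtraTreesClassifier   -> sci_2.pkl + checkpoint.tl
    "src/tf/tf.py",              # Keras model            -> model.savedmodel
    "src/xgb/xgb.py",            # XGBClassifier          -> xgboost model file
]
for script in steps:
    subprocess.run([sys.executable, script], check=True)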
-------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/sci_2/sci_2.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import os 5 | 6 | 7 | from sklearn.ensemble import ExtraTreesClassifier 8 | from numpy import mean 9 | from numpy import std 10 | from sklearn.datasets import make_classification 11 | from sklearn.model_selection import cross_val_score 12 | from sklearn.model_selection import RepeatedStratifiedKFold 13 | from sklearn.model_selection import train_test_split 14 | 15 | import pickle 16 | 17 | import subprocess 18 | 19 | 20 | seed = 7 21 | features = 4 22 | samples = 1000 23 | 24 | data_path = os.environ['HOME'] + "/Triton/ensemble/data/" 25 | X_data = data_path + 'X.data.npy' 26 | Y_data = data_path + 'Y.data.npy' 27 | 28 | if (not os.path.exists(X_data)): 29 | print("Please run src/generate.py to create dummy data for modes") 30 | else: 31 | X = np.load(X_data) 32 | Y = np.load(Y_data) 33 | 34 | print("shape X " + str(X.shape)) 35 | print("shape Y " + str(Y.shape)) 36 | 37 | model_path = os.environ['HOME'] + "/Triton/ensemble/models/sci_2/1" 38 | 39 | test_size = 0.33 40 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 41 | 42 | 43 | model = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0) 44 | model.fit(X_train, y_train) 45 | ### print(model.predict([[0, 0, 0, 0]])) 46 | 47 | cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1) 48 | n_scores = cross_val_score(model, X_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise') 49 | 50 | print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores))) 51 | 52 | pickle.dump(model, open(model_path + "/sci_2.pkl", 'wb')) 53 | 54 | subprocess.run(["{}/Triton/ensemble/fil_backend/scripts/convert_sklearn".format(os.environ['HOME']), model_path + "/sci_2.pkl"]) 55 | 56 | 57 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/test/combine_01.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy 4 | import tritonclient.http as triton_http 5 | 6 | # Set up both HTTP and GRPC clients. Note that the GRPC client is generally 7 | # somewhat faster. 
8 | http_client = triton_http.InferenceServerClient( 9 | url='localhost:8000', 10 | verbose=False, 11 | concurrency=5 12 | ) 13 | 14 | # Generate example data to classify 15 | features = 1 16 | samples = 1 17 | data = numpy.random.rand(features).astype('float32') 18 | tf_data = numpy.random.rand(samples, features).astype('float32') 19 | 20 | print("data.shape:" + str(data.shape)) 21 | 22 | # Set up Triton input and output objects for HTTP 23 | inputs = [] 24 | triton_input_http_1 = triton_http.InferInput( 25 | 'xgb_class', 26 | [features], 27 | 'FP32' 28 | ) 29 | triton_input_http_1.set_data_from_numpy(data, binary_data=True) 30 | inputs.append(triton_input_http_1) 31 | 32 | triton_input_http_2 = triton_http.InferInput( 33 | 'tf_class', 34 | [samples, features], 35 | 'FP32' 36 | ) 37 | triton_input_http_2.set_data_from_numpy(tf_data, binary_data=True) 38 | inputs.append(triton_input_http_2) 39 | 40 | triton_input_http_3 = triton_http.InferInput( 41 | 'sci_1_class', 42 | [features], 43 | 'FP32' 44 | ) 45 | triton_input_http_3.set_data_from_numpy(data, binary_data=True) 46 | inputs.append(triton_input_http_3) 47 | 48 | triton_input_http_4 = triton_http.InferInput( 49 | 'sci_2_class', 50 | [features], 51 | 'FP32' 52 | ) 53 | triton_input_http_4.set_data_from_numpy(data, binary_data=True) 54 | inputs.append(triton_input_http_4) 55 | 56 | triton_output_http = triton_http.InferRequestedOutput( 57 | 'OUTPUT0', 58 | binary_data=True 59 | ) 60 | 61 | 62 | # Submit inference request 63 | request_http = http_client.infer( 64 | 'combine', 65 | model_version='1', 66 | inputs=inputs, 67 | outputs=[triton_output_http] 68 | ) 69 | 70 | # Get results as numpy arrays 71 | result_http = request_http.as_numpy('OUTPUT0') 72 | 73 | print(result_http) 74 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/test/ensemble_01.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy 4 | import tritonclient.http as triton_http 5 | 6 | http_client = triton_http.InferenceServerClient( 7 | url='localhost:8000', 8 | verbose=False, 9 | concurrency=5 10 | ) 11 | 12 | features = 4 13 | samples = 1 14 | data = numpy.random.rand(samples, features).astype('float32') 15 | 16 | triton_input_http = triton_http.InferInput( 17 | 'INPUT0', 18 | (samples, features), 19 | 'FP32' 20 | ) 21 | triton_input_http.set_data_from_numpy(data, binary_data=True) 22 | 23 | triton_output_http = triton_http.InferRequestedOutput( 24 | 'OUTPUT0', 25 | binary_data=True 26 | ) 27 | 28 | 29 | request_http = http_client.infer( 30 | 'ensemble', 31 | model_version='1', 32 | inputs=[triton_input_http], 33 | outputs=[triton_output_http] 34 | ) 35 | 36 | result_http = request_http.as_numpy('OUTPUT0') 37 | 38 | print(result_http) 39 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/test/mux.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import tritonclient.http as triton_http 5 | 6 | # Set up HTTP client. 
7 | http_client = triton_http.InferenceServerClient( 8 | url = 'localhost:8000', 9 | verbose = False, 10 | concurrency = 5 11 | ) 12 | 13 | features = 4 14 | samples = 1 15 | data = np.random.rand(samples, features).astype('float32') 16 | 17 | 18 | triton_input_http = triton_http.InferInput( 19 | 'mux_in', 20 | (samples, features), 21 | 'FP32' 22 | ) 23 | triton_input_http.set_data_from_numpy(data, binary_data=True) 24 | 25 | 26 | # Set up Triton input and output objects for HTTP 27 | outputs = [] 28 | 29 | triton_output_http_1 = triton_http.InferRequestedOutput( 30 | 'mux_xgb_out', 31 | binary_data = True 32 | ) 33 | outputs.append(triton_output_http_1) 34 | 35 | triton_output_http_2 = triton_http.InferRequestedOutput( 36 | 'mux_tf_out', 37 | binary_data = True 38 | ) 39 | outputs.append(triton_output_http_2) 40 | 41 | triton_output_http_3 = triton_http.InferRequestedOutput( 42 | 'mux_sci_1_out', 43 | binary_data = True 44 | ) 45 | outputs.append(triton_output_http_3) 46 | 47 | triton_output_http_4 = triton_http.InferRequestedOutput( 48 | 'mux_sci_2_out', 49 | binary_data = True 50 | ) 51 | outputs.append(triton_output_http_4) 52 | 53 | # Submit inference request 54 | request_http = http_client.infer( 55 | 'mux', 56 | model_version = '1', 57 | inputs = [triton_input_http], 58 | outputs = outputs 59 | ) 60 | 61 | # Get results as numpy arrays 62 | 63 | results = {} 64 | for model in ["mux_xgb_out", "mux_tf_out", "mux_sci_1_out", "mux_sci_2_out"]: 65 | results[model] = request_http.as_numpy(model) 66 | 67 | for model in ["mux_xgb_out", "mux_tf_out", "mux_sci_1_out", "mux_sci_2_out"]: 68 | print(model + " " + str(results[model]) + '\n') 69 | 70 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/test/sci_1.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy 4 | import tritonclient.http as triton_http 5 | 6 | # Set up both HTTP and GRPC clients. Note that the GRPC client is generally 7 | # somewhat faster. 8 | http_client = triton_http.InferenceServerClient( 9 | url='localhost:8000', 10 | verbose=False, 11 | concurrency=5 12 | ) 13 | 14 | # Generate example data to classify 15 | features = 4 16 | samples = 1 17 | data = numpy.random.rand(samples, features).astype('float32') 18 | 19 | # Set up Triton input and output objects for HTTP 20 | triton_input_http = triton_http.InferInput( 21 | 'input__0', 22 | (samples, features), 23 | 'FP32' 24 | ) 25 | triton_input_http.set_data_from_numpy(data, binary_data=True) 26 | 27 | triton_output_http = triton_http.InferRequestedOutput( 28 | 'output__0', 29 | binary_data=True 30 | ) 31 | 32 | 33 | # Submit inference request 34 | request_http = http_client.infer( 35 | 'sci_1', 36 | model_version='1', 37 | inputs=[triton_input_http], 38 | outputs=[triton_output_http] 39 | ) 40 | 41 | # Get results as numpy arrays 42 | result_http = request_http.as_numpy('output__0') 43 | 44 | print(result_http) 45 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/test/sci_2.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy 4 | import tritonclient.http as triton_http 5 | 6 | # Set up both HTTP and GRPC clients. Note that the GRPC client is generally 7 | # somewhat faster. 
8 | http_client = triton_http.InferenceServerClient( 9 | url='localhost:8000', 10 | verbose=False, 11 | concurrency=5 12 | ) 13 | 14 | # Generate example data to classify 15 | features = 4 16 | samples = 1 17 | data = numpy.random.rand(samples, features).astype('float32') 18 | 19 | # Set up Triton input and output objects for HTTP 20 | triton_input_http = triton_http.InferInput( 21 | 'input__0', 22 | (samples, features), 23 | 'FP32' 24 | ) 25 | triton_input_http.set_data_from_numpy(data, binary_data=True) 26 | 27 | triton_output_http = triton_http.InferRequestedOutput( 28 | 'output__0', 29 | binary_data=True 30 | ) 31 | 32 | 33 | # Submit inference request 34 | request_http = http_client.infer( 35 | 'sci_2', 36 | model_version='1', 37 | inputs=[triton_input_http], 38 | outputs=[triton_output_http] 39 | ) 40 | 41 | # Get results as numpy arrays 42 | result_http = request_http.as_numpy('output__0') 43 | 44 | print(result_http) 45 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/test/tf_01.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy 4 | import tritonclient.http as triton_http 5 | 6 | # Set up both HTTP and GRPC clients. Note that the GRPC client is generally 7 | # somewhat faster. 8 | http_client = triton_http.InferenceServerClient( 9 | url='localhost:8000', 10 | verbose=False, 11 | concurrency=5 12 | ) 13 | 14 | # Generate example data to classify 15 | features = 4 16 | samples = 1 17 | data = numpy.random.rand(samples, features).astype('float32') 18 | 19 | # Set up Triton input and output objects for HTTP 20 | triton_input_http = triton_http.InferInput( 21 | 'dense_input', 22 | (samples, features), 23 | 'FP32' 24 | ) 25 | triton_input_http.set_data_from_numpy(data, binary_data=True) 26 | 27 | triton_output_http = triton_http.InferRequestedOutput( 28 | 'round', 29 | binary_data=True 30 | ) 31 | 32 | 33 | # Submit inference request 34 | request_http = http_client.infer( 35 | 'tf', 36 | model_version='1', 37 | inputs=[triton_input_http], 38 | outputs=[triton_output_http] 39 | ) 40 | 41 | # Get results as numpy arrays 42 | result_http = request_http.as_numpy('round') 43 | 44 | print(result_http) 45 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/test/xgb_01.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy 3 | import tritonclient.http as triton_http 4 | 5 | # Set up both HTTP and GRPC clients. Note that the GRPC client is generally 6 | # somewhat faster. 
7 | http_client = triton_http.InferenceServerClient( 8 | url='localhost:8000', 9 | verbose=False, 10 | concurrency=5 11 | ) 12 | 13 | # Generate example data to classify 14 | features = 4 15 | samples = 1 16 | data = numpy.random.rand(samples, features).astype('float32') 17 | 18 | # Set up Triton input and output objects for HTTP 19 | triton_input_http = triton_http.InferInput( 20 | 'input__0', 21 | (samples, features), 22 | 'FP32' 23 | ) 24 | triton_input_http.set_data_from_numpy(data, binary_data=True) 25 | 26 | triton_output_http = triton_http.InferRequestedOutput( 27 | 'output__0', 28 | binary_data=True 29 | ) 30 | 31 | 32 | # Submit inference request 33 | request_http = http_client.infer( 34 | 'xgb', 35 | model_version='1', 36 | inputs=[triton_input_http], 37 | outputs=[triton_output_http] 38 | ) 39 | 40 | # Get results as numpy arrays 41 | result_http = request_http.as_numpy('output__0') 42 | 43 | print(result_http) 44 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/tf/tf.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import os 5 | 6 | import tensorflow as tf 7 | import tensorflow.compat.v2.feature_column as fc 8 | 9 | from sklearn.model_selection import train_test_split 10 | 11 | 12 | class Round(tf.keras.layers.Layer): 13 | def __init__(self, num_outputs): 14 | super(Round, self).__init__() 15 | self.num_outputs = num_outputs 16 | 17 | def call(self, inputs): 18 | #print("inputs:" + str(inputs)) 19 | #outputs = inputs.__floordiv__(1.0) 20 | outputs = inputs 21 | return outputs 22 | 23 | 24 | 25 | seed = 7 26 | features = 4 27 | samples = 1000 28 | 29 | data_path = os.environ['HOME'] + "/Triton/ensemble/data/" 30 | X_data = data_path + 'X.data.npy' 31 | Y_data = data_path + 'Y.data.npy' 32 | 33 | if (not os.path.exists(X_data)): 34 | print("Please run src/generate.py to create dummy data for modes") 35 | else: 36 | X = np.load(X_data) 37 | Y = np.load(Y_data) 38 | 39 | print("s)hape X " + str(X.shape)) 40 | print("shape Y " + str(Y.shape)) 41 | 42 | model_path = os.environ['HOME'] + "/Triton/ensemble/models/tf/1" 43 | 44 | test_size = 0.33 45 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 46 | 47 | 48 | model = tf.keras.models.Sequential() 49 | model.add(tf.keras.layers.Dense(4, input_dim=4, activation='relu')) 50 | model.add(tf.keras.layers.Dense(1, activation='sigmoid')) 51 | model.add(Round(1)) 52 | model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy']) 53 | model.fit(X_train, y_train) 54 | 55 | test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2) 56 | 57 | print('\nTest accuracy:', test_acc) 58 | 59 | # creates /models/tf/model.tf/saved_model.pb 60 | tf.saved_model.save(model, model_path + "/model.savedmodel") 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/tf/tf.py_0: -------------------------------------------------------------------------------- 1 | #! 
/home/dfisk/miniconda3/envs/ensemble/bin/python 2 | 3 | import numpy as np 4 | import os 5 | 6 | import tensorflow as tf 7 | from sklearn.model_selection import train_test_split 8 | 9 | seed = 7 10 | features = 4 11 | samples = 1000 12 | 13 | data_path = os.environ['HOME'] + "/Triton/ensemble/data/" 14 | X_data = data_path + 'X.data.npy' 15 | Y_data = data_path + 'Y.data.npy' 16 | 17 | if (not os.path.exists(X_data)): 18 | print("Please run src/generate.py to create dummy data for modes") 19 | else: 20 | X = np.load(X_data) 21 | Y = np.load(Y_data) 22 | 23 | print("shape X " + str(X.shape)) 24 | print("shape Y " + str(Y.shape)) 25 | 26 | model_path = os.environ['HOME'] + "/Triton/ensemble/models/tf/1" 27 | 28 | test_size = 0.33 29 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 30 | 31 | 32 | class_names = ['Zero', 'One'] 33 | 34 | model = tf.keras.Sequential([ 35 | tf.keras.layers.Dense(128, activation='relu'), 36 | tf.keras.layers.Dense(10) 37 | ]) 38 | 39 | 40 | model.compile(optimizer='adam', 41 | loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 42 | metrics=['accuracy']) 43 | 44 | 45 | 46 | model.fit(X_train, y_train, epochs=10) 47 | 48 | 49 | test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2) 50 | 51 | print('\nTest accuracy:', test_acc) 52 | 53 | # creates /models/tf/model.tf/saved_model.pb 54 | tf.saved_model.save(model, model_path + "model.savedmodel") 55 | 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/xgb/xgb.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import os 5 | 6 | from xgboost import XGBClassifier 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.metrics import accuracy_score 9 | 10 | seed = 7 11 | features = 4 12 | samples = 1000 13 | 14 | data_path = os.environ['HOME'] + "/Triton/ensemble/data/" 15 | X_data = data_path + 'X.data.npy' 16 | Y_data = data_path + 'Y.data.npy' 17 | 18 | if (not os.path.exists(X_data)): 19 | print("Please run src/generate.py to create dummy data for modes") 20 | else: 21 | X = np.load(X_data) 22 | Y = np.load(Y_data) 23 | 24 | model_path = os.environ['HOME'] + "/Triton/ensemble/models/xgb/1" 25 | 26 | print("shape X " + str(X.shape)) 27 | print("shape Y " + str(Y.shape)) 28 | 29 | test_size = 0.33 30 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 31 | 32 | model = XGBClassifier() 33 | model.fit(X_train, y_train) 34 | y_pred = model.predict(X_test) 35 | accuracy = accuracy_score(y_test, y_pred) 36 | print("Test Accuracy: {:.2f}".format(accuracy * 100.0)) 37 | 38 | ### .save_config() 39 | ### print("config:") 40 | ### print(config) 41 | 42 | ###model.save_model(model_path + "/xgboost.json") 43 | 44 | model.save_model(model_path + "/xgboost.model") 45 | -------------------------------------------------------------------------------- /11-pytorch-on-tpu-vertex-ai/pytorch-on-vertex-ai-tpu-train-mnist.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "63ecb70a-c170-4721-a895-c67cf1ff873a", 6 | "metadata": {}, 7 | "source": [ 8 | "# Training MNIST with PyTorch on TPU-VM using Vertex AI" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "c473acbc-9370-4de8-ad50-f4471495db2f", 14 | "metadata": {}, 15 | 
"source": [ 16 | "# Imports and initialization" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "id": "4dae2be6-e296-45e5-9021-c116a96e72d2", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "! pip -q install google-cloud-aiplatform" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "7b78a856-4264-40b9-a71c-e8acb6b197f3", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "from datetime import datetime\n", 37 | "from google.cloud import aiplatform" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "id": "a9d711f4-8b33-467b-bccc-c7d501ffcfd5", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "PROJECT_ID = 'rthallam-demo-project'\n", 48 | "BUCKET_NAME = \"cloud-ai-platform-2f444b6a-a742-444b-b91a-c7519f51bd77\"\n", 49 | "BUCKET_URI = f'gs://{BUCKET_NAME}'\n", 50 | "REGION = 'us-central1'" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "249ce760-396b-43c8-9022-d9551ba6c12e", 56 | "metadata": {}, 57 | "source": [ 58 | "## Create Training Script" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "ad233804-8b02-4249-bd45-77531f589a46", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "%%writefile train.py\n", 69 | "\n", 70 | "# adapted from https://github.com/pytorch/xla/blob/master/test/test_train_mp_mnist.py\n", 71 | "\n", 72 | "import args_parse\n", 73 | "\n", 74 | "FLAGS = args_parse.parse_common_options(\n", 75 | " datadir='/tmp/mnist-data',\n", 76 | " batch_size=128,\n", 77 | " momentum=0.5,\n", 78 | " lr=0.01,\n", 79 | " target_accuracy=98.0,\n", 80 | " num_epochs=18)\n", 81 | "\n", 82 | "import os\n", 83 | "import shutil\n", 84 | "import sys\n", 85 | "import numpy as np\n", 86 | "import torch\n", 87 | "import torch.nn as nn\n", 88 | "import torch.nn.functional as F\n", 89 | "import torch.optim as optim\n", 90 | "from torchvision import datasets, transforms\n", 91 | "import torch_xla\n", 92 | "import torch_xla.debug.metrics as met\n", 93 | "import torch_xla.distributed.parallel_loader as pl\n", 94 | "import torch_xla.utils.utils as xu\n", 95 | "import torch_xla.core.xla_model as xm\n", 96 | "import torch_xla.distributed.xla_multiprocessing as xmp\n", 97 | "import torch_xla.test.test_utils as test_utils\n", 98 | "\n", 99 | "\n", 100 | "class MNIST(nn.Module):\n", 101 | "\n", 102 | " def __init__(self):\n", 103 | " super(MNIST, self).__init__()\n", 104 | " self.conv1 = nn.Conv2d(1, 10, kernel_size=5)\n", 105 | " self.bn1 = nn.BatchNorm2d(10)\n", 106 | " self.conv2 = nn.Conv2d(10, 20, kernel_size=5)\n", 107 | " self.bn2 = nn.BatchNorm2d(20)\n", 108 | " self.fc1 = nn.Linear(320, 50)\n", 109 | " self.fc2 = nn.Linear(50, 10)\n", 110 | "\n", 111 | " def forward(self, x):\n", 112 | " x = F.relu(F.max_pool2d(self.conv1(x), 2))\n", 113 | " x = self.bn1(x)\n", 114 | " x = F.relu(F.max_pool2d(self.conv2(x), 2))\n", 115 | " x = self.bn2(x)\n", 116 | " x = torch.flatten(x, 1)\n", 117 | " x = F.relu(self.fc1(x))\n", 118 | " x = self.fc2(x)\n", 119 | " return F.log_softmax(x, dim=1)\n", 120 | "\n", 121 | "\n", 122 | "def _train_update(device, x, loss, tracker, writer):\n", 123 | " test_utils.print_training_update(\n", 124 | " device,\n", 125 | " x,\n", 126 | " loss.item(),\n", 127 | " tracker.rate(),\n", 128 | " tracker.global_rate(),\n", 129 | " summary_writer=writer)\n", 130 | "\n", 131 | "\n", 132 | "def train_mnist(flags, **kwargs):\n", 133 | " torch.manual_seed(1)\n", 134 | "\n", 135 | " 
if flags.fake_data:\n", 136 | " train_loader = xu.SampleGenerator(\n", 137 | " data=(torch.zeros(flags.batch_size, 1, 28,\n", 138 | " 28), torch.zeros(flags.batch_size,\n", 139 | " dtype=torch.int64)),\n", 140 | " sample_count=60000 // flags.batch_size // xm.xrt_world_size())\n", 141 | " test_loader = xu.SampleGenerator(\n", 142 | " data=(torch.zeros(flags.batch_size, 1, 28,\n", 143 | " 28), torch.zeros(flags.batch_size,\n", 144 | " dtype=torch.int64)),\n", 145 | " sample_count=10000 // flags.batch_size // xm.xrt_world_size())\n", 146 | " else:\n", 147 | " train_dataset = datasets.MNIST(\n", 148 | " os.path.join(flags.datadir, str(xm.get_ordinal())),\n", 149 | " train=True,\n", 150 | " download=True,\n", 151 | " transform=transforms.Compose(\n", 152 | " [transforms.ToTensor(),\n", 153 | " transforms.Normalize((0.1307,), (0.3081,))]))\n", 154 | " test_dataset = datasets.MNIST(\n", 155 | " os.path.join(flags.datadir, str(xm.get_ordinal())),\n", 156 | " train=False,\n", 157 | " download=True,\n", 158 | " transform=transforms.Compose(\n", 159 | " [transforms.ToTensor(),\n", 160 | " transforms.Normalize((0.1307,), (0.3081,))]))\n", 161 | " train_sampler = None\n", 162 | " if xm.xrt_world_size() > 1:\n", 163 | " train_sampler = torch.utils.data.distributed.DistributedSampler(\n", 164 | " train_dataset,\n", 165 | " num_replicas=xm.xrt_world_size(),\n", 166 | " rank=xm.get_ordinal(),\n", 167 | " shuffle=True)\n", 168 | " train_loader = torch.utils.data.DataLoader(\n", 169 | " train_dataset,\n", 170 | " batch_size=flags.batch_size,\n", 171 | " sampler=train_sampler,\n", 172 | " drop_last=flags.drop_last,\n", 173 | " shuffle=False if train_sampler else True,\n", 174 | " num_workers=flags.num_workers)\n", 175 | " test_loader = torch.utils.data.DataLoader(\n", 176 | " test_dataset,\n", 177 | " batch_size=flags.batch_size,\n", 178 | " drop_last=flags.drop_last,\n", 179 | " shuffle=False,\n", 180 | " num_workers=flags.num_workers)\n", 181 | "\n", 182 | " # Scale learning rate to num cores\n", 183 | " lr = flags.lr * xm.xrt_world_size()\n", 184 | "\n", 185 | " device = xm.xla_device()\n", 186 | " model = MNIST().to(device)\n", 187 | " writer = None\n", 188 | " if xm.is_master_ordinal():\n", 189 | " writer = test_utils.get_summary_writer(flags.logdir)\n", 190 | " optimizer = optim.SGD(model.parameters(), lr=lr, momentum=flags.momentum)\n", 191 | " loss_fn = nn.NLLLoss()\n", 192 | "\n", 193 | " def train_loop_fn(loader):\n", 194 | " tracker = xm.RateTracker()\n", 195 | " model.train()\n", 196 | " for step, (data, target) in enumerate(loader):\n", 197 | " optimizer.zero_grad()\n", 198 | " output = model(data)\n", 199 | " loss = loss_fn(output, target)\n", 200 | " loss.backward()\n", 201 | " xm.optimizer_step(optimizer)\n", 202 | " tracker.add(flags.batch_size)\n", 203 | " if step % flags.log_steps == 0:\n", 204 | " xm.add_step_closure(\n", 205 | " _train_update,\n", 206 | " args=(device, step, loss, tracker, writer),\n", 207 | " run_async=FLAGS.async_closures)\n", 208 | "\n", 209 | " def test_loop_fn(loader):\n", 210 | " total_samples = 0\n", 211 | " correct = 0\n", 212 | " model.eval()\n", 213 | " for data, target in loader:\n", 214 | " output = model(data)\n", 215 | " pred = output.max(1, keepdim=True)[1]\n", 216 | " correct += pred.eq(target.view_as(pred)).sum()\n", 217 | " total_samples += data.size()[0]\n", 218 | "\n", 219 | " accuracy = 100.0 * correct.item() / total_samples\n", 220 | " accuracy = xm.mesh_reduce('test_accuracy', accuracy, np.mean)\n", 221 | " return accuracy\n", 222 | "\n", 223 | " 
train_device_loader = pl.MpDeviceLoader(train_loader, device)\n", 224 | " test_device_loader = pl.MpDeviceLoader(test_loader, device)\n", 225 | " accuracy, max_accuracy = 0.0, 0.0\n", 226 | " for epoch in range(1, flags.num_epochs + 1):\n", 227 | " xm.master_print('Epoch {} train begin {}'.format(epoch, test_utils.now()))\n", 228 | " train_loop_fn(train_device_loader)\n", 229 | " xm.master_print('Epoch {} train end {}'.format(epoch, test_utils.now()))\n", 230 | "\n", 231 | " accuracy = test_loop_fn(test_device_loader)\n", 232 | " xm.master_print('Epoch {} test end {}, Accuracy={:.2f}'.format(\n", 233 | " epoch, test_utils.now(), accuracy))\n", 234 | " max_accuracy = max(accuracy, max_accuracy)\n", 235 | " test_utils.write_to_summary(\n", 236 | " writer,\n", 237 | " epoch,\n", 238 | " dict_to_write={'Accuracy/test': accuracy},\n", 239 | " write_xla_metrics=True)\n", 240 | " if flags.metrics_debug:\n", 241 | " xm.master_print(met.metrics_report())\n", 242 | "\n", 243 | " test_utils.close_summary_writer(writer)\n", 244 | " xm.master_print('Max Accuracy: {:.2f}%'.format(max_accuracy))\n", 245 | " return max_accuracy\n", 246 | "\n", 247 | "\n", 248 | "def _mp_fn(index, flags):\n", 249 | " torch.set_default_tensor_type('torch.FloatTensor')\n", 250 | " accuracy = train_mnist(flags)\n", 251 | " if flags.tidy and os.path.isdir(flags.datadir):\n", 252 | " shutil.rmtree(flags.datadir)\n", 253 | " if accuracy < flags.target_accuracy:\n", 254 | " print('Accuracy {} is below target {}'.format(accuracy,\n", 255 | " flags.target_accuracy))\n", 256 | " sys.exit(21)\n", 257 | "\n", 258 | "\n", 259 | "if __name__ == '__main__':\n", 260 | " xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=FLAGS.num_cores)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "id": "c3aeb216-d3c7-4c8b-8e8f-9cc9a32db2ac", 266 | "metadata": {}, 267 | "source": [ 268 | "## Build custom container image with dependencies" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "id": "dfeb71de-039e-4b80-815f-a253d1070329", 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "%%writefile Dockerfile.pytorch-tpu-mnist\n", 279 | "\n", 280 | "FROM gcr.io/tpu-pytorch/xla:r1.12_3.8_tpuvm\n", 281 | "\n", 282 | "RUN pip install https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/wheels/libtpu-nightly/libtpu_nightly-0.1.dev20221020-py3-none-any.whl\n", 283 | "\n", 284 | "WORKDIR /\n", 285 | "COPY train.py /" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "id": "9e806073-7c70-42ad-beea-615970e64203", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "# base container image name\n", 296 | "DOCKER_ARTIFACT_REPO = 'pytorch-on-tpu-vm'\n", 297 | "IMAGE_NAME = \"train-mnist\"\n", 298 | "# IMAGE_URI = f\"{REGION}-docker.pkg.dev/{PROJECT_ID}/{DOCKER_ARTIFACT_REPO}/{IMAGE_NAME}\"\n", 299 | "IMAGE_URI = f\"us.gcr.io/{PROJECT_ID}/{DOCKER_ARTIFACT_REPO}/{IMAGE_NAME}\"\n", 300 | "\n", 301 | "IMAGE_URI" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "id": "ac721c86-1e4b-4619-b967-11916d4a306a", 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "%%writefile cloudbuild.yaml\n", 312 | "\n", 313 | "steps:\n", 314 | "- name: 'gcr.io/cloud-builders/docker'\n", 315 | " args: ['build', '-t', '$_IMAGE_URI', '$_FILE_LOCATION', '-f', '$_FILE_LOCATION/Dockerfile.$_DOCKERNAME']\n", 316 | "images:\n", 317 | "- '$_IMAGE_URI'" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 
null, 323 | "id": "b095c2ee-1c89-4858-87b9-e5a258cac87f", 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "FILE_LOCATION = './'\n", 328 | "\n", 329 | "! gcloud builds submit \\\n", 330 | " --region $REGION \\\n", 331 | " --config src/cloudbuild.yaml \\\n", 332 | " --substitutions _DOCKERNAME=\"pytorch-tpu-mnist\",_IMAGE_URI=$IMAGE_URI,_FILE_LOCATION=$FILE_LOCATION \\\n", 333 | " --timeout \"2h\" \\\n", 334 | " --machine-type=e2-highcpu-32 \\\n", 335 | " --quiet" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "id": "7154d3d8-7e55-4d20-9605-a4572b9481bb", 341 | "metadata": {}, 342 | "source": [ 343 | "## Submit training job" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "id": "e4d498f0-0fa3-4a92-a064-ed67d12692b0", 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "# initialize Vertex AI SDK\n", 354 | "aiplatform.init(project=PROJECT_ID, location=REGION)" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "id": "ccd667b6-ac74-457e-8efb-78606f4ef2e0", 360 | "metadata": {}, 361 | "source": [ 362 | "### Using CustomJob" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "id": "5d69934c-b16a-4605-933a-ec84b97d36c3", 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "TIMESTAMP = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", 373 | "APP_NAME = \"pytorch-train-mnist-tpu\"\n", 374 | "JOB_NAME = f\"{APP_NAME}-{TIMESTAMP}\"\n", 375 | "print(f\"JOB_NAME = {JOB_NAME}\")" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "id": "3e95db28-c233-439f-9538-f0214cf158df", 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "# define worker pool specs\n", 386 | "worker_pool_specs = [\n", 387 | " {\n", 388 | " \"machine_spec\": {\n", 389 | " \"machine_type\": \"cloud-tpu\",\n", 390 | " \"accelerator_type\": \"TPU_V2\",\n", 391 | " \"accelerator_count\": 8,\n", 392 | " },\n", 393 | " \"replica_count\": 1,\n", 394 | " \"container_spec\": {\n", 395 | " \"image_uri\": IMAGE_URI,\n", 396 | " \"command\": [\"python3\", \"/train.py\"],\n", 397 | " \"args\": [],\n", 398 | " \"env\": [\n", 399 | " {\n", 400 | " \"name\": \"XRT_TPU_CONFIG\",\n", 401 | " \"value\": \"localservice;0;localhost:51011\"\n", 402 | " }\n", 403 | " ]\n", 404 | " },\n", 405 | " }\n", 406 | "]" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "id": "b5bfd12b-2bd8-456b-ad2d-cb92bc69c7da", 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "# create custom job\n", 417 | "job = aiplatform.CustomJob(\n", 418 | " display_name=JOB_NAME,\n", 419 | " worker_pool_specs=worker_pool_specs,\n", 420 | " staging_bucket=BUCKET_URI\n", 421 | ")" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "id": "ceda0b19-0fc8-414c-a03b-ccd17f23a3b9", 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "# run the job\n", 432 | "job_response = job.run()" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "id": "517bc61e-a7d0-4c27-8ef7-18ed2cde8dc6", 438 | "metadata": {}, 439 | "source": [ 440 | "### Using CustomContainerTrainingJob" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "id": "fd6d3a59-564c-4eef-9b5a-81007df4059f", 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "TIMESTAMP = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", 451 | "APP_NAME = \"pytorch-train-mnist-tpu\"\n", 452 | 
"JOB_NAME = f\"{APP_NAME}-{TIMESTAMP}\"\n", 453 | "print(f\"JOB_NAME = {JOB_NAME}\")" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "id": "076e7b48-1a27-4e03-9d92-8981cc213fd1", 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "# configure the job with container image spec\n", 464 | "job = aiplatform.CustomContainerTrainingJob(\n", 465 | " display_name=JOB_NAME, \n", 466 | " container_uri=IMAGE_URI,\n", 467 | " command=[\"python3\", \"/train.py\"],\n", 468 | " staging_bucket=BUCKET_URI\n", 469 | ")" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "id": "e25bc7b2-5382-4d1b-83f0-d70fad83d0c0", 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "# run the job\n", 480 | "job_response = job.run(\n", 481 | " replica_count=1,\n", 482 | " machine_type='cloud-tpu',\n", 483 | " accelerator_type='TPU_V2',\n", 484 | " accelerator_count=8,\n", 485 | " base_output_dir=f'{BUCKET_URI}/tpu-experiments/{APP_NAME}/'\n", 486 | ")" 487 | ] 488 | } 489 | ], 490 | "metadata": { 491 | "environment": { 492 | "kernel": "tpu-gke", 493 | "name": "tf2-gpu.2-7.m87", 494 | "type": "gcloud", 495 | "uri": "gcr.io/deeplearning-platform-release/tf2-gpu.2-7:m87" 496 | }, 497 | "kernelspec": { 498 | "display_name": "tpu-gke", 499 | "language": "python", 500 | "name": "tpu-gke" 501 | }, 502 | "language_info": { 503 | "codemirror_mode": { 504 | "name": "ipython", 505 | "version": 3 506 | }, 507 | "file_extension": ".py", 508 | "mimetype": "text/x-python", 509 | "name": "python", 510 | "nbconvert_exporter": "python", 511 | "pygments_lexer": "ipython3", 512 | "version": "3.9.13" 513 | } 514 | }, 515 | "nbformat": 4, 516 | "nbformat_minor": 5 517 | } 518 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Vertex AI Labs 2 | --- 3 | 4 | **Hands-on labs introducing GCP Vertex AI features** 5 | 6 | These labs introduce the following components of Vertex AI: 7 | 8 | - Vertex Notebooks 9 | - Vertex AI Training 10 | - Using pre-built and custom containers 11 | - Hyperparameter tuning 12 | - Distributed Training 13 | - Vertex AI Predictions 14 | - Using pre-built and custom containers 15 | - Vertex Tensorboard 16 | - Vertex ML Metadata 17 | 18 | ![Labs Focus](./images/vertex-ai-labs-focus.png) 19 | 20 | 21 | ## Environment Setup 22 | 23 | The following section describes the GCP environment required for the workshop. Note that we have provided example [Terraform](https://www.terraform.io/) scripts to automate the process. You can find the scripts and the instructions in the `00-env-setup` folder. These are prerequisites for running the labs. 24 | 25 | ### GCP Project 26 | 27 | Each participant should have their own GCP project (through Qwiklabs) with project owner permissions to complete the setup steps. 28 | 29 | The setup performs the following tasks: 30 | 31 | 1. Activate the Google Cloud APIs required for the labs. 32 | 2. Create the service accounts required for running the labs. 33 | 3. Create a Google Cloud Storage bucket in the configured region (we will be using `us-central1`). 34 | 4. Create a Vertex Notebooks instance to provision a managed JupyterLab notebook instance. 35 | 5. Create a Vertex Tensorboard instance to monitor the experiments run as part of the lab. 36 | 37 | --- 38 | 39 | Please navigate to [00-env-setup](./00-env-setup/README.md) to set up the environment. 40 | 41 | --- 42 | 43 | The following are the details of the setup required to run the labs: 44 | 45 | ### Cloud APIs 46 | 47 | The following APIs need to be enabled in the project: 48 | 49 | - compute.googleapis.com 50 | - iam.googleapis.com 51 | - container.googleapis.com 52 | - artifactregistry.googleapis.com 53 | - cloudresourcemanager.googleapis.com 54 | - cloudtrace.googleapis.com 55 | - iamcredentials.googleapis.com 56 | - monitoring.googleapis.com 57 | - logging.googleapis.com 58 | - notebooks.googleapis.com 59 | - aiplatform.googleapis.com 60 | - dataflow.googleapis.com 61 | - bigquery.googleapis.com 62 | - cloudbuild.googleapis.com 63 | - bigquerydatatransfer.googleapis.com 64 | 65 | ### GCP Region 66 | 67 | Note that some services used in the notebooks are only available in a limited number of regions. We recommend using `us-central1`. 68 | 69 | ### Service accounts 70 | 71 | Two service accounts must be created in the project. 72 | 73 | #### Vertex AI training service account 74 | 75 | This account will be used by the Vertex Training service. The account needs the following roles: 76 | 77 | - storage.admin 78 | - aiplatform.user 79 | - bigquery.admin 80 | 81 | The account email should be 82 | 83 | `training-sa@{PROJECT_ID}.iam.gserviceaccount.com` 84 | 85 | #### Vertex AI pipelines service account 86 | 87 | This account will be used by the Vertex Pipelines service. The account needs the following roles: 88 | 89 | - storage.admin 90 | - aiplatform.user 91 | - bigquery.admin 92 | 93 | The account email should be 94 | 95 | `pipelines-sa@{PROJECT_ID}.iam.gserviceaccount.com` 96 | 97 | ### GCS buckets 98 | 99 | Each participant should have their own regional GCS bucket. The bucket should be created in the GCP region that will be used during the workshop. The bucket name should use the following naming convention: 100 | 101 | `gs://{PREFIX}-bucket` 102 | 103 | The goal of the prefix is to avoid conflicts between participants, so it should be unique for each participant. **The prefix should start with a letter and include only letters and digits.** 104 | 105 | The workshop notebooks assume this naming convention.
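If you are not using the provided Terraform scripts, a bucket following this convention can also be created by hand. The command below is a minimal sketch, assuming `PROJECT_ID`, `REGION`, and `PREFIX` are already set in your shell:

```
# Sketch: create the regional workshop bucket by hand
# (assumes PROJECT_ID, REGION and PREFIX are already set, e.g. PREFIX=alice123)
gsutil mb -p $PROJECT_ID -l $REGION gs://$PREFIX-bucket
```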
106 | 107 | 108 | ### Vertex AI Notebook 109 | 110 | Each participant should have an instance of Vertex AI Notebooks. The instances can be pre-created or created during the workshop. 111 | 112 | The instance should be configured as follows: 113 | 114 | - Machine type: **n1-standard-4** 115 | - Optionally, GPUs can be added to the machine configuration if participants want to experiment with GPUs 116 | - Image family: **tf-2-4-cpu** or **tf-2-4-cu110** (if using GPUs) 117 | - Configured with the default Compute Engine service account 118 | 119 | #### Vertex AI Notebook setup 120 | 121 | The following setup steps will be performed during the workshop, individually by each participant. 122 | 123 | In JupyterLab, open a terminal and: 124 | 125 | ##### Install the required Python packages 126 | 127 | ``` 128 | pip install --user google-cloud-aiplatform 129 | pip install --user kfp 130 | pip install --user google-cloud-pipeline-components 131 | pip install --user google-cloud-bigquery-datatransfer 132 | ``` 133 | 134 | ##### Vertex Tensorboard instance 135 | 136 | Each project will have its own Vertex Tensorboard instance created (by the script) in the configured region. 137 | 138 | You can get the Tensorboard instance names at any time by listing the Tensorboards in the project: 139 | 140 | ``` 141 | gcloud beta ai tensorboards list \ 142 |     --project $PROJECT --region $REGION 143 | ``` 144 | 145 | ##### Clone this repo 146 | ``` 147 | git clone https://github.com/RajeshThallam/vertex-ai-labs 148 | ``` 149 | 150 | 151 | ## References: 152 | 153 | - https://github.com/jarokaz/vertex-ai-workshop/ 154 | -------------------------------------------------------------------------------- /images/automl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/images/automl.png -------------------------------------------------------------------------------- /images/custom-tabular.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/images/custom-tabular.png -------------------------------------------------------------------------------- /images/custom-training-on-vertex-ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/images/custom-training-on-vertex-ai.png -------------------------------------------------------------------------------- /images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/images/pipeline.png -------------------------------------------------------------------------------- /images/serving-with-custom-containers-on-vertex-predictions.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/images/serving-with-custom-containers-on-vertex-predictions.png -------------------------------------------------------------------------------- /images/training-with-custom-containers-on-vertex-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/images/training-with-custom-containers-on-vertex-training.png -------------------------------------------------------------------------------- /images/vertex-ai-labs-focus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/images/vertex-ai-labs-focus.png --------------------------------------------------------------------------------