├── .gitignore ├── 00-env-setup ├── README.md └── terraform │ ├── archive │ ├── artifact_registry.tf │ └── service_accounts.tf │ ├── caip-notebook.tf │ ├── gcs-bucket.tf │ ├── main.tf │ ├── services.tf │ ├── terraform.tfvars │ ├── variables.tf │ └── vertex-tensorboard.tf ├── 01-automl-tabular └── 01-vertex-automl-tabular-training-prediction.ipynb ├── 02-custom-job-tabular ├── 02-vertex-custom-job-tabular-training-prediction.ipynb ├── instances.json ├── model.png ├── predictor │ ├── Dockerfile │ └── model │ │ ├── saved_model.pb │ │ └── variables │ │ ├── variables.data-00000-of-00001 │ │ └── variables.index └── trainer │ └── Dockerfile ├── 03-distributed-training-text ├── 03-distributed-training-vertex-ai-bert-finetuning.ipynb └── scripts │ ├── Dockerfile │ └── trainer │ ├── __init__.py │ └── task.py ├── 04-vertex-pipeline-and-airflow ├── data_orchestration_bq_example_dag.py ├── get_composer_config.py ├── images │ ├── airflow_dag.png │ ├── airflow_dag_run.png │ ├── airflow_webserver_with_dag.png │ ├── pipeline_run.png │ └── trigger-airflow-dag-on-cloud-composer-from-vertex-pipeline.png └── vertex-pipeline-airflow.ipynb ├── 05-vertex-event-based-model-deploy ├── 05-event-based-vertex-model-deployment.ipynb └── images │ └── event_based_model_deployment.png ├── 06-vertex-train-deploy-r-model ├── images │ └── serving-with-custom-containers-on-vertex-predictions.png ├── vertex-train-deploy-r-model.ipynb └── vertex_ai_pipelines_r_model.ipynb ├── 07-vertex-train-deploy-lightgbm ├── images │ ├── serving-with-custom-containers-on-vertex-predictions.png │ └── training-with-custom-containers-on-vertex-training.png └── vertex-train-deploy-lightgbm-model.ipynb ├── 08-pytorch-distributed ├── README.md ├── pytorch_cifar10_distributed_reduction_server_sdk_custom.ipynb └── pytorch_cifar10_distributed_sdk_custom_batch_predictions.ipynb ├── 09-distributed-xgboost-dask ├── Dockerfile ├── README.md ├── train.py └── train.sh ├── 10-serving-ensemble-triton ├── models │ ├── combine │ │ ├── 1 │ │ │ └── model.py │ │ └── config.pbtxt │ ├── ensemble │ │ ├── 1 │ │ │ └── empty │ │ └── config.pbtxt │ ├── mux │ │ ├── 1 │ │ │ └── model.py │ │ └── config.pbtxt │ ├── sci_1 │ │ ├── 1 │ │ │ ├── checkpoint.tl │ │ │ └── sci_1.pkl │ │ └── config.pbtxt │ ├── sci_2 │ │ ├── 1 │ │ │ ├── checkpoint.tl │ │ │ └── sci_2.pkl │ │ └── config.pbtxt │ ├── tf │ │ ├── 1 │ │ │ └── model.savedmodel │ │ │ │ ├── saved_model.pb │ │ │ │ └── variables │ │ │ │ ├── variables.data-00000-of-00001 │ │ │ │ └── variables.index │ │ └── config.pbtxt │ └── xgb │ │ ├── 1 │ │ └── xgboost.json │ │ └── config.pbtxt ├── notebooks │ └── ensemble-nvidia-triton-custom-container-prediction.ipynb └── src │ ├── combine │ └── model.py │ ├── generate │ └── generate.py │ ├── mux │ └── model.py │ ├── sci_1 │ └── sci_1.py │ ├── sci_2 │ └── sci_2.py │ ├── test │ ├── combine_01.py │ ├── ensemble_01.py │ ├── mux.py │ ├── sci_1.py │ ├── sci_2.py │ ├── tf_01.py │ └── xgb_01.py │ ├── tf │ ├── tf.py │ └── tf.py_0 │ └── xgb │ └── xgb.py ├── 11-pytorch-on-tpu-vertex-ai └── pytorch-on-vertex-ai-tpu-train-mnist.ipynb ├── LICENSE ├── README.md └── images ├── automl.png ├── custom-tabular.png ├── custom-training-on-vertex-ai.png ├── pipeline.png ├── serving-with-custom-containers-on-vertex-predictions.png ├── training-with-custom-containers-on-vertex-training.png └── vertex-ai-labs-focus.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C 
extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /00-env-setup/README.md: -------------------------------------------------------------------------------- 1 | # Creating a Vertex environment 2 | 3 | You can use the [Terraform](https://www.terraform.io/) scripts in the `terraform` folder to automatically provision the environment required by the samples. 4 | 5 | The scripts perform the following actions: 6 | 1. Enable the required Cloud APIs 7 | 2. Create a regional GCS bucket 8 | 3. Create an instance of Vertex Notebooks 9 | 4. Create service accounts for Vertex Training and Vertex Pipelines 10 | 5. 
Create an instance of Vertex Tensorboard 11 | 12 | ## Provision Environment 13 | 14 | To provision the environment: 15 | 16 | - Open Cloud Shell 17 | 18 | [![Open in Cloud Shell](https://gstatic.com/cloudssh/images/open-btn.svg)](https://ssh.cloud.google.com/cloudshell/editor?cloudshell_git_repo=https://github.com/RajeshThallam/vertex-ai-labs.git) 19 | 20 | 21 | - Navigate to `~/vertex-ai-labs/00-env-setup/terraform` 22 | ``` 23 | LOCAL_DIR=~/vertex-ai-labs 24 | cd $LOCAL_DIR/00-env-setup/terraform 25 | ``` 26 | 27 | - Update the `terraform.tfvars` file with values reflecting your environment (refer to [Customize Configuration](#customize-configuration) for details on each variable). Alternatively, you can provide the values using the Terraform CLI `-var` options when you execute `terraform apply` in the next step. 28 | 29 | - Execute the following commands: 30 | ``` 31 | terraform init 32 | terraform apply 33 | ``` 34 | 35 | - To destroy the environment, execute: 36 | ``` 37 | terraform destroy 38 | ``` 39 | 40 | ## Customize Configuration 41 | 42 | You can customize your configuration using the following variables in `terraform.tfvars`: 43 | 44 | |Variable|Required|Default|Description| 45 | |--------|--------|-------|-----------| 46 | |name_prefix|Yes||Prefix added to the names of provisioned resources. **The prefix should start with a letter and include letters and digits only**.| 47 | |project_id|Yes||GCP project ID| 48 | |network_name|No|default|Name of the network for the Notebook instance. The network must already exist.| 49 | |subnet_name|No|default|Name of the subnet for the Notebook instance. The subnet must already exist.| 50 | |subnet_region|No|us-central1|Region where the subnet was created.| 51 | |zone|Yes||GCP zone for the Notebook instance. The zone must be in the region defined in the `subnet_region` variable.| 52 | |machine_type|No|n1-standard-4|Machine type of the Notebook instance| 53 | |boot_disk_size|No|200GB|Size of the Notebook instance's boot disk| 54 | |image_family|No|tf-2-4-cpu|Image family for the Notebook instance| 55 | |gpu_type|No|null|GPU type of the Notebook instance. By default, the Notebook instance will be provisioned without a GPU| 56 | |gpu_count|No|null|GPU count of the Notebook instance| 57 | |install_gpu_driver|No|false|Whether to install a GPU driver| 58 | |region|No|Set to subnet_region.|GCP region for the GCS bucket and Artifact Registry. It is recommended to use the same region for the bucket, the registry, and the Notebook instance. If not provided, `region` defaults to `subnet_region`.| 59 | |force_destroy|No|false|Whether to force the removal of the bucket on `terraform destroy`. **Note that by default the bucket will not be destroyed**.| -------------------------------------------------------------------------------- /00-env-setup/terraform/archive/artifact_registry.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | resource "google_artifact_registry_repository" "docker-repo" { 17 | provider = google-beta 18 | project = module.project-services.project_id 19 | location = local.region 20 | repository_id = "${var.name_prefix}-registry" 21 | description = "Docker repository" 22 | format = "DOCKER" 23 | } -------------------------------------------------------------------------------- /00-env-setup/terraform/archive/service_accounts.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Create Vertex Training service account 16 | resource "google_service_account" "training_sa" { 17 | project = module.project-services.project_id 18 | account_id = var.training_sa_name 19 | display_name = "Vertex Training service account" 20 | } 21 | 22 | # Create Vertex Training SA role bindings 23 | resource "google_project_iam_member" "training_sa_role_bindings" { 24 | project = module.project-services.project_id 25 | for_each = toset(var.training_sa_roles) 26 | member = "serviceAccount:${google_service_account.training_sa.email}" 27 | role = "roles/${each.value}" 28 | } 29 | 30 | # Create Vertex Pipelines service account 31 | resource "google_service_account" "pipelines_sa" { 32 | project = module.project-services.project_id 33 | account_id = var.pipelines_sa_name 34 | display_name = "Vertex Pipelines account name" 35 | } 36 | 37 | # Create Vertex Pipelines SA role bindings 38 | resource "google_project_iam_member" "role_bindings" { 39 | project = module.project-services.project_id 40 | for_each = toset(var.pipelines_sa_roles) 41 | member = "serviceAccount:${google_service_account.pipelines_sa.email}" 42 | role = "roles/${each.value}" 43 | } 44 | -------------------------------------------------------------------------------- /00-env-setup/terraform/caip-notebook.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | locals { 16 | image_project = "deeplearning-platform-release" 17 | } 18 | 19 | data "google_compute_network" "vm_network" { 20 | project = module.project-services.project_id 21 | name = var.network_name 22 | 23 | depends_on = [ 24 | module.project-services 25 | ] 26 | } 27 | 28 | data "google_compute_subnetwork" "vm_subnetwork" { 29 | project = module.project-services.project_id 30 | name = var.subnet_name 31 | region = var.subnet_region 32 | 33 | depends_on = [ 34 | module.project-services 35 | ] 36 | } 37 | 38 | resource "google_notebooks_instance" "notebook_instance" { 39 | project = module.project-services.project_id 40 | name = "${var.name_prefix}-notebook" 41 | machine_type = var.machine_type 42 | location = var.zone 43 | 44 | network = data.google_compute_network.vm_network.id 45 | subnet = data.google_compute_subnetwork.vm_subnetwork.id 46 | 47 | vm_image { 48 | project = local.image_project 49 | image_family = var.image_family 50 | } 51 | 52 | dynamic accelerator_config { 53 | for_each = var.gpu_type != null ? [1] : [] 54 | content { 55 | type = var.gpu_type 56 | core_count = var.gpu_count 57 | } 58 | } 59 | 60 | install_gpu_driver = var.install_gpu_driver 61 | 62 | boot_disk_size_gb = var.boot_disk_size 63 | } 64 | -------------------------------------------------------------------------------- /00-env-setup/terraform/gcs-bucket.tf: -------------------------------------------------------------------------------- 1 | 2 | # Copyright 2021 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | resource "google_storage_bucket" "artifact_repo" { 18 | project = module.project-services.project_id 19 | name = "${var.name_prefix}-${var.project_id}-bucket" 20 | location = local.region 21 | storage_class = local.bucket_type 22 | force_destroy = var.force_destroy 23 | } -------------------------------------------------------------------------------- /00-env-setup/terraform/main.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | terraform { 16 | required_version = ">= 0.14" 17 | required_providers { 18 | google = "~> 3.6" 19 | } 20 | 21 | # backend "gcs" { 22 | # bucket = "jk-terraform-state" 23 | # prefix = "vertex-ai-env" 24 | # } 25 | } 26 | 27 | provider "google" { 28 | project = var.project_id 29 | } 30 | 31 | 32 | data "google_project" "project" { 33 | project_id = var.project_id 34 | } 35 | 36 | locals { 37 | bucket_type = "REGIONAL" 38 | region = var.region == null ? var.subnet_region : var.region 39 | } 40 | 41 | 42 | -------------------------------------------------------------------------------- /00-env-setup/terraform/services.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | module "project-services" { 17 | source = "terraform-google-modules/project-factory/google//modules/project_services" 18 | 19 | project_id = data.google_project.project.project_id 20 | 21 | disable_services_on_destroy = false 22 | activate_apis = [ 23 | "compute.googleapis.com", 24 | "iam.googleapis.com", 25 | "container.googleapis.com", 26 | "artifactregistry.googleapis.com", 27 | "cloudresourcemanager.googleapis.com", 28 | "cloudtrace.googleapis.com", 29 | "iamcredentials.googleapis.com", 30 | "monitoring.googleapis.com", 31 | "logging.googleapis.com", 32 | "notebooks.googleapis.com", 33 | "aiplatform.googleapis.com", 34 | "dataflow.googleapis.com", 35 | "bigquery.googleapis.com", 36 | "cloudbuild.googleapis.com", 37 | "bigquerydatatransfer.googleapis.com", 38 | ] 39 | } 40 | 41 | output "api_activated" { 42 | value = true 43 | } -------------------------------------------------------------------------------- /00-env-setup/terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | project_id = "CHANGE THIS WITH YOUR PROJECT ID e.g. from Qwiklabs console" 2 | subnet_region = "us-central1" 3 | zone = "us-central1-a" 4 | name_prefix = "CHANGE THIS WITH YOUR STRING e.g. your initials" 5 | machine_type = "n1-standard-8" 6 | #gpu_type = "NVIDIA_TESLA_T4" 7 | #gpu_count = 1 8 | #install_gpu_driver = true 9 | #image_family = "tf-2-4-gpu" 10 | 11 | 12 | -------------------------------------------------------------------------------- /00-env-setup/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | 2 | # Copyright 2021 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | variable "project_id" { 18 | description = "The GCP project ID" 19 | type = string 20 | } 21 | 22 | variable "region" { 23 | description = "The region for the GCS bucket and Artifact Registry" 24 | type = string 25 | default = null 26 | } 27 | 28 | variable "zone" { 29 | description = "The zone for a Vertex Notebook instance" 30 | type = string 31 | } 32 | 33 | variable "name_prefix" { 34 | description = "The name prefix to add to the resource names" 35 | type = string 36 | } 37 | 38 | variable "machine_type" { 39 | description = "The Notebook instance's machine type" 40 | type = string 41 | } 42 | 43 | variable "network_name" { 44 | description = "The network name for the Notebook instance" 45 | type = string 46 | default = "default" 47 | } 48 | 49 | variable "subnet_name" { 50 | description = "The subnet name for the Notebook instance" 51 | type = string 52 | default = "default" 53 | } 54 | 55 | variable "subnet_region" { 56 | description = "The region for the Notebook subnet" 57 | type = string 58 | default = "us-central1" 59 | } 60 | 61 | variable "boot_disk_size" { 62 | description = "The size of the boot disk" 63 | default = 200 64 | } 65 | 66 | variable "image_family" { 67 | description = "A Deep Learning image family for the Notebook instance" 68 | type = string 69 | default = "tf-2-4-cpu" 70 | } 71 | 72 | variable "gpu_type" { 73 | description = "A GPU type for the Notebook instance" 74 | type = string 75 | default = null 76 | } 77 | 78 | variable "gpu_count" { 79 | description = "A GPU count for the Notebook instance" 80 | type = string 81 | default = null 82 | } 83 | 84 | variable "install_gpu_driver" { 85 | description = "Whether to install GPU driver" 86 | type = bool 87 | default = false 88 | } 89 | 90 | variable "force_destroy" { 91 | description = "Whether to remove the bucket on destroy" 92 | type = bool 93 | default = false 94 | } 95 | 96 | variable "training_sa_roles" { 97 | description = "The roles to assign to the Vertex Training service account" 98 | default = [ 99 | "storage.admin", 100 | "aiplatform.user", 101 | "bigquery.admin" 102 | ] 103 | } 104 | 105 | variable "pipelines_sa_roles" { 106 | description = "The roles to assign to the Vertex Pipelines service account" 107 | default = [ 108 | "storage.admin", 109 | "bigquery.admin", 110 | "aiplatform.user" 111 | ] 112 | } 113 | 114 | variable "training_sa_name" { 115 | description = "Vertex training service account name." 116 | default = "training-sa" 117 | } 118 | 119 | variable "pipelines_sa_name" { 120 | description = "Vertex pipelines service account name." 121 | default = "pipelines-sa" 122 | } 123 | -------------------------------------------------------------------------------- /00-env-setup/terraform/vertex-tensorboard.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | module tensorboard { 16 | source = "terraform-google-modules/gcloud/google" 17 | version = "~> 2.0" 18 | 19 | platform = "linux" 20 | 21 | create_cmd_entrypoint = "printf 'yes' | gcloud" 22 | create_cmd_body = "beta ai tensorboards create --display-name ${var.name_prefix}-${var.subnet_region}-tensorboard --project ${var.project_id} --region ${var.subnet_region}" 23 | destroy_cmd_entrypoint = "printf 'yes' | gcloud" 24 | destroy_cmd_body = "beta ai tensorboards delete $(gcloud beta ai tensorboards list --region ${var.subnet_region} --filter='displayName:${var.name_prefix}-${var.subnet_region}-tensorboard' --format='value(name)' --project ${var.project_id})" 25 | 26 | depends_on = [module.project-services.api_activated] 27 | } -------------------------------------------------------------------------------- /01-automl-tabular/01-vertex-automl-tabular-training-prediction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Copyright 2020 Google LLC\n", 10 | "#\n", 11 | "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", 12 | "# you may not use this file except in compliance with the License.\n", 13 | "# You may obtain a copy of the License at\n", 14 | "#\n", 15 | "# https://www.apache.org/licenses/LICENSE-2.0\n", 16 | "#\n", 17 | "# Unless required by applicable law or agreed to in writing, software\n", 18 | "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", 19 | "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 20 | "# See the License for the specific language governing permissions and\n", 21 | "# limitations under the License." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Training and deploying a tabular model using Vertex AutoML.\n", 29 | "\n", 30 | "![Training pipeline](../images/automl.png)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Install required packages" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# Get the site-packages directory so we can remove invalid packages.\n", 47 | "import site\n", 48 | "sp = site.getsitepackages()[0]\n", 49 | "print(sp)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "%%bash -s \"$sp\"\n", 59 | "# Remove the invalide site-packages\n", 60 | "echo $1\n", 61 | "sudo rm -rf $1/~*" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "%%bash\n", 71 | "pip install --user google-cloud-aiplatform --upgrade\n", 72 | "pip install --user kfp --upgrade\n", 73 | "pip install --user google-cloud-pipeline-components --upgrade\n", 74 | "pip install --user google-cloud-bigquery-datatransfer --upgrade" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "\n", 82 | "### Restart the kernel\n", 83 | "Once you've installed the required packages, you need to restart the notebook kernel so it can find the packages." 
84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# Automatically restart kernel after installs\n", 93 | "import IPython\n", 94 | "app = IPython.Application.instance()\n", 95 | "app.kernel.do_shutdown(True)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## Import the required packages" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "import os\n", 112 | "import pprint\n", 113 | "import pandas as pd\n", 114 | "import tensorflow as tf\n", 115 | "import time\n", 116 | "from datetime import datetime\n", 117 | "import csv\n", 118 | "\n", 119 | "import matplotlib.pyplot as plt\n", 120 | "\n", 121 | "import google.auth\n", 122 | "\n", 123 | "from google.cloud import aiplatform as vertex_ai\n", 124 | "from google.cloud.aiplatform_v1beta1 import types\n", 125 | "from google.cloud import bigquery\n", 126 | "from google.cloud import exceptions\n", 127 | "\n", 128 | "from tensorflow.keras import layers\n", 129 | "from tensorflow.keras.layers.experimental import preprocessing\n", 130 | "\n", 131 | "from tensorflow_io import bigquery as tfio_bq\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "## Configure GCP settings\n", 139 | "\n", 140 | "*Before running the notebook make sure to follow the repo's README file to install the pre-requisites and configure GCP authentication.*" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "creds, PROJECT = google.auth.default()\n", 150 | "print(creds)\n", 151 | "REGION = 'us-central1'\n", 152 | "\n", 153 | "STAGING_BUCKET = f'gs://{PROJECT}-labs'\n", 154 | "\n", 155 | "# Get the configured service account this notebook is running as\n", 156 | "bash_output = !gcloud config list account --format \"value(core.account)\" 2> /dev/null\n", 157 | "VERTEX_SA = bash_output[0]\n", 158 | "\n", 159 | "print(f\"PROJECT = {PROJECT}\")\n", 160 | "print(f\"STAGING_BUCKET = {STAGING_BUCKET}\")\n", 161 | "print(f\"VERTEX_SA = {VERTEX_SA}\")\n", 162 | "\n", 163 | "# Create the bucket. 
Ignore error if it already exists.\n", 164 | "!gsutil mb -l $REGION $STAGING_BUCKET" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "## Preparing training data in BigQuery" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "### Explore Chicago Taxi dataset" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "%%bigquery data\n", 188 | "\n", 189 | "SELECT \n", 190 | " *\n", 191 | "FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`\n", 192 | "LIMIT 3" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "data.head().T" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "%%bigquery data\n", 211 | "\n", 212 | "SELECT \n", 213 | " CAST(EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS string) AS trip_dayofweek, \n", 214 | " FORMAT_DATE('%A',cast(trip_start_timestamp as date)) AS trip_dayname,\n", 215 | " COUNT(*) as trip_count,\n", 216 | "FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`\n", 217 | "WHERE\n", 218 | " EXTRACT(YEAR FROM trip_start_timestamp) = 2020 \n", 219 | "GROUP BY\n", 220 | " trip_dayofweek,\n", 221 | " trip_dayname\n", 222 | "ORDER BY\n", 223 | " trip_dayofweek" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "data" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "data.plot(kind='bar', x='trip_dayname', y='trip_count')" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "### Create data splits" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "BQ_DATASET_NAME = f'vertex_lab01' \n", 258 | "BQ_TABLE_NAME = 'features'\n", 259 | "BQ_LOCATION = 'US'\n", 260 | "SAMPLE_SIZE = 500000\n", 261 | "YEAR = 2020" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "#### Create a BQ dataset to host the splits" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "client = bigquery.Client()\n", 278 | "\n", 279 | "dataset_id = f'{PROJECT}.{BQ_DATASET_NAME}'\n", 280 | "dataset = bigquery.Dataset(dataset_id)\n", 281 | "dataset.location = BQ_LOCATION\n", 282 | "\n", 283 | "try:\n", 284 | " dataset = client.create_dataset(dataset, timeout=30)\n", 285 | " print('Created dataset: ', dataset_id)\n", 286 | "except exceptions.Conflict:\n", 287 | " print('Dataset {} already exists'.format(dataset_id))" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "#### Create a table with training features" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "sample_size = 1000000\n", 304 | "year = 2020\n", 305 | "\n", 306 | "sql_script_template = '''\n", 307 | "CREATE OR REPLACE TABLE `@PROJECT.@DATASET.@TABLE` \n", 308 | "AS (\n", 309 | " WITH\n", 310 | " taxitrips 
AS (\n", 311 | " SELECT\n", 312 | " FORMAT_DATETIME('%Y-%d-%m', trip_start_timestamp) AS date,\n", 313 | " trip_start_timestamp,\n", 314 | " trip_seconds,\n", 315 | " trip_miles,\n", 316 | " payment_type,\n", 317 | " pickup_longitude,\n", 318 | " pickup_latitude,\n", 319 | " dropoff_longitude,\n", 320 | " dropoff_latitude,\n", 321 | " tips,\n", 322 | " fare\n", 323 | " FROM\n", 324 | " `bigquery-public-data.chicago_taxi_trips.taxi_trips`\n", 325 | " WHERE 1=1 \n", 326 | " AND pickup_longitude IS NOT NULL\n", 327 | " AND pickup_latitude IS NOT NULL\n", 328 | " AND dropoff_longitude IS NOT NULL\n", 329 | " AND dropoff_latitude IS NOT NULL\n", 330 | " AND trip_miles > 0\n", 331 | " AND trip_seconds > 0\n", 332 | " AND fare > 0\n", 333 | " AND EXTRACT(YEAR FROM trip_start_timestamp) = @YEAR\n", 334 | " )\n", 335 | "\n", 336 | " SELECT\n", 337 | " trip_start_timestamp,\n", 338 | " EXTRACT(MONTH from trip_start_timestamp) as trip_month,\n", 339 | " EXTRACT(DAY from trip_start_timestamp) as trip_day,\n", 340 | " EXTRACT(DAYOFWEEK from trip_start_timestamp) as trip_day_of_week,\n", 341 | " EXTRACT(HOUR from trip_start_timestamp) as trip_hour,\n", 342 | " trip_seconds,\n", 343 | " trip_miles,\n", 344 | " payment_type,\n", 345 | " ST_AsText(\n", 346 | " ST_SnapToGrid(ST_GeogPoint(pickup_longitude, pickup_latitude), 0.1)\n", 347 | " ) AS pickup_grid,\n", 348 | " ST_AsText(\n", 349 | " ST_SnapToGrid(ST_GeogPoint(dropoff_longitude, dropoff_latitude), 0.1)\n", 350 | " ) AS dropoff_grid,\n", 351 | " ST_Distance(\n", 352 | " ST_GeogPoint(pickup_longitude, pickup_latitude), \n", 353 | " ST_GeogPoint(dropoff_longitude, dropoff_latitude)\n", 354 | " ) AS euclidean,\n", 355 | " IF((tips/fare >= 0.2), 1, 0) AS tip_bin,\n", 356 | " CASE (ABS(MOD(FARM_FINGERPRINT(date),10))) \n", 357 | " WHEN 9 THEN 'TEST'\n", 358 | " WHEN 8 THEN 'VALIDATE'\n", 359 | " ELSE 'TRAIN' END AS data_split\n", 360 | " FROM\n", 361 | " taxitrips\n", 362 | " LIMIT @LIMIT\n", 363 | ")\n", 364 | "'''\n", 365 | "\n", 366 | "sql_script = sql_script_template.replace(\n", 367 | " '@PROJECT', PROJECT).replace(\n", 368 | " '@DATASET', BQ_DATASET_NAME).replace(\n", 369 | " '@TABLE', BQ_TABLE_NAME).replace(\n", 370 | " '@YEAR', str(year)).replace(\n", 371 | " '@LIMIT', str(sample_size))\n", 372 | "\n", 373 | "job = client.query(sql_script)\n", 374 | "job.result()" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "#### Review the created features" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "sql_script = f'''\n", 391 | "SELECT * EXCEPT (trip_start_timestamp)\n", 392 | "FROM `{PROJECT}.{BQ_DATASET_NAME}.{BQ_TABLE_NAME}`\n", 393 | "'''\n", 394 | "df = client.query(sql_script).result().to_dataframe()" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "df.head().T" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "## Creating a tabular dataset in Vertex" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "### Initialize Vertex AI SDK" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "vertex_ai.init(\n", 427 | " project=PROJECT,\n", 428 | " location=REGION,\n", 429 | " 
staging_bucket=STAGING_BUCKET\n", 430 | ")" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "### Create a dataset and import data" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "display_name = 'Chicago taxi trips'\n", 447 | "bq_source_uri = f'bq://{PROJECT}.{BQ_DATASET_NAME}.{BQ_TABLE_NAME}'\n", 448 | "\n", 449 | "filter = f'display_name=\"{display_name}\"'\n", 450 | "\n", 451 | "dataset = vertex_ai.TabularDataset.list(filter=filter)\n", 452 | "if not dataset:\n", 453 | " print(\"Creating a new dataset.\")\n", 454 | " dataset = vertex_ai.TabularDataset.create(\n", 455 | " display_name=display_name, bq_source=bq_source_uri,\n", 456 | " )\n", 457 | "\n", 458 | " dataset.wait()\n", 459 | "else:\n", 460 | " print(\"Using existing dataset: \", dataset[0].resource_name)\n", 461 | " dataset = vertex_ai.TabularDataset(dataset_name=dataset[0].resource_name)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": {}, 467 | "source": [ 468 | "## Launching an AutoML training job" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "display_name = 'Chicago Taxi classifier training'\n", 478 | "model_display_name = 'Chicago Taxi classifier'\n", 479 | "target_column = 'tip_bin'\n", 480 | "optimization_prediction_type = 'classification'\n", 481 | "optimization_objective = 'maximize-recall-at-precision'\n", 482 | "optimization_objective_precision_value = 0.7\n", 483 | "split_column = 'data_split'\n", 484 | "budget_milli_node_hours = 1000\n", 485 | "\n", 486 | "column_transformations = [\n", 487 | " {'categorical': {'column_name': 'trip_month'}},\n", 488 | " {'categorical': {'column_name': 'trip_day'}},\n", 489 | " {'categorical': {'column_name': 'trip_day_of_week'}},\n", 490 | " {'categorical': {'column_name': 'trip_hour'}},\n", 491 | " {'categorical': {'column_name': 'payment_type'}},\n", 492 | " {'categorical': {'column_name': 'pickup_grid'}},\n", 493 | " {'categorical': {'column_name': 'dropoff_grid'}},\n", 494 | " {'numeric': {'column_name': 'trip_seconds'}},\n", 495 | " {'numeric': {'column_name': 'euclidean'}},\n", 496 | " {'numeric': {'column_name': 'trip_miles'}},\n", 497 | "]\n", 498 | "\n", 499 | "job = vertex_ai.AutoMLTabularTrainingJob(\n", 500 | " display_name=display_name,\n", 501 | " optimization_prediction_type=optimization_prediction_type,\n", 502 | " optimization_objective=optimization_objective,\n", 503 | " optimization_objective_precision_value=optimization_objective_precision_value,\n", 504 | " column_transformations=column_transformations,\n", 505 | ")\n", 506 | "\n", 507 | "model = job.run(\n", 508 | " dataset=dataset,\n", 509 | " target_column=target_column,\n", 510 | " budget_milli_node_hours=budget_milli_node_hours,\n", 511 | " model_display_name=model_display_name,\n", 512 | " predefined_split_column_name=split_column,\n", 513 | " sync=False\n", 514 | ")" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [ 523 | "print(f\"Job Name: {job.display_name}\")\n", 524 | "print(f\"Job Resource Name: {job.resource_name}\\n\")\n", 525 | "print(f\"Check training progress at {job._dashboard_uri()}\")" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "metadata": {}, 532 | "outputs": [], 533 | 
"source": [ 534 | "# This blocks until the model is finished training.\n", 535 | "# This takes about 70min, given 1000 milli_node_hours budget (1 hour, paid for),\n", 536 | "# plus 10min startup times (free).\n", 537 | "\n", 538 | "model.wait()\n", 539 | "print(f\"Job Name: {model.display_name}\")" 540 | ] 541 | }, 542 | { 543 | "cell_type": "markdown", 544 | "metadata": {}, 545 | "source": [ 546 | "## Deploy Model\n" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": null, 552 | "metadata": {}, 553 | "outputs": [], 554 | "source": [ 555 | "endpoint = model.deploy(machine_type=\"n1-standard-4\", sync=False)" 556 | ] 557 | }, 558 | { 559 | "cell_type": "markdown", 560 | "metadata": {}, 561 | "source": [ 562 | "## Model Deployment\n", 563 | "\n", 564 | "Now deploy the trained Vertex Model resource for batch and online prediction.\n", 565 | "\n", 566 | "For online prediction, you:\n", 567 | "\n", 568 | "- Create an Endpoint resource for deploying the Model resource to.\n", 569 | "- Deploy the Model resource to the Endpoint resource.\n", 570 | "- Make online prediction requests to the Endpoint resource.\n", 571 | "\n", 572 | "For batch-prediction, you:\n", 573 | "\n", 574 | "- Create a batch prediction job.\n", 575 | "- The job service will provision resources for the batch prediction request.\n", 576 | "- The results of the batch prediction request are returned to the caller.\n", 577 | "- The job service will unprovision the resoures for the batch prediction request." 578 | ] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "metadata": {}, 583 | "source": [ 584 | "### Predict on Endpoint - Online Prediction" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "job.state" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": null, 599 | "metadata": {}, 600 | "outputs": [], 601 | "source": [ 602 | "# Block until the endpoint is deployed, which takes about 12min.\n", 603 | "endpoint.wait()" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "metadata": {}, 610 | "outputs": [], 611 | "source": [ 612 | "test_instances = [ \n", 613 | " \n", 614 | " {\n", 615 | " \"dropoff_grid\": \"POINT(-87.6 41.9)\",\n", 616 | " \"euclidean\": 2064.2696,\n", 617 | " \"payment_type\": \"Credit Card\",\n", 618 | " \"pickup_grid\": \"POINT(-87.6 41.9)\",\n", 619 | " \"trip_miles\": 1.37,\n", 620 | " \"trip_day\": \"12\",\n", 621 | " \"trip_hour\": \"16\",\n", 622 | " \"trip_month\": \"2\",\n", 623 | " \"trip_day_of_week\": \"4\",\n", 624 | " \"trip_seconds\": \"555\"\n", 625 | " }\n", 626 | "]\n", 627 | "\n", 628 | "predictions = endpoint.predict(instances=test_instances)" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": null, 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [ 637 | "predictions" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": {}, 644 | "outputs": [], 645 | "source": [ 646 | "predictions = endpoint.predict(instances=test_instances)\n", 647 | "predictions" 648 | ] 649 | }, 650 | { 651 | "cell_type": "markdown", 652 | "metadata": {}, 653 | "source": [ 654 | "### Batch Prediction Job\n", 655 | "\n", 656 | "Now do a batch prediction to your deployed model." 
657 | ] 658 | }, 659 | { 660 | "cell_type": "markdown", 661 | "metadata": {}, 662 | "source": [ 663 | "#### Make test items" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": null, 669 | "metadata": {}, 670 | "outputs": [], 671 | "source": [ 672 | "sql_script = f'''\n", 673 | "SELECT trip_month, trip_day, trip_day_of_week, trip_hour, payment_type, pickup_grid, dropoff_grid, trip_seconds, euclidean, trip_miles\n", 674 | "FROM `{PROJECT}.{BQ_DATASET_NAME}.{BQ_TABLE_NAME}`\n", 675 | "LIMIT 1000\n", 676 | "'''\n", 677 | "\n", 678 | "dtypes = {\n", 679 | " 'dropoff_grid': str,\n", 680 | " 'euclidean': 'float64',\n", 681 | " 'trip_month': str,\n", 682 | " 'trip_day': str,\n", 683 | " 'trip_day_of_week': str,\n", 684 | " 'trip_hour': str,\n", 685 | " 'payment_type': str,\n", 686 | " 'pickup_grid': str,\n", 687 | " 'trip_seconds': str,\n", 688 | " 'trip_miles': 'float64'\n", 689 | "}\n", 690 | "\n", 691 | "df_test_batch = client.query(sql_script).result().to_dataframe(dtypes=dtypes)" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": null, 697 | "metadata": {}, 698 | "outputs": [], 699 | "source": [ 700 | "df_test_batch.head()" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": null, 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [ 709 | "df_test_batch.dtypes" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": null, 715 | "metadata": {}, 716 | "outputs": [], 717 | "source": [ 718 | "out_file_name = \"bq_export_features_test.csv\"\n", 719 | "gcs_batch_request_csv = f'{STAGING_BUCKET}/test/batch/{out_file_name}'\n", 720 | "df_test_batch.to_csv(f'{STAGING_BUCKET}/test/batch/bq_export_features_test.csv',\n", 721 | " header=True, \n", 722 | " index=False,\n", 723 | " quoting=csv.QUOTE_NONNUMERIC,\n", 724 | " escapechar=\"\\\\\",\n", 725 | " doublequote=False\n", 726 | " )" 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "execution_count": null, 732 | "metadata": {}, 733 | "outputs": [], 734 | "source": [ 735 | "!gsutil cat $gcs_batch_request_csv | head" 736 | ] 737 | }, 738 | { 739 | "cell_type": "markdown", 740 | "metadata": {}, 741 | "source": [ 742 | "#### Make the batch prediction request\n", 743 | "\n", 744 | "Now that your Model resource is trained, you can make a batch prediction by invoking the batch_request() method, with the following parameters:\n", 745 | "\n", 746 | "- `job_display_name`: The human readable name for the batch prediction job.\n", 747 | "- `gcs_source`: A list of one or more batch request input files.\n", 748 | "- `gcs_destination_prefix`: The Cloud Storage location for storing the batch prediction resuls.\n", 749 | "- `sync`: If set to True, the call will block while waiting for the asynchronous batch job to complete." 
750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": null, 755 | "metadata": {}, 756 | "outputs": [], 757 | "source": [ 758 | "TIMESTAMP = datetime.now().strftime(\"%Y%m%d%H%M%S\")" 759 | ] 760 | }, 761 | { 762 | "cell_type": "code", 763 | "execution_count": null, 764 | "metadata": {}, 765 | "outputs": [], 766 | "source": [ 767 | "batch_predict_job = model.batch_predict(\n", 768 | " job_display_name=f\"{model_display_name}-batch-{TIMESTAMP}\",\n", 769 | " gcs_source=gcs_batch_request_csv,\n", 770 | " instances_format=\"csv\",\n", 771 | " gcs_destination_prefix=f'{STAGING_BUCKET}/test/batch_results/',\n", 772 | " predictions_format=\"csv\",\n", 773 | " sync=False\n", 774 | ")\n", 775 | "\n", 776 | "print(batch_predict_job)" 777 | ] 778 | }, 779 | { 780 | "cell_type": "markdown", 781 | "metadata": {}, 782 | "source": [ 783 | "#### Wait for completion of batch prediction job\n", 784 | "Next, wait for the batch job to complete." 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": null, 790 | "metadata": {}, 791 | "outputs": [], 792 | "source": [ 793 | "# Blocks while the batch prediction job is running, which takes about 18min.\n", 794 | "batch_predict_job.wait()" 795 | ] 796 | }, 797 | { 798 | "cell_type": "markdown", 799 | "metadata": {}, 800 | "source": [ 801 | "#### Get the predictions\n", 802 | "Next, get the results from the completed batch prediction job.\n", 803 | "\n", 804 | "The results are written to the Cloud Storage output bucket you specified in the batch prediction request. You call the method iter_outputs() to get a list of each Cloud Storage file generated with the results. Each file contains one or more prediction requests in a JSON format:\n", 805 | "\n", 806 | "- `content`: The prediction request.\n", 807 | "- `prediction`: The prediction response.\n", 808 | " - `ids`: The internal assigned unique identifiers for each prediction request.\n", 809 | " - `displayNames`: The class names for each class label.\n", 810 | " - `confidences`: The predicted confidence, between 0 and 1, per class label." 811 | ] 812 | }, 813 | { 814 | "cell_type": "markdown", 815 | "metadata": {}, 816 | "source": [ 817 | "---\n", 818 | "\n", 819 | "**NOTE: There is issue with batch prediction job where input data types are not matching with model inputs. 
Skip the section below if you hit into issues**\n", 820 | "\n", 821 | "---" 822 | ] 823 | }, 824 | { 825 | "cell_type": "code", 826 | "execution_count": null, 827 | "metadata": {}, 828 | "outputs": [], 829 | "source": [ 830 | "bp_iter_outputs = batch_predict_job.iter_outputs()\n", 831 | "\n", 832 | "prediction_results = list()\n", 833 | "for blob in bp_iter_outputs:\n", 834 | " if blob.name.split(\"/\")[-1].startswith(\"prediction\"):\n", 835 | " prediction_results.append(blob.name)" 836 | ] 837 | }, 838 | { 839 | "cell_type": "code", 840 | "execution_count": null, 841 | "metadata": {}, 842 | "outputs": [], 843 | "source": [ 844 | "tags = list()\n", 845 | "for prediction_result in prediction_results:\n", 846 | " gfile_name = f\"gs://{bp_iter_outputs.bucket.name}/{prediction_result}\"\n", 847 | " with tf.io.gfile.GFile(name=gfile_name, mode=\"r\") as gfile:\n", 848 | " for line in gfile.readlines():\n", 849 | " print(line)" 850 | ] 851 | }, 852 | { 853 | "cell_type": "markdown", 854 | "metadata": {}, 855 | "source": [ 856 | "## Clean up" 857 | ] 858 | }, 859 | { 860 | "cell_type": "markdown", 861 | "metadata": {}, 862 | "source": [ 863 | "### Undeploy Models\n", 864 | "When you are done doing predictions, you undeploy the Model resource from the Endpoint resouce. This deprovisions all compute resources and ends billing for the deployed model." 865 | ] 866 | }, 867 | { 868 | "cell_type": "code", 869 | "execution_count": null, 870 | "metadata": {}, 871 | "outputs": [], 872 | "source": [ 873 | "endpoint.list_models()" 874 | ] 875 | }, 876 | { 877 | "cell_type": "code", 878 | "execution_count": null, 879 | "metadata": {}, 880 | "outputs": [], 881 | "source": [ 882 | "endpoint.undeploy_all()" 883 | ] 884 | }, 885 | { 886 | "cell_type": "markdown", 887 | "metadata": {}, 888 | "source": [ 889 | "### Delete Endpoint" 890 | ] 891 | }, 892 | { 893 | "cell_type": "code", 894 | "execution_count": null, 895 | "metadata": {}, 896 | "outputs": [], 897 | "source": [ 898 | "endpoint.delete()" 899 | ] 900 | }, 901 | { 902 | "cell_type": "markdown", 903 | "metadata": {}, 904 | "source": [ 905 | "### Delete Model" 906 | ] 907 | }, 908 | { 909 | "cell_type": "code", 910 | "execution_count": null, 911 | "metadata": {}, 912 | "outputs": [], 913 | "source": [ 914 | "model.delete()" 915 | ] 916 | } 917 | ], 918 | "metadata": { 919 | "colab": { 920 | "collapsed_sections": [], 921 | "name": "AI_Platform_(Unified)_SDK_AutoML_Image_Classification_Training.ipynb", 922 | "toc_visible": true 923 | }, 924 | "environment": { 925 | "name": "tf2-gpu.2-5.m74", 926 | "type": "gcloud", 927 | "uri": "gcr.io/deeplearning-platform-release/tf2-gpu.2-5:m74" 928 | }, 929 | "kernelspec": { 930 | "display_name": "Python [conda env:root] *", 931 | "language": "python", 932 | "name": "conda-root-py" 933 | }, 934 | "language_info": { 935 | "codemirror_mode": { 936 | "name": "ipython", 937 | "version": 3 938 | }, 939 | "file_extension": ".py", 940 | "mimetype": "text/x-python", 941 | "name": "python", 942 | "nbconvert_exporter": "python", 943 | "pygments_lexer": "ipython3", 944 | "version": "3.7.10" 945 | } 946 | }, 947 | "nbformat": 4, 948 | "nbformat_minor": 4 949 | } 950 | -------------------------------------------------------------------------------- /02-custom-job-tabular/instances.json: -------------------------------------------------------------------------------- 1 | {"instances" : [ 2 | { 3 | "dropoff_grid": ["POINT(-87.6 41.9)"], 4 | "euclidean": [2064.2696], 5 | "payment_type": ["Credit Card"], 6 | "pickup_grid": ["POINT(-87.6 
41.9)"], 7 | "trip_miles": [1.37], 8 | "trip_day": [12], 9 | "trip_hour": [16], 10 | "trip_month": [2], 11 | "trip_day_of_week": [4], 12 | "trip_seconds": [555] 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /02-custom-job-tabular/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/02-custom-job-tabular/model.png -------------------------------------------------------------------------------- /02-custom-job-tabular/predictor/Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM tensorflow/serving:2.4.0 3 | 4 | # Set where models should be stored in the container 5 | ENV MODEL_BASE_PATH=/models 6 | ENV MODEL_NAME=model 7 | 8 | RUN mkdir -p ${MODEL_BASE_PATH}/${MODEL_NAME}/1 9 | 10 | # copy the model file 11 | COPY model ${MODEL_BASE_PATH}/${MODEL_NAME}/1/ 12 | 13 | # Create a script that runs the model server so we can use environment variables 14 | # while also passing in arguments from the docker command line 15 | RUN echo '#!/bin/bash \n\n\ 16 | tensorflow_model_server --port=5000 --rest_api_port=8080 \ 17 | --model_name=${MODEL_NAME} --model_base_path=${MODEL_BASE_PATH}/${MODEL_NAME} \ 18 | "$@"' > /usr/bin/predictor.sh \ 19 | && chmod +x /usr/bin/predictor.sh 20 | 21 | EXPOSE 5000 22 | EXPOSE 8080 23 | 24 | # Remove entrypoint from parent image 25 | ENTRYPOINT [] 26 | 27 | CMD ["/usr/bin/predictor.sh"] 28 | -------------------------------------------------------------------------------- /02-custom-job-tabular/predictor/model/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/02-custom-job-tabular/predictor/model/saved_model.pb -------------------------------------------------------------------------------- /02-custom-job-tabular/predictor/model/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/02-custom-job-tabular/predictor/model/variables/variables.data-00000-of-00001 -------------------------------------------------------------------------------- /02-custom-job-tabular/predictor/model/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/02-custom-job-tabular/predictor/model/variables/variables.index -------------------------------------------------------------------------------- /02-custom-job-tabular/trainer/Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-4 3 | 4 | WORKDIR /trainer 5 | RUN pip install cloudml-hypertune 6 | 7 | # Copies the trainer code to the docker image. 8 | COPY train.py . 
9 | 10 | ENTRYPOINT ["python", "train.py"] 11 | -------------------------------------------------------------------------------- /03-distributed-training-text/scripts/Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-4 3 | 4 | RUN pip install tf-models-official==2.4.0 tensorflow-text==2.4.1 5 | 6 | WORKDIR / 7 | 8 | # Copies the trainer code to the docker image. 9 | COPY trainer /trainer 10 | 11 | # Sets up the entry point to invoke the trainer. 12 | ENTRYPOINT ["python", "-m", "trainer.task"] 13 | -------------------------------------------------------------------------------- /03-distributed-training-text/scripts/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/03-distributed-training-text/scripts/trainer/__init__.py -------------------------------------------------------------------------------- /03-distributed-training-text/scripts/trainer/task.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright 2021 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | 16 | import os 17 | import tensorflow as tf 18 | import tensorflow_hub as hub 19 | import tensorflow_text as text 20 | 21 | from absl import app 22 | from absl import flags 23 | from absl import logging 24 | from official.nlp import optimization 25 | 26 | 27 | TFHUB_HANDLE_ENCODER = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3' 28 | TFHUB_HANDLE_PREPROCESS = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3' 29 | LOCAL_TB_FOLDER = '/tmp/logs' 30 | LOCAL_SAVED_MODEL_DIR = '/tmp/saved_model' 31 | 32 | FLAGS = flags.FLAGS 33 | flags.DEFINE_integer('steps_per_epoch', 625, 'Steps per training epoch') 34 | flags.DEFINE_integer('eval_steps', 150, 'Evaluation steps') 35 | flags.DEFINE_integer('epochs', 2, 'Number of epochs') 36 | flags.DEFINE_integer('per_replica_batch_size', 32, 'Per replica batch size') 37 | flags.DEFINE_string('training_data_path', f'/bert-finetuning/imdb/tfrecords/train', 'Training data GCS path') 38 | flags.DEFINE_string('validation_data_path', f'/bert-finetuning/imdb/tfrecords/valid', 'Validation data GCS path') 39 | flags.DEFINE_string('testing_data_path', f'/bert-finetuning/imdb/tfrecords/test', 'Testing data GCS path') 40 | 41 | flags.DEFINE_string('job_dir', f'/jobs', 'A base GCS path for jobs') 42 | flags.DEFINE_enum('strategy', 'multiworker', ['mirrored', 'multiworker'], 'Distribution strategy') 43 | flags.DEFINE_enum('auto_shard_policy', 'auto', ['auto', 'data', 'file', 'off'], 'Dataset sharding strategy') 44 | 45 | 46 | 47 | auto_shard_policy = { 48 | 'auto': tf.data.experimental.AutoShardPolicy.AUTO, 49 | 'data': tf.data.experimental.AutoShardPolicy.DATA, 50 | 'file': tf.data.experimental.AutoShardPolicy.FILE, 51 | 'off': 
tf.data.experimental.AutoShardPolicy.OFF, 52 | } 53 | 54 | 55 | def create_unbatched_dataset(tfrecords_folder): 56 | """Creates an unbatched dataset in the format required by the 57 | sentiment analysis model from the folder with TFrecords files.""" 58 | 59 | feature_description = { 60 | 'text_fragment': tf.io.FixedLenFeature([], tf.string, default_value=''), 61 | 'label': tf.io.FixedLenFeature([], tf.int64, default_value=0), 62 | } 63 | 64 | def _parse_function(example_proto): 65 | parsed_example = tf.io.parse_single_example(example_proto, feature_description) 66 | return parsed_example['text_fragment'], parsed_example['label'] 67 | 68 | file_paths = [f'{tfrecords_folder}/{file_path}' for file_path in tf.io.gfile.listdir(tfrecords_folder)] 69 | dataset = tf.data.TFRecordDataset(file_paths) 70 | dataset = dataset.map(_parse_function) 71 | 72 | return dataset 73 | 74 | 75 | def configure_dataset(ds, auto_shard_policy): 76 | """ 77 | Optimizes the performance of a dataset. 78 | """ 79 | 80 | options = tf.data.Options() 81 | options.experimental_distribute.auto_shard_policy = ( 82 | auto_shard_policy 83 | ) 84 | 85 | ds = ds.repeat(-1).cache() 86 | ds = ds.prefetch(buffer_size=tf.data.AUTOTUNE) 87 | ds = ds.with_options(options) 88 | return ds 89 | 90 | 91 | def create_input_pipelines(train_dir, valid_dir, test_dir, batch_size, auto_shard_policy): 92 | """Creates input pipelines from Imdb dataset.""" 93 | 94 | train_ds = create_unbatched_dataset(train_dir) 95 | train_ds = train_ds.batch(batch_size) 96 | train_ds = configure_dataset(train_ds, auto_shard_policy) 97 | 98 | valid_ds = create_unbatched_dataset(valid_dir) 99 | valid_ds = valid_ds.batch(batch_size) 100 | valid_ds = configure_dataset(valid_ds, auto_shard_policy) 101 | 102 | test_ds = create_unbatched_dataset(test_dir) 103 | test_ds = test_ds.batch(batch_size) 104 | test_ds = configure_dataset(test_ds, auto_shard_policy) 105 | 106 | return train_ds, valid_ds, test_ds 107 | 108 | 109 | def build_classifier_model(tfhub_handle_preprocess, tfhub_handle_encoder): 110 | """Builds a simple binary classification model with BERT trunk.""" 111 | 112 | text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text') 113 | preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing') 114 | encoder_inputs = preprocessing_layer(text_input) 115 | encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder') 116 | outputs = encoder(encoder_inputs) 117 | net = outputs['pooled_output'] 118 | net = tf.keras.layers.Dropout(0.1)(net) 119 | net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net) 120 | 121 | return tf.keras.Model(text_input, net) 122 | 123 | 124 | def copy_tensorboard_logs(local_path: str, gcs_path: str): 125 | """Copies Tensorboard logs from a local dir to a GCS location. 126 | 127 | After training, batch copy Tensorboard logs locally to a GCS location. This can result 128 | in faster pipeline runtimes over streaming logs per batch to GCS that can get bottlenecked 129 | when streaming large volumes. 130 | 131 | Args: 132 | local_path: local filesystem directory uri. 133 | gcs_path: cloud filesystem directory uri. 134 | Returns: 135 | None. 
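    Example (illustrative paths only):
        copy_tensorboard_logs('/tmp/logs', 'gs://my-bucket/jobs/run-1/tb_logs')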
136 | """ 137 | pattern = '{}/*/events.out.tfevents.*'.format(local_path) 138 | local_files = tf.io.gfile.glob(pattern) 139 | gcs_log_files = [local_file.replace(local_path, gcs_path) for local_file in local_files] 140 | for local_file, gcs_file in zip(local_files, gcs_log_files): 141 | tf.io.gfile.copy(local_file, gcs_file) 142 | 143 | 144 | def main(argv): 145 | del argv 146 | 147 | def _is_chief(task_type, task_id): 148 | return ((task_type == 'chief' or task_type == 'worker') and task_id == 0) or task_type is None 149 | 150 | 151 | logging.info('Setting up training.') 152 | logging.info(' epochs: {}'.format(FLAGS.epochs)) 153 | logging.info(' steps_per_epoch: {}'.format(FLAGS.steps_per_epoch)) 154 | logging.info(' eval_steps: {}'.format(FLAGS.eval_steps)) 155 | logging.info(' strategy: {}'.format(FLAGS.strategy)) 156 | 157 | tb_dir = os.getenv('AIP_TENSORBOARD_LOG_DIR', LOCAL_TB_FOLDER) 158 | model_dir = os.getenv('AIP_MODEL_DIR', LOCAL_MODEL_DIR) 159 | 160 | if FLAGS.strategy == 'mirrored': 161 | strategy = tf.distribute.MirroredStrategy() 162 | else: 163 | strategy = tf.distribute.MultiWorkerMirroredStrategy() 164 | 165 | if strategy.cluster_resolver: 166 | task_type, task_id = (strategy.cluster_resolver.task_type, 167 | strategy.cluster_resolver.task_id) 168 | else: 169 | task_type, task_id =(None, None) 170 | 171 | 172 | global_batch_size = (strategy.num_replicas_in_sync * 173 | FLAGS.per_replica_batch_size) 174 | 175 | 176 | train_ds, valid_ds, test_ds = create_input_pipelines( 177 | FLAGS.training_data_path, 178 | FLAGS.validation_data_path, 179 | FLAGS.testing_data_path, 180 | global_batch_size, 181 | auto_shard_policy[FLAGS.auto_shard_policy]) 182 | 183 | num_train_steps = FLAGS.steps_per_epoch * FLAGS.epochs 184 | num_warmup_steps = int(0.1*num_train_steps) 185 | init_lr = 3e-5 186 | 187 | with strategy.scope(): 188 | model = build_classifier_model(TFHUB_HANDLE_PREPROCESS, TFHUB_HANDLE_ENCODER) 189 | loss = tf.keras.losses.BinaryCrossentropy(from_logits=True) 190 | metrics = tf.metrics.BinaryAccuracy() 191 | optimizer = optimization.create_optimizer( 192 | init_lr=init_lr, 193 | num_train_steps=num_train_steps, 194 | num_warmup_steps=num_warmup_steps, 195 | optimizer_type='adamw') 196 | 197 | model.compile(optimizer=optimizer, 198 | loss=loss, 199 | metrics=metrics) 200 | 201 | # Configure BackupAndRestore callback 202 | backup_dir = '{}/backupandrestore'.format(FLAGS.job_dir) 203 | callbacks = [tf.keras.callbacks.experimental.BackupAndRestore(backup_dir=backup_dir)] 204 | 205 | # Configure TensorBoard callback on Chief 206 | if _is_chief(task_type, task_id): 207 | callbacks.append(tf.keras.callbacks.TensorBoard( 208 | log_dir=tb_dir, update_freq='batch')) 209 | 210 | logging.info('Starting training ...') 211 | 212 | history = model.fit(x=train_ds, 213 | validation_data=valid_ds, 214 | steps_per_epoch=FLAGS.steps_per_epoch, 215 | validation_steps=FLAGS.eval_steps, 216 | epochs=FLAGS.epochs, 217 | callbacks=callbacks) 218 | 219 | if _is_chief(task_type, task_id): 220 | # Copy tensorboard logs to GCS 221 | # tb_logs = '{}/tb_logs'.format(FLAGS.job_dir) 222 | # logging.info('Copying TensorBoard logs to: {}'.format(tb_logs)) 223 | # copy_tensorboard_logs(LOCAL_TB_FOLDER, tb_logs) 224 | saved_model_dir = '{}/saved_model'.format(model_dir) 225 | else: 226 | saved_model_dir = model_dir 227 | 228 | # Save trained model 229 | saved_model_dir = '{}/saved_model'.format(model_dir) 230 | logging.info('Training completed. 
Saving the trained model to: {}'.format(saved_model_dir)) 231 | model.save(saved_model_dir) 232 | #tf.saved_model.save(model, saved_model_dir) 233 | 234 | 235 | if __name__ == '__main__': 236 | logging.set_verbosity(logging.INFO) 237 | app.run(main) 238 | -------------------------------------------------------------------------------- /04-vertex-pipeline-and-airflow/data_orchestration_bq_example_dag.py: -------------------------------------------------------------------------------- 1 | 2 | """An example Composer workflow integrating GCS and BigQuery. 3 | 4 | A .csv is read from a GCS bucket to a BigQuery table; a query is made, and the 5 | result is written back to a different BigQuery table within a new dataset. 6 | """ 7 | 8 | from datetime import datetime, timedelta 9 | from airflow import DAG 10 | from airflow.contrib.operators.bigquery_operator import BigQueryOperator 11 | from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator 12 | from airflow.operators.bash_operator import BashOperator 13 | 14 | YESTERDAY = datetime.combine( 15 | datetime.today() - timedelta(days=1), datetime.min.time()) 16 | BQ_DATASET_NAME = 'bq_demos' 17 | 18 | default_args = { 19 | 'owner': 'airflow', 20 | 'depends_on_past': False, 21 | 'start_date': YESTERDAY, 22 | 'email_on_failure': False, 23 | 'email_on_retry': False, 24 | 'retries': 1, 25 | 'retry_delay': timedelta(minutes=5), 26 | } 27 | 28 | # Solution: pass a schedule_interval argument to DAG instantiation. 29 | with DAG('dag_gcs_to_bq_orch', default_args=default_args, 30 | schedule_interval=None) as dag: 31 | create_bq_dataset_if_not_exist = """ 32 | bq ls {0} 33 | if [ $? -ne 0 ]; then 34 | bq mk {0} 35 | fi 36 | """.format(BQ_DATASET_NAME) 37 | 38 | # Create destination dataset. 39 | t1 = BashOperator( 40 | task_id='create_destination_dataset', 41 | bash_command=create_bq_dataset_if_not_exist, 42 | dag=dag) 43 | 44 | # Create a bigquery table from a .csv file located in a GCS bucket 45 | # (gs://example-datasets/game_data_condensed.csv). 46 | # Store it in our dataset. 47 | t2 = GoogleCloudStorageToBigQueryOperator( 48 | task_id='gcs_to_bq', 49 | bucket='example-datasets', 50 | source_objects=['game_data_condensed.csv'], 51 | destination_project_dataset_table='{0}.composer_game_data_table' 52 | .format(BQ_DATASET_NAME), 53 | schema_fields=[ 54 | {'name': 'name', 'type': 'string', 'mode': 'nullable'}, 55 | {'name': 'team', 'type': 'string', 'mode': 'nullable'}, 56 | {'name': 'total_score', 'type': 'integer', 'mode': 'nullable'}, 57 | {'name': 'timestamp', 'type': 'integer', 'mode': 'nullable'}, 58 | {'name': 'window_start', 'type': 'string', 'mode': 'nullable'}, 59 | ], 60 | write_disposition='WRITE_TRUNCATE') 61 | 62 | # Run example query (http://shortn/_BdF1UTEYOb) and save result to the 63 | # destination table. 
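    # Note: `bql` below is the legacy BigQueryOperator parameter; newer Airflow releases
    # rename it to `sql`, and `use_legacy_sql=False` would be needed to run standard SQL.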
64 | t3 = BigQueryOperator( 65 | task_id='bq_example_query', 66 | bql=f""" 67 | SELECT 68 | name, team, total_score 69 | FROM 70 | {BQ_DATASET_NAME}.composer_game_data_table 71 | WHERE total_score > 15 72 | LIMIT 100; 73 | """, 74 | destination_dataset_table='{0}.gcp_example_query_result' 75 | .format(BQ_DATASET_NAME), 76 | write_disposition='WRITE_TRUNCATE') 77 | 78 | t1 >> t2 >> t3 79 | -------------------------------------------------------------------------------- /04-vertex-pipeline-and-airflow/get_composer_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Get the client ID associated with a Cloud Composer environment.""" 16 | 17 | import argparse 18 | 19 | 20 | def get_client_id(project_id, location, composer_environment): 21 | # [START composer_get_environment_client_id] 22 | import google.auth 23 | import google.auth.transport.requests 24 | import requests 25 | import six.moves.urllib.parse 26 | 27 | # Authenticate with Google Cloud. 28 | # See: https://cloud.google.com/docs/authentication/getting-started 29 | credentials, _ = google.auth.default( 30 | scopes=['https://www.googleapis.com/auth/cloud-platform']) 31 | authed_session = google.auth.transport.requests.AuthorizedSession( 32 | credentials) 33 | 34 | # project_id = 'YOUR_PROJECT_ID' 35 | # location = 'us-central1' 36 | # composer_environment = 'YOUR_COMPOSER_ENVIRONMENT_NAME' 37 | 38 | environment_url = ( 39 | 'https://composer.googleapis.com/v1beta1/projects/{}/locations/{}' 40 | '/environments/{}').format(project_id, location, composer_environment) 41 | composer_response = authed_session.request('GET', environment_url) 42 | environment_data = composer_response.json() 43 | airflow_uri = environment_data['config']['airflowUri'] 44 | print(airflow_uri) 45 | dag_gcs_prefix = environment_data['config']['dagGcsPrefix'] 46 | print(dag_gcs_prefix) 47 | 48 | # The Composer environment response does not include the IAP client ID. 49 | # Make a second, unauthenticated HTTP request to the web server to get the 50 | # redirect URI. 51 | redirect_response = requests.get(airflow_uri, allow_redirects=False) 52 | redirect_location = redirect_response.headers['location'] 53 | 54 | # Extract the client_id query parameter from the redirect. 
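    # The unauthenticated request gets redirected to the IAP sign-in flow, so the
    # Location header looks roughly like this (illustrative value only):
    #   https://accounts.google.com/o/oauth2/v2/auth?client_id=1234567890-abc.apps.googleusercontent.com&...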
55 | parsed = six.moves.urllib.parse.urlparse(redirect_location) 56 | query_string = six.moves.urllib.parse.parse_qs(parsed.query) 57 | print(query_string['client_id'][0]) 58 | # [END composer_get_environment_client_id] 59 | 60 | 61 | # Usage: python get_client_id.py your_project_id your_region your_environment_name 62 | if __name__ == '__main__': 63 | parser = argparse.ArgumentParser( 64 | description=__doc__, 65 | formatter_class=argparse.RawDescriptionHelpFormatter) 66 | parser.add_argument('project_id', help='Your Project ID.') 67 | parser.add_argument( 68 | 'location', help='Region of the Cloud Composer environment.') 69 | parser.add_argument( 70 | 'composer_environment', help='Name of the Cloud Composer environment.') 71 | 72 | args = parser.parse_args() 73 | get_client_id( 74 | args.project_id, args.location, args.composer_environment) 75 | -------------------------------------------------------------------------------- /04-vertex-pipeline-and-airflow/images/airflow_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/04-vertex-pipeline-and-airflow/images/airflow_dag.png -------------------------------------------------------------------------------- /04-vertex-pipeline-and-airflow/images/airflow_dag_run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/04-vertex-pipeline-and-airflow/images/airflow_dag_run.png -------------------------------------------------------------------------------- /04-vertex-pipeline-and-airflow/images/airflow_webserver_with_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/04-vertex-pipeline-and-airflow/images/airflow_webserver_with_dag.png -------------------------------------------------------------------------------- /04-vertex-pipeline-and-airflow/images/pipeline_run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/04-vertex-pipeline-and-airflow/images/pipeline_run.png -------------------------------------------------------------------------------- /04-vertex-pipeline-and-airflow/images/trigger-airflow-dag-on-cloud-composer-from-vertex-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/04-vertex-pipeline-and-airflow/images/trigger-airflow-dag-on-cloud-composer-from-vertex-pipeline.png -------------------------------------------------------------------------------- /04-vertex-pipeline-and-airflow/vertex-pipeline-airflow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Copyright 2021 Google LLC\n", 10 | "\n", 11 | "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", 12 | "# you may not use this file except in compliance with the License.\n", 13 | "# You may obtain a copy of the License at\n", 14 | "\n", 15 | "# 
https://www.apache.org/licenses/LICENSE-2.0\n", 16 | "\n", 17 | "# Unless required by applicable law or agreed to in writing, software\n", 18 | "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", 19 | "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 20 | "# See the License for the specific language governing permissions and\n", 21 | "# limitations under the License." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Trigger Airflow DAG in Cloud Composer from a Vertex Pipeline\n", 29 | "\n", 30 | "Apache Airflow is most popular choice for data pipelining in general. However, arguably not a good choice to run Machine learning pipelines due to lack of ML metadata tracking, artifact lineage, tracking ML metrics across metrics etc. [Vertex Pipelines](https://cloud.google.com/vertex-ai/docs/pipelines/introduction) solves this problem and automates, monitors, and governs your ML systems by orchestrating your ML workflow in a serverless manner, and storing your workflow's artifacts using Vertex ML Metadata.\n", 31 | "\n", 32 | "In this notebook, we will show you how you can trigger a data pipeline i.e. Airflow DAG on Cloud Composer from a ML pipeline running on Vertex Pipelines.\n", 33 | "\n", 34 | "![Trigger Airflow DAG on Cloud Composer from Vertex Pipeline](images/trigger-airflow-dag-on-cloud-composer-from-vertex-pipeline.png)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "Following are high level steps:\n", 42 | "\n", 43 | "1. Create Cloud Composer environment\n", 44 | "2. Upload Airflow DAG to Composer environment that performs data processing\n", 45 | "3. Create a Vertex Pipeline that triggers the Airflow DAG" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Installing packages\n", 53 | "\n", 54 | "Start with installing KFP SDK and Google Cloud Pipeline components in the environment" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "USER_FLAG = \"--user\"" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "!pip3 install {USER_FLAG} google-cloud-aiplatform==1.0.0 --upgrade\n", 73 | "!pip3 install {USER_FLAG} kfp google-cloud-pipeline-components==0.1.1 --upgrade" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "After installing these packages you'll need to restart the kernel:" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "import os\n", 90 | "\n", 91 | "if not os.getenv(\"IS_TESTING\"):\n", 92 | " # Automatically restart kernel after installs\n", 93 | " import IPython\n", 94 | "\n", 95 | " app = IPython.Application.instance()\n", 96 | " app.kernel.do_shutdown(True)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "Finally, check that you have correctly installed the packages. 
The KFP SDK version should be >=1.6:" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "!python3 -c \"import kfp; print('KFP SDK version: {}'.format(kfp.__version__))\"\n", 113 | "!python3 -c \"import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))\"" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "### Set your project ID and bucket" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Throughout this notebook you'll reference your Cloud project ID and the bucket you created earlier. Next we'll create variables for each of those.\n", 128 | "\n", 129 | "If you don't know your project ID, you may be able to get it by running the following:" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "import google.auth\n", 139 | "\n", 140 | "creds, PROJECT_ID = google.auth.default()\n", 141 | "REGION = 'us-central1'" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "Otherwise, set it here:" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "if PROJECT_ID == \"\" or PROJECT_ID is None:\n", 158 | " PROJECT_ID = \"your-project-id\" # @param {type:\"string\"}" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "print(f\"PROJECT_ID = {PROJECT_ID}\")\n", 168 | "print(f\"REGION = {REGION}\")" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "Then create a variable to store your bucket name, and create the bucket if it does not exist already." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "BUCKET_NAME = \"gs://\" + \"cloud-ai-platform-2f444b6a-a742-444b-b91a-c7519f51bd77\"" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "# run only if the bucket does not exist already\n", 194 | "!gsutil mb -l $REGION $BUCKET_NAME" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "## Create Composer Environment\n", 202 | "\n", 203 | "Please follow the instructions in the [document](https://cloud.google.com/composer/docs/how-to/managing/creating#) to create a Composer environment with the configuration you need. For this sample demonstration, we create a bare-minimum Composer environment. \n", 204 | "\n", 205 | "To trigger an Airflow DAG from a Vertex Pipeline, we will use the Airflow web server REST API. By default, the API authentication feature is disabled in Airflow 1.10.11 and above, which denies all requests made to the Airflow web server. To allow the pipeline to trigger the DAG, we enable this feature by overriding the `auth_backend` configuration of the Composer environment to `airflow.api.auth.backend.default`.\n", 206 | "\n", 207 | "**NOTE:** Cloud Composer environment creation may take up to 30 min. Grab your favorite beverage until then."
208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "COMPOSER_ENV_NAME = \"test-composer-env\"\n", 217 | "ZONE = \"us-central1-f\"" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "!gcloud beta composer environments create $COMPOSER_ENV_NAME \\\n", 227 | " --location $REGION \\\n", 228 | " --zone $ZONE\\\n", 229 | " --machine-type n1-standard-2 \\\n", 230 | " --image-version composer-latest-airflow-1.10.15 \\\n", 231 | " --airflow-configs=api-auth_backend=airflow.api.auth.backend.default" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "### Get Composer Environment configuration\n", 239 | "\n", 240 | "We will get Composer environment configuration such as webserver URL and client ID to use in the Vertex Pipeline using the script `get_composer_client_id.py`" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "# This code is modified version of https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/composer/rest/get_client_id.py\n", 250 | "\n", 251 | "shell_output=!python3 get_composer_config.py $PROJECT_ID $REGION $COMPOSER_ENV_NAME\n", 252 | "COMPOSER_WEB_URI = shell_output[0]\n", 253 | "COMPOSER_DAG_GCS = shell_output[1]\n", 254 | "COMPOSER_CLIENT_ID = shell_output[2]\n", 255 | "\n", 256 | "print(f\"COMPOSER_WEB_URI = {COMPOSER_WEB_URI}\")\n", 257 | "print(f\"COMPOSER_DAG_GCS = {COMPOSER_DAG_GCS}\")\n", 258 | "print(f\"COMPOSER_CLIENT_ID = {COMPOSER_CLIENT_ID}\")" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "You can navigate to Airflow webserver by going to this URL" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "COMPOSER_WEB_URI" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "## Upload DAG to Cloud Composer environment\n", 282 | "\n", 283 | "We have a sample data processing DAG `data_orchestration_bq_example_dag.py` that reads a CSV file from GCS bucket and writes to BigQuery. We will add this file to the GCS bucket configure for the Composer environment that Airflow watches." 
284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "COMPOSER_DAG_NAME = \"dag_gcs_to_bq_orch\"\n", 293 | "COMPOSER_DAG_FILENAME = \"data_orchestration_bq_example_dag.py\"" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "%%writefile $COMPOSER_DAG_FILENAME\n", 303 | "\n", 304 | "\"\"\"An example Composer workflow integrating GCS and BigQuery.\n", 305 | "\n", 306 | "A .csv is read from a GCS bucket to a BigQuery table; a query is made, and the\n", 307 | "result is written back to a different BigQuery table within a new dataset.\n", 308 | "\"\"\"\n", 309 | "\n", 310 | "from datetime import datetime, timedelta\n", 311 | "from airflow import DAG\n", 312 | "from airflow.contrib.operators.bigquery_operator import BigQueryOperator\n", 313 | "from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator\n", 314 | "from airflow.operators.bash_operator import BashOperator\n", 315 | "\n", 316 | "YESTERDAY = datetime.combine(\n", 317 | " datetime.today() - timedelta(days=1), datetime.min.time())\n", 318 | "BQ_DATASET_NAME = 'bq_demos'\n", 319 | "\n", 320 | "default_args = {\n", 321 | " 'owner': 'airflow',\n", 322 | " 'depends_on_past': False,\n", 323 | " 'start_date': YESTERDAY,\n", 324 | " 'email_on_failure': False,\n", 325 | " 'email_on_retry': False,\n", 326 | " 'retries': 1,\n", 327 | " 'retry_delay': timedelta(minutes=5),\n", 328 | "}\n", 329 | "\n", 330 | "# Solution: pass a schedule_interval argument to DAG instantiation.\n", 331 | "with DAG('dag_gcs_to_bq_orch', default_args=default_args,\n", 332 | " schedule_interval=None) as dag:\n", 333 | " create_bq_dataset_if_not_exist = \"\"\"\n", 334 | " bq ls {0}\n", 335 | " if [ $? 
-ne 0 ]; then\n", 336 | " bq mk {0}\n", 337 | " fi\n", 338 | " \"\"\".format(BQ_DATASET_NAME)\n", 339 | "\n", 340 | " # Create destination dataset.\n", 341 | " t1 = BashOperator(\n", 342 | " task_id='create_destination_dataset',\n", 343 | " bash_command=create_bq_dataset_if_not_exist,\n", 344 | " dag=dag)\n", 345 | "\n", 346 | " # Create a bigquery table from a .csv file located in a GCS bucket\n", 347 | " # (gs://example-datasets/game_data_condensed.csv).\n", 348 | " # Store it in our dataset.\n", 349 | " t2 = GoogleCloudStorageToBigQueryOperator(\n", 350 | " task_id='gcs_to_bq',\n", 351 | " bucket='example-datasets',\n", 352 | " source_objects=['game_data_condensed.csv'],\n", 353 | " destination_project_dataset_table='{0}.composer_game_data_table'\n", 354 | " .format(BQ_DATASET_NAME),\n", 355 | " schema_fields=[\n", 356 | " {'name': 'name', 'type': 'string', 'mode': 'nullable'},\n", 357 | " {'name': 'team', 'type': 'string', 'mode': 'nullable'},\n", 358 | " {'name': 'total_score', 'type': 'integer', 'mode': 'nullable'},\n", 359 | " {'name': 'timestamp', 'type': 'integer', 'mode': 'nullable'},\n", 360 | " {'name': 'window_start', 'type': 'string', 'mode': 'nullable'},\n", 361 | " ],\n", 362 | " write_disposition='WRITE_TRUNCATE')\n", 363 | "\n", 364 | " # Run example query (http://shortn/_BdF1UTEYOb) and save result to the\n", 365 | " # destination table.\n", 366 | " t3 = BigQueryOperator(\n", 367 | " task_id='bq_example_query',\n", 368 | " bql=f\"\"\"\n", 369 | " SELECT\n", 370 | " name, team, total_score\n", 371 | " FROM\n", 372 | " {BQ_DATASET_NAME}.composer_game_data_table\n", 373 | " WHERE total_score > 15\n", 374 | " LIMIT 100;\n", 375 | " \"\"\",\n", 376 | " destination_dataset_table='{0}.gcp_example_query_result'\n", 377 | " .format(BQ_DATASET_NAME),\n", 378 | " write_disposition='WRITE_TRUNCATE')\n", 379 | "\n", 380 | " t1 >> t2 >> t3" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "!gsutil cp $COMPOSER_DAG_FILENAME $COMPOSER_DAG_GCS/" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "!gsutil ls -l $COMPOSER_DAG_GCS/$COMPOSER_DAG_FILENAME" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "You should the DAG in your Airflow webserver\n", 406 | "\n", 407 | "![](images/airflow_webserver_with_dag.png)\n", 408 | "\n", 409 | "![](images/airflow_dag.png)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "## Vertex Pipelines setup" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "### Import libraries\n", 424 | "\n", 425 | "Add the following to import the libraries we'll be using throughout this codelab:" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "from typing import NamedTuple\n", 435 | "import re\n", 436 | "\n", 437 | "import kfp\n", 438 | "from kfp import dsl\n", 439 | "from kfp.v2 import compiler\n", 440 | "from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,\n", 441 | " OutputPath, ClassificationMetrics, Metrics, component)\n", 442 | "from kfp.v2.google.client import AIPlatformClient\n", 443 | "\n", 444 | "from google.cloud import aiplatform\n", 445 | "from google_cloud_pipeline_components import 
aiplatform as gcc_aip" 446 | ] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "metadata": {}, 451 | "source": [ 452 | "### Define constants\n", 453 | "\n", 454 | "Before building the pipeline define some constant variables:\n", 455 | "\n", 456 | "- `PIPELINE_ROOT` is the Cloud Storage path where the artifacts created by the pipeline will be written. We're using us-central1 as the region here, but if you used a different region when you created your bucket, update the REGION variable in the code above" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "PATH=%env PATH\n", 466 | "%env PATH={PATH}:/home/jupyter/.local/bin\n", 467 | "\n", 468 | "PIPELINE_ROOT = f\"{BUCKET_NAME}/pipeline_root/\"\n", 469 | "PIPELINE_ROOT" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "After running the code above, you should see the root directory for your pipeline printed. This is the Cloud Storage location where the artifacts from your pipeline will be written. It will be in the format of `gs://BUCKET_NAME/pipeline_root/`" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "### Create a Python function based component to trigger Airflow DAG\n", 484 | "\n", 485 | "Using the KFP SDK, we can create components based on Python functions. The component takes Airflow DAG name `dag_name` a string as input and returns response from Airflow web server as an `Artifact` that contains Airflow DAG run information. The component makes a request to Airflow REST API of your Cloud Composer environment. Airflow processes this request and runs a DAG. The DAG outputs information about the change that is logged as artifact (you can output as string as well." 
486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": null, 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "@component(\n", 495 | " base_image=\"gcr.io/ml-pipeline/google-cloud-pipeline-components:0.1.3\",\n", 496 | " output_component_file=\"composer-trigger-dag-component.yaml\",\n", 497 | " packages_to_install=[\"requests\"],\n", 498 | ")\n", 499 | "def trigger_airflow_dag(\n", 500 | " dag_name: str,\n", 501 | " composer_client_id: str,\n", 502 | " composer_webserver_id: str,\n", 503 | " response: Output[Artifact]\n", 504 | "):\n", 505 | " # [START composer_trigger]\n", 506 | "\n", 507 | " from google.auth.transport.requests import Request\n", 508 | " from google.oauth2 import id_token\n", 509 | " import requests\n", 510 | " import json\n", 511 | " import os\n", 512 | "\n", 513 | "\n", 514 | " IAM_SCOPE = 'https://www.googleapis.com/auth/iam'\n", 515 | " OAUTH_TOKEN_URI = 'https://www.googleapis.com/oauth2/v4/token'\n", 516 | " \n", 517 | " data = '{\"replace_microseconds\":\"false\"}'\n", 518 | " context = None\n", 519 | "\n", 520 | " \"\"\"Makes a POST request to the Composer DAG Trigger API\n", 521 | "\n", 522 | " When called via Google Cloud Functions (GCF),\n", 523 | " data and context are Background function parameters.\n", 524 | "\n", 525 | " For more info, refer to\n", 526 | " https://cloud.google.com/functions/docs/writing/background#functions_background_parameters-python\n", 527 | "\n", 528 | " To call this function from a Python script, omit the ``context`` argument\n", 529 | " and pass in a non-null value for the ``data`` argument.\n", 530 | " \"\"\"\n", 531 | "\n", 532 | " # Form webserver URL to make REST API calls\n", 533 | " webserver_url = f'{composer_webserver_id}/api/experimental/dags/{dag_name}/dag_runs'\n", 534 | " # print(webserver_url)\n", 535 | "\n", 536 | " # This code is copied from\n", 537 | " # https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/iap/make_iap_request.py\n", 538 | " # START COPIED IAP CODE\n", 539 | " def make_iap_request(url, client_id, method='GET', **kwargs):\n", 540 | " \"\"\"Makes a request to an application protected by Identity-Aware Proxy.\n", 541 | " Args:\n", 542 | " url: The Identity-Aware Proxy-protected URL to fetch.\n", 543 | " client_id: The client ID used by Identity-Aware Proxy.\n", 544 | " method: The request method to use\n", 545 | " ('GET', 'OPTIONS', 'HEAD', 'POST', 'PUT', 'PATCH', 'DELETE')\n", 546 | " **kwargs: Any of the parameters defined for the request function:\n", 547 | " https://github.com/requests/requests/blob/master/requests/api.py\n", 548 | " If no timeout is provided, it is set to 90 by default.\n", 549 | " Returns:\n", 550 | " The page body, or raises an exception if the page couldn't be retrieved.\n", 551 | " \"\"\"\n", 552 | " # Set the default timeout, if missing\n", 553 | " if 'timeout' not in kwargs:\n", 554 | " kwargs['timeout'] = 90\n", 555 | "\n", 556 | " # Obtain an OpenID Connect (OIDC) token from metadata server or using service\n", 557 | " # account.\n", 558 | " google_open_id_connect_token = id_token.fetch_id_token(Request(), client_id)\n", 559 | "\n", 560 | " # Fetch the Identity-Aware Proxy-protected URL, including an\n", 561 | " # Authorization header containing \"Bearer \" followed by a\n", 562 | " # Google-issued OpenID Connect token for the service account.\n", 563 | " resp = requests.request(\n", 564 | " method, url,\n", 565 | " headers={'Authorization': 'Bearer {}'.format(\n", 566 | " google_open_id_connect_token)}, **kwargs)\n", 
567 | " if resp.status_code == 403:\n", 568 | " raise Exception('Service account does not have permission to '\n", 569 | " 'access the IAP-protected application.')\n", 570 | " elif resp.status_code != 200:\n", 571 | " raise Exception(\n", 572 | " 'Bad response from application: {!r} / {!r} / {!r}'.format(\n", 573 | " resp.status_code, resp.headers, resp.text))\n", 574 | " else:\n", 575 | " print(f\"response = {resp.text}\")\n", 576 | " file_path = os.path.join(response.path)\n", 577 | " os.makedirs(file_path)\n", 578 | " with open(os.path.join(file_path, \"airflow_response.json\"), 'w') as f:\n", 579 | " json.dump(resp.text, f)\n", 580 | "\n", 581 | " # END COPIED IAP CODE\n", 582 | "\n", 583 | " \n", 584 | " # Make a POST request to IAP which then Triggers the DAG\n", 585 | " make_iap_request(\n", 586 | " webserver_url, composer_client_id, method='POST', json={\"conf\": data, \"replace_microseconds\": 'false'})\n", 587 | " \n", 588 | " # [END composer_trigger]" 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "metadata": {}, 594 | "source": [ 595 | "Understanding the component structure\n", 596 | "- The **`@component`** decorator compiles this function to a component when the pipeline is run. You'll use this anytime you write a custom component.\n", 597 | "- The **`base_image parameter`** specifies the container image this component will use.\n", 598 | "- The **`output_component_file`** parameter is optional, and specifies the yaml file to write the compiled component to.\n", 599 | "- The **`packages_to_install`** parameter installs required python packages in the container to run the component" 600 | ] 601 | }, 602 | { 603 | "cell_type": "markdown", 604 | "metadata": {}, 605 | "source": [ 606 | "### Test Triggering Airflow DAG from Notebook" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": null, 612 | "metadata": {}, 613 | "outputs": [], 614 | "source": [ 615 | "# before running comment out @component annotation in the cell above\n", 616 | "trigger_airflow_dag(\n", 617 | " dag_name=COMPOSER_DAG_NAME,\n", 618 | " composer_client_id=COMPOSER_CLIENT_ID,\n", 619 | " composer_webserver_id=COMPOSER_WEB_URI,\n", 620 | " response=None\n", 621 | ")" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": null, 627 | "metadata": {}, 628 | "outputs": [], 629 | "source": [ 630 | "COMPOSER_WEB_URI" 631 | ] 632 | }, 633 | { 634 | "cell_type": "markdown", 635 | "metadata": {}, 636 | "source": [ 637 | "### Adding the components to a pipeline" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": {}, 644 | "outputs": [], 645 | "source": [ 646 | "PIPELINE_NAME = \"pipeline-trigger-airflow-dag\"" 647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": null, 652 | "metadata": {}, 653 | "outputs": [], 654 | "source": [ 655 | "@dsl.pipeline(\n", 656 | " name=PIPELINE_NAME,\n", 657 | " description=\"Trigger Airflow DAG from Vertex Pipelines\",\n", 658 | " pipeline_root=PIPELINE_ROOT,\n", 659 | ")\n", 660 | "\n", 661 | "# You can change the `text` and `emoji_str` parameters here to update the pipeline output\n", 662 | "def pipeline():\n", 663 | " data_processing_task_dag_name = COMPOSER_DAG_NAME\n", 664 | " data_processing_task = trigger_airflow_dag(\n", 665 | " dag_name=data_processing_task_dag_name,\n", 666 | " composer_client_id=COMPOSER_CLIENT_ID,\n", 667 | " composer_webserver_id=COMPOSER_WEB_URI\n", 668 | " )" 669 | ] 670 | }, 671 | { 672 | "cell_type": "markdown", 673 | 
"metadata": {}, 674 | "source": [ 675 | "### Compile and run the pipeline" 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | "source": [ 682 | "With your pipeline defined, you're ready to compile it. The following will generate a JSON file that you'll use to run the pipeline:" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": null, 688 | "metadata": {}, 689 | "outputs": [], 690 | "source": [ 691 | "compiler.Compiler().compile(\n", 692 | " pipeline_func=pipeline, package_path=f\"{PIPELINE_NAME}.json\"\n", 693 | ")" 694 | ] 695 | }, 696 | { 697 | "cell_type": "markdown", 698 | "metadata": {}, 699 | "source": [ 700 | "Next, instantiate an API client:" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": null, 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [ 709 | "api_client = AIPlatformClient(\n", 710 | " project_id=PROJECT_ID,\n", 711 | " region=REGION,\n", 712 | ")" 713 | ] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "metadata": {}, 718 | "source": [ 719 | "Finally, run the pipeline:" 720 | ] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "execution_count": null, 725 | "metadata": {}, 726 | "outputs": [], 727 | "source": [ 728 | "response = api_client.create_run_from_job_spec(\n", 729 | " job_spec_path=f\"{PIPELINE_NAME}.json\",\n", 730 | " # pipeline_root=PIPELINE_ROOT # this argument is necessary if you did not specify PIPELINE_ROOT as part of the pipeline definition.\n", 731 | ")" 732 | ] 733 | }, 734 | { 735 | "cell_type": "markdown", 736 | "metadata": {}, 737 | "source": [ 738 | "### Monitor Vertex Pipeline status\n", 739 | "\n", 740 | "From Cloud Console, you can monitor the pipeline run status and view the output artifact\n", 741 | "\n", 742 | "![](images/pipeline_run.png)" 743 | ] 744 | }, 745 | { 746 | "cell_type": "markdown", 747 | "metadata": {}, 748 | "source": [ 749 | "You can also API client to get pipeline status and artifact information." 
750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": null, 755 | "metadata": {}, 756 | "outputs": [], 757 | "source": [ 758 | "def get_job_id(job_name):\n", 759 | " \"\"\"Get the job ID from a pipeline job resource name.\"\"\"\n", 760 | " p = re.compile('projects/(?P<project>.*)/locations/(?P<location>.*)/pipelineJobs/(?P<job_id>.*)')\n", 761 | " result = p.search(job_name)\n", 762 | " return result.group('job_id') if result else None" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": null, 768 | "metadata": {}, 769 | "outputs": [], 770 | "source": [ 771 | "job_status = api_client.get_job(get_job_id(response['name']))\n", 772 | "print(f\"JOB STATUS: {job_status['state']}\")" 773 | ] 774 | }, 775 | { 776 | "cell_type": "markdown", 777 | "metadata": {}, 778 | "source": [ 779 | "Get the Airflow DAG run information from the output artifact" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": null, 785 | "metadata": {}, 786 | "outputs": [], 787 | "source": [ 788 | "airflow_response_uri = [task['outputs']['response']['artifacts'][0]['uri'] for task in job_status['jobDetail']['taskDetails'] if task['taskName']=='trigger-airflow-dag'][0]\n", 789 | "airflow_response_uri" 790 | ] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "execution_count": null, 795 | "metadata": {}, 796 | "outputs": [], 797 | "source": [ 798 | "!gsutil ls $airflow_response_uri/" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": null, 804 | "metadata": {}, 805 | "outputs": [], 806 | "source": [ 807 | "!gsutil cat $airflow_response_uri/airflow_response.json" 808 | ] 809 | }, 810 | { 811 | "cell_type": "markdown", 812 | "metadata": {}, 813 | "source": [ 814 | "### Monitor Airflow DAG run\n", 815 | "\n", 816 | "Go to the Airflow webserver and monitor the status of the data processing DAG. The 
Airflow webserver URL is" 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": null, 822 | "metadata": {}, 823 | "outputs": [], 824 | "source": [ 825 | "COMPOSER_WEB_URI" 826 | ] 827 | }, 828 | { 829 | "cell_type": "markdown", 830 | "metadata": {}, 831 | "source": [ 832 | "![](images/airflow_dag_run.png)" 833 | ] 834 | }, 835 | { 836 | "cell_type": "markdown", 837 | "metadata": {}, 838 | "source": [ 839 | "## Clean Up" 840 | ] 841 | }, 842 | { 843 | "cell_type": "markdown", 844 | "metadata": {}, 845 | "source": [ 846 | "- Delete Cloud Storage bucket\n", 847 | "- Delete Cloud Composer environment" 848 | ] 849 | } 850 | ], 851 | "metadata": { 852 | "environment": { 853 | "name": "tf2-gpu.2-4.m65", 854 | "type": "gcloud", 855 | "uri": "gcr.io/deeplearning-platform-release/tf2-gpu.2-4:m65" 856 | }, 857 | "kernelspec": { 858 | "display_name": "vertex", 859 | "language": "python", 860 | "name": "vertex" 861 | }, 862 | "language_info": { 863 | "codemirror_mode": { 864 | "name": "ipython", 865 | "version": 3 866 | }, 867 | "file_extension": ".py", 868 | "mimetype": "text/x-python", 869 | "name": "python", 870 | "nbconvert_exporter": "python", 871 | "pygments_lexer": "ipython3", 872 | "version": "3.7.10" 873 | } 874 | }, 875 | "nbformat": 4, 876 | "nbformat_minor": 4 877 | } 878 | -------------------------------------------------------------------------------- /05-vertex-event-based-model-deploy/images/event_based_model_deployment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/05-vertex-event-based-model-deploy/images/event_based_model_deployment.png -------------------------------------------------------------------------------- /06-vertex-train-deploy-r-model/images/serving-with-custom-containers-on-vertex-predictions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/06-vertex-train-deploy-r-model/images/serving-with-custom-containers-on-vertex-predictions.png -------------------------------------------------------------------------------- /07-vertex-train-deploy-lightgbm/images/serving-with-custom-containers-on-vertex-predictions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/07-vertex-train-deploy-lightgbm/images/serving-with-custom-containers-on-vertex-predictions.png -------------------------------------------------------------------------------- /07-vertex-train-deploy-lightgbm/images/training-with-custom-containers-on-vertex-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/07-vertex-train-deploy-lightgbm/images/training-with-custom-containers-on-vertex-training.png -------------------------------------------------------------------------------- /08-pytorch-distributed/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Distributed Training 2 | 3 | Work in progress 4 | -------------------------------------------------------------------------------- /09-distributed-xgboost-dask/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/rapidsai/rapidsai:22.04-cuda11.2-base-ubuntu18.04 2 | 3 | RUN . /opt/conda/etc/profile.d/conda.sh \ 4 | && conda activate rapids \ 5 | && pip install -U gcsfs 6 | 7 | RUN mkdir /train 8 | WORKDIR /train 9 | 10 | ADD train.py /train 11 | ADD train.sh /train 12 | 13 | ENTRYPOINT ["bash", "train.sh"] -------------------------------------------------------------------------------- /09-distributed-xgboost-dask/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | # set variables 3 | PROJECT_ID=$(gcloud config list --format 'value(core.project)') 4 | REGION=us-central1 5 | TRAIN_IMAGE_URI=${REGION}-docker.pkg.dev/${PROJECT_ID}/vertex-rapidsai/distributed-xgboost-dask 6 | 7 | STAGING_BUCKET_NAME=cloud-ai-platform-2f444b6a-a742-444b-b91a-c7519f51bd77 8 | TRAIN_FILES=gs://rthallam-demo-project/rapids-on-gcp/data/latest/a/higgs_00.csv 9 | 10 | # create artifact registry repository 11 | gcloud artifacts repositories create vertex-rapidsai \ 12 | --repository-format=docker \ 13 | --location=${REGION} \ 14 | --description="Vertex AI RAPIDS" 15 | 16 | # build training image and push to Artifact Registry 17 | gcloud builds submit --tag $TRAIN_IMAGE_URI --timeout=3600 . 18 | 19 | # create training job config for multi-node multi-gpu dask job 20 | date_now=$(date "+%Y%m%d-%H%M%S") 21 | 22 | cat << EOF > ./dask-xgb-multi-node.yml 23 | 24 | baseOutputDirectory: 25 | outputUriPrefix: gs://${STAGING_BUCKET_NAME}/rapidsai/distributed-xgboost-dask/${date_now}/ 26 | workerPoolSpecs: 27 | - 28 | machineSpec: 29 | machineType: n1-highmem-4 30 | acceleratorType: NVIDIA_TESLA_T4 31 | acceleratorCount: 1 32 | replicaCount: 1 33 | containerSpec: 34 | imageUri: ${TRAIN_IMAGE_URI} 35 | args: 36 | - --train-files=${TRAIN_FILES} 37 | - --rmm-pool-size=4G 38 | - --num-workers=4 39 | - --nthreads=4 40 | - 41 | machineSpec: 42 | machineType: n1-highmem-4 43 | acceleratorType: NVIDIA_TESLA_T4 44 | acceleratorCount: 2 45 | replicaCount: 4 46 | containerSpec: 47 | imageUri: ${TRAIN_IMAGE_URI} 48 | args: 49 | - --train-files=${TRAIN_FILES} 50 | - --rmm-pool-size=4G 51 | - --num-workers=4 52 | - --nthreads=4 53 | EOF 54 | 55 | # submit vertex ai custom training job 56 | gcloud beta ai custom-jobs create \ 57 | --display-name=rapids-dstrbtd-xgb-dask-multi-node \ 58 | --region=$REGION \ 59 | --project=$PROJECT_ID \ 60 | --config=dask-xgb-multi-node.yml 61 | ``` -------------------------------------------------------------------------------- /09-distributed-xgboost-dask/train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import argparse 5 | import subprocess 6 | import logging 7 | from pathlib import Path 8 | import json 9 | import asyncio 10 | import socket 11 | 12 | import xgboost as xgb 13 | from xgboost.dask import DaskDMatrix, DaskDeviceQuantileDMatrix 14 | 15 | import dask 16 | import dask.dataframe as dd 17 | import dask_cudf as cudf 18 | from dask.distributed import Client, wait 19 | 20 | dask.config.set({"distributed.comm.timeouts.connect": "60s"}) 21 | 22 | def get_args(): 23 | """Define the task arguments with the default values. 
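    Example invocation (illustrative values only):
        python train.py --train-files=gs://my-bucket/higgs/*.csv \
            --num-workers=8 --rmm-pool-size=4G --nthreads=4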
24 | Returns: 25 | experiment parameters 26 | """ 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument( 29 | '--model-dir', 30 | default=os.getenv('AIP_MODEL_DIR'), 31 | type=str, 32 | help='Cloud Storage URI of a directory for saving model artifacts') 33 | parser.add_argument( 34 | '--train-files', 35 | type=str, 36 | help='Training files local or GCS', 37 | required=True) 38 | parser.add_argument( 39 | '--scheduler-ip-file', 40 | type=str, 41 | help='Scratch temp file to store scheduler ip in GCS', 42 | required=False) 43 | parser.add_argument( 44 | '--num-workers', 45 | type=int, 46 | help='num of workers for rabit') 47 | parser.add_argument( 48 | '--rmm-pool-size', 49 | type=str, 50 | help='RMM pool size', 51 | default='8G') 52 | parser.add_argument( 53 | '--nthreads', 54 | type=str, 55 | help='nthreads for master and worker', 56 | default='4') 57 | parser.add_argument( 58 | '--parquet', 59 | action='store_false', 60 | help='parquet files are used') 61 | 62 | return parser.parse_args() 63 | 64 | async def start_client( 65 | scheduler_addr, 66 | train_dir, 67 | num_workers, 68 | gpu_mode=True, 69 | do_wait=False, 70 | parquet=False): 71 | """ 72 | """ 73 | async with Client(scheduler_addr, asynchronous=True) as client: 74 | # wait until all workers are up and running 75 | dask.config.set({'distributed.scheduler.work-stealing': False}) 76 | dask.config.set({'distributed.scheduler.bandwidth': 1}) 77 | logging.info(f'distributed.scheduler.work-stealing={dask.config.get("distributed.scheduler.work-stealing")}') 78 | logging.info(f'distributed.scheduler.bandwidth={dask.config.get("distributed.scheduler.bandwidth")}') 79 | await client.wait_for_workers(num_workers) 80 | 81 | # read dataframe 82 | colnames = ['label'] + ['feature-%02d' % i for i in range(1, 29)] 83 | 84 | # read as csv or parquet 85 | if parquet is True: 86 | if gpu_mode: 87 | df = cudf.read_parquet(train_dir, columns=colnames) 88 | else: 89 | df = dd.read_parquet(train_dir, columns=colnames) 90 | else: 91 | if gpu_mode: 92 | df = cudf.read_csv(train_dir, header=None, names=colnames, chunksize=None) 93 | else: 94 | df = dd.read_csv(train_dir, header=None, names=colnames, chunksize=None) 95 | 96 | # get features and target label 97 | X = df[df.columns.difference(['label'])] 98 | y = df['label'] 99 | 100 | # wait for fully computing results 101 | if do_wait is True: 102 | df = df.persist() 103 | X = X.persist() 104 | wait(df) 105 | wait(X) 106 | logging.info("[debug:leader]: ------ Long waited but the data is ready now") 107 | 108 | # compute DMatrix for training xgboost 109 | # for GPU compute DaskDeviceQuantileDMatrix 110 | start_time = time.time() 111 | if gpu_mode: 112 | dtrain = await DaskDeviceQuantileDMatrix(client, X, y) 113 | else: 114 | dtrain = DaskDMatrix(client, X, y) 115 | logging.info("[debug:leader]: ------ QuantileDMatrix is formed in {} seconds ---".format((time.time() - start_time))) 116 | 117 | # remove data from distributed RAM by removing the collection from local process 118 | del df 119 | del X 120 | del y 121 | 122 | # start training 123 | logging.info("[debug:leader]: ------ training started") 124 | start_time = time.time() 125 | xgb_params = { 126 | 'verbosity': 2, 127 | 'learning_rate': 0.1, 128 | 'max_depth': 8, 129 | 'objective': 'reg:squarederror', 130 | 'subsample': 0.6, 131 | 'gamma': 1, 132 | 'verbose_eval': True, 133 | 'tree_method': 'gpu_hist' if gpu_mode else 'hist', 134 | 'nthread': 1 135 | } 136 | output = await xgb.dask.train( 137 | client, 138 | xgb_params, 139 | dtrain, 140 | 
num_boost_round=100, 141 | evals=[(dtrain, 'train')]) 142 | logging.info("[debug:leader]: ------ training finished") 143 | 144 | # evaluation history 145 | history = output['history'] 146 | logging.info('[debug:leader]: ------ Training evaluation history:', history) 147 | 148 | # save model 149 | model_file = f"{model_dir}/model/xgboost.model" 150 | output['booster'].save_model(model_file) 151 | logging.info(f"[debug:leader]: ------model saved {model_file}") 152 | 153 | logging.info("[debug:leader]: ------ %s seconds ---" % (time.time() - start_time)) 154 | 155 | # wait for client to shutdown 156 | await client.shutdown() 157 | 158 | def launch_dask(cmd, is_shell): 159 | """ launch dask scheduler 160 | """ 161 | return subprocess.Popen(cmd, stdout=None, stderr=None, shell=is_shell) 162 | 163 | def launch_worker(cmd): 164 | """ launch dask workers 165 | """ 166 | return subprocess.check_call(cmd, stdout=sys.stdout, stderr=sys.stderr) 167 | 168 | def get_scheduler_ip(scheduler_ip_file): 169 | with open(scheduler_ip_file, 'r') as f: 170 | scheduler_ip = f.read().rstrip("\n") 171 | return scheduler_ip 172 | 173 | if __name__=='__main__': 174 | logging.basicConfig(format='%(message)s') 175 | logging.getLogger().setLevel(logging.INFO) 176 | 177 | # get program args 178 | args = get_args() 179 | 180 | # set and create local directories if does not exists 181 | local_tmp_dir = os.path.join(os.getcwd(), "tmp") 182 | Path(local_tmp_dir).mkdir(parents=True, exist_ok=True) 183 | local_model_dir = os.path.join(local_tmp_dir, 'model') 184 | Path(local_model_dir).mkdir(parents=True, exist_ok=True) 185 | 186 | # define variables 187 | gs_prefix = 'gs://' 188 | gcsfuse_prefix = '/gcs/' 189 | 190 | logging.info(f'[INFO]: args.model_dir = {args.model_dir}') 191 | 192 | model_dir = args.model_dir or local_model_dir 193 | if model_dir and model_dir.startswith(gs_prefix): 194 | model_dir = model_dir.replace(gs_prefix, gcsfuse_prefix) 195 | Path(model_dir).mkdir(parents=True, exist_ok=True) 196 | 197 | tmp_dir = model_dir or local_tmp_dir 198 | if not tmp_dir.startswith(gs_prefix): 199 | Path(tmp_dir).mkdir(parents=True, exist_ok=True) 200 | 201 | scheduler_ip_file = f"{tmp_dir}dask_scheduler.txt" if tmp_dir[-1] == "/" else f"{tmp_dir}/dask_scheduler.txt" 202 | 203 | logging.info(f'[INFO]: model_dir = {model_dir}') 204 | logging.info(f'[INFO]: tmp_dir = {tmp_dir}') 205 | logging.info(f'[INFO]: scheduler_ip_file = {scheduler_ip_file}') 206 | 207 | # read worker pool config and launch dask scheduler and workers 208 | TF_CONFIG = os.environ.get('TF_CONFIG') 209 | 210 | if TF_CONFIG: 211 | TF_CONFIG = json.loads(TF_CONFIG) 212 | logging.info(TF_CONFIG) 213 | task_name = TF_CONFIG.get('task', {}).get('type') 214 | else: 215 | logging.info(f'Running locally') 216 | task_name = 'chief' 217 | 218 | scheduler_port = '8786' 219 | 220 | if task_name == 'chief': 221 | host_name = socket.gethostname() 222 | host_ip = socket.gethostbyname(host_name) 223 | 224 | with open(scheduler_ip_file, 'w') as f: 225 | f.write(host_ip) 226 | 227 | scheduler_addr = f'{host_ip}:{scheduler_port}' 228 | logging.info('[INFO]: The scheduler IP is %s', scheduler_addr) 229 | proc_scheduler = launch_dask(f'dask-scheduler --protocol tcp > {tmp_dir}/scheduler.log 2>&1 &', True) 230 | logging.info('[debug:leader]: ------ start scheduler') 231 | 232 | proc_worker = launch_dask([ 233 | 'dask-cuda-worker', 234 | '--rmm-pool-size', args.rmm_pool_size, 235 | '--nthreads', args.nthreads, 236 | scheduler_addr], 237 | False) 238 | 
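        # The chief hosts the scheduler plus one local dask-cuda worker; start_client()
        # below blocks in wait_for_workers() until --num-workers workers have registered
        # with the scheduler before building the DMatrix and training.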
logging.info('[debug:leader]: ------ start worker') 239 | asyncio.get_event_loop().run_until_complete( 240 | start_client( 241 | scheduler_addr, 242 | args.train_files, 243 | args.num_workers, 244 | parquet=False)) 245 | 246 | # launch dask worker, redirect output to sys stdout/err 247 | elif task_name == 'worker': 248 | while not os.path.isfile(scheduler_ip_file): 249 | time.sleep(1) 250 | 251 | # with open(scheduler_ip_file, 'r') as f: 252 | # scheduler_ip = f.read().rstrip("\n") 253 | scheduler_ip = get_scheduler_ip(scheduler_ip_file) 254 | while not scheduler_ip: 255 | time.sleep(1) 256 | scheduler_ip = get_scheduler_ip(scheduler_ip_file) 257 | 258 | scheduler_ip = get_scheduler_ip(scheduler_ip_file) 259 | logging.info(f'[debug:scheduler_ip]: ------ {scheduler_ip}') 260 | scheduler_addr = f'{scheduler_ip}:{scheduler_port}' 261 | 262 | proc_worker = launch_worker([ 263 | 'dask-cuda-worker', 264 | '--rmm-pool-size', args.rmm_pool_size, 265 | '--nthreads' , args.nthreads, 266 | scheduler_addr]) -------------------------------------------------------------------------------- /09-distributed-xgboost-dask/train.sh: -------------------------------------------------------------------------------- 1 | source /conda/etc/profile.d/conda.sh 2 | conda activate rapids 3 | 4 | echo "Running: train.py $@" 5 | python train.py $@ 6 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/combine/1/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import json 4 | 5 | import triton_python_backend_utils as pb_utils 6 | import transformers 7 | 8 | class TritonPythonModel: 9 | 10 | def initialize(self, args): 11 | self.log = open("/tmp/combine.loq", "w") 12 | self.log.write("DEBUG: ------------------------ hello world init combine/model.py------------------------------------\n") 13 | 14 | self.model_config = model_config = json.loads(args['model_config']) 15 | output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") 16 | self.output_dtype = pb_utils.triton_string_to_numpy(output_config['data_type']) 17 | 18 | def execute(self, requests): 19 | self.log.write("DEBUG: ------------------------hello world execute combine/model.py\n") 20 | 21 | output_dtype = self.output_dtype 22 | responses = [] 23 | out_tensor = [] 24 | self.log.write("DEBUG: ------------------------requests: combine/model.py " + str(requests) + "\n") 25 | for request in requests: 26 | xgb_class = pb_utils.get_input_tensor_by_name(request, "xgb_class") 27 | tf_class = pb_utils.get_input_tensor_by_name(request, "tf_class") 28 | sci_1_class = pb_utils.get_input_tensor_by_name(request, "sci_1_class") 29 | sci_2_class = pb_utils.get_input_tensor_by_name(request, "sci_2_class") 30 | 31 | self.log.write("DEBUG: ------------------------ xgb_class tf_class sci_1_class sci_2_class \n" 32 | + str(xgb_class.as_numpy()) + '\n' 33 | + str(tf_class.as_numpy()) + '\n' 34 | + str(sci_1_class.as_numpy()) + '\n' 35 | + str(sci_2_class.as_numpy()) + '\n' ) 36 | 37 | out_tensor.append(pb_utils.Tensor("OUTPUT0", 38 | (xgb_class.as_numpy() 39 | + tf_class.as_numpy() 40 | + sci_1_class.as_numpy() 41 | + sci_2_class.as_numpy()) / 4.0)) 42 | 43 | inference_response = pb_utils.InferenceResponse(output_tensors = out_tensor) 44 | responses.append(inference_response) 45 | 46 | self.log.flush() 47 | return responses 48 | 49 | def finalize(self): 50 | self.log.write("DEBUG: ------------------------ hello world 
finalize combine/model.py------------------------------------\n") 51 | self.log.write('Cleaning up - custom model combine') 52 | self.log.close() 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/combine/config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "combine" 2 | backend: "python" 3 | max_batch_size: 0 4 | input [ 5 | { 6 | name: "xgb_class" 7 | data_type: TYPE_FP32 8 | dims: [ 1 ] 9 | }, 10 | { 11 | name: "tf_class" 12 | data_type: TYPE_FP32 13 | dims: [ -1, 1 ] 14 | }, 15 | { 16 | name: "sci_1_class" 17 | data_type: TYPE_FP32 18 | dims: [ 1 ] 19 | }, 20 | { 21 | name: "sci_2_class" 22 | data_type: TYPE_FP32 23 | dims: [ 1 ] 24 | } 25 | ] 26 | output [ 27 | { 28 | name: "OUTPUT0" 29 | data_type: TYPE_FP32 30 | dims: [ -1, 1 ] 31 | } 32 | ] 33 | parameters [ 34 | { 35 | key: "output_class" 36 | value: { string_value: "true" } 37 | }, 38 | { 39 | key: "threshold" 40 | value: { string_value: "0.5" } 41 | } 42 | ] 43 | 44 | instance_group[ { kind: KIND_CPU } ] 45 | 46 | 47 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/ensemble/1/empty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/10-serving-ensemble-triton/models/ensemble/1/empty -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/ensemble/config.pbtxt: -------------------------------------------------------------------------------- 1 | platform: "ensemble" 2 | max_batch_size: 0 3 | input [ 4 | { 5 | name: "INPUT0" 6 | data_type: TYPE_FP32 7 | dims: [ -1, 4 ] 8 | } 9 | ] 10 | output [ 11 | { 12 | name: "OUTPUT0" 13 | data_type: TYPE_FP32 14 | dims: [ -1, 1 ] 15 | } 16 | ] 17 | ensemble_scheduling { 18 | step [ 19 | { 20 | model_name: "mux" 21 | model_version: -1 22 | input_map { 23 | key: "mux_in" 24 | value: "INPUT0" 25 | } 26 | output_map { 27 | key: "mux_xgb_out" 28 | value: "mux_xgb_out" 29 | } 30 | output_map { 31 | key: "mux_tf_out" 32 | value: "mux_tf_out" 33 | } 34 | output_map { 35 | key: "mux_sci_1_out" 36 | value: "mux_sci_1_out" 37 | } 38 | output_map { 39 | key: "mux_sci_2_out" 40 | value: "mux_sci_2_out" 41 | } 42 | }, 43 | { 44 | model_name: "xgb" 45 | model_version: -1 46 | input_map { 47 | key: "input__0" 48 | value: "mux_xgb_out" 49 | } 50 | output_map { 51 | key: "output__0" 52 | value: "xgb_class" 53 | } 54 | }, 55 | { 56 | model_name: "tf" 57 | model_version: -1 58 | input_map { 59 | key: "dense_input" 60 | value: "mux_tf_out" 61 | } 62 | output_map { 63 | key: "round" 64 | value: "tf_class" 65 | } 66 | }, 67 | { 68 | model_name: "sci_1" 69 | model_version: -1 70 | input_map { 71 | key: "input__0" 72 | value: "mux_sci_1_out" 73 | } 74 | output_map { 75 | key: "output__0" 76 | value: "sci_1_class" 77 | } 78 | }, 79 | { 80 | model_name: "sci_2" 81 | model_version: -1 82 | input_map { 83 | key: "input__0" 84 | value: "mux_sci_2_out" 85 | } 86 | output_map { 87 | key: "output__0" 88 | value: "sci_2_class" 89 | } 90 | }, 91 | { 92 | model_name: "combine" 93 | model_version: -1 94 | input_map { 95 | key: "xgb_class" 96 | value: "xgb_class" 97 | } 98 | input_map { 99 | key: "tf_class" 100 | value: "tf_class" 101 | } 102 | input_map { 103 | key: "sci_1_class" 104 | value: "sci_1_class" 105 | } 106 | input_map { 
107 | key: "sci_2_class" 108 | value: "sci_2_class" 109 | } 110 | output_map { 111 | key: "OUTPUT0" 112 | value: "OUTPUT0" 113 | } 114 | } 115 | ] 116 | } 117 | parameters: [ 118 | { 119 | key: "predict_proba" 120 | value: { string_value: "false" } 121 | }, 122 | { 123 | key: "output_class" 124 | value: { string_value: "false" } 125 | }, 126 | { 127 | key: "threshold" 128 | value: { string_value: "0.5" } 129 | }, 130 | { 131 | key: "algo" 132 | value: { string_value: "ALGO_AUTO" } 133 | }, 134 | { 135 | key: "storage_type" 136 | value: { string_value: "AUTO" } 137 | }, 138 | { 139 | key: "blocks_per_sm" 140 | value: { string_value: "0" } 141 | } 142 | ] 143 | 144 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/mux/1/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import json 4 | 5 | import triton_python_backend_utils as pb_utils 6 | import transformers 7 | 8 | class TritonPythonModel: 9 | 10 | def initialize(self, args): 11 | self.log = open("/tmp/mux.loq", "w") 12 | 13 | self.log.write("DEBUG: ------------------------ hello world init mux/model.py ------------------------------------\n") 14 | self.out_dtypes = {} 15 | self.model_config = model_config = json.loads(args['model_config']) 16 | 17 | mux_xgb_out_config = pb_utils.get_output_config_by_name(model_config, "mux_xgb_out") 18 | self.out_dtypes["mux_xgb_out"] = pb_utils.triton_string_to_numpy(mux_xgb_out_config["data_type"]) 19 | 20 | mux_tf_out_config = pb_utils.get_output_config_by_name(model_config, "mux_tf_out") 21 | self.out_dtypes["mux_tf_out"] = pb_utils.triton_string_to_numpy(mux_tf_out_config["data_type"]) 22 | 23 | mux_sci_1_out_config = pb_utils.get_output_config_by_name(model_config, "mux_sci_1_out") 24 | self.out_dtypes["mux_sci_1_out"] = pb_utils.triton_string_to_numpy(mux_sci_1_out_config["data_type"]) 25 | 26 | mux_sci_2_out_config = pb_utils.get_output_config_by_name(model_config, "mux_sci_2_out") 27 | self.out_dtypes["mux_sci_2_out"] = pb_utils.triton_string_to_numpy(mux_sci_1_out_config["data_type"]) 28 | 29 | 30 | def execute(self, requests): 31 | 32 | self.log.write("DEBUG: ------------------------requests: mux/model.py \n" + str(requests) + '\n') 33 | 34 | responses = [] 35 | for request in requests: 36 | 37 | mux_in = pb_utils.get_input_tensor_by_name(request, "mux_in") 38 | out_tensors = [] 39 | for model in ["mux_xgb_out", "mux_tf_out", "mux_sci_1_out", "mux_sci_2_out"]: 40 | self.log.write("DEBUG: ------------------------ model dtype out_tensor tensor.astype" + model + '\n' 41 | + str(self.out_dtypes[model]) + " " 42 | + str(mux_in.as_numpy()) + " " 43 | + str(mux_in.as_numpy().astype(self.out_dtypes[model])) + '\n') 44 | 45 | out_tensors.append(pb_utils.Tensor(model, mux_in.as_numpy().astype(self.out_dtypes[model]))) 46 | 47 | inference_response = pb_utils.InferenceResponse(output_tensors = out_tensors) 48 | responses.append(inference_response) 49 | 50 | self.log.flush() 51 | return responses 52 | 53 | def finalize(self): 54 | self.log.write("DEBUG: ------------------------ hello world finalize mux/model.py ------------------------------------ \n") 55 | 56 | self.log.write('Cleaning up - custom model combine \n') 57 | self.log.close() 58 | 59 | 60 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/mux/config.pbtxt: 
-------------------------------------------------------------------------------- 1 | name: "mux" 2 | backend: "python" 3 | max_batch_size: 0 4 | input [ 5 | { 6 | name: "mux_in" 7 | data_type: TYPE_FP32 8 | dims: [ -1, 4 ] 9 | } 10 | ] 11 | output [ 12 | { 13 | name: "mux_xgb_out" 14 | data_type: TYPE_FP32 15 | dims: [ -1, 4 ] 16 | }, 17 | { 18 | name: "mux_tf_out" 19 | data_type: TYPE_FP32 20 | dims: [ -1, 4 ] 21 | }, 22 | { 23 | name: "mux_sci_1_out" 24 | data_type: TYPE_FP32 25 | dims: [ -1, 4 ] 26 | }, 27 | { 28 | name: "mux_sci_2_out" 29 | data_type: TYPE_FP32 30 | dims: [ -1, 4 ] 31 | } 32 | ] 33 | 34 | parameters [ 35 | { 36 | key: "output_class" 37 | value: { string_value: "false" } 38 | }, 39 | { 40 | key: "threshold" 41 | value: { string_value: "0.5" } 42 | } 43 | ] 44 | 45 | instance_group[ { kind: KIND_CPU } ] 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/sci_1/1/checkpoint.tl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/10-serving-ensemble-triton/models/sci_1/1/checkpoint.tl -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/sci_1/1/sci_1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/10-serving-ensemble-triton/models/sci_1/1/sci_1.pkl -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/sci_1/config.pbtxt: -------------------------------------------------------------------------------- 1 | backend: "fil" 2 | max_batch_size: 0 3 | input [ 4 | { 5 | name: "input__0" 6 | data_type: TYPE_FP32 7 | dims: [ -1, 4 ] 8 | } 9 | ] 10 | output [ 11 | { 12 | name: "output__0" 13 | data_type: TYPE_FP32 14 | dims: [ 1 ] 15 | } 16 | ] 17 | instance_group [{ kind: KIND_GPU }] 18 | parameters [ 19 | { 20 | key: "model_type" 21 | value: { string_value: "treelite_checkpoint" } 22 | }, 23 | { 24 | key: "predict_proba" 25 | value: { string_value: "false" } 26 | }, 27 | { 28 | key: "output_class" 29 | value: { string_value: "true" } 30 | }, 31 | { 32 | key: "threshold" 33 | value: { string_value: "0.5" } 34 | }, 35 | { 36 | key: "algo" 37 | value: { string_value: "ALGO_AUTO" } 38 | }, 39 | { 40 | key: "storage_type" 41 | value: { string_value: "AUTO" } 42 | }, 43 | { 44 | key: "blocks_per_sm" 45 | value: { string_value: "0" } 46 | } 47 | ] 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/sci_2/1/checkpoint.tl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/10-serving-ensemble-triton/models/sci_2/1/checkpoint.tl -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/sci_2/1/sci_2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/10-serving-ensemble-triton/models/sci_2/1/sci_2.pkl -------------------------------------------------------------------------------- 
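Note on the FIL checkpoints: sci_1/config.pbtxt above (and sci_2/config.pbtxt below, which is identical apart from the model name) declares model_type "treelite_checkpoint", so Triton's FIL backend loads the checkpoint.tl file stored next to each pickled scikit-learn model. The repository produces those checkpoints with the FIL backend's convert_sklearn script, called from src/sci_1/sci_1.py and src/sci_2/sci_2.py further down. The sketch below is only a rough, hand-written equivalent, not part of the repository; it assumes a treelite release compatible with the Triton FIL backend is installed, and it reuses the file paths from this model repository.

# Rough sketch (assumption, not part of the repo): convert the pickled
# scikit-learn model into the treelite checkpoint that the FIL backend reads.
import pickle

import treelite  # assumed: a treelite version compatible with the Triton FIL backend

with open("models/sci_1/1/sci_1.pkl", "rb") as f:
    skl_model = pickle.load(f)

tl_model = treelite.sklearn.import_model(skl_model)  # import the RandomForest ensemble
tl_model.serialize("models/sci_1/1/checkpoint.tl")   # binary checkpoint loaded as "treelite_checkpoint"

In practice the convert_sklearn helper shipped with the FIL backend (the path the src/ scripts take) is the safer route, since it uses a treelite version known to match the Triton build.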
/10-serving-ensemble-triton/models/sci_2/config.pbtxt: -------------------------------------------------------------------------------- 1 | backend: "fil" 2 | max_batch_size: 0 3 | input [ 4 | { 5 | name: "input__0" 6 | data_type: TYPE_FP32 7 | dims: [ -1, 4 ] 8 | } 9 | ] 10 | output [ 11 | { 12 | name: "output__0" 13 | data_type: TYPE_FP32 14 | dims: [ 1 ] 15 | } 16 | ] 17 | instance_group [{ kind: KIND_GPU }] 18 | parameters [ 19 | { 20 | key: "model_type" 21 | value: { string_value: "treelite_checkpoint" } 22 | }, 23 | { 24 | key: "predict_proba" 25 | value: { string_value: "false" } 26 | }, 27 | { 28 | key: "output_class" 29 | value: { string_value: "true" } 30 | }, 31 | { 32 | key: "threshold" 33 | value: { string_value: "0.5" } 34 | }, 35 | { 36 | key: "algo" 37 | value: { string_value: "ALGO_AUTO" } 38 | }, 39 | { 40 | key: "storage_type" 41 | value: { string_value: "AUTO" } 42 | }, 43 | { 44 | key: "blocks_per_sm" 45 | value: { string_value: "0" } 46 | } 47 | ] 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/tf/1/model.savedmodel/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/10-serving-ensemble-triton/models/tf/1/model.savedmodel/saved_model.pb -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/tf/1/model.savedmodel/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/10-serving-ensemble-triton/models/tf/1/model.savedmodel/variables/variables.data-00000-of-00001 -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/tf/1/model.savedmodel/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/10-serving-ensemble-triton/models/tf/1/model.savedmodel/variables/variables.index -------------------------------------------------------------------------------- /10-serving-ensemble-triton/models/tf/config.pbtxt: -------------------------------------------------------------------------------- 1 | backend: "tensorflow" 2 | platform: "tensorflow_savedmodel" 3 | max_batch_size: 0 4 | input [ 5 | { 6 | name: "dense_input" 7 | data_type: TYPE_FP32 8 | dims: [ -1, 4 ] 9 | } 10 | ] 11 | output [ 12 | { 13 | name: "round" 14 | data_type: TYPE_FP32 15 | dims: [ -1, 1 ] 16 | } 17 | ] 18 | instance_group [{ kind: KIND_GPU }] 19 | parameters [ 20 | { 21 | key: "model_type" 22 | value: { string_value: "tensorflow_savedmodel" } 23 | }, 24 | { 25 | key: "predict_proba" 26 | value: { string_value: "false" } 27 | }, 28 | { 29 | key: "output_class" 30 | value: { string_value: "true" } 31 | }, 32 | { 33 | key: "threshold" 34 | value: { string_value: "0.5" } 35 | }, 36 | { 37 | key: "algo" 38 | value: { string_value: "ALGO_AUTO" } 39 | }, 40 | { 41 | key: "storage_type" 42 | value: { string_value: "AUTO" } 43 | }, 44 | { 45 | key: "blocks_per_sm" 46 | value: { string_value: "0" } 47 | } 48 | ] 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- 
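Note on the TensorFlow model: tf/config.pbtxt above expects the SavedModel in models/tf/1/model.savedmodel to expose an input tensor named dense_input (FP32, [-1, 4]) and an output tensor named round (FP32, [-1, 1]); those names come from the Keras layers defined in src/tf/tf.py further down. A quick sanity check before pointing Triton at the model is to inspect the serving signature, as in the small sketch below (an illustration, assuming TensorFlow is installed and the working directory is 10-serving-ensemble-triton). Running saved_model_cli show --dir models/tf/1/model.savedmodel --all prints the same information from the command line.

# Sketch (not part of the repo): print the serving signature of the exported
# SavedModel to confirm the tensor names referenced in tf/config.pbtxt.
import tensorflow as tf

loaded = tf.saved_model.load("models/tf/1/model.savedmodel")
serving_fn = loaded.signatures["serving_default"]
print(serving_fn.structured_input_signature)  # expect a 'dense_input' float32 spec of shape [None, 4]
print(serving_fn.structured_outputs)          # expect a 'round' float32 spec of shape [None, 1]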
/10-serving-ensemble-triton/models/xgb/config.pbtxt: -------------------------------------------------------------------------------- 1 | backend: "fil" 2 | max_batch_size: 0 3 | input [ 4 | { 5 | name: "input__0" 6 | data_type: TYPE_FP32 7 | dims: [ -1, 4 ] 8 | } 9 | ] 10 | output [ 11 | { 12 | name: "output__0" 13 | data_type: TYPE_FP32 14 | dims: [ 1 ] 15 | } 16 | ] 17 | instance_group [{ kind: KIND_GPU }] 18 | parameters [ 19 | { 20 | key: "model_type" 21 | value: { string_value: "xgboost_json" } 22 | }, 23 | { 24 | key: "predict_proba" 25 | value: { string_value: "false" } 26 | }, 27 | { 28 | key: "output_class" 29 | value: { string_value: "true" } 30 | }, 31 | { 32 | key: "threshold" 33 | value: { string_value: "0.5" } 34 | }, 35 | { 36 | key: "algo" 37 | value: { string_value: "ALGO_AUTO" } 38 | }, 39 | { 40 | key: "storage_type" 41 | value: { string_value: "AUTO" } 42 | }, 43 | { 44 | key: "blocks_per_sm" 45 | value: { string_value: "0" } 46 | } 47 | ] 48 | 49 | 50 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/combine/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import json 4 | import triton_python_backend_utils as pb_utils 5 | import transformers 6 | 7 | class TritonPythonModel: 8 | def initialize(self, args): 9 | self.model_config = model_config = json.loads(args['model_config']) 10 | output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") 11 | self.output_dtype = pb_utils.triton_string_to_numpy(output_config['data_type']) 12 | 13 | def execute(self, requests): 14 | output_dtype = self.output_dtype 15 | responses = [] 16 | out_tensor = [] 17 | for request in requests: 18 | xgb_class = pb_utils.get_input_tensor_by_name(request, "xgb_class") 19 | tf_class = pb_utils.get_input_tensor_by_name(request, "tf_class") 20 | sci_1_class = pb_utils.get_input_tensor_by_name(request, "sci_1_class") 21 | sci_2_class = pb_utils.get_input_tensor_by_name(request, "sci_2_class") 22 | out_tensor.append(pb_utils.Tensor("OUTPUT0", 23 | (xgb_class.as_numpy() 24 | + tf_class.as_numpy() 25 | + sci_1_class.as_numpy() 26 | + sci_2_class.as_numpy()) / 4.0)) 27 | inference_response = pb_utils.InferenceResponse(output_tensors = out_tensor) 28 | responses.append(inference_response) 29 | return responses 30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/generate/generate.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import os 4 | 5 | features = 4 6 | samples = 1000 7 | 8 | data_path = os.environ['HOME'] + "/Triton/ensemble/data/" 9 | X_data = data_path + 'X.data.npy' 10 | Y_data = data_path + 'Y.data.npy' 11 | 12 | if (os.path.exists(X_data)): 13 | print("Data already exists at {}".format(X_data)) 14 | else: 15 | print("Generating data at {}".format(data_path)) 16 | X = np.random.rand(samples, features).astype('float32') 17 | Y = np.random.randint(2, size=samples) 18 | np.save(X_data, X) 19 | np.save(Y_data, Y) 20 | 21 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/mux/model.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import sys 4 | import json 5 | import triton_python_backend_utils as pb_utils 6 | import transformers 7 | 8 | class 
TritonPythonModel: 9 | 10 | def initialize(self, args): 11 | self.out_dtypes = {} 12 | self.model_config = model_config = json.loads(args['model_config']) 13 | mux_xgb_out_config = pb_utils.get_output_config_by_name(model_config, "mux_xgb_out") 14 | self.out_dtypes["mux_xgb_out"] = pb_utils.triton_string_to_numpy(mux_xgb_out_config["data_type"]) 15 | mux_tf_out_config = pb_utils.get_output_config_by_name(model_config, "mux_tf_out") 16 | self.out_dtypes["mux_tf_out"] = pb_utils.triton_string_to_numpy(mux_tf_out_config["data_type"]) 17 | mux_sci_1_out_config = pb_utils.get_output_config_by_name(model_config, "mux_sci_1_out") 18 | self.out_dtypes["mux_sci_1_out"] = pb_utils.triton_string_to_numpy(mux_sci_1_out_config["data_type"]) 19 | mux_sci_2_out_config = pb_utils.get_output_config_by_name(model_config, "mux_sci_2_out") 20 | self.out_dtypes["mux_sci_2_out"] = pb_utils.triton_string_to_numpy(mux_sci_1_out_config["data_type"]) 21 | 22 | 23 | def execute(self, requests): 24 | responses = [] 25 | for request in requests: 26 | mux_in = pb_utils.get_input_tensor_by_name(request, "mux_in") 27 | out_tensors = [] 28 | for model in ["mux_xgb_out", "mux_tf_out", "mux_sci_1_out", "mux_sci_2_out"]: 29 | out_tensors.append(pb_utils.Tensor(model, mux_in.as_numpy().astype(self.out_dtypes[model]))) 30 | inference_response = pb_utils.InferenceResponse(output_tensors = out_tensors) 31 | responses.append(inference_response) 32 | return responses 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/sci_1/sci_1.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import os 5 | 6 | from sklearn.ensemble import RandomForestClassifier 7 | 8 | from numpy import mean 9 | from numpy import std 10 | from sklearn.datasets import make_classification 11 | from sklearn.model_selection import cross_val_score 12 | from sklearn.model_selection import RepeatedStratifiedKFold 13 | from sklearn.ensemble import RandomForestClassifier 14 | from sklearn.model_selection import train_test_split 15 | 16 | import pickle 17 | 18 | import subprocess 19 | 20 | 21 | seed = 7 22 | features = 4 23 | samples = 1000 24 | 25 | data_path = os.environ['HOME'] + "/Triton/ensemble/data/" 26 | X_data = data_path + 'X.data.npy' 27 | Y_data = data_path + 'Y.data.npy' 28 | 29 | if (not os.path.exists(X_data)): 30 | print("Please run src/generate.py to create dummy data for modes") 31 | else: 32 | X = np.load(X_data) 33 | Y = np.load(Y_data) 34 | 35 | print("shape X " + str(X.shape)) 36 | print("shape Y " + str(Y.shape)) 37 | 38 | model_path = os.environ['HOME'] + "/Triton/ensemble/models/sci_1/1" 39 | 40 | test_size = 0.33 41 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 42 | 43 | 44 | model = RandomForestClassifier(max_depth=2, random_state=0) 45 | model.fit(X_train, y_train) 46 | ### print(model.predict([[0, 0, 0, 0]])) 47 | 48 | cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1) 49 | n_scores = cross_val_score(model, X_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise') 50 | 51 | print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores))) 52 | 53 | 54 | pickle.dump(model, open(model_path + "/sci_1.pkl", 'wb')) 55 | 56 | subprocess.run(["{}/Triton/ensemble/fil_backend/scripts/convert_sklearn".format(os.environ['HOME']), model_path + "/sci_1.pkl"]) 57 | 58 | 
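The model-building scripts under src/ assume that src/generate/generate.py has already created the shared dummy dataset, and each script writes its artifact into the matching models/<name>/1/ directory through the $HOME/Triton/ensemble paths hard-coded above (sci_2.py, tf.py and xgb.py follow below). A hypothetical driver that runs them in that order is sketched here; the script list and the use of sys.executable are assumptions for illustration, not part of the repository.

# Hypothetical driver (not part of the repo): populate every models/<name>/1/
# directory before starting the Triton server with this model repository.
import subprocess
import sys

steps = [
    "src/generate/generate.py",  # writes X.data.npy / Y.data.npy
    "src/sci_1/sci_1.py",        # RandomForestClassifier -> sci_1.pkl + checkpoint.tl
    "src/sci_2/sci_2.py",        # ExtraTreesClassifier   -> sci_2.pkl + checkpoint.tl
    "src/tf/tf.py",              # Keras model            -> model.savedmodel
    "src/xgb/xgb.py",            # XGBClassifier          -> xgboost model file
]
for script in steps:
    subprocess.run([sys.executable, script], check=True)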
-------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/sci_2/sci_2.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import os 5 | 6 | 7 | from sklearn.ensemble import ExtraTreesClassifier 8 | from numpy import mean 9 | from numpy import std 10 | from sklearn.datasets import make_classification 11 | from sklearn.model_selection import cross_val_score 12 | from sklearn.model_selection import RepeatedStratifiedKFold 13 | from sklearn.model_selection import train_test_split 14 | 15 | import pickle 16 | 17 | import subprocess 18 | 19 | 20 | seed = 7 21 | features = 4 22 | samples = 1000 23 | 24 | data_path = os.environ['HOME'] + "/Triton/ensemble/data/" 25 | X_data = data_path + 'X.data.npy' 26 | Y_data = data_path + 'Y.data.npy' 27 | 28 | if (not os.path.exists(X_data)): 29 | print("Please run src/generate.py to create dummy data for modes") 30 | else: 31 | X = np.load(X_data) 32 | Y = np.load(Y_data) 33 | 34 | print("shape X " + str(X.shape)) 35 | print("shape Y " + str(Y.shape)) 36 | 37 | model_path = os.environ['HOME'] + "/Triton/ensemble/models/sci_2/1" 38 | 39 | test_size = 0.33 40 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 41 | 42 | 43 | model = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0) 44 | model.fit(X_train, y_train) 45 | ### print(model.predict([[0, 0, 0, 0]])) 46 | 47 | cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1) 48 | n_scores = cross_val_score(model, X_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise') 49 | 50 | print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores))) 51 | 52 | pickle.dump(model, open(model_path + "/sci_2.pkl", 'wb')) 53 | 54 | subprocess.run(["{}/Triton/ensemble/fil_backend/scripts/convert_sklearn".format(os.environ['HOME']), model_path + "/sci_2.pkl"]) 55 | 56 | 57 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/test/combine_01.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy 4 | import tritonclient.http as triton_http 5 | 6 | # Set up both HTTP and GRPC clients. Note that the GRPC client is generally 7 | # somewhat faster. 
8 | http_client = triton_http.InferenceServerClient( 9 | url='localhost:8000', 10 | verbose=False, 11 | concurrency=5 12 | ) 13 | 14 | # Generate example data to classify 15 | features = 1 16 | samples = 1 17 | data = numpy.random.rand(features).astype('float32') 18 | tf_data = numpy.random.rand(samples, features).astype('float32') 19 | 20 | print("data.shape:" + str(data.shape)) 21 | 22 | # Set up Triton input and output objects for HTTP 23 | inputs = [] 24 | triton_input_http_1 = triton_http.InferInput( 25 | 'xgb_class', 26 | [features], 27 | 'FP32' 28 | ) 29 | triton_input_http_1.set_data_from_numpy(data, binary_data=True) 30 | inputs.append(triton_input_http_1) 31 | 32 | triton_input_http_2 = triton_http.InferInput( 33 | 'tf_class', 34 | [samples, features], 35 | 'FP32' 36 | ) 37 | triton_input_http_2.set_data_from_numpy(tf_data, binary_data=True) 38 | inputs.append(triton_input_http_2) 39 | 40 | triton_input_http_3 = triton_http.InferInput( 41 | 'sci_1_class', 42 | [features], 43 | 'FP32' 44 | ) 45 | triton_input_http_3.set_data_from_numpy(data, binary_data=True) 46 | inputs.append(triton_input_http_3) 47 | 48 | triton_input_http_4 = triton_http.InferInput( 49 | 'sci_2_class', 50 | [features], 51 | 'FP32' 52 | ) 53 | triton_input_http_4.set_data_from_numpy(data, binary_data=True) 54 | inputs.append(triton_input_http_4) 55 | 56 | triton_output_http = triton_http.InferRequestedOutput( 57 | 'OUTPUT0', 58 | binary_data=True 59 | ) 60 | 61 | 62 | # Submit inference request 63 | request_http = http_client.infer( 64 | 'combine', 65 | model_version='1', 66 | inputs=inputs, 67 | outputs=[triton_output_http] 68 | ) 69 | 70 | # Get results as numpy arrays 71 | result_http = request_http.as_numpy('OUTPUT0') 72 | 73 | print(result_http) 74 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/test/ensemble_01.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy 4 | import tritonclient.http as triton_http 5 | 6 | http_client = triton_http.InferenceServerClient( 7 | url='localhost:8000', 8 | verbose=False, 9 | concurrency=5 10 | ) 11 | 12 | features = 4 13 | samples = 1 14 | data = numpy.random.rand(samples, features).astype('float32') 15 | 16 | triton_input_http = triton_http.InferInput( 17 | 'INPUT0', 18 | (samples, features), 19 | 'FP32' 20 | ) 21 | triton_input_http.set_data_from_numpy(data, binary_data=True) 22 | 23 | triton_output_http = triton_http.InferRequestedOutput( 24 | 'OUTPUT0', 25 | binary_data=True 26 | ) 27 | 28 | 29 | request_http = http_client.infer( 30 | 'ensemble', 31 | model_version='1', 32 | inputs=[triton_input_http], 33 | outputs=[triton_output_http] 34 | ) 35 | 36 | result_http = request_http.as_numpy('OUTPUT0') 37 | 38 | print(result_http) 39 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/test/mux.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import tritonclient.http as triton_http 5 | 6 | # Set up HTTP client. 
7 | http_client = triton_http.InferenceServerClient( 8 | url = 'localhost:8000', 9 | verbose = False, 10 | concurrency = 5 11 | ) 12 | 13 | features = 4 14 | samples = 1 15 | data = np.random.rand(samples, features).astype('float32') 16 | 17 | 18 | triton_input_http = triton_http.InferInput( 19 | 'mux_in', 20 | (samples, features), 21 | 'FP32' 22 | ) 23 | triton_input_http.set_data_from_numpy(data, binary_data=True) 24 | 25 | 26 | # Set up Triton input and output objects for HTTP 27 | outputs = [] 28 | 29 | triton_output_http_1 = triton_http.InferRequestedOutput( 30 | 'mux_xgb_out', 31 | binary_data = True 32 | ) 33 | outputs.append(triton_output_http_1) 34 | 35 | triton_output_http_2 = triton_http.InferRequestedOutput( 36 | 'mux_tf_out', 37 | binary_data = True 38 | ) 39 | outputs.append(triton_output_http_2) 40 | 41 | triton_output_http_3 = triton_http.InferRequestedOutput( 42 | 'mux_sci_1_out', 43 | binary_data = True 44 | ) 45 | outputs.append(triton_output_http_3) 46 | 47 | triton_output_http_4 = triton_http.InferRequestedOutput( 48 | 'mux_sci_2_out', 49 | binary_data = True 50 | ) 51 | outputs.append(triton_output_http_4) 52 | 53 | # Submit inference request 54 | request_http = http_client.infer( 55 | 'mux', 56 | model_version = '1', 57 | inputs = [triton_input_http], 58 | outputs = outputs 59 | ) 60 | 61 | # Get results as numpy arrays 62 | 63 | results = {} 64 | for model in ["mux_xgb_out", "mux_tf_out", "mux_sci_1_out", "mux_sci_2_out"]: 65 | results[model] = request_http.as_numpy(model) 66 | 67 | for model in ["mux_xgb_out", "mux_tf_out", "mux_sci_1_out", "mux_sci_2_out"]: 68 | print(model + " " + str(results[model]) + '\n') 69 | 70 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/test/sci_1.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy 4 | import tritonclient.http as triton_http 5 | 6 | # Set up both HTTP and GRPC clients. Note that the GRPC client is generally 7 | # somewhat faster. 8 | http_client = triton_http.InferenceServerClient( 9 | url='localhost:8000', 10 | verbose=False, 11 | concurrency=5 12 | ) 13 | 14 | # Generate example data to classify 15 | features = 4 16 | samples = 1 17 | data = numpy.random.rand(samples, features).astype('float32') 18 | 19 | # Set up Triton input and output objects for HTTP 20 | triton_input_http = triton_http.InferInput( 21 | 'input__0', 22 | (samples, features), 23 | 'FP32' 24 | ) 25 | triton_input_http.set_data_from_numpy(data, binary_data=True) 26 | 27 | triton_output_http = triton_http.InferRequestedOutput( 28 | 'output__0', 29 | binary_data=True 30 | ) 31 | 32 | 33 | # Submit inference request 34 | request_http = http_client.infer( 35 | 'sci_1', 36 | model_version='1', 37 | inputs=[triton_input_http], 38 | outputs=[triton_output_http] 39 | ) 40 | 41 | # Get results as numpy arrays 42 | result_http = request_http.as_numpy('output__0') 43 | 44 | print(result_http) 45 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/test/sci_2.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy 4 | import tritonclient.http as triton_http 5 | 6 | # Set up both HTTP and GRPC clients. Note that the GRPC client is generally 7 | # somewhat faster. 
8 | http_client = triton_http.InferenceServerClient( 9 | url='localhost:8000', 10 | verbose=False, 11 | concurrency=5 12 | ) 13 | 14 | # Generate example data to classify 15 | features = 4 16 | samples = 1 17 | data = numpy.random.rand(samples, features).astype('float32') 18 | 19 | # Set up Triton input and output objects for HTTP 20 | triton_input_http = triton_http.InferInput( 21 | 'input__0', 22 | (samples, features), 23 | 'FP32' 24 | ) 25 | triton_input_http.set_data_from_numpy(data, binary_data=True) 26 | 27 | triton_output_http = triton_http.InferRequestedOutput( 28 | 'output__0', 29 | binary_data=True 30 | ) 31 | 32 | 33 | # Submit inference request 34 | request_http = http_client.infer( 35 | 'sci_2', 36 | model_version='1', 37 | inputs=[triton_input_http], 38 | outputs=[triton_output_http] 39 | ) 40 | 41 | # Get results as numpy arrays 42 | result_http = request_http.as_numpy('output__0') 43 | 44 | print(result_http) 45 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/test/tf_01.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy 4 | import tritonclient.http as triton_http 5 | 6 | # Set up both HTTP and GRPC clients. Note that the GRPC client is generally 7 | # somewhat faster. 8 | http_client = triton_http.InferenceServerClient( 9 | url='localhost:8000', 10 | verbose=False, 11 | concurrency=5 12 | ) 13 | 14 | # Generate example data to classify 15 | features = 4 16 | samples = 1 17 | data = numpy.random.rand(samples, features).astype('float32') 18 | 19 | # Set up Triton input and output objects for HTTP 20 | triton_input_http = triton_http.InferInput( 21 | 'dense_input', 22 | (samples, features), 23 | 'FP32' 24 | ) 25 | triton_input_http.set_data_from_numpy(data, binary_data=True) 26 | 27 | triton_output_http = triton_http.InferRequestedOutput( 28 | 'round', 29 | binary_data=True 30 | ) 31 | 32 | 33 | # Submit inference request 34 | request_http = http_client.infer( 35 | 'tf', 36 | model_version='1', 37 | inputs=[triton_input_http], 38 | outputs=[triton_output_http] 39 | ) 40 | 41 | # Get results as numpy arrays 42 | result_http = request_http.as_numpy('round') 43 | 44 | print(result_http) 45 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/test/xgb_01.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy 3 | import tritonclient.http as triton_http 4 | 5 | # Set up both HTTP and GRPC clients. Note that the GRPC client is generally 6 | # somewhat faster. 
7 | http_client = triton_http.InferenceServerClient( 8 | url='localhost:8000', 9 | verbose=False, 10 | concurrency=5 11 | ) 12 | 13 | # Generate example data to classify 14 | features = 4 15 | samples = 1 16 | data = numpy.random.rand(samples, features).astype('float32') 17 | 18 | # Set up Triton input and output objects for HTTP 19 | triton_input_http = triton_http.InferInput( 20 | 'input__0', 21 | (samples, features), 22 | 'FP32' 23 | ) 24 | triton_input_http.set_data_from_numpy(data, binary_data=True) 25 | 26 | triton_output_http = triton_http.InferRequestedOutput( 27 | 'output__0', 28 | binary_data=True 29 | ) 30 | 31 | 32 | # Submit inference request 33 | request_http = http_client.infer( 34 | 'xgb', 35 | model_version='1', 36 | inputs=[triton_input_http], 37 | outputs=[triton_output_http] 38 | ) 39 | 40 | # Get results as numpy arrays 41 | result_http = request_http.as_numpy('output__0') 42 | 43 | print(result_http) 44 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/tf/tf.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import os 5 | 6 | import tensorflow as tf 7 | import tensorflow.compat.v2.feature_column as fc 8 | 9 | from sklearn.model_selection import train_test_split 10 | 11 | 12 | class Round(tf.keras.layers.Layer): 13 | def __init__(self, num_outputs): 14 | super(Round, self).__init__() 15 | self.num_outputs = num_outputs 16 | 17 | def call(self, inputs): 18 | #print("inputs:" + str(inputs)) 19 | #outputs = inputs.__floordiv__(1.0) 20 | outputs = inputs 21 | return outputs 22 | 23 | 24 | 25 | seed = 7 26 | features = 4 27 | samples = 1000 28 | 29 | data_path = os.environ['HOME'] + "/Triton/ensemble/data/" 30 | X_data = data_path + 'X.data.npy' 31 | Y_data = data_path + 'Y.data.npy' 32 | 33 | if (not os.path.exists(X_data)): 34 | print("Please run src/generate.py to create dummy data for modes") 35 | else: 36 | X = np.load(X_data) 37 | Y = np.load(Y_data) 38 | 39 | print("s)hape X " + str(X.shape)) 40 | print("shape Y " + str(Y.shape)) 41 | 42 | model_path = os.environ['HOME'] + "/Triton/ensemble/models/tf/1" 43 | 44 | test_size = 0.33 45 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 46 | 47 | 48 | model = tf.keras.models.Sequential() 49 | model.add(tf.keras.layers.Dense(4, input_dim=4, activation='relu')) 50 | model.add(tf.keras.layers.Dense(1, activation='sigmoid')) 51 | model.add(Round(1)) 52 | model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy']) 53 | model.fit(X_train, y_train) 54 | 55 | test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2) 56 | 57 | print('\nTest accuracy:', test_acc) 58 | 59 | # creates /models/tf/model.tf/saved_model.pb 60 | tf.saved_model.save(model, model_path + "/model.savedmodel") 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/tf/tf.py_0: -------------------------------------------------------------------------------- 1 | #! 
/home/dfisk/miniconda3/envs/ensemble/bin/python 2 | 3 | import numpy as np 4 | import os 5 | 6 | import tensorflow as tf 7 | from sklearn.model_selection import train_test_split 8 | 9 | seed = 7 10 | features = 4 11 | samples = 1000 12 | 13 | data_path = os.environ['HOME'] + "/Triton/ensemble/data/" 14 | X_data = data_path + 'X.data.npy' 15 | Y_data = data_path + 'Y.data.npy' 16 | 17 | if (not os.path.exists(X_data)): 18 | print("Please run src/generate.py to create dummy data for modes") 19 | else: 20 | X = np.load(X_data) 21 | Y = np.load(Y_data) 22 | 23 | print("shape X " + str(X.shape)) 24 | print("shape Y " + str(Y.shape)) 25 | 26 | model_path = os.environ['HOME'] + "/Triton/ensemble/models/tf/1" 27 | 28 | test_size = 0.33 29 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 30 | 31 | 32 | class_names = ['Zero', 'One'] 33 | 34 | model = tf.keras.Sequential([ 35 | tf.keras.layers.Dense(128, activation='relu'), 36 | tf.keras.layers.Dense(10) 37 | ]) 38 | 39 | 40 | model.compile(optimizer='adam', 41 | loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 42 | metrics=['accuracy']) 43 | 44 | 45 | 46 | model.fit(X_train, y_train, epochs=10) 47 | 48 | 49 | test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2) 50 | 51 | print('\nTest accuracy:', test_acc) 52 | 53 | # creates /models/tf/model.tf/saved_model.pb 54 | tf.saved_model.save(model, model_path + "model.savedmodel") 55 | 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /10-serving-ensemble-triton/src/xgb/xgb.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import os 5 | 6 | from xgboost import XGBClassifier 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.metrics import accuracy_score 9 | 10 | seed = 7 11 | features = 4 12 | samples = 1000 13 | 14 | data_path = os.environ['HOME'] + "/Triton/ensemble/data/" 15 | X_data = data_path + 'X.data.npy' 16 | Y_data = data_path + 'Y.data.npy' 17 | 18 | if (not os.path.exists(X_data)): 19 | print("Please run src/generate.py to create dummy data for modes") 20 | else: 21 | X = np.load(X_data) 22 | Y = np.load(Y_data) 23 | 24 | model_path = os.environ['HOME'] + "/Triton/ensemble/models/xgb/1" 25 | 26 | print("shape X " + str(X.shape)) 27 | print("shape Y " + str(Y.shape)) 28 | 29 | test_size = 0.33 30 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed) 31 | 32 | model = XGBClassifier() 33 | model.fit(X_train, y_train) 34 | y_pred = model.predict(X_test) 35 | accuracy = accuracy_score(y_test, y_pred) 36 | print("Test Accuracy: {:.2f}".format(accuracy * 100.0)) 37 | 38 | ### .save_config() 39 | ### print("config:") 40 | ### print(config) 41 | 42 | ###model.save_model(model_path + "/xgboost.json") 43 | 44 | model.save_model(model_path + "/xgboost.model") 45 | -------------------------------------------------------------------------------- /11-pytorch-on-tpu-vertex-ai/pytorch-on-vertex-ai-tpu-train-mnist.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "63ecb70a-c170-4721-a895-c67cf1ff873a", 6 | "metadata": {}, 7 | "source": [ 8 | "# Training MNIST with PyTorch on TPU-VM using Vertex AI" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "c473acbc-9370-4de8-ad50-f4471495db2f", 14 | "metadata": {}, 15 | 
"source": [ 16 | "# Imports and initialization" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "id": "4dae2be6-e296-45e5-9021-c116a96e72d2", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "! pip -q install google-cloud-aiplatform" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "7b78a856-4264-40b9-a71c-e8acb6b197f3", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "from datetime import datetime\n", 37 | "from google.cloud import aiplatform" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "id": "a9d711f4-8b33-467b-bccc-c7d501ffcfd5", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "PROJECT_ID = 'rthallam-demo-project'\n", 48 | "BUCKET_NAME = \"cloud-ai-platform-2f444b6a-a742-444b-b91a-c7519f51bd77\"\n", 49 | "BUCKET_URI = f'gs://{BUCKET_NAME}'\n", 50 | "REGION = 'us-central1'" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "249ce760-396b-43c8-9022-d9551ba6c12e", 56 | "metadata": {}, 57 | "source": [ 58 | "## Create Training Script" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "ad233804-8b02-4249-bd45-77531f589a46", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "%%writefile train.py\n", 69 | "\n", 70 | "# adapted from https://github.com/pytorch/xla/blob/master/test/test_train_mp_mnist.py\n", 71 | "\n", 72 | "import args_parse\n", 73 | "\n", 74 | "FLAGS = args_parse.parse_common_options(\n", 75 | " datadir='/tmp/mnist-data',\n", 76 | " batch_size=128,\n", 77 | " momentum=0.5,\n", 78 | " lr=0.01,\n", 79 | " target_accuracy=98.0,\n", 80 | " num_epochs=18)\n", 81 | "\n", 82 | "import os\n", 83 | "import shutil\n", 84 | "import sys\n", 85 | "import numpy as np\n", 86 | "import torch\n", 87 | "import torch.nn as nn\n", 88 | "import torch.nn.functional as F\n", 89 | "import torch.optim as optim\n", 90 | "from torchvision import datasets, transforms\n", 91 | "import torch_xla\n", 92 | "import torch_xla.debug.metrics as met\n", 93 | "import torch_xla.distributed.parallel_loader as pl\n", 94 | "import torch_xla.utils.utils as xu\n", 95 | "import torch_xla.core.xla_model as xm\n", 96 | "import torch_xla.distributed.xla_multiprocessing as xmp\n", 97 | "import torch_xla.test.test_utils as test_utils\n", 98 | "\n", 99 | "\n", 100 | "class MNIST(nn.Module):\n", 101 | "\n", 102 | " def __init__(self):\n", 103 | " super(MNIST, self).__init__()\n", 104 | " self.conv1 = nn.Conv2d(1, 10, kernel_size=5)\n", 105 | " self.bn1 = nn.BatchNorm2d(10)\n", 106 | " self.conv2 = nn.Conv2d(10, 20, kernel_size=5)\n", 107 | " self.bn2 = nn.BatchNorm2d(20)\n", 108 | " self.fc1 = nn.Linear(320, 50)\n", 109 | " self.fc2 = nn.Linear(50, 10)\n", 110 | "\n", 111 | " def forward(self, x):\n", 112 | " x = F.relu(F.max_pool2d(self.conv1(x), 2))\n", 113 | " x = self.bn1(x)\n", 114 | " x = F.relu(F.max_pool2d(self.conv2(x), 2))\n", 115 | " x = self.bn2(x)\n", 116 | " x = torch.flatten(x, 1)\n", 117 | " x = F.relu(self.fc1(x))\n", 118 | " x = self.fc2(x)\n", 119 | " return F.log_softmax(x, dim=1)\n", 120 | "\n", 121 | "\n", 122 | "def _train_update(device, x, loss, tracker, writer):\n", 123 | " test_utils.print_training_update(\n", 124 | " device,\n", 125 | " x,\n", 126 | " loss.item(),\n", 127 | " tracker.rate(),\n", 128 | " tracker.global_rate(),\n", 129 | " summary_writer=writer)\n", 130 | "\n", 131 | "\n", 132 | "def train_mnist(flags, **kwargs):\n", 133 | " torch.manual_seed(1)\n", 134 | "\n", 135 | " 
if flags.fake_data:\n", 136 | " train_loader = xu.SampleGenerator(\n", 137 | " data=(torch.zeros(flags.batch_size, 1, 28,\n", 138 | " 28), torch.zeros(flags.batch_size,\n", 139 | " dtype=torch.int64)),\n", 140 | " sample_count=60000 // flags.batch_size // xm.xrt_world_size())\n", 141 | " test_loader = xu.SampleGenerator(\n", 142 | " data=(torch.zeros(flags.batch_size, 1, 28,\n", 143 | " 28), torch.zeros(flags.batch_size,\n", 144 | " dtype=torch.int64)),\n", 145 | " sample_count=10000 // flags.batch_size // xm.xrt_world_size())\n", 146 | " else:\n", 147 | " train_dataset = datasets.MNIST(\n", 148 | " os.path.join(flags.datadir, str(xm.get_ordinal())),\n", 149 | " train=True,\n", 150 | " download=True,\n", 151 | " transform=transforms.Compose(\n", 152 | " [transforms.ToTensor(),\n", 153 | " transforms.Normalize((0.1307,), (0.3081,))]))\n", 154 | " test_dataset = datasets.MNIST(\n", 155 | " os.path.join(flags.datadir, str(xm.get_ordinal())),\n", 156 | " train=False,\n", 157 | " download=True,\n", 158 | " transform=transforms.Compose(\n", 159 | " [transforms.ToTensor(),\n", 160 | " transforms.Normalize((0.1307,), (0.3081,))]))\n", 161 | " train_sampler = None\n", 162 | " if xm.xrt_world_size() > 1:\n", 163 | " train_sampler = torch.utils.data.distributed.DistributedSampler(\n", 164 | " train_dataset,\n", 165 | " num_replicas=xm.xrt_world_size(),\n", 166 | " rank=xm.get_ordinal(),\n", 167 | " shuffle=True)\n", 168 | " train_loader = torch.utils.data.DataLoader(\n", 169 | " train_dataset,\n", 170 | " batch_size=flags.batch_size,\n", 171 | " sampler=train_sampler,\n", 172 | " drop_last=flags.drop_last,\n", 173 | " shuffle=False if train_sampler else True,\n", 174 | " num_workers=flags.num_workers)\n", 175 | " test_loader = torch.utils.data.DataLoader(\n", 176 | " test_dataset,\n", 177 | " batch_size=flags.batch_size,\n", 178 | " drop_last=flags.drop_last,\n", 179 | " shuffle=False,\n", 180 | " num_workers=flags.num_workers)\n", 181 | "\n", 182 | " # Scale learning rate to num cores\n", 183 | " lr = flags.lr * xm.xrt_world_size()\n", 184 | "\n", 185 | " device = xm.xla_device()\n", 186 | " model = MNIST().to(device)\n", 187 | " writer = None\n", 188 | " if xm.is_master_ordinal():\n", 189 | " writer = test_utils.get_summary_writer(flags.logdir)\n", 190 | " optimizer = optim.SGD(model.parameters(), lr=lr, momentum=flags.momentum)\n", 191 | " loss_fn = nn.NLLLoss()\n", 192 | "\n", 193 | " def train_loop_fn(loader):\n", 194 | " tracker = xm.RateTracker()\n", 195 | " model.train()\n", 196 | " for step, (data, target) in enumerate(loader):\n", 197 | " optimizer.zero_grad()\n", 198 | " output = model(data)\n", 199 | " loss = loss_fn(output, target)\n", 200 | " loss.backward()\n", 201 | " xm.optimizer_step(optimizer)\n", 202 | " tracker.add(flags.batch_size)\n", 203 | " if step % flags.log_steps == 0:\n", 204 | " xm.add_step_closure(\n", 205 | " _train_update,\n", 206 | " args=(device, step, loss, tracker, writer),\n", 207 | " run_async=FLAGS.async_closures)\n", 208 | "\n", 209 | " def test_loop_fn(loader):\n", 210 | " total_samples = 0\n", 211 | " correct = 0\n", 212 | " model.eval()\n", 213 | " for data, target in loader:\n", 214 | " output = model(data)\n", 215 | " pred = output.max(1, keepdim=True)[1]\n", 216 | " correct += pred.eq(target.view_as(pred)).sum()\n", 217 | " total_samples += data.size()[0]\n", 218 | "\n", 219 | " accuracy = 100.0 * correct.item() / total_samples\n", 220 | " accuracy = xm.mesh_reduce('test_accuracy', accuracy, np.mean)\n", 221 | " return accuracy\n", 222 | "\n", 223 | " 
train_device_loader = pl.MpDeviceLoader(train_loader, device)\n", 224 | " test_device_loader = pl.MpDeviceLoader(test_loader, device)\n", 225 | " accuracy, max_accuracy = 0.0, 0.0\n", 226 | " for epoch in range(1, flags.num_epochs + 1):\n", 227 | " xm.master_print('Epoch {} train begin {}'.format(epoch, test_utils.now()))\n", 228 | " train_loop_fn(train_device_loader)\n", 229 | " xm.master_print('Epoch {} train end {}'.format(epoch, test_utils.now()))\n", 230 | "\n", 231 | " accuracy = test_loop_fn(test_device_loader)\n", 232 | " xm.master_print('Epoch {} test end {}, Accuracy={:.2f}'.format(\n", 233 | " epoch, test_utils.now(), accuracy))\n", 234 | " max_accuracy = max(accuracy, max_accuracy)\n", 235 | " test_utils.write_to_summary(\n", 236 | " writer,\n", 237 | " epoch,\n", 238 | " dict_to_write={'Accuracy/test': accuracy},\n", 239 | " write_xla_metrics=True)\n", 240 | " if flags.metrics_debug:\n", 241 | " xm.master_print(met.metrics_report())\n", 242 | "\n", 243 | " test_utils.close_summary_writer(writer)\n", 244 | " xm.master_print('Max Accuracy: {:.2f}%'.format(max_accuracy))\n", 245 | " return max_accuracy\n", 246 | "\n", 247 | "\n", 248 | "def _mp_fn(index, flags):\n", 249 | " torch.set_default_tensor_type('torch.FloatTensor')\n", 250 | " accuracy = train_mnist(flags)\n", 251 | " if flags.tidy and os.path.isdir(flags.datadir):\n", 252 | " shutil.rmtree(flags.datadir)\n", 253 | " if accuracy < flags.target_accuracy:\n", 254 | " print('Accuracy {} is below target {}'.format(accuracy,\n", 255 | " flags.target_accuracy))\n", 256 | " sys.exit(21)\n", 257 | "\n", 258 | "\n", 259 | "if __name__ == '__main__':\n", 260 | " xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=FLAGS.num_cores)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "id": "c3aeb216-d3c7-4c8b-8e8f-9cc9a32db2ac", 266 | "metadata": {}, 267 | "source": [ 268 | "## Build custom container image with dependencies" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "id": "dfeb71de-039e-4b80-815f-a253d1070329", 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "%%writefile Dockerfile.pytorch-tpu-mnist\n", 279 | "\n", 280 | "FROM gcr.io/tpu-pytorch/xla:r1.12_3.8_tpuvm\n", 281 | "\n", 282 | "RUN pip install https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/wheels/libtpu-nightly/libtpu_nightly-0.1.dev20221020-py3-none-any.whl\n", 283 | "\n", 284 | "WORKDIR /\n", 285 | "COPY train.py /" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "id": "9e806073-7c70-42ad-beea-615970e64203", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "# base container image name\n", 296 | "DOCKER_ARTIFACT_REPO = 'pytorch-on-tpu-vm'\n", 297 | "IMAGE_NAME = \"train-mnist\"\n", 298 | "# IMAGE_URI = f\"{REGION}-docker.pkg.dev/{PROJECT_ID}/{DOCKER_ARTIFACT_REPO}/{IMAGE_NAME}\"\n", 299 | "IMAGE_URI = f\"us.gcr.io/{PROJECT_ID}/{DOCKER_ARTIFACT_REPO}/{IMAGE_NAME}\"\n", 300 | "\n", 301 | "IMAGE_URI" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "id": "ac721c86-1e4b-4619-b967-11916d4a306a", 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "%%writefile cloudbuild.yaml\n", 312 | "\n", 313 | "steps:\n", 314 | "- name: 'gcr.io/cloud-builders/docker'\n", 315 | " args: ['build', '-t', '$_IMAGE_URI', '$_FILE_LOCATION', '-f', '$_FILE_LOCATION/Dockerfile.$_DOCKERNAME']\n", 316 | "images:\n", 317 | "- '$_IMAGE_URI'" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 
null, 323 | "id": "b095c2ee-1c89-4858-87b9-e5a258cac87f", 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "FILE_LOCATION = './'\n", 328 | "\n", 329 | "! gcloud builds submit \\\n", 330 | " --region $REGION \\\n", 331 | " --config src/cloudbuild.yaml \\\n", 332 | " --substitutions _DOCKERNAME=\"pytorch-tpu-mnist\",_IMAGE_URI=$IMAGE_URI,_FILE_LOCATION=$FILE_LOCATION \\\n", 333 | " --timeout \"2h\" \\\n", 334 | " --machine-type=e2-highcpu-32 \\\n", 335 | " --quiet" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "id": "7154d3d8-7e55-4d20-9605-a4572b9481bb", 341 | "metadata": {}, 342 | "source": [ 343 | "## Submit training job" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "id": "e4d498f0-0fa3-4a92-a064-ed67d12692b0", 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "# initialize Vertex AI SDK\n", 354 | "aiplatform.init(project=PROJECT_ID, location=REGION)" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "id": "ccd667b6-ac74-457e-8efb-78606f4ef2e0", 360 | "metadata": {}, 361 | "source": [ 362 | "### Using CustomJob" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "id": "5d69934c-b16a-4605-933a-ec84b97d36c3", 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "TIMESTAMP = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", 373 | "APP_NAME = \"pytorch-train-mnist-tpu\"\n", 374 | "JOB_NAME = f\"{APP_NAME}-{TIMESTAMP}\"\n", 375 | "print(f\"JOB_NAME = {JOB_NAME}\")" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "id": "3e95db28-c233-439f-9538-f0214cf158df", 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "# define worker pool specs\n", 386 | "worker_pool_specs = [\n", 387 | " {\n", 388 | " \"machine_spec\": {\n", 389 | " \"machine_type\": \"cloud-tpu\",\n", 390 | " \"accelerator_type\": \"TPU_V2\",\n", 391 | " \"accelerator_count\": 8,\n", 392 | " },\n", 393 | " \"replica_count\": 1,\n", 394 | " \"container_spec\": {\n", 395 | " \"image_uri\": IMAGE_URI,\n", 396 | " \"command\": [\"python3\", \"/train.py\"],\n", 397 | " \"args\": [],\n", 398 | " \"env\": [\n", 399 | " {\n", 400 | " \"name\": \"XRT_TPU_CONFIG\",\n", 401 | " \"value\": \"localservice;0;localhost:51011\"\n", 402 | " }\n", 403 | " ]\n", 404 | " },\n", 405 | " }\n", 406 | "]" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "id": "b5bfd12b-2bd8-456b-ad2d-cb92bc69c7da", 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "# create custom job\n", 417 | "job = aiplatform.CustomJob(\n", 418 | " display_name=JOB_NAME,\n", 419 | " worker_pool_specs=worker_pool_specs,\n", 420 | " staging_bucket=BUCKET_URI\n", 421 | ")" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "id": "ceda0b19-0fc8-414c-a03b-ccd17f23a3b9", 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "# run the job\n", 432 | "job_response = job.run()" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "id": "517bc61e-a7d0-4c27-8ef7-18ed2cde8dc6", 438 | "metadata": {}, 439 | "source": [ 440 | "### Using CustomContainerTrainingJob" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "id": "fd6d3a59-564c-4eef-9b5a-81007df4059f", 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "TIMESTAMP = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", 451 | "APP_NAME = \"pytorch-train-mnist-tpu\"\n", 452 | 
"JOB_NAME = f\"{APP_NAME}-{TIMESTAMP}\"\n", 453 | "print(f\"JOB_NAME = {JOB_NAME}\")" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "id": "076e7b48-1a27-4e03-9d92-8981cc213fd1", 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "# configure the job with container image spec\n", 464 | "job = aiplatform.CustomContainerTrainingJob(\n", 465 | " display_name=JOB_NAME, \n", 466 | " container_uri=IMAGE_URI,\n", 467 | " command=[\"python3\", \"/train.py\"],\n", 468 | " staging_bucket=BUCKET_URI\n", 469 | ")" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "id": "e25bc7b2-5382-4d1b-83f0-d70fad83d0c0", 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "# run the job\n", 480 | "job_response = job.run(\n", 481 | " replica_count=1,\n", 482 | " machine_type='cloud-tpu',\n", 483 | " accelerator_type='TPU_V2',\n", 484 | " accelerator_count=8,\n", 485 | " base_output_dir=f'{BUCKET_URI}/tpu-experiments/{APP_NAME}/'\n", 486 | ")" 487 | ] 488 | } 489 | ], 490 | "metadata": { 491 | "environment": { 492 | "kernel": "tpu-gke", 493 | "name": "tf2-gpu.2-7.m87", 494 | "type": "gcloud", 495 | "uri": "gcr.io/deeplearning-platform-release/tf2-gpu.2-7:m87" 496 | }, 497 | "kernelspec": { 498 | "display_name": "tpu-gke", 499 | "language": "python", 500 | "name": "tpu-gke" 501 | }, 502 | "language_info": { 503 | "codemirror_mode": { 504 | "name": "ipython", 505 | "version": 3 506 | }, 507 | "file_extension": ".py", 508 | "mimetype": "text/x-python", 509 | "name": "python", 510 | "nbconvert_exporter": "python", 511 | "pygments_lexer": "ipython3", 512 | "version": "3.9.13" 513 | } 514 | }, 515 | "nbformat": 4, 516 | "nbformat_minor": 5 517 | } 518 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Vertex AI Labs 2 | --- 3 | 4 | **Hands-on labs introducing GCP Vertex AI features** 5 | 6 | These labs introduce the following components of Vertex AI: 7 | 8 | - Vertex Notebooks 9 | - Vertex AI Training 10 | - Using pre-built and custom containers 11 | - Hyperparameter tuning 12 | - Distributed Training 13 | - Vertex AI Predictions 14 | - Using pre-built and custom containers 15 | - Vertex Tensorboard 16 | - Vertex ML Metadata 17 | 18 | ![Labs Focus](./images/vertex-ai-labs-focus.png) 19 | 20 | 21 | ## Environment Setup 22 | 23 | The following section describes the GCP environment required for the workshop. Note that we have provided example [Terraform](https://www.terraform.io/) scripts to automate the process. You can find the scripts and the instructions in the `00-env-setup` folder. These are prerequisites for running the labs. 24 | 25 | ### GCP Project 26 | 27 | Each participant should have their own GCP project (through Qwiklabs) with project owner permissions to complete the setup steps. 28 | 29 | The setup performs the following tasks: 30 | 31 | 1. Activate the Google Cloud APIs required for the labs. 32 | 2. Create the service accounts required for running the labs. 33 | 3. Create a Google Cloud Storage bucket in the configured region (we will be using `us-central1`). 34 | 4. Create a Vertex Notebooks instance to provision a managed JupyterLab notebook instance. 35 | 5. Create a Vertex Tensorboard instance to monitor the experiments run as part of the lab. 36 | 37 | --- 38 | 39 | Please navigate to [00-env-setup](./00-env-setup/README.md) to set up the environment. 40 | 41 | --- 42 | 43 | The following are the details of the setup required to run the labs: 44 | 45 | ### Cloud APIs 46 | 47 | The following APIs need to be enabled in the project: 48 | 49 | - compute.googleapis.com 50 | - iam.googleapis.com 51 | - container.googleapis.com 52 | - artifactregistry.googleapis.com 53 | - cloudresourcemanager.googleapis.com 54 | - cloudtrace.googleapis.com 55 | - iamcredentials.googleapis.com 56 | - monitoring.googleapis.com 57 | - logging.googleapis.com 58 | - notebooks.googleapis.com 59 | - aiplatform.googleapis.com 60 | - dataflow.googleapis.com 61 | - bigquery.googleapis.com 62 | - cloudbuild.googleapis.com 63 | - bigquerydatatransfer.googleapis.com 64 | 65 | ### GCP Region 66 | 67 | Note that some services used in the notebooks are only available in a limited number of regions. We recommend using `us-central1`. 68 | 69 | ### Service accounts 70 | 71 | Two service accounts must be created in the project. 72 | 73 | #### Vertex AI training service account 74 | 75 | This account will be used by the Vertex Training service. The account needs the following roles: 76 | 77 | - storage.admin 78 | - aiplatform.user 79 | - bigquery.admin 80 | 81 | The account email should be 82 | 83 | `training-sa@{PROJECT_ID}.iam.gserviceaccount.com` 84 | 85 | #### Vertex AI pipelines service account 86 | 87 | This account will be used by the Vertex Pipelines service. The account needs the following roles: 88 | 89 | - storage.admin 90 | - aiplatform.user 91 | - bigquery.admin 92 | 93 | The account email should be 94 | 95 | `pipelines-sa@{PROJECT_ID}.iam.gserviceaccount.com` 96 | 97 | ### GCS buckets 98 | 99 | Each participant should have their own regional GCS bucket. The bucket should be created in the GCP region that will be used during the workshop. The bucket name should use the following naming convention: 100 | 101 | `gs://{PREFIX}-bucket` 102 | 103 | The goal of the prefix is to avoid conflicts between participants, so it should be unique for each participant. **The prefix should start with a letter and include only letters and digits.** 104 | 105 | The workshop notebooks assume this naming convention.
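If you are not using the provided Terraform scripts, a bucket following this convention can also be created by hand. The command below is a minimal sketch, assuming `PROJECT_ID`, `REGION`, and `PREFIX` are already set in your shell:

```
# Sketch: create the regional workshop bucket by hand
# (assumes PROJECT_ID, REGION and PREFIX are already set, e.g. PREFIX=alice123)
gsutil mb -p $PROJECT_ID -l $REGION gs://$PREFIX-bucket
```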
106 | 107 | 108 | ### Vertex AI Notebook 109 | 110 | Each participant should have an instance of Vertex AI Notebooks. The instances can be pre-created or created during the workshop. 111 | 112 | The instance should be configured as follows: 113 | 114 | - Machine type: **n1-standard-4** 115 | - Optionally, GPUs can be added to the machine configuration if participants want to experiment with GPUs 116 | - Image family: **tf-2-4-cpu** or **tf-2-4-cu110** (if using GPUs) 117 | - Configured with the default Compute Engine service account 118 | 119 | #### Vertex AI Notebook setup 120 | 121 | The following setup steps will be performed during the workshop, individually by each participant. 122 | 123 | In JupyterLab, open a terminal and: 124 | 125 | ##### Install the required Python packages 126 | 127 | ``` 128 | pip install --user google-cloud-aiplatform 129 | pip install --user kfp 130 | pip install --user google-cloud-pipeline-components 131 | pip install --user google-cloud-bigquery-datatransfer 132 | ``` 133 | 134 | ##### Vertex Tensorboard instance 135 | 136 | Each project will have its own Vertex Tensorboard instance created (by the script) in the configured region. 137 | 138 | You can get the Tensorboard instance names at any time by listing the Tensorboards in the project: 139 | 140 | ``` 141 | gcloud beta ai tensorboards list \ 142 |     --project $PROJECT --region $REGION 143 | ``` 144 | 145 | ##### Clone this repo 146 | ``` 147 | git clone https://github.com/RajeshThallam/vertex-ai-labs 148 | ``` 149 | 150 | 151 | ## References: 152 | 153 | - https://github.com/jarokaz/vertex-ai-workshop/ 154 | -------------------------------------------------------------------------------- /images/automl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/images/automl.png -------------------------------------------------------------------------------- /images/custom-tabular.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/images/custom-tabular.png -------------------------------------------------------------------------------- /images/custom-training-on-vertex-ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/images/custom-training-on-vertex-ai.png -------------------------------------------------------------------------------- /images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/images/pipeline.png -------------------------------------------------------------------------------- /images/serving-with-custom-containers-on-vertex-predictions.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/images/serving-with-custom-containers-on-vertex-predictions.png -------------------------------------------------------------------------------- /images/training-with-custom-containers-on-vertex-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/images/training-with-custom-containers-on-vertex-training.png -------------------------------------------------------------------------------- /images/vertex-ai-labs-focus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RajeshThallam/vertex-ai-labs/42f7fcc00c585087684b6581d0671d0691d47724/images/vertex-ai-labs-focus.png --------------------------------------------------------------------------------