├── test ├── setup │ ├── .gitignore │ ├── outputs.tf │ ├── variables.tf │ ├── versions.tf │ ├── iam.tf │ └── main.tf ├── .gitignore └── integration │ ├── discover_test.go │ ├── go.mod │ ├── analytics_lakehouse │ └── analytics_lakehouse_test.go │ └── go.sum ├── assets └── lakehouse-architecture.png ├── .github ├── renovate.json ├── release-please.yml ├── trusted-contribution.yml └── workflows │ ├── stale.yml │ ├── lint.yaml │ └── periodic-reporter.yaml ├── SECURITY.md ├── roles.txt ├── CODEOWNERS ├── examples └── analytics_lakehouse │ ├── variables.tf │ ├── main.tf │ ├── README.md │ ├── outputs.tf │ └── versions.tf ├── cloudbuild_mim.yaml ├── .gitIgnore ├── versions.tf ├── src ├── sql │ ├── sp_bigqueryml_model.sql │ ├── sp_sample_queries.sql │ ├── view_ecommerce.sql │ └── sp_lookerstudio_report.sql ├── python │ ├── bigquery.py │ └── bigtable.py ├── shell │ └── post_startup.sh ├── yaml │ ├── project-setup.yaml │ └── copy-data.yaml └── ipynb │ ├── exploratory-analysis.ipynb │ ├── spark_langchain.ipynb │ └── spark_ml.ipynb ├── metadata.display.yaml ├── variables.tf ├── outputs.tf ├── dataproc.tf ├── workbench.tf ├── Makefile ├── metadata.yaml ├── tutorial.md ├── README.md ├── deploy_solution.sh ├── deploy_via_trigger.sh ├── CONTRIBUTING.md ├── workflows.tf ├── bigquery.tf ├── main.tf ├── dataplex.tf ├── LICENSE └── CHANGELOG.md /test/setup/.gitignore: -------------------------------------------------------------------------------- 1 | terraform.tfvars 2 | source.sh 3 | -------------------------------------------------------------------------------- /assets/lakehouse-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/HEAD/assets/lakehouse-architecture.png -------------------------------------------------------------------------------- /test/.gitignore: -------------------------------------------------------------------------------- 1 | source.sh 2 | 3 | # Local .terraform directories 4 | **/.terraform/* 5 | **/.terraform.lock.* 6 | 7 | # .tfstate files 8 | *.tfstate 9 | *.tfstate.* 10 | -------------------------------------------------------------------------------- /.github/renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": ["github>GoogleCloudPlatform/cloud-foundation-toolkit//infra/terraform/test-org/github/resources/renovate"] 4 | } 5 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | To report a security issue, please use http://g.co/vulnz. We use 2 | http://g.co/vulnz for our intake, and do coordination and disclosure here on 3 | GitHub (including using GitHub Security Advisory). The Google Security Team will 4 | respond within 5 working days of your report on g.co/vulnz. 
5 | -------------------------------------------------------------------------------- /roles.txt: -------------------------------------------------------------------------------- 1 | roles/bigquery.admin 2 | roles/compute.admin 3 | roles/config.agent 4 | roles/dataplex.admin 5 | roles/dataproc.admin 6 | roles/iam.serviceAccountAdmin 7 | roles/iam.serviceAccountUser 8 | roles/logging.configWriter 9 | roles/notebooks.admin 10 | roles/resourcemanager.projectIamAdmin 11 | roles/serviceusage.serviceUsageAdmin 12 | roles/storage.admin 13 | roles/workflows.admin 14 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # NOTE: This file is automatically generated from values at: 2 | # https://github.com/GoogleCloudPlatform/cloud-foundation-toolkit/blob/main/infra/terraform/test-org/org/locals.tf 3 | 4 | * @GoogleCloudPlatform/blueprint-solutions @bradmiro @davenportjw @GoogleCloudPlatform/jump-start-solutions-admins 5 | 6 | # NOTE: GitHub CODEOWNERS locations: 7 | # https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners#codeowners-and-branch-protection 8 | 9 | CODEOWNERS @GoogleCloudPlatform/blueprint-solutions 10 | .github/CODEOWNERS @GoogleCloudPlatform/blueprint-solutions 11 | docs/CODEOWNERS @GoogleCloudPlatform/blueprint-solutions 12 | 13 | -------------------------------------------------------------------------------- /.github/release-please.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | releaseType: terraform-module 16 | handleGHRelease: true 17 | bumpMinorPreMajor: true 18 | -------------------------------------------------------------------------------- /examples/analytics_lakehouse/variables.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2021 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | variable "project_id" { 18 | description = "The ID of the project in which to provision resources." 
19 | type = string 20 | } 21 | -------------------------------------------------------------------------------- /test/integration/discover_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package test 16 | 17 | import ( 18 | "testing" 19 | 20 | "github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test/pkg/tft" 21 | ) 22 | 23 | func TestAll(t *testing.T) { 24 | tft.AutoDiscoverAndTest(t) 25 | } 26 | -------------------------------------------------------------------------------- /examples/analytics_lakehouse/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2021 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | module "analytics_lakehouse" { 18 | source = "GoogleCloudPlatform/analytics-lakehouse/google" 19 | version = "~> 0.4" 20 | 21 | project_id = var.project_id 22 | region = "us-central1" 23 | force_destroy = true 24 | 25 | } 26 | -------------------------------------------------------------------------------- /test/setup/outputs.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2019 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | output "project_id" { 18 | value = module.project.project_id 19 | } 20 | 21 | output "sa_key" { 22 | value = google_service_account_key.int_test.private_key 23 | sensitive = true 24 | } 25 | 26 | output "kms_keys" { 27 | value = module.kms_keyring.keys 28 | } 29 | -------------------------------------------------------------------------------- /cloudbuild_mim.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | steps: 15 | - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk' 16 | entrypoint: 'bash' 17 | args: ['./deploy_via_trigger.sh', '-p$PROJECT_ID'] 18 | serviceAccount: 'projects/$PROJECT_ID/serviceAccounts/cloudbuild-trigger-default@$PROJECT_ID.iam.gserviceaccount.com' 19 | options: 20 | logging: CLOUD_LOGGING_ONLY 21 | -------------------------------------------------------------------------------- /test/setup/variables.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2019 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | variable "org_id" { 17 | description = "The numeric organization id" 18 | } 19 | 20 | variable "folder_id" { 21 | description = "The folder to deploy in" 22 | } 23 | 24 | variable "billing_account" { 25 | description = "The billing account id associated with the project, e.g. XXXXXX-YYYYYY-ZZZZZZ" 26 | } 27 | -------------------------------------------------------------------------------- /test/setup/versions.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2019 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | terraform { 18 | required_version = ">= 1.5" 19 | required_providers { 20 | google = { 21 | source = "hashicorp/google" 22 | version = ">= 3.25.0" 23 | } 24 | google-beta = { 25 | source = "hashicorp/google-beta" 26 | version = ">= 3.25.0" 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /.github/trusted-contribution.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # NOTE: This file is automatically generated from: 16 | # https://github.com/GoogleCloudPlatform/cloud-foundation-toolkit/blob/main/infra/terraform/test-org/github 17 | 18 | annotations: 19 | - type: comment 20 | text: "/gcbrun" 21 | trustedContributors: 22 | - release-please[bot] 23 | - renovate[bot] 24 | - renovate-bot 25 | - forking-renovate[bot] 26 | - dependabot[bot] 27 | -------------------------------------------------------------------------------- /examples/analytics_lakehouse/README.md: -------------------------------------------------------------------------------- 1 | # Analytics Lakehouse Example 2 | 3 | This example illustrates how to use the `analytics_lakehouse` module. 4 | 5 | 6 | ## Inputs 7 | 8 | | Name | Description | Type | Default | Required | 9 | |------|-------------|------|---------|:--------:| 10 | | project\_id | The ID of the project in which to provision resources. 
| `string` | n/a | yes | 11 | 12 | ## Outputs 13 | 14 | | Name | Description | 15 | |------|-------------| 16 | | bigquery\_editor\_url | The URL to launch the BigQuery editor | 17 | | lakehouse\_colab\_url | The URL to launch the Colab instance | 18 | | lookerstudio\_report\_url | The URL to create a new Looker Studio report | 19 | 20 | 21 | 22 | To provision this example, run the following from within this directory: 23 | - `terraform init` to get the plugins 24 | - `terraform plan` to see the infrastructure plan 25 | - `terraform apply` to apply the infrastructure build 26 | - `terraform destroy` to destroy the built infrastructure 27 | -------------------------------------------------------------------------------- /.gitIgnore: -------------------------------------------------------------------------------- 1 | #vs code 2 | *code-workspace* 3 | 4 | # OSX leaves these everywhere on SMB shares 5 | ._* 6 | 7 | # OSX trash 8 | .DS_Store 9 | 10 | # BIN 11 | 12 | .venv/bin/* 13 | .venv/lib/* 14 | 15 | # Python 16 | *.pyc 17 | *.txt 18 | *.exe 19 | *.cfg 20 | *.sh 21 | 22 | 23 | 24 | # Emacs save files 25 | *~ 26 | \#*\# 27 | .\#* 28 | 29 | # Vim-related files 30 | [._]*.s[a-w][a-z] 31 | [._]s[a-w][a-z] 32 | *.un~ 33 | Session.vim 34 | .netrwhist 35 | 36 | ### https://raw.github.com/github/gitignore/90f149de451a5433aebd94d02d11b0e28843a1af/Terraform.gitignore 37 | 38 | # Local .terraform directories 39 | **/.terraform/* 40 | **/.terraform.lock.* 41 | 42 | # .tfstate files 43 | *.tfstate 44 | *.tfstate.* 45 | 46 | # Crash log files 47 | crash.log 48 | 49 | # Kitchen files 50 | **/inspec.lock 51 | **/.kitchen 52 | **/kitchen.local.yml 53 | **/Gemfile.lock 54 | 55 | # Ignore any .tfvars files that are generated automatically for each Terraform run. Most 56 | # .tfvars files are managed as part of configuration and so should be included in 57 | # version control. 58 | **/*.tfvars 59 | 60 | credentials.json 61 | 62 | # tf lock file 63 | .terraform.lock.hcl 64 | 65 | **/*.zip 66 | 67 | .vscode/ 68 | -------------------------------------------------------------------------------- /examples/analytics_lakehouse/outputs.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2021 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | output "lookerstudio_report_url" { 18 | value = module.analytics_lakehouse.lookerstudio_report_url 19 | description = "The URL to create a new Looker Studio report" 20 | } 21 | 22 | output "bigquery_editor_url" { 23 | value = module.analytics_lakehouse.bigquery_editor_url 24 | description = "The URL to launch the BigQuery editor" 25 | } 26 | 27 | output "lakehouse_colab_url" { 28 | value = module.analytics_lakehouse.lakehouse_colab_url 29 | description = "The URL to launch the Colab instance" 30 | } 31 | -------------------------------------------------------------------------------- /test/setup/iam.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2019 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | locals { 18 | int_required_roles = [ 19 | "roles/owner", 20 | "roles/bigquery.dataViewer" 21 | ] 22 | } 23 | 24 | resource "google_service_account" "int_test" { 25 | project = module.project.project_id 26 | account_id = "ci-account" 27 | display_name = "ci-account" 28 | } 29 | 30 | resource "google_project_iam_member" "int_test" { 31 | count = length(local.int_required_roles) 32 | 33 | project = module.project.project_id 34 | role = local.int_required_roles[count.index] 35 | member = "serviceAccount:${google_service_account.int_test.email}" 36 | } 37 | 38 | resource "google_service_account_key" "int_test" { 39 | service_account_id = google_service_account.int_test.id 40 | } 41 | -------------------------------------------------------------------------------- /examples/analytics_lakehouse/versions.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2021 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | terraform { 18 | required_version = ">= 1.5" 19 | required_providers { 20 | google = { 21 | source = "hashicorp/google" 22 | version = ">= 5.10.0, < 7.0.0" 23 | } 24 | google-beta = { 25 | source = "hashicorp/google-beta" 26 | version = ">= 5.10.0, < 7.0.0" 27 | } 28 | random = { 29 | source = "hashicorp/random" 30 | version = ">= 2" 31 | } 32 | archive = { 33 | source = "hashicorp/archive" 34 | version = ">= 2" 35 | } 36 | time = { 37 | source = "hashicorp/time" 38 | version = ">= 0.9.1" 39 | } 40 | http = { 41 | source = "hashicorp/http" 42 | version = ">= 3.2.1" 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /versions.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | terraform { 18 | required_version = ">= 1.5" 19 | required_providers { 20 | google = { 21 | source = "hashicorp/google" 22 | version = ">= 6.11.0, < 7.0.0" 23 | } 24 | google-beta = { 25 | source = "hashicorp/google-beta" 26 | version = ">= 5.10.0, < 7.0.0" 27 | } 28 | random = { 29 | source = "hashicorp/random" 30 | version = ">= 2" 31 | } 32 | archive = { 33 | source = "hashicorp/archive" 34 | version = ">= 2" 35 | } 36 | time = { 37 | source = "hashicorp/time" 38 | version = ">= 0.9.1" 39 | } 40 | http = { 41 | source = "hashicorp/http" 42 | version = ">= 3.2.1" 43 | } 44 | } 45 | 46 | provider_meta "google" { 47 | module_name = "blueprints/terraform/terraform-google-analytics-lakehouse/v0.4.0" 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/sql/sp_bigqueryml_model.sql: -------------------------------------------------------------------------------- 1 | -- Copyright 2023 Google LLC 2 | -- 3 | -- Licensed under the Apache License, Version 2.0 (the "License"); 4 | -- you may not use this file except in compliance with the License. 5 | -- You may obtain a copy of the License at 6 | -- 7 | -- http://www.apache.org/licenses/LICENSE-2.0 8 | -- 9 | -- Unless required by applicable law or agreed to in writing, software 10 | -- distributed under the License is distributed on an "AS IS" BASIS, 11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | -- See the License for the specific language governing permissions and 13 | -- limitations under the License. 
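-- A hedged companion to the ML.PREDICT snippet below: once the model has been
-- created, its fit can be inspected with ML.EVALUATE (a minimal sketch,
-- assuming the same dataset and model names as in this file):
--
-- select * from ML.EVALUATE(MODEL `${project_id}.ds_edw.model_taxi_estimate`);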
14 |
15 | /* Run a query to see the prediction results of the model
16 | --
17 | select * from ML.PREDICT(MODEL ds_edw.model_taxi_estimate,
18 | TABLE ds_edw.taxi_trips)
19 | limit 1000; */
20 |
21 | --Model Example
22 | CREATE OR REPLACE MODEL
23 | `${project_id}.ds_edw.model_taxi_estimate` OPTIONS ( MODEL_TYPE='LINEAR_REG',
24 | LS_INIT_LEARN_RATE=0.15,
25 | L1_REG=1,
26 | MAX_ITERATIONS=5 ) AS
27 | SELECT
28 | pickup_datetime,
29 | dropoff_datetime,
30 | IFNULL(passenger_count,0) passenger_count,
31 | IFNULL(trip_distance,0) trip_distance,
32 | IFNULL(rate_code,'') rate_code,
33 | IFNULL(payment_type,'') payment_type,
34 | IFNULL(fare_amount,0) label,
35 | IFNULL(pickup_location_id,'') pickup_location_id,
36 | IFNULL(dropoff_location_id,'') dropoff_location_id
37 | FROM
38 | `${project_id}.ds_edw.taxi_trips`
39 | WHERE
40 | fare_amount > 0;
41 |
--------------------------------------------------------------------------------
/src/sql/sp_sample_queries.sql:
--------------------------------------------------------------------------------
1 | -- Copyright 2023 Google LLC
2 | --
3 | -- Licensed under the Apache License, Version 2.0 (the "License");
4 | -- you may not use this file except in compliance with the License.
5 | -- You may obtain a copy of the License at
6 | --
7 | -- http://www.apache.org/licenses/LICENSE-2.0
8 | --
9 | -- Unless required by applicable law or agreed to in writing, software
10 | -- distributed under the License is distributed on an "AS IS" BASIS,
11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | -- See the License for the specific language governing permissions and
13 | -- limitations under the License.
14 |
15 | /*
16 | Use Cases:
17 | - BigQuery supports full SQL syntax and many analytic functions that make complex queries of lots of data easy
18 |
19 | Description:
20 | - Show joins, date functions, rank, partition, pivot
21 |
22 | Reference:
23 | - Rank/Partition: https://cloud.google.com/bigquery/docs/reference/standard-sql/analytic-function-concepts
24 | - Pivot: https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#pivot_operator
25 |
26 | Clean up / Reset script:
27 | n/a
28 | */
29 |
30 | --Rank, Pivot, Json
31 |
32 | -- Query: Get number of orders by category, name, id
33 | SELECT
34 | oi.product_id AS product_id,
35 | p.name AS product_name,
36 | p.category AS product_category,
37 | COUNT(*) AS num_of_orders
38 | FROM
39 | `gcp_lakehouse_ds.gcp_tbl_products` AS p
40 | JOIN
41 | `gcp_lakehouse_ds.gcp_tbl_order_items` AS oi
42 | ON
43 | p.id = oi.product_id
44 | GROUP BY
45 | 1,
46 | 2,
47 | 3
48 | ORDER BY
49 | num_of_orders DESC
50 |
--------------------------------------------------------------------------------
/.github/workflows/stale.yml:
--------------------------------------------------------------------------------
1 | # Copyright 2022-2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 15 | # NOTE: This file is automatically generated from: 16 | # https://github.com/GoogleCloudPlatform/cloud-foundation-toolkit/blob/main/infra/terraform/test-org/github 17 | 18 | name: "Close stale issues" 19 | on: 20 | schedule: 21 | - cron: "0 23 * * *" 22 | 23 | permissions: 24 | contents: read 25 | issues: write 26 | pull-requests: write 27 | actions: write 28 | 29 | jobs: 30 | stale: 31 | if: github.repository_owner == 'GoogleCloudPlatform' || github.repository_owner == 'terraform-google-modules' 32 | runs-on: ubuntu-latest 33 | steps: 34 | - uses: actions/stale@v10 35 | with: 36 | repo-token: ${{ secrets.GITHUB_TOKEN }} 37 | stale-issue-message: 'This issue is stale because it has been open 60 days with no activity. Remove stale label or comment or this will be closed in 7 days' 38 | stale-pr-message: 'This PR is stale because it has been open 60 days with no activity. Remove stale label or comment or this will be closed in 7 days' 39 | exempt-issue-labels: 'triaged' 40 | exempt-pr-labels: 'dependencies,autorelease: pending' 41 | operations-per-run: 100 42 | -------------------------------------------------------------------------------- /metadata.display.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: blueprints.cloud.google.com/v1alpha1 16 | kind: BlueprintMetadata 17 | metadata: 18 | name: terraform-google-analytics-lakehouse-display 19 | spec: 20 | info: 21 | title: terraform-google-lakehouse 22 | source: 23 | repo: https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse.git 24 | sourceType: git 25 | ui: 26 | input: 27 | variables: 28 | deletion_protection: 29 | name: deletion_protection 30 | title: Deletion Protection 31 | enable_apis: 32 | name: enable_apis 33 | title: Enable Apis 34 | force_destroy: 35 | name: force_destroy 36 | title: Force Destroy 37 | labels: 38 | name: labels 39 | title: Labels 40 | project_id: 41 | name: project_id 42 | title: Project Id 43 | public_data_bucket: 44 | name: public_data_bucket 45 | title: Public Data Bucket 46 | region: 47 | name: region 48 | title: Region 49 | use_case_short: 50 | name: use_case_short 51 | title: Use Case Short 52 | -------------------------------------------------------------------------------- /test/setup/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2019 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | module "project" {
18 | source = "terraform-google-modules/project-factory/google"
19 | version = "~> 18.0"
20 |
21 | name = "ci-bigquery"
22 | random_project_id = "true"
23 | random_project_id_length = 10
24 | org_id = var.org_id
25 | folder_id = var.folder_id
26 | billing_account = var.billing_account
27 | default_service_account = "keep"
28 |
29 | activate_apis = [
30 | "cloudkms.googleapis.com",
31 | "cloudresourcemanager.googleapis.com",
32 | "bigquery.googleapis.com",
33 | "bigquerystorage.googleapis.com",
34 | "bigqueryconnection.googleapis.com",
35 | "serviceusage.googleapis.com",
36 | "iam.googleapis.com",
37 | ]
38 | }
39 |
40 | module "kms_keyring" {
41 | source = "terraform-google-modules/kms/google"
42 | version = "~> 4.0"
43 |
44 | project_id = module.project.project_id
45 | location = "us"
46 | keyring = "ci-bigquery-keyring"
47 | keys = ["foo"]
48 | prevent_destroy = "false"
49 | depends_on = [
50 | module.project
51 | ]
52 | }
53 |
54 | data "google_bigquery_default_service_account" "initialize_encryption_account" {
55 | project = module.project.project_id
56 | }
57 |
--------------------------------------------------------------------------------
/variables.tf:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2023 Google LLC
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | # --------------------------------------------------
18 | # VARIABLES
19 | # Set these before applying the configuration
20 | # --------------------------------------------------
21 |
22 | variable "project_id" {
23 | type = string
24 | description = "Google Cloud Project ID"
25 | }
26 |
27 | variable "region" {
28 | type = string
29 | description = "Google Cloud Region"
30 | default = "us-central1"
31 | }
32 |
33 | variable "labels" {
34 | type = map(string)
35 | description = "A map of labels to apply to contained resources."
36 | default = { "analytics-lakehouse" = true }
37 | }
38 |
39 | variable "enable_apis" {
40 | type = string
41 | description = "Whether or not to enable the underlying APIs in this solution."
42 | default = true
43 | }
44 |
45 | variable "force_destroy" {
46 | type = string
47 | description = "Whether or not to allow deletion of GCS resources and their contents when the solution is modified or destroyed."
48 | default = false
49 | }
50 |
51 | variable "use_case_short" {
52 | type = string
53 | description = "Short name for use case"
54 | default = "lakehouse"
55 | }
56 |
57 | variable "public_data_bucket" {
58 | type = string
59 | description = "Public bucket containing the sample data used by this solution"
60 | default = "data-analytics-demos"
61 | }
62 |
--------------------------------------------------------------------------------
/outputs.tf:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2021 Google LLC
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | output "workflow_return_project_setup" {
18 | description = "Output of the project setup workflow"
19 | value = data.http.call_workflows_project_setup.response_body
20 | }
21 |
22 | output "lookerstudio_report_url" {
23 | value = "https://lookerstudio.google.com/reporting/create?c.reportId=79675b4f-9ed8-4ee4-bb35-709b8fd5306a&ds.ds0.datasourceName=vw_ecommerce&ds.ds0.projectId=${var.project_id}&ds.ds0.type=TABLE&ds.ds0.datasetId=gcp_lakehouse_ds&ds.ds0.tableId=view_ecommerce"
24 | description = "The URL to create a new Looker Studio report that displays a sample dashboard for data analysis"
25 | }
26 |
27 | output "bigquery_editor_url" {
28 | value = "https://console.cloud.google.com/bigquery?project=${var.project_id}"
29 | description = "The URL to launch the BigQuery editor"
30 | }
31 |
32 | output "neos_tutorial_url" {
33 | value = "http://console.cloud.google.com/products/solutions/deployments?walkthrough_id=panels--sic--analytics-lakehouse_toc"
34 | description = "The URL to launch the in-console tutorial for the Analytics Lakehouse solution"
35 | }
36 |
37 | output "lakehouse_colab_url" {
38 | value = "https://colab.research.google.com/github/GoogleCloudPlatform/terraform-google-analytics-lakehouse/blob/main/src/ipynb/exploratory-analysis.ipynb"
39 | description = "The URL to launch the Colab notebook for exploratory analysis of the Analytics Lakehouse solution"
40 | }
41 |
--------------------------------------------------------------------------------
/src/python/bigquery.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # Copyright 2023 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
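# Invocation sketch (hypothetical procedure name): this script reads its
# parameters from BIGQUERY_PROC_PARAM.* environment variables, which is how
# BigQuery stored procedures for Apache Spark pass named arguments. Once
# registered as such a procedure, it could be run from BigQuery SQL like:
#   CALL `my-project.gcp_lakehouse_ds.create_iceberg_tables`(
#     'lakehouse_catalog', 'lakehouse_db', 'gcp_lakehouse_ds');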
15 |
16 | """BigQuery I/O with BigLake Iceberg PySpark example."""
17 | from pyspark.sql import SparkSession
18 | import json
19 | import os
20 |
21 | spark = SparkSession \
22 | .builder \
23 | .appName("spark-bigquery-demo") \
24 | .enableHiveSupport() \
25 | .getOrCreate()
26 |
27 |
28 | def load_arg(arg):
29 | return str(json.loads(os.environ[f"BIGQUERY_PROC_PARAM.{arg}"]))
30 |
31 |
32 | catalog = load_arg("lakehouse_catalog")
33 | database = load_arg("lakehouse_database")
34 | bq_dataset = load_arg("bq_dataset")
35 |
36 | # Delete the BigLake Catalog if it currently exists to ensure proper setup.
37 | spark.sql(f"DROP NAMESPACE IF EXISTS {catalog} CASCADE;")
38 |
39 | # Create BigLake Catalog and Database if they are not already created.
40 | spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {catalog};")
41 | spark.sql(f"CREATE DATABASE IF NOT EXISTS {catalog}.{database};")
42 | spark.sql(f"DROP TABLE IF EXISTS {catalog}.{database}.agg_events_iceberg;")
43 |
44 | # Load data from BigQuery.
45 | events = spark.read.format("bigquery") \
46 | .option("table", "gcp_primary_staging.thelook_ecommerce_events") \
47 | .load()
48 | events.createOrReplaceTempView("events")
49 |
50 | # Create Iceberg Table if not exists
51 | spark.sql(
52 | f"""CREATE TABLE IF NOT EXISTS {catalog}.{database}.agg_events_iceberg
53 | (user_id string, event_count bigint)
54 | USING iceberg
55 | TBLPROPERTIES(
56 | bq_table='{bq_dataset}.agg_events_iceberg');
57 | """
58 | )
59 |
60 | # Insert aggregated event counts per user into the Iceberg table
61 | spark.sql(
62 | f"""INSERT INTO {catalog}.{database}.agg_events_iceberg
63 | (user_id, event_count)
64 | select user_id, count(session_id)
65 | from events
66 | group by user_id;
67 | """
68 | )
69 |
--------------------------------------------------------------------------------
/src/sql/view_ecommerce.sql:
--------------------------------------------------------------------------------
1 | -- Copyright 2023 Google LLC
2 | --
3 | -- Licensed under the Apache License, Version 2.0 (the "License");
4 | -- you may not use this file except in compliance with the License.
5 | -- You may obtain a copy of the License at
6 | --
7 | -- http://www.apache.org/licenses/LICENSE-2.0
8 | --
9 | -- Unless required by applicable law or agreed to in writing, software
10 | -- distributed under the License is distributed on an "AS IS" BASIS,
11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | -- See the License for the specific language governing permissions and
13 | -- limitations under the License.
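-- Usage sketch (hypothetical ad-hoc query) against the view created below,
-- handy for spot-checking the joins before wiring up Looker Studio:
--
-- SELECT order_id, product_name, user_country
-- FROM gcp_lakehouse_ds.view_ecommerce
-- LIMIT 10;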
14 | CREATE OR REPLACE VIEW 15 | gcp_lakehouse_ds.view_ecommerce AS 16 | SELECT 17 | o.order_id, 18 | o.user_id order_user_id, 19 | o.status order_status, 20 | o.created_at order_created_at, 21 | o.returned_at order_returned_at, 22 | o.shipped_at order_shipped_at, 23 | o.delivered_at order_delivered_at, 24 | o.num_of_item order_number_of_items, 25 | i.id AS order_items_id, 26 | i.product_id AS order_items_product_id, 27 | i.status order_items_status, 28 | i.sale_price order_items_sale_price, 29 | p.id AS product_id, 30 | p.cost product_cost, 31 | p.category product_category, 32 | p.name product_name, 33 | p.brand product_brand, 34 | p.retail_price product_retail_price, 35 | p.department product_department, 36 | p.sku product_sku, 37 | p.distribution_center_id, 38 | d.name AS dist_center_name, 39 | d.latitude dist_center_lat, 40 | d.longitude dist_center_long, 41 | u.id AS user_id, 42 | u.first_name user_first_name, 43 | u.last_name user_last_name, 44 | u.age user_age, 45 | u.gender user_gender, 46 | u.state user_state, 47 | u.postal_code user_postal_code, 48 | u.city user_city, 49 | u.country user_country, 50 | u.latitude user_lat, 51 | u.longitude user_long, 52 | u.traffic_source user_traffic_source 53 | FROM 54 | gcp_primary_staging.thelook_ecommerce_orders o 55 | INNER JOIN 56 | gcp_primary_staging.thelook_ecommerce_order_items i 57 | ON 58 | o.order_id = i.order_id 59 | INNER JOIN 60 | `gcp_primary_staging.thelook_ecommerce_products` p 61 | ON 62 | i.product_id = p.id 63 | INNER JOIN 64 | `gcp_primary_staging.thelook_ecommerce_distribution_centers` d 65 | ON 66 | p.distribution_center_id = d.id 67 | INNER JOIN 68 | `gcp_primary_staging.thelook_ecommerce_users` u 69 | ON 70 | o.user_id = u.id 71 | ; 72 | -------------------------------------------------------------------------------- /src/shell/post_startup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2024 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
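# Context note: workbench.tf points the instance's `post-startup-script`
# metadata key at gs://<provisioning bucket>/post_startup.sh, so everything
# below runs once on the Workbench VM after first boot.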
15 | 16 | 17 | # Retrieve current project and location using gcloud 18 | PROJECT=$(gcloud config get-value project) 19 | ZONE=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/zone) 20 | LOCATION="$(echo "$ZONE" | awk -F/ '{split($4, a, "-"); print a[1]"-"a[2]}')" 21 | echo "Current instance location: $LOCATION" 22 | 23 | # Specify the file name 24 | YAML_FILE="temp.yaml" 25 | declare -a NOTEBOOKS=("spark_langchain.ipynb" "spark_ml.ipynb") 26 | 27 | # Define the content for the YAML file 28 | YAML_CONTENT=$(cat < /home/jupyter/"$YAML_FILE" 46 | 47 | # Use wget to download the file and check if the download was successful 48 | for NOTEBOOK in "${NOTEBOOKS[@]}" 49 | do 50 | # Specify the GitHub repository URL and the file path 51 | REPO_URL="https://raw.githubusercontent.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/main/src/ipynb/$NOTEBOOK" 52 | if wget "$REPO_URL" -O /home/jupyter/"$NOTEBOOK"; then 53 | echo "File downloaded successfully." 54 | else 55 | echo "Error downloading the file." 56 | fi 57 | done 58 | 59 | # Import Dataproc session template 60 | gcloud beta dataproc session-templates import sparkml-template \ 61 | --source=/home/jupyter/"$YAML_FILE" --project="$PROJECT" --location="$LOCATION" --quiet 62 | 63 | # Delete temporal YAML config file 64 | rm /home/jupyter/"$YAML_FILE" 65 | -------------------------------------------------------------------------------- /dataproc.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */
16 |
17 | # Set up networking
18 | resource "google_compute_network" "default_network" {
19 | project = module.project-services.project_id
20 | name = "vpc-${var.use_case_short}"
21 | description = "Default network"
22 | auto_create_subnetworks = false
23 | mtu = 1460
24 | }
25 |
26 | # add destroy sleep here
27 |
28 | resource "google_compute_subnetwork" "subnet" {
29 | project = module.project-services.project_id
30 | name = "dataproc-subnet"
31 | ip_cidr_range = "10.3.0.0/16"
32 | region = var.region
33 | network = google_compute_network.default_network.id
34 | private_ip_google_access = true
35 | }
36 |
37 | # Firewall rule for dataproc cluster
38 | resource "google_compute_firewall" "subnet_firewall_rule" {
39 | project = module.project-services.project_id
40 | name = "dataproc-firewall"
41 | network = google_compute_network.default_network.id
42 |
43 | allow {
44 | protocol = "icmp"
45 | }
46 |
47 | allow {
48 | protocol = "tcp"
49 | }
50 |
51 | allow {
52 | protocol = "udp"
53 | }
54 | source_ranges = ["10.3.0.0/16"]
55 |
56 | depends_on = [
57 | google_compute_subnetwork.subnet
58 | ]
59 | }
60 |
61 |
62 | # Set up the Dataproc service account
63 | # for Dataproc jobs in this solution to execute as
64 | resource "google_service_account" "dataproc_service_account" {
65 | project = module.project-services.project_id
66 | account_id = "dataproc-sa-${random_id.id.hex}"
67 | display_name = "Service Account for Dataproc Execution"
68 | }
69 |
70 | resource "google_project_iam_member" "dataproc_sa_roles" {
71 | for_each = toset([
72 | "roles/storage.objectAdmin",
73 | "roles/bigquery.connectionAdmin",
74 | "roles/biglake.admin",
75 | "roles/bigquery.dataOwner",
76 | "roles/bigquery.user",
77 | "roles/dataproc.worker",
78 | ])
79 |
80 | project = module.project-services.project_id
81 | role = each.key
82 | member = "serviceAccount:${google_service_account.dataproc_service_account.email}"
83 | }
84 |
--------------------------------------------------------------------------------
/workbench.tf:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2024 Google LLC
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | # Creates a service account specifically for the Workbench instance.
18 | resource "google_service_account" "workbench_service_account" {
19 | project = module.project-services.project_id
20 | account_id = "workbench-sa-${random_id.id.hex}"
21 | display_name = "Service Account for Workbench Instance"
22 | }
23 |
24 | # Grants necessary roles to the Workbench service account.
25 | resource "google_project_iam_member" "workbench_sa_roles" { 26 | for_each = toset([ 27 | "roles/iam.serviceAccountUser", 28 | "roles/storage.objectAdmin", 29 | "roles/compute.osAdminLogin", 30 | "roles/dataproc.admin", 31 | ]) 32 | 33 | project = module.project-services.project_id 34 | role = each.key 35 | member = "serviceAccount:${google_service_account.workbench_service_account.email}" 36 | } 37 | 38 | # Provisions a new Workbench instance. 39 | resource "google_workbench_instance" "workbench_instance" { 40 | name = "gcp-${var.use_case_short}-workbench-instance-${random_id.id.hex}" 41 | project = module.project-services.project_id 42 | location = "${var.region}-b" 43 | desired_state = "STOPPED" 44 | 45 | gce_setup { 46 | machine_type = "e2-standard-4" 47 | 48 | vm_image { 49 | project = "cloud-notebooks-managed" 50 | name = "workbench-instances-v20231108-py310" 51 | } 52 | 53 | boot_disk { 54 | disk_type = "PD_STANDARD" 55 | } 56 | 57 | data_disks { 58 | disk_type = "PD_STANDARD" 59 | } 60 | 61 | network_interfaces { 62 | network = google_compute_network.default_network.id 63 | subnet = google_compute_subnetwork.subnet.id 64 | nic_type = "GVNIC" 65 | } 66 | 67 | disable_public_ip = false 68 | 69 | service_accounts { 70 | email = google_service_account.workbench_service_account.email 71 | } 72 | 73 | metadata = { 74 | proxy-mode = "service_account" 75 | idle-timeout-seconds = "10800" 76 | report-event-health = "true" 77 | disable-mixer = "false" 78 | post-startup-script = "gs://${google_storage_bucket.provisioning_bucket.name}/post_startup.sh" 79 | report-dns-resolution = "true" 80 | } 81 | 82 | enable_ip_forwarding = true 83 | } 84 | 85 | depends_on = [ 86 | google_project_iam_member.workbench_sa_roles, 87 | google_compute_firewall.subnet_firewall_rule 88 | ] 89 | } 90 | -------------------------------------------------------------------------------- /src/yaml/project-setup.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This defines the Google Workflow for the Analytics lakehouse Soultion: https://console.cloud.google.com/products/solutions/details/analytics-lakehouse 16 | # This Workflow executes through Terraform. 
For Google Workflows executed via Terraform, variables are defined such that: 17 | # 18 | # - Terraform environment variables are denoted by $ 19 | # - Google Workflow variables are escaped via $$ 20 | # 21 | # To modify this Workflow to stand alone (no Terraform): 22 | # 23 | # - Replace vars in `main` -> `steps` -> `assign` with your own (or use https://cloud.google.com/workflows/docs/passing-runtime-arguments#gcloud) 24 | # - Change all $$ to $ 25 | 26 | main: 27 | params: [] 28 | steps: 29 | # If this workflow has been run before, do not run again 30 | - sub_check_if_run: 31 | steps: 32 | - assign_values: 33 | assign: 34 | - project_id: $${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")} 35 | - location: $${sys.get_env("GOOGLE_CLOUD_LOCATION")} 36 | - workflow_id: $${sys.get_env("GOOGLE_CLOUD_WORKFLOW_ID")} 37 | - get_executions: 38 | call: http.get 39 | args: 40 | url: $${"https://workflowexecutions.googleapis.com/v1/projects/"+project_id+"/locations/"+location+"/workflows/"+workflow_id+"/executions"} 41 | auth: 42 | type: OAuth2 43 | result: Operation 44 | - check_if_run: 45 | switch: 46 | - condition: $${len(Operation.body.executions) > 1} 47 | next: end 48 | - sub_create_taxonomy: 49 | call: create_taxonomy 50 | result: create_taxonomy_output 51 | 52 | # Subworkflow to Dataplex taxonomy 53 | create_taxonomy: 54 | steps: 55 | - assign_values: 56 | assign: 57 | - project_id: $${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")} 58 | - location: $${sys.get_env("GOOGLE_CLOUD_LOCATION")} 59 | - ufdataplex_job: 60 | call: http.post 61 | args: 62 | url: $${"https://dataplex.googleapis.com/v1/projects/"+project_id+"/locations/"+location+"/dataTaxonomies?alt=json&dataTaxonomyId=sample-taxonomy&validateOnly=False"} 63 | auth: 64 | type: OAuth2 65 | body: 66 | description: Sample Taxonomy Description 67 | displayName: Sample Taxonomy Display Name 68 | result: Operation 69 | - returnResult: 70 | return: $${Operation} 71 | -------------------------------------------------------------------------------- /test/integration/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/terraform-google-modules/terraform-google-analytics-lakehouse/test/integration 2 | 3 | go 1.23.0 4 | 5 | toolchain go1.23.7 6 | 7 | require ( 8 | github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test v0.17.6 9 | github.com/stretchr/testify v1.10.0 10 | ) 11 | 12 | require ( 13 | github.com/agext/levenshtein v1.2.3 // indirect 14 | github.com/alexflint/go-filemutex v1.3.0 // indirect 15 | github.com/apparentlymart/go-textseg/v15 v15.0.0 // indirect 16 | github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d // indirect 17 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 18 | github.com/go-errors/errors v1.5.0 // indirect 19 | github.com/go-openapi/jsonpointer v0.21.0 // indirect 20 | github.com/go-openapi/jsonreference v0.20.2 // indirect 21 | github.com/go-openapi/swag v0.23.0 // indirect 22 | github.com/google/gnostic-models v0.6.9 // indirect 23 | github.com/google/go-cmp v0.6.0 // indirect 24 | github.com/gruntwork-io/terratest v0.48.2 // indirect 25 | github.com/hashicorp/errwrap v1.1.0 // indirect 26 | github.com/hashicorp/go-cleanhttp v0.5.2 // indirect 27 | github.com/hashicorp/go-getter/v2 v2.2.3 // indirect 28 | github.com/hashicorp/go-multierror v1.1.1 // indirect 29 | github.com/hashicorp/go-safetemp v1.0.0 // indirect 30 | github.com/hashicorp/go-version v1.7.0 // indirect 31 | github.com/hashicorp/hcl 
v0.0.0-20170504190234-a4b07c25de5f // indirect 32 | github.com/hashicorp/hcl/v2 v2.22.0 // indirect 33 | github.com/hashicorp/terraform-config-inspect v0.0.0-20250203082807-efaa306e97b4 // indirect 34 | github.com/hashicorp/terraform-json v0.24.0 // indirect 35 | github.com/jinzhu/copier v0.4.0 // indirect 36 | github.com/josharian/intern v1.0.0 // indirect 37 | github.com/klauspost/compress v1.16.7 // indirect 38 | github.com/mailru/easyjson v0.7.7 // indirect 39 | github.com/mattn/go-shellwords v1.0.12 // indirect 40 | github.com/mattn/go-zglob v0.0.4 // indirect 41 | github.com/mitchellh/go-homedir v1.1.0 // indirect 42 | github.com/mitchellh/go-testing-interface v1.14.2-0.20210821155943-2d9075ca8770 // indirect 43 | github.com/mitchellh/go-wordwrap v1.0.1 // indirect 44 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect 45 | github.com/tidwall/gjson v1.18.0 // indirect 46 | github.com/tidwall/match v1.1.1 // indirect 47 | github.com/tidwall/pretty v1.2.1 // indirect 48 | github.com/tidwall/sjson v1.2.5 // indirect 49 | github.com/tmccombs/hcl2json v0.6.4 // indirect 50 | github.com/ulikunitz/xz v0.5.11 // indirect 51 | github.com/zclconf/go-cty v1.15.1 // indirect 52 | golang.org/x/crypto v0.35.0 // indirect 53 | golang.org/x/mod v0.23.0 // indirect 54 | golang.org/x/net v0.36.0 // indirect 55 | golang.org/x/sync v0.11.0 // indirect 56 | golang.org/x/sys v0.30.0 // indirect 57 | golang.org/x/text v0.22.0 // indirect 58 | golang.org/x/tools v0.26.0 // indirect 59 | google.golang.org/protobuf v1.35.1 // indirect 60 | gopkg.in/yaml.v3 v3.0.1 // indirect 61 | k8s.io/kube-openapi v0.0.0-20241212222426-2c72e554b1e7 // indirect 62 | sigs.k8s.io/kustomize/kyaml v0.19.0 // indirect 63 | sigs.k8s.io/yaml v1.4.0 // indirect 64 | ) 65 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # NOTE: This file is automatically generated from values at: 16 | # https://github.com/GoogleCloudPlatform/cloud-foundation-toolkit/blob/main/infra/terraform/test-org/org/locals.tf 17 | 18 | name: 'lint' 19 | 20 | on: 21 | workflow_dispatch: 22 | pull_request: 23 | types: [opened, edited, reopened, synchronize] 24 | branches: [main] 25 | 26 | permissions: 27 | contents: read 28 | 29 | concurrency: 30 | group: '${{ github.workflow }}-${{ github.head_ref || github.ref }}' 31 | cancel-in-progress: true 32 | 33 | jobs: 34 | lint: 35 | name: 'lint' 36 | runs-on: 'ubuntu-latest' 37 | steps: 38 | - uses: 'actions/checkout@v6' 39 | - id: variables 40 | run: | 41 | MAKEFILE=$(find . 
-name Makefile -print -quit) 42 | if [ -z "$MAKEFILE" ]; then 43 | echo dev-tools=gcr.io/cloud-foundation-cicd/cft/developer-tools:1 >> "$GITHUB_OUTPUT" 44 | else 45 | VERSION=$(grep "DOCKER_TAG_VERSION_DEVELOPER_TOOLS := " $MAKEFILE | cut -d\ -f3) 46 | IMAGE=$(grep "DOCKER_IMAGE_DEVELOPER_TOOLS := " $MAKEFILE | cut -d\ -f3) 47 | REGISTRY=$(grep "REGISTRY_URL := " $MAKEFILE | cut -d\ -f3) 48 | echo dev-tools=${REGISTRY}/${IMAGE}:${VERSION} >> "$GITHUB_OUTPUT" 49 | fi 50 | - run: docker run --rm -e ENABLE_BPMETADATA -v ${{ github.workspace }}:/workspace ${{ steps.variables.outputs.dev-tools }} module-swapper 51 | env: 52 | ENABLE_BPMETADATA: 1 53 | 54 | - run: docker run --rm -e ENABLE_BPMETADATA -v ${{ github.workspace }}:/workspace ${{ steps.variables.outputs.dev-tools }} /usr/local/bin/test_lint.sh 55 | env: 56 | ENABLE_BPMETADATA: 1 57 | 58 | commitlint: 59 | runs-on: ubuntu-latest 60 | steps: 61 | - uses: actions/checkout@v6 62 | with: 63 | fetch-depth: 0 64 | - name: Setup node 65 | uses: actions/setup-node@v6 66 | with: 67 | node-version: lts/* 68 | - name: Install commitlint 69 | run: | 70 | npm install -D @commitlint/cli@20.2.0 @commitlint/config-conventional@20.2.0 71 | echo "module.exports = { extends: ['@commitlint/config-conventional'], rules: {'subject-case': [0], 'header-max-length': [0]} };" > commitlint.config.js 72 | npx commitlint --version 73 | - name: Validate PR commits with commitlint 74 | if: github.event_name == 'pull_request' 75 | env: 76 | TITLE: ${{ github.event.pull_request.title }} 77 | run: 'echo "$TITLE" | npx commitlint --verbose' 78 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Please note that this file was generated from [terraform-google-module-template](https://github.com/terraform-google-modules/terraform-google-module-template). 16 | # Please make sure to contribute relevant changes upstream! 
17 | 18 | # Make will use bash instead of sh 19 | SHELL := /usr/bin/env bash 20 | 21 | DOCKER_TAG_VERSION_DEVELOPER_TOOLS := 1.23 22 | DOCKER_IMAGE_DEVELOPER_TOOLS := cft/developer-tools 23 | REGISTRY_URL := gcr.io/cloud-foundation-cicd 24 | ENABLE_BPMETADATA := 1 25 | export ENABLE_BPMETADATA 26 | 27 | # Enter docker container for local development 28 | .PHONY: docker_run 29 | docker_run: 30 | docker run --rm -it \ 31 | -e SERVICE_ACCOUNT_JSON \ 32 | -v "$(CURDIR)":/workspace \ 33 | $(REGISTRY_URL)/${DOCKER_IMAGE_DEVELOPER_TOOLS}:${DOCKER_TAG_VERSION_DEVELOPER_TOOLS} \ 34 | /bin/bash 35 | 36 | # Execute prepare tests within the docker container 37 | .PHONY: docker_test_prepare 38 | docker_test_prepare: 39 | docker run --rm -it \ 40 | -e SERVICE_ACCOUNT_JSON \ 41 | -e TF_VAR_org_id \ 42 | -e TF_VAR_folder_id \ 43 | -e TF_VAR_billing_account \ 44 | -v "$(CURDIR)":/workspace \ 45 | $(REGISTRY_URL)/${DOCKER_IMAGE_DEVELOPER_TOOLS}:${DOCKER_TAG_VERSION_DEVELOPER_TOOLS} \ 46 | /usr/local/bin/execute_with_credentials.sh prepare_environment 47 | 48 | # Clean up test environment within the docker container 49 | .PHONY: docker_test_cleanup 50 | docker_test_cleanup: 51 | docker run --rm -it \ 52 | -e SERVICE_ACCOUNT_JSON \ 53 | -e TF_VAR_org_id \ 54 | -e TF_VAR_folder_id \ 55 | -e TF_VAR_billing_account \ 56 | -v "$(CURDIR)":/workspace \ 57 | $(REGISTRY_URL)/${DOCKER_IMAGE_DEVELOPER_TOOLS}:${DOCKER_TAG_VERSION_DEVELOPER_TOOLS} \ 58 | /usr/local/bin/execute_with_credentials.sh cleanup_environment 59 | 60 | # Execute integration tests within the docker container 61 | .PHONY: docker_test_integration 62 | docker_test_integration: 63 | docker run --rm -it \ 64 | -e SERVICE_ACCOUNT_JSON \ 65 | -v "$(CURDIR)":/workspace \ 66 | $(REGISTRY_URL)/${DOCKER_IMAGE_DEVELOPER_TOOLS}:${DOCKER_TAG_VERSION_DEVELOPER_TOOLS} \ 67 | /usr/local/bin/test_integration.sh 68 | 69 | # Execute lint tests within the docker container 70 | .PHONY: docker_test_lint 71 | docker_test_lint: 72 | docker run --rm -it \ 73 | -e ENABLE_BPMETADATA \ 74 | -e EXCLUDE_LINT_DIRS \ 75 | -v "$(CURDIR)":/workspace \ 76 | $(REGISTRY_URL)/${DOCKER_IMAGE_DEVELOPER_TOOLS}:${DOCKER_TAG_VERSION_DEVELOPER_TOOLS} \ 77 | /usr/local/bin/test_lint.sh 78 | 79 | # Generate documentation 80 | .PHONY: docker_generate_docs 81 | docker_generate_docs: 82 | docker run --rm -it \ 83 | -e ENABLE_BPMETADATA \ 84 | -v "$(CURDIR)":/workspace \ 85 | $(REGISTRY_URL)/${DOCKER_IMAGE_DEVELOPER_TOOLS}:${DOCKER_TAG_VERSION_DEVELOPER_TOOLS} \ 86 | /bin/bash -c 'source /usr/local/bin/task_helper_functions.sh && generate_docs -d' 87 | 88 | # Alias for backwards compatibility 89 | .PHONY: generate_docs 90 | generate_docs: docker_generate_docs 91 | -------------------------------------------------------------------------------- /metadata.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: blueprints.cloud.google.com/v1alpha1 16 | kind: BlueprintMetadata 17 | metadata: 18 | name: terraform-google-analytics-lakehouse 19 | annotations: 20 | config.kubernetes.io/local-config: "true" 21 | spec: 22 | info: 23 | title: terraform-google-lakehouse 24 | source: 25 | repo: https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse.git 26 | sourceType: git 27 | version: 0.4.0 28 | actuationTool: 29 | flavor: Terraform 30 | version: ">= 0.13" 31 | description: {} 32 | content: 33 | documentation: 34 | - title: Create an Analytics Lakehouse 35 | url: https://cloud.google.com/architecture/big-data-analytics/analytics-lakehouse 36 | examples: 37 | - name: analytics_lakehouse 38 | location: examples/analytics_lakehouse 39 | interfaces: 40 | variables: 41 | - name: enable_apis 42 | description: Whether or not to enable underlying APIs in this solution. 43 | varType: string 44 | defaultValue: true 45 | - name: force_destroy 46 | description: Whether or not to protect GCS resources from deletion when solution is modified or changed. 47 | varType: string 48 | defaultValue: false 49 | - name: labels 50 | description: A map of labels to apply to contained resources. 51 | varType: map(string) 52 | defaultValue: 53 | analytics-lakehouse: true 54 | - name: project_id 55 | description: Google Cloud Project ID 56 | varType: string 57 | required: true 58 | - name: public_data_bucket 59 | description: Public Data bucket for access 60 | varType: string 61 | defaultValue: data-analytics-demos 62 | - name: region 63 | description: Google Cloud Region 64 | varType: string 65 | defaultValue: us-central1 66 | - name: use_case_short 67 | description: Short name for use case 68 | varType: string 69 | defaultValue: lakehouse 70 | outputs: 71 | - name: bigquery_editor_url 72 | description: The URL to launch the BigQuery editor 73 | - name: lakehouse_colab_url 74 | description: The URL to launch the in-console tutorial for the Analytics Lakehouse solution 75 | - name: lookerstudio_report_url 76 | description: The URL to create a new Looker Studio report that displays a sample dashboard for data analysis 77 | - name: neos_tutorial_url 78 | description: The URL to launch the in-console tutorial for the Analytics Lakehouse solution 79 | - name: workflow_return_project_setup 80 | description: Output of the project setup workflow 81 | requirements: 82 | roles: 83 | - level: Project 84 | roles: 85 | - roles/owner 86 | - roles/bigquery.dataViewer 87 | services: 88 | - cloudkms.googleapis.com 89 | - cloudresourcemanager.googleapis.com 90 | - bigquery.googleapis.com 91 | - bigquerystorage.googleapis.com 92 | - bigqueryconnection.googleapis.com 93 | - serviceusage.googleapis.com 94 | - iam.googleapis.com 95 | -------------------------------------------------------------------------------- /tutorial.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | # Customize an Analytics Lakehouse Solution 9 | 10 | Learn how to build and deploy your own proof of concept based on the deployed [Analytics Lakehouse](https://console.cloud.google.com/products/solutions/details/analytics-lakehouse) Jump Start Solution. You can customize the Jump Start Solution deployment by creating a copy of the source code. You can modify the infrastructure and application code as needed and redeploy the solution with the changes. 11 | 12 | To avoid conflicts, only one user should modify and deploy a solution in a single Google Cloud project.
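If the repository is not already present in your Cloud Shell environment, it can be cloned before opening the workspace. A minimal sketch, using the repository URL recorded in `metadata.yaml`:

```bash
# Clone the solution source and switch into it; the directory name
# simply mirrors the repository name.
git clone https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse.git
cd terraform-google-analytics-lakehouse
```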
13 | 14 | ## Open cloned repository as workspace 15 | 16 | To open the directory where the repository is cloned as a workspace in the editor, follow the steps below based on whether you are using the Cloud Shell Editor in Preview Mode or Legacy Mode. 17 | 18 | --- 19 | **Legacy Cloud Shell Editor** 20 | 21 | 1. Go to the `File` menu. 22 | 2. Select `Open Workspace`. 23 | 3. Choose the directory where the repository has been cloned. This directory is the current directory in the cloud shell terminal. 24 | 25 | **New Cloud Shell Editor** 26 | 27 | 1. Go to the hamburger icon located in the top-left corner of the editor. 28 | 2. Go to the `File` menu. 29 | 3. Select `Open Folder`. 30 | 4. Choose the directory where the repository has been cloned. This directory is the current directory in the cloud shell terminal. 31 | 32 | ## Before you begin 33 | 34 | We strongly recommend that you familiarize yourself with the Analytics Lakehouse solution by reading the [solution guide](https://cloud.google.com/architecture/big-data-analytics/analytics-lakehouse). 35 | 36 | NOTE: A change in the infrastructure code might cause a change in the incurred cost. 37 | 38 | --- 39 | **Create an automated deployment** 40 | 41 | Run the deploy_solution.sh script. 42 | 43 | ```bash 44 | ./deploy_solution.sh 45 | ``` 46 | 47 | --- 48 | **Monitor the deployment** 49 | 50 | Get the deployment details. 51 | 52 | ```bash 53 | gcloud infra-manager deployments describe DEPLOYMENT_NAME --location REGION 54 | ``` 55 | 56 | Monitor your deployment at [Solution deployments page](https://console.cloud.google.com/products/solutions/deployments?pageState=(%22deployments%22:(%22f%22:%22%255B%257B_22k_22_3A_22Labels_22_2C_22t_22_3A13_2C_22v_22_3A_22_5C_22modification-reason%2520_3A%2520make-it-mine_5C_22_22_2C_22s_22_3Atrue_2C_22i_22_3A_22deployment.labels_22%257D%255D%22))). 57 | 58 | ## Save your edits to the solution 59 | 60 | Use any of the following methods to save your edits to the solution: 61 | 62 | --- 63 | **Download the solution** 64 | 65 | To download your solution, in the `File` menu, select `Download Workspace`. The solution is downloaded in a compressed format. 66 | 67 | 68 | --- 69 | **Save the solution to your Git repository** 70 | 71 | Set the remote URL to your Git repository: 72 | ```bash 73 | git remote set-url origin [git-repo-url] 74 | ``` 75 | 76 | Review the modified files, then commit and push them to your remote repository branch. 77 | 78 | ## Delete the deployed solution 79 | 80 | Optional: Use one of the options below if you want to delete the deployed solution: 81 | 82 | * Go to [Solution deployments page](https://console.cloud.google.com/products/solutions/deployments?pageState=(%22deployments%22:(%22f%22:%22%255B%257B_22k_22_3A_22Labels_22_2C_22t_22_3A13_2C_22v_22_3A_22_5C_22modification-reason%2520_3A%2520make-it-mine_5C_22_22_2C_22s_22_3Atrue_2C_22i_22_3A_22deployment.labels_22%257D%255D%22))). 83 | * Click on the link under "Deployment name". It will take you to the deployment details page for the solution. 84 | * Click on the "DELETE" button located at the top right corner of the page. 85 | 86 | -------------------------------------------------------------------------------- /test/integration/analytics_lakehouse/analytics_lakehouse_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package multiple_buckets 16 | 17 | import ( 18 | "fmt" 19 | "testing" 20 | "time" 21 | 22 | "github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test/pkg/bq" 23 | "github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test/pkg/gcloud" 24 | "github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test/pkg/tft" 25 | "github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test/pkg/utils" 26 | "github.com/stretchr/testify/assert" 27 | ) 28 | 29 | // Retry if these errors are encountered. 30 | var retryErrors = map[string]string{ 31 | ".*does not have enough resources available to fulfill the request. Try a different zone,.*": "Compute zone resources currently unavailable.", 32 | ".*Error 400: The subnetwork resource*": "Subnet is eventually drained", 33 | } 34 | 35 | func TestAnalyticsLakehouse(t *testing.T) { 36 | dwh := tft.NewTFBlueprintTest(t, tft.WithRetryableTerraformErrors(retryErrors, 60, time.Minute)) 37 | 38 | dwh.DefineVerify(func(assert *assert.Assertions) { 39 | // Commented out until Workbench provider proxy-byoid-url bug is fixed 40 | // dwh.DefaultVerify(assert) 41 | 42 | time.Sleep(300 * time.Second) 43 | 44 | projectID := dwh.GetTFSetupStringOutput("project_id") 45 | 46 | verifyWorkflow := func(workflow string) (bool, error) { 47 | executions := gcloud.Runf(t, "workflows executions list %s --project %s --sort-by=startTime", workflow, projectID) 48 | state := executions.Get("0.state").String() 49 | if state == "FAILED" { 50 | id := executions.Get("0.name") 51 | gcloud.Runf(t, "workflows executions describe %s", id) 52 | t.FailNow() 53 | } 54 | if state == "SUCCEEDED" { 55 | return false, nil 56 | } 57 | return true, nil 58 | } 59 | 60 | // Assert copy-data workflow ran successfully 61 | verifyCopyDataWorkflow := func() (bool, error) { 62 | return verifyWorkflow("copy-data") 63 | } 64 | utils.Poll(t, verifyCopyDataWorkflow, 50, 15*time.Second) 65 | 66 | // Assert project-setup workflow ran successfully 67 | verifyProjectSetupWorkflow := func() (bool, error) { 68 | return verifyWorkflow("project-setup") 69 | } 70 | utils.Poll(t, verifyProjectSetupWorkflow, 100, 15*time.Second) 71 | 72 | tables := []string{ 73 | "gcp_primary_raw.ga4_obfuscated_sample_ecommerce_images", 74 | "gcp_primary_raw.textocr_images", 75 | "gcp_primary_staging.new_york_taxi_trips_tlc_yellow_trips_2022", 76 | "gcp_primary_staging.thelook_ecommerce_distribution_centers", 77 | "gcp_primary_staging.thelook_ecommerce_events", 78 | "gcp_primary_staging.thelook_ecommerce_inventory_items", 79 | "gcp_primary_staging.thelook_ecommerce_order_items", 80 | "gcp_primary_staging.thelook_ecommerce_orders", 81 | "gcp_primary_staging.thelook_ecommerce_products", 82 | "gcp_primary_staging.thelook_ecommerce_users", 83 | "gcp_lakehouse_ds.agg_events_iceberg", 84 | } 85 | 86 | query_template := "SELECT count(*) AS count FROM `%[1]s.%[2]s`;" 87 | for _, table := range tables { 88 | query := fmt.Sprintf(query_template, projectID, table) 89 | op := bq.Runf(t, "--project_id=%[1]s query --nouse_legacy_sql 
%[2]s", projectID, query) 90 | 91 | count := op.Get("0.count").Int() 92 | assert.Greater(count, int64(0), table) 93 | } 94 | }) 95 | 96 | dwh.DefineTeardown(func(assert *assert.Assertions) { 97 | dwh.DefaultTeardown(assert) 98 | 99 | }) 100 | dwh.Test() 101 | } 102 | -------------------------------------------------------------------------------- /src/python/bigtable.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Copyright 2024 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # This file is used as a part of the Neos journey for the Analytics 17 | # Lakehouse Jumpstart solution. It is not automatically executed as a 18 | # part of the default deployment. 19 | 20 | """Bigtable to PySpark to BigQuery example.""" 21 | from pyspark.sql import SparkSession 22 | import sys 23 | 24 | # Must provide a project ID and an instance ID. 25 | if len(sys.argv) < 3: 26 | print("Please provide a project ID and an instance ID.") 27 | 28 | project_id = sys.argv[1] 29 | instance_id = sys.argv[2] 30 | 31 | # Create a Spark session and configure the spark-bigtable connector. 32 | spark = SparkSession.builder \ 33 | .config("spark.jars", 34 | "gs://spark-bigtable-preview/jars/" + 35 | "spark-bigtable-0.0.1-preview5-SNAPSHOT.jar") \ 36 | .getOrCreate() 37 | 38 | # Create the catalog schema to convert Bigtable columns to Spark. 39 | # "table" defnes the Bigtable namespace and table to read data from. 40 | # "rowkey" defines the rowkey. 41 | # "columns" are formatted as 42 | # "SPARK_DF_COLUMN_NAME":{ 43 | # "cf":"BIGTABLE_COLUMN_FAMILY", 44 | # "col":"BIGTABLE_COLUMN_NAME", 45 | # "type":"BIGTABLE_TYPE" 46 | # } 47 | catalog = ''.join(("""{ 48 | "table":{"namespace":"default", "name":"UserPersonalization"}, 49 | "rowkey":"rowkey", 50 | "columns":{ 51 | "_rowkey":{"cf":"rowkey", "col":"rowkey", "type":"string"}, 52 | "rec0":{ 53 | "cf":"Recommendations", 54 | "col":"Recommendation0", 55 | "type":"string" 56 | }, 57 | "rec1":{ 58 | "cf":"Recommendations", 59 | "col":"Recommendation1", 60 | "type":"string" 61 | }, 62 | "rec2":{ 63 | "cf":"Recommendations", 64 | "col":"Recommendation2", 65 | "type":"string" 66 | }, 67 | "rec3":{ 68 | "cf":"Recommendations", 69 | "col":"Recommendation3", 70 | "type":"string" 71 | } 72 | } 73 | }""").split()) 74 | 75 | # Load Bigtable data. 76 | df = spark.read \ 77 | .format('bigtable') \ 78 | .option('spark.bigtable.project.id', project_id) \ 79 | .option('spark.bigtable.instance.id', instance_id) \ 80 | .options(catalog=catalog) \ 81 | .load() 82 | 83 | 84 | # Create new dfs counting each recommended item per rec position. 85 | # Rename columns to join later. 
86 | def groupby_count_rename(df, col): 87 | return df.groupBy(col) \ 88 | .count() \ 89 | .withColumnRenamed(col, "item") \ 90 | .withColumnRenamed("count", col) 91 | 92 | 93 | r0 = groupby_count_rename(df, "rec0") 94 | r1 = groupby_count_rename(df, "rec1") 95 | r2 = groupby_count_rename(df, "rec2") 96 | r3 = groupby_count_rename(df, "rec3") 97 | 98 | # Join all columns together. The output is a table with 99 | # item names and number of times each name appears in each rec column. 100 | joined_df = r0.join(r1, r0.item == r1.item, 'outer') \ 101 | .join(r2, r0.item == r2.item, 'outer') \ 102 | .join(r3, r0.item == r3.item, 'outer') \ 103 | .select(r0.item, "rec0", "rec1", "rec2", "rec3") 104 | 105 | # Write the table to BigQuery. 106 | joined_df.write \ 107 | .format("bigquery") \ 108 | .option("writeMethod", "direct") \ 109 | .save("gcp_lakehouse_ds.user_recommendations") 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # terraform-google-lakehouse 2 | 3 | ## Description 4 | ### tagline 5 | This is an auto-generated module. 6 | 7 | ### detailed 8 | This module was generated from [terraform-google-module-template](https://github.com/terraform-google-modules/terraform-google-module-template/), which by default generates a module that simply creates a GCS bucket. As the module develops, this README should be updated. 9 | 10 | The resources/services/activations/deletions that this module will create/trigger are: 11 | 12 | - Create a GCS bucket with the provided name 13 | 14 | ### preDeploy 15 | To deploy this blueprint you must have an active billing account and billing permissions. 16 | 17 | ## Documentation 18 | - [Create an Analytics Lakehouse](https://cloud.google.com/architecture/big-data-analytics/analytics-lakehouse) 19 | 20 | ## Usage 21 | 22 | Basic usage of this module is as follows: 23 | 24 | ```hcl 25 | module "analytics_lakehouse" { 26 | source = "../.." 27 | 28 | project_id = var.project_id 29 | region = "us-central1" 30 | force_destroy = true 31 | 32 | } 33 | ``` 34 | 35 | Functional examples are included in the 36 | [examples](./examples/) directory. 37 | 38 | 39 | ## Inputs 40 | 41 | | Name | Description | Type | Default | Required | 42 | |------|-------------|------|---------|:--------:| 43 | | enable\_apis | Whether or not to enable underlying APIs in this solution. | `string` | `true` | no | 44 | | force\_destroy | Whether or not to protect GCS resources from deletion when solution is modified or changed. | `string` | `false` | no | 45 | | labels | A map of labels to apply to contained resources. | `map(string)` | <pre>{<br>  "analytics-lakehouse": true<br>}</pre>
| no | 46 | | project\_id | Google Cloud Project ID | `string` | n/a | yes | 47 | | public\_data\_bucket | Public Data bucket for access | `string` | `"data-analytics-demos"` | no | 48 | | region | Google Cloud Region | `string` | `"us-central1"` | no | 49 | | use\_case\_short | Short name for use case | `string` | `"lakehouse"` | no | 50 | 51 | ## Outputs 52 | 53 | | Name | Description | 54 | |------|-------------| 55 | | bigquery\_editor\_url | The URL to launch the BigQuery editor | 56 | | lakehouse\_colab\_url | The URL to launch the in-console tutorial for the Analytics Lakehouse solution | 57 | | lookerstudio\_report\_url | The URL to create a new Looker Studio report that displays a sample dashboard for data analysis | 58 | | neos\_tutorial\_url | The URL to launch the in-console tutorial for the Analytics Lakehouse solution | 59 | | workflow\_return\_project\_setup | Output of the project setup workflow | 60 | 61 | 62 | 63 | ## Requirements 64 | 65 | These sections describe requirements for using this module. 66 | 67 | ### Software 68 | 69 | The following dependencies must be available: 70 | 71 | - [Terraform][terraform] >= v0.13 72 | - [Terraform Provider for GCP][terraform-provider-gcp] plugin ~> v4.56 73 | 74 | ### Service Account 75 | 76 | A service account with the following roles must be used to provision 77 | the resources of this module: 78 | 79 | - Storage Admin: `roles/storage.admin` 80 | 81 | The [Project Factory module][project-factory-module] and the 82 | [IAM module][iam-module] may be used in combination to provision a 83 | service account with the necessary roles applied. 84 | 85 | ### APIs 86 | 87 | A project with the following APIs enabled must be used to host the 88 | resources of this module: 89 | 90 | - Google Cloud Storage JSON API: `storage-api.googleapis.com` 91 | 92 | The [Project Factory module][project-factory-module] can be used to 93 | provision a project with the necessary APIs enabled. 94 | 95 | ## Contributing 96 | 97 | Refer to the [contribution guidelines](./CONTRIBUTING.md) for 98 | information on contributing to this module. 99 | 100 | [iam-module]: https://registry.terraform.io/modules/terraform-google-modules/iam/google 101 | [project-factory-module]: https://registry.terraform.io/modules/terraform-google-modules/project-factory/google 102 | [terraform-provider-gcp]: https://www.terraform.io/docs/providers/google/index.html 103 | [terraform]: https://www.terraform.io/downloads.html 104 | 105 | ## Security Disclosures 106 | 107 | Please see our [security disclosure process](./SECURITY.md). 108 | -------------------------------------------------------------------------------- /deploy_solution.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2023 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | set -o pipefail 16 | 17 | handle_error() { 18 | local exit_code=$?
19 | exit $exit_code 20 | } 21 | trap 'handle_error' ERR 22 | 23 | SOLUTION_ID="analytics-lakehouse" 24 | 25 | echo "Fetching Project ID" 26 | PROJECT_ID=$(gcloud config get project) 27 | echo "Project ID is ${PROJECT_ID}" 28 | 29 | # Iterate over the Infra Manager locations to identify the deployment. 30 | # Currently only one deployment per project is supported; 31 | # in the future, if multiple deployments per project are supported, this will need to change. 32 | IM_SUPPORTED_REGIONS=("us-central1" "europe-west1" "asia-east1") 33 | 34 | for REGION in "${IM_SUPPORTED_REGIONS[@]}"; do 35 | DEPLOYMENT_NAME=$(gcloud infra-manager deployments list --location "${REGION}" \ 36 | --filter="labels.goog-solutions-console-deployment-name:* AND \ 37 | labels.goog-solutions-console-solution-id:${SOLUTION_ID}" \ 38 | --format='value(name)') 39 | if [ -n "$DEPLOYMENT_NAME" ]; then 40 | break 41 | fi 42 | done 43 | if [ -z "$DEPLOYMENT_NAME" ]; then 44 | echo "Failed to find the existing deployment, exiting now!" 45 | exit 1 46 | fi 47 | echo "Region is ${REGION}" 48 | echo "Deployment name is ${DEPLOYMENT_NAME}" 49 | 50 | SERVICE_ACCOUNT=$(gcloud infra-manager deployments describe "${DEPLOYMENT_NAME}" --location "${REGION}" --format='value(serviceAccount)') 51 | 52 | echo "Assigning required roles to the service account ${SERVICE_ACCOUNT}" 53 | # Iterate over the roles and check if the service account already has that role 54 | # assigned. If it has then skip adding that policy binding as using 55 | # --condition=None can overwrite any existing conditions in the binding. 56 | CURRENT_POLICY=$(gcloud projects get-iam-policy "${PROJECT_ID}" --format=json) 57 | MEMBER_EMAIL=$(echo "${SERVICE_ACCOUNT}" | awk -F '/' '{print $NF}') 58 | MEMBER="serviceAccount:${MEMBER_EMAIL}" 59 | 60 | while IFS= read -r role || [[ -n "$role" ]] 61 | do \ 62 | if echo "$CURRENT_POLICY" | jq -e --arg role "$role" --arg member "$MEMBER" '.bindings[] | select(.role == $role) | .members[] | select(. == $member)' > /dev/null; then \ 63 | echo "IAM policy binding already exists for member ${MEMBER} and role ${role}" 64 | else \ 65 | gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ 66 | --member="$MEMBER" \ 67 | --role="$role" \ 68 | --condition=None 69 | fi 70 | done < "roles.txt" 71 | 72 | DEPLOYMENT_DESCRIPTION=$(gcloud infra-manager deployments describe "${DEPLOYMENT_NAME}" --location "${REGION}" --format json) 73 | cat <<EOF > input.tfvars 74 | # Do not edit the region as changing the region can lead to failed deployment. 75 | region="$(echo "$DEPLOYMENT_DESCRIPTION" | jq -r '.terraformBlueprint.inputValues.region.inputValue')" 76 | project_id = "${PROJECT_ID}" 77 | labels = { 78 | "goog-solutions-console-deployment-name" = "${DEPLOYMENT_NAME}", 79 | "goog-solutions-console-solution-id" = "${SOLUTION_ID}" 80 | } 81 | EOF 82 | 83 | echo "An input.tfvars has been created in the current directory with a set of default input terraform variables for the solution. You can modify their values or go ahead with the defaults." 84 | read -r -p "Once done, press Enter to continue: " 85 | 86 | echo "Creating the cloud storage bucket if it does not exist already" 87 | BUCKET_NAME="${PROJECT_ID}_infra_manager_staging" 88 | if ! gsutil ls "gs://$BUCKET_NAME" &> /dev/null; then 89 | gsutil mb "gs://$BUCKET_NAME/" 90 | echo "Bucket $BUCKET_NAME created successfully." 91 | else 92 | echo "Bucket $BUCKET_NAME already exists. Moving on to the next step."
93 | fi 94 | 95 | echo "Deploying the solution" 96 | gcloud infra-manager deployments apply projects/"${PROJECT_ID}"/locations/"${REGION}"/deployments/"${DEPLOYMENT_NAME}" --service-account "${SERVICE_ACCOUNT}" --local-source="." --inputs-file=./input.tfvars --labels="modification-reason=make-it-mine,goog-solutions-console-deployment-name=${DEPLOYMENT_NAME},goog-solutions-console-solution-id=${SOLUTION_ID},goog-config-partner=sc" 97 | -------------------------------------------------------------------------------- /deploy_via_trigger.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2024 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o pipefail 17 | 18 | handle_error() { 19 | local exit_code=$? 20 | exit $exit_code 21 | } 22 | trap 'handle_error' ERR 23 | 24 | while getopts p: flag 25 | do 26 | case "${flag}" in 27 | p) PROJECT_ID=${OPTARG};; 28 | *) echo "usage: $0 [-p PROJECT_ID]" >&2 29 | exit 1 ;; 30 | esac 31 | done 32 | 33 | if [ -z "$PROJECT_ID" ]; then 34 | echo "Failed to read the project id, exiting now!" 35 | exit 1 36 | fi 37 | 38 | SOLUTION_ID="analytics-lakehouse" 39 | 40 | # Iterate over the Infra Manager locations to identify the deployment. 41 | # Currently only one deployment per project is supported; 42 | # in the future, if multiple deployments per project are supported, this will need to change. 43 | IM_SUPPORTED_REGIONS=("us-central1" "europe-west1" "asia-east1") 44 | 45 | for REGION in "${IM_SUPPORTED_REGIONS[@]}"; do 46 | DEPLOYMENT_NAME=$(gcloud infra-manager deployments list --location "${REGION}" \ 47 | --filter="labels.goog-solutions-console-deployment-name:* AND \ 48 | labels.goog-solutions-console-solution-id:${SOLUTION_ID}" \ 49 | --format='value(name)') 50 | if [ -n "$DEPLOYMENT_NAME" ]; then 51 | break 52 | fi 53 | done 54 | if [ -z "$DEPLOYMENT_NAME" ]; then 55 | echo "Failed to find the existing deployment, exiting now!" 56 | exit 1 57 | fi 58 | echo "Project ID is ${PROJECT_ID}" 59 | echo "Region is ${REGION}" 60 | echo "Deployment name is ${DEPLOYMENT_NAME}" 61 | 62 | SERVICE_ACCOUNT=$(gcloud infra-manager deployments describe "${DEPLOYMENT_NAME}" --location "${REGION}" --format='value(serviceAccount)') 63 | 64 | echo "Assigning required roles to the service account ${SERVICE_ACCOUNT}" 65 | # Iterate over the roles and check if the service account already has that role 66 | # assigned. If it has then skip adding that policy binding as using 67 | # --condition=None can overwrite any existing conditions in the binding.
68 | CURRENT_POLICY=$(gcloud projects get-iam-policy "${PROJECT_ID}" --format=json) 69 | MEMBER_EMAIL=$(echo "${SERVICE_ACCOUNT}" | awk -F '/' '{print $NF}') 70 | MEMBER="serviceAccount:${MEMBER_EMAIL}" 71 | apt-get install jq -y 72 | while IFS= read -r role || [[ -n "$role" ]] 73 | do \ 74 | if echo "$CURRENT_POLICY" | jq -e --arg role "$role" --arg member "$MEMBER" '.bindings[] | select(.role == $role) | .members[] | select(. == $member)' > /dev/null; then \ 75 | echo "IAM policy binding already exists for member ${MEMBER} and role ${role}" 76 | else \ 77 | gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ 78 | --member="$MEMBER" \ 79 | --role="$role" \ 80 | --condition=None 81 | fi 82 | done < "roles.txt" 83 | 84 | DEPLOYMENT_DESCRIPTION=$(gcloud infra-manager deployments describe "${DEPLOYMENT_NAME}" --location "${REGION}" --format json) 85 | cat <<EOF > input.tfvars 86 | # Do not edit the region as changing the region can lead to failed deployment. 87 | region="$(echo "$DEPLOYMENT_DESCRIPTION" | jq -r '.terraformBlueprint.inputValues.region.inputValue')" 88 | project_id = "${PROJECT_ID}" 89 | labels = { 90 | "goog-solutions-console-deployment-name" = "${DEPLOYMENT_NAME}", 91 | "goog-solutions-console-solution-id" = "${SOLUTION_ID}" 92 | } 93 | EOF 94 | 95 | echo "Creating the cloud storage bucket if it does not exist already" 96 | BUCKET_NAME="${PROJECT_ID}_infra_manager_staging" 97 | if ! gsutil ls "gs://$BUCKET_NAME" &> /dev/null; then 98 | gsutil mb "gs://$BUCKET_NAME/" 99 | echo "Bucket $BUCKET_NAME created successfully." 100 | else 101 | echo "Bucket $BUCKET_NAME already exists. Moving on to the next step." 102 | fi 103 | 104 | echo "Deploying the solution" 105 | gcloud infra-manager deployments apply projects/"${PROJECT_ID}"/locations/"${REGION}"/deployments/"${DEPLOYMENT_NAME}" --service-account "${SERVICE_ACCOUNT}" --local-source="." --inputs-file=./input.tfvars --labels="modification-reason=make-it-mine,goog-solutions-console-deployment-name=${DEPLOYMENT_NAME},goog-solutions-console-solution-id=${SOLUTION_ID},goog-config-partner=sc" 106 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement (CLA). You (or your employer) retain the copyright to your 10 | contribution; this simply gives us permission to use and redistribute your 11 | contributions as part of the project. Head over to 12 | <https://cla.developers.google.com/> to see your current agreements on file or 13 | to sign a new one. 14 | 15 | You generally only need to submit a CLA once, so if you've already submitted one 16 | (even if it was for a different project), you probably don't need to do it 17 | again. 18 | 19 | ## Code Reviews 20 | 21 | All submissions, including submissions by project members, require review. We 22 | use GitHub pull requests for this purpose. Consult 23 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 24 | information on using pull requests.
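Because the repository's lint workflow validates pull request titles with commitlint's conventional-commits configuration, it can save a review round trip to check a candidate title locally before opening the PR. A minimal sketch, assuming Node.js is available; the title string is only an example:

```bash
# Install the same commitlint packages the lint workflow uses
npm install -D @commitlint/cli @commitlint/config-conventional
echo "module.exports = { extends: ['@commitlint/config-conventional'] };" > commitlint.config.js

# Pipe a candidate PR title through commitlint, mirroring the workflow's check
echo "feat: add dataplex taxonomy subworkflow" | npx commitlint --verbose
```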
25 | 26 | ## Development 27 | 28 | The following dependencies must be installed on the development system: 29 | 30 | - [Docker Engine][docker-engine] 31 | - [Google Cloud SDK][google-cloud-sdk] 32 | - [make] 33 | 34 | ### Generating Documentation for Inputs and Outputs 35 | 36 | The Inputs and Outputs tables in the READMEs of the root module, 37 | submodules, and example modules are automatically generated based on 38 | the `variables` and `outputs` of the respective modules. These tables 39 | must be refreshed if the module interfaces are changed. 40 | 41 | #### Execution 42 | 43 | Run `make generate_docs` to generate new Inputs and Outputs tables. 44 | 45 | ### Integration Testing 46 | 47 | Integration tests are used to verify the behaviour of the root module, 48 | submodules, and example modules. Additions, changes, and fixes should 49 | be accompanied with tests. 50 | 51 | The integration tests are run using [Kitchen][kitchen], 52 | [Kitchen-Terraform][kitchen-terraform], and [InSpec][inspec]. These 53 | tools are packaged within a Docker image for convenience. 54 | 55 | The general strategy for these tests is to verify the behaviour of the 56 | [example modules](./examples/), thus ensuring that the root module, 57 | submodules, and example modules are all functionally correct. 58 | 59 | #### Test Environment 60 | The easiest way to test the module is in an isolated test project. The setup for such a project is defined in [test/setup](./test/setup/) directory. 61 | 62 | To use this setup, you need a service account with these permissions (on a Folder or Organization): 63 | - Project Creator 64 | - Project Billing Manager 65 | 66 | The project that the service account belongs to must have the following APIs enabled (the setup won't 67 | create any resources on the service account's project): 68 | - Cloud Resource Manager 69 | - Cloud Billing 70 | - Service Usage 71 | - Identity and Access Management (IAM) 72 | 73 | Export the Service Account credentials to your environment like so: 74 | 75 | ``` 76 | export SERVICE_ACCOUNT_JSON=$(< credentials.json) 77 | ``` 78 | 79 | You will also need to set a few environment variables: 80 | ``` 81 | export TF_VAR_org_id="your_org_id" 82 | export TF_VAR_folder_id="your_folder_id" 83 | export TF_VAR_billing_account="your_billing_account_id" 84 | ``` 85 | 86 | With these settings in place, you can prepare a test project using Docker: 87 | ``` 88 | make docker_test_prepare 89 | ``` 90 | 91 | #### Noninteractive Execution 92 | 93 | Run `make docker_test_integration` to test all of the example modules 94 | noninteractively, using the prepared test project. 95 | 96 | #### Interactive Execution 97 | 98 | 1. Run `make docker_run` to start the testing Docker container in 99 | interactive mode. 100 | 101 | 1. Run `kitchen_do create ` to initialize the working 102 | directory for an example module. 103 | 104 | 1. Run `kitchen_do converge ` to apply the example module. 105 | 106 | 1. Run `kitchen_do verify ` to test the example module. 107 | 108 | 1. Run `kitchen_do destroy ` to destroy the example module 109 | state. 110 | 111 | ### Linting and Formatting 112 | 113 | Many of the files in the repository can be linted or formatted to 114 | maintain a standard of quality. 115 | 116 | #### Execution 117 | 118 | Run `make docker_test_lint`. 
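The `docker_test_lint` target forwards the `EXCLUDE_LINT_DIRS` environment variable into the developer-tools container, so directories can be skipped without editing the Makefile. A minimal sketch; the pattern shown is an illustrative value, not a project default:

```bash
# EXCLUDE_LINT_DIRS is passed through by the docker_test_lint Makefile target;
# adjust the pattern to the directories you want the linters to ignore.
EXCLUDE_LINT_DIRS='\./test/fixtures' make docker_test_lint
```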
119 | 120 | [docker-engine]: https://www.docker.com/products/docker-engine 121 | [flake8]: http://flake8.pycqa.org/en/latest/ 122 | [gofmt]: https://golang.org/cmd/gofmt/ 123 | [google-cloud-sdk]: https://cloud.google.com/sdk/install 124 | [hadolint]: https://github.com/hadolint/hadolint 125 | [inspec]: https://inspec.io/ 126 | [kitchen-terraform]: https://github.com/newcontext-oss/kitchen-terraform 127 | [kitchen]: https://kitchen.ci/ 128 | [make]: https://en.wikipedia.org/wiki/Make_(software) 129 | [shellcheck]: https://www.shellcheck.net/ 130 | [terraform-docs]: https://github.com/segmentio/terraform-docs 131 | [terraform]: https://terraform.io/ 132 | -------------------------------------------------------------------------------- /workflows.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | resource "google_project_service_identity" "workflows" { 18 | provider = google-beta 19 | project = module.project-services.project_id 20 | service = "workflows.googleapis.com" 21 | 22 | depends_on = [time_sleep.wait_after_apis_activate] 23 | } 24 | 25 | resource "google_service_account" "workflows_sa" { 26 | project = module.project-services.project_id 27 | account_id = "workflows-sa-${random_id.id.hex}" 28 | display_name = "Workflows Service Account" 29 | 30 | depends_on = [google_project_service_identity.workflows] 31 | } 32 | 33 | resource "google_project_iam_member" "workflows_sa_roles" { 34 | for_each = toset([ 35 | "roles/workflows.admin", 36 | "roles/storage.admin", 37 | "roles/iam.serviceAccountTokenCreator", 38 | "roles/iam.serviceAccountUser", 39 | "roles/logging.logWriter", 40 | "roles/dataproc.admin", 41 | "roles/bigquery.admin", 42 | "roles/dataplex.admin" 43 | ]) 44 | 45 | project = module.project-services.project_id 46 | role = each.key 47 | member = "serviceAccount:${google_service_account.workflows_sa.email}" 48 | 49 | depends_on = [ 50 | google_service_account.workflows_sa 51 | ] 52 | } 53 | 54 | # Workflow to copy data from prod GCS bucket to private buckets 55 | # NOTE: google_storage_bucket.<name>.name omits the `gs://` prefix. 56 | # You can use google_storage_bucket.<name>.url to include the prefix.
57 | resource "google_workflows_workflow" "copy_data" { 58 | name = "copy-data" 59 | project = module.project-services.project_id 60 | region = var.region 61 | description = "Copies data and performs project setup" 62 | service_account = google_service_account.workflows_sa.email 63 | deletion_protection = false 64 | source_contents = templatefile("${path.module}/src/yaml/copy-data.yaml", { 65 | public_data_bucket = var.public_data_bucket, 66 | textocr_images_bucket = google_storage_bucket.textocr_images_bucket.name, 67 | ga4_images_bucket = google_storage_bucket.ga4_images_bucket.name, 68 | tables_bucket = google_storage_bucket.tables_bucket.name, 69 | dataplex_bucket = google_storage_bucket.dataplex_bucket.name, 70 | images_zone_name = google_dataplex_zone.gcp_primary_raw.name, 71 | tables_zone_name = google_dataplex_zone.gcp_primary_staging.name, 72 | lake_name = google_dataplex_lake.gcp_primary.name 73 | }) 74 | 75 | depends_on = [ 76 | google_project_iam_member.workflows_sa_roles, 77 | google_project_iam_member.dataproc_sa_roles 78 | ] 79 | 80 | } 81 | 82 | # Workflow to set up project resources 83 | resource "google_workflows_workflow" "project_setup" { 84 | name = "project-setup" 85 | project = module.project-services.project_id 86 | region = var.region 87 | description = "Copies data and performs project setup" 88 | service_account = google_service_account.workflows_sa.email 89 | deletion_protection = false 90 | source_contents = templatefile("${path.module}/src/yaml/project-setup.yaml", {}) 91 | 92 | depends_on = [ 93 | google_project_iam_member.workflows_sa_roles 94 | ] 95 | 96 | } 97 | 98 | # execute workflows after all resources are created 99 | # # get a token to execute the workflows 100 | data "google_client_config" "current" { 101 | } 102 | 103 | # # execute the copy data workflow 104 | data "http" "call_workflows_copy_data" { 105 | url = "https://workflowexecutions.googleapis.com/v1/projects/${module.project-services.project_id}/locations/${var.region}/workflows/${google_workflows_workflow.copy_data.name}/executions" 106 | method = "POST" 107 | request_headers = { 108 | Accept = "application/json" 109 | Authorization = "Bearer ${data.google_client_config.current.access_token}" } 110 | depends_on = [ 111 | google_storage_bucket.textocr_images_bucket, 112 | google_storage_bucket.ga4_images_bucket, 113 | google_storage_bucket.tables_bucket 114 | ] 115 | } 116 | 117 | resource "time_sleep" "wait_after_copy_data" { 118 | create_duration = "30s" 119 | depends_on = [ 120 | data.http.call_workflows_copy_data 121 | ] 122 | } 123 | 124 | # execute the other project setup workflow 125 | data "http" "call_workflows_project_setup" { 126 | url = "https://workflowexecutions.googleapis.com/v1/projects/${module.project-services.project_id}/locations/${var.region}/workflows/${google_workflows_workflow.project_setup.name}/executions" 127 | method = "POST" 128 | request_headers = { 129 | Accept = "application/json" 130 | Authorization = "Bearer ${data.google_client_config.current.access_token}" } 131 | depends_on = [ 132 | google_bigquery_dataset.gcp_lakehouse_ds, 133 | time_sleep.wait_for_dataplex_discovery, 134 | google_dataplex_asset.gcp_primary_ga4_obfuscated_sample_ecommerce, 135 | google_dataplex_asset.gcp_primary_tables, 136 | google_dataplex_asset.gcp_primary_textocr, 137 | google_project_iam_member.connection_permission_grant, 138 | google_project_iam_member.dataproc_sa_roles, 139 | google_service_account.dataproc_service_account, 140 | google_storage_bucket.provisioning_bucket, 141 | 
google_storage_bucket.warehouse_bucket, 142 | time_sleep.wait_after_copy_data 143 | ] 144 | } 145 | -------------------------------------------------------------------------------- /src/ipynb/exploratory-analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "!pip install chart-studio" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import numpy as np\n", 20 | "import scipy.optimize\n", 21 | "\n", 22 | "# Import and setup for plotly in Colab\n", 23 | "import chart_studio\n", 24 | "import chart_studio.plotly as py\n", 25 | "import plotly.graph_objects as go\n", 26 | "import plotly.io as pio\n", 27 | "import plotly.express as px\n", 28 | "\n", 29 | "# Enable displaying pandas data frames as interactive tables by default\n", 30 | "from google.colab import data_table\n", 31 | "data_table.enable_dataframe_formatter()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "PROJECT_ID = 'CHANGE_TO_PROJECT_ID'\n", 41 | "REGION = \"CHANGE_TO_DEPLOYMENT_REGION\"" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "from google.colab import auth\n", 51 | "auth.authenticate_user()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!gcloud config set project {PROJECT_ID}\n", 61 | "!gcloud config get-value project" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "%%bigquery --project {PROJECT_ID}\n", 71 | "SELECT\n", 72 | " o.order_id,\n", 73 | " o.user_id order_user_id,\n", 74 | " o.status order_status,\n", 75 | " o.created_at order_created_at,\n", 76 | " o.returned_at order_returned_at,\n", 77 | " o.shipped_at order_shipped_at,\n", 78 | " o.delivered_at order_delivered_at,\n", 79 | " o.num_of_item order_number_of_items,\n", 80 | " i.id AS order_items_id,\n", 81 | " i.product_id AS order_items_product_id,\n", 82 | " i.status order_items_status,\n", 83 | " i.sale_price order_items_sale_price,\n", 84 | " p.id AS product_id,\n", 85 | " p.cost product_cost,\n", 86 | " p.category product_category,\n", 87 | " p.name product_name,\n", 88 | " p.brand product_brand,\n", 89 | " p.retail_price product_retail_price,\n", 90 | " p.department product_department,\n", 91 | " p.sku product_sku,\n", 92 | " p.distribution_center_id,\n", 93 | " d.name AS dist_center_name,\n", 94 | " d.latitude dist_center_lat,\n", 95 | " d.longitude dist_center_long,\n", 96 | " u.id AS user_id,\n", 97 | " u.first_name user_first_name,\n", 98 | " u.last_name user_last_name,\n", 99 | " u.age user_age,\n", 100 | " u.gender user_gender,\n", 101 | " u.state user_state,\n", 102 | " u.postal_code user_postal_code,\n", 103 | " u.city user_city,\n", 104 | " u.country user_country,\n", 105 | " u.latitude user_lat,\n", 106 | " u.longitude user_long,\n", 107 | " u.traffic_source user_traffic_source\n", 108 | "FROM\n", 109 | " gcp_lakehouse_ds.gcp_tbl_orders o\n", 110 | "INNER JOIN\n", 111 | " gcp_lakehouse_ds.gcp_tbl_order_items i\n", 112 | "ON\n", 113 | " o.order_id = i.order_id\n", 114 | "INNER 
JOIN\n", 115 | " gcp_lakehouse_ds.gcp_tbl_products p\n", 116 | "ON\n", 117 | " i.product_id = p.id\n", 118 | "INNER JOIN\n", 119 | " gcp_lakehouse_ds.gcp_tbl_distribution_centers d\n", 120 | "ON\n", 121 | " p.distribution_center_id = d.id\n", 122 | "INNER JOIN\n", 123 | " gcp_lakehouse_ds.gcp_tbl_users u\n", 124 | "ON\n", 125 | " o.user_id = u.id\n", 126 | "limit 100" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "%%bigquery --project {PROJECT_ID}\n", 136 | "\n", 137 | "SELECT\n", 138 | "sum(order_id) as count,\n", 139 | " date(o.created_at) date\n", 140 | "FROM\n", 141 | " gcp_lakehouse_ds.gcp_tbl_orders o\n", 142 | " group by o.created_at\n", 143 | " order by date(o.created_at)\n", 144 | " limit 500" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "%%bigquery data --project {PROJECT_ID}\n", 154 | "\n", 155 | "SELECT\n", 156 | "sum(order_id) as count,\n", 157 | " date(o.created_at) date\n", 158 | "FROM\n", 159 | " gcp_lakehouse_ds.gcp_tbl_orders o\n", 160 | " group by o.created_at\n", 161 | " order by date(o.created_at)\n", 162 | " limit 500" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "data['date'] = pd.to_datetime(data['date'])\n", 172 | "data['date'] = data['date'].astype(np.int64) // 10**9\n", 173 | "data.head()" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "from datetime import datetime\n", 183 | "from matplotlib import pyplot\n", 184 | "\n", 185 | "fig, ax = pyplot.subplots(figsize=(20,12))\n", 186 | "data.plot(x='date', y='count', kind='scatter', ax=ax)\n", 187 | "ax.set_xticklabels([datetime.fromtimestamp(date).strftime('%Y/%m/%d') for date in ax.get_xticks()])" 188 | ] 189 | } 190 | ], 191 | "metadata": { 192 | "language_info": { 193 | "name": "python" 194 | }, 195 | "orig_nbformat": 4 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 2 199 | } 200 | -------------------------------------------------------------------------------- /src/sql/sp_lookerstudio_report.sql: -------------------------------------------------------------------------------- 1 | -- Copyright 2023 Google LLC 2 | -- 3 | -- Licensed under the Apache License, Version 2.0 (the "License"); 4 | -- you may not use this file except in compliance with the License. 5 | -- You may obtain a copy of the License at 6 | -- 7 | -- http://www.apache.org/licenses/LICENSE-2.0 8 | -- 9 | -- Unless required by applicable law or agreed to in writing, software 10 | -- distributed under the License is distributed on an "AS IS" BASIS, 11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | -- See the License for the specific language governing permissions and 13 | -- limitations under the License. 
14 | 15 | CREATE OR REPLACE TABLE `${project_id}.ds_edw.lookerstudio_report` 16 | AS 17 | WITH TaxiData AS 18 | ( 19 | SELECT VENDOR_ID as TaxiCompany, 20 | EXTRACT(YEAR FROM Pickup_DateTime) AS Year, 21 | EXTRACT(WEEK FROM Pickup_DateTime) AS WeekNumber, 22 | CONCAT('Week ',FORMAT("%02d", 23 | EXTRACT(WEEK FROM Pickup_DateTime))) AS WeekName, 24 | CONCAT(VENDOR_ID,':',EXTRACT(YEAR FROM Pickup_DateTime),':',FORMAT("%02d",EXTRACT(WEEK FROM Pickup_DateTime))) AS GroupPartition, 25 | COUNT(1) AS NumberOfRides, 26 | AVG(Trip_Distance) AS AvgDistance, 27 | SUM(Fare_Amount) AS Total_Fare_Amount, 28 | SUM(Extra) AS Total_Surcharge, 29 | SUM(MTA_Tax) AS Total_MTA_Tax, 30 | SUM(Tolls_Amount) AS Total_Tolls_Amount, 31 | SUM(imp_Surcharge) AS Total_Improvement_Surcharge, 32 | SUM(Tip_Amount) AS Total_Tip_Amount, 33 | SUM(Total_Amount) AS Total_Total_Amount 34 | FROM `${project_id}.ds_edw.taxi_trips` AS taxi_trips 35 | WHERE Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-02' --'2015-01-01' AND '2021-12-31' -- There is odd data in some of the source files from NYC 36 | GROUP BY 1, 2, 3, 4, 5 37 | ) 38 | , LagPercents AS 39 | ( 40 | SELECT TaxiCompany, 41 | Year, 42 | WeekNumber, 43 | WeekName, 44 | NumberOfRides, 45 | GroupPartition, 46 | AvgDistance, 47 | Total_Fare_Amount, 48 | Total_Surcharge, 49 | Total_MTA_Tax, 50 | Total_Tolls_Amount, 51 | Total_Improvement_Surcharge, 52 | Total_Tip_Amount, 53 | Total_Total_Amount, 54 | LAG(NumberOfRides) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_NumberOfRides, 55 | LAG(AvgDistance) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_AvgDistance, 56 | LAG(Total_Fare_Amount) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Fare_Amount, 57 | LAG(Total_Surcharge) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Surcharge, 58 | LAG(Total_MTA_Tax) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_MTA_Tax, 59 | LAG(Total_Tolls_Amount) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Tolls_Amount, 60 | LAG(Total_Improvement_Surcharge) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Improvement_Surcharge, 61 | LAG(Total_Tip_Amount) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Tip_Amount, 62 | LAG(Total_Total_Amount) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Total_Amount 63 | FROM TaxiData 64 | ) 65 | , PercentChange AS 66 | ( 67 | SELECT TaxiCompany, 68 | Year, 69 | WeekNumber, 70 | WeekName, 71 | GroupPartition, 72 | NumberOfRides, 73 | AvgDistance, 74 | Total_Fare_Amount, 75 | Total_Surcharge, 76 | Total_MTA_Tax, 77 | Total_Tolls_Amount, 78 | Total_Improvement_Surcharge, 79 | Total_Tip_Amount, 80 | Total_Total_Amount, 81 | Prior_Week_NumberOfRides, 82 | Prior_Week_AvgDistance, 83 | Prior_Week_Total_Fare_Amount, 84 | Prior_Week_Total_Surcharge, 85 | Prior_Week_Total_MTA_Tax, 86 | Prior_Week_Total_Tolls_Amount, 87 | Prior_Week_Total_Improvement_Surcharge, 88 | Prior_Week_Total_Tip_Amount, 89 | Prior_Week_Total_Total_Amount, 90 | SAFE_DIVIDE(CAST(NumberOfRides - Prior_Week_NumberOfRides AS NUMERIC) , CAST(Prior_Week_NumberOfRides AS NUMERIC)) AS PercentChange_NumberOfRides, 91 | SAFE_DIVIDE(CAST(AvgDistance - Prior_Week_AvgDistance AS NUMERIC) , CAST(Prior_Week_AvgDistance AS NUMERIC)) AS PercentChange_AvgDistance, 92 | SAFE_DIVIDE((Total_Fare_Amount - 
Prior_Week_Total_Fare_Amount) , Prior_Week_Total_Fare_Amount) AS PercentChange_Total_Fare_Amount, 93 | SAFE_DIVIDE((Total_Surcharge - Prior_Week_Total_Surcharge) , Prior_Week_Total_Surcharge) AS PercentChange_Total_Surcharge, 94 | SAFE_DIVIDE((Total_MTA_Tax - Prior_Week_Total_MTA_Tax) , Prior_Week_Total_MTA_Tax) AS PercentChange_Total_MTA_Tax, 95 | SAFE_DIVIDE((Total_Tolls_Amount - Prior_Week_Total_Tolls_Amount) , Prior_Week_Total_Tolls_Amount) AS PercentChange_Total_Tolls_Amount, 96 | SAFE_DIVIDE((Total_Improvement_Surcharge - Prior_Week_Total_Improvement_Surcharge) , Prior_Week_Total_Improvement_Surcharge) AS PercentChange_Total_Improvement_Surcharge, 97 | SAFE_DIVIDE((Total_Tip_Amount - Prior_Week_Total_Tip_Amount) , Prior_Week_Total_Tip_Amount) AS PercentChange_Total_Tip_Amount, 98 | SAFE_DIVIDE((Total_Total_Amount - Prior_Week_Total_Total_Amount) , Prior_Week_Total_Total_Amount) AS PercentChange_Total_Total_Amount 99 | FROM LagPercents 100 | ) 101 | SELECT * 102 | FROM PercentChange 103 | ORDER BY GroupPartition; 104 | 105 | CREATE OR REPLACE VIEW `${project_id}.ds_edw.vw_lookerstudio_report` as 106 | SELECT * FROM `${project_id}.ds_edw.lookerstudio_report` 107 | WHERE Year in (2022); 108 | -------------------------------------------------------------------------------- /bigquery.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | # Set up BigQuery resources 18 | 19 | # # Create the BigQuery dataset 20 | resource "google_bigquery_dataset" "gcp_lakehouse_ds" { 21 | project = module.project-services.project_id 22 | dataset_id = "gcp_lakehouse_ds" 23 | friendly_name = "My gcp_lakehouse Dataset" 24 | description = "My gcp_lakehouse Dataset with tables" 25 | location = var.region 26 | labels = var.labels 27 | delete_contents_on_destroy = var.force_destroy 28 | } 29 | 30 | # # Create a BigQuery connection for Spark 31 | resource "google_bigquery_connection" "spark" { 32 | project = module.project-services.project_id 33 | connection_id = "spark" 34 | location = var.region 35 | friendly_name = "gcp lakehouse spark connection" 36 | spark {} 37 | } 38 | 39 | # # This grants permissions to the service account of the Spark connection.
40 | resource "google_project_iam_member" "connection_permission_grant" { 41 | for_each = toset([ 42 | "roles/biglake.admin", 43 | "roles/bigquery.dataEditor", 44 | "roles/bigquery.connectionAdmin", 45 | "roles/bigquery.jobUser", 46 | "roles/bigquery.readSessionUser", 47 | "roles/storage.objectAdmin" 48 | ]) 49 | 50 | project = module.project-services.project_id 51 | role = each.key 52 | member = format("serviceAccount:%s", google_bigquery_connection.spark.spark[0].service_account_id) 53 | } 54 | 55 | locals { 56 | lakehouse_catalog = "lakehouse_catalog" 57 | } 58 | 59 | # # Creates a stored procedure for a spark job to create iceberg tables 60 | resource "google_bigquery_routine" "create_iceberg_tables" { 61 | project = module.project-services.project_id 62 | dataset_id = google_bigquery_dataset.gcp_lakehouse_ds.dataset_id 63 | routine_id = "create_iceberg_tables" 64 | routine_type = "PROCEDURE" 65 | language = "PYTHON" 66 | definition_body = "" 67 | arguments { 68 | name = "lakehouse_catalog" 69 | data_type = "{\"typeKind\" : \"STRING\"}" 70 | } 71 | arguments { 72 | name = "lakehouse_database" 73 | data_type = "{\"typeKind\" : \"STRING\"}" 74 | } 75 | arguments { 76 | name = "bq_dataset" 77 | data_type = "{\"typeKind\" : \"STRING\"}" 78 | } 79 | spark_options { 80 | connection = google_bigquery_connection.spark.name 81 | runtime_version = "2.1" 82 | main_file_uri = "gs://${google_storage_bucket_object.bigquery_file.bucket}/${google_storage_bucket_object.bigquery_file.name}" 83 | jar_uris = ["gs://spark-lib/biglake/biglake-catalog-iceberg1.2.0-0.1.0-with-dependencies.jar"] 84 | properties = { 85 | "spark.sql.catalog.lakehouse_catalog" : "org.apache.iceberg.spark.SparkCatalog", 86 | "spark.sql.catalog.lakehouse_catalog.blms_catalog" : local.lakehouse_catalog 87 | "spark.sql.catalog.lakehouse_catalog.catalog-impl" : "org.apache.iceberg.gcp.biglake.BigLakeCatalog", 88 | "spark.sql.catalog.lakehouse_catalog.gcp_location" : var.region, 89 | "spark.sql.catalog.lakehouse_catalog.gcp_project" : var.project_id, 90 | "spark.sql.catalog.lakehouse_catalog.warehouse" : "${google_storage_bucket.warehouse_bucket.url}/warehouse", 91 | "spark.jars.packages" : "org.apache.iceberg:iceberg-spark-runtime-3.3_2.13:1.2.1" 92 | } 93 | } 94 | } 95 | 96 | # # Execute after Dataplex discovery wait 97 | 98 | resource "google_bigquery_job" "create_view_ecommerce" { 99 | project = module.project-services.project_id 100 | location = var.region 101 | job_id = "create_view_ecommerce_${random_id.id.hex}" 102 | 103 | query { 104 | query = file("${path.module}/src/sql/view_ecommerce.sql") 105 | 106 | # Since the query contains DML, these must be set to empty. 
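# Editor's note (hedged): BigQuery rejects explicit create/write dispositions
# for jobs that run DDL/DML or call procedures. The Google provider would
# otherwise default them (to "CREATE_IF_NEEDED" and "WRITE_EMPTY"; an
# assumption about provider behavior, not something stated in this file), so
# both fields below are set to empty strings.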
107 | create_disposition = "" 108 | write_disposition = "" 109 | } 110 | 111 | depends_on = [time_sleep.wait_for_dataplex_discovery] 112 | } 113 | 114 | # resource "time_sleep" "check_create_view_ecommerce" { 115 | # create_duration = "30s" 116 | 117 | # depends_on = [google_bigquery_job.create_view_ecommerce] 118 | 119 | # lifecycle { 120 | # postcondition { 121 | # condition = google_bigquery_job.create_view_ecommerce.status.state == "DONE" && google_bigquery_job.create_view_ecommerce.status.error_result == null 122 | # error_message = "State: ${google_bigquery_job.create_view_ecommerce.status}, Error: ${google_bigquery_job.create_view_ecommerce.status.error_result.message}" 123 | # } 124 | # } 125 | # } 126 | 127 | resource "google_bigquery_job" "create_iceberg_tables" { 128 | project = module.project-services.project_id 129 | location = var.region 130 | job_id = "create_iceberg_tables_${random_id.id.hex}" 131 | 132 | query { 133 | query = "call gcp_lakehouse_ds.create_iceberg_tables('${local.lakehouse_catalog}', 'lakehouse_db', '${google_bigquery_dataset.gcp_lakehouse_ds.dataset_id}')" 134 | 135 | # Since the query calls a stored procedure, these must be set to empty. 136 | create_disposition = "" 137 | write_disposition = "" 138 | } 139 | 140 | depends_on = [time_sleep.wait_for_dataplex_discovery] 141 | } 142 | 143 | # resource "time_sleep" "check_create_iceberg_tables" { 144 | # create_duration = "300s" 145 | 146 | # depends_on = [google_bigquery_job.create_iceberg_tables] 147 | 148 | # lifecycle { 149 | # postcondition { 150 | # condition = google_bigquery_job.create_iceberg_tables.status.state == "DONE" && google_bigquery_job.create_iceberg_tables.status.error_result == null 151 | # error_message = "State: ${google_bigquery_job.create_iceberg_tables.status}, Error: ${google_bigquery_job.create_iceberg_tables.status.error_result.message}" 152 | # } 153 | # } 154 | # } 155 | 156 | -------------------------------------------------------------------------------- /src/yaml/copy-data.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
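# Editor's note (hedged sketch, not part of the original file): the ${...}
# tokens below are Terraform template variables, and $${...} escapes literal
# Workflows expressions, which suggests this file is rendered with
# templatefile() before deployment, roughly:
#
#   templatefile("${path.module}/src/yaml/copy-data.yaml", {
#     public_data_bucket = "some-public-source-bucket" # hypothetical value
#     # ...remaining variables wired to the module's buckets and Dataplex names
#   })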
14 | 15 | main: 16 | params: [] 17 | steps: 18 | - init: 19 | # Define local variables from terraform env variables 20 | assign: 21 | - source_bucket_name: ${public_data_bucket} 22 | - dest_ga4_images_bucket_name: ${ga4_images_bucket} 23 | - dest_textocr_images_bucket_name: ${textocr_images_bucket} 24 | - dest_tables_bucket_name: ${tables_bucket} 25 | - images_zone_name: ${images_zone_name} 26 | - tables_zone_name: ${tables_zone_name} 27 | - lake_name: ${lake_name} 28 | - dataplex_bucket: ${dataplex_bucket} 29 | # If this workflow has been run before, do not run again 30 | - sub_check_if_run: 31 | steps: 32 | - assign_values: 33 | assign: 34 | - project_id: $${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")} 35 | - location: $${sys.get_env("GOOGLE_CLOUD_LOCATION")} 36 | - workflow_id: $${sys.get_env("GOOGLE_CLOUD_WORKFLOW_ID")} 37 | - get_executions: 38 | call: http.get 39 | args: 40 | url: $${"https://workflowexecutions.googleapis.com/v1/projects/"+project_id+"/locations/"+location+"/workflows/"+workflow_id+"/executions"} 41 | auth: 42 | type: OAuth2 43 | result: Operation 44 | - check_if_run: 45 | switch: 46 | - condition: $${len(Operation.body.executions) > 1} 47 | next: end 48 | - sub_copy_data: 49 | parallel: 50 | branches: 51 | - copy_textocr_images: 52 | steps: 53 | - copy_textocr_images_call: 54 | call: copy_objects 55 | args: 56 | source_bucket_name: $${source_bucket_name} 57 | prefix: TextOCR_images 58 | dest_bucket_name: $${dest_textocr_images_bucket_name} 59 | result: copy_textocr_images_output 60 | - copy_ga4_images: 61 | steps: 62 | - copy_ga4_images_call: 63 | call: copy_objects 64 | args: 65 | source_bucket_name: $${source_bucket_name} 66 | prefix: ga4_obfuscated_sample_ecommerce_images 67 | dest_bucket_name: $${dest_ga4_images_bucket_name} 68 | result: copy_ga4_output 69 | - copy_new_york_taxi_trips_tables: 70 | steps: 71 | - copy_new_york_taxi_trips_tables_call: 72 | call: copy_objects 73 | args: 74 | source_bucket_name: $${source_bucket_name} 75 | prefix: new-york-taxi-trips 76 | dest_bucket_name: $${dest_tables_bucket_name} 77 | result: copy_new_york_taxi_trips_tables_output 78 | - copy_thelook_ecommerce_tables: 79 | steps: 80 | - copy_thelook_ecommerce_tables_call: 81 | call: copy_objects 82 | args: 83 | source_bucket_name: $${source_bucket_name} 84 | prefix: thelook_ecommerce 85 | dest_bucket_name: $${dest_tables_bucket_name} 86 | result: copy_thelook_ecommerce_tables_output 87 | - copy_dataplex_names_counts: 88 | steps: 89 | - copy_dataplex_names_counts_call: 90 | call: copy_objects 91 | args: 92 | source_bucket_name: $${source_bucket_name} 93 | prefix: views 94 | dest_bucket_name: $${dataplex_bucket} 95 | result: copy_dataplex_names_counts_output 96 | 97 | # Subworkflow to copy initial objects 98 | copy_objects: 99 | params: [source_bucket_name, prefix, dest_bucket_name] 100 | steps: 101 | - list_objects: 102 | call: googleapis.storage.v1.objects.list 103 | args: 104 | bucket: $${source_bucket_name} 105 | prefix: $${prefix} 106 | result: list_result 107 | - start_counter: 108 | assign: 109 | - copied_objects: 0 110 | - copy_objects: 111 | parallel: 112 | shared: [copied_objects] 113 | for: 114 | value: object 115 | index: i 116 | in: $${list_result.items} 117 | steps: 118 | - copy: 119 | try: 120 | steps: 121 | - copy_object: 122 | call: googleapis.storage.v1.objects.copy 123 | args: 124 | sourceBucket: $${source_bucket_name} 125 | sourceObject: $${text.url_encode(object.name)} 126 | destinationBucket: $${dest_bucket_name} 127 | destinationObject:
$${text.url_encode(object.name)} 128 | result: copy_result 129 | - save_result: 130 | assign: 131 | - copied_objects: $${copied_objects + 1} 132 | except: 133 | as: e 134 | raise: 135 | exception: $${e} 136 | sourceBucket: $${source_bucket_name} 137 | sourceObject: $${object.name} 138 | destinationBucket: $${dest_bucket_name} 139 | - finish: 140 | return: $${copied_objects + " objects copied"} 141 | -------------------------------------------------------------------------------- /.github/workflows/periodic-reporter.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # NOTE: This file is automatically generated from: 16 | # https://github.com/GoogleCloudPlatform/cloud-foundation-toolkit/blob/main/infra/terraform/modules/workflow_files/periodic-reporter.yaml 17 | 18 | name: 'reporter' 19 | 20 | on: 21 | schedule: 22 | # 2 hours after scheduled periodic and once again in the evening 23 | - cron: '0 5,17 * * *' 24 | workflow_dispatch: 25 | 26 | jobs: 27 | report: 28 | if: github.repository_owner == 'GoogleCloudPlatform' || github.repository_owner == 'terraform-google-modules' 29 | 30 | permissions: 31 | issues: 'write' 32 | 33 | runs-on: 'ubuntu-latest' 34 | 35 | steps: 36 | - uses: 'actions/github-script@v8' 37 | with: 38 | script: |- 39 | // label for all issues opened by reporter 40 | const periodicLabel = 'periodic-failure'; 41 | 42 | // check if any reporter opened any issues previously 43 | const prevIssues = await github.paginate(github.rest.issues.listForRepo, { 44 | ...context.repo, 45 | state: 'open', 46 | creator: 'github-actions[bot]', 47 | labels: [periodicLabel] 48 | }); 49 | // createOrCommentIssue creates a new issue or comments on an existing issue. 50 | const createOrCommentIssue = async function (title, txt) { 51 | if (prevIssues.length < 1) { 52 | console.log('no previous issues found, creating one'); 53 | await github.rest.issues.create({ 54 | ...context.repo, 55 | title: title, 56 | body: txt, 57 | labels: [periodicLabel] 58 | }); 59 | return; 60 | } 61 | if (prevIssues.length > 1) { 62 | console.warn( 63 | `found ${prevIssues.length} issues but only adding comment to ${prevIssues[0].html_url}` 64 | ); 65 | } 66 | console.log( 67 | `found previous issue ${prevIssues[0].html_url}, adding comment` 68 | ); 69 | await github.rest.issues.createComment({ 70 | ...context.repo, 71 | issue_number: prevIssues[0].number, 72 | body: txt 73 | }); 74 | }; 75 | 76 | // updateAndCloseIssues comments on any existing issues and closes them. No-op if no issue exists. 
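// Hypothetical usage note (editorial sketch, not part of the generated file):
// this helper is invoked further down in this script as, e.g.,
//   updateAndCloseIssues(`[Passing periodic](${periodicCheck.html_url}) at ${commit.html_url}. Closing this issue.`);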
77 | const updateAndCloseIssues = async function (txt) { 78 | if (prevIssues.length < 1) { 79 | console.log('no previous issues found, skipping close'); 80 | return; 81 | } 82 | for (const prevIssue of prevIssues) { 83 | console.log(`found previous issue ${prevIssue.html_url}, adding comment`); 84 | await github.rest.issues.createComment({ 85 | ...context.repo, 86 | issue_number: prevIssue.number, 87 | body: txt 88 | }); 89 | console.log(`closing ${prevIssue.html_url}`); 90 | await github.rest.issues.update({ 91 | ...context.repo, 92 | issue_number: prevIssue.number, 93 | body: txt, 94 | state: 'closed' 95 | }); 96 | } 97 | }; 98 | 99 | // Find status of check runs. 100 | // We will find check runs for each commit and then filter for the periodic. 101 | // Checks API only allows for ref and if we use main there could be edge cases where 102 | // the check run happened on a SHA that is different from head. 103 | const commits = await github.paginate(github.rest.repos.listCommits, { 104 | ...context.repo 105 | }); 106 | 107 | var foundCheck = false; 108 | let periodicCheck = {}; 109 | 110 | for (const commit of commits) { 111 | console.log( 112 | `checking runs at ${commit.html_url}: ${commit.commit.message}` 113 | ); 114 | const checks = await github.rest.checks.listForRef({ 115 | ...context.repo, 116 | ref: commit.sha 117 | }); 118 | // find runs for this commit 119 | for (const check of checks.data.check_runs) { 120 | console.log(`found run ${check.name} for ${commit.html_url}`); 121 | if (check.name.includes('periodic-int-trigger')) { 122 | foundCheck = true; 123 | periodicCheck = check; 124 | break; 125 | } 126 | } 127 | 128 | if (foundCheck) { 129 | if ( 130 | periodicCheck.status === 'completed' && 131 | periodicCheck.conclusion === 'success' 132 | ) { 133 | updateAndCloseIssues( 134 | `[Passing periodic](${periodicCheck.html_url}) at ${commit.html_url}. Closing this issue.` 135 | ); 136 | } else if (periodicCheck.status === 'in_progress') { 137 | console.log( 138 | `Check is pending ${periodicCheck.html_url} for ${commit.html_url}. Retry again later.` 139 | ); 140 | } 141 | // error case 142 | else { 143 | createOrCommentIssue( 144 | 'Failing periodic', 145 | `[Failing periodic](${periodicCheck.html_url}) at ${commit.html_url}.` 146 | ); 147 | } 148 | // exit early as check was found 149 | return; 150 | } 151 | } 152 | 153 | // no periodic-int-trigger checks found across all commits, report it 154 | createOrCommentIssue( 155 | 'Missing periodic', 156 | `Periodic test has not run in the past 24hrs. Last checked from ${ 157 | commits[0].html_url 158 | } to ${commits[commits.length - 1].html_url}.` 159 | ); 160 | -------------------------------------------------------------------------------- /main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | module "project-services" { 18 | source = "terraform-google-modules/project-factory/google//modules/project_services" 19 | version = "~> 18.0" 20 | disable_services_on_destroy = false 21 | 22 | project_id = var.project_id 23 | enable_apis = var.enable_apis 24 | 25 | activate_apis = [ 26 | "artifactregistry.googleapis.com", 27 | "biglake.googleapis.com", 28 | "bigquery.googleapis.com", 29 | "bigqueryconnection.googleapis.com", 30 | "bigquerydatapolicy.googleapis.com", 31 | "bigquerydatatransfer.googleapis.com", 32 | "bigquerymigration.googleapis.com", 33 | "bigqueryreservation.googleapis.com", 34 | "bigquerystorage.googleapis.com", 35 | "cloudapis.googleapis.com", 36 | "cloudbuild.googleapis.com", 37 | "cloudfunctions.googleapis.com", 38 | "compute.googleapis.com", 39 | "config.googleapis.com", 40 | "datacatalog.googleapis.com", 41 | "datalineage.googleapis.com", 42 | "dataplex.googleapis.com", 43 | "dataproc.googleapis.com", 44 | "iam.googleapis.com", 45 | "serviceusage.googleapis.com", 46 | "storage-api.googleapis.com", 47 | "storage.googleapis.com", 48 | "workflows.googleapis.com", 49 | "notebooks.googleapis.com", 50 | ] 51 | } 52 | 53 | resource "time_sleep" "wait_after_apis_activate" { 54 | depends_on = [module.project-services] 55 | create_duration = "30s" 56 | } 57 | 58 | # Random ID suffix used in resource names 59 | resource "random_id" "id" { 60 | byte_length = 4 61 | } 62 | 63 | # Set up Storage Buckets 64 | 65 | # # Set up the raw storage bucket 66 | resource "google_storage_bucket" "raw_bucket" { 67 | name = "gcp-${var.use_case_short}-raw-${random_id.id.hex}" 68 | project = module.project-services.project_id 69 | location = var.region 70 | uniform_bucket_level_access = true 71 | force_destroy = var.force_destroy 72 | 73 | # public_access_prevention = "enforced" # need to validate if this is a hard requirement 74 | } 75 | 76 | # # Set up the warehouse storage bucket 77 | resource "google_storage_bucket" "warehouse_bucket" { 78 | name = "gcp-${var.use_case_short}-warehouse-${random_id.id.hex}" 79 | project = module.project-services.project_id 80 | location = var.region 81 | uniform_bucket_level_access = true 82 | force_destroy = var.force_destroy 83 | 84 | # public_access_prevention = "enforced" # need to validate if this is a hard requirement 85 | } 86 | 87 | # # Set up the provisioning storage bucket 88 | resource "google_storage_bucket" "provisioning_bucket" { 89 | name = "gcp-${var.use_case_short}-provisioner-${random_id.id.hex}" 90 | project = module.project-services.project_id 91 | location = var.region 92 | uniform_bucket_level_access = true 93 | force_destroy = var.force_destroy 94 | 95 | } 96 | 97 | resource "google_storage_bucket" "ga4_images_bucket" { 98 | name = "gcp-${var.use_case_short}-ga4-images-${random_id.id.hex}" 99 | project = module.project-services.project_id 100 | location = var.region 101 | uniform_bucket_level_access = true 102 | force_destroy = var.force_destroy 103 | } 104 | 105 | resource "google_storage_bucket" "textocr_images_bucket" { 106 | name = "gcp-${var.use_case_short}-textocr-images-${random_id.id.hex}" 107 | project = module.project-services.project_id 108 | location = var.region 109 | uniform_bucket_level_access = true 110 | force_destroy = var.force_destroy 111 | } 112 | 113 | resource "google_storage_bucket" "tables_bucket" { 114 | name = "gcp-${var.use_case_short}-tables-${random_id.id.hex}" 115 | project = module.project-services.project_id 116 | location = var.region 117 | uniform_bucket_level_access = true 118 | force_destroy =
var.force_destroy 119 | } 120 | 121 | # Bucket used to store BI data in Dataplex 122 | resource "google_storage_bucket" "dataplex_bucket" { 123 | name = "gcp-${var.use_case_short}-dataplex-${random_id.id.hex}" 124 | project = module.project-services.project_id 125 | location = var.region 126 | uniform_bucket_level_access = true 127 | force_destroy = var.force_destroy 128 | } 129 | 130 | resource "google_storage_bucket_object" "bigquery_file" { 131 | bucket = google_storage_bucket.provisioning_bucket.name 132 | name = "bigquery.py" 133 | source = "${path.module}/src/python/bigquery.py" 134 | 135 | depends_on = [ 136 | google_storage_bucket.provisioning_bucket 137 | ] 138 | } 139 | 140 | resource "google_storage_bucket_object" "bigtable_file" { 141 | bucket = google_storage_bucket.provisioning_bucket.name 142 | name = "bigtable.py" 143 | source = "${path.module}/src/python/bigtable.py" 144 | 145 | depends_on = [ 146 | google_storage_bucket.provisioning_bucket 147 | ] 148 | } 149 | 150 | # Uploads the post-startup script for the workbench instance. 151 | resource "google_storage_bucket_object" "post_startup_script" { 152 | bucket = google_storage_bucket.provisioning_bucket.name 153 | name = "post_startup.sh" 154 | source = "${path.module}/src/shell/post_startup.sh" 155 | 156 | depends_on = [ 157 | google_storage_bucket.provisioning_bucket 158 | ] 159 | } 160 | 161 | resource "google_storage_bucket" "spark-log-directory" { 162 | name = "gcp-${var.use_case_short}-spark-log-directory-${random_id.id.hex}" 163 | project = module.project-services.project_id 164 | location = var.region 165 | uniform_bucket_level_access = true 166 | force_destroy = var.force_destroy 167 | } 168 | 169 | resource "google_storage_bucket" "phs-staging-bucket" { 170 | name = "gcp-${var.use_case_short}-phs-staging-${random_id.id.hex}" 171 | project = module.project-services.project_id 172 | location = var.region 173 | uniform_bucket_level_access = true 174 | force_destroy = var.force_destroy 175 | } 176 | 177 | resource "google_storage_bucket" "phs-temp-bucket" { 178 | name = "gcp-${var.use_case_short}-phs-temp-${random_id.id.hex}" 179 | project = module.project-services.project_id 180 | location = var.region 181 | uniform_bucket_level_access = true 182 | force_destroy = var.force_destroy 183 | } 184 | 185 | resource "google_storage_bucket" "sparkml-model-bucket" { 186 | name = "gcp-${var.use_case_short}-model-${random_id.id.hex}" 187 | project = module.project-services.project_id 188 | location = var.region 189 | uniform_bucket_level_access = true 190 | force_destroy = var.force_destroy 191 | } 192 | -------------------------------------------------------------------------------- /dataplex.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | resource "google_project_service_identity" "dataplex_sa" { 18 | provider = google-beta 19 | project = module.project-services.project_id 20 | service = "dataplex.googleapis.com" 21 | } 22 | 23 | # Grant the Dataplex service identity its service agent role on the project 24 | resource "google_project_iam_member" "dataplex_bucket_access" { 25 | project = module.project-services.project_id 26 | role = "roles/dataplex.serviceAgent" 27 | member = "serviceAccount:${google_project_service_identity.dataplex_sa.email}" 28 | } 29 | 30 | resource "google_dataplex_lake" "gcp_primary" { 31 | location = var.region 32 | name = "gcp-primary-lake" 33 | description = "gcp primary lake" 34 | display_name = "gcp primary lake" 35 | 36 | labels = { 37 | gcp-lake = "exists" 38 | } 39 | 40 | project = module.project-services.project_id 41 | 42 | depends_on = [ 43 | google_project_iam_member.dataplex_bucket_access 44 | ] 45 | 46 | } 47 | 48 | #zone - raw 49 | resource "google_dataplex_zone" "gcp_primary_raw" { 50 | discovery_spec { 51 | enabled = true 52 | } 53 | 54 | lake = google_dataplex_lake.gcp_primary.name 55 | location = var.region 56 | name = "gcp-primary-raw" 57 | 58 | resource_spec { 59 | location_type = "SINGLE_REGION" 60 | } 61 | 62 | type = "RAW" 63 | description = "Zone for thelook_ecommerce image data" 64 | display_name = "images" 65 | labels = {} 66 | project = module.project-services.project_id 67 | 68 | 69 | } 70 | 71 | #zone - curated, for staging the data 72 | resource "google_dataplex_zone" "gcp_primary_staging" { 73 | discovery_spec { 74 | enabled = true 75 | } 76 | 77 | lake = google_dataplex_lake.gcp_primary.name 78 | location = var.region 79 | name = "gcp-primary-staging" 80 | 81 | resource_spec { 82 | location_type = "SINGLE_REGION" 83 | } 84 | 85 | type = "CURATED" 86 | description = "Zone for thelook_ecommerce tabular data" 87 | display_name = "staging" 88 | labels = {} 89 | project = module.project-services.project_id 90 | } 91 | 92 | #zone - curated, for BI 93 | resource "google_dataplex_zone" "gcp_primary_curated_bi" { 94 | discovery_spec { 95 | enabled = true 96 | } 97 | 98 | lake = google_dataplex_lake.gcp_primary.name 99 | location = var.region 100 | name = "gcp-primary-curated" 101 | 102 | resource_spec { 103 | location_type = "SINGLE_REGION" 104 | } 105 | 106 | type = "CURATED" 107 | description = "Zone for thelook_ecommerce tabular data" 108 | display_name = "business_intelligence" 109 | labels = {} 110 | project = module.project-services.project_id 111 | } 112 | 113 | # Assets are defined below. Asset creation needs to wait until the data has been copied.
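# # Editor's note (hedged): each asset below attaches one storage bucket to a
# # zone with discovery enabled and gates on time_sleep.wait_after_copy_data,
# # a resource assumed to be defined elsewhere in this module (alongside the
# # copy-data workflow), roughly of the shape:
# #   resource "time_sleep" "wait_after_copy_data" { # assumed, not shown here
# #     create_duration = "..." # duration not shown in this file
# #   }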
114 | 115 | #asset 116 | resource "google_dataplex_asset" "gcp_primary_textocr" { 117 | name = "gcp-primary-textocr" 118 | location = var.region 119 | 120 | lake = google_dataplex_lake.gcp_primary.name 121 | dataplex_zone = google_dataplex_zone.gcp_primary_raw.name 122 | 123 | discovery_spec { 124 | enabled = true 125 | } 126 | 127 | resource_spec { 128 | name = "projects/${module.project-services.project_id}/buckets/${google_storage_bucket.textocr_images_bucket.name}" 129 | type = "STORAGE_BUCKET" 130 | read_access_mode = "MANAGED" 131 | } 132 | 133 | project = module.project-services.project_id 134 | depends_on = [time_sleep.wait_after_copy_data] 135 | 136 | } 137 | 138 | #asset 139 | resource "google_dataplex_asset" "gcp_primary_ga4_obfuscated_sample_ecommerce" { 140 | name = "gcp-primary-ga4-obfuscated-sample-ecommerce" 141 | location = var.region 142 | 143 | lake = google_dataplex_lake.gcp_primary.name 144 | dataplex_zone = google_dataplex_zone.gcp_primary_raw.name 145 | 146 | discovery_spec { 147 | enabled = true 148 | } 149 | 150 | resource_spec { 151 | name = "projects/${module.project-services.project_id}/buckets/${google_storage_bucket.ga4_images_bucket.name}" 152 | type = "STORAGE_BUCKET" 153 | read_access_mode = "MANAGED" 154 | } 155 | 156 | project = module.project-services.project_id 157 | depends_on = [time_sleep.wait_after_copy_data] 158 | 159 | } 160 | 161 | #asset 162 | resource "google_dataplex_asset" "gcp_primary_tables" { 163 | name = "gcp-primary-tables" 164 | location = var.region 165 | 166 | lake = google_dataplex_lake.gcp_primary.name 167 | dataplex_zone = google_dataplex_zone.gcp_primary_staging.name 168 | 169 | discovery_spec { 170 | enabled = true 171 | } 172 | 173 | resource_spec { 174 | name = "projects/${module.project-services.project_id}/buckets/${google_storage_bucket.tables_bucket.name}" 175 | type = "STORAGE_BUCKET" 176 | read_access_mode = "MANAGED" 177 | } 178 | 179 | project = module.project-services.project_id 180 | depends_on = [time_sleep.wait_after_copy_data] 181 | } 182 | 183 | # Add a wait for Dataplex Discovery. 184 | # Discovery on this data generally takes 6-8 minutes. 
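# # Editor's note (grounded in the locals below): discovery publishes each
# # zone's tables into a BigQuery dataset named after the zone with "-"
# # replaced by "_" (e.g. zone "gcp-primary-staging" becomes dataset
# # "gcp_primary_staging"), which is how the datascan below locates
# # thelook_ecommerce_orders once the wait has elapsed.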
185 | resource "time_sleep" "wait_for_dataplex_discovery" { 186 | depends_on = [ 187 | google_dataplex_asset.gcp_primary_tables, 188 | google_dataplex_asset.gcp_primary_ga4_obfuscated_sample_ecommerce, 189 | google_dataplex_asset.gcp_primary_textocr 190 | ] 191 | 192 | create_duration = "600s" 193 | } 194 | 195 | locals { 196 | datascan_dataset = replace(google_dataplex_zone.gcp_primary_staging.name, "-", "_") 197 | } 198 | 199 | resource "google_dataplex_datascan" "dq_scan" { 200 | project = module.project-services.project_id 201 | location = var.region 202 | data_scan_id = "thelook-ecommerce-orders" 203 | 204 | data { 205 | resource = "//bigquery.googleapis.com/projects/${module.project-services.project_id}/datasets/${local.datascan_dataset}/tables/thelook_ecommerce_orders" 206 | } 207 | 208 | execution_spec { 209 | trigger { 210 | on_demand {} 211 | } 212 | } 213 | 214 | data_quality_spec { 215 | rules { 216 | column = "order_id" 217 | dimension = "COMPLETENESS" 218 | name = "non-null" 219 | description = "Sample rule for non-null column" 220 | threshold = 1.0 221 | non_null_expectation {} 222 | } 223 | 224 | rules { 225 | column = "user_id" 226 | dimension = "COMPLETENESS" 227 | name = "non-null" 228 | description = "Sample rule for non-null column" 229 | threshold = 1.0 230 | non_null_expectation {} 231 | } 232 | 233 | rules { 234 | column = "created_at" 235 | dimension = "COMPLETENESS" 236 | name = "non-null" 237 | description = "Sample rule for non-null column" 238 | threshold = 1.0 239 | non_null_expectation {} 240 | } 241 | 242 | rules { 243 | column = "order_id" 244 | dimension = "UNIQUENESS" 245 | name = "unique" 246 | description = "Sample rule for a unique column" 247 | uniqueness_expectation {} 248 | } 249 | 250 | rules { 251 | column = "status" 252 | dimension = "VALIDITY" 253 | name = "one-of-set" 254 | description = "Sample rule for values in a set" 255 | ignore_null = false 256 | set_expectation { 257 | values = ["Shipped", "Complete", "Processing", "Cancelled", "Returned"] 258 | } 259 | } 260 | 261 | rules { 262 | column = "num_of_item" 263 | dimension = "VALIDITY" 264 | name = "range-values" 265 | description = "Sample rule for values in a range" 266 | ignore_null = false 267 | threshold = 0.99 268 | range_expectation { 269 | max_value = 1 270 | strict_max_enabled = false 271 | strict_min_enabled = false 272 | } 273 | } 274 | 275 | rules { 276 | dimension = "VALIDITY" 277 | name = "non-empty-table" 278 | description = "Sample rule for a non-empty table" 279 | table_condition_expectation { 280 | sql_expression = "COUNT(*) > 0" 281 | } 282 | } 283 | } 284 | 285 | depends_on = [time_sleep.wait_for_dataplex_discovery] 286 | } 287 | -------------------------------------------------------------------------------- /src/ipynb/spark_langchain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8c79ba30-439d-4064-a2ac-859ea887ce75", 6 | "metadata": {}, 7 | "source": [ 8 | "## Tutorial" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "b339cf14-d890-458e-9587-21a788936ab2", 14 | "metadata": {}, 15 | "source": [ 16 | "### Install Langchain\n", 17 | "\n", 18 | "Install the `langchain`, `langchain-experimental`, and `langchain-google-genai` libraries. You can install these directly into your Spark Serverless environment.
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "23bdf160-2885-43c3-86d3-cfb3f605eb82", 25 | "metadata": { 26 | "tags": [] 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "pip install langchain langchain-experimental langchain-google-genai" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "f2dbf357-d8b3-4038-84d2-19b06b49bf9a", 36 | "metadata": {}, 37 | "source": [ 38 | "### Create an API key\n", 39 | "\n", 40 | "Create an API key using Google AI Studio. Run the next cell and paste the API key in when prompted.\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "63b1d896-164e-4c56-87d2-04447ee9d628", 47 | "metadata": { 48 | "tags": [] 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "from getpass import getpass\n", 53 | "\n", 54 | "api_key = getpass()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "c4709327-ca03-4b31-9a86-4df91c4dd788", 60 | "metadata": { 61 | "tags": [] 62 | }, 63 | "source": [ 64 | "### Import required libraries" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "id": "ce553641-3093-4016-9e17-20a76e4e2aa9", 71 | "metadata": { 72 | "tags": [] 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "from langchain_experimental.agents.agent_toolkits import create_spark_dataframe_agent\n", 77 | "from langchain_google_genai import GoogleGenerativeAI\n", 78 | "from pyspark.sql import SparkSession" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "3f2e2e70-f2ab-4394-9332-8d6499ea3bb9", 84 | "metadata": { 85 | "tags": [] 86 | }, 87 | "source": [ 88 | "### Create a connection to the Gemini model service\n", 89 | "\n", 90 | "Create an LLM object using the `GoogleGenerativeAI` class which creates a connection to the Gemini model service." 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "d3d47140-434a-4a0a-96e1-d62b9192e519", 97 | "metadata": { 98 | "tags": [] 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "llm = GoogleGenerativeAI(model=\"gemini-1.5-pro\", temperature=0.0, google_api_key=api_key)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "588bf4b5-e5f9-4437-923e-11430713cddf", 108 | "metadata": {}, 109 | "source": [ 110 | "Use `llm.invoke` to ask Gemini a question and confirm your connection to the service." 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "7d713cb8-c656-4285-9050-b2b6a9d2c1b2", 117 | "metadata": { 118 | "tags": [] 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "print(llm.invoke(\"What is the best programming language?\"))" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "f58b9819-2142-43fb-ab40-d1ff82d8bf8f", 128 | "metadata": {}, 129 | "source": [ 130 | "### Create a Spark Session\n", 131 | "\n", 132 | "Create a connection to the Spark context in your environment." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "ee16d694-4652-4c38-98ec-88a24df5e175", 139 | "metadata": { 140 | "tags": [] 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "spark = SparkSession.builder.getOrCreate()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "id": "f1d266c0-c32b-437e-a36a-79a339288c65", 150 | "metadata": { 151 | "tags": [] 152 | }, 153 | "source": [ 154 | "### Load data\n", 155 | "\n", 156 | "Load your BigLake table `gcp_primary_staging.thelook_ecommerce_order_items` into your environment. 
This table contains ecommerce orders." 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "id": "e0288a55-d7c5-4843-b5bd-c510b82a549c", 163 | "metadata": { 164 | "tags": [] 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "df = spark.read.format(\"bigquery\").load(\"next-2024-spark-demo.gcp_primary_staging.thelook_ecommerce_order_items\")" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "id": "93f0f829-6506-4607-95cf-30882ce7bd15", 174 | "metadata": { 175 | "tags": [] 176 | }, 177 | "source": [ 178 | "View some of the data" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "id": "38c40cda-092e-4f8e-bd7e-2be47559e8a6", 185 | "metadata": { 186 | "tags": [] 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "df.show(10)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "id": "5825a341-2230-4e85-b482-9b378979c13e", 196 | "metadata": { 197 | "tags": [] 198 | }, 199 | "source": [ 200 | "Use the `create_spark_dataframe_agent` method to configure a LangChain agent using the loaded dataset and Gemini model. The `verbose=True` parameter sends the steps the agent is taking to stdout; omitting this parameter suppresses this output." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "id": "3986ec6b-4dd8-4102-92b0-74479da4351b", 207 | "metadata": { 208 | "tags": [] 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "agent = create_spark_dataframe_agent(llm=llm, df=df, verbose=True)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "id": "50f7bd72-4082-40c9-adc6-1312490d8a1d", 218 | "metadata": {}, 219 | "source": [ 220 | "Use natural language to gain insights into your data. To start with something simple, ask for the order_id and the price of the most expensive order." 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "id": "82cc9524-5d36-46f6-b8cc-21c6849708cd", 227 | "metadata": { 228 | "tags": [] 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "agent.invoke(\"what was the order id and the price of the most expensive order?\")" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "9172275f-51cb-4fdc-b803-a34c28aca7f8", 238 | "metadata": {}, 239 | "source": [ 240 | "With the verbose parameter set to True, we can see exactly how the agent is working. The agent generates code based on the schema of the dataframe and executes it. It doesn't always get it on the first try, but it is able to learn from the errors it sees to adjust and correct until it lands on an acceptable answer.\n", 241 | "\n", 242 | "Next, make a request that involves the agent importing new functions." 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "id": "fa86d21d-b52f-4797-a7bd-6102b33d49bc", 249 | "metadata": { 250 | "tags": [] 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "agent.invoke(\"What week of the year has the total highest sales overall?\")" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "id": "84fba518-fd15-4dc2-96db-c51cb84c6138", 260 | "metadata": {}, 261 | "source": [ 262 | "Now, you probably don't want to include this natural language prompt directly in a production environment. Instead, we can ask Gemini to generate the PySpark code for us that would create the same output."
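,
"\n",
"For reference, a hand-written sketch of the kind of code the agent might produce (the `created_at` and `sale_price` column names are assumptions about this table's schema, not output captured from the agent):\n",
"\n",
"```python\n",
"from pyspark.sql import functions as F\n",
"\n",
"# Total sales per ISO week, highest week first.\n",
"(df.withColumn(\"week\", F.weekofyear(\"created_at\"))\n",
"   .groupBy(\"week\")\n",
"   .agg(F.sum(\"sale_price\").alias(\"total_sales\"))\n",
"   .orderBy(F.desc(\"total_sales\"))\n",
"   .show(1))\n",
"```"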
263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "id": "16b41a6a-482c-4ce8-aa63-4e1c7d9de305", 269 | "metadata": { 270 | "tags": [] 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "agent.invoke(\"Print the PySpark code that answers 'What week of the year has the total highest sales overall?' Include all necessary imports.\")" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "id": "ccc75dce-bfab-4652-850c-2242dc28f9a1", 280 | "metadata": {}, 281 | "source": [ 282 | "As with anything created by still-maturing LLM technology, review generated code for accuracy." 283 | ] 284 | } 285 | ], 286 | "metadata": { 287 | "environment": { 288 | "kernel": "9c39b79e5d2e7072beb4bd59-next-2024", 289 | "name": "workbench-notebooks.m113", 290 | "type": "gcloud", 291 | "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m113" 292 | }, 293 | "kernelspec": { 294 | "display_name": "next-2024 on Serverless Spark (Remote)", 295 | "language": "python", 296 | "name": "9c39b79e5d2e7072beb4bd59-next-2024" 297 | }, 298 | "language_info": { 299 | "codemirror_mode": { 300 | "name": "ipython", 301 | "version": 3 302 | }, 303 | "file_extension": ".py", 304 | "mimetype": "text/x-python", 305 | "name": "python", 306 | "nbconvert_exporter": "python", 307 | "pygments_lexer": "ipython3", 308 | "version": "3.12.2" 309 | } 310 | }, 311 | "nbformat": 4, 312 | "nbformat_minor": 5 313 | } 314 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below).
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on 6 | [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 7 | and this project adheres to 8 | [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
9 | This changelog is generated automatically based on [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/). 10 | 11 | ## [0.4.0](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/compare/v0.3.0...v0.4.0) (2024-01-23) 12 | 13 | 14 | ### Features 15 | 16 | * add bucket for PHS created in Spark Serverless Interactive Tutorial ([e087195](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/e08719526836af1e4197ef55005b3291920b7909)) 17 | * adding sparkml notebook ([#99](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/99)) ([4b2169a](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/4b2169a11be058d495884a4ee455f49ef109b754)) 18 | * adding unit tests, removing unused arg from README ([#93](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/93)) ([bb9257b](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/bb9257b975d7b9635cb249f1a3867c5c0a14369b)) 19 | * create a bucket for dataplex ([#76](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/76)) ([ccadcc0](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/ccadcc0667d1b3e81f7f093c2a0acc83e567120a)) 20 | * **deps:** Update Terraform Google Provider to v5 (major) ([#79](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/79)) ([40ab09d](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/40ab09d2006f6052740afffc5df7cdaf06352c16)) 21 | 22 | 23 | ### Bug Fixes 24 | 25 | * add service account to phs cluster ([#82](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/82)) ([04a9fae](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/04a9fae8e4f1bb9cbe1a420bb9c89c79d1849ddb)) 26 | * add unique hash to the service account name ([#71](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/71)) ([c16912d](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/c16912d7c3d182671dceac4067ba196aa814948a)) 27 | * change data file paths to point to root directory ([#60](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/60)) ([4621da0](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/4621da033d56f88bb0c03948b1c2e0c5108c297d)) 28 | * **deps:** update module github.com/googlecloudplatform/cloud-foundation-toolkit/infra/blueprint-test to v0.8.0 ([#63](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/63)) ([54075a5](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/54075a59ef58fe8a156cac8f36f295ee149125a2)) 29 | * **deps:** update terraform google-beta to v4.74.0 ([#57](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/57)) ([f3848c3](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/f3848c3a71518930b94c582406d3100a0e29bcde)) 30 | * **deps:** update terraform google-beta to v4.75.0 ([#58](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/58)) ([10a452e](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/10a452e4f0e612f4ce63deb7559fc4c45bed3be0)) 31 | * **deps:** Update Terraform google-beta to v4.81.0 
([#66](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/66)) ([825fd7d](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/825fd7d163e361711c7a23c14b68b65125def50a)) 32 | * **deps:** Update Terraform google-beta to v4.82.0 ([#70](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/70)) ([cc8373f](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/cc8373fdda84690982c6c928480d67dfacb3d979)) 33 | * **deps:** Update Terraform google-beta to v4.83.0 ([#73](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/73)) ([a2cabdb](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/a2cabdb10ec92242f8d17c72f8734a47937fa7e6)) 34 | * **deps:** Update Terraform google-beta to v4.84.0 ([#74](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/74)) ([c70d9af](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/c70d9af5958fd6a6a792f1ebeee542b2f21ddb1b)) 35 | * **deps:** Update Terraform terraform-google-modules/project-factory/google to v14.3.0 ([#65](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/65)) ([a59521a](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/a59521a0f7017cee43f166677ab50546245504e2)) 36 | * **deps:** Update Terraform terraform-google-modules/project-factory/google to v14.4.0 ([#87](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/87)) ([8ca39d1](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/8ca39d18d09f6c0e5e08faa9ee5392b857d7fd96)) 37 | * remove compute instance check from integration test teardown ([#110](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/110)) ([e07095d](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/e07095df965d032a8d64f83fa5511f442cc9c433)) 38 | * rolling back PHS creation in deployment ([#105](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/105)) ([f5acf8e](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/f5acf8e9289422c549ea0243f0db5f8d3972399b)) 39 | * set staging and temp bucket for phs cluster ([#88](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/88)) ([c7ff112](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/c7ff1121a5a38645531c3eb44201c08bb6407713)) 40 | * Update Terraform versioning, improve dependency tree, remove unused table, add Managed Tables to Dataplex Assets ([#72](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/72)) ([9283feb](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/9283febc691cb313b97adc242dc38605dc3976d4)) 41 | * wait for Dataplex IAM to create lake ([#86](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/86)) ([9f42b95](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/9f42b95015f6bb65ee67c9b5ada2e06a8b9a3274)) 42 | * wait to create dataproc cluster until SA roles are assigned ([#91](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/91)) ([66bb99b](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/66bb99b2be3801abe86220f0a331c18b29bbe577)) 43 | 44 | ## 
[0.3.0](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/compare/v0.2.1...v0.3.0) (2023-07-18) 45 | 46 | 47 | ### Bug Fixes 48 | 49 | * **deps:** update terraform google-beta to v4.70.0 ([6460f59](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/6460f59c1bd6464dbb46b5561ee4ffa0109f75ff)) 50 | * **deps:** update terraform google-beta to v4.71.0 ([c64944b](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/c64944b1a7e3c73c87bb0bcb49696cc9b8693084)) 51 | * **deps:** update terraform google-beta to v4.73.1 ([47c1b4f](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/47c1b4f60367404c303c945d3b5dd46a0a378815)) 52 | * **deps:** update terraform google-beta to v4.73.2 ([f6f8cb8](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/f6f8cb836f6d2d67ba775b795778b754893bcca0)) 53 | * **deps:** update terraform terraform-google-modules/project-factory/google to v14.2.1 ([a6ca8a1](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/a6ca8a13dc0dbfc79683c5e43b43593957407064)) 54 | * upgrade dataplex tables to managed, create new zone, remove manual table creation ([52a45f2](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/52a45f2aee107dfd6fde04ce92e77cf7b61c4e5c)) 55 | 56 | ## [0.2.1](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/compare/v0.2.0...v0.2.1) (2023-06-22) 57 | 58 | 59 | ### Bug Fixes 60 | 61 | * update neos toc url ([#47](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/47)) ([629f00b](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/629f00b679faf1f29c676514f0ef7869c7b9ee8a)) 62 | 63 | ## [0.2.0](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/compare/v0.1.0...v0.2.0) (2023-06-14) 64 | 65 | 66 | ### Features 67 | 68 | * add polling logic to Spark workflow ([9ea1517](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/9ea151703ccdfb13998d1220f29885a55aeae547)) 69 | * adds metadata generation for the blueprint ([#34](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/34)) ([ef1b35c](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/ef1b35cf28d897cae3beff4dd4200617be902d20)) 70 | 71 | 72 | ### Bug Fixes 73 | 74 | * **deps:** update terraform google-beta to v4.69.1 ([28a034d](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/28a034d2115a0982ed3b5df02e7f91be696e8e33)) 75 | * **deps:** update terraform googles to <= 4.69.0, != 4.65.0, != 4.65.1 ([9a9852e](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/9a9852e7084ae0d3e0699437ea8ec78817f33104)) 76 | * **deps:** update terraform terraform-google-modules/project-factory/google to v14 ([e5e5d00](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/e5e5d00774ee5f7881b799fbb4ad435094b3087c)) 77 | * refactor references from 'assets' directory to 'src' ([acf7efb](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/acf7efba619230102e7691778ab69e47facc27aa)) 78 | * Update int.cloudbuild.yaml to use LR billing ([#43](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/43)) 
([1d0ddc7](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/1d0ddc78ec473c7ca2c0863a9abdf1da2edc15f7)) 79 | 80 | ## 0.1.0 (2023-05-17) 81 | 82 | 83 | ### Features 84 | 85 | * output.tf additions ([#14](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/14)) ([07d4ea4](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/07d4ea4afd488c5df6899529fb60556a93aaaca7)) 86 | 87 | 88 | ### Bug Fixes 89 | 90 | * Biglake cleanup ([#10](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/10)) ([98646d8](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/98646d8f305554749f5afd7ab46e790f97d527fd)) 91 | * formatting and linting ([#12](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/12)) ([5e55357](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/5e553573532115bd7888600dc0c1565f79ef5b53)) 92 | * Lakehouse cleanup ([#9](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/9)) ([c474b66](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/c474b665018babe96ab897a1a338b703ac0a3b95)) 93 | * move RAP to Neos ([#24](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/24)) ([4a2aeb6](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/4a2aeb60a32f4bc79d08f008ad69bf2bc03a3792)) 94 | * pin google provider version to before 4.65 or not equal to 4.65 ([0510153](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/0510153a1849ff5f134a28cb7569f2970c142e93)) 95 | * pin google provider version to v4.64.0 ([32a83ba](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/32a83bac28f6c50de009d15333cc3ac61fc5be0a)) 96 | * update colab link ([#16](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/16)) ([20ef826](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/20ef8266bd0c70f35625008c3806a33099ded396)) 97 | * update neos, remove solution guide output ([7357552](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/735755295278b6c89cc9dbbe811f109bf96d8b52)) 98 | * Workflow dependency ([#23](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/23)) ([6e2b2df](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/6e2b2df7eba67ac2403da0a80c85a5ae99e067e9)) 99 | * workflows and remove hardcoding ([675b35c](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/675b35ce15db043204dd4bcfaa73faffe2933164)) 100 | -------------------------------------------------------------------------------- /test/integration/go.sum: -------------------------------------------------------------------------------- 1 | github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test v0.17.6 h1:ZuWhmUXY/co2jqEUYYosTlAruqzATzrYQ4IV5VKiKNM= 2 | github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test v0.17.6/go.mod h1:UX+iYMTzZ7Ik6N5rD8U32x7QwKaGyG/aAflWWDaHMDc= 3 |
github.com/agext/levenshtein v1.2.3 h1:YB2fHEn0UJagG8T1rrWknE3ZQzWM06O8AMAatNn7lmo= 4 | github.com/agext/levenshtein v1.2.3/go.mod h1:JEDfjyjHDjOF/1e4FlBE/PkbqA9OfWu2ki2W0IB5558= 5 | github.com/alexflint/go-filemutex v1.3.0 h1:LgE+nTUWnQCyRKbpoceKZsPQbs84LivvgwUymZXdOcM= 6 | github.com/alexflint/go-filemutex v1.3.0/go.mod h1:U0+VA/i30mGBlLCrFPGtTe9y6wGQfNAWPBTekHQ+c8A= 7 | github.com/apparentlymart/go-textseg/v15 v15.0.0 h1:uYvfpb3DyLSCGWnctWKGj857c6ew1u1fNQOlOtuGxQY= 8 | github.com/apparentlymart/go-textseg/v15 v15.0.0/go.mod h1:K8XmNZdhEBkdlyDdvbmmsvpAG721bKi0joRfFdHIWJ4= 9 | github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d h1:xDfNPAt8lFiC1UJrqV3uuy861HCTo708pDMbjHHdCas= 10 | github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d/go.mod h1:6QX/PXZ00z/TKoufEY6K/a0k6AhaJrQKdFe6OfVXsa4= 11 | github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= 12 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 13 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 14 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= 15 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 16 | github.com/go-errors/errors v1.5.0 h1:/EuijeGOu7ckFxzhkj4CXJ8JaenxK7bKUxpPYqeLHqQ= 17 | github.com/go-errors/errors v1.5.0/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= 18 | github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= 19 | github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= 20 | github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= 21 | github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= 22 | github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= 23 | github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= 24 | github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= 25 | github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= 26 | github.com/go-test/deep v1.0.7 h1:/VSMRlnY/JSyqxQUzQLKVMAskpY/NZKFA5j2P+0pP2M= 27 | github.com/go-test/deep v1.0.7/go.mod h1:QV8Hv/iy04NyLBxAdO9njL0iVPN1S4d/A3NVv1V36o8= 28 | github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= 29 | github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= 30 | github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 31 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 32 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 33 | github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= 34 | github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= 35 | github.com/gruntwork-io/terratest v0.48.2 h1:+VwfODchq8jxZZWD+s8gBlhD1z6/C4bFLNrhpm9ONrs= 36 | github.com/gruntwork-io/terratest v0.48.2/go.mod h1:Y5ETyD4ZQ2MZhasPno272fWuCpKwvTPYDi8Y0tIMqTE= 37 | github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= 38 | github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= 39 | github.com/hashicorp/errwrap v1.1.0/go.mod 
h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= 40 | github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= 41 | github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= 42 | github.com/hashicorp/go-getter/v2 v2.2.3 h1:6CVzhT0KJQHqd9b0pK3xSP0CM/Cv+bVhk+jcaRJ2pGk= 43 | github.com/hashicorp/go-getter/v2 v2.2.3/go.mod h1:hp5Yy0GMQvwWVUmwLs3ygivz1JSLI323hdIE9J9m7TY= 44 | github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= 45 | github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= 46 | github.com/hashicorp/go-safetemp v1.0.0 h1:2HR189eFNrjHQyENnQMMpCiBAsRxzbTMIgBhEyExpmo= 47 | github.com/hashicorp/go-safetemp v1.0.0/go.mod h1:oaerMy3BhqiTbVye6QuFhFtIceqFoDHxNAB65b+Rj1I= 48 | github.com/hashicorp/go-version v1.7.0 h1:5tqGy27NaOTB8yJKUZELlFAS/LTKJkrmONwQKeRZfjY= 49 | github.com/hashicorp/go-version v1.7.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= 50 | github.com/hashicorp/hcl v0.0.0-20170504190234-a4b07c25de5f h1:UdxlrJz4JOnY8W+DbLISwf2B8WXEolNRA8BGCwI9jws= 51 | github.com/hashicorp/hcl v0.0.0-20170504190234-a4b07c25de5f/go.mod h1:oZtUIOe8dh44I2q6ScRibXws4Ajl+d+nod3AaR9vL5w= 52 | github.com/hashicorp/hcl/v2 v2.22.0 h1:hkZ3nCtqeJsDhPRFz5EA9iwcG1hNWGePOTw6oyul12M= 53 | github.com/hashicorp/hcl/v2 v2.22.0/go.mod h1:62ZYHrXgPoX8xBnzl8QzbWq4dyDsDtfCRgIq1rbJEvA= 54 | github.com/hashicorp/terraform-config-inspect v0.0.0-20250203082807-efaa306e97b4 h1:6zYoI+NGpRPo0UjbnJfmqqTFcTEKvbv77h0ZcgeLXJs= 55 | github.com/hashicorp/terraform-config-inspect v0.0.0-20250203082807-efaa306e97b4/go.mod h1:Gz/z9Hbn+4KSp8A2FBtNszfLSdT2Tn/uAKGuVqqWmDI= 56 | github.com/hashicorp/terraform-json v0.24.0 h1:rUiyF+x1kYawXeRth6fKFm/MdfBS6+lW4NbeATsYz8Q= 57 | github.com/hashicorp/terraform-json v0.24.0/go.mod h1:Nfj5ubo9xbu9uiAoZVBsNOjvNKB66Oyrvtit74kC7ow= 58 | github.com/jinzhu/copier v0.4.0 h1:w3ciUoD19shMCRargcpm0cm91ytaBhDvuRpz1ODO/U8= 59 | github.com/jinzhu/copier v0.4.0/go.mod h1:DfbEm0FYsaqBcKcFuvmOZb218JkPGtvSHsKg8S8hyyg= 60 | github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= 61 | github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= 62 | github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= 63 | github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= 64 | github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= 65 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 66 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 67 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 68 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 69 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 70 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 71 | github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= 72 | github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= 73 | github.com/mattn/go-shellwords v1.0.12 h1:M2zGm7EW6UQJvDeQxo4T51eKPurbeFbe8WtebGE2xrk= 74 | github.com/mattn/go-shellwords v1.0.12/go.mod h1:EZzvwXDESEeg03EKmM+RmDnNOPKG4lLtQsUlTZDWQ8Y= 75 | github.com/mattn/go-zglob v0.0.4 h1:LQi2iOm0/fGgu80AioIJ/1j9w9Oh+9DZ39J4VAGzHQM= 76 | github.com/mattn/go-zglob 
v0.0.4/go.mod h1:MxxjyoXXnMxfIpxTK2GAkw1w8glPsQILx3N5wrKakiY= 77 | github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= 78 | github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= 79 | github.com/mitchellh/go-testing-interface v1.14.2-0.20210821155943-2d9075ca8770 h1:drhDO54gdT/a15GBcMRmunZiNcLgPiFIJa23KzmcvcU= 80 | github.com/mitchellh/go-testing-interface v1.14.2-0.20210821155943-2d9075ca8770/go.mod h1:SO/iHr6q2EzbqRApt+8/E9wqebTwQn5y+UlB04bxzo0= 81 | github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQflz0v0= 82 | github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0= 83 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 84 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= 85 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 86 | github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= 87 | github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= 88 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 89 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= 90 | github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= 91 | github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= 92 | github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 93 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 94 | github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= 95 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= 96 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 97 | github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= 98 | github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= 99 | github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= 100 | github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= 101 | github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= 102 | github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= 103 | github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= 104 | github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= 105 | github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= 106 | github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= 107 | github.com/tmccombs/hcl2json v0.6.4 h1:/FWnzS9JCuyZ4MNwrG4vMrFrzRgsWEOVi+1AyYUVLGw= 108 | github.com/tmccombs/hcl2json v0.6.4/go.mod h1:+ppKlIW3H5nsAsZddXPy2iMyvld3SHxyjswOZhavRDk= 109 | github.com/ulikunitz/xz v0.5.11 h1:kpFauv27b6ynzBNT/Xy+1k+fK4WswhN/6PN5WhFAGw8= 110 | github.com/ulikunitz/xz v0.5.11/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= 111 | github.com/zclconf/go-cty v1.15.1 h1:RgQYm4j2EvoBRXOPxhUvxPzRrGDo1eCOhHXuGfrj5S0= 112 | github.com/zclconf/go-cty v1.15.1/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE= 113 
| github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940 h1:4r45xpDWB6ZMSMNJFMOjqrGHynW3DIBuR2H9j0ug+Mo= 114 | github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940/go.mod h1:CmBdvvj3nqzfzJ6nTCIwDTPZ56aVGvDrmztiO5g3qrM= 115 | golang.org/x/crypto v0.35.0 h1:b15kiHdrGCHrP6LvwaQ3c03kgNhhiMgvlhxHQhmg2Xs= 116 | golang.org/x/crypto v0.35.0/go.mod h1:dy7dXNW32cAb/6/PRuTNsix8T+vJAqvuIy5Bli/x0YQ= 117 | golang.org/x/mod v0.23.0 h1:Zb7khfcRGKk+kqfxFaP5tZqCnDZMjC5VtUBs87Hr6QM= 118 | golang.org/x/mod v0.23.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= 119 | golang.org/x/net v0.36.0 h1:vWF2fRbw4qslQsQzgFqZff+BItCvGFQqKzKIzx1rmoA= 120 | golang.org/x/net v0.36.0/go.mod h1:bFmbeoIPfrw4sMHNhb4J9f6+tPziuGjq7Jk/38fxi1I= 121 | golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w= 122 | golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 123 | golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 124 | golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= 125 | golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 126 | golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU= 127 | golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s= 128 | golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= 129 | golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= 130 | golang.org/x/tools v0.26.0 h1:v/60pFQmzmT9ExmjDv2gGIfi3OqfKoEP6I5+umXlbnQ= 131 | golang.org/x/tools v0.26.0/go.mod h1:TPVVj70c7JJ3WCazhD8OdXcZg/og+b9+tH/KxylGwH0= 132 | google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= 133 | google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= 134 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 135 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 136 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 137 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 138 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 139 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 140 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 141 | k8s.io/kube-openapi v0.0.0-20241212222426-2c72e554b1e7 h1:hcha5B1kVACrLujCKLbr8XWMxCxzQx42DY8QKYJrDLg= 142 | k8s.io/kube-openapi v0.0.0-20241212222426-2c72e554b1e7/go.mod h1:GewRfANuJ70iYzvn+i4lezLDAFzvjxZYK1gn1lWcfas= 143 | sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= 144 | sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= 145 | sigs.k8s.io/kustomize/kyaml v0.19.0 h1:RFge5qsO1uHhwJsu3ipV7RNolC7Uozc0jUBC/61XSlA= 146 | sigs.k8s.io/kustomize/kyaml v0.19.0/go.mod h1:FeKD5jEOH+FbZPpqUghBP8mrLjJ3+zD3/rf9NNu1cwY= 147 | sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= 148 | sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= 149 | -------------------------------------------------------------------------------- /src/ipynb/spark_ml.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "4d6c84304016" 7 | }, 8 | "source": [ 9 | "# SparkML with Dataproc Serverless" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "id": "3eee516156f1" 16 | }, 17 | "source": [ 18 | "## Overview\n", 19 | "\n", 20 | "This notebook tutorial demonstrates the execution of Apache SparkML jobs using Dataproc Serverless. This example machine learning pipeline ingests the [NYC TLC (Taxi and Limousine Commission) Trips](https://console.cloud.google.com/marketplace/product/city-of-new-york/nyc-tlc-trips) dataset from your lakehouse and performs cleaning, feature engineering, model training, and model evaluation to calculate trip duration.\n", 21 | "\n", 22 | "The tutorial uses the following Google Cloud products:\n", 23 | "- `Dataproc`\n", 24 | "- `BigQuery`\n", 25 | "- `Vertex AI Training`\n", 26 | "- `BigLake`" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "id": "2773979cd11d" 33 | }, 34 | "source": [ 35 | "## Tutorial\n", 36 | "\n", 37 | "### Set your project ID, location, and session ID" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": { 44 | "id": "20366a83e3f1", 45 | "tags": [] 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "# Retrieve the current active project and store it as a list of strings.\n", 50 | "PROJECT_ID = !gcloud config get-value project\n", 51 | "\n", 52 | "# Extract the project ID from the list.\n", 53 | "PROJECT_ID = PROJECT_ID[0] if PROJECT_ID else None\n", 54 | "\n", 55 | "# Retrieve the current location.\n", 56 | "LOCATION = !gcloud compute instances list --project={PROJECT_ID} --format='get(ZONE)'\n", 57 | "LOCATION = str(LOCATION).split(\"/\")[-1][:-4]\n", 58 | "\n", 59 | "# Get the name of the active Dataproc Serverless Session\n", 60 | "SESSION = !gcloud beta dataproc sessions list --location='{LOCATION}' --filter='state=ACTIVE' --format='get(SESSION_ID)' --sort-by='~createTime'\n", 61 | "SESSION = SESSION[0].split('/')[-1] if SESSION else None" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": { 67 | "id": "73f5734153d4" 68 | }, 69 | "source": [ 70 | "### Get a Cloud Storage bucket URI" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "id": "06cb3320201b", 78 | "tags": [] 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "# Define the prefix of the bucket created via Terraform.\n", 83 | "BUCKET_PREFIX = \"gcp-lakehouse-model\"\n", 84 | "\n", 85 | "# Retrieve the Cloud Storage bucket URI for storing the machine learning model.\n", 86 | "BUCKET_URI = !gcloud storage buckets list --format='value(name)' --filter='name:{BUCKET_PREFIX}*'\n", 87 | "\n", 88 | "# Extract the bucket URI from the list.\n", 89 | "BUCKET_URI = BUCKET_URI[0] if BUCKET_URI else None" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "id": "34dd442fd5af" 96 | }, 97 | "source": [ 98 | "### Import required libraries" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "id": "80ef02298a93", 106 | "tags": [] 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "import matplotlib.pyplot as plt\n", 111 | "import seaborn as sns\n", 112 | "from geopandas import gpd\n", 113 | "from pyspark.ml.evaluation import RegressionEvaluator\n", 114 | "from pyspark.ml.feature import VectorAssembler\n", 115 | "from pyspark.ml.regression import GBTRegressor\n", 116 | "# A Spark Session is how you 
interact with Spark SQL to create DataFrames\n", 117 | "from pyspark.sql import SparkSession\n", 118 | "# PySpark functions\n", 119 | "from pyspark.sql.functions import col, floor, unix_timestamp" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": { 125 | "id": "2b6fb4d7c7f5" 126 | }, 127 | "source": [ 128 | "### Initialize the SparkSession\n", 129 | "\n", 130 | "Use the [spark-bigquery-connector](https://github.com/GoogleCloudDataproc/spark-bigquery-connector) to read and write data between Apache Spark and BigQuery." 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "id": "3ce77cd7c0d2", 138 | "tags": [] 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "VER = \"0.34.0\"\n", 143 | "FILE_NAME = f\"spark-bigquery-with-dependencies_2.12-{VER}.jar\"\n", 144 | "connector = f\"gs://spark-lib/bigquery/{FILE_NAME}\"\n", 145 | "\n", 146 | "# Initialize the SparkSession.\n", 147 | "spark = (\n", 148 | "    SparkSession.builder.appName(\"spark-ml-taxi\")\n", 149 | "    .config(\"spark.jars\", connector)\n", 150 | "    .config(\"spark.logConf\", \"false\")\n", 151 | "    .getOrCreate()\n", 152 | ")" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": { 158 | "id": "3a4080065c8b" 159 | }, 160 | "source": [ 161 | "### Fetch data\n", 162 | "\n", 163 | "Load the table `gcp_primary_staging.new_york_taxi_trips_tlc_yellow_trips_2022`." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "id": "4a5f19a732ed", 171 | "tags": [] 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "# Load the NYC taxi trips table from BigQuery.\n", 176 | "taxi_df = (\n", 177 | "    spark.read.format(\"bigquery\")\n", 178 | "    .option(\n", 179 | "        \"table\",\n", 180 | "        f\"{PROJECT_ID}.gcp_primary_staging.new_york_taxi_trips_tlc_yellow_trips_2022\",\n", 181 | "    )\n", 182 | "    .load()\n", 183 | ")\n", 184 | "\n", 185 | "# Sample parameter. Increase or decrease to experiment with different data sizes.\n", 186 | "FRACTION = 0.05\n", 187 | "\n", 188 | "# Sample data to minimize the runtime.\n", 189 | "taxi_df = taxi_df.sample(fraction=FRACTION, seed=42)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": { 195 | "id": "26cef587d4de" 196 | }, 197 | "source": [ 198 | "### Perform Exploratory Data Analysis (EDA)\n", 199 | "\n", 200 | "Perform EDA to uncover more information about your data." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "id": "37d76ec684b9", 208 | "tags": [] 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "taxi_df.printSchema()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": { 218 | "id": "64d2006cab0f" 219 | }, 220 | "source": [ 221 | "Select and modify necessary columns."
222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "id": "a0aa977e3d27", 229 | "tags": [] 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "# Choose necessary columns.\n", 234 | "COLUMNS_TO_SELECT = [\n", 235 | " \"start_time\",\n", 236 | " \"end_time\",\n", 237 | " \"passenger_count\",\n", 238 | " \"trip_distance\",\n", 239 | " \"trip_duration\",\n", 240 | " \"fare_amount\",\n", 241 | " \"extra\",\n", 242 | " \"mta_tax\",\n", 243 | " \"tip_amount\",\n", 244 | " \"tolls_amount\",\n", 245 | " \"imp_surcharge\",\n", 246 | " \"airport_fee\",\n", 247 | " \"total_amount\",\n", 248 | " \"start_zone_id\",\n", 249 | " \"end_zone_id\",\n", 250 | "]\n", 251 | "\n", 252 | "# Convert pickup_location_id and dropoff_location_id to integers for a later processing step:\n", 253 | "taxi_df = (\n", 254 | " taxi_df.withColumn(\"start_zone_id\", col(\"pickup_location_id\").cast(\"int\")) # Convert pickup_location_id to integer\n", 255 | " .withColumn(\"end_zone_id\", col(\"dropoff_location_id\").cast(\"int\")) # Convert dropoff_location_id to integer\n", 256 | ")\n", 257 | "\n", 258 | "# Convert datetime from string to Unix timestamp:\n", 259 | "taxi_df = (\n", 260 | " taxi_df.withColumn(\"start_time\", unix_timestamp(col(\"pickup_datetime\"))) # Convert pickup_datetime to Unix timestamp\n", 261 | " .withColumn(\"end_time\", unix_timestamp(col(\"dropoff_datetime\"))) # Convert dropoff_datetime to Unix timestamp\n", 262 | ")\n", 263 | "\n", 264 | "# Calculate trip_duration.\n", 265 | "taxi_df = taxi_df.withColumn(\"trip_duration\", col(\"end_time\") - col(\"start_time\"))\n", 266 | "\n", 267 | "# Select the specified columns:\n", 268 | "taxi_df = taxi_df.select(*COLUMNS_TO_SELECT) # Selects columns based on the list in COLUMNS_TO_SELECT\n", 269 | "\n", 270 | "# Display summary statistics and preview the modified DataFrame.\n", 271 | "taxi_df.describe().show()" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": { 277 | "id": "970761b1f949" 278 | }, 279 | "source": [ 280 | "Build a boxplot to further assess the data." 
281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "# Convert Spark DataFrame into a Pandas DataFrame.\n", 290 | "taxi_pd = taxi_df.toPandas()\n", 291 | "\n", 292 | "# Define the columns to convert to a numeric type in Pandas and visualize.\n", 293 | "PD_COLUMNS = [\n", 294 | "    \"trip_distance\",\n", 295 | "    \"fare_amount\",\n", 296 | "    \"extra\",\n", 297 | "    \"mta_tax\",\n", 298 | "    \"tip_amount\",\n", 299 | "    \"tolls_amount\",\n", 300 | "    \"imp_surcharge\",\n", 301 | "    \"airport_fee\",\n", 302 | "    \"total_amount\",\n", 303 | "]\n", 304 | "\n", 305 | "# Convert columns of \"object\" type to the float type.\n", 306 | "taxi_pd[PD_COLUMNS] = taxi_pd[PD_COLUMNS].astype(float)\n", 307 | "\n", 308 | "# Box plots and histograms for the specified columns.\n", 309 | "for column in taxi_pd.columns:\n", 310 | "    if column in PD_COLUMNS:\n", 311 | "        _, ax = plt.subplots(1, 2, figsize=(5, 2))\n", 312 | "        taxi_pd[column].plot(kind=\"box\", ax=ax[0])\n", 313 | "        taxi_pd[column].plot(kind=\"hist\", ax=ax[1])\n", 314 | "        plt.title(column)\n", 315 | "        plt.figure()\n", 316 | "plt.show()" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "The summary statistics and boxplots show over 1 million Yellow Taxi trip records from 2022, representing approximately 5% of the total trips.\n", 324 | "\n", 325 | "However, some trip histories have data anomalies. Trips exceeding 10,000 miles are beyond realistic expectations and will be excluded. Additionally, null and negative values in fare, tax, and tolls create inconsistencies and can distort analysis. Filter these values out of the data." 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": { 332 | "id": "a9196e32f245", 333 | "tags": [] 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "taxi_df = taxi_df.where(\n", 338 | "    (col(\"trip_distance\") < 10000)\n", 339 | "    & (col(\"fare_amount\") > 0)\n", 340 | "    & (col(\"extra\") >= 0)\n", 341 | "    & (col(\"mta_tax\") >= 0)\n", 342 | "    & (col(\"tip_amount\") >= 0)\n", 343 | "    & (col(\"tolls_amount\") >= 0)\n", 344 | "    & (col(\"imp_surcharge\") >= 0)\n", 345 | "    & (col(\"airport_fee\") >= 0)\n", 346 | "    & (col(\"total_amount\") > 0)\n", 347 | ").dropna()" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": { 353 | "id": "9a930952e7da" 354 | }, 355 | "source": [ 356 | "### Perform Feature Engineering\n", 357 | "\n", 358 | "While the Taxi dataset contains trips for all NYC boroughs, precise location information is categorized using `NYC Taxi zones`. Use the `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom` public dataset to calculate longitude and latitude values."
359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": { 365 | "id": "7b58375ab96a", 366 | "tags": [] 367 | }, 368 | "outputs": [], 369 | "source": [ 370 | "# Load the NYC Taxi zone geometries from the BigQuery public dataset.\n", 371 | "geo_df = (\n", 372 | "    spark.read.format(\"bigquery\")\n", 373 | "    .option(\"table\", \"bigquery-public-data.new_york_taxi_trips.taxi_zone_geom\")\n", 374 | "    .load()\n", 375 | ")\n", 376 | "\n", 377 | "# Convert Spark DataFrame into Pandas DataFrame to integrate with the GeoPandas library.\n", 378 | "geo_pd = geo_df.toPandas()\n", 379 | "\n", 380 | "# Compute the centroid longitude and latitude of each taxi zone from its WKT geometry.\n", 381 | "geo_pd[\"long\"] = gpd.GeoSeries.from_wkt(geo_pd[\"zone_geom\"]).centroid.x\n", 382 | "geo_pd[\"lat\"] = gpd.GeoSeries.from_wkt(geo_pd[\"zone_geom\"]).centroid.y\n", 383 | "\n", 384 | "# Drop unnecessary columns.\n", 385 | "geo_pd = geo_pd[[\"zone_id\", \"long\", \"lat\"]]\n", 386 | "\n", 387 | "# Convert back to a Spark DataFrame.\n", 388 | "geo_spark_df = spark.createDataFrame(geo_pd)\n", 389 | "\n", 390 | "# Join taxi_df with the geographic coordinates for each start_zone_id and end_zone_id.\n", 391 | "taxi_zone_df = (\n", 392 | "    taxi_df.join(geo_spark_df, taxi_df.start_zone_id == geo_spark_df.zone_id)\n", 393 | "    .withColumnRenamed(\"long\", \"start_long\")\n", 394 | "    .withColumnRenamed(\"lat\", \"start_lat\")\n", 395 | "    .drop(\"zone_id\")\n", 396 | "    .join(geo_spark_df, taxi_df.end_zone_id == geo_spark_df.zone_id)\n", 397 | "    .withColumnRenamed(\"long\", \"end_long\")\n", 398 | "    .withColumnRenamed(\"lat\", \"end_lat\")\n", 399 | "    .drop(\"zone_id\")\n", 400 | ")\n", 401 | "\n", 402 | "# Convert Spark DataFrame into a Pandas DataFrame.\n", 403 | "taxi_pd = taxi_df.toPandas()\n", 404 | "\n", 405 | "# Convert columns of \"object\" type to the float type.\n", 406 | "taxi_pd[\"trip_duration\"] = taxi_pd[\"trip_duration\"].astype(float)\n", 407 | "\n", 408 | "# Box plots and histograms for the specified columns.\n", 409 | "_, ax = plt.subplots(1, 2, figsize=(10, 4))\n", 410 | "taxi_pd[\"trip_duration\"].plot(kind=\"box\", ax=ax[0])\n", 411 | "taxi_pd[\"trip_duration\"].plot(kind=\"hist\", ax=ax[1])\n", 412 | "plt.title(\"trip_duration\")\n", 413 | "plt.figure()\n", 414 | "plt.show()" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": { 420 | "id": "6c42f747c421" 421 | }, 422 | "source": [ 423 | "`trip_duration` also has some extreme values. Remove these." 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": { 430 | "id": "c4dda7df0ec8", 431 | "tags": [] 432 | }, 433 | "outputs": [], 434 | "source": [ 435 | "# Keep trips between different taxi zones where trip_duration is less than 28800 seconds (8 hours).\n", 436 | "taxi_df = taxi_zone_df.where(\n", 437 | "    (col(\"trip_duration\") < 28800) & (col(\"start_zone_id\") != col(\"end_zone_id\"))\n", 438 | ")" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": { 444 | "id": "5f6dfc19b47e" 445 | }, 446 | "source": [ 447 | "Create a scatterplot to see the relationship between `trip_distance` and `trip_duration`."
448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": { 454 | "id": "edfff6a2abbc", 455 | "tags": [] 456 | }, 457 | "outputs": [], 458 | "source": [ 459 | "# Convert Spark DataFrame into a Pandas DataFrame.\n", 460 | "taxi_pd = taxi_df.toPandas()\n", 461 | "\n", 462 | "# Convert \"trip_distance\" column of \"object\" type to the float type.\n", 463 | "taxi_pd[\"trip_distance\"] = taxi_pd[\"trip_distance\"].astype(float)\n", 464 | "\n", 465 | "# Filter the DataFrame to include data within reasonable ranges.\n", 466 | "taxi_pd_filtered = taxi_pd.query(\n", 467 | "    \"trip_distance > 0 and trip_distance < 20 \\\n", 468 | "    and trip_duration > 0 and trip_duration < 10000\"\n", 469 | ")\n", 470 | "\n", 471 | "# Scatter plot to visualize the relationship between trip_distance and trip_duration.\n", 472 | "sns.relplot(\n", 473 | "    data=taxi_pd_filtered,\n", 474 | "    x=\"trip_distance\",\n", 475 | "    y=\"trip_duration\",\n", 476 | "    kind=\"scatter\",\n", 477 | ")" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": { 483 | "id": "f9d42bffcebb" 484 | }, 485 | "source": [ 486 | "Takeaways here include:\n", 487 | "  * the data is right-skewed\n", 488 | "  * there is a positive correlation between `trip_distance` and `trip_duration`\n", 489 | "  * most trips are completed in under 3600 seconds (one hour)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": { 495 | "id": "0e0ee0c6469c" 496 | }, 497 | "source": [ 498 | "### Feature Selection\n", 499 | "\n", 500 | "Use `VectorAssembler()` to consolidate feature columns into a vector column." 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": { 507 | "id": "c085bae96dec", 508 | "tags": [] 509 | }, 510 | "outputs": [], 511 | "source": [ 512 | "# List of selected features for training the model.\n", 513 | "feature_cols = [\n", 514 | "    \"passenger_count\",\n", 515 | "    \"trip_distance\",\n", 516 | "    \"start_time\",\n", 517 | "    \"end_time\",\n", 518 | "    \"start_long\",\n", 519 | "    \"start_lat\",\n", 520 | "    \"end_long\",\n", 521 | "    \"end_lat\",\n", 522 | "    \"total_amount\",\n", 523 | "    \"fare_amount\",\n", 524 | "    \"extra\",\n", 525 | "    \"mta_tax\",\n", 526 | "    \"tip_amount\",\n", 527 | "    \"tolls_amount\",\n", 528 | "    \"imp_surcharge\",\n", 529 | "    \"airport_fee\",\n", 530 | "]\n", 531 | "\n", 532 | "# Create a VectorAssembler with specified input and output columns.\n", 533 | "assembler = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", 534 | "\n", 535 | "# Combine the feature columns into a single vector column using the VectorAssembler.\n", 536 | "taxi_transformed_data = assembler.transform(taxi_df)\n", 537 | "\n", 538 | "# Randomly split the transformed data into training and test sets.\n", 539 | "(taxi_training_data, taxi_test_data) = taxi_transformed_data.randomSplit([0.95, 0.05])" 540 | ] 541 | }, 542 | { 543 | "cell_type": "markdown", 544 | "metadata": { 545 | "id": "e68b4258d4d5" 546 | }, 547 | "source": [ 548 | "### Training the Model\n", 549 | "\n", 550 | "Use the `GBTRegressor` model to train on the data."
551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": { 557 | "id": "cd2e2c8c4396", 558 | "tags": [] 559 | }, 560 | "outputs": [], 561 | "source": [ 562 | "# Define GBTRegressor model with specified input, output, and prediction columns.\n", 563 | "gbt = GBTRegressor(\n", 564 | "    featuresCol=\"features\",\n", 565 | "    labelCol=\"trip_duration\",\n", 566 | "    predictionCol=\"pred_trip_duration\",\n", 567 | ")\n", 568 | "\n", 569 | "# Define an evaluator for calculating the R2 score.\n", 570 | "evaluator_r2 = RegressionEvaluator(\n", 571 | "    labelCol=gbt.getLabelCol(), predictionCol=gbt.getPredictionCol(), metricName=\"r2\"\n", 572 | ")\n", 573 | "\n", 574 | "# Define an evaluator for calculating the RMSE.\n", 575 | "evaluator_rmse = RegressionEvaluator(\n", 576 | "    labelCol=gbt.getLabelCol(), predictionCol=gbt.getPredictionCol(), metricName=\"rmse\"\n", 577 | ")" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "metadata": { 584 | "id": "3080a7ddabf9", 585 | "tags": [] 586 | }, 587 | "outputs": [], 588 | "source": [ 589 | "# Train a Gradient Boosted Trees (GBT) model on the Taxi dataset. This process may take several minutes.\n", 590 | "taxi_gbt_model = gbt.fit(taxi_training_data)\n", 591 | "\n", 592 | "# Get predictions for the Taxi dataset using the trained GBT model.\n", 593 | "taxi_gbt_predictions = taxi_gbt_model.transform(taxi_test_data)" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": null, 599 | "metadata": { 600 | "id": "a4e4f60a8a3e", 601 | "tags": [] 602 | }, 603 | "outputs": [], 604 | "source": [ 605 | "# Evaluate the R2 score for the Taxi dataset predictions.\n", 606 | "taxi_gbt_accuracy_r2 = evaluator_r2.evaluate(taxi_gbt_predictions)\n", 607 | "print(f\"Taxi Test GBT R2 = {taxi_gbt_accuracy_r2}\")\n", 608 | "\n", 609 | "# Evaluate the Root Mean Squared Error (RMSE) for the Taxi dataset predictions.\n", 610 | "taxi_gbt_accuracy_rmse = evaluator_rmse.evaluate(taxi_gbt_predictions)\n", 611 | "print(f\"Taxi Test GBT RMSE = {taxi_gbt_accuracy_rmse}\")" 612 | ] 613 | }, 614 | { 615 | "cell_type": "markdown", 616 | "metadata": { 617 | "id": "7d07452cd103" 618 | }, 619 | "source": [ 620 | "### View the results\n", 621 | "\n", 622 | "Expect an R2 score of approximately 83-87% and a Root Mean Squared Error (RMSE) of 200-300. This sample does not include [cross-validation](https://en.wikipedia.org/wiki/Cross-validation_%28statistics%29), which could further improve model performance." 623 | ] 624 | }, 625 | { 626 | "cell_type": "markdown", 627 | "metadata": { 628 | "id": "d1b46847f395" 629 | }, 630 | "source": [ 631 | "### Save the model to Cloud Storage for future use\n", 632 | "\n", 633 | "To preserve the trained model and keep it accessible for future use, save it to a Cloud Storage path."
634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": null, 639 | "metadata": { 640 | "id": "9cc57f4362c4" 641 | }, 642 | "outputs": [], 643 | "source": [ 644 | "# Save the trained model to a Cloud Storage path\n", 645 | "taxi_gbt_model.write().overwrite().save(f\"gs://{BUCKET_URI}/\")" 646 | ] 647 | }, 648 | { 649 | "cell_type": "markdown", 650 | "metadata": {}, 651 | "source": [ 652 | "### Delete the Dataproc session and the session template\n", 653 | "\n", 654 | "To delete the running Dataproc Serverless session, run the following commands.\n", 655 | "If you've completed this tutorial as part of the [Analytics Lakehouse](https://console.cloud.google.com/products/solutions/details/analytics-lakehouse) solution, complete this step before deleting the solution from your project." 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "metadata": {}, 662 | "outputs": [], 663 | "source": [ 664 | "# Delete the session template\n", 665 | "!gcloud beta dataproc session-templates delete sparkml-template --location='{LOCATION}' --quiet\n", 666 | "\n", 667 | "# Terminate the Dataproc Serverless session, if one exists\n", 668 | "if SESSION:\n", 669 | "    !gcloud beta dataproc sessions terminate '{SESSION}' --location='{LOCATION}' --quiet" 670 | ] 671 | } 672 | ], 673 | "metadata": { 674 | "colab": { 675 | "name": "spark_ml.ipynb", 676 | "toc_visible": true 677 | }, 678 | "environment": { 679 | "kernel": "9c39b79e5d2e7072beb4bd59-runtime-00002d16685d", 680 | "name": "workbench-notebooks.m113", 681 | "type": "gcloud", 682 | "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m113" 683 | }, 684 | "kernelspec": { 685 | "display_name": "test on Serverless Spark (Remote)", 686 | "language": "python", 687 | "name": "9c39b79e5d2e7072beb4bd59-runtime-00002d16685d" 688 | }, 689 | "language_info": { 690 | "codemirror_mode": { 691 | "name": "ipython", 692 | "version": 3 693 | }, 694 | "file_extension": ".py", 695 | "mimetype": "text/x-python", 696 | "name": "python", 697 | "nbconvert_exporter": "python", 698 | "pygments_lexer": "ipython3", 699 | "version": "3.11.0" 700 | } 701 | }, 702 | "nbformat": 4, 703 | "nbformat_minor": 4 704 | } 705 | --------------------------------------------------------------------------------