├── test ├── setup │ ├── .gitignore │ ├── outputs.tf │ ├── variables.tf │ ├── versions.tf │ ├── iam.tf │ └── main.tf ├── .gitignore └── integration │ ├── discover_test.go │ ├── go.mod │ ├── analytics_lakehouse │ └── analytics_lakehouse_test.go │ └── go.sum ├── assets └── lakehouse-architecture.png ├── .github ├── renovate.json ├── release-please.yml ├── trusted-contribution.yml └── workflows │ ├── stale.yml │ ├── lint.yaml │ └── periodic-reporter.yaml ├── SECURITY.md ├── roles.txt ├── CODEOWNERS ├── examples └── analytics_lakehouse │ ├── variables.tf │ ├── main.tf │ ├── README.md │ ├── outputs.tf │ └── versions.tf ├── cloudbuild_mim.yaml ├── .gitIgnore ├── versions.tf ├── src ├── sql │ ├── sp_bigqueryml_model.sql │ ├── sp_sample_queries.sql │ ├── view_ecommerce.sql │ └── sp_lookerstudio_report.sql ├── python │ ├── bigquery.py │ └── bigtable.py ├── shell │ └── post_startup.sh ├── yaml │ ├── project-setup.yaml │ └── copy-data.yaml └── ipynb │ ├── exploratory-analysis.ipynb │ ├── spark_langchain.ipynb │ └── spark_ml.ipynb ├── metadata.display.yaml ├── variables.tf ├── outputs.tf ├── dataproc.tf ├── workbench.tf ├── Makefile ├── metadata.yaml ├── tutorial.md ├── README.md ├── deploy_solution.sh ├── deploy_via_trigger.sh ├── CONTRIBUTING.md ├── workflows.tf ├── bigquery.tf ├── main.tf ├── dataplex.tf ├── LICENSE └── CHANGELOG.md /test/setup/.gitignore: -------------------------------------------------------------------------------- 1 | terraform.tfvars 2 | source.sh 3 | -------------------------------------------------------------------------------- /assets/lakehouse-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/HEAD/assets/lakehouse-architecture.png -------------------------------------------------------------------------------- /test/.gitignore: -------------------------------------------------------------------------------- 1 | source.sh 2 | 3 | # Local .terraform directories 4 | **/.terraform/* 5 | **/.terraform.lock.* 6 | 7 | # .tfstate files 8 | *.tfstate 9 | *.tfstate.* 10 | -------------------------------------------------------------------------------- /.github/renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": ["github>GoogleCloudPlatform/cloud-foundation-toolkit//infra/terraform/test-org/github/resources/renovate"] 4 | } 5 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | To report a security issue, please use http://g.co/vulnz. We use 2 | http://g.co/vulnz for our intake, and do coordination and disclosure here on 3 | GitHub (including using GitHub Security Advisory). The Google Security Team will 4 | respond within 5 working days of your report on g.co/vulnz. 
5 | -------------------------------------------------------------------------------- /roles.txt: -------------------------------------------------------------------------------- 1 | roles/bigquery.admin 2 | roles/compute.admin 3 | roles/config.agent 4 | roles/dataplex.admin 5 | roles/dataproc.admin 6 | roles/iam.serviceAccountAdmin 7 | roles/iam.serviceAccountUser 8 | roles/logging.configWriter 9 | roles/notebooks.admin 10 | roles/resourcemanager.projectIamAdmin 11 | roles/serviceusage.serviceUsageAdmin 12 | roles/storage.admin 13 | roles/workflows.admin 14 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # NOTE: This file is automatically generated from values at: 2 | # https://github.com/GoogleCloudPlatform/cloud-foundation-toolkit/blob/main/infra/terraform/test-org/org/locals.tf 3 | 4 | * @GoogleCloudPlatform/blueprint-solutions @bradmiro @davenportjw @GoogleCloudPlatform/jump-start-solutions-admins 5 | 6 | # NOTE: GitHub CODEOWNERS locations: 7 | # https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners#codeowners-and-branch-protection 8 | 9 | CODEOWNERS @GoogleCloudPlatform/blueprint-solutions 10 | .github/CODEOWNERS @GoogleCloudPlatform/blueprint-solutions 11 | docs/CODEOWNERS @GoogleCloudPlatform/blueprint-solutions 12 | 13 | -------------------------------------------------------------------------------- /.github/release-please.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | releaseType: terraform-module 16 | handleGHRelease: true 17 | bumpMinorPreMajor: true 18 | -------------------------------------------------------------------------------- /examples/analytics_lakehouse/variables.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2021 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | variable "project_id" { 18 | description = "The ID of the project in which to provision resources." 
19 | type = string 20 | } 21 | -------------------------------------------------------------------------------- /test/integration/discover_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package test 16 | 17 | import ( 18 | "testing" 19 | 20 | "github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test/pkg/tft" 21 | ) 22 | 23 | func TestAll(t *testing.T) { 24 | tft.AutoDiscoverAndTest(t) 25 | } 26 | -------------------------------------------------------------------------------- /examples/analytics_lakehouse/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2021 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | module "analytics_lakehouse" { 18 | source = "GoogleCloudPlatform/analytics-lakehouse/google" 19 | version = "~> 0.4" 20 | 21 | project_id = var.project_id 22 | region = "us-central1" 23 | force_destroy = true 24 | 25 | } 26 | -------------------------------------------------------------------------------- /test/setup/outputs.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2019 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | output "project_id" { 18 | value = module.project.project_id 19 | } 20 | 21 | output "sa_key" { 22 | value = google_service_account_key.int_test.private_key 23 | sensitive = true 24 | } 25 | 26 | output "kms_keys" { 27 | value = module.kms_keyring.keys 28 | } 29 | -------------------------------------------------------------------------------- /cloudbuild_mim.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | steps: 15 | - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk' 16 | entrypoint: 'bash' 17 | args: ['./deploy_via_trigger.sh', '-p$PROJECT_ID'] 18 | serviceAccount: 'projects/$PROJECT_ID/serviceAccounts/cloudbuild-trigger-default@$PROJECT_ID.iam.gserviceaccount.com' 19 | options: 20 | logging: CLOUD_LOGGING_ONLY 21 | -------------------------------------------------------------------------------- /test/setup/variables.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2019 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | variable "org_id" { 17 | description = "The numeric organization id" 18 | } 19 | 20 | variable "folder_id" { 21 | description = "The folder to deploy in" 22 | } 23 | 24 | variable "billing_account" { 25 | description = "The billing account id associated with the project, e.g. XXXXXX-YYYYYY-ZZZZZZ" 26 | } 27 | -------------------------------------------------------------------------------- /test/setup/versions.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2019 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | terraform { 18 | required_version = ">= 1.5" 19 | required_providers { 20 | google = { 21 | source = "hashicorp/google" 22 | version = ">= 3.25.0" 23 | } 24 | google-beta = { 25 | source = "hashicorp/google-beta" 26 | version = ">= 3.25.0" 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /.github/trusted-contribution.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # NOTE: This file is automatically generated from: 16 | # https://github.com/GoogleCloudPlatform/cloud-foundation-toolkit/blob/main/infra/terraform/test-org/github 17 | 18 | annotations: 19 | - type: comment 20 | text: "/gcbrun" 21 | trustedContributors: 22 | - release-please[bot] 23 | - renovate[bot] 24 | - renovate-bot 25 | - forking-renovate[bot] 26 | - dependabot[bot] 27 | -------------------------------------------------------------------------------- /examples/analytics_lakehouse/README.md: -------------------------------------------------------------------------------- 1 | # Analytics Lakehouse Example 2 | 3 | This example illustrates how to use the `analytics_lakehouse` module. 4 | 5 | 6 | ## Inputs 7 | 8 | | Name | Description | Type | Default | Required | 9 | |------|-------------|------|---------|:--------:| 10 | | project\_id | The ID of the project in which to provision resources. 
| `string` | n/a | yes | 11 | 12 | ## Outputs 13 | 14 | | Name | Description | 15 | |------|-------------| 16 | | bigquery\_editor\_url | The URL to launch the BigQuery editor | 17 | | lakehouse\_colab\_url | The URL to launch the Colab instance | 18 | | lookerstudio\_report\_url | The URL to create a new Looker Studio report | 19 | 20 | 21 | 22 | To provision this example, run the following from within this directory: 23 | - `terraform init` to get the plugins 24 | - `terraform plan` to see the infrastructure plan 25 | - `terraform apply` to apply the infrastructure build 26 | - `terraform destroy` to destroy the built infrastructure 27 | -------------------------------------------------------------------------------- /.gitIgnore: -------------------------------------------------------------------------------- 1 | #vs code 2 | *code-workspace* 3 | 4 | # OSX leaves these everywhere on SMB shares 5 | ._* 6 | 7 | # OSX trash 8 | .DS_Store 9 | 10 | # BIN 11 | 12 | .venv/bin/* 13 | .venv/lib/* 14 | 15 | # Python 16 | *.pyc 17 | *.txt 18 | *.exe 19 | *.cfg 20 | *.sh 21 | 22 | 23 | 24 | # Emacs save files 25 | *~ 26 | \#*\# 27 | .\#* 28 | 29 | # Vim-related files 30 | [._]*.s[a-w][a-z] 31 | [._]s[a-w][a-z] 32 | *.un~ 33 | Session.vim 34 | .netrwhist 35 | 36 | ### https://raw.github.com/github/gitignore/90f149de451a5433aebd94d02d11b0e28843a1af/Terraform.gitignore 37 | 38 | # Local .terraform directories 39 | **/.terraform/* 40 | **/.terraform.lock.* 41 | 42 | # .tfstate files 43 | *.tfstate 44 | *.tfstate.* 45 | 46 | # Crash log files 47 | crash.log 48 | 49 | # Kitchen files 50 | **/inspec.lock 51 | **/.kitchen 52 | **/kitchen.local.yml 53 | **/Gemfile.lock 54 | 55 | # Ignore any .tfvars files that are generated automatically for each Terraform run. Most 56 | # .tfvars files are managed as part of configuration and so should be included in 57 | # version control. 58 | **/*.tfvars 59 | 60 | credentials.json 61 | 62 | # tf lock file 63 | .terraform.lock.hcl 64 | 65 | **/*.zip 66 | 67 | .vscode/ 68 | -------------------------------------------------------------------------------- /examples/analytics_lakehouse/outputs.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2021 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | output "lookerstudio_report_url" { 18 | value = module.analytics_lakehouse.lookerstudio_report_url 19 | description = "The URL to create a new Looker Studio report" 20 | } 21 | 22 | output "bigquery_editor_url" { 23 | value = module.analytics_lakehouse.bigquery_editor_url 24 | description = "The URL to launch the BigQuery editor" 25 | } 26 | 27 | output "lakehouse_colab_url" { 28 | value = module.analytics_lakehouse.lakehouse_colab_url 29 | description = "The URL to launch the Colab instance" 30 | } 31 | -------------------------------------------------------------------------------- /test/setup/iam.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2019 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | locals { 18 | int_required_roles = [ 19 | "roles/owner", 20 | "roles/bigquery.dataViewer" 21 | ] 22 | } 23 | 24 | resource "google_service_account" "int_test" { 25 | project = module.project.project_id 26 | account_id = "ci-account" 27 | display_name = "ci-account" 28 | } 29 | 30 | resource "google_project_iam_member" "int_test" { 31 | count = length(local.int_required_roles) 32 | 33 | project = module.project.project_id 34 | role = local.int_required_roles[count.index] 35 | member = "serviceAccount:${google_service_account.int_test.email}" 36 | } 37 | 38 | resource "google_service_account_key" "int_test" { 39 | service_account_id = google_service_account.int_test.id 40 | } 41 | -------------------------------------------------------------------------------- /examples/analytics_lakehouse/versions.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2021 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | terraform { 18 | required_version = ">= 1.5" 19 | required_providers { 20 | google = { 21 | source = "hashicorp/google" 22 | version = ">= 5.10.0, < 7.0.0" 23 | } 24 | google-beta = { 25 | source = "hashicorp/google-beta" 26 | version = ">= 5.10.0, < 7.0.0" 27 | } 28 | random = { 29 | source = "hashicorp/random" 30 | version = ">= 2" 31 | } 32 | archive = { 33 | source = "hashicorp/archive" 34 | version = ">= 2" 35 | } 36 | time = { 37 | source = "hashicorp/time" 38 | version = ">= 0.9.1" 39 | } 40 | http = { 41 | source = "hashicorp/http" 42 | version = ">= 3.2.1" 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /versions.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | terraform { 18 | required_version = ">= 1.5" 19 | required_providers { 20 | google = { 21 | source = "hashicorp/google" 22 | version = ">= 6.11.0, < 7.0.0" 23 | } 24 | google-beta = { 25 | source = "hashicorp/google-beta" 26 | version = ">= 5.10.0, < 7.0.0" 27 | } 28 | random = { 29 | source = "hashicorp/random" 30 | version = ">= 2" 31 | } 32 | archive = { 33 | source = "hashicorp/archive" 34 | version = ">= 2" 35 | } 36 | time = { 37 | source = "hashicorp/time" 38 | version = ">= 0.9.1" 39 | } 40 | http = { 41 | source = "hashicorp/http" 42 | version = ">= 3.2.1" 43 | } 44 | } 45 | 46 | provider_meta "google" { 47 | module_name = "blueprints/terraform/terraform-google-analytics-lakehouse/v0.4.0" 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/sql/sp_bigqueryml_model.sql: -------------------------------------------------------------------------------- 1 | -- Copyright 2023 Google LLC 2 | -- 3 | -- Licensed under the Apache License, Version 2.0 (the "License"); 4 | -- you may not use this file except in compliance with the License. 5 | -- You may obtain a copy of the License at 6 | -- 7 | -- http://www.apache.org/licenses/LICENSE-2.0 8 | -- 9 | -- Unless required by applicable law or agreed to in writing, software 10 | -- distributed under the License is distributed on an "AS IS" BASIS, 11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | -- See the License for the specific language governing permissions and 13 | -- limitations under the License. 
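-- A hedged companion to the ML.PREDICT snippet below: once the model has been
-- created, its fit can be inspected with ML.EVALUATE (a minimal sketch,
-- assuming the same dataset and model names as in this file):
--
-- select * from ML.EVALUATE(MODEL `${project_id}.ds_edw.model_taxi_estimate`);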
14 |
15 | /* Run a query to see the prediction results of the model
16 | --
17 | select * from ML.PREDICT(MODEL ds_edw.model_taxi_estimate,
18 | TABLE ds_edw.taxi_trips)
19 | limit 1000; */
20 |
21 | --Model Example
22 | CREATE OR REPLACE MODEL
23 | `${project_id}.ds_edw.model_taxi_estimate` OPTIONS ( MODEL_TYPE='LINEAR_REG',
24 | LS_INIT_LEARN_RATE=0.15,
25 | L1_REG=1,
26 | MAX_ITERATIONS=5 ) AS
27 | SELECT
28 | pickup_datetime,
29 | dropoff_datetime,
30 | IFNULL(passenger_count,0) passenger_count,
31 | IFNULL(trip_distance,0) trip_distance,
32 | IFNULL(rate_code,'') rate_code,
33 | IFNULL(payment_type,'') payment_type,
34 | IFNULL(fare_amount,0) label,
35 | IFNULL(pickup_location_id,'') pickup_location_id,
36 | IFNULL(dropoff_location_id,'') dropoff_location_id
37 | FROM
38 | `${project_id}.ds_edw.taxi_trips`
39 | WHERE
40 | fare_amount > 0;
41 |
--------------------------------------------------------------------------------
/src/sql/sp_sample_queries.sql:
--------------------------------------------------------------------------------
1 | -- Copyright 2023 Google LLC
2 | --
3 | -- Licensed under the Apache License, Version 2.0 (the "License");
4 | -- you may not use this file except in compliance with the License.
5 | -- You may obtain a copy of the License at
6 | --
7 | -- http://www.apache.org/licenses/LICENSE-2.0
8 | --
9 | -- Unless required by applicable law or agreed to in writing, software
10 | -- distributed under the License is distributed on an "AS IS" BASIS,
11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | -- See the License for the specific language governing permissions and
13 | -- limitations under the License.
14 |
15 | /*
16 | Use Cases:
17 | - BigQuery supports full SQL syntax and many analytic functions that make complex queries of lots of data easy
18 |
19 | Description:
20 | - Show joins, date functions, rank, partition, pivot
21 |
22 | Reference:
23 | - Rank/Partition: https://cloud.google.com/bigquery/docs/reference/standard-sql/analytic-function-concepts
24 | - Pivot: https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#pivot_operator
25 |
26 | Clean up / Reset script:
27 | n/a
28 | */
29 |
30 | --Rank, Pivot, Json
31 |
32 | -- Query: Get number of orders by category, name, id
33 | SELECT
34 | oi.product_id AS product_id,
35 | p.name AS product_name,
36 | p.category AS product_category,
37 | COUNT(*) AS num_of_orders
38 | FROM
39 | `gcp_lakehouse_ds.gcp_tbl_products` AS p
40 | JOIN
41 | `gcp_lakehouse_ds.gcp_tbl_order_items` AS oi
42 | ON
43 | p.id = oi.product_id
44 | GROUP BY
45 | 1,
46 | 2,
47 | 3
48 | ORDER BY
49 | num_of_orders DESC
50 |
--------------------------------------------------------------------------------
/.github/workflows/stale.yml:
--------------------------------------------------------------------------------
1 | # Copyright 2022-2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 15 | # NOTE: This file is automatically generated from: 16 | # https://github.com/GoogleCloudPlatform/cloud-foundation-toolkit/blob/main/infra/terraform/test-org/github 17 | 18 | name: "Close stale issues" 19 | on: 20 | schedule: 21 | - cron: "0 23 * * *" 22 | 23 | permissions: 24 | contents: read 25 | issues: write 26 | pull-requests: write 27 | actions: write 28 | 29 | jobs: 30 | stale: 31 | if: github.repository_owner == 'GoogleCloudPlatform' || github.repository_owner == 'terraform-google-modules' 32 | runs-on: ubuntu-latest 33 | steps: 34 | - uses: actions/stale@v10 35 | with: 36 | repo-token: ${{ secrets.GITHUB_TOKEN }} 37 | stale-issue-message: 'This issue is stale because it has been open 60 days with no activity. Remove stale label or comment or this will be closed in 7 days' 38 | stale-pr-message: 'This PR is stale because it has been open 60 days with no activity. Remove stale label or comment or this will be closed in 7 days' 39 | exempt-issue-labels: 'triaged' 40 | exempt-pr-labels: 'dependencies,autorelease: pending' 41 | operations-per-run: 100 42 | -------------------------------------------------------------------------------- /metadata.display.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: blueprints.cloud.google.com/v1alpha1 16 | kind: BlueprintMetadata 17 | metadata: 18 | name: terraform-google-analytics-lakehouse-display 19 | spec: 20 | info: 21 | title: terraform-google-lakehouse 22 | source: 23 | repo: https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse.git 24 | sourceType: git 25 | ui: 26 | input: 27 | variables: 28 | deletion_protection: 29 | name: deletion_protection 30 | title: Deletion Protection 31 | enable_apis: 32 | name: enable_apis 33 | title: Enable Apis 34 | force_destroy: 35 | name: force_destroy 36 | title: Force Destroy 37 | labels: 38 | name: labels 39 | title: Labels 40 | project_id: 41 | name: project_id 42 | title: Project Id 43 | public_data_bucket: 44 | name: public_data_bucket 45 | title: Public Data Bucket 46 | region: 47 | name: region 48 | title: Region 49 | use_case_short: 50 | name: use_case_short 51 | title: Use Case Short 52 | -------------------------------------------------------------------------------- /test/setup/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2019 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | module "project" {
18 | source = "terraform-google-modules/project-factory/google"
19 | version = "~> 18.0"
20 |
21 | name = "ci-bigquery"
22 | random_project_id = "true"
23 | random_project_id_length = 10
24 | org_id = var.org_id
25 | folder_id = var.folder_id
26 | billing_account = var.billing_account
27 | default_service_account = "keep"
28 |
29 | activate_apis = [
30 | "cloudkms.googleapis.com",
31 | "cloudresourcemanager.googleapis.com",
32 | "bigquery.googleapis.com",
33 | "bigquerystorage.googleapis.com",
34 | "bigqueryconnection.googleapis.com",
35 | "serviceusage.googleapis.com",
36 | "iam.googleapis.com",
37 | ]
38 | }
39 |
40 | module "kms_keyring" {
41 | source = "terraform-google-modules/kms/google"
42 | version = "~> 4.0"
43 |
44 | project_id = module.project.project_id
45 | location = "us"
46 | keyring = "ci-bigquery-keyring"
47 | keys = ["foo"]
48 | prevent_destroy = "false"
49 | depends_on = [
50 | module.project
51 | ]
52 | }
53 |
54 | data "google_bigquery_default_service_account" "initialize_encryption_account" {
55 | project = module.project.project_id
56 | }
57 |
--------------------------------------------------------------------------------
/variables.tf:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2023 Google LLC
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | # --------------------------------------------------
18 | # VARIABLES
19 | # Set these before applying the configuration
20 | # --------------------------------------------------
21 |
22 | variable "project_id" {
23 | type = string
24 | description = "Google Cloud Project ID"
25 | }
26 |
27 | variable "region" {
28 | type = string
29 | description = "Google Cloud Region"
30 | default = "us-central1"
31 | }
32 |
33 | variable "labels" {
34 | type = map(string)
35 | description = "A map of labels to apply to contained resources."
36 | default = { "analytics-lakehouse" = true }
37 | }
38 |
39 | variable "enable_apis" {
40 | type = string
41 | description = "Whether or not to enable the underlying APIs in this solution."
42 | default = true
43 | }
44 |
45 | variable "force_destroy" {
46 | type = string
47 | description = "Whether or not to allow deletion of GCS resources and their contents when the solution is modified or destroyed."
48 | default = false
49 | }
50 |
51 | variable "use_case_short" {
52 | type = string
53 | description = "Short name for use case"
54 | default = "lakehouse"
55 | }
56 |
57 | variable "public_data_bucket" {
58 | type = string
59 | description = "Public bucket containing the sample data used by this solution"
60 | default = "data-analytics-demos"
61 | }
62 |
--------------------------------------------------------------------------------
/outputs.tf:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2021 Google LLC
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | output "workflow_return_project_setup" {
18 | description = "Output of the project setup workflow"
19 | value = data.http.call_workflows_project_setup.response_body
20 | }
21 |
22 | output "lookerstudio_report_url" {
23 | value = "https://lookerstudio.google.com/reporting/create?c.reportId=79675b4f-9ed8-4ee4-bb35-709b8fd5306a&ds.ds0.datasourceName=vw_ecommerce&ds.ds0.projectId=${var.project_id}&ds.ds0.type=TABLE&ds.ds0.datasetId=gcp_lakehouse_ds&ds.ds0.tableId=view_ecommerce"
24 | description = "The URL to create a new Looker Studio report that displays a sample dashboard for data analysis"
25 | }
26 |
27 | output "bigquery_editor_url" {
28 | value = "https://console.cloud.google.com/bigquery?project=${var.project_id}"
29 | description = "The URL to launch the BigQuery editor"
30 | }
31 |
32 | output "neos_tutorial_url" {
33 | value = "http://console.cloud.google.com/products/solutions/deployments?walkthrough_id=panels--sic--analytics-lakehouse_toc"
34 | description = "The URL to launch the in-console tutorial for the Analytics Lakehouse solution"
35 | }
36 |
37 | output "lakehouse_colab_url" {
38 | value = "https://colab.research.google.com/github/GoogleCloudPlatform/terraform-google-analytics-lakehouse/blob/main/src/ipynb/exploratory-analysis.ipynb"
39 | description = "The URL to launch the Colab notebook for exploratory analysis of the Analytics Lakehouse solution"
40 | }
41 |
--------------------------------------------------------------------------------
/src/python/bigquery.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # Copyright 2023 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
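# Invocation sketch (hypothetical procedure name): this script reads its
# parameters from BIGQUERY_PROC_PARAM.* environment variables, which is how
# BigQuery stored procedures for Apache Spark pass named arguments. Once
# registered as such a procedure, it could be run from BigQuery SQL like:
#   CALL `my-project.gcp_lakehouse_ds.create_iceberg_tables`(
#     'lakehouse_catalog', 'lakehouse_db', 'gcp_lakehouse_ds');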
15 |
16 | """BigQuery I/O with BigLake Iceberg PySpark example."""
17 | from pyspark.sql import SparkSession
18 | import json
19 | import os
20 |
21 | spark = SparkSession \
22 | .builder \
23 | .appName("spark-bigquery-demo") \
24 | .enableHiveSupport() \
25 | .getOrCreate()
26 |
27 |
28 | def load_arg(arg):
29 | return str(json.loads(os.environ[f"BIGQUERY_PROC_PARAM.{arg}"]))
30 |
31 |
32 | catalog = load_arg("lakehouse_catalog")
33 | database = load_arg("lakehouse_database")
34 | bq_dataset = load_arg("bq_dataset")
35 |
36 | # Delete the BigLake Catalog if it currently exists to ensure proper setup.
37 | spark.sql(f"DROP NAMESPACE IF EXISTS {catalog} CASCADE;")
38 |
39 | # Create BigLake Catalog and Database if they are not already created.
40 | spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {catalog};")
41 | spark.sql(f"CREATE DATABASE IF NOT EXISTS {catalog}.{database};")
42 | spark.sql(f"DROP TABLE IF EXISTS {catalog}.{database}.agg_events_iceberg;")
43 |
44 | # Load data from BigQuery.
45 | events = spark.read.format("bigquery") \
46 | .option("table", "gcp_primary_staging.thelook_ecommerce_events") \
47 | .load()
48 | events.createOrReplaceTempView("events")
49 |
50 | # Create Iceberg Table if not exists
51 | spark.sql(
52 | f"""CREATE TABLE IF NOT EXISTS {catalog}.{database}.agg_events_iceberg
53 | (user_id string, event_count bigint)
54 | USING iceberg
55 | TBLPROPERTIES(
56 | bq_table='{bq_dataset}.agg_events_iceberg');
57 | """
58 | )
59 |
60 | # Insert aggregated event counts per user into the Iceberg table
61 | spark.sql(
62 | f"""INSERT INTO {catalog}.{database}.agg_events_iceberg
63 | (user_id, event_count)
64 | select user_id, count(session_id)
65 | from events
66 | group by user_id;
67 | """
68 | )
69 |
--------------------------------------------------------------------------------
/src/sql/view_ecommerce.sql:
--------------------------------------------------------------------------------
1 | -- Copyright 2023 Google LLC
2 | --
3 | -- Licensed under the Apache License, Version 2.0 (the "License");
4 | -- you may not use this file except in compliance with the License.
5 | -- You may obtain a copy of the License at
6 | --
7 | -- http://www.apache.org/licenses/LICENSE-2.0
8 | --
9 | -- Unless required by applicable law or agreed to in writing, software
10 | -- distributed under the License is distributed on an "AS IS" BASIS,
11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | -- See the License for the specific language governing permissions and
13 | -- limitations under the License.
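-- Usage sketch (hypothetical ad-hoc query) against the view created below,
-- handy for spot-checking the joins before wiring up Looker Studio:
--
-- SELECT order_id, product_name, user_country
-- FROM gcp_lakehouse_ds.view_ecommerce
-- LIMIT 10;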
14 | CREATE OR REPLACE VIEW 15 | gcp_lakehouse_ds.view_ecommerce AS 16 | SELECT 17 | o.order_id, 18 | o.user_id order_user_id, 19 | o.status order_status, 20 | o.created_at order_created_at, 21 | o.returned_at order_returned_at, 22 | o.shipped_at order_shipped_at, 23 | o.delivered_at order_delivered_at, 24 | o.num_of_item order_number_of_items, 25 | i.id AS order_items_id, 26 | i.product_id AS order_items_product_id, 27 | i.status order_items_status, 28 | i.sale_price order_items_sale_price, 29 | p.id AS product_id, 30 | p.cost product_cost, 31 | p.category product_category, 32 | p.name product_name, 33 | p.brand product_brand, 34 | p.retail_price product_retail_price, 35 | p.department product_department, 36 | p.sku product_sku, 37 | p.distribution_center_id, 38 | d.name AS dist_center_name, 39 | d.latitude dist_center_lat, 40 | d.longitude dist_center_long, 41 | u.id AS user_id, 42 | u.first_name user_first_name, 43 | u.last_name user_last_name, 44 | u.age user_age, 45 | u.gender user_gender, 46 | u.state user_state, 47 | u.postal_code user_postal_code, 48 | u.city user_city, 49 | u.country user_country, 50 | u.latitude user_lat, 51 | u.longitude user_long, 52 | u.traffic_source user_traffic_source 53 | FROM 54 | gcp_primary_staging.thelook_ecommerce_orders o 55 | INNER JOIN 56 | gcp_primary_staging.thelook_ecommerce_order_items i 57 | ON 58 | o.order_id = i.order_id 59 | INNER JOIN 60 | `gcp_primary_staging.thelook_ecommerce_products` p 61 | ON 62 | i.product_id = p.id 63 | INNER JOIN 64 | `gcp_primary_staging.thelook_ecommerce_distribution_centers` d 65 | ON 66 | p.distribution_center_id = d.id 67 | INNER JOIN 68 | `gcp_primary_staging.thelook_ecommerce_users` u 69 | ON 70 | o.user_id = u.id 71 | ; 72 | -------------------------------------------------------------------------------- /src/shell/post_startup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2024 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
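# Context note: workbench.tf points the instance's `post-startup-script`
# metadata key at gs://<provisioning bucket>/post_startup.sh, so everything
# below runs once on the Workbench VM after first boot.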
15 | 16 | 17 | # Retrieve current project and location using gcloud 18 | PROJECT=$(gcloud config get-value project) 19 | ZONE=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/zone) 20 | LOCATION="$(echo "$ZONE" | awk -F/ '{split($4, a, "-"); print a[1]"-"a[2]}')" 21 | echo "Current instance location: $LOCATION" 22 | 23 | # Specify the file name 24 | YAML_FILE="temp.yaml" 25 | declare -a NOTEBOOKS=("spark_langchain.ipynb" "spark_ml.ipynb") 26 | 27 | # Define the content for the YAML file 28 | YAML_CONTENT=$(cat < /home/jupyter/"$YAML_FILE" 46 | 47 | # Use wget to download the file and check if the download was successful 48 | for NOTEBOOK in "${NOTEBOOKS[@]}" 49 | do 50 | # Specify the GitHub repository URL and the file path 51 | REPO_URL="https://raw.githubusercontent.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/main/src/ipynb/$NOTEBOOK" 52 | if wget "$REPO_URL" -O /home/jupyter/"$NOTEBOOK"; then 53 | echo "File downloaded successfully." 54 | else 55 | echo "Error downloading the file." 56 | fi 57 | done 58 | 59 | # Import Dataproc session template 60 | gcloud beta dataproc session-templates import sparkml-template \ 61 | --source=/home/jupyter/"$YAML_FILE" --project="$PROJECT" --location="$LOCATION" --quiet 62 | 63 | # Delete temporal YAML config file 64 | rm /home/jupyter/"$YAML_FILE" 65 | -------------------------------------------------------------------------------- /dataproc.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */
16 |
17 | # Set up networking
18 | resource "google_compute_network" "default_network" {
19 | project = module.project-services.project_id
20 | name = "vpc-${var.use_case_short}"
21 | description = "Default network"
22 | auto_create_subnetworks = false
23 | mtu = 1460
24 | }
25 |
26 | # add destroy sleep here
27 |
28 | resource "google_compute_subnetwork" "subnet" {
29 | project = module.project-services.project_id
30 | name = "dataproc-subnet"
31 | ip_cidr_range = "10.3.0.0/16"
32 | region = var.region
33 | network = google_compute_network.default_network.id
34 | private_ip_google_access = true
35 | }
36 |
37 | # Firewall rule for dataproc cluster
38 | resource "google_compute_firewall" "subnet_firewall_rule" {
39 | project = module.project-services.project_id
40 | name = "dataproc-firewall"
41 | network = google_compute_network.default_network.id
42 |
43 | allow {
44 | protocol = "icmp"
45 | }
46 |
47 | allow {
48 | protocol = "tcp"
49 | }
50 |
51 | allow {
52 | protocol = "udp"
53 | }
54 | source_ranges = ["10.3.0.0/16"]
55 |
56 | depends_on = [
57 | google_compute_subnetwork.subnet
58 | ]
59 | }
60 |
61 |
62 | # Set up the Dataproc service account
63 | # for Dataproc jobs in this solution to execute as
64 | resource "google_service_account" "dataproc_service_account" {
65 | project = module.project-services.project_id
66 | account_id = "dataproc-sa-${random_id.id.hex}"
67 | display_name = "Service Account for Dataproc Execution"
68 | }
69 |
70 | resource "google_project_iam_member" "dataproc_sa_roles" {
71 | for_each = toset([
72 | "roles/storage.objectAdmin",
73 | "roles/bigquery.connectionAdmin",
74 | "roles/biglake.admin",
75 | "roles/bigquery.dataOwner",
76 | "roles/bigquery.user",
77 | "roles/dataproc.worker",
78 | ])
79 |
80 | project = module.project-services.project_id
81 | role = each.key
82 | member = "serviceAccount:${google_service_account.dataproc_service_account.email}"
83 | }
84 |
--------------------------------------------------------------------------------
/workbench.tf:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2024 Google LLC
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | # Creates a service account specifically for the Workbench instance.
18 | resource "google_service_account" "workbench_service_account" {
19 | project = module.project-services.project_id
20 | account_id = "workbench-sa-${random_id.id.hex}"
21 | display_name = "Service Account for Workbench Instance"
22 | }
23 |
24 | # Grants necessary roles to the Workbench service account.
25 | resource "google_project_iam_member" "workbench_sa_roles" { 26 | for_each = toset([ 27 | "roles/iam.serviceAccountUser", 28 | "roles/storage.objectAdmin", 29 | "roles/compute.osAdminLogin", 30 | "roles/dataproc.admin", 31 | ]) 32 | 33 | project = module.project-services.project_id 34 | role = each.key 35 | member = "serviceAccount:${google_service_account.workbench_service_account.email}" 36 | } 37 | 38 | # Provisions a new Workbench instance. 39 | resource "google_workbench_instance" "workbench_instance" { 40 | name = "gcp-${var.use_case_short}-workbench-instance-${random_id.id.hex}" 41 | project = module.project-services.project_id 42 | location = "${var.region}-b" 43 | desired_state = "STOPPED" 44 | 45 | gce_setup { 46 | machine_type = "e2-standard-4" 47 | 48 | vm_image { 49 | project = "cloud-notebooks-managed" 50 | name = "workbench-instances-v20231108-py310" 51 | } 52 | 53 | boot_disk { 54 | disk_type = "PD_STANDARD" 55 | } 56 | 57 | data_disks { 58 | disk_type = "PD_STANDARD" 59 | } 60 | 61 | network_interfaces { 62 | network = google_compute_network.default_network.id 63 | subnet = google_compute_subnetwork.subnet.id 64 | nic_type = "GVNIC" 65 | } 66 | 67 | disable_public_ip = false 68 | 69 | service_accounts { 70 | email = google_service_account.workbench_service_account.email 71 | } 72 | 73 | metadata = { 74 | proxy-mode = "service_account" 75 | idle-timeout-seconds = "10800" 76 | report-event-health = "true" 77 | disable-mixer = "false" 78 | post-startup-script = "gs://${google_storage_bucket.provisioning_bucket.name}/post_startup.sh" 79 | report-dns-resolution = "true" 80 | } 81 | 82 | enable_ip_forwarding = true 83 | } 84 | 85 | depends_on = [ 86 | google_project_iam_member.workbench_sa_roles, 87 | google_compute_firewall.subnet_firewall_rule 88 | ] 89 | } 90 | -------------------------------------------------------------------------------- /src/yaml/project-setup.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This defines the Google Workflow for the Analytics lakehouse Soultion: https://console.cloud.google.com/products/solutions/details/analytics-lakehouse 16 | # This Workflow executes through Terraform. 
For Google Workflows executed via Terraform, variables are defined such that: 17 | # 18 | # - Terraform environment variables are denoted by $ 19 | # - Google Workflow variables are escaped via $$ 20 | # 21 | # To modify this Workflow to stand alone (no Terraform): 22 | # 23 | # - Replace vars in `main` -> `steps` -> `assign` with your own (or use https://cloud.google.com/workflows/docs/passing-runtime-arguments#gcloud) 24 | # - Change all $$ to $ 25 | 26 | main: 27 | params: [] 28 | steps: 29 | # If this workflow has been run before, do not run again 30 | - sub_check_if_run: 31 | steps: 32 | - assign_values: 33 | assign: 34 | - project_id: $${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")} 35 | - location: $${sys.get_env("GOOGLE_CLOUD_LOCATION")} 36 | - workflow_id: $${sys.get_env("GOOGLE_CLOUD_WORKFLOW_ID")} 37 | - get_executions: 38 | call: http.get 39 | args: 40 | url: $${"https://workflowexecutions.googleapis.com/v1/projects/"+project_id+"/locations/"+location+"/workflows/"+workflow_id+"/executions"} 41 | auth: 42 | type: OAuth2 43 | result: Operation 44 | - check_if_run: 45 | switch: 46 | - condition: $${len(Operation.body.executions) > 1} 47 | next: end 48 | - sub_create_taxonomy: 49 | call: create_taxonomy 50 | result: create_taxonomy_output 51 | 52 | # Subworkflow to Dataplex taxonomy 53 | create_taxonomy: 54 | steps: 55 | - assign_values: 56 | assign: 57 | - project_id: $${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")} 58 | - location: $${sys.get_env("GOOGLE_CLOUD_LOCATION")} 59 | - ufdataplex_job: 60 | call: http.post 61 | args: 62 | url: $${"https://dataplex.googleapis.com/v1/projects/"+project_id+"/locations/"+location+"/dataTaxonomies?alt=json&dataTaxonomyId=sample-taxonomy&validateOnly=False"} 63 | auth: 64 | type: OAuth2 65 | body: 66 | description: Sample Taxonomy Description 67 | displayName: Sample Taxonomy Display Name 68 | result: Operation 69 | - returnResult: 70 | return: $${Operation} 71 | -------------------------------------------------------------------------------- /test/integration/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/terraform-google-modules/terraform-google-analytics-lakehouse/test/integration 2 | 3 | go 1.23.0 4 | 5 | toolchain go1.23.7 6 | 7 | require ( 8 | github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test v0.17.6 9 | github.com/stretchr/testify v1.10.0 10 | ) 11 | 12 | require ( 13 | github.com/agext/levenshtein v1.2.3 // indirect 14 | github.com/alexflint/go-filemutex v1.3.0 // indirect 15 | github.com/apparentlymart/go-textseg/v15 v15.0.0 // indirect 16 | github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d // indirect 17 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 18 | github.com/go-errors/errors v1.5.0 // indirect 19 | github.com/go-openapi/jsonpointer v0.21.0 // indirect 20 | github.com/go-openapi/jsonreference v0.20.2 // indirect 21 | github.com/go-openapi/swag v0.23.0 // indirect 22 | github.com/google/gnostic-models v0.6.9 // indirect 23 | github.com/google/go-cmp v0.6.0 // indirect 24 | github.com/gruntwork-io/terratest v0.48.2 // indirect 25 | github.com/hashicorp/errwrap v1.1.0 // indirect 26 | github.com/hashicorp/go-cleanhttp v0.5.2 // indirect 27 | github.com/hashicorp/go-getter/v2 v2.2.3 // indirect 28 | github.com/hashicorp/go-multierror v1.1.1 // indirect 29 | github.com/hashicorp/go-safetemp v1.0.0 // indirect 30 | github.com/hashicorp/go-version v1.7.0 // indirect 31 | github.com/hashicorp/hcl 
v0.0.0-20170504190234-a4b07c25de5f // indirect 32 | github.com/hashicorp/hcl/v2 v2.22.0 // indirect 33 | github.com/hashicorp/terraform-config-inspect v0.0.0-20250203082807-efaa306e97b4 // indirect 34 | github.com/hashicorp/terraform-json v0.24.0 // indirect 35 | github.com/jinzhu/copier v0.4.0 // indirect 36 | github.com/josharian/intern v1.0.0 // indirect 37 | github.com/klauspost/compress v1.16.7 // indirect 38 | github.com/mailru/easyjson v0.7.7 // indirect 39 | github.com/mattn/go-shellwords v1.0.12 // indirect 40 | github.com/mattn/go-zglob v0.0.4 // indirect 41 | github.com/mitchellh/go-homedir v1.1.0 // indirect 42 | github.com/mitchellh/go-testing-interface v1.14.2-0.20210821155943-2d9075ca8770 // indirect 43 | github.com/mitchellh/go-wordwrap v1.0.1 // indirect 44 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect 45 | github.com/tidwall/gjson v1.18.0 // indirect 46 | github.com/tidwall/match v1.1.1 // indirect 47 | github.com/tidwall/pretty v1.2.1 // indirect 48 | github.com/tidwall/sjson v1.2.5 // indirect 49 | github.com/tmccombs/hcl2json v0.6.4 // indirect 50 | github.com/ulikunitz/xz v0.5.11 // indirect 51 | github.com/zclconf/go-cty v1.15.1 // indirect 52 | golang.org/x/crypto v0.35.0 // indirect 53 | golang.org/x/mod v0.23.0 // indirect 54 | golang.org/x/net v0.36.0 // indirect 55 | golang.org/x/sync v0.11.0 // indirect 56 | golang.org/x/sys v0.30.0 // indirect 57 | golang.org/x/text v0.22.0 // indirect 58 | golang.org/x/tools v0.26.0 // indirect 59 | google.golang.org/protobuf v1.35.1 // indirect 60 | gopkg.in/yaml.v3 v3.0.1 // indirect 61 | k8s.io/kube-openapi v0.0.0-20241212222426-2c72e554b1e7 // indirect 62 | sigs.k8s.io/kustomize/kyaml v0.19.0 // indirect 63 | sigs.k8s.io/yaml v1.4.0 // indirect 64 | ) 65 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # NOTE: This file is automatically generated from values at: 16 | # https://github.com/GoogleCloudPlatform/cloud-foundation-toolkit/blob/main/infra/terraform/test-org/org/locals.tf 17 | 18 | name: 'lint' 19 | 20 | on: 21 | workflow_dispatch: 22 | pull_request: 23 | types: [opened, edited, reopened, synchronize] 24 | branches: [main] 25 | 26 | permissions: 27 | contents: read 28 | 29 | concurrency: 30 | group: '${{ github.workflow }}-${{ github.head_ref || github.ref }}' 31 | cancel-in-progress: true 32 | 33 | jobs: 34 | lint: 35 | name: 'lint' 36 | runs-on: 'ubuntu-latest' 37 | steps: 38 | - uses: 'actions/checkout@v6' 39 | - id: variables 40 | run: | 41 | MAKEFILE=$(find . 
-name Makefile -print -quit) 42 | if [ -z "$MAKEFILE" ]; then 43 | echo dev-tools=gcr.io/cloud-foundation-cicd/cft/developer-tools:1 >> "$GITHUB_OUTPUT" 44 | else 45 | VERSION=$(grep "DOCKER_TAG_VERSION_DEVELOPER_TOOLS := " $MAKEFILE | cut -d\ -f3) 46 | IMAGE=$(grep "DOCKER_IMAGE_DEVELOPER_TOOLS := " $MAKEFILE | cut -d\ -f3) 47 | REGISTRY=$(grep "REGISTRY_URL := " $MAKEFILE | cut -d\ -f3) 48 | echo dev-tools=${REGISTRY}/${IMAGE}:${VERSION} >> "$GITHUB_OUTPUT" 49 | fi 50 | - run: docker run --rm -e ENABLE_BPMETADATA -v ${{ github.workspace }}:/workspace ${{ steps.variables.outputs.dev-tools }} module-swapper 51 | env: 52 | ENABLE_BPMETADATA: 1 53 | 54 | - run: docker run --rm -e ENABLE_BPMETADATA -v ${{ github.workspace }}:/workspace ${{ steps.variables.outputs.dev-tools }} /usr/local/bin/test_lint.sh 55 | env: 56 | ENABLE_BPMETADATA: 1 57 | 58 | commitlint: 59 | runs-on: ubuntu-latest 60 | steps: 61 | - uses: actions/checkout@v6 62 | with: 63 | fetch-depth: 0 64 | - name: Setup node 65 | uses: actions/setup-node@v6 66 | with: 67 | node-version: lts/* 68 | - name: Install commitlint 69 | run: | 70 | npm install -D @commitlint/cli@20.2.0 @commitlint/config-conventional@20.2.0 71 | echo "module.exports = { extends: ['@commitlint/config-conventional'], rules: {'subject-case': [0], 'header-max-length': [0]} };" > commitlint.config.js 72 | npx commitlint --version 73 | - name: Validate PR commits with commitlint 74 | if: github.event_name == 'pull_request' 75 | env: 76 | TITLE: ${{ github.event.pull_request.title }} 77 | run: 'echo "$TITLE" | npx commitlint --verbose' 78 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Please note that this file was generated from [terraform-google-module-template](https://github.com/terraform-google-modules/terraform-google-module-template). 16 | # Please make sure to contribute relevant changes upstream! 
17 | 18 | # Make will use bash instead of sh 19 | SHELL := /usr/bin/env bash 20 | 21 | DOCKER_TAG_VERSION_DEVELOPER_TOOLS := 1.23 22 | DOCKER_IMAGE_DEVELOPER_TOOLS := cft/developer-tools 23 | REGISTRY_URL := gcr.io/cloud-foundation-cicd 24 | ENABLE_BPMETADATA := 1 25 | export ENABLE_BPMETADATA 26 | 27 | # Enter docker container for local development 28 | .PHONY: docker_run 29 | docker_run: 30 | docker run --rm -it \ 31 | -e SERVICE_ACCOUNT_JSON \ 32 | -v "$(CURDIR)":/workspace \ 33 | $(REGISTRY_URL)/${DOCKER_IMAGE_DEVELOPER_TOOLS}:${DOCKER_TAG_VERSION_DEVELOPER_TOOLS} \ 34 | /bin/bash 35 | 36 | # Execute prepare tests within the docker container 37 | .PHONY: docker_test_prepare 38 | docker_test_prepare: 39 | docker run --rm -it \ 40 | -e SERVICE_ACCOUNT_JSON \ 41 | -e TF_VAR_org_id \ 42 | -e TF_VAR_folder_id \ 43 | -e TF_VAR_billing_account \ 44 | -v "$(CURDIR)":/workspace \ 45 | $(REGISTRY_URL)/${DOCKER_IMAGE_DEVELOPER_TOOLS}:${DOCKER_TAG_VERSION_DEVELOPER_TOOLS} \ 46 | /usr/local/bin/execute_with_credentials.sh prepare_environment 47 | 48 | # Clean up test environment within the docker container 49 | .PHONY: docker_test_cleanup 50 | docker_test_cleanup: 51 | docker run --rm -it \ 52 | -e SERVICE_ACCOUNT_JSON \ 53 | -e TF_VAR_org_id \ 54 | -e TF_VAR_folder_id \ 55 | -e TF_VAR_billing_account \ 56 | -v "$(CURDIR)":/workspace \ 57 | $(REGISTRY_URL)/${DOCKER_IMAGE_DEVELOPER_TOOLS}:${DOCKER_TAG_VERSION_DEVELOPER_TOOLS} \ 58 | /usr/local/bin/execute_with_credentials.sh cleanup_environment 59 | 60 | # Execute integration tests within the docker container 61 | .PHONY: docker_test_integration 62 | docker_test_integration: 63 | docker run --rm -it \ 64 | -e SERVICE_ACCOUNT_JSON \ 65 | -v "$(CURDIR)":/workspace \ 66 | $(REGISTRY_URL)/${DOCKER_IMAGE_DEVELOPER_TOOLS}:${DOCKER_TAG_VERSION_DEVELOPER_TOOLS} \ 67 | /usr/local/bin/test_integration.sh 68 | 69 | # Execute lint tests within the docker container 70 | .PHONY: docker_test_lint 71 | docker_test_lint: 72 | docker run --rm -it \ 73 | -e ENABLE_BPMETADATA \ 74 | -e EXCLUDE_LINT_DIRS \ 75 | -v "$(CURDIR)":/workspace \ 76 | $(REGISTRY_URL)/${DOCKER_IMAGE_DEVELOPER_TOOLS}:${DOCKER_TAG_VERSION_DEVELOPER_TOOLS} \ 77 | /usr/local/bin/test_lint.sh 78 | 79 | # Generate documentation 80 | .PHONY: docker_generate_docs 81 | docker_generate_docs: 82 | docker run --rm -it \ 83 | -e ENABLE_BPMETADATA \ 84 | -v "$(CURDIR)":/workspace \ 85 | $(REGISTRY_URL)/${DOCKER_IMAGE_DEVELOPER_TOOLS}:${DOCKER_TAG_VERSION_DEVELOPER_TOOLS} \ 86 | /bin/bash -c 'source /usr/local/bin/task_helper_functions.sh && generate_docs -d' 87 | 88 | # Alias for backwards compatibility 89 | .PHONY: generate_docs 90 | generate_docs: docker_generate_docs 91 | -------------------------------------------------------------------------------- /metadata.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: blueprints.cloud.google.com/v1alpha1 16 | kind: BlueprintMetadata 17 | metadata: 18 | name: terraform-google-analytics-lakehouse 19 | annotations: 20 | config.kubernetes.io/local-config: "true" 21 | spec: 22 | info: 23 | title: terraform-google-lakehouse 24 | source: 25 | repo: https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse.git 26 | sourceType: git 27 | version: 0.4.0 28 | actuationTool: 29 | flavor: Terraform 30 | version: ">= 0.13" 31 | description: {} 32 | content: 33 | documentation: 34 | - title: Create an Analytics Lakehouse 35 | url: https://cloud.google.com/architecture/big-data-analytics/analytics-lakehouse 36 | examples: 37 | - name: analytics_lakehouse 38 | location: examples/analytics_lakehouse 39 | interfaces: 40 | variables: 41 | - name: enable_apis 42 | description: Whether or not to enable underlying APIs in this solution. 43 | varType: string 44 | defaultValue: true 45 | - name: force_destroy 46 | description: Whether or not to protect GCS resources from deletion when solution is modified or changed. 47 | varType: string 48 | defaultValue: false 49 | - name: labels 50 | description: A map of labels to apply to contained resources. 51 | varType: map(string) 52 | defaultValue: 53 | analytics-lakehouse: true 54 | - name: project_id 55 | description: Google Cloud Project ID 56 | varType: string 57 | required: true 58 | - name: public_data_bucket 59 | description: Public Data bucket for access 60 | varType: string 61 | defaultValue: data-analytics-demos 62 | - name: region 63 | description: Google Cloud Region 64 | varType: string 65 | defaultValue: us-central1 66 | - name: use_case_short 67 | description: Short name for use case 68 | varType: string 69 | defaultValue: lakehouse 70 | outputs: 71 | - name: bigquery_editor_url 72 | description: The URL to launch the BigQuery editor 73 | - name: lakehouse_colab_url 74 | description: The URL to launch the in-console tutorial for the Analytics Lakehouse solution 75 | - name: lookerstudio_report_url 76 | description: The URL to create a new Looker Studio report that displays a sample dashboard for data analysis 77 | - name: neos_tutorial_url 78 | description: The URL to launch the in-console tutorial for the Analytics Lakehouse solution 79 | - name: workflow_return_project_setup 80 | description: Output of the project setup workflow 81 | requirements: 82 | roles: 83 | - level: Project 84 | roles: 85 | - roles/owner 86 | - roles/bigquery.dataViewer 87 | services: 88 | - cloudkms.googleapis.com 89 | - cloudresourcemanager.googleapis.com 90 | - bigquery.googleapis.com 91 | - bigquerystorage.googleapis.com 92 | - bigqueryconnection.googleapis.com 93 | - serviceusage.googleapis.com 94 | - iam.googleapis.com 95 | -------------------------------------------------------------------------------- /tutorial.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | # Customize an Analytics Lakehouse Solution 9 | 10 | Learn how to build and deploy your own proof of concept based on the deployed [Analytics Lakehouse](https://console.cloud.google.com/products/solutions/details/analytics-lakehouse) Jump Start Solution. You can customize the Jump Start Solution deployment by creating a copy of the source code. You can modify the infrastructure and application code as needed and redeploy the solution with the changes. 11 | 12 | To avoid conflicts, only one user should modify and deploy a solution in a single Google Cloud project.
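If the repository is not already present in your Cloud Shell environment, it can be cloned before opening the workspace. A minimal sketch, using the repository URL recorded in `metadata.yaml`:

```bash
# Clone the solution source and switch into it; the directory name
# simply mirrors the repository name.
git clone https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse.git
cd terraform-google-analytics-lakehouse
```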
13 | 14 | ## Open cloned repository as workspace 15 | 16 | To open the directory where the repository is cloned as a workspace in the editor, follow the steps below based on whether you are using the Cloud Shell Editor in Preview Mode or Legacy Mode. 17 | 18 | --- 19 | **Legacy Cloud Shell Editor** 20 | 21 | 1. Go to the `File` menu. 22 | 2. Select `Open Workspace`. 23 | 3. Choose the directory where the repository has been cloned. This directory is the current directory in the cloud shell terminal. 24 | 25 | **New Cloud Shell Editor** 26 | 27 | 1. Go to the hamburger icon located in the top-left corner of the editor. 28 | 2. Go to the `File` menu. 29 | 3. Select `Open Folder`. 30 | 4. Choose the directory where the repository has been cloned. This directory is the current directory in the cloud shell terminal. 31 | 32 | ## Before you begin 33 | 34 | We strongly recommend that you familiarize yourself with the Analytics Lakehouse solution by reading the [solution guide](https://cloud.google.com/architecture/big-data-analytics/analytics-lakehouse). 35 | 36 | NOTE: A change in the infrastructure code might cause a change in the incurred cost. 37 | 38 | --- 39 | **Create an automated deployment** 40 | 41 | Run the deploy_solution.sh script. 42 | 43 | ```bash 44 | ./deploy_solution.sh 45 | ``` 46 | 47 | --- 48 | **Monitor the deployment** 49 | 50 | Get the deployment details. 51 | 52 | ```bash 53 | gcloud infra-manager deployments describe DEPLOYMENT_NAME --location REGION 54 | ``` 55 | 56 | Monitor your deployment at [Solution deployments page](https://console.cloud.google.com/products/solutions/deployments?pageState=(%22deployments%22:(%22f%22:%22%255B%257B_22k_22_3A_22Labels_22_2C_22t_22_3A13_2C_22v_22_3A_22_5C_22modification-reason%2520_3A%2520make-it-mine_5C_22_22_2C_22s_22_3Atrue_2C_22i_22_3A_22deployment.labels_22%257D%255D%22))). 57 | 58 | ## Save your edits to the solution 59 | 60 | Use any of the following methods to save your edits to the solution: 61 | 62 | --- 63 | **Download the solution** 64 | 65 | To download your solution, in the `File` menu, select `Download Workspace`. The solution is downloaded in a compressed format. 66 | 67 | 68 | --- 69 | **Save the solution to your Git repository** 70 | 71 | Set the remote URL to your Git repository: 72 | ```bash 73 | git remote set-url origin [git-repo-url] 74 | ``` 75 | 76 | Review the modified files, then commit and push them to your remote repository branch. 77 | 78 | ## Delete the deployed solution 79 | 80 | Optional: Use one of the options below if you want to delete the deployed solution: 81 | 82 | * Go to [Solution deployments page](https://console.cloud.google.com/products/solutions/deployments?pageState=(%22deployments%22:(%22f%22:%22%255B%257B_22k_22_3A_22Labels_22_2C_22t_22_3A13_2C_22v_22_3A_22_5C_22modification-reason%2520_3A%2520make-it-mine_5C_22_22_2C_22s_22_3Atrue_2C_22i_22_3A_22deployment.labels_22%257D%255D%22))). 83 | * Click on the link under "Deployment name". It will take you to the deployment details page for the solution. 84 | * Click on the "DELETE" button located at the top right corner of the page. 85 | 86 | -------------------------------------------------------------------------------- /test/integration/analytics_lakehouse/analytics_lakehouse_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package multiple_buckets 16 | 17 | import ( 18 | "fmt" 19 | "testing" 20 | "time" 21 | 22 | "github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test/pkg/bq" 23 | "github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test/pkg/gcloud" 24 | "github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test/pkg/tft" 25 | "github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test/pkg/utils" 26 | "github.com/stretchr/testify/assert" 27 | ) 28 | 29 | // Retry if these errors are encountered. 30 | var retryErrors = map[string]string{ 31 | ".*does not have enough resources available to fulfill the request. Try a different zone,.*": "Compute zone resources currently unavailable.", 32 | ".*Error 400: The subnetwork resource*": "Subnet is eventually drained", 33 | } 34 | 35 | func TestAnalyticsLakehouse(t *testing.T) { 36 | dwh := tft.NewTFBlueprintTest(t, tft.WithRetryableTerraformErrors(retryErrors, 60, time.Minute)) 37 | 38 | dwh.DefineVerify(func(assert *assert.Assertions) { 39 | // Commented out until Workbench provider proxy-byoid-url bug is fixed 40 | // dwh.DefaultVerify(assert) 41 | 42 | time.Sleep(300 * time.Second) 43 | 44 | projectID := dwh.GetTFSetupStringOutput("project_id") 45 | 46 | verifyWorkflow := func(workflow string) (bool, error) { 47 | executions := gcloud.Runf(t, "workflows executions list %s --project %s --sort-by=startTime", workflow, projectID) 48 | state := executions.Get("0.state").String() 49 | if state == "FAILED" { 50 | id := executions.Get("0.name") 51 | gcloud.Runf(t, "workflows executions describe %s", id) 52 | t.FailNow() 53 | } 54 | if state == "SUCCEEDED" { 55 | return false, nil 56 | } 57 | return true, nil 58 | } 59 | 60 | // Assert copy-data workflow ran successfully 61 | verifyCopyDataWorkflow := func() (bool, error) { 62 | return verifyWorkflow("copy-data") 63 | } 64 | utils.Poll(t, verifyCopyDataWorkflow, 50, 15*time.Second) 65 | 66 | // Assert project-setup workflow ran successfully 67 | verifyProjectSetupWorkflow := func() (bool, error) { 68 | return verifyWorkflow("project-setup") 69 | } 70 | utils.Poll(t, verifyProjectSetupWorkflow, 100, 15*time.Second) 71 | 72 | tables := []string{ 73 | "gcp_primary_raw.ga4_obfuscated_sample_ecommerce_images", 74 | "gcp_primary_raw.textocr_images", 75 | "gcp_primary_staging.new_york_taxi_trips_tlc_yellow_trips_2022", 76 | "gcp_primary_staging.thelook_ecommerce_distribution_centers", 77 | "gcp_primary_staging.thelook_ecommerce_events", 78 | "gcp_primary_staging.thelook_ecommerce_inventory_items", 79 | "gcp_primary_staging.thelook_ecommerce_order_items", 80 | "gcp_primary_staging.thelook_ecommerce_orders", 81 | "gcp_primary_staging.thelook_ecommerce_products", 82 | "gcp_primary_staging.thelook_ecommerce_users", 83 | "gcp_lakehouse_ds.agg_events_iceberg", 84 | } 85 | 86 | query_template := "SELECT count(*) AS count FROM `%[1]s.%[2]s`;" 87 | for _, table := range tables { 88 | query := fmt.Sprintf(query_template, projectID, table) 89 | op := bq.Runf(t, "--project_id=%[1]s query --nouse_legacy_sql 
%[2]s", projectID, query) 90 | 91 | count := op.Get("0.count").Int() 92 | assert.Greater(count, int64(0), table) 93 | } 94 | }) 95 | 96 | dwh.DefineTeardown(func(assert *assert.Assertions) { 97 | dwh.DefaultTeardown(assert) 98 | 99 | }) 100 | dwh.Test() 101 | } 102 | -------------------------------------------------------------------------------- /src/python/bigtable.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Copyright 2024 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # This file is used as a part of the Neos journey for the Analytics 17 | # Lakehouse Jumpstart solution. It is not automatically executed as a 18 | # part of the default deployment. 19 | 20 | """Bigtable to PySpark to BigQuery example.""" 21 | from pyspark.sql import SparkSession 22 | import sys 23 | 24 | # Must provide a project ID and an instance ID. 25 | if len(sys.argv) < 3: 26 | print("Please provide a project ID and an instance ID.") 27 | 28 | project_id = sys.argv[1] 29 | instance_id = sys.argv[2] 30 | 31 | # Create a Spark session and configure the spark-bigtable connector. 32 | spark = SparkSession.builder \ 33 | .config("spark.jars", 34 | "gs://spark-bigtable-preview/jars/" + 35 | "spark-bigtable-0.0.1-preview5-SNAPSHOT.jar") \ 36 | .getOrCreate() 37 | 38 | # Create the catalog schema to convert Bigtable columns to Spark. 39 | # "table" defnes the Bigtable namespace and table to read data from. 40 | # "rowkey" defines the rowkey. 41 | # "columns" are formatted as 42 | # "SPARK_DF_COLUMN_NAME":{ 43 | # "cf":"BIGTABLE_COLUMN_FAMILY", 44 | # "col":"BIGTABLE_COLUMN_NAME", 45 | # "type":"BIGTABLE_TYPE" 46 | # } 47 | catalog = ''.join(("""{ 48 | "table":{"namespace":"default", "name":"UserPersonalization"}, 49 | "rowkey":"rowkey", 50 | "columns":{ 51 | "_rowkey":{"cf":"rowkey", "col":"rowkey", "type":"string"}, 52 | "rec0":{ 53 | "cf":"Recommendations", 54 | "col":"Recommendation0", 55 | "type":"string" 56 | }, 57 | "rec1":{ 58 | "cf":"Recommendations", 59 | "col":"Recommendation1", 60 | "type":"string" 61 | }, 62 | "rec2":{ 63 | "cf":"Recommendations", 64 | "col":"Recommendation2", 65 | "type":"string" 66 | }, 67 | "rec3":{ 68 | "cf":"Recommendations", 69 | "col":"Recommendation3", 70 | "type":"string" 71 | } 72 | } 73 | }""").split()) 74 | 75 | # Load Bigtable data. 76 | df = spark.read \ 77 | .format('bigtable') \ 78 | .option('spark.bigtable.project.id', project_id) \ 79 | .option('spark.bigtable.instance.id', instance_id) \ 80 | .options(catalog=catalog) \ 81 | .load() 82 | 83 | 84 | # Create new dfs counting each recommended item per rec position. 85 | # Rename columns to join later. 
86 | def groupby_count_rename(df, col): 87 | return df.groupBy(col) \ 88 | .count() \ 89 | .withColumnRenamed(col, "item") \ 90 | .withColumnRenamed("count", col) 91 | 92 | 93 | r0 = groupby_count_rename(df, "rec0") 94 | r1 = groupby_count_rename(df, "rec1") 95 | r2 = groupby_count_rename(df, "rec2") 96 | r3 = groupby_count_rename(df, "rec3") 97 | 98 | # Join all columns together. The output is a table with 99 | # item names and number of times each name appears in each rec column. 100 | joined_df = r0.join(r1, r0.item == r1.item, 'outer') \ 101 | .join(r2, r0.item == r2.item, 'outer') \ 102 | .join(r3, r0.item == r3.item, 'outer') \ 103 | .select(r0.item, "rec0", "rec1", "rec2", "rec3") 104 | 105 | # Write the table to BigQuery. 106 | joined_df.write \ 107 | .format("bigquery") \ 108 | .option("writeMethod", "direct") \ 109 | .save("gcp_lakehouse_ds.user_recommendations") 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # terraform-google-lakehouse 2 | 3 | ## Description 4 | ### tagline 5 | This is an auto-generated module. 6 | 7 | ### detailed 8 | This module was generated from [terraform-google-module-template](https://github.com/terraform-google-modules/terraform-google-module-template/), which by default generates a module that simply creates a GCS bucket. As the module develops, this README should be updated. 9 | 10 | The resources/services/activations/deletions that this module will create/trigger are: 11 | 12 | - Create a GCS bucket with the provided name 13 | 14 | ### preDeploy 15 | To deploy this blueprint you must have an active billing account and billing permissions. 16 | 17 | ## Documentation 18 | - [Create an Analytics Lakehouse](https://cloud.google.com/architecture/big-data-analytics/analytics-lakehouse) 19 | 20 | ## Usage 21 | 22 | Basic usage of this module is as follows: 23 | 24 | ```hcl 25 | module "analytics_lakehouse" { 26 | source = "../.." 27 | 28 | project_id = var.project_id 29 | region = "us-central1" 30 | force_destroy = true 31 | 32 | } 33 | ``` 34 | 35 | Functional examples are included in the 36 | [examples](./examples/) directory. 37 | 38 | 39 | ## Inputs 40 | 41 | | Name | Description | Type | Default | Required | 42 | |------|-------------|------|---------|:--------:| 43 | | enable\_apis | Whether or not to enable underlying APIs in this solution. | `string` | `true` | no | 44 | | force\_destroy | Whether or not to protect GCS resources from deletion when solution is modified or changed. | `string` | `false` | no | 45 | | labels | A map of labels to apply to contained resources. | `map(string)` | <pre>{<br>  "analytics-lakehouse": true<br>}</pre>
| no | 46 | | project\_id | Google Cloud Project ID | `string` | n/a | yes | 47 | | public\_data\_bucket | Public Data bucket for access | `string` | `"data-analytics-demos"` | no | 48 | | region | Google Cloud Region | `string` | `"us-central1"` | no | 49 | | use\_case\_short | Short name for use case | `string` | `"lakehouse"` | no | 50 | 51 | ## Outputs 52 | 53 | | Name | Description | 54 | |------|-------------| 55 | | bigquery\_editor\_url | The URL to launch the BigQuery editor | 56 | | lakehouse\_colab\_url | The URL to launch the in-console tutorial for the Analytics Lakehouse solution | 57 | | lookerstudio\_report\_url | The URL to create a new Looker Studio report that displays a sample dashboard for data analysis | 58 | | neos\_tutorial\_url | The URL to launch the in-console tutorial for the Analytics Lakehouse solution | 59 | | workflow\_return\_project\_setup | Output of the project setup workflow | 60 | 61 | 62 | 63 | ## Requirements 64 | 65 | These sections describe requirements for using this module. 66 | 67 | ### Software 68 | 69 | The following dependencies must be available: 70 | 71 | - [Terraform][terraform] >= v0.13 72 | - [Terraform Provider for GCP][terraform-provider-gcp] plugin ~> v4.56 73 | 74 | ### Service Account 75 | 76 | A service account with the following roles must be used to provision 77 | the resources of this module: 78 | 79 | - Storage Admin: `roles/storage.admin` 80 | 81 | The [Project Factory module][project-factory-module] and the 82 | [IAM module][iam-module] may be used in combination to provision a 83 | service account with the necessary roles applied. 84 | 85 | ### APIs 86 | 87 | A project with the following APIs enabled must be used to host the 88 | resources of this module: 89 | 90 | - Google Cloud Storage JSON API: `storage-api.googleapis.com` 91 | 92 | The [Project Factory module][project-factory-module] can be used to 93 | provision a project with the necessary APIs enabled. 94 | 95 | ## Contributing 96 | 97 | Refer to the [contribution guidelines](./CONTRIBUTING.md) for 98 | information on contributing to this module. 99 | 100 | [iam-module]: https://registry.terraform.io/modules/terraform-google-modules/iam/google 101 | [project-factory-module]: https://registry.terraform.io/modules/terraform-google-modules/project-factory/google 102 | [terraform-provider-gcp]: https://www.terraform.io/docs/providers/google/index.html 103 | [terraform]: https://www.terraform.io/downloads.html 104 | 105 | ## Security Disclosures 106 | 107 | Please see our [security disclosure process](./SECURITY.md). 108 | -------------------------------------------------------------------------------- /deploy_solution.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2023 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | set -o pipefail 16 | 17 | handle_error() { 18 | local exit_code=$?
19 | exit $exit_code 20 | } 21 | trap 'handle_error' ERR 22 | 23 | SOLUTION_ID="analytics-lakehouse" 24 | 25 | echo "Fetching Project ID" 26 | PROJECT_ID=$(gcloud config get project) 27 | echo "Project ID is ${PROJECT_ID}" 28 | 29 | # Iterate over the Infra Manager locations to identify the deployment. 30 | # Currently only one deployment per project is supported; 31 | # in the future, if multiple deployments per project are supported, this will need to change. 32 | IM_SUPPORTED_REGIONS=("us-central1" "europe-west1" "asia-east1") 33 | 34 | for REGION in "${IM_SUPPORTED_REGIONS[@]}"; do 35 | DEPLOYMENT_NAME=$(gcloud infra-manager deployments list --location "${REGION}" \ 36 | --filter="labels.goog-solutions-console-deployment-name:* AND \ 37 | labels.goog-solutions-console-solution-id:${SOLUTION_ID}" \ 38 | --format='value(name)') 39 | if [ -n "$DEPLOYMENT_NAME" ]; then 40 | break 41 | fi 42 | done 43 | if [ -z "$DEPLOYMENT_NAME" ]; then 44 | echo "Failed to find the existing deployment, exiting now!" 45 | exit 1 46 | fi 47 | echo "Region is ${REGION}" 48 | echo "Deployment name is ${DEPLOYMENT_NAME}" 49 | 50 | SERVICE_ACCOUNT=$(gcloud infra-manager deployments describe "${DEPLOYMENT_NAME}" --location "${REGION}" --format='value(serviceAccount)') 51 | 52 | echo "Assigning required roles to the service account ${SERVICE_ACCOUNT}" 53 | # Iterate over the roles and check if the service account already has that role 54 | # assigned. If it has then skip adding that policy binding as using 55 | # --condition=None can overwrite any existing conditions in the binding. 56 | CURRENT_POLICY=$(gcloud projects get-iam-policy "${PROJECT_ID}" --format=json) 57 | MEMBER_EMAIL=$(echo "${SERVICE_ACCOUNT}" | awk -F '/' '{print $NF}') 58 | MEMBER="serviceAccount:${MEMBER_EMAIL}" 59 | 60 | while IFS= read -r role || [[ -n "$role" ]] 61 | do \ 62 | if echo "$CURRENT_POLICY" | jq -e --arg role "$role" --arg member "$MEMBER" '.bindings[] | select(.role == $role) | .members[] | select(. == $member)' > /dev/null; then \ 63 | echo "IAM policy binding already exists for member ${MEMBER} and role ${role}" 64 | else \ 65 | gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ 66 | --member="$MEMBER" \ 67 | --role="$role" \ 68 | --condition=None 69 | fi 70 | done < "roles.txt" 71 | 72 | DEPLOYMENT_DESCRIPTION=$(gcloud infra-manager deployments describe "${DEPLOYMENT_NAME}" --location "${REGION}" --format json) 73 | cat <<EOF > input.tfvars 74 | # Do not edit the region as changing the region can lead to failed deployment. 75 | region="$(echo "$DEPLOYMENT_DESCRIPTION" | jq -r '.terraformBlueprint.inputValues.region.inputValue')" 76 | project_id = "${PROJECT_ID}" 77 | labels = { 78 | "goog-solutions-console-deployment-name" = "${DEPLOYMENT_NAME}", 79 | "goog-solutions-console-solution-id" = "${SOLUTION_ID}" 80 | } 81 | EOF 82 | 83 | echo "An input.tfvars has been created in the current directory with a set of default input terraform variables for the solution. You can modify their values or go ahead with the defaults." 84 | read -r -p "Once done, press Enter to continue: " 85 | 86 | echo "Creating the cloud storage bucket if it does not exist already" 87 | BUCKET_NAME="${PROJECT_ID}_infra_manager_staging" 88 | if ! gsutil ls "gs://$BUCKET_NAME" &> /dev/null; then 89 | gsutil mb "gs://$BUCKET_NAME/" 90 | echo "Bucket $BUCKET_NAME created successfully." 91 | else 92 | echo "Bucket $BUCKET_NAME already exists. Moving on to the next step."
93 | fi 94 | 95 | echo "Deploying the solution" 96 | gcloud infra-manager deployments apply projects/"${PROJECT_ID}"/locations/"${REGION}"/deployments/"${DEPLOYMENT_NAME}" --service-account "${SERVICE_ACCOUNT}" --local-source="." --inputs-file=./input.tfvars --labels="modification-reason=make-it-mine,goog-solutions-console-deployment-name=${DEPLOYMENT_NAME},goog-solutions-console-solution-id=${SOLUTION_ID},goog-config-partner=sc" 97 | -------------------------------------------------------------------------------- /deploy_via_trigger.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2024 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o pipefail 17 | 18 | handle_error() { 19 | local exit_code=$? 20 | exit $exit_code 21 | } 22 | trap 'handle_error' ERR 23 | 24 | while getopts p: flag 25 | do 26 | case "${flag}" in 27 | p) PROJECT_ID=${OPTARG};; 28 | *) echo "usage: $0 [-p PROJECT_ID]" >&2 29 | exit 1 ;; 30 | esac 31 | done 32 | 33 | if [ -z "$PROJECT_ID" ]; then 34 | echo "Failed to read the project id, exiting now!" 35 | exit 1 36 | fi 37 | 38 | SOLUTION_ID="analytics-lakehouse" 39 | 40 | # Iterate over the Infra Manager locations to identify the deployment. 41 | # Currently only one deployment per project is supported; 42 | # in the future, if multiple deployments per project are supported, this will need to change. 43 | IM_SUPPORTED_REGIONS=("us-central1" "europe-west1" "asia-east1") 44 | 45 | for REGION in "${IM_SUPPORTED_REGIONS[@]}"; do 46 | DEPLOYMENT_NAME=$(gcloud infra-manager deployments list --location "${REGION}" \ 47 | --filter="labels.goog-solutions-console-deployment-name:* AND \ 48 | labels.goog-solutions-console-solution-id:${SOLUTION_ID}" \ 49 | --format='value(name)') 50 | if [ -n "$DEPLOYMENT_NAME" ]; then 51 | break 52 | fi 53 | done 54 | if [ -z "$DEPLOYMENT_NAME" ]; then 55 | echo "Failed to find the existing deployment, exiting now!" 56 | exit 1 57 | fi 58 | echo "Project ID is ${PROJECT_ID}" 59 | echo "Region is ${REGION}" 60 | echo "Deployment name is ${DEPLOYMENT_NAME}" 61 | 62 | SERVICE_ACCOUNT=$(gcloud infra-manager deployments describe "${DEPLOYMENT_NAME}" --location "${REGION}" --format='value(serviceAccount)') 63 | 64 | echo "Assigning required roles to the service account ${SERVICE_ACCOUNT}" 65 | # Iterate over the roles and check if the service account already has that role 66 | # assigned. If it has then skip adding that policy binding as using 67 | # --condition=None can overwrite any existing conditions in the binding.
68 | CURRENT_POLICY=$(gcloud projects get-iam-policy "${PROJECT_ID}" --format=json) 69 | MEMBER_EMAIL=$(echo "${SERVICE_ACCOUNT}" | awk -F '/' '{print $NF}') 70 | MEMBER="serviceAccount:${MEMBER_EMAIL}" 71 | apt-get install jq -y 72 | while IFS= read -r role || [[ -n "$role" ]] 73 | do \ 74 | if echo "$CURRENT_POLICY" | jq -e --arg role "$role" --arg member "$MEMBER" '.bindings[] | select(.role == $role) | .members[] | select(. == $member)' > /dev/null; then \ 75 | echo "IAM policy binding already exists for member ${MEMBER} and role ${role}" 76 | else \ 77 | gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ 78 | --member="$MEMBER" \ 79 | --role="$role" \ 80 | --condition=None 81 | fi 82 | done < "roles.txt" 83 | 84 | DEPLOYMENT_DESCRIPTION=$(gcloud infra-manager deployments describe "${DEPLOYMENT_NAME}" --location "${REGION}" --format json) 85 | cat <<EOF > input.tfvars 86 | # Do not edit the region as changing the region can lead to failed deployment. 87 | region="$(echo "$DEPLOYMENT_DESCRIPTION" | jq -r '.terraformBlueprint.inputValues.region.inputValue')" 88 | project_id = "${PROJECT_ID}" 89 | labels = { 90 | "goog-solutions-console-deployment-name" = "${DEPLOYMENT_NAME}", 91 | "goog-solutions-console-solution-id" = "${SOLUTION_ID}" 92 | } 93 | EOF 94 | 95 | echo "Creating the cloud storage bucket if it does not exist already" 96 | BUCKET_NAME="${PROJECT_ID}_infra_manager_staging" 97 | if ! gsutil ls "gs://$BUCKET_NAME" &> /dev/null; then 98 | gsutil mb "gs://$BUCKET_NAME/" 99 | echo "Bucket $BUCKET_NAME created successfully." 100 | else 101 | echo "Bucket $BUCKET_NAME already exists. Moving on to the next step." 102 | fi 103 | 104 | echo "Deploying the solution" 105 | gcloud infra-manager deployments apply projects/"${PROJECT_ID}"/locations/"${REGION}"/deployments/"${DEPLOYMENT_NAME}" --service-account "${SERVICE_ACCOUNT}" --local-source="." --inputs-file=./input.tfvars --labels="modification-reason=make-it-mine,goog-solutions-console-deployment-name=${DEPLOYMENT_NAME},goog-solutions-console-solution-id=${SOLUTION_ID},goog-config-partner=sc" 106 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement (CLA). You (or your employer) retain the copyright to your 10 | contribution; this simply gives us permission to use and redistribute your 11 | contributions as part of the project. Head over to 12 | <https://cla.developers.google.com/> to see your current agreements on file or 13 | to sign a new one. 14 | 15 | You generally only need to submit a CLA once, so if you've already submitted one 16 | (even if it was for a different project), you probably don't need to do it 17 | again. 18 | 19 | ## Code Reviews 20 | 21 | All submissions, including submissions by project members, require review. We 22 | use GitHub pull requests for this purpose. Consult 23 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 24 | information on using pull requests.
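Because the repository's lint workflow validates pull request titles with commitlint's conventional-commits configuration, it can save a review round trip to check a candidate title locally before opening the PR. A minimal sketch, assuming Node.js is available; the title string is only an example:

```bash
# Install the same commitlint packages the lint workflow uses
npm install -D @commitlint/cli @commitlint/config-conventional
echo "module.exports = { extends: ['@commitlint/config-conventional'] };" > commitlint.config.js

# Pipe a candidate PR title through commitlint, mirroring the workflow's check
echo "feat: add dataplex taxonomy subworkflow" | npx commitlint --verbose
```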
25 | 26 | ## Development 27 | 28 | The following dependencies must be installed on the development system: 29 | 30 | - [Docker Engine][docker-engine] 31 | - [Google Cloud SDK][google-cloud-sdk] 32 | - [make] 33 | 34 | ### Generating Documentation for Inputs and Outputs 35 | 36 | The Inputs and Outputs tables in the READMEs of the root module, 37 | submodules, and example modules are automatically generated based on 38 | the `variables` and `outputs` of the respective modules. These tables 39 | must be refreshed if the module interfaces are changed. 40 | 41 | #### Execution 42 | 43 | Run `make generate_docs` to generate new Inputs and Outputs tables. 44 | 45 | ### Integration Testing 46 | 47 | Integration tests are used to verify the behaviour of the root module, 48 | submodules, and example modules. Additions, changes, and fixes should 49 | be accompanied with tests. 50 | 51 | The integration tests are run using [Kitchen][kitchen], 52 | [Kitchen-Terraform][kitchen-terraform], and [InSpec][inspec]. These 53 | tools are packaged within a Docker image for convenience. 54 | 55 | The general strategy for these tests is to verify the behaviour of the 56 | [example modules](./examples/), thus ensuring that the root module, 57 | submodules, and example modules are all functionally correct. 58 | 59 | #### Test Environment 60 | The easiest way to test the module is in an isolated test project. The setup for such a project is defined in [test/setup](./test/setup/) directory. 61 | 62 | To use this setup, you need a service account with these permissions (on a Folder or Organization): 63 | - Project Creator 64 | - Project Billing Manager 65 | 66 | The project that the service account belongs to must have the following APIs enabled (the setup won't 67 | create any resources on the service account's project): 68 | - Cloud Resource Manager 69 | - Cloud Billing 70 | - Service Usage 71 | - Identity and Access Management (IAM) 72 | 73 | Export the Service Account credentials to your environment like so: 74 | 75 | ``` 76 | export SERVICE_ACCOUNT_JSON=$(< credentials.json) 77 | ``` 78 | 79 | You will also need to set a few environment variables: 80 | ``` 81 | export TF_VAR_org_id="your_org_id" 82 | export TF_VAR_folder_id="your_folder_id" 83 | export TF_VAR_billing_account="your_billing_account_id" 84 | ``` 85 | 86 | With these settings in place, you can prepare a test project using Docker: 87 | ``` 88 | make docker_test_prepare 89 | ``` 90 | 91 | #### Noninteractive Execution 92 | 93 | Run `make docker_test_integration` to test all of the example modules 94 | noninteractively, using the prepared test project. 95 | 96 | #### Interactive Execution 97 | 98 | 1. Run `make docker_run` to start the testing Docker container in 99 | interactive mode. 100 | 101 | 1. Run `kitchen_do create ` to initialize the working 102 | directory for an example module. 103 | 104 | 1. Run `kitchen_do converge ` to apply the example module. 105 | 106 | 1. Run `kitchen_do verify ` to test the example module. 107 | 108 | 1. Run `kitchen_do destroy ` to destroy the example module 109 | state. 110 | 111 | ### Linting and Formatting 112 | 113 | Many of the files in the repository can be linted or formatted to 114 | maintain a standard of quality. 115 | 116 | #### Execution 117 | 118 | Run `make docker_test_lint`. 
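The `docker_test_lint` target forwards the `EXCLUDE_LINT_DIRS` environment variable into the developer-tools container, so directories can be skipped without editing the Makefile. A minimal sketch; the pattern shown is an illustrative value, not a project default:

```bash
# EXCLUDE_LINT_DIRS is passed through by the docker_test_lint Makefile target;
# adjust the pattern to the directories you want the linters to ignore.
EXCLUDE_LINT_DIRS='\./test/fixtures' make docker_test_lint
```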
119 | 120 | [docker-engine]: https://www.docker.com/products/docker-engine 121 | [flake8]: http://flake8.pycqa.org/en/latest/ 122 | [gofmt]: https://golang.org/cmd/gofmt/ 123 | [google-cloud-sdk]: https://cloud.google.com/sdk/install 124 | [hadolint]: https://github.com/hadolint/hadolint 125 | [inspec]: https://inspec.io/ 126 | [kitchen-terraform]: https://github.com/newcontext-oss/kitchen-terraform 127 | [kitchen]: https://kitchen.ci/ 128 | [make]: https://en.wikipedia.org/wiki/Make_(software) 129 | [shellcheck]: https://www.shellcheck.net/ 130 | [terraform-docs]: https://github.com/segmentio/terraform-docs 131 | [terraform]: https://terraform.io/ 132 | -------------------------------------------------------------------------------- /workflows.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | resource "google_project_service_identity" "workflows" { 18 | provider = google-beta 19 | project = module.project-services.project_id 20 | service = "workflows.googleapis.com" 21 | 22 | depends_on = [time_sleep.wait_after_apis_activate] 23 | } 24 | 25 | resource "google_service_account" "workflows_sa" { 26 | project = module.project-services.project_id 27 | account_id = "workflows-sa-${random_id.id.hex}" 28 | display_name = "Workflows Service Account" 29 | 30 | depends_on = [google_project_service_identity.workflows] 31 | } 32 | 33 | resource "google_project_iam_member" "workflows_sa_roles" { 34 | for_each = toset([ 35 | "roles/workflows.admin", 36 | "roles/storage.admin", 37 | "roles/iam.serviceAccountTokenCreator", 38 | "roles/iam.serviceAccountUser", 39 | "roles/logging.logWriter", 40 | "roles/dataproc.admin", 41 | "roles/bigquery.admin", 42 | "roles/dataplex.admin" 43 | ]) 44 | 45 | project = module.project-services.project_id 46 | role = each.key 47 | member = "serviceAccount:${google_service_account.workflows_sa.email}" 48 | 49 | depends_on = [ 50 | google_service_account.workflows_sa 51 | ] 52 | } 53 | 54 | # Workflow to copy data from prod GCS bucket to private buckets 55 | # NOTE: google_storage_bucket.<name>.name omits the `gs://` prefix. 56 | # You can use google_storage_bucket.<name>.url to include the prefix.
57 | resource "google_workflows_workflow" "copy_data" { 58 | name = "copy-data" 59 | project = module.project-services.project_id 60 | region = var.region 61 | description = "Copies data and performs project setup" 62 | service_account = google_service_account.workflows_sa.email 63 | deletion_protection = false 64 | source_contents = templatefile("${path.module}/src/yaml/copy-data.yaml", { 65 | public_data_bucket = var.public_data_bucket, 66 | textocr_images_bucket = google_storage_bucket.textocr_images_bucket.name, 67 | ga4_images_bucket = google_storage_bucket.ga4_images_bucket.name, 68 | tables_bucket = google_storage_bucket.tables_bucket.name, 69 | dataplex_bucket = google_storage_bucket.dataplex_bucket.name, 70 | images_zone_name = google_dataplex_zone.gcp_primary_raw.name, 71 | tables_zone_name = google_dataplex_zone.gcp_primary_staging.name, 72 | lake_name = google_dataplex_lake.gcp_primary.name 73 | }) 74 | 75 | depends_on = [ 76 | google_project_iam_member.workflows_sa_roles, 77 | google_project_iam_member.dataproc_sa_roles 78 | ] 79 | 80 | } 81 | 82 | # Workflow to set up project resources 83 | resource "google_workflows_workflow" "project_setup" { 84 | name = "project-setup" 85 | project = module.project-services.project_id 86 | region = var.region 87 | description = "Copies data and performs project setup" 88 | service_account = google_service_account.workflows_sa.email 89 | deletion_protection = false 90 | source_contents = templatefile("${path.module}/src/yaml/project-setup.yaml", {}) 91 | 92 | depends_on = [ 93 | google_project_iam_member.workflows_sa_roles 94 | ] 95 | 96 | } 97 | 98 | # execute workflows after all resources are created 99 | # # get a token to execute the workflows 100 | data "google_client_config" "current" { 101 | } 102 | 103 | # # execute the copy data workflow 104 | data "http" "call_workflows_copy_data" { 105 | url = "https://workflowexecutions.googleapis.com/v1/projects/${module.project-services.project_id}/locations/${var.region}/workflows/${google_workflows_workflow.copy_data.name}/executions" 106 | method = "POST" 107 | request_headers = { 108 | Accept = "application/json" 109 | Authorization = "Bearer ${data.google_client_config.current.access_token}" } 110 | depends_on = [ 111 | google_storage_bucket.textocr_images_bucket, 112 | google_storage_bucket.ga4_images_bucket, 113 | google_storage_bucket.tables_bucket 114 | ] 115 | } 116 | 117 | resource "time_sleep" "wait_after_copy_data" { 118 | create_duration = "30s" 119 | depends_on = [ 120 | data.http.call_workflows_copy_data 121 | ] 122 | } 123 | 124 | # execute the other project setup workflow 125 | data "http" "call_workflows_project_setup" { 126 | url = "https://workflowexecutions.googleapis.com/v1/projects/${module.project-services.project_id}/locations/${var.region}/workflows/${google_workflows_workflow.project_setup.name}/executions" 127 | method = "POST" 128 | request_headers = { 129 | Accept = "application/json" 130 | Authorization = "Bearer ${data.google_client_config.current.access_token}" } 131 | depends_on = [ 132 | google_bigquery_dataset.gcp_lakehouse_ds, 133 | time_sleep.wait_for_dataplex_discovery, 134 | google_dataplex_asset.gcp_primary_ga4_obfuscated_sample_ecommerce, 135 | google_dataplex_asset.gcp_primary_tables, 136 | google_dataplex_asset.gcp_primary_textocr, 137 | google_project_iam_member.connection_permission_grant, 138 | google_project_iam_member.dataproc_sa_roles, 139 | google_service_account.dataproc_service_account, 140 | google_storage_bucket.provisioning_bucket, 141 | 
google_storage_bucket.warehouse_bucket, 142 | time_sleep.wait_after_copy_data 143 | ] 144 | } 145 | -------------------------------------------------------------------------------- /src/ipynb/exploratory-analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "!pip install chart-studio" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import numpy as np\n", 20 | "import scipy.optimize\n", 21 | "\n", 22 | "# Import and setup for plotly in Colab\n", 23 | "import chart_studio\n", 24 | "import chart_studio.plotly as py\n", 25 | "import plotly.graph_objects as go\n", 26 | "import plotly.io as pio\n", 27 | "import plotly.express as px\n", 28 | "\n", 29 | "# Enable displaying pandas data frames as interactive tables by default\n", 30 | "from google.colab import data_table\n", 31 | "data_table.enable_dataframe_formatter()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "PROJECT_ID = 'CHANGE_TO_PROJECT_ID'\n", 41 | "REGION = \"CHANGE_TO_DEPLOYMENT_REGION\"" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "from google.colab import auth\n", 51 | "auth.authenticate_user()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!gcloud config set project {PROJECT_ID}\n", 61 | "!gcloud config get-value project" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "%%bigquery --project {PROJECT_ID}\n", 71 | "SELECT\n", 72 | " o.order_id,\n", 73 | " o.user_id order_user_id,\n", 74 | " o.status order_status,\n", 75 | " o.created_at order_created_at,\n", 76 | " o.returned_at order_returned_at,\n", 77 | " o.shipped_at order_shipped_at,\n", 78 | " o.delivered_at order_delivered_at,\n", 79 | " o.num_of_item order_number_of_items,\n", 80 | " i.id AS order_items_id,\n", 81 | " i.product_id AS order_items_product_id,\n", 82 | " i.status order_items_status,\n", 83 | " i.sale_price order_items_sale_price,\n", 84 | " p.id AS product_id,\n", 85 | " p.cost product_cost,\n", 86 | " p.category product_category,\n", 87 | " p.name product_name,\n", 88 | " p.brand product_brand,\n", 89 | " p.retail_price product_retail_price,\n", 90 | " p.department product_department,\n", 91 | " p.sku product_sku,\n", 92 | " p.distribution_center_id,\n", 93 | " d.name AS dist_center_name,\n", 94 | " d.latitude dist_center_lat,\n", 95 | " d.longitude dist_center_long,\n", 96 | " u.id AS user_id,\n", 97 | " u.first_name user_first_name,\n", 98 | " u.last_name user_last_name,\n", 99 | " u.age user_age,\n", 100 | " u.gender user_gender,\n", 101 | " u.state user_state,\n", 102 | " u.postal_code user_postal_code,\n", 103 | " u.city user_city,\n", 104 | " u.country user_country,\n", 105 | " u.latitude user_lat,\n", 106 | " u.longitude user_long,\n", 107 | " u.traffic_source user_traffic_source\n", 108 | "FROM\n", 109 | " gcp_lakehouse_ds.gcp_tbl_orders o\n", 110 | "INNER JOIN\n", 111 | " gcp_lakehouse_ds.gcp_tbl_order_items i\n", 112 | "ON\n", 113 | " o.order_id = i.order_id\n", 114 | "INNER 
JOIN\n", 115 | " gcp_lakehouse_ds.gcp_tbl_products p\n", 116 | "ON\n", 117 | " i.product_id = p.id\n", 118 | "INNER JOIN\n", 119 | " gcp_lakehouse_ds.gcp_tbl_distribution_centers d\n", 120 | "ON\n", 121 | " p.distribution_center_id = d.id\n", 122 | "INNER JOIN\n", 123 | " gcp_lakehouse_ds.gcp_tbl_users u\n", 124 | "ON\n", 125 | " o.user_id = u.id\n", 126 | "limit 100" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "%%bigquery --project {PROJECT_ID}\n", 136 | "\n", 137 | "SELECT\n", 138 | "sum(order_id) as count,\n", 139 | " date(o.created_at) date\n", 140 | "FROM\n", 141 | " gcp_lakehouse_ds.gcp_tbl_orders o\n", 142 | " group by o.created_at\n", 143 | " order by date(o.created_at)\n", 144 | " limit 500" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "%%bigquery data --project {PROJECT_ID}\n", 154 | "\n", 155 | "SELECT\n", 156 | "sum(order_id) as count,\n", 157 | " date(o.created_at) date\n", 158 | "FROM\n", 159 | " gcp_lakehouse_ds.gcp_tbl_orders o\n", 160 | " group by o.created_at\n", 161 | " order by date(o.created_at)\n", 162 | " limit 500" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "data['date'] = pd.to_datetime(data['date'])\n", 172 | "data['date'] = data['date'].astype(np.int64) // 10**9\n", 173 | "data.head()" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "from datetime import datetime\n", 183 | "from matplotlib import pyplot\n", 184 | "\n", 185 | "fig, ax = pyplot.subplots(figsize=(20,12))\n", 186 | "data.plot(x='date', y='count', kind='scatter', ax=ax)\n", 187 | "ax.set_xticklabels([datetime.fromtimestamp(date).strftime('%Y/%m/%d') for date in ax.get_xticks()])" 188 | ] 189 | } 190 | ], 191 | "metadata": { 192 | "language_info": { 193 | "name": "python" 194 | }, 195 | "orig_nbformat": 4 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 2 199 | } 200 | -------------------------------------------------------------------------------- /src/sql/sp_lookerstudio_report.sql: -------------------------------------------------------------------------------- 1 | -- Copyright 2023 Google LLC 2 | -- 3 | -- Licensed under the Apache License, Version 2.0 (the "License"); 4 | -- you may not use this file except in compliance with the License. 5 | -- You may obtain a copy of the License at 6 | -- 7 | -- http://www.apache.org/licenses/LICENSE-2.0 8 | -- 9 | -- Unless required by applicable law or agreed to in writing, software 10 | -- distributed under the License is distributed on an "AS IS" BASIS, 11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | -- See the License for the specific language governing permissions and 13 | -- limitations under the License. 
14 | 15 | CREATE OR REPLACE TABLE `${project_id}.ds_edw.lookerstudio_report` 16 | AS 17 | WITH TaxiData AS 18 | ( 19 | SELECT VENDOR_ID as TaxiCompany, 20 | EXTRACT(YEAR FROM Pickup_DateTime) AS Year, 21 | EXTRACT(WEEK FROM Pickup_DateTime) AS WeekNumber, 22 | CONCAT('Week ',FORMAT("%02d", 23 | EXTRACT(WEEK FROM Pickup_DateTime))) AS WeekName, 24 | CONCAT(VENDOR_ID,':',EXTRACT(YEAR FROM Pickup_DateTime),':',FORMAT("%02d",EXTRACT(WEEK FROM Pickup_DateTime))) AS GroupPartition, 25 | COUNT(1) AS NumberOfRides, 26 | AVG(Trip_Distance) AS AvgDistance, 27 | SUM(Fare_Amount) AS Total_Fare_Amount, 28 | SUM(Extra) AS Total_Surcharge, 29 | SUM(MTA_Tax) AS Total_MTA_Tax, 30 | SUM(Tolls_Amount) AS Total_Tolls_Amount, 31 | SUM(imp_Surcharge) AS Total_Improvement_Surcharge, 32 | SUM(Tip_Amount) AS Total_Tip_Amount, 33 | SUM(Total_Amount) AS Total_Total_Amount 34 | FROM `${project_id}.ds_edw.taxi_trips` AS taxi_trips 35 | WHERE Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-02' --'2015-01-01' AND '2021-12-31' -- There is odd data in some of the source files from NYC 36 | GROUP BY 1, 2, 3, 4, 5 37 | ) 38 | , LagPercents AS 39 | ( 40 | SELECT TaxiCompany, 41 | Year, 42 | WeekNumber, 43 | WeekName, 44 | NumberOfRides, 45 | GroupPartition, 46 | AvgDistance, 47 | Total_Fare_Amount, 48 | Total_Surcharge, 49 | Total_MTA_Tax, 50 | Total_Tolls_Amount, 51 | Total_Improvement_Surcharge, 52 | Total_Tip_Amount, 53 | Total_Total_Amount, 54 | LAG(NumberOfRides) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_NumberOfRides, 55 | LAG(AvgDistance) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_AvgDistance, 56 | LAG(Total_Fare_Amount) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Fare_Amount, 57 | LAG(Total_Surcharge) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Surcharge, 58 | LAG(Total_MTA_Tax) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_MTA_Tax, 59 | LAG(Total_Tolls_Amount) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Tolls_Amount, 60 | LAG(Total_Improvement_Surcharge) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Improvement_Surcharge, 61 | LAG(Total_Tip_Amount) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Tip_Amount, 62 | LAG(Total_Total_Amount) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Total_Amount 63 | FROM TaxiData 64 | ) 65 | , PercentChange AS 66 | ( 67 | SELECT TaxiCompany, 68 | Year, 69 | WeekNumber, 70 | WeekName, 71 | GroupPartition, 72 | NumberOfRides, 73 | AvgDistance, 74 | Total_Fare_Amount, 75 | Total_Surcharge, 76 | Total_MTA_Tax, 77 | Total_Tolls_Amount, 78 | Total_Improvement_Surcharge, 79 | Total_Tip_Amount, 80 | Total_Total_Amount, 81 | Prior_Week_NumberOfRides, 82 | Prior_Week_AvgDistance, 83 | Prior_Week_Total_Fare_Amount, 84 | Prior_Week_Total_Surcharge, 85 | Prior_Week_Total_MTA_Tax, 86 | Prior_Week_Total_Tolls_Amount, 87 | Prior_Week_Total_Improvement_Surcharge, 88 | Prior_Week_Total_Tip_Amount, 89 | Prior_Week_Total_Total_Amount, 90 | SAFE_DIVIDE(CAST(NumberOfRides - Prior_Week_NumberOfRides AS NUMERIC) , CAST(Prior_Week_NumberOfRides AS NUMERIC)) AS PercentChange_NumberOfRides, 91 | SAFE_DIVIDE(CAST(AvgDistance - Prior_Week_AvgDistance AS NUMERIC) , CAST(Prior_Week_AvgDistance AS NUMERIC)) AS PercentChange_AvgDistance, 92 | SAFE_DIVIDE((Total_Fare_Amount - 
Prior_Week_Total_Fare_Amount) , Prior_Week_Total_Fare_Amount) AS PercentChange_Total_Fare_Amount, 93 | SAFE_DIVIDE((Total_Surcharge - Prior_Week_Total_Surcharge) , Prior_Week_Total_Surcharge) AS PercentChange_Total_Surcharge, 94 | SAFE_DIVIDE((Total_MTA_Tax - Prior_Week_Total_MTA_Tax) , Prior_Week_Total_MTA_Tax) AS PercentChange_Total_MTA_Tax, 95 | SAFE_DIVIDE((Total_Tolls_Amount - Prior_Week_Total_Tolls_Amount) , Prior_Week_Total_Tolls_Amount) AS PercentChange_Total_Tolls_Amount, 96 | SAFE_DIVIDE((Total_Improvement_Surcharge - Prior_Week_Total_Improvement_Surcharge) , Prior_Week_Total_Improvement_Surcharge) AS PercentChange_Total_Improvement_Surcharge, 97 | SAFE_DIVIDE((Total_Tip_Amount - Prior_Week_Total_Tip_Amount) , Prior_Week_Total_Tip_Amount) AS PercentChange_Total_Tip_Amount, 98 | SAFE_DIVIDE((Total_Total_Amount - Prior_Week_Total_Total_Amount) , Prior_Week_Total_Total_Amount) AS PercentChange_Total_Total_Amount 99 | FROM LagPercents 100 | ) 101 | SELECT * 102 | FROM PercentChange 103 | ORDER BY GroupPartition; 104 | 105 | CREATE OR REPLACE VIEW `${project_id}.ds_edw.vw_lookerstudio_report` as 106 | SELECT * FROM `${project_id}.ds_edw.lookerstudio_report` 107 | WHERE Year in (2022); 108 | -------------------------------------------------------------------------------- /bigquery.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | # Set up BigQuery resources 18 | 19 | # # Create the BigQuery dataset 20 | resource "google_bigquery_dataset" "gcp_lakehouse_ds" { 21 | project = module.project-services.project_id 22 | dataset_id = "gcp_lakehouse_ds" 23 | friendly_name = "My gcp_lakehouse Dataset" 24 | description = "My gcp_lakehouse Dataset with tables" 25 | location = var.region 26 | labels = var.labels 27 | delete_contents_on_destroy = var.force_destroy 28 | } 29 | 30 | # # Create a BigQuery connection for Spark 31 | resource "google_bigquery_connection" "spark" { 32 | project = module.project-services.project_id 33 | connection_id = "spark" 34 | location = var.region 35 | friendly_name = "gcp lakehouse spark connection" 36 | spark {} 37 | } 38 | 39 | # # This grants permissions to the service account of the Spark connection.
40 | resource "google_project_iam_member" "connection_permission_grant" { 41 | for_each = toset([ 42 | "roles/biglake.admin", 43 | "roles/bigquery.dataEditor", 44 | "roles/bigquery.connectionAdmin", 45 | "roles/bigquery.jobUser", 46 | "roles/bigquery.readSessionUser", 47 | "roles/storage.objectAdmin" 48 | ]) 49 | 50 | project = module.project-services.project_id 51 | role = each.key 52 | member = format("serviceAccount:%s", google_bigquery_connection.spark.spark[0].service_account_id) 53 | } 54 | 55 | locals { 56 | lakehouse_catalog = "lakehouse_catalog" 57 | } 58 | 59 | # # Creates a stored procedure for a spark job to create iceberg tables 60 | resource "google_bigquery_routine" "create_iceberg_tables" { 61 | project = module.project-services.project_id 62 | dataset_id = google_bigquery_dataset.gcp_lakehouse_ds.dataset_id 63 | routine_id = "create_iceberg_tables" 64 | routine_type = "PROCEDURE" 65 | language = "PYTHON" 66 | definition_body = "" 67 | arguments { 68 | name = "lakehouse_catalog" 69 | data_type = "{\"typeKind\" : \"STRING\"}" 70 | } 71 | arguments { 72 | name = "lakehouse_database" 73 | data_type = "{\"typeKind\" : \"STRING\"}" 74 | } 75 | arguments { 76 | name = "bq_dataset" 77 | data_type = "{\"typeKind\" : \"STRING\"}" 78 | } 79 | spark_options { 80 | connection = google_bigquery_connection.spark.name 81 | runtime_version = "2.1" 82 | main_file_uri = "gs://${google_storage_bucket_object.bigquery_file.bucket}/${google_storage_bucket_object.bigquery_file.name}" 83 | jar_uris = ["gs://spark-lib/biglake/biglake-catalog-iceberg1.2.0-0.1.0-with-dependencies.jar"] 84 | properties = { 85 | "spark.sql.catalog.lakehouse_catalog" : "org.apache.iceberg.spark.SparkCatalog", 86 | "spark.sql.catalog.lakehouse_catalog.blms_catalog" : local.lakehouse_catalog 87 | "spark.sql.catalog.lakehouse_catalog.catalog-impl" : "org.apache.iceberg.gcp.biglake.BigLakeCatalog", 88 | "spark.sql.catalog.lakehouse_catalog.gcp_location" : var.region, 89 | "spark.sql.catalog.lakehouse_catalog.gcp_project" : var.project_id, 90 | "spark.sql.catalog.lakehouse_catalog.warehouse" : "${google_storage_bucket.warehouse_bucket.url}/warehouse", 91 | "spark.jars.packages" : "org.apache.iceberg:iceberg-spark-runtime-3.3_2.13:1.2.1" 92 | } 93 | } 94 | } 95 | 96 | # # Execute after Dataplex discovery wait 97 | 98 | resource "google_bigquery_job" "create_view_ecommerce" { 99 | project = module.project-services.project_id 100 | location = var.region 101 | job_id = "create_view_ecommerce_${random_id.id.hex}" 102 | 103 | query { 104 | query = file("${path.module}/src/sql/view_ecommerce.sql") 105 | 106 | # Since the query contains DML, these must be set to empty. 
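# Editor's note (hedged): BigQuery rejects explicit create/write dispositions
# for jobs that run DDL/DML or call procedures. The Google provider would
# otherwise default them (to "CREATE_IF_NEEDED" and "WRITE_EMPTY"; an
# assumption about provider behavior, not something stated in this file), so
# both fields below are set to empty strings.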
107 | create_disposition = "" 108 | write_disposition = "" 109 | } 110 | 111 | depends_on = [time_sleep.wait_for_dataplex_discovery] 112 | } 113 | 114 | # resource "time_sleep" "check_create_view_ecommerce" { 115 | # create_duration = "30s" 116 | 117 | # depends_on = [google_bigquery_job.create_view_ecommerce] 118 | 119 | # lifecycle { 120 | # postcondition { 121 | # condition = google_bigquery_job.create_view_ecommerce.status.state == "DONE" && google_bigquery_job.create_view_ecommerce.status.error_result == null 122 | # error_message = "State: ${google_bigquery_job.create_view_ecommerce.status}, Error: ${google_bigquery_job.create_view_ecommerce.status.error_result.message}" 123 | # } 124 | # } 125 | # } 126 | 127 | resource "google_bigquery_job" "create_iceberg_tables" { 128 | project = module.project-services.project_id 129 | location = var.region 130 | job_id = "create_iceberg_tables_${random_id.id.hex}" 131 | 132 | query { 133 | query = "call gcp_lakehouse_ds.create_iceberg_tables('${local.lakehouse_catalog}', 'lakehouse_db', '${google_bigquery_dataset.gcp_lakehouse_ds.dataset_id}')" 134 | 135 | # Since the query calls a stored procedure, these must be set to empty. 136 | create_disposition = "" 137 | write_disposition = "" 138 | } 139 | 140 | depends_on = [time_sleep.wait_for_dataplex_discovery] 141 | } 142 | 143 | # resource "time_sleep" "check_create_iceberg_tables" { 144 | # create_duration = "300s" 145 | 146 | # depends_on = [google_bigquery_job.create_iceberg_tables] 147 | 148 | # lifecycle { 149 | # postcondition { 150 | # condition = google_bigquery_job.create_iceberg_tables.status.state == "DONE" && google_bigquery_job.create_iceberg_tables.status.error_result == null 151 | # error_message = "State: ${google_bigquery_job.create_iceberg_tables.status}, Error: ${google_bigquery_job.create_iceberg_tables.status.error_result.message}" 152 | # } 153 | # } 154 | # } 155 | 156 | -------------------------------------------------------------------------------- /src/yaml/copy-data.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
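# Editor's note (hedged sketch, not part of the original file): the ${...}
# tokens below are Terraform template variables, and $${...} escapes literal
# Workflows expressions, which suggests this file is rendered with
# templatefile() before deployment, roughly:
#
#   templatefile("${path.module}/src/yaml/copy-data.yaml", {
#     public_data_bucket = "some-public-source-bucket" # hypothetical value
#     # ...remaining variables wired to the module's buckets and Dataplex names
#   })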
14 | 15 | main: 16 | params: [] 17 | steps: 18 | - init: 19 | # Define local variables from terraform env variables 20 | assign: 21 | - source_bucket_name: ${public_data_bucket} 22 | - dest_ga4_images_bucket_name: ${ga4_images_bucket} 23 | - dest_textocr_images_bucket_name: ${textocr_images_bucket} 24 | - dest_tables_bucket_name: ${tables_bucket} 25 | - images_zone_name: ${images_zone_name} 26 | - tables_zone_name: ${tables_zone_name} 27 | - lake_name: ${lake_name} 28 | - dataplex_bucket: ${dataplex_bucket} 29 | # If this workflow has been run before, do not run again 30 | - sub_check_if_run: 31 | steps: 32 | - assign_values: 33 | assign: 34 | - project_id: $${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")} 35 | - location: $${sys.get_env("GOOGLE_CLOUD_LOCATION")} 36 | - workflow_id: $${sys.get_env("GOOGLE_CLOUD_WORKFLOW_ID")} 37 | - get_executions: 38 | call: http.get 39 | args: 40 | url: $${"https://workflowexecutions.googleapis.com/v1/projects/"+project_id+"/locations/"+location+"/workflows/"+workflow_id+"/executions"} 41 | auth: 42 | type: OAuth2 43 | result: Operation 44 | - check_if_run: 45 | switch: 46 | - condition: $${len(Operation.body.executions) > 1} 47 | next: end 48 | - sub_copy_data: 49 | parallel: 50 | branches: 51 | - copy_textocr_images: 52 | steps: 53 | - copy_textocr_images_call: 54 | call: copy_objects 55 | args: 56 | source_bucket_name: $${source_bucket_name} 57 | prefix: TextOCR_images 58 | dest_bucket_name: $${dest_textocr_images_bucket_name} 59 | result: copy_textocr_images_output 60 | - copy_ga4_images: 61 | steps: 62 | - copy_ga4_images_call: 63 | call: copy_objects 64 | args: 65 | source_bucket_name: $${source_bucket_name} 66 | prefix: ga4_obfuscated_sample_ecommerce_images 67 | dest_bucket_name: $${dest_ga4_images_bucket_name} 68 | result: copy_ga4_output 69 | - copy_new_york_taxi_trips_tables: 70 | steps: 71 | - copy_new_york_taxi_trips_tables_call: 72 | call: copy_objects 73 | args: 74 | source_bucket_name: $${source_bucket_name} 75 | prefix: new-york-taxi-trips 76 | dest_bucket_name: $${dest_tables_bucket_name} 77 | result: copy_new_york_taxi_trips_tables_output 78 | - copy_thelook_ecommerce_tables: 79 | steps: 80 | - copy_thelook_ecommerce_tables_call: 81 | call: copy_objects 82 | args: 83 | source_bucket_name: $${source_bucket_name} 84 | prefix: thelook_ecommerce 85 | dest_bucket_name: $${dest_tables_bucket_name} 86 | result: copy_thelook_ecommerce_tables_output 87 | - copy_dataplex_names_counts: 88 | steps: 89 | - copy_dataplex_names_counts_call: 90 | call: copy_objects 91 | args: 92 | source_bucket_name: $${source_bucket_name} 93 | prefix: views 94 | dest_bucket_name: $${dataplex_bucket} 95 | result: copy_dataplex_names_counts_output 96 | 97 | # Subworkflow to copy initial objects 98 | copy_objects: 99 | params: [source_bucket_name, prefix, dest_bucket_name] 100 | steps: 101 | - list_objects: 102 | call: googleapis.storage.v1.objects.list 103 | args: 104 | bucket: $${source_bucket_name} 105 | prefix: $${prefix} 106 | result: list_result 107 | - start_counter: 108 | assign: 109 | - copied_objects: 0 110 | - copy_objects: 111 | parallel: 112 | shared: [copied_objects] 113 | for: 114 | value: object 115 | index: i 116 | in: $${list_result.items} 117 | steps: 118 | - copy: 119 | try: 120 | steps: 121 | - copy_object: 122 | call: googleapis.storage.v1.objects.copy 123 | args: 124 | sourceBucket: $${source_bucket_name} 125 | sourceObject: $${text.url_encode(object.name)} 126 | destinationBucket: $${dest_bucket_name} 127 | destinationObject:
$${text.url_encode(object.name)} 128 | result: copy_result 129 | - save_result: 130 | assign: 131 | - copied_objects: $${copied_objects + 1} 132 | except: 133 | as: e 134 | raise: 135 | exception: $${e} 136 | sourceBucket: $${source_bucket_name} 137 | sourceObject: $${object.name} 138 | destinationBucket: $${dest_bucket_name} 139 | - finish: 140 | return: $${copied_objects + " objects copied"} 141 | -------------------------------------------------------------------------------- /.github/workflows/periodic-reporter.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023-2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # NOTE: This file is automatically generated from: 16 | # https://github.com/GoogleCloudPlatform/cloud-foundation-toolkit/blob/main/infra/terraform/modules/workflow_files/periodic-reporter.yaml 17 | 18 | name: 'reporter' 19 | 20 | on: 21 | schedule: 22 | # 2 hours after scheduled periodic and once again in the evening 23 | - cron: '0 5,17 * * *' 24 | workflow_dispatch: 25 | 26 | jobs: 27 | report: 28 | if: github.repository_owner == 'GoogleCloudPlatform' || github.repository_owner == 'terraform-google-modules' 29 | 30 | permissions: 31 | issues: 'write' 32 | 33 | runs-on: 'ubuntu-latest' 34 | 35 | steps: 36 | - uses: 'actions/github-script@v8' 37 | with: 38 | script: |- 39 | // label for all issues opened by reporter 40 | const periodicLabel = 'periodic-failure'; 41 | 42 | // check if any reporter opened any issues previously 43 | const prevIssues = await github.paginate(github.rest.issues.listForRepo, { 44 | ...context.repo, 45 | state: 'open', 46 | creator: 'github-actions[bot]', 47 | labels: [periodicLabel] 48 | }); 49 | // createOrCommentIssue creates a new issue or comments on an existing issue. 50 | const createOrCommentIssue = async function (title, txt) { 51 | if (prevIssues.length < 1) { 52 | console.log('no previous issues found, creating one'); 53 | await github.rest.issues.create({ 54 | ...context.repo, 55 | title: title, 56 | body: txt, 57 | labels: [periodicLabel] 58 | }); 59 | return; 60 | } 61 | if (prevIssues.length > 1) { 62 | console.warn( 63 | `found ${prevIssues.length} issues but only adding comment to ${prevIssues[0].html_url}` 64 | ); 65 | } 66 | console.log( 67 | `found previous issue ${prevIssues[0].html_url}, adding comment` 68 | ); 69 | await github.rest.issues.createComment({ 70 | ...context.repo, 71 | issue_number: prevIssues[0].number, 72 | body: txt 73 | }); 74 | }; 75 | 76 | // updateAndCloseIssues comments on any existing issues and closes them. No-op if no issue exists. 
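// Hypothetical usage note (editorial sketch, not part of the generated file):
// this helper is invoked further down in this script as, e.g.,
//   updateAndCloseIssues(`[Passing periodic](${periodicCheck.html_url}) at ${commit.html_url}. Closing this issue.`);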
77 | const updateAndCloseIssues = async function (txt) { 78 | if (prevIssues.length < 1) { 79 | console.log('no previous issues found, skipping close'); 80 | return; 81 | } 82 | for (const prevIssue of prevIssues) { 83 | console.log(`found previous issue ${prevIssue.html_url}, adding comment`); 84 | await github.rest.issues.createComment({ 85 | ...context.repo, 86 | issue_number: prevIssue.number, 87 | body: txt 88 | }); 89 | console.log(`closing ${prevIssue.html_url}`); 90 | await github.rest.issues.update({ 91 | ...context.repo, 92 | issue_number: prevIssue.number, 93 | body: txt, 94 | state: 'closed' 95 | }); 96 | } 97 | }; 98 | 99 | // Find status of check runs. 100 | // We will find check runs for each commit and then filter for the periodic. 101 | // Checks API only allows for ref and if we use main there could be edge cases where 102 | // the check run happened on a SHA that is different from head. 103 | const commits = await github.paginate(github.rest.repos.listCommits, { 104 | ...context.repo 105 | }); 106 | 107 | var foundCheck = false; 108 | let periodicCheck = {}; 109 | 110 | for (const commit of commits) { 111 | console.log( 112 | `checking runs at ${commit.html_url}: ${commit.commit.message}` 113 | ); 114 | const checks = await github.rest.checks.listForRef({ 115 | ...context.repo, 116 | ref: commit.sha 117 | }); 118 | // find runs for this commit 119 | for (const check of checks.data.check_runs) { 120 | console.log(`found run ${check.name} for ${commit.html_url}`); 121 | if (check.name.includes('periodic-int-trigger')) { 122 | foundCheck = true; 123 | periodicCheck = check; 124 | break; 125 | } 126 | } 127 | 128 | if (foundCheck) { 129 | if ( 130 | periodicCheck.status === 'completed' && 131 | periodicCheck.conclusion === 'success' 132 | ) { 133 | updateAndCloseIssues( 134 | `[Passing periodic](${periodicCheck.html_url}) at ${commit.html_url}. Closing this issue.` 135 | ); 136 | } else if (periodicCheck.status === 'in_progress') { 137 | console.log( 138 | `Check is pending ${periodicCheck.html_url} for ${commit.html_url}. Retry again later.` 139 | ); 140 | } 141 | // error case 142 | else { 143 | createOrCommentIssue( 144 | 'Failing periodic', 145 | `[Failing periodic](${periodicCheck.html_url}) at ${commit.html_url}.` 146 | ); 147 | } 148 | // exit early as check was found 149 | return; 150 | } 151 | } 152 | 153 | // no periodic-int-trigger checks found across all commits, report it 154 | createOrCommentIssue( 155 | 'Missing periodic', 156 | `Periodic test has not run in the past 24hrs. Last checked from ${ 157 | commits[0].html_url 158 | } to ${commits[commits.length - 1].html_url}.` 159 | ); 160 | -------------------------------------------------------------------------------- /main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | module "project-services" { 18 | source = "terraform-google-modules/project-factory/google//modules/project_services" 19 | version = "~> 18.0" 20 | disable_services_on_destroy = false 21 | 22 | project_id = var.project_id 23 | enable_apis = var.enable_apis 24 | 25 | activate_apis = [ 26 | "artifactregistry.googleapis.com", 27 | "biglake.googleapis.com", 28 | "bigquery.googleapis.com", 29 | "bigqueryconnection.googleapis.com", 30 | "bigquerydatapolicy.googleapis.com", 31 | "bigquerydatatransfer.googleapis.com", 32 | "bigquerymigration.googleapis.com", 33 | "bigqueryreservation.googleapis.com", 34 | "bigquerystorage.googleapis.com", 35 | "cloudapis.googleapis.com", 36 | "cloudbuild.googleapis.com", 37 | "cloudfunctions.googleapis.com", 38 | "compute.googleapis.com", 39 | "config.googleapis.com", 40 | "datacatalog.googleapis.com", 41 | "datalineage.googleapis.com", 42 | "dataplex.googleapis.com", 43 | "dataproc.googleapis.com", 44 | "iam.googleapis.com", 45 | "serviceusage.googleapis.com", 46 | "storage-api.googleapis.com", 47 | "storage.googleapis.com", 48 | "workflows.googleapis.com", 49 | "notebooks.googleapis.com", 50 | ] 51 | } 52 | 53 | resource "time_sleep" "wait_after_apis_activate" { 54 | depends_on = [module.project-services] 55 | create_duration = "30s" 56 | } 57 | 58 | # Random ID suffix used in resource names 59 | resource "random_id" "id" { 60 | byte_length = 4 61 | } 62 | 63 | # Set up Storage Buckets 64 | 65 | # # Set up the raw storage bucket 66 | resource "google_storage_bucket" "raw_bucket" { 67 | name = "gcp-${var.use_case_short}-raw-${random_id.id.hex}" 68 | project = module.project-services.project_id 69 | location = var.region 70 | uniform_bucket_level_access = true 71 | force_destroy = var.force_destroy 72 | 73 | # public_access_prevention = "enforced" # need to validate if this is a hard requirement 74 | } 75 | 76 | # # Set up the warehouse storage bucket 77 | resource "google_storage_bucket" "warehouse_bucket" { 78 | name = "gcp-${var.use_case_short}-warehouse-${random_id.id.hex}" 79 | project = module.project-services.project_id 80 | location = var.region 81 | uniform_bucket_level_access = true 82 | force_destroy = var.force_destroy 83 | 84 | # public_access_prevention = "enforced" # need to validate if this is a hard requirement 85 | } 86 | 87 | # # Set up the provisioning storage bucket 88 | resource "google_storage_bucket" "provisioning_bucket" { 89 | name = "gcp-${var.use_case_short}-provisioner-${random_id.id.hex}" 90 | project = module.project-services.project_id 91 | location = var.region 92 | uniform_bucket_level_access = true 93 | force_destroy = var.force_destroy 94 | 95 | } 96 | 97 | resource "google_storage_bucket" "ga4_images_bucket" { 98 | name = "gcp-${var.use_case_short}-ga4-images-${random_id.id.hex}" 99 | project = module.project-services.project_id 100 | location = var.region 101 | uniform_bucket_level_access = true 102 | force_destroy = var.force_destroy 103 | } 104 | 105 | resource "google_storage_bucket" "textocr_images_bucket" { 106 | name = "gcp-${var.use_case_short}-textocr-images-${random_id.id.hex}" 107 | project = module.project-services.project_id 108 | location = var.region 109 | uniform_bucket_level_access = true 110 | force_destroy = var.force_destroy 111 | } 112 | 113 | resource "google_storage_bucket" "tables_bucket" { 114 | name = "gcp-${var.use_case_short}-tables-${random_id.id.hex}" 115 | project = module.project-services.project_id 116 | location = var.region 117 | uniform_bucket_level_access = true 118 | force_destroy =
var.force_destroy 119 | } 120 | 121 | # Bucket used to store BI data in Dataplex 122 | resource "google_storage_bucket" "dataplex_bucket" { 123 | name = "gcp-${var.use_case_short}-dataplex-${random_id.id.hex}" 124 | project = module.project-services.project_id 125 | location = var.region 126 | uniform_bucket_level_access = true 127 | force_destroy = var.force_destroy 128 | } 129 | 130 | resource "google_storage_bucket_object" "bigquery_file" { 131 | bucket = google_storage_bucket.provisioning_bucket.name 132 | name = "bigquery.py" 133 | source = "${path.module}/src/python/bigquery.py" 134 | 135 | depends_on = [ 136 | google_storage_bucket.provisioning_bucket 137 | ] 138 | } 139 | 140 | resource "google_storage_bucket_object" "bigtable_file" { 141 | bucket = google_storage_bucket.provisioning_bucket.name 142 | name = "bigtable.py" 143 | source = "${path.module}/src/python/bigtable.py" 144 | 145 | depends_on = [ 146 | google_storage_bucket.provisioning_bucket 147 | ] 148 | } 149 | 150 | # Uploads the post-startup script for the workbench instance. 151 | resource "google_storage_bucket_object" "post_startup_script" { 152 | bucket = google_storage_bucket.provisioning_bucket.name 153 | name = "post_startup.sh" 154 | source = "${path.module}/src/shell/post_startup.sh" 155 | 156 | depends_on = [ 157 | google_storage_bucket.provisioning_bucket 158 | ] 159 | } 160 | 161 | resource "google_storage_bucket" "spark-log-directory" { 162 | name = "gcp-${var.use_case_short}-spark-log-directory-${random_id.id.hex}" 163 | project = module.project-services.project_id 164 | location = var.region 165 | uniform_bucket_level_access = true 166 | force_destroy = var.force_destroy 167 | } 168 | 169 | resource "google_storage_bucket" "phs-staging-bucket" { 170 | name = "gcp-${var.use_case_short}-phs-staging-${random_id.id.hex}" 171 | project = module.project-services.project_id 172 | location = var.region 173 | uniform_bucket_level_access = true 174 | force_destroy = var.force_destroy 175 | } 176 | 177 | resource "google_storage_bucket" "phs-temp-bucket" { 178 | name = "gcp-${var.use_case_short}-phs-temp-${random_id.id.hex}" 179 | project = module.project-services.project_id 180 | location = var.region 181 | uniform_bucket_level_access = true 182 | force_destroy = var.force_destroy 183 | } 184 | 185 | resource "google_storage_bucket" "sparkml-model-bucket" { 186 | name = "gcp-${var.use_case_short}-model-${random_id.id.hex}" 187 | project = module.project-services.project_id 188 | location = var.region 189 | uniform_bucket_level_access = true 190 | force_destroy = var.force_destroy 191 | } 192 | -------------------------------------------------------------------------------- /dataplex.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | resource "google_project_service_identity" "dataplex_sa" { 18 | provider = google-beta 19 | project = module.project-services.project_id 20 | service = "dataplex.googleapis.com" 21 | } 22 | 23 | # Grant the Dataplex service identity its service agent role on the project 24 | resource "google_project_iam_member" "dataplex_bucket_access" { 25 | project = module.project-services.project_id 26 | role = "roles/dataplex.serviceAgent" 27 | member = "serviceAccount:${google_project_service_identity.dataplex_sa.email}" 28 | } 29 | 30 | resource "google_dataplex_lake" "gcp_primary" { 31 | location = var.region 32 | name = "gcp-primary-lake" 33 | description = "gcp primary lake" 34 | display_name = "gcp primary lake" 35 | 36 | labels = { 37 | gcp-lake = "exists" 38 | } 39 | 40 | project = module.project-services.project_id 41 | 42 | depends_on = [ 43 | google_project_iam_member.dataplex_bucket_access 44 | ] 45 | 46 | } 47 | 48 | #zone - raw 49 | resource "google_dataplex_zone" "gcp_primary_raw" { 50 | discovery_spec { 51 | enabled = true 52 | } 53 | 54 | lake = google_dataplex_lake.gcp_primary.name 55 | location = var.region 56 | name = "gcp-primary-raw" 57 | 58 | resource_spec { 59 | location_type = "SINGLE_REGION" 60 | } 61 | 62 | type = "RAW" 63 | description = "Zone for thelook_ecommerce image data" 64 | display_name = "images" 65 | labels = {} 66 | project = module.project-services.project_id 67 | 68 | 69 | } 70 | 71 | #zone - curated, for staging the data 72 | resource "google_dataplex_zone" "gcp_primary_staging" { 73 | discovery_spec { 74 | enabled = true 75 | } 76 | 77 | lake = google_dataplex_lake.gcp_primary.name 78 | location = var.region 79 | name = "gcp-primary-staging" 80 | 81 | resource_spec { 82 | location_type = "SINGLE_REGION" 83 | } 84 | 85 | type = "CURATED" 86 | description = "Zone for thelook_ecommerce tabular data" 87 | display_name = "staging" 88 | labels = {} 89 | project = module.project-services.project_id 90 | } 91 | 92 | #zone - curated, for BI 93 | resource "google_dataplex_zone" "gcp_primary_curated_bi" { 94 | discovery_spec { 95 | enabled = true 96 | } 97 | 98 | lake = google_dataplex_lake.gcp_primary.name 99 | location = var.region 100 | name = "gcp-primary-curated" 101 | 102 | resource_spec { 103 | location_type = "SINGLE_REGION" 104 | } 105 | 106 | type = "CURATED" 107 | description = "Zone for thelook_ecommerce tabular data" 108 | display_name = "business_intelligence" 109 | labels = {} 110 | project = module.project-services.project_id 111 | } 112 | 113 | # Assets are defined below. Asset creation needs to wait until the data has been copied.
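# # Editor's note (hedged): each asset below attaches one storage bucket to a
# # zone with discovery enabled and gates on time_sleep.wait_after_copy_data,
# # a resource assumed to be defined elsewhere in this module (alongside the
# # copy-data workflow), roughly of the shape:
# #   resource "time_sleep" "wait_after_copy_data" { # assumed, not shown here
# #     create_duration = "..." # duration not shown in this file
# #   }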
114 | 115 | #asset 116 | resource "google_dataplex_asset" "gcp_primary_textocr" { 117 | name = "gcp-primary-textocr" 118 | location = var.region 119 | 120 | lake = google_dataplex_lake.gcp_primary.name 121 | dataplex_zone = google_dataplex_zone.gcp_primary_raw.name 122 | 123 | discovery_spec { 124 | enabled = true 125 | } 126 | 127 | resource_spec { 128 | name = "projects/${module.project-services.project_id}/buckets/${google_storage_bucket.textocr_images_bucket.name}" 129 | type = "STORAGE_BUCKET" 130 | read_access_mode = "MANAGED" 131 | } 132 | 133 | project = module.project-services.project_id 134 | depends_on = [time_sleep.wait_after_copy_data] 135 | 136 | } 137 | 138 | #asset 139 | resource "google_dataplex_asset" "gcp_primary_ga4_obfuscated_sample_ecommerce" { 140 | name = "gcp-primary-ga4-obfuscated-sample-ecommerce" 141 | location = var.region 142 | 143 | lake = google_dataplex_lake.gcp_primary.name 144 | dataplex_zone = google_dataplex_zone.gcp_primary_raw.name 145 | 146 | discovery_spec { 147 | enabled = true 148 | } 149 | 150 | resource_spec { 151 | name = "projects/${module.project-services.project_id}/buckets/${google_storage_bucket.ga4_images_bucket.name}" 152 | type = "STORAGE_BUCKET" 153 | read_access_mode = "MANAGED" 154 | } 155 | 156 | project = module.project-services.project_id 157 | depends_on = [time_sleep.wait_after_copy_data] 158 | 159 | } 160 | 161 | #asset 162 | resource "google_dataplex_asset" "gcp_primary_tables" { 163 | name = "gcp-primary-tables" 164 | location = var.region 165 | 166 | lake = google_dataplex_lake.gcp_primary.name 167 | dataplex_zone = google_dataplex_zone.gcp_primary_staging.name 168 | 169 | discovery_spec { 170 | enabled = true 171 | } 172 | 173 | resource_spec { 174 | name = "projects/${module.project-services.project_id}/buckets/${google_storage_bucket.tables_bucket.name}" 175 | type = "STORAGE_BUCKET" 176 | read_access_mode = "MANAGED" 177 | } 178 | 179 | project = module.project-services.project_id 180 | depends_on = [time_sleep.wait_after_copy_data] 181 | } 182 | 183 | # Add a wait for Dataplex Discovery. 184 | # Discovery on this data generally takes 6-8 minutes. 
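# # Editor's note (grounded in the locals below): discovery publishes each
# # zone's tables into a BigQuery dataset named after the zone with "-"
# # replaced by "_" (e.g. zone "gcp-primary-staging" becomes dataset
# # "gcp_primary_staging"), which is how the datascan below locates
# # thelook_ecommerce_orders once the wait has elapsed.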
185 | resource "time_sleep" "wait_for_dataplex_discovery" { 186 | depends_on = [ 187 | google_dataplex_asset.gcp_primary_tables, 188 | google_dataplex_asset.gcp_primary_ga4_obfuscated_sample_ecommerce, 189 | google_dataplex_asset.gcp_primary_textocr 190 | ] 191 | 192 | create_duration = "600s" 193 | } 194 | 195 | locals { 196 | datascan_dataset = replace(google_dataplex_zone.gcp_primary_staging.name, "-", "_") 197 | } 198 | 199 | resource "google_dataplex_datascan" "dq_scan" { 200 | project = module.project-services.project_id 201 | location = var.region 202 | data_scan_id = "thelook-ecommerce-orders" 203 | 204 | data { 205 | resource = "//bigquery.googleapis.com/projects/${module.project-services.project_id}/datasets/${local.datascan_dataset}/tables/thelook_ecommerce_orders" 206 | } 207 | 208 | execution_spec { 209 | trigger { 210 | on_demand {} 211 | } 212 | } 213 | 214 | data_quality_spec { 215 | rules { 216 | column = "order_id" 217 | dimension = "COMPLETENESS" 218 | name = "non-null" 219 | description = "Sample rule for non-null column" 220 | threshold = 1.0 221 | non_null_expectation {} 222 | } 223 | 224 | rules { 225 | column = "user_id" 226 | dimension = "COMPLETENESS" 227 | name = "non-null" 228 | description = "Sample rule for non-null column" 229 | threshold = 1.0 230 | non_null_expectation {} 231 | } 232 | 233 | rules { 234 | column = "created_at" 235 | dimension = "COMPLETENESS" 236 | name = "non-null" 237 | description = "Sample rule for non-null column" 238 | threshold = 1.0 239 | non_null_expectation {} 240 | } 241 | 242 | rules { 243 | column = "order_id" 244 | dimension = "UNIQUENESS" 245 | name = "unique" 246 | description = "Sample rule for a unique column" 247 | uniqueness_expectation {} 248 | } 249 | 250 | rules { 251 | column = "status" 252 | dimension = "VALIDITY" 253 | name = "one-of-set" 254 | description = "Sample rule for values in a set" 255 | ignore_null = false 256 | set_expectation { 257 | values = ["Shipped", "Complete", "Processing", "Cancelled", "Returned"] 258 | } 259 | } 260 | 261 | rules { 262 | column = "num_of_item" 263 | dimension = "VALIDITY" 264 | name = "range-values" 265 | description = "Sample rule for values in a range" 266 | ignore_null = false 267 | threshold = 0.99 268 | range_expectation { 269 | max_value = 1 270 | strict_max_enabled = false 271 | strict_min_enabled = false 272 | } 273 | } 274 | 275 | rules { 276 | dimension = "VALIDITY" 277 | name = "non-empty-table" 278 | description = "Sample rule for a non-empty table" 279 | table_condition_expectation { 280 | sql_expression = "COUNT(*) > 0" 281 | } 282 | } 283 | } 284 | 285 | depends_on = [time_sleep.wait_for_dataplex_discovery] 286 | } 287 | -------------------------------------------------------------------------------- /src/ipynb/spark_langchain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8c79ba30-439d-4064-a2ac-859ea887ce75", 6 | "metadata": {}, 7 | "source": [ 8 | "## Tutorial" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "b339cf14-d890-458e-9587-21a788936ab2", 14 | "metadata": {}, 15 | "source": [ 16 | "### Install Langchain\n", 17 | "\n", 18 | "Install the `langchain`, `langchain-experimental`, and `langchain-google-genai` libraries. You can install these directly into your Spark Serverless environment.
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "23bdf160-2885-43c3-86d3-cfb3f605eb82", 25 | "metadata": { 26 | "tags": [] 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "pip install langchain langchain-experimental langchain-google-genai" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "f2dbf357-d8b3-4038-84d2-19b06b49bf9a", 36 | "metadata": {}, 37 | "source": [ 38 | "### Create an API key\n", 39 | "\n", 40 | "Create an API key using Google AI Studio. Run the next cell and paste the API key in when prompted.\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "63b1d896-164e-4c56-87d2-04447ee9d628", 47 | "metadata": { 48 | "tags": [] 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "from getpass import getpass\n", 53 | "\n", 54 | "api_key = getpass()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "c4709327-ca03-4b31-9a86-4df91c4dd788", 60 | "metadata": { 61 | "tags": [] 62 | }, 63 | "source": [ 64 | "### Import required libraries" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "id": "ce553641-3093-4016-9e17-20a76e4e2aa9", 71 | "metadata": { 72 | "tags": [] 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "from langchain_experimental.agents.agent_toolkits import create_spark_dataframe_agent\n", 77 | "from langchain_google_genai import GoogleGenerativeAI\n", 78 | "from pyspark.sql import SparkSession" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "3f2e2e70-f2ab-4394-9332-8d6499ea3bb9", 84 | "metadata": { 85 | "tags": [] 86 | }, 87 | "source": [ 88 | "### Create a connection to the Gemini model service\n", 89 | "\n", 90 | "Create an LLM object using the `GoogleGenerativeAI` class which creates a connection to the Gemini model service." 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "d3d47140-434a-4a0a-96e1-d62b9192e519", 97 | "metadata": { 98 | "tags": [] 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "llm = GoogleGenerativeAI(model=\"gemini-1.5-pro\", temperature=0.0, google_api_key=api_key)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "588bf4b5-e5f9-4437-923e-11430713cddf", 108 | "metadata": {}, 109 | "source": [ 110 | "Use `llm.invoke` to ask Gemini a question and confirm your connection to the service." 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "7d713cb8-c656-4285-9050-b2b6a9d2c1b2", 117 | "metadata": { 118 | "tags": [] 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "print(llm.invoke(\"What is the best programming language?\"))" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "f58b9819-2142-43fb-ab40-d1ff82d8bf8f", 128 | "metadata": {}, 129 | "source": [ 130 | "### Create a Spark Session\n", 131 | "\n", 132 | "Create a connection to the Spark context in your environment." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "ee16d694-4652-4c38-98ec-88a24df5e175", 139 | "metadata": { 140 | "tags": [] 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "spark = SparkSession.builder.getOrCreate()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "id": "f1d266c0-c32b-437e-a36a-79a339288c65", 150 | "metadata": { 151 | "tags": [] 152 | }, 153 | "source": [ 154 | "### Load data\n", 155 | "\n", 156 | "Load your BigLake table `gcp_primary_staging.thelook_ecommerce_order_items` into your environment. 
This table contains ecommerce orders." 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "id": "e0288a55-d7c5-4843-b5bd-c510b82a549c", 163 | "metadata": { 164 | "tags": [] 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "df = spark.read.format(\"bigquery\").load(\"next-2024-spark-demo.gcp_primary_staging.thelook_ecommerce_order_items\")" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "id": "93f0f829-6506-4607-95cf-30882ce7bd15", 174 | "metadata": { 175 | "tags": [] 176 | }, 177 | "source": [ 178 | "View some of the data" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "id": "38c40cda-092e-4f8e-bd7e-2be47559e8a6", 185 | "metadata": { 186 | "tags": [] 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "df.show(10)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "id": "5825a341-2230-4e85-b482-9b378979c13e", 196 | "metadata": { 197 | "tags": [] 198 | }, 199 | "source": [ 200 | "Use the `create_spark_dataframe_agent` method to configure a LangChain agent using the loaded dataset and Gemini model. The `verbose=True` parameter sends the steps the agent is taking to stdout; omitting this parameter suppresses this output." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "id": "3986ec6b-4dd8-4102-92b0-74479da4351b", 207 | "metadata": { 208 | "tags": [] 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "agent = create_spark_dataframe_agent(llm=llm, df=df, verbose=True)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "id": "50f7bd72-4082-40c9-adc6-1312490d8a1d", 218 | "metadata": {}, 219 | "source": [ 220 | "Use natural language to gain insights into your data. To start with something simple, ask for the order_id and the price of the most expensive order." 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "id": "82cc9524-5d36-46f6-b8cc-21c6849708cd", 227 | "metadata": { 228 | "tags": [] 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "agent.invoke(\"what was the order id and the price of the most expensive order?\")" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "9172275f-51cb-4fdc-b803-a34c28aca7f8", 238 | "metadata": {}, 239 | "source": [ 240 | "With the verbose parameter set to True, we can see exactly how the agent is working. The agent generates code based on the schema of the dataframe and executes it. It doesn't always get it on the first try, but it is able to learn from the errors it sees to adjust and correct until it lands on an acceptable answer.\n", 241 | "\n", 242 | "Next, make a request that involves the agent importing new functions." 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "id": "fa86d21d-b52f-4797-a7bd-6102b33d49bc", 249 | "metadata": { 250 | "tags": [] 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "agent.invoke(\"What week of the year has the total highest sales overall?\")" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "id": "84fba518-fd15-4dc2-96db-c51cb84c6138", 260 | "metadata": {}, 261 | "source": [ 262 | "Now, you probably don't want to include this natural language prompt directly in a production environment. Instead, we can ask Gemini to generate the PySpark code for us that would create the same output."
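,
"\n",
"For reference, a hand-written sketch of the kind of code the agent might produce (the `created_at` and `sale_price` column names are assumptions about this table's schema, not output captured from the agent):\n",
"\n",
"```python\n",
"from pyspark.sql import functions as F\n",
"\n",
"# Total sales per ISO week, highest week first.\n",
"(df.withColumn(\"week\", F.weekofyear(\"created_at\"))\n",
"   .groupBy(\"week\")\n",
"   .agg(F.sum(\"sale_price\").alias(\"total_sales\"))\n",
"   .orderBy(F.desc(\"total_sales\"))\n",
"   .show(1))\n",
"```"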
263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "id": "16b41a6a-482c-4ce8-aa63-4e1c7d9de305", 269 | "metadata": { 270 | "tags": [] 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "agent.invoke(\"Print the PySpark code that answers 'What week of the year has the total highest sales overall?' Include all necessary imports.\")" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "id": "ccc75dce-bfab-4652-850c-2242dc28f9a1", 280 | "metadata": {}, 281 | "source": [ 282 | "As with anything created by still-maturing LLM technology, review generated code for accuracy." 283 | ] 284 | } 285 | ], 286 | "metadata": { 287 | "environment": { 288 | "kernel": "9c39b79e5d2e7072beb4bd59-next-2024", 289 | "name": "workbench-notebooks.m113", 290 | "type": "gcloud", 291 | "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m113" 292 | }, 293 | "kernelspec": { 294 | "display_name": "next-2024 on Serverless Spark (Remote)", 295 | "language": "python", 296 | "name": "9c39b79e5d2e7072beb4bd59-next-2024" 297 | }, 298 | "language_info": { 299 | "codemirror_mode": { 300 | "name": "ipython", 301 | "version": 3 302 | }, 303 | "file_extension": ".py", 304 | "mimetype": "text/x-python", 305 | "name": "python", 306 | "nbconvert_exporter": "python", 307 | "pygments_lexer": "ipython3", 308 | "version": "3.12.2" 309 | } 310 | }, 311 | "nbformat": 4, 312 | "nbformat_minor": 5 313 | } 314 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below).
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on 6 | [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 7 | and this project adheres to 8 | [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
9 | This changelog is generated automatically based on [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/). 10 | 11 | ## [0.4.0](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/compare/v0.3.0...v0.4.0) (2024-01-23) 12 | 13 | 14 | ### Features 15 | 16 | * add bucket for PHS created in Spark Serverless Interactive Tutorial ([e087195](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/e08719526836af1e4197ef55005b3291920b7909)) 17 | * adding sparkml notebook ([#99](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/99)) ([4b2169a](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/4b2169a11be058d495884a4ee455f49ef109b754)) 18 | * adding unit tests, removing unused arg from README ([#93](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/93)) ([bb9257b](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/bb9257b975d7b9635cb249f1a3867c5c0a14369b)) 19 | * create a bucket for dataplex ([#76](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/76)) ([ccadcc0](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/ccadcc0667d1b3e81f7f093c2a0acc83e567120a)) 20 | * **deps:** Update Terraform Google Provider to v5 (major) ([#79](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/79)) ([40ab09d](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/40ab09d2006f6052740afffc5df7cdaf06352c16)) 21 | 22 | 23 | ### Bug Fixes 24 | 25 | * add service account to phs cluster ([#82](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/82)) ([04a9fae](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/04a9fae8e4f1bb9cbe1a420bb9c89c79d1849ddb)) 26 | * add unique hash to the service account name ([#71](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/71)) ([c16912d](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/c16912d7c3d182671dceac4067ba196aa814948a)) 27 | * change data file paths to point to root directory ([#60](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/60)) ([4621da0](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/4621da033d56f88bb0c03948b1c2e0c5108c297d)) 28 | * **deps:** update module github.com/googlecloudplatform/cloud-foundation-toolkit/infra/blueprint-test to v0.8.0 ([#63](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/63)) ([54075a5](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/54075a59ef58fe8a156cac8f36f295ee149125a2)) 29 | * **deps:** update terraform google-beta to v4.74.0 ([#57](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/57)) ([f3848c3](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/f3848c3a71518930b94c582406d3100a0e29bcde)) 30 | * **deps:** update terraform google-beta to v4.75.0 ([#58](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/58)) ([10a452e](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/10a452e4f0e612f4ce63deb7559fc4c45bed3be0)) 31 | * **deps:** Update Terraform google-beta to v4.81.0 
([#66](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/66)) ([825fd7d](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/825fd7d163e361711c7a23c14b68b65125def50a)) 32 | * **deps:** Update Terraform google-beta to v4.82.0 ([#70](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/70)) ([cc8373f](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/cc8373fdda84690982c6c928480d67dfacb3d979)) 33 | * **deps:** Update Terraform google-beta to v4.83.0 ([#73](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/73)) ([a2cabdb](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/a2cabdb10ec92242f8d17c72f8734a47937fa7e6)) 34 | * **deps:** Update Terraform google-beta to v4.84.0 ([#74](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/74)) ([c70d9af](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/c70d9af5958fd6a6a792f1ebeee542b2f21ddb1b)) 35 | * **deps:** Update Terraform terraform-google-modules/project-factory/google to v14.3.0 ([#65](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/65)) ([a59521a](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/a59521a0f7017cee43f166677ab50546245504e2)) 36 | * **deps:** Update Terraform terraform-google-modules/project-factory/google to v14.4.0 ([#87](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/87)) ([8ca39d1](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/8ca39d18d09f6c0e5e08faa9ee5392b857d7fd96)) 37 | * remove compute instance check from integration test teardown ([#110](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/110)) ([e07095d](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/e07095df965d032a8d64f83fa5511f442cc9c433)) 38 | * rolling back PHS creation in deployment ([#105](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/105)) ([f5acf8e](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/f5acf8e9289422c549ea0243f0db5f8d3972399b)) 39 | * set staging and temp bucket for phs cluster ([#88](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/88)) ([c7ff112](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/c7ff1121a5a38645531c3eb44201c08bb6407713)) 40 | * Update Terraform versioning, improve dependency tree, remove unused table, add Managed Tables to Dataplex Assets ([#72](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/72)) ([9283feb](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/9283febc691cb313b97adc242dc38605dc3976d4)) 41 | * wait for Dataplex IAM to create lake ([#86](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/86)) ([9f42b95](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/9f42b95015f6bb65ee67c9b5ada2e06a8b9a3274)) 42 | * wait to create dataproc cluster until SA roles are assigned ([#91](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/91)) ([66bb99b](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/66bb99b2be3801abe86220f0a331c18b29bbe577)) 43 | 44 | ## 
[0.3.0](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/compare/v0.2.1...v0.3.0) (2023-07-18) 45 | 46 | 47 | ### Bug Fixes 48 | 49 | * **deps:** update terraform google-beta to v4.70.0 ([6460f59](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/6460f59c1bd6464dbb46b5561ee4ffa0109f75ff)) 50 | * **deps:** update terraform google-beta to v4.71.0 ([c64944b](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/c64944b1a7e3c73c87bb0bcb49696cc9b8693084)) 51 | * **deps:** update terraform google-beta to v4.73.1 ([47c1b4f](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/47c1b4f60367404c303c945d3b5dd46a0a378815)) 52 | * **deps:** update terraform google-beta to v4.73.2 ([f6f8cb8](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/f6f8cb836f6d2d67ba775b795778b754893bcca0)) 53 | * **deps:** update terraform terraform-google-modules/project-factory/google to v14.2.1 ([a6ca8a1](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/a6ca8a13dc0dbfc79683c5e43b43593957407064)) 54 | * upgrade dataplex tables to managed, create new zone, remove manual table creation ([52a45f2](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/52a45f2aee107dfd6fde04ce92e77cf7b61c4e5c)) 55 | 56 | ## [0.2.1](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/compare/v0.2.0...v0.2.1) (2023-06-22) 57 | 58 | 59 | ### Bug Fixes 60 | 61 | * update neos toc url ([#47](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/47)) ([629f00b](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/629f00b679faf1f29c676514f0ef7869c7b9ee8a)) 62 | 63 | ## [0.2.0](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/compare/v0.1.0...v0.2.0) (2023-06-14) 64 | 65 | 66 | ### Features 67 | 68 | * add polling logic to Spark workflow ([9ea1517](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/9ea151703ccdfb13998d1220f29885a55aeae547)) 69 | * adds metadata generation for the blueprint ([#34](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/34)) ([ef1b35c](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/ef1b35cf28d897cae3beff4dd4200617be902d20)) 70 | 71 | 72 | ### Bug Fixes 73 | 74 | * **deps:** update terraform google-beta to v4.69.1 ([28a034d](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/28a034d2115a0982ed3b5df02e7f91be696e8e33)) 75 | * **deps:** update terraform googles to <= 4.69.0, != 4.65.0, != 4.65.1 ([9a9852e](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/9a9852e7084ae0d3e0699437ea8ec78817f33104)) 76 | * **deps:** update terraform terraform-google-modules/project-factory/google to v14 ([e5e5d00](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/e5e5d00774ee5f7881b799fbb4ad435094b3087c)) 77 | * refactor references from 'assets' directory to 'src' ([acf7efb](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/acf7efba619230102e7691778ab69e47facc27aa)) 78 | * Update int.cloudbuild.yaml to use LR billing ([#43](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/43)) 
([1d0ddc7](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/1d0ddc78ec473c7ca2c0863a9abdf1da2edc15f7)) 79 | 80 | ## 0.1.0 (2023-05-17) 81 | 82 | 83 | ### Features 84 | 85 | * output.tf additions ([#14](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/14)) ([07d4ea4](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/07d4ea4afd488c5df6899529fb60556a93aaaca7)) 86 | 87 | 88 | ### Bug Fixes 89 | 90 | * Biglake cleanup ([#10](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/10)) ([98646d8](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/98646d8f305554749f5afd7ab46e790f97d527fd)) 91 | * formatting and linting ([#12](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/12)) ([5e55357](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/5e553573532115bd7888600dc0c1565f79ef5b53)) 92 | * Lakehouse cleanup ([#9](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/9)) ([c474b66](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/c474b665018babe96ab897a1a338b703ac0a3b95)) 93 | * move RAP to Neos ([#24](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/24)) ([4a2aeb6](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/4a2aeb60a32f4bc79d08f008ad69bf2bc03a3792)) 94 | * pin google provider version to before 4.65 or not equal to 4.65 ([0510153](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/0510153a1849ff5f134a28cb7569f2970c142e93)) 95 | * pin google provider version to v4.64.0 ([32a83ba](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/32a83bac28f6c50de009d15333cc3ac61fc5be0a)) 96 | * update colab link ([#16](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/16)) ([20ef826](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/20ef8266bd0c70f35625008c3806a33099ded396)) 97 | * update neos, remove solution guide output ([7357552](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/735755295278b6c89cc9dbbe811f109bf96d8b52)) 98 | * Workflow dependency ([#23](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/issues/23)) ([6e2b2df](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/6e2b2df7eba67ac2403da0a80c85a5ae99e067e9)) 99 | * workflows and remove hardcoding ([675b35c](https://github.com/GoogleCloudPlatform/terraform-google-analytics-lakehouse/commit/675b35ce15db043204dd4bcfaa73faffe2933164)) 100 | -------------------------------------------------------------------------------- /test/integration/go.sum: -------------------------------------------------------------------------------- 1 | github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test v0.17.6 h1:ZuWhmUXY/co2jqEUYYosTlAruqzATzrYQ4IV5VKiKNM= 2 | github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test v0.17.6/go.mod h1:UX+iYMTzZ7Ik6N5rD8U32x7QwKaGyG/aAflWWDaHMDc= 3 |
github.com/agext/levenshtein v1.2.3 h1:YB2fHEn0UJagG8T1rrWknE3ZQzWM06O8AMAatNn7lmo= 4 | github.com/agext/levenshtein v1.2.3/go.mod h1:JEDfjyjHDjOF/1e4FlBE/PkbqA9OfWu2ki2W0IB5558= 5 | github.com/alexflint/go-filemutex v1.3.0 h1:LgE+nTUWnQCyRKbpoceKZsPQbs84LivvgwUymZXdOcM= 6 | github.com/alexflint/go-filemutex v1.3.0/go.mod h1:U0+VA/i30mGBlLCrFPGtTe9y6wGQfNAWPBTekHQ+c8A= 7 | github.com/apparentlymart/go-textseg/v15 v15.0.0 h1:uYvfpb3DyLSCGWnctWKGj857c6ew1u1fNQOlOtuGxQY= 8 | github.com/apparentlymart/go-textseg/v15 v15.0.0/go.mod h1:K8XmNZdhEBkdlyDdvbmmsvpAG721bKi0joRfFdHIWJ4= 9 | github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d h1:xDfNPAt8lFiC1UJrqV3uuy861HCTo708pDMbjHHdCas= 10 | github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d/go.mod h1:6QX/PXZ00z/TKoufEY6K/a0k6AhaJrQKdFe6OfVXsa4= 11 | github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= 12 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 13 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 14 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= 15 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 16 | github.com/go-errors/errors v1.5.0 h1:/EuijeGOu7ckFxzhkj4CXJ8JaenxK7bKUxpPYqeLHqQ= 17 | github.com/go-errors/errors v1.5.0/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= 18 | github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= 19 | github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= 20 | github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= 21 | github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= 22 | github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= 23 | github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= 24 | github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= 25 | github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= 26 | github.com/go-test/deep v1.0.7 h1:/VSMRlnY/JSyqxQUzQLKVMAskpY/NZKFA5j2P+0pP2M= 27 | github.com/go-test/deep v1.0.7/go.mod h1:QV8Hv/iy04NyLBxAdO9njL0iVPN1S4d/A3NVv1V36o8= 28 | github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= 29 | github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= 30 | github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 31 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 32 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 33 | github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= 34 | github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= 35 | github.com/gruntwork-io/terratest v0.48.2 h1:+VwfODchq8jxZZWD+s8gBlhD1z6/C4bFLNrhpm9ONrs= 36 | github.com/gruntwork-io/terratest v0.48.2/go.mod h1:Y5ETyD4ZQ2MZhasPno272fWuCpKwvTPYDi8Y0tIMqTE= 37 | github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= 38 | github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= 39 | github.com/hashicorp/errwrap v1.1.0/go.mod 
h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= 40 | github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= 41 | github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= 42 | github.com/hashicorp/go-getter/v2 v2.2.3 h1:6CVzhT0KJQHqd9b0pK3xSP0CM/Cv+bVhk+jcaRJ2pGk= 43 | github.com/hashicorp/go-getter/v2 v2.2.3/go.mod h1:hp5Yy0GMQvwWVUmwLs3ygivz1JSLI323hdIE9J9m7TY= 44 | github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= 45 | github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= 46 | github.com/hashicorp/go-safetemp v1.0.0 h1:2HR189eFNrjHQyENnQMMpCiBAsRxzbTMIgBhEyExpmo= 47 | github.com/hashicorp/go-safetemp v1.0.0/go.mod h1:oaerMy3BhqiTbVye6QuFhFtIceqFoDHxNAB65b+Rj1I= 48 | github.com/hashicorp/go-version v1.7.0 h1:5tqGy27NaOTB8yJKUZELlFAS/LTKJkrmONwQKeRZfjY= 49 | github.com/hashicorp/go-version v1.7.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= 50 | github.com/hashicorp/hcl v0.0.0-20170504190234-a4b07c25de5f h1:UdxlrJz4JOnY8W+DbLISwf2B8WXEolNRA8BGCwI9jws= 51 | github.com/hashicorp/hcl v0.0.0-20170504190234-a4b07c25de5f/go.mod h1:oZtUIOe8dh44I2q6ScRibXws4Ajl+d+nod3AaR9vL5w= 52 | github.com/hashicorp/hcl/v2 v2.22.0 h1:hkZ3nCtqeJsDhPRFz5EA9iwcG1hNWGePOTw6oyul12M= 53 | github.com/hashicorp/hcl/v2 v2.22.0/go.mod h1:62ZYHrXgPoX8xBnzl8QzbWq4dyDsDtfCRgIq1rbJEvA= 54 | github.com/hashicorp/terraform-config-inspect v0.0.0-20250203082807-efaa306e97b4 h1:6zYoI+NGpRPo0UjbnJfmqqTFcTEKvbv77h0ZcgeLXJs= 55 | github.com/hashicorp/terraform-config-inspect v0.0.0-20250203082807-efaa306e97b4/go.mod h1:Gz/z9Hbn+4KSp8A2FBtNszfLSdT2Tn/uAKGuVqqWmDI= 56 | github.com/hashicorp/terraform-json v0.24.0 h1:rUiyF+x1kYawXeRth6fKFm/MdfBS6+lW4NbeATsYz8Q= 57 | github.com/hashicorp/terraform-json v0.24.0/go.mod h1:Nfj5ubo9xbu9uiAoZVBsNOjvNKB66Oyrvtit74kC7ow= 58 | github.com/jinzhu/copier v0.4.0 h1:w3ciUoD19shMCRargcpm0cm91ytaBhDvuRpz1ODO/U8= 59 | github.com/jinzhu/copier v0.4.0/go.mod h1:DfbEm0FYsaqBcKcFuvmOZb218JkPGtvSHsKg8S8hyyg= 60 | github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= 61 | github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= 62 | github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= 63 | github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= 64 | github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= 65 | github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= 66 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 67 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 68 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 69 | github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= 70 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 71 | github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= 72 | github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= 73 | github.com/mattn/go-shellwords v1.0.12 h1:M2zGm7EW6UQJvDeQxo4T51eKPurbeFbe8WtebGE2xrk= 74 | github.com/mattn/go-shellwords v1.0.12/go.mod h1:EZzvwXDESEeg03EKmM+RmDnNOPKG4lLtQsUlTZDWQ8Y= 75 | github.com/mattn/go-zglob v0.0.4 h1:LQi2iOm0/fGgu80AioIJ/1j9w9Oh+9DZ39J4VAGzHQM= 76 | github.com/mattn/go-zglob 
v0.0.4/go.mod h1:MxxjyoXXnMxfIpxTK2GAkw1w8glPsQILx3N5wrKakiY= 77 | github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= 78 | github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= 79 | github.com/mitchellh/go-testing-interface v1.14.2-0.20210821155943-2d9075ca8770 h1:drhDO54gdT/a15GBcMRmunZiNcLgPiFIJa23KzmcvcU= 80 | github.com/mitchellh/go-testing-interface v1.14.2-0.20210821155943-2d9075ca8770/go.mod h1:SO/iHr6q2EzbqRApt+8/E9wqebTwQn5y+UlB04bxzo0= 81 | github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQflz0v0= 82 | github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0= 83 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 84 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= 85 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 86 | github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= 87 | github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= 88 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 89 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= 90 | github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= 91 | github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= 92 | github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 93 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 94 | github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= 95 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= 96 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 97 | github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= 98 | github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= 99 | github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= 100 | github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= 101 | github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= 102 | github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= 103 | github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= 104 | github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= 105 | github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= 106 | github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= 107 | github.com/tmccombs/hcl2json v0.6.4 h1:/FWnzS9JCuyZ4MNwrG4vMrFrzRgsWEOVi+1AyYUVLGw= 108 | github.com/tmccombs/hcl2json v0.6.4/go.mod h1:+ppKlIW3H5nsAsZddXPy2iMyvld3SHxyjswOZhavRDk= 109 | github.com/ulikunitz/xz v0.5.11 h1:kpFauv27b6ynzBNT/Xy+1k+fK4WswhN/6PN5WhFAGw8= 110 | github.com/ulikunitz/xz v0.5.11/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= 111 | github.com/zclconf/go-cty v1.15.1 h1:RgQYm4j2EvoBRXOPxhUvxPzRrGDo1eCOhHXuGfrj5S0= 112 | github.com/zclconf/go-cty v1.15.1/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE= 113 
| github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940 h1:4r45xpDWB6ZMSMNJFMOjqrGHynW3DIBuR2H9j0ug+Mo= 114 | github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940/go.mod h1:CmBdvvj3nqzfzJ6nTCIwDTPZ56aVGvDrmztiO5g3qrM= 115 | golang.org/x/crypto v0.35.0 h1:b15kiHdrGCHrP6LvwaQ3c03kgNhhiMgvlhxHQhmg2Xs= 116 | golang.org/x/crypto v0.35.0/go.mod h1:dy7dXNW32cAb/6/PRuTNsix8T+vJAqvuIy5Bli/x0YQ= 117 | golang.org/x/mod v0.23.0 h1:Zb7khfcRGKk+kqfxFaP5tZqCnDZMjC5VtUBs87Hr6QM= 118 | golang.org/x/mod v0.23.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= 119 | golang.org/x/net v0.36.0 h1:vWF2fRbw4qslQsQzgFqZff+BItCvGFQqKzKIzx1rmoA= 120 | golang.org/x/net v0.36.0/go.mod h1:bFmbeoIPfrw4sMHNhb4J9f6+tPziuGjq7Jk/38fxi1I= 121 | golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w= 122 | golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 123 | golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 124 | golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= 125 | golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 126 | golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU= 127 | golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s= 128 | golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= 129 | golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= 130 | golang.org/x/tools v0.26.0 h1:v/60pFQmzmT9ExmjDv2gGIfi3OqfKoEP6I5+umXlbnQ= 131 | golang.org/x/tools v0.26.0/go.mod h1:TPVVj70c7JJ3WCazhD8OdXcZg/og+b9+tH/KxylGwH0= 132 | google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= 133 | google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= 134 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 135 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 136 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 137 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 138 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 139 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 140 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 141 | k8s.io/kube-openapi v0.0.0-20241212222426-2c72e554b1e7 h1:hcha5B1kVACrLujCKLbr8XWMxCxzQx42DY8QKYJrDLg= 142 | k8s.io/kube-openapi v0.0.0-20241212222426-2c72e554b1e7/go.mod h1:GewRfANuJ70iYzvn+i4lezLDAFzvjxZYK1gn1lWcfas= 143 | sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= 144 | sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= 145 | sigs.k8s.io/kustomize/kyaml v0.19.0 h1:RFge5qsO1uHhwJsu3ipV7RNolC7Uozc0jUBC/61XSlA= 146 | sigs.k8s.io/kustomize/kyaml v0.19.0/go.mod h1:FeKD5jEOH+FbZPpqUghBP8mrLjJ3+zD3/rf9NNu1cwY= 147 | sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= 148 | sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= 149 | -------------------------------------------------------------------------------- /src/ipynb/spark_ml.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "4d6c84304016" 7 | }, 8 | "source": [ 9 | "# SparkML with Dataproc Serverless" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "id": "3eee516156f1" 16 | }, 17 | "source": [ 18 | "## Overview\n", 19 | "\n", 20 | "This notebook tutorial demonstrates the execution of Apache SparkML jobs using Dataproc Serverless. This example machine learning pipeline ingests the [NYC TLC (Taxi and Limousine Commission) Trips](https://console.cloud.google.com/marketplace/product/city-of-new-york/nyc-tlc-trips) dataset from your lakehouse and performs cleaning, feature engineering, model training, and model evaluation to calculate trip duration.\n", 21 | "\n", 22 | "The tutorial uses the following Google Cloud products:\n", 23 | "- `Dataproc`\n", 24 | "- `BigQuery`\n", 25 | "- `Vertex AI Training`\n", 26 | "- `BigLake`" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "id": "2773979cd11d" 33 | }, 34 | "source": [ 35 | "## Tutorial\n", 36 | "\n", 37 | "### Set your project ID, location, and session ID" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": { 44 | "id": "20366a83e3f1", 45 | "tags": [] 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "# Retrieve the current active project and store it as a list of strings.\n", 50 | "PROJECT_ID = !gcloud config get-value project\n", 51 | "\n", 52 | "# Extract the project ID from the list.\n", 53 | "PROJECT_ID = PROJECT_ID[0] if PROJECT_ID else None\n", 54 | "\n", 55 | "# Retrieve the current location.\n", 56 | "LOCATION = !gcloud compute instances list --project={PROJECT_ID} --format='get(ZONE)'\n", 57 | "LOCATION = str(LOCATION).split(\"/\")[-1][:-4]\n", 58 | "\n", 59 | "# Get the name of the active Dataproc Serverless Session\n", 60 | "SESSION = !gcloud beta dataproc sessions list --location='{LOCATION}' --filter='state=ACTIVE' --format='get(SESSION_ID)' --sort-by='~createTime'\n", 61 | "SESSION = SESSION[0].split('/')[-1] if SESSION else None" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": { 67 | "id": "73f5734153d4" 68 | }, 69 | "source": [ 70 | "### Get a Cloud Storage bucket URI" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "id": "06cb3320201b", 78 | "tags": [] 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "# Define the prefix of the bucket created via Terraform.\n", 83 | "BUCKET_PREFIX = \"gcp-lakehouse-model\"\n", 84 | "\n", 85 | "# Retrieve the Cloud Storage bucket URI for storing the machine learning model.\n", 86 | "BUCKET_URI = !gcloud storage buckets list --format='value(name)' --filter='name:{BUCKET_PREFIX}*'\n", 87 | "\n", 88 | "# Extract the bucket URI from the list.\n", 89 | "BUCKET_URI = BUCKET_URI[0] if BUCKET_URI else None" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "id": "34dd442fd5af" 96 | }, 97 | "source": [ 98 | "### Import required libraries" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "id": "80ef02298a93", 106 | "tags": [] 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "import matplotlib.pyplot as plt\n", 111 | "import seaborn as sns\n", 112 | "from geopandas import gpd\n", 113 | "from pyspark.ml.evaluation import RegressionEvaluator\n", 114 | "from pyspark.ml.feature import VectorAssembler\n", 115 | "from pyspark.ml.regression import GBTRegressor\n", 116 | "# A Spark Session is how you 
interact with Spark SQL to create DataFrames\n", 117 | "from pyspark.sql import SparkSession\n", 118 | "# PySpark functions\n", 119 | "from pyspark.sql.functions import col, floor, unix_timestamp" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": { 125 | "id": "2b6fb4d7c7f5" 126 | }, 127 | "source": [ 128 | "### Initialize the SparkSession\n", 129 | "\n", 130 | "Use the [spark-bigquery-connector](https://github.com/GoogleCloudDataproc/spark-bigquery-connector) to read and write data between Apache Spark and BigQuery." 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "id": "3ce77cd7c0d2", 138 | "tags": [] 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "VER = \"0.34.0\"\n", 143 | "FILE_NAME = f\"spark-bigquery-with-dependencies_2.12-{VER}.jar\"\n", 144 | "connector = f\"gs://spark-lib/bigquery/{FILE_NAME}\"\n", 145 | "\n", 146 | "# Initialize the SparkSession.\n", 147 | "spark = (\n", 148 | "    SparkSession.builder.appName(\"spark-ml-taxi\")\n", 149 | "    .config(\"spark.jars\", connector)\n", 150 | "    .config(\"spark.logConf\", \"false\")\n", 151 | "    .getOrCreate()\n", 152 | ")" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": { 158 | "id": "3a4080065c8b" 159 | }, 160 | "source": [ 161 | "### Fetch data\n", 162 | "\n", 163 | "Load the table `gcp_primary_staging.new_york_taxi_trips_tlc_yellow_trips_2022`." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "id": "4a5f19a732ed", 171 | "tags": [] 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "# Load the NYC taxi trips table from BigQuery.\n", 176 | "taxi_df = (\n", 177 | "    spark.read.format(\"bigquery\")\n", 178 | "    .option(\n", 179 | "        \"table\",\n", 180 | "        f\"{PROJECT_ID}.gcp_primary_staging.new_york_taxi_trips_tlc_yellow_trips_2022\",\n", 181 | "    )\n", 182 | "    .load()\n", 183 | ")\n", 184 | "\n", 185 | "# Sample parameter. Increase or decrease to experiment with different data sizes.\n", 186 | "FRACTION = 0.05\n", 187 | "\n", 188 | "# Sample data to minimize the runtime.\n", 189 | "taxi_df = taxi_df.sample(fraction=FRACTION, seed=42)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": { 195 | "id": "26cef587d4de" 196 | }, 197 | "source": [ 198 | "### Perform Exploratory Data Analysis (EDA)\n", 199 | "\n", 200 | "Perform EDA to uncover more information about your data." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "id": "37d76ec684b9", 208 | "tags": [] 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "taxi_df.printSchema()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": { 218 | "id": "64d2006cab0f" 219 | }, 220 | "source": [ 221 | "Select and modify necessary columns."
222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "id": "a0aa977e3d27", 229 | "tags": [] 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "# Choose necessary columns.\n", 234 | "COLUMNS_TO_SELECT = [\n", 235 | " \"start_time\",\n", 236 | " \"end_time\",\n", 237 | " \"passenger_count\",\n", 238 | " \"trip_distance\",\n", 239 | " \"trip_duration\",\n", 240 | " \"fare_amount\",\n", 241 | " \"extra\",\n", 242 | " \"mta_tax\",\n", 243 | " \"tip_amount\",\n", 244 | " \"tolls_amount\",\n", 245 | " \"imp_surcharge\",\n", 246 | " \"airport_fee\",\n", 247 | " \"total_amount\",\n", 248 | " \"start_zone_id\",\n", 249 | " \"end_zone_id\",\n", 250 | "]\n", 251 | "\n", 252 | "# Convert pickup_location_id and dropoff_location_id to integers for a later processing step:\n", 253 | "taxi_df = (\n", 254 | " taxi_df.withColumn(\"start_zone_id\", col(\"pickup_location_id\").cast(\"int\")) # Convert pickup_location_id to integer\n", 255 | " .withColumn(\"end_zone_id\", col(\"dropoff_location_id\").cast(\"int\")) # Convert dropoff_location_id to integer\n", 256 | ")\n", 257 | "\n", 258 | "# Convert datetime from string to Unix timestamp:\n", 259 | "taxi_df = (\n", 260 | " taxi_df.withColumn(\"start_time\", unix_timestamp(col(\"pickup_datetime\"))) # Convert pickup_datetime to Unix timestamp\n", 261 | " .withColumn(\"end_time\", unix_timestamp(col(\"dropoff_datetime\"))) # Convert dropoff_datetime to Unix timestamp\n", 262 | ")\n", 263 | "\n", 264 | "# Calculate trip_duration.\n", 265 | "taxi_df = taxi_df.withColumn(\"trip_duration\", col(\"end_time\") - col(\"start_time\"))\n", 266 | "\n", 267 | "# Select the specified columns:\n", 268 | "taxi_df = taxi_df.select(*COLUMNS_TO_SELECT) # Selects columns based on the list in COLUMNS_TO_SELECT\n", 269 | "\n", 270 | "# Display summary statistics and preview the modified DataFrame.\n", 271 | "taxi_df.describe().show()" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": { 277 | "id": "970761b1f949" 278 | }, 279 | "source": [ 280 | "Build a boxplot to further assess the data." 
281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "# Convert Spark DataFrame into a Pandas DataFrame.\n", 290 | "taxi_pd = taxi_df.toPandas()\n", 291 | "\n", 292 | "# Define the columns to convert to a numeric type in Pandas and visualize.\n", 293 | "PD_COLUMNS = [\n", 294 | "    \"trip_distance\",\n", 295 | "    \"fare_amount\",\n", 296 | "    \"extra\",\n", 297 | "    \"mta_tax\",\n", 298 | "    \"tip_amount\",\n", 299 | "    \"tolls_amount\",\n", 300 | "    \"imp_surcharge\",\n", 301 | "    \"airport_fee\",\n", 302 | "    \"total_amount\",\n", 303 | "]\n", 304 | "\n", 305 | "# Convert columns of \"object\" type to the float type.\n", 306 | "taxi_pd[PD_COLUMNS] = taxi_pd[PD_COLUMNS].astype(float)\n", 307 | "\n", 308 | "# Box plots and histograms for the specified columns.\n", 309 | "for column in taxi_pd.columns:\n", 310 | "    if column in PD_COLUMNS:\n", 311 | "        _, ax = plt.subplots(1, 2, figsize=(5, 2))\n", 312 | "        taxi_pd[column].plot(kind=\"box\", ax=ax[0])\n", 313 | "        taxi_pd[column].plot(kind=\"hist\", ax=ax[1])\n", 314 | "        plt.title(column)\n", 315 | "        plt.figure()\n", 316 | "plt.show()" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "The summary statistics and boxplots show over 1 million Yellow Taxi trip records from 2022, representing approximately 5% of the total trips.\n", 324 | "\n", 325 | "However, some trip histories have data anomalies. Trips exceeding 10,000 miles are beyond realistic expectations and will be excluded. Additionally, null and negative values in fare, tax, and tolls create inconsistencies and can distort analysis. Filter these values out of the data." 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": { 332 | "id": "a9196e32f245", 333 | "tags": [] 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "taxi_df = taxi_df.where(\n", 338 | "    (col(\"trip_distance\") < 10000)\n", 339 | "    & (col(\"fare_amount\") > 0)\n", 340 | "    & (col(\"extra\") >= 0)\n", 341 | "    & (col(\"mta_tax\") >= 0)\n", 342 | "    & (col(\"tip_amount\") >= 0)\n", 343 | "    & (col(\"tolls_amount\") >= 0)\n", 344 | "    & (col(\"imp_surcharge\") >= 0)\n", 345 | "    & (col(\"airport_fee\") >= 0)\n", 346 | "    & (col(\"total_amount\") > 0)\n", 347 | ").dropna()" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": { 353 | "id": "9a930952e7da" 354 | }, 355 | "source": [ 356 | "### Perform Feature Engineering\n", 357 | "\n", 358 | "While the Taxi dataset contains trips for all NYC boroughs, precise location information is categorized using `NYC Taxi zones`. Use the `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom` public dataset to calculate longitude and latitude values."
359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": { 365 | "id": "7b58375ab96a", 366 | "tags": [] 367 | }, 368 | "outputs": [], 369 | "source": [ 370 | "# Load the NYC Taxi zone geometries from the BigQuery public dataset.\n", 371 | "geo_df = (\n", 372 | "    spark.read.format(\"bigquery\")\n", 373 | "    .option(\"table\", \"bigquery-public-data.new_york_taxi_trips.taxi_zone_geom\")\n", 374 | "    .load()\n", 375 | ")\n", 376 | "\n", 377 | "# Convert Spark DataFrame into Pandas DataFrame to integrate with the GeoPandas library.\n", 378 | "geo_pd = geo_df.toPandas()\n", 379 | "\n", 380 | "# Compute the centroid longitude and latitude of each taxi zone from its WKT geometry.\n", 381 | "geo_pd[\"long\"] = gpd.GeoSeries.from_wkt(geo_pd[\"zone_geom\"]).centroid.x\n", 382 | "geo_pd[\"lat\"] = gpd.GeoSeries.from_wkt(geo_pd[\"zone_geom\"]).centroid.y\n", 383 | "\n", 384 | "# Drop unnecessary columns.\n", 385 | "geo_pd = geo_pd[[\"zone_id\", \"long\", \"lat\"]]\n", 386 | "\n", 387 | "# Convert back to a Spark DataFrame.\n", 388 | "geo_spark_df = spark.createDataFrame(geo_pd)\n", 389 | "\n", 390 | "# Join taxi_df with the geographic coordinates for each start_zone_id and end_zone_id.\n", 391 | "taxi_zone_df = (\n", 392 | "    taxi_df.join(geo_spark_df, taxi_df.start_zone_id == geo_spark_df.zone_id)\n", 393 | "    .withColumnRenamed(\"long\", \"start_long\")\n", 394 | "    .withColumnRenamed(\"lat\", \"start_lat\")\n", 395 | "    .drop(\"zone_id\")\n", 396 | "    .join(geo_spark_df, taxi_df.end_zone_id == geo_spark_df.zone_id)\n", 397 | "    .withColumnRenamed(\"long\", \"end_long\")\n", 398 | "    .withColumnRenamed(\"lat\", \"end_lat\")\n", 399 | "    .drop(\"zone_id\")\n", 400 | ")\n", 401 | "\n", 402 | "# Convert Spark DataFrame into a Pandas DataFrame.\n", 403 | "taxi_pd = taxi_df.toPandas()\n", 404 | "\n", 405 | "# Convert columns of \"object\" type to the float type.\n", 406 | "taxi_pd[\"trip_duration\"] = taxi_pd[\"trip_duration\"].astype(float)\n", 407 | "\n", 408 | "# Box plots and histograms for the specified columns.\n", 409 | "_, ax = plt.subplots(1, 2, figsize=(10, 4))\n", 410 | "taxi_pd[\"trip_duration\"].plot(kind=\"box\", ax=ax[0])\n", 411 | "taxi_pd[\"trip_duration\"].plot(kind=\"hist\", ax=ax[1])\n", 412 | "plt.title(\"trip_duration\")\n", 413 | "plt.figure()\n", 414 | "plt.show()" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": { 420 | "id": "6c42f747c421" 421 | }, 422 | "source": [ 423 | "`trip_duration` also has some extreme values. Remove these." 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": { 430 | "id": "c4dda7df0ec8", 431 | "tags": [] 432 | }, 433 | "outputs": [], 434 | "source": [ 435 | "# Keep trips between different taxi zones where trip_duration is less than 28800 seconds (8 hours).\n", 436 | "taxi_df = taxi_zone_df.where(\n", 437 | "    (col(\"trip_duration\") < 28800) & (col(\"start_zone_id\") != col(\"end_zone_id\"))\n", 438 | ")" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": { 444 | "id": "5f6dfc19b47e" 445 | }, 446 | "source": [ 447 | "Create a scatterplot to see the relationship between `trip_distance` and `trip_duration`."
448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": { 454 | "id": "edfff6a2abbc", 455 | "tags": [] 456 | }, 457 | "outputs": [], 458 | "source": [ 459 | "# Convert Spark DataFrame into a Pandas DataFrame.\n", 460 | "taxi_pd = taxi_df.toPandas()\n", 461 | "\n", 462 | "# Convert \"trip_distance\" column of \"object\" type to the float type.\n", 463 | "taxi_pd[\"trip_distance\"] = taxi_pd[\"trip_distance\"].astype(float)\n", 464 | "\n", 465 | "# Filter the DataFrame to include data within reasonable ranges.\n", 466 | "taxi_pd_filtered = taxi_pd.query(\n", 467 | "    \"trip_distance > 0 and trip_distance < 20 \\\n", 468 | "    and trip_duration > 0 and trip_duration < 10000\"\n", 469 | ")\n", 470 | "\n", 471 | "# Scatter plot to visualize the relationship between trip_distance and trip_duration.\n", 472 | "sns.relplot(\n", 473 | "    data=taxi_pd_filtered,\n", 474 | "    x=\"trip_distance\",\n", 475 | "    y=\"trip_duration\",\n", 476 | "    kind=\"scatter\",\n", 477 | ")" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": { 483 | "id": "f9d42bffcebb" 484 | }, 485 | "source": [ 486 | "Takeaways here include:\n", 487 | "  * the data is right-skewed\n", 488 | "  * there is a positive correlation between `trip_distance` and `trip_duration`\n", 489 | "  * most trips are completed in under 3600 seconds (one hour)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": { 495 | "id": "0e0ee0c6469c" 496 | }, 497 | "source": [ 498 | "### Feature Selection\n", 499 | "\n", 500 | "Use `VectorAssembler()` to consolidate feature columns into a vector column." 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": { 507 | "id": "c085bae96dec", 508 | "tags": [] 509 | }, 510 | "outputs": [], 511 | "source": [ 512 | "# List of selected features for training the model.\n", 513 | "feature_cols = [\n", 514 | "    \"passenger_count\",\n", 515 | "    \"trip_distance\",\n", 516 | "    \"start_time\",\n", 517 | "    \"end_time\",\n", 518 | "    \"start_long\",\n", 519 | "    \"start_lat\",\n", 520 | "    \"end_long\",\n", 521 | "    \"end_lat\",\n", 522 | "    \"total_amount\",\n", 523 | "    \"fare_amount\",\n", 524 | "    \"extra\",\n", 525 | "    \"mta_tax\",\n", 526 | "    \"tip_amount\",\n", 527 | "    \"tolls_amount\",\n", 528 | "    \"imp_surcharge\",\n", 529 | "    \"airport_fee\",\n", 530 | "]\n", 531 | "\n", 532 | "# Create a VectorAssembler with specified input and output columns.\n", 533 | "assembler = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", 534 | "\n", 535 | "# Combine the feature columns into a single vector column using the VectorAssembler.\n", 536 | "taxi_transformed_data = assembler.transform(taxi_df)\n", 537 | "\n", 538 | "# Randomly split the transformed data into training and test sets.\n", 539 | "(taxi_training_data, taxi_test_data) = taxi_transformed_data.randomSplit([0.95, 0.05])" 540 | ] 541 | }, 542 | { 543 | "cell_type": "markdown", 544 | "metadata": { 545 | "id": "e68b4258d4d5" 546 | }, 547 | "source": [ 548 | "### Training the Model\n", 549 | "\n", 550 | "Use the `GBTRegressor` model to train on the data."
551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": { 557 | "id": "cd2e2c8c4396", 558 | "tags": [] 559 | }, 560 | "outputs": [], 561 | "source": [ 562 | "# Define GBTRegressor model with specified input, output, and prediction columns.\n", 563 | "gbt = GBTRegressor(\n", 564 | "    featuresCol=\"features\",\n", 565 | "    labelCol=\"trip_duration\",\n", 566 | "    predictionCol=\"pred_trip_duration\",\n", 567 | ")\n", 568 | "\n", 569 | "# Define an evaluator for calculating the R2 score.\n", 570 | "evaluator_r2 = RegressionEvaluator(\n", 571 | "    labelCol=gbt.getLabelCol(), predictionCol=gbt.getPredictionCol(), metricName=\"r2\"\n", 572 | ")\n", 573 | "\n", 574 | "# Define an evaluator for calculating the RMSE.\n", 575 | "evaluator_rmse = RegressionEvaluator(\n", 576 | "    labelCol=gbt.getLabelCol(), predictionCol=gbt.getPredictionCol(), metricName=\"rmse\"\n", 577 | ")" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "metadata": { 584 | "id": "3080a7ddabf9", 585 | "tags": [] 586 | }, 587 | "outputs": [], 588 | "source": [ 589 | "# Train a Gradient Boosted Trees (GBT) model on the Taxi dataset. This process may take several minutes.\n", 590 | "taxi_gbt_model = gbt.fit(taxi_training_data)\n", 591 | "\n", 592 | "# Get predictions for the Taxi dataset using the trained GBT model.\n", 593 | "taxi_gbt_predictions = taxi_gbt_model.transform(taxi_test_data)" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": null, 599 | "metadata": { 600 | "id": "a4e4f60a8a3e", 601 | "tags": [] 602 | }, 603 | "outputs": [], 604 | "source": [ 605 | "# Evaluate the R2 score for the Taxi dataset predictions.\n", 606 | "taxi_gbt_accuracy_r2 = evaluator_r2.evaluate(taxi_gbt_predictions)\n", 607 | "print(f\"Taxi Test GBT R2 = {taxi_gbt_accuracy_r2}\")\n", 608 | "\n", 609 | "# Evaluate the Root Mean Squared Error (RMSE) for the Taxi dataset predictions.\n", 610 | "taxi_gbt_accuracy_rmse = evaluator_rmse.evaluate(taxi_gbt_predictions)\n", 611 | "print(f\"Taxi Test GBT RMSE = {taxi_gbt_accuracy_rmse}\")" 612 | ] 613 | }, 614 | { 615 | "cell_type": "markdown", 616 | "metadata": { 617 | "id": "7d07452cd103" 618 | }, 619 | "source": [ 620 | "### View the results\n", 621 | "\n", 622 | "Expect an R2 score of approximately 83-87% and a Root Mean Squared Error (RMSE) of 200-300. This sample does not include [cross-validation](https://en.wikipedia.org/wiki/Cross-validation_%28statistics%29), which could further improve model performance." 623 | ] 624 | }, 625 | { 626 | "cell_type": "markdown", 627 | "metadata": { 628 | "id": "d1b46847f395" 629 | }, 630 | "source": [ 631 | "### Save the model to Cloud Storage for future use\n", 632 | "\n", 633 | "To preserve the trained model and keep it accessible for future use, save it to a Cloud Storage path."
634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": null, 639 | "metadata": { 640 | "id": "9cc57f4362c4" 641 | }, 642 | "outputs": [], 643 | "source": [ 644 | "# Save the trained model to a Cloud Storage path\n", 645 | "taxi_gbt_model.write().overwrite().save(f\"gs://{BUCKET_URI}/\")" 646 | ] 647 | }, 648 | { 649 | "cell_type": "markdown", 650 | "metadata": {}, 651 | "source": [ 652 | "### Delete the Dataproc session and the session template\n", 653 | "\n", 654 | "To delete the running Dataproc Serverless session, run the following commands.\n", 655 | "If you've completed this tutorial as part of the [Analytics Lakehouse](https://console.cloud.google.com/products/solutions/details/analytics-lakehouse) solution, complete this step before deleting the solution from your project." 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "metadata": {}, 662 | "outputs": [], 663 | "source": [ 664 | "# Delete the session template\n", 665 | "!gcloud beta dataproc session-templates delete sparkml-template --location='{LOCATION}' --quiet\n", 666 | "\n", 667 | "# Terminate the Dataproc Serverless session, if one exists\n", 668 | "if SESSION:\n", 669 | "    !gcloud beta dataproc sessions terminate '{SESSION}' --location='{LOCATION}' --quiet" 670 | ] 671 | } 672 | ], 673 | "metadata": { 674 | "colab": { 675 | "name": "spark_ml.ipynb", 676 | "toc_visible": true 677 | }, 678 | "environment": { 679 | "kernel": "9c39b79e5d2e7072beb4bd59-runtime-00002d16685d", 680 | "name": "workbench-notebooks.m113", 681 | "type": "gcloud", 682 | "uri": "gcr.io/deeplearning-platform-release/workbench-notebooks:m113" 683 | }, 684 | "kernelspec": { 685 | "display_name": "test on Serverless Spark (Remote)", 686 | "language": "python", 687 | "name": "9c39b79e5d2e7072beb4bd59-runtime-00002d16685d" 688 | }, 689 | "language_info": { 690 | "codemirror_mode": { 691 | "name": "ipython", 692 | "version": 3 693 | }, 694 | "file_extension": ".py", 695 | "mimetype": "text/x-python", 696 | "name": "python", 697 | "nbconvert_exporter": "python", 698 | "pygments_lexer": "ipython3", 699 | "version": "3.11.0" 700 | } 701 | }, 702 | "nbformat": 4, 703 | "nbformat_minor": 4 704 | } 705 | --------------------------------------------------------------------------------