├── dlp-demo ├── service.yaml ├── test_app.py ├── app │ ├── __init__.py │ ├── services │ │ ├── __init__.py │ │ └── dlp_service.py │ ├── static │ │ └── images │ │ │ ├── logo.png │ │ │ └── symbol-full-color.svg │ ├── templates │ │ ├── message.html │ │ ├── base.html │ │ └── dlp-demo.html │ ├── config.py │ └── main.py ├── .dockerignore ├── .gcloudignore ├── .env.example ├── run.py ├── Dockerfile ├── pyproject.toml ├── deploy.sh └── README.md ├── dataplex ├── lineage │ ├── requirements.txt │ └── lineage_tools.py └── profiling │ ├── store_transactions_20240806.csv │ ├── app_events_20240806_1400.jsonl │ ├── web_events_20240806_14.json │ ├── README.md │ ├── profile.md │ └── main.tf ├── dataflow ├── dflow-bq-stream-python │ ├── req1.txt │ ├── rows.png │ ├── send.png │ ├── pipeline.png │ ├── req2.txt │ ├── setup.sh │ ├── send_events.py │ ├── schema_defs.py │ ├── README.md │ └── process_events.py └── simple_demos │ ├── beam_demo_1.py │ └── beam_demo_2.py ├── bigquery ├── schema-demo │ ├── requirements.txt │ ├── order_schema.json │ ├── line_item_schema.json │ ├── product_schema.json │ ├── denorm_query.sql │ ├── customer_schema.json │ ├── norm_query.sql │ ├── load_data.sql │ ├── nested_queries.sql │ ├── README.md │ ├── load_data.sh │ └── generate_data.py ├── wiki_query_example.sql ├── external_hive_example.sql ├── time_travel_example.sql ├── approx_example.sql ├── mv_example.sql ├── udf_examples.sql ├── views_example.sql ├── time_travel.sh ├── exported_billing_data_example.sql ├── github_demo.sql ├── elt_examples.sql ├── arrays_examples.sql ├── information_schema_examples.sql └── scds_examples.sql ├── terraform └── exp_to_tf.sh ├── docs ├── img │ ├── 2072fd183685cee3.png │ ├── c2f8eeefc77bc843.png │ ├── e4b52eedaff69ff5.png │ └── f4db34e38b750e09.png └── codelab.json ├── security ├── org_policy │ ├── policy.yaml │ ├── constraint.sh │ └── constraint.yaml └── auth_examples.py ├── dataproc ├── autoscaling_policy.yaml ├── dataproc_scale_demo.sh └── dataproc_autoscale_demo.sh ├── utilities └── shopping_list_api │ ├── pyproject.toml │ ├── Dockerfile │ ├── main.py │ └── README.md ├── .gcloudignore ├── composer ├── dag_development │ ├── validate_dag.sh │ └── validate_dag.py └── dags │ ├── bq_export_strategies.py │ └── export_top_customers.py ├── ai ├── pipelines │ └── README.md ├── automl │ ├── salads_deploy.py │ ├── adoption_deploy.py │ ├── README.md │ └── adoption_predict.py └── del_endpoints.py ├── NEW_AUG25.md ├── .gitignore ├── functions └── cat-bq-completions.py └── README.md /dlp-demo/service.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlp-demo/test_app.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataplex/lineage/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-datacatalog-lineage -------------------------------------------------------------------------------- /dlp-demo/app/__init__.py: -------------------------------------------------------------------------------- 1 | """DLP Demo application package.""" 2 | -------------------------------------------------------------------------------- /dlp-demo/.dockerignore: -------------------------------------------------------------------------------- 1 | .venv/ 2 | .env 3 | __pycache__/ 4 | *.pyc 5 | 
-------------------------------------------------------------------------------- /dlp-demo/.gcloudignore: -------------------------------------------------------------------------------- 1 | .venv/ 2 | .env 3 | __pycache__/ 4 | *.pyc 5 | -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/req1.txt: -------------------------------------------------------------------------------- 1 | apache-beam[gcp] 2 | google-cloud-core -------------------------------------------------------------------------------- /bigquery/schema-demo/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam[gcp] 2 | google-cloud-core 3 | -------------------------------------------------------------------------------- /dlp-demo/app/services/__init__.py: -------------------------------------------------------------------------------- 1 | """Service package for DLP demo application.""" 2 | -------------------------------------------------------------------------------- /terraform/exp_to_tf.sh: -------------------------------------------------------------------------------- 1 | gcloud beta resource-config bulk-export --path=./proj_spec --resource-format=terraform -------------------------------------------------------------------------------- /docs/img/2072fd183685cee3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roitraining/gcp-demos/HEAD/docs/img/2072fd183685cee3.png -------------------------------------------------------------------------------- /docs/img/c2f8eeefc77bc843.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roitraining/gcp-demos/HEAD/docs/img/c2f8eeefc77bc843.png -------------------------------------------------------------------------------- /docs/img/e4b52eedaff69ff5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roitraining/gcp-demos/HEAD/docs/img/e4b52eedaff69ff5.png -------------------------------------------------------------------------------- /docs/img/f4db34e38b750e09.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roitraining/gcp-demos/HEAD/docs/img/f4db34e38b750e09.png -------------------------------------------------------------------------------- /dlp-demo/app/static/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roitraining/gcp-demos/HEAD/dlp-demo/app/static/images/logo.png -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/rows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roitraining/gcp-demos/HEAD/dataflow/dflow-bq-stream-python/rows.png -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/send.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roitraining/gcp-demos/HEAD/dataflow/dflow-bq-stream-python/send.png -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/pipeline.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/roitraining/gcp-demos/HEAD/dataflow/dflow-bq-stream-python/pipeline.png -------------------------------------------------------------------------------- /dlp-demo/.env.example: -------------------------------------------------------------------------------- 1 | # Environment variables for local development 2 | GOOGLE_CLOUD_PROJECT=your-project-id 3 | PORT=8080 4 | FLASK_ENV=development 5 | -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/req2.txt: -------------------------------------------------------------------------------- 1 | google-cloud-core 2 | google-cloud-pubsub 3 | google-api-python-client 4 | google-auth 5 | google-auth-httplib2 6 | google-cloud-bigquery -------------------------------------------------------------------------------- /security/org_policy/policy.yaml: -------------------------------------------------------------------------------- 1 | # replace PROJECT_ID 2 | 3 | name: projects/PROJECT_ID/policies/custom.gcsBucketLocationConstraint 4 | spec: 5 | rules: 6 | - enforce: true 7 | -------------------------------------------------------------------------------- /bigquery/schema-demo/order_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "order_num", 4 | "type": "STRING" 5 | }, 6 | { 7 | "name": "cust_id", 8 | "type": "INTEGER" 9 | }, 10 | { 11 | "name": "order_date", 12 | "type": "DATE" 13 | } 14 | ] 15 | -------------------------------------------------------------------------------- /dataproc/autoscaling_policy.yaml: -------------------------------------------------------------------------------- 1 | basicAlgorithm: 2 | yarnConfig: 3 | gracefulDecommissionTimeout: 30s 4 | scaleDownFactor: 0.5 5 | scaleUpFactor: 0.5 6 | workerConfig: 7 | minInstances: 2 8 | maxInstances: 10 9 | secondaryWorkerConfig: 10 | minInstances: 0 11 | maxInstances: 150 -------------------------------------------------------------------------------- /bigquery/wiki_query_example.sql: -------------------------------------------------------------------------------- 1 | -- StandardSQL 2 | -- wiki 1M 3 | SELECT 4 | title, 5 | SUM(views) AS views, 6 | COUNT(views) AS rows_summed 7 | FROM 8 | `bigquery-samples.wikipedia_benchmark.Wiki1M` 9 | WHERE 10 | REGEXP_CONTAINS(title,".*Davis.*") 11 | GROUP BY 12 | title 13 | ORDER BY 14 | views DESC -------------------------------------------------------------------------------- /bigquery/schema-demo/line_item_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "order_num", 4 | "type": "STRING" 5 | }, 6 | { 7 | "name": "line_item_num", 8 | "type": "INTEGER" 9 | }, 10 | { 11 | "name": "prod_code", 12 | "type": "STRING" 13 | }, 14 | { 15 | "name": "qty", 16 | "type": "INTEGER" 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /bigquery/schema-demo/product_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "prod_code", 4 | "type": "STRING" 5 | }, 6 | { 7 | "name": "prod_name", 8 | "type": "STRING" 9 | }, 10 | { 11 | "name": "prod_desc", 12 | "type": "STRING" 13 | }, 14 | { 15 | "name": "prod_price", 16 | "type": "FLOAT" 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /security/org_policy/constraint.sh: 
-------------------------------------------------------------------------------- 1 | gcloud org-policies set-custom-constraint constraint.yaml # make the custom constraint available for org policies 2 | gcloud org-policies set-policy policy.yaml # apply new constraint 3 | 4 | ## To remove constraint, run gcloud org-policies reset custom.gcsBucketLocationConstraint --project PROJECT_ID (with your project id) -------------------------------------------------------------------------------- /bigquery/external_hive_example.sql: -------------------------------------------------------------------------------- 1 | -- query the external table 2 | SELECT 3 | * 4 | FROM 5 | class.ext_part 6 | 7 | -- query external table with where clause 8 | SELECT 9 | * 10 | FROM 11 | class.ext_part 12 | WHERE 13 | order_num="68610383-54" 14 | 15 | --query external table on partition 16 | SELECT 17 | * 18 | FROM 19 | class.ext_part 20 | WHERE 21 | order_date="2018-01-01" -------------------------------------------------------------------------------- /utilities/shopping_list_api/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "shopping-list-api" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "dotenv>=0.9.9", 9 | "fastapi>=0.116.1", 10 | "google-auth>=2.40.3", 11 | "google-genai>=1.28.0", 12 | "openai>=1.98.0", 13 | "uvicorn>=0.35.0", 14 | ] 15 | -------------------------------------------------------------------------------- /security/org_policy/constraint.yaml: -------------------------------------------------------------------------------- 1 | # replace ORG_ID 2 | 3 | name: organizations/ORG_ID/customConstraints/custom.gcsBucketLocationConstraint 4 | displayName: Restrict GCS Bucket Location 5 | description: Restricts Cloud Storage buckets to be created only in the 'us-central1' region. 
6 | actionType: DENY 7 | condition: | 8 | resource.location.startsWith('us-central1') == false 9 | methodTypes: 10 | - CREATE 11 | resourceTypes: 12 | - storage.googleapis.com/Bucket 13 | -------------------------------------------------------------------------------- /bigquery/schema-demo/denorm_query.sql: -------------------------------------------------------------------------------- 1 | -- find sales/zip for march 2 | -- base denorm table 3 | WITH 4 | orders AS ( 5 | SELECT 6 | cust_zip, 7 | prod_price * qty AS line_item_subtotal 8 | FROM 9 | `.bq_demo.denorm` 10 | WHERE 11 | order_date >= "2018-03-01" 12 | AND order_date <= "2018-03-31") 13 | SELECT 14 | cust_zip, 15 | SUM(line_item_subtotal) as zip_sales 16 | FROM 17 | orders 18 | GROUP BY 19 | cust_zip 20 | order by 21 | zip_sales desc 22 | -------------------------------------------------------------------------------- /bigquery/schema-demo/customer_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "cust_id", 4 | "type": "INTEGER" 5 | }, 6 | { 7 | "name": "cust_name", 8 | "type": "STRING" 9 | }, 10 | { 11 | "name": "cust_address", 12 | "type": "STRING" 13 | }, 14 | { 15 | "name": "cust_state", 16 | "type": "STRING" 17 | }, 18 | { 19 | "name": "cust_zip", 20 | "type": "INTEGER" 21 | }, 22 | { 23 | "name": "cust_email", 24 | "type": "STRING" 25 | }, 26 | { 27 | "name": "cust_phone", 28 | "type": "STRING" 29 | } 30 | ] 31 | -------------------------------------------------------------------------------- /bigquery/time_travel_example.sql: -------------------------------------------------------------------------------- 1 | -- view up-to-date table 2 | SELECT 3 | month, 4 | COUNT(*) 5 | FROM 6 | `class.time_travel` 7 | GROUP BY 8 | month; 9 | 10 | -- view table with only initial load 11 | SELECT 12 | month, 13 | COUNT(*) 14 | FROM 15 | `class.time_travel` FOR SYSTEM_TIME AS OF TIMESTAMP_SECONDS(target) 16 | GROUP BY 17 | month; 18 | 19 | -- create restoration table 20 | CREATE OR REPLACE TABLE 21 | class.time_travel_restore AS ( 22 | SELECT 23 | * 24 | FROM 25 | `class.time_travel` FOR SYSTEM_TIME AS OF TIMESTAMP_SECONDS(target)) 26 | -------------------------------------------------------------------------------- /bigquery/approx_example.sql: -------------------------------------------------------------------------------- 1 | -- Run these within a QLabs projects 2 | -- Run the approx first 3 | -- When it's done, run the exact 4 | -- Compare times and % difference 5 | 6 | -- First query - exact 7 | -- StandardSQL 8 | -- wiki 1M 9 | SELECT 10 | COUNT(distinct title) AS articles 11 | FROM 12 | `bigquery-samples.wikipedia_benchmark.Wiki100B` 13 | ORDER BY 14 | articles DESC 15 | 16 | -- Second query - approx 17 | -- StandardSQL 18 | -- wiki 1M 19 | SELECT 20 | approx_count_distinct(title) AS articles 21 | FROM 22 | `bigquery-samples.wikipedia_benchmark.Wiki100B` 23 | ORDER BY 24 | articles DESC -------------------------------------------------------------------------------- /.gcloudignore: -------------------------------------------------------------------------------- 1 | # This file specifies files that are *not* uploaded to Google Cloud Platform 2 | # using gcloud. It follows the same syntax as .gitignore, with the addition of 3 | # "#!include" directives (which insert the entries of the given .gitignore-style 4 | # file at that point). 
5 | # 6 | # For more information, run: 7 | # $ gcloud topic gcloudignore 8 | # 9 | .gcloudignore 10 | # If you would like to upload your .git directory, .gitignore file or files 11 | # from your .gitignore file, remove the corresponding line 12 | # below: 13 | .git 14 | .gitignore 15 | 16 | node_modules 17 | #!include:.gitignore 18 | -------------------------------------------------------------------------------- /dlp-demo/app/templates/message.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block head %} 4 | {{ title }} 5 | {% endblock %} 6 | 7 | {% block content %} 8 |
9 | <div>
10 | <div>
11 | <div>
12 | <h1>{{ headline }}</h1>
13 | </div>
14 | <div>
15 | <p>{{ message_text }}</p>
16 | <a href="/">Go Home</a>
17 | </div>
18 | </div>
19 | </div>
20 |
21 | {% endblock %} 22 | -------------------------------------------------------------------------------- /docs/codelab.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": "web", 3 | "format": "html", 4 | "prefix": "https://storage.googleapis.com", 5 | "mainga": "UA-49880327-14", 6 | "updated": "2025-08-21T02:00:40Z", 7 | "id": "docs", 8 | "duration": 0, 9 | "title": "Do-It-Now activities (gcp-demos)", 10 | "summary": "Do It Now activities that go with the gcp-demos repo", 11 | "source": "14C6K5l2yJ1ZJToChz72lPq-pFiKm_WcRGcgjnf5VJO0", 12 | "theme": "", 13 | "status": [ 14 | "-" 15 | ], 16 | "category": [ 17 | "-" 18 | ], 19 | "tags": [ 20 | "detailed", 21 | "web" 22 | ], 23 | "feedback": "-", 24 | "url": "docs" 25 | } 26 | -------------------------------------------------------------------------------- /composer/dag_development/validate_dag.sh: -------------------------------------------------------------------------------- 1 | # In your environment's bucket, create a test directory and copy your DAGs to it. 2 | gcloud storage cp gs://us-central1-example-environment-a12bc345-bucket/dags \ 3 | gs://us-central1-example-environment-a12bc345-bucket/data/test --recursive 4 | 5 | # Test for errors in all your DAGs 6 | gcloud storage cp gs://us-central1-example-environment-a12bc345-bucket/dags \ 7 | gs://us-central1-example-environment-a12bc345-bucket/data/test --recursive 8 | 9 | # Test a task for errors 10 | gcloud composer environments run \ 11 | ENVIRONMENT_NAME \ 12 | --location ENVIRONMENT_LOCATION \ 13 | tasks test -- --subdir /home/airflow/gcs/data/test \ 14 | DAG_ID TASK_ID \ 15 | DAG_EXECUTION_DATE -------------------------------------------------------------------------------- /utilities/shopping_list_api/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-slim-bookworm AS base 2 | 3 | 4 | FROM base AS builder 5 | 6 | COPY --from=ghcr.io/astral-sh/uv:0.4.9 /uv /bin/uv 7 | 8 | ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy 9 | 10 | WORKDIR /app 11 | 12 | COPY uv.lock pyproject.toml /app/ 13 | 14 | RUN --mount=type=cache,target=/root/.cache/uv \ 15 | uv sync --frozen --no-install-project --no-dev 16 | 17 | COPY . /app 18 | 19 | RUN --mount=type=cache,target=/root/.cache/uv \ 20 | uv sync --frozen --no-dev 21 | 22 | FROM base 23 | 24 | WORKDIR /app 25 | 26 | COPY --from=builder /app . 27 | 28 | ENV PATH="/app/.venv/bin:$PATH" 29 | 30 | EXPOSE 8080 31 | 32 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"] 33 | -------------------------------------------------------------------------------- /dlp-demo/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Entry point for the DLP Demo application. 
4 | """ 5 | 6 | import os 7 | import sys 8 | 9 | # Add the project root to the Python path 10 | project_root = os.path.dirname(os.path.abspath(__file__)) 11 | sys.path.insert(0, project_root) 12 | 13 | from app.main import app 14 | 15 | if __name__ == "__main__": 16 | # Load environment variables 17 | from dotenv import load_dotenv 18 | 19 | load_dotenv() 20 | 21 | # Configuration 22 | port = int(os.environ.get("PORT", 8080)) 23 | debug = os.environ.get("FLASK_ENV") == "development" 24 | 25 | print(f"Starting DLP Demo application on port {port}") 26 | print(f"Debug mode: {debug}") 27 | 28 | app.run(host="0.0.0.0", port=port, debug=debug) 29 | -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PROJECT_ID=$(gcloud config get-value project) 4 | 5 | python -m venv .venv 6 | source .venv/bin/activate 7 | pip install -r req1.txt 8 | pip install -r req2.txt 9 | 10 | gcloud iam service-accounts create $1 \ 11 | --display-name="$1" 12 | sleep 2 13 | export sa_email=$(gcloud iam service-accounts list --filter="displayName:$1" --format="value(email)") 14 | gcloud projects add-iam-policy-binding $PROJECT_ID\ 15 | --member="serviceAccount:$sa_email" \ 16 | --role="roles/editor" 17 | gcloud iam service-accounts keys create $1.json --iam-account=$sa_email 18 | export GOOGLE_APPLICATION_CREDENTIALS=$1.json 19 | 20 | gsutil mb -l us-central1 "gs://$PROJECT_ID-dflow-demo" 21 | 22 | gcloud services disable dataflow 23 | sleep 5 24 | gcloud services enable bigquery pubsub dataflow -------------------------------------------------------------------------------- /bigquery/mv_example.sql: -------------------------------------------------------------------------------- 1 | -- this is for reference; cannot be run unless you're an admin on dataset 2 | CREATE MATERIALIZED VIEW 3 | roi-bq-demos.bq_demo.order_mv AS 4 | SELECT 5 | cust_zip, 6 | order_date, 7 | count(*) as orders 8 | FROM 9 | `roi-bq-demos.bq_demo.customer` c 10 | JOIN 11 | `roi-bq-demos.bq_demo.order` o 12 | ON 13 | o.cust_id= c.cust_id 14 | GROUP BY 15 | order_date, 16 | cust_zip 17 | 18 | -- query against materialized view 19 | SELECT 20 | * 21 | FROM 22 | `roi-bq-demos.bq_demo.order_mv` 23 | WHERE 24 | cust_zip<2000 25 | 26 | -- query against original tables that automatically uses the materialized view 27 | SELECT 28 | cust_zip, 29 | order_date, 30 | COUNT(*) AS orders 31 | FROM 32 | `roi-bq-demos.bq_demo.customer` c 33 | JOIN 34 | `roi-bq-demos.bq_demo.order` o 35 | ON 36 | o.cust_id = c.cust_id 37 | WHERE 38 | cust_zip<2000 39 | GROUP BY 40 | order_date, 41 | cust_zip -------------------------------------------------------------------------------- /bigquery/udf_examples.sql: -------------------------------------------------------------------------------- 1 | -- trim strings 2 | SELECT 3 | text AS messy, 4 | TRIM(REGEXP_REPLACE(LOWER(text), '[^a-zA-Z0-9 ]+', '')) AS tidy 5 | FROM 6 | `roi-bq-demos.bq_demo.messy_text` 7 | 8 | -- create udf 9 | CREATE OR REPLACE FUNCTION 10 | `class.tidy_string` (text STRING) 11 | RETURNS STRING AS (TRIM(REGEXP_REPLACE(LOWER(text), '[^a-zA-Z0-9 ]+', ''))); 12 | 13 | -- query with SQL UDF 14 | SELECT 15 | text AS messy, 16 | `class.tidy_string`(text) AS tidy 17 | FROM 18 | `roi-bq-demos.bq_demo.messy_text` 19 | 20 | -- create javascript udf 21 | CREATE OR REPLACE FUNCTION 22 | `class.get_numbers`(str STRING) 23 | RETURNS NUMERIC 24 | 
LANGUAGE js AS ''' 25 | return nlp(str).values(0).toNumber().out() 26 | ''' OPTIONS ( library="gs://fh-bigquery/js/compromise.min.11.14.0.js"); 27 | 28 | -- query with javascript udf 29 | SELECT 30 | text, 31 | `class.get_numbers`(text) AS number 32 | FROM 33 | `roi-bq-demos.bq_demo.number_strings` -------------------------------------------------------------------------------- /dlp-demo/Dockerfile: -------------------------------------------------------------------------------- 1 | # Cloud Run deployment configuration 2 | FROM python:3.11-slim 3 | 4 | # Set environment variables 5 | ENV PYTHONUNBUFFERED=1 6 | ENV PATH="/app/.venv/bin:$PATH" 7 | 8 | # Install uv 9 | COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ 10 | 11 | # Set working directory 12 | WORKDIR /app 13 | 14 | # Copy dependency files 15 | COPY pyproject.toml uv.lock ./ 16 | 17 | # Install dependencies 18 | RUN uv sync --frozen --no-cache 19 | 20 | # Copy application code 21 | COPY . . 22 | 23 | # Create non-root user 24 | RUN useradd --create-home --shell /bin/bash app && chown -R app:app /app 25 | USER app 26 | 27 | # Expose port 28 | EXPOSE 8080 29 | 30 | # Health check 31 | HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ 32 | CMD curl -f http://localhost:8080/health || exit 1 33 | 34 | # Run application 35 | CMD ["uv", "run", "gunicorn", "--bind", "0.0.0.0:8080", "--workers", "2", "--timeout", "60", "app.main:app"] 36 | -------------------------------------------------------------------------------- /dlp-demo/app/config.py: -------------------------------------------------------------------------------- 1 | """Configuration settings for the DLP demo application.""" 2 | 3 | import os 4 | 5 | 6 | class Config: 7 | """Base configuration class.""" 8 | 9 | # Google Cloud settings 10 | GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") 11 | 12 | # Flask settings 13 | SECRET_KEY = os.environ.get("SECRET_KEY", "dev-secret-key-change-in-production") 14 | 15 | # Application settings 16 | PORT = int(os.environ.get("PORT", 8080)) 17 | 18 | @classmethod 19 | def validate(cls) -> None: 20 | """Validate required configuration values.""" 21 | if not cls.GOOGLE_CLOUD_PROJECT: 22 | raise ValueError("GOOGLE_CLOUD_PROJECT environment variable is required") 23 | 24 | 25 | class DevelopmentConfig(Config): 26 | """Development configuration.""" 27 | 28 | DEBUG = True 29 | 30 | 31 | class ProductionConfig(Config): 32 | """Production configuration.""" 33 | 34 | DEBUG = False 35 | -------------------------------------------------------------------------------- /bigquery/views_example.sql: -------------------------------------------------------------------------------- 1 | -- query with subquery 2 | SELECT 3 | * 4 | FROM ( 5 | SELECT 6 | cust_id, 7 | cust_name, 8 | cust_address, 9 | cust_zip 10 | FROM 11 | `roi-bq-demos.bq_demo.customer` 12 | WHERE 13 | cust_state="NJ") 14 | WHERE 15 | cust_id <100000 16 | 17 | -- with clause 18 | WITH 19 | nj AS ( 20 | SELECT 21 | cust_id, 22 | cust_name, 23 | cust_address, 24 | cust_zip 25 | FROM 26 | `roi-bq-demos.bq_demo.customer` 27 | WHERE 28 | cust_state="NJ") 29 | SELECT 30 | * 31 | FROM 32 | nj 33 | WHERE 34 | cust_id<100000 35 | 36 | -- create view 37 | -- assumes dataset in current project named class 38 | CREATE OR REPLACE VIEW 39 | `class.nj_view` AS 40 | SELECT 41 | cust_id, 42 | cust_name, 43 | cust_address, 44 | cust_zip 45 | FROM 46 | `roi-bq-demos.bq_demo.customer` 47 | WHERE 48 | cust_state="NJ" 49 | 50 | -- query view 51 | SELECT 52 | * 53 | FROM 54 | 
`class.nj_view` 55 | WHERE 56 | cust_id < 100000 -------------------------------------------------------------------------------- /ai/pipelines/README.md: -------------------------------------------------------------------------------- 1 | # Simple Vertex AI Pipeline demo 2 | 3 | ## Setup 4 | 1. Load [notebook](https://github.com/roitraining/challenge-labs-public/blob/main/data%20science/challenge-labs-pipelines.ipynb) in a Vertex AI Workbench instance 5 | 1. You can rull in Collab Enterprise, but you need to change `serviceAccount:` to `user:` as the code will run as you rather than the service account of an instance VM. 6 | 7 | ## Demo 8 | 9 | 1. Run all the cells in the notebook 10 | 2. Talk the students through the pipeline definition 11 | 3. Show the students where the source data is 12 | 4. Show the students the pipeline graph and discuss the nodes 13 | 5. Show the students the dataset and the endpoint created 14 | 6. Discuss the compilation and job submission steps 15 | 7. Highlight that the actual training will take 2+hours 16 | 8. You can always do a pipeline execution ahead of time to show results 17 | 18 | ## Teardown 19 | 20 | 1. Manually undeploy the model from the endpoint 21 | 2. Delete the endpoint 22 | 3. Optionally, delete the model and the dataset -------------------------------------------------------------------------------- /dlp-demo/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "dlp-demo" 3 | version = "2.0.0" 4 | description = "Modern Google Cloud DLP demonstration application" 5 | authors = [ 6 | {name = "ROI Training", email = "info@roitraining.com"} 7 | ] 8 | requires-python = ">=3.11" 9 | dependencies = [ 10 | "flask>=3.0.0", 11 | "flask-cors>=4.0.0", 12 | "google-cloud-dlp>=3.18.0", 13 | "gunicorn>=21.2.0", 14 | "python-dotenv>=1.0.0", 15 | ] 16 | 17 | [project.optional-dependencies] 18 | dev = [ 19 | "pytest>=7.4.0", 20 | "black>=23.0.0", 21 | "ruff>=0.1.0", 22 | "mypy>=1.5.0", 23 | ] 24 | 25 | [build-system] 26 | requires = ["hatchling"] 27 | build-backend = "hatchling.build" 28 | 29 | [tool.hatch.build.targets.wheel] 30 | packages = ["app"] 31 | 32 | [tool.black] 33 | line-length = 88 34 | target-version = ['py311'] 35 | 36 | [tool.ruff] 37 | line-length = 88 38 | target-version = "py311" 39 | select = ["E", "F", "W", "I", "N", "B", "A", "S", "UP"] 40 | 41 | [tool.mypy] 42 | python_version = "3.11" 43 | warn_return_any = true 44 | warn_unused_configs = true 45 | disallow_untyped_defs = true 46 | -------------------------------------------------------------------------------- /bigquery/time_travel.sh: -------------------------------------------------------------------------------- 1 | # create the initial time travel table 2 | bq query \ 3 | --use_legacy_sql=false \ 4 | --destination_table=class.time_travel \ 5 | --replace \ 6 | 'SELECT 7 | c.*, 8 | o.order_num, 9 | o.order_date, 10 | FORMAT_DATETIME("%B", DATETIME(order_date)) AS month 11 | FROM 12 | `roi-bq-demos.bq_demo_small.customer` c 13 | JOIN 14 | `roi-bq-demos.bq_demo_small.order` o 15 | ON 16 | c.cust_id = o.cust_id 17 | WHERE 18 | cust_state = "CA" 19 | AND order_date BETWEEN "2018-01-01" 20 | AND "2018-01-31"' 21 | 22 | # grab the time of completion for the table update 23 | export TARGET=$(date +"%s") 24 | 25 | bq query \ 26 | --use_legacy_sql=false \ 27 | --destination_table=class.time_travel \ 28 | --append_table \ 29 | 'SELECT 30 | c.*, 31 | o.order_num, 32 | o.order_date, 33 | FORMAT_DATETIME("%B", 
DATETIME(order_date)) AS month 34 | FROM 35 | `roi-bq-demos.bq_demo_small.customer` c 36 | JOIN 37 | `roi-bq-demos.bq_demo_small.order` o 38 | ON 39 | c.cust_id = o.cust_id 40 | WHERE 41 | cust_state = "CA" 42 | AND order_date BETWEEN "2018-02-01" 43 | AND "2018-02-28"' 44 | 45 | echo "Your time travel table is ready!" 46 | echo "Your time travel target is $TARGET" -------------------------------------------------------------------------------- /bigquery/schema-demo/norm_query.sql: -------------------------------------------------------------------------------- 1 | -- find sales/zip for march 2 | -- base normalized tables 3 | SELECT 4 | c.cust_zip, 5 | SUM(li.qty * p.prod_price) AS zip_sales 6 | FROM 7 | `roi-bq-demos.bq_demo.order` o 8 | JOIN 9 | `roi-bq-demos.bq_demo.line_item` li 10 | ON 11 | o.order_num = li.order_num 12 | JOIN 13 | `roi-bq-demos.bq_demo.customer` c 14 | ON 15 | o.cust_id = c.cust_id 16 | JOIN 17 | `roi-bq-demos.bq_demo.product` p 18 | ON 19 | p.prod_code = li.prod_code 20 | WHERE 21 | o.order_date >= "2018-03-01" 22 | AND o.order_date <= "2018-03-31" 23 | GROUP BY 24 | c.cust_zip 25 | ORDER BY 26 | zip_sales DESC 27 | 28 | -- find sales/zip for march 29 | -- normalized tables with order_part table 30 | SELECT 31 | c.cust_zip, 32 | SUM(li.qty * p.prod_price) AS zip_sales 33 | FROM 34 | `roi-bq-demos.bq_demo.order_part` o 35 | JOIN 36 | `roi-bq-demos.bq_demo.line_item` li 37 | ON 38 | o.order_num = li.order_num 39 | JOIN 40 | `roi-bq-demos.bq_demo.customer` c 41 | ON 42 | o.cust_id = c.cust_id 43 | JOIN 44 | `roi-bq-demos.bq_demo.product` p 45 | ON 46 | p.prod_code = li.prod_code 47 | WHERE 48 | o.order_date >= "2018-03-01" 49 | AND o.order_date <= "2018-03-31" 50 | GROUP BY 51 | c.cust_zip 52 | ORDER BY 53 | zip_sales DESC -------------------------------------------------------------------------------- /dataproc/dataproc_scale_demo.sh: -------------------------------------------------------------------------------- 1 | # run in cloud shell to create the demo cluster 2 | gcloud dataproc clusters create demo-cluster \ 3 | --region us-central1 \ 4 | --zone us-central1-a \ 5 | --worker-machine-type=n1-standard-8 \ 6 | --num-workers=2 \ 7 | --num-secondary-workers=0 \ 8 | --secondary-worker-boot-disk-size=30 \ 9 | --delete-max-age=10m \ 10 | --verbosity=error 11 | 12 | # run in cloud shell to submit a job to the cluster 13 | # show the progress rate (about 1% map per 30 seconds) 14 | export PROJECT_ID=$(gcloud config get-value project) 15 | gsutil mb gs://$PROJECT_ID 16 | gcloud dataproc jobs submit hadoop \ 17 | --cluster=demo-cluster \ 18 | --region=us-central1 \ 19 | --class=org.apache.hadoop.examples.terasort.TeraGen \ 20 | --jars=file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar \ 21 | -- -D mapreduce.job.maps=800 10000000000 gs://$PROJECT_ID/tg_n/$(date +%s) 22 | 23 | # run in cloud shell to add pre-emptible instances 24 | # adjust the number of nodes to something within your quota 25 | # show new progress rate (2-3%/sec) 26 | gcloud dataproc clusters update demo-cluster \ 27 | --num-secondary-workers 150 \ 28 | --region us-central1 29 | 30 | # delete the cluster 31 | gcloud dataproc clusters delete demo-cluster \ 32 | --region us-central1 -------------------------------------------------------------------------------- /NEW_AUG25.md: -------------------------------------------------------------------------------- 1 | # What's new in the August 25 update 2 | > [!NOTE] 3 | > Updated August 20, 2025 4 | 5 | ## BigQuery demos 6 | 1. 
SQL transform examples 7 | 2. Exported billing data example 8 | 3. Information schema examples 9 | 10 | ## Composer demos 11 | 4. DAG check scripts (python and bash) 12 | 5. DAG that shows different Airflow strategies for BigQuery exports 13 | 6. DAG that shows executing a query and exporting the results 14 | 15 | ## Dataflow demos 16 | 7. Simple demo pipeline that illustrates read, write, branch, Map, FlatMap and Filter 17 | 8. Simple demo pipeline that illustrates Create, GroupByKey, CombineGlobally, CombinePerKey, and CoGroupByKey 18 | 19 | ## Dataproc demos 20 | 9. Autoscaling cluster demo to go with manual scaling demo 21 | 22 | ## DLP-demo 23 | 10. Revised app and source code for demoing DLP abilities 24 | 25 | ## Do it nows 26 | 11. Fixes to the Do It Now instructions and code 27 | 28 | ## Security 29 | 12. Assets for creating and applying a custom constraint 30 | 13. A python script that illustrates multiple ways of authenticating 31 | 32 | ## Terraform 33 | 14. Bash script that exports a project resources as Terraform HCL 34 | 35 | ## Utilities 36 | 15. Source code for a silly web service that generates Costco shopping lists and returns them in JSON payloads 37 | 38 | ## Works in Progress 39 | - Finishing Dataform demo 40 | - Finishing Dataplex demo 41 | - Finishing custom log-sink -> pub/sub -> cloud function -> interesting action demo 42 | -------------------------------------------------------------------------------- /dlp-demo/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Deploy script for DLP Demo to Cloud Run 4 | set -e 5 | 6 | # Configuration 7 | PROJECT_ID=${GOOGLE_CLOUD_PROJECT:-"your-project-id"} 8 | REGION=${REGION:-"us-central1"} 9 | SERVICE_NAME="dlp-demo" 10 | IMAGE_NAME="us-central1-docker.pkg.dev/${PROJECT_ID}/${SERVICE_NAME}/${SERVICE_NAME}:latest" 11 | 12 | echo "Deploying DLP Demo to Cloud Run..." 13 | echo "Project ID: ${PROJECT_ID}" 14 | echo "Region: ${REGION}" 15 | echo "Service Name: ${SERVICE_NAME}" 16 | 17 | # Ensure we're using the correct project 18 | gcloud config set project ${PROJECT_ID} 19 | 20 | # Build and push the container image 21 | echo "Building container image..." 22 | gcloud builds submit --tag ${IMAGE_NAME} 23 | 24 | # Deploy to Cloud Run 25 | echo "Deploying to Cloud Run..." 26 | gcloud run deploy ${SERVICE_NAME} \ 27 | --image ${IMAGE_NAME} \ 28 | --platform managed \ 29 | --region ${REGION} \ 30 | --allow-unauthenticated \ 31 | --set-env-vars GOOGLE_CLOUD_PROJECT=${PROJECT_ID} \ 32 | --memory 512Mi \ 33 | --cpu 1 \ 34 | --min-instances 0 \ 35 | --max-instances 10 \ 36 | --port 8080 37 | 38 | # Get the service URL 39 | SERVICE_URL=$(gcloud run services describe ${SERVICE_NAME} \ 40 | --platform managed \ 41 | --region ${REGION} \ 42 | --format 'value(status.url)') 43 | 44 | echo "" 45 | echo "Deployment complete!" 
46 | echo "Service URL: ${SERVICE_URL}" 47 | echo "" 48 | echo "To test the service:" 49 | echo "curl ${SERVICE_URL}/health" 50 | -------------------------------------------------------------------------------- /ai/automl/salads_deploy.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import google.auth 3 | from google.cloud import aiplatform, aiplatform_v1 4 | 5 | logging.basicConfig(level=logging.INFO) 6 | logger = logging.getLogger(__name__) 7 | 8 | creds, project = google.auth.default() 9 | location = "us-central1" 10 | 11 | 12 | def deploy_model_to_endpoint(model_prefix): 13 | if not project: 14 | raise RuntimeError( 15 | "No GCP project found. Set with `gcloud config set project PROJECT`." 16 | ) 17 | 18 | if not model_prefix: 19 | raise RuntimeError("No model prefix specified.") 20 | 21 | endpoint_display_name = f"{model_prefix}_prediction_endpoint" 22 | parent = f"projects/{project}/locations/{location}" 23 | api_endpoint = f"{location}-aiplatform.googleapis.com" 24 | 25 | model_client = aiplatform_v1.ModelServiceClient( 26 | client_options={"api_endpoint": api_endpoint} 27 | ) 28 | 29 | target_model = None 30 | for model in model_client.list_models(request={"parent": parent}): 31 | if model.display_name and model.display_name.startswith(model_prefix): 32 | target_model = model 33 | break 34 | if not target_model: 35 | raise RuntimeError( 36 | f"No model found with display_name starting with '{model_prefix}' in {parent}" 37 | ) 38 | endpoint = aiplatform.Model(model_name=target_model.name).deploy() 39 | return {"endpoint": endpoint} 40 | 41 | 42 | if __name__ == "__main__": 43 | deploy_model_to_endpoint("salads") 44 | -------------------------------------------------------------------------------- /dataproc/dataproc_autoscale_demo.sh: -------------------------------------------------------------------------------- 1 | # run in cloud shell to create the demo cluster with autoscaling enabled 2 | # This version uses autoscaling for secondary workers instead of manual scaling 3 | 4 | # create the autoscaling policy 5 | gcloud dataproc autoscaling-policies import autoscaling_demo_policy \ 6 | --region=us-central1 \ 7 | --source="./autoscaling_policy.yaml" 8 | 9 | 10 | # create the autoscaling cluster 11 | gcloud dataproc clusters create demo-cluster-autoscale \ 12 | --region us-central1 \ 13 | --zone us-central1-a \ 14 | --worker-machine-type=n1-standard-8 \ 15 | --num-workers=2 \ 16 | --autoscaling-policy=autoscaling_demo_policy \ 17 | --secondary-worker-type=spot \ 18 | --secondary-worker-machine-types=type=n1-standard-8 \ 19 | --secondary-worker-boot-disk-size=30 \ 20 | --verbosity=error 21 | 22 | # run in cloud shell to submit a job to the cluster 23 | # The cluster will automatically scale secondary workers based on job demand 24 | export PROJECT_ID=$(gcloud config get-value project) 25 | gsutil mb gs://$PROJECT_ID 26 | gcloud dataproc jobs submit hadoop \ 27 | --cluster=demo-cluster-autoscale \ 28 | --region=us-central1 \ 29 | --class=org.apache.hadoop.examples.terasort.TeraGen \ 30 | --jars=file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar \ 31 | -- -D mapreduce.job.maps=800 10000000000 gs://$PROJECT_ID/tg_n/$(date +%s) 32 | 33 | # delete the cluster 34 | gcloud dataproc clusters delete demo-cluster-autoscale \ 35 | --region us-central1 36 | -------------------------------------------------------------------------------- /ai/automl/adoption_deploy.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | import google.auth 3 | from google.cloud import aiplatform, aiplatform_v1 4 | 5 | logging.basicConfig(level=logging.INFO) 6 | logger = logging.getLogger(__name__) 7 | 8 | creds, project = google.auth.default() 9 | location = "us-central1" 10 | 11 | 12 | def deploy_model_to_endpoint(model_prefix): 13 | if not project: 14 | raise RuntimeError( 15 | "No GCP project found. Set with `gcloud config set project PROJECT`." 16 | ) 17 | 18 | if not model_prefix: 19 | raise RuntimeError("No model prefix specified.") 20 | 21 | endpoint_display_name = f"{model_prefix}_prediction_endpoint" 22 | parent = f"projects/{project}/locations/{location}" 23 | api_endpoint = f"{location}-aiplatform.googleapis.com" 24 | 25 | model_client = aiplatform_v1.ModelServiceClient( 26 | client_options={"api_endpoint": api_endpoint} 27 | ) 28 | 29 | target_model = None 30 | for model in model_client.list_models(request={"parent": parent}): 31 | if model.display_name and model.display_name.startswith(model_prefix): 32 | target_model = model 33 | break 34 | if not target_model: 35 | raise RuntimeError( 36 | f"No model found with display_name starting with '{model_prefix}' in {parent}" 37 | ) 38 | endpoint = aiplatform.Model(model_name=target_model.name).deploy( 39 | machine_type="n1-standard-4" 40 | ) 41 | return {"endpoint": endpoint} 42 | 43 | 44 | if __name__ == "__main__": 45 | deploy_model_to_endpoint("adopted") 46 | -------------------------------------------------------------------------------- /dataplex/profiling/store_transactions_20240806.csv: -------------------------------------------------------------------------------- 1 | transaction_id,store_id,register_id,timestamp,cashier_id,customer_id,payment_method,subtotal,tax_amount,discount_amount,total_amount,currency,receipt_number,status 2 | txn_pos_20240806_001,store_sf_001,reg_003,2024-08-06 14:00:15,cashier_101,cust_loyal_789456,credit_card,85.47,7.69,0.00,93.16,USD,RCP_20240806_001,completed 3 | txn_pos_20240806_002,store_sf_001,reg_001,2024-08-06 14:02:33,cashier_205,cust_walk_in,cash,24.99,2.25,2.50,24.74,USD,RCP_20240806_002,completed 4 | txn_pos_20240806_003,store_ny_002,reg_002,2024-08-06 14:03:45,cashier_302,,debit_card,156.78,14.11,15.68,155.21,USD,RCP_20240806_003,completed 5 | txn_pos_20240806_004,store_sf_001,reg_003,2024-08-06 14:05:12,cashier_101,cust_loyal_123789,credit_card,299.99,27.00,30.00,296.99,USD,RCP_20240806_004,refunded 6 | txn_pos_20240806_005,store_la_003,reg_001,2024-08-06 14:07:28,cashier_401,cust_loyal_456123,gift_card,67.45,6.07,0.00,73.52,USD,RCP_20240806_005,completed 7 | txn_pos_20240806_006,store_ny_002,reg_001,2024-08-06 14:08:19,cashier_301,cust_walk_in,cash,12.50,1.13,0.00,13.63,USD,RCP_20240806_006,void 8 | txn_pos_20240806_007,store_sf_001,reg_002,2024-08-06 14:10:41,cashier_206,cust_loyal_987654,credit_card,445.20,40.07,44.52,440.75,USD,RCP_20240806_007,completed 9 | txn_pos_20240806_008,store_la_003,reg_003,2024-08-06 14:12:05,,cust_walk_in,cash,89.99,8.10,0.00,98.09,USD,RCP_20240806_008,completed 10 | txn_pos_20240806_009,store_sf_001,reg_001,2024-08-06 14:14:22,cashier_205,cust_loyal_555888,debit_card,178.34,16.05,0.00,194.39,USD,RCP_20240806_009,completed 11 | txn_pos_20240806_010,store_ny_002,reg_002,2024-08-06 14:15:33,cashier_302,cust_walk_in,credit_card,INVALID_AMOUNT,5.42,0.00,ERROR,USD,RCP_20240806_010,error -------------------------------------------------------------------------------- 
/bigquery/schema-demo/load_data.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE 2 | `bq_demo.denorm` AS ( 3 | SELECT 4 | c.*, 5 | o.order_num AS order_num, 6 | order_date, 7 | line_item_num, 8 | li.prod_code AS prod_code, 9 | qty, 10 | prod_name, 11 | prod_desc, 12 | prod_price 13 | FROM 14 | `roi-bq-demos.bq_demo.customer` c 15 | LEFT JOIN 16 | `roi-bq-demos.bq_demo.order` o 17 | ON 18 | c.cust_id = o.cust_id 19 | LEFT JOIN 20 | `roi-bq-demos.bq_demo.line_item` AS li 21 | ON 22 | o.order_num = li.order_num 23 | LEFT JOIN 24 | `roi-bq-demos.bq_demo.product` AS p 25 | ON 26 | li.prod_code = p.prod_code); 27 | 28 | CREATE OR REPLACE TABLE 29 | `bq_demo.nested_once` AS ( 30 | WITH 31 | dlow AS ( 32 | SELECT 33 | * 34 | FROM 35 | `bq_demo.denorm` ) 36 | SELECT 37 | cust_id, 38 | cust_name, 39 | cust_address, 40 | cust_state, 41 | cust_zip, 42 | cust_email, 43 | cust_phone, 44 | order_num, 45 | order_date, 46 | ARRAY_AGG( STRUCT(line_item_num, 47 | prod_code, 48 | qty, 49 | prod_name, 50 | prod_desc, 51 | prod_price)) AS line_items 52 | FROM 53 | dlow 54 | GROUP BY 55 | order_num, 56 | order_date, 57 | cust_phone, 58 | cust_email, 59 | cust_zip, 60 | cust_state, 61 | cust_address, 62 | cust_name, 63 | cust_id); 64 | 65 | CREATE OR REPLACE TABLE 66 | `bq_demo.table_nested_partitioned` 67 | PARTITION BY 68 | order_date AS ( 69 | SELECT 70 | * 71 | FROM 72 | `bq_demo.nested_once`); 73 | 74 | CREATE OR REPLACE TABLE 75 | `bq_demo.table_nested_partitioned_clustered` 76 | PARTITION BY 77 | order_date 78 | CLUSTER BY 79 | cust_zip AS ( 80 | SELECT 81 | * 82 | FROM 83 | `bq_demo.nested_once`) -------------------------------------------------------------------------------- /composer/dag_development/validate_dag.py: -------------------------------------------------------------------------------- 1 | # validate_dag.py 2 | # --------------------------------------------- 3 | # Use case: Validates an Airflow DAG file for import errors before deploying to Composer/Airflow. 4 | # How it works: Takes a DAG Python file as a command line argument, attempts to import it using Airflow's DagBag, 5 | # and reports any import errors found. Exits with code 0 if successful, 1 if errors, 2 if no argument provided. 6 | # Setup: Recommended to use Python 3.11 via pyenv. Install Apache Airflow in your environment. 
7 | # You'll also need to install any provider modules your DAG uses 8 | # Example setup: 9 | # pyenv local 3.11 10 | # uv venv .venv 11 | # source .venv/bin/activate 12 | # uv pip install apache-airflow apache-airflow-providers-google 13 | # Usage: 14 | # uv run validate_dag.py 15 | 16 | import sys 17 | import os 18 | import warnings 19 | 20 | warnings.filterwarnings("ignore", category=FutureWarning, module="airflow") 21 | warnings.simplefilter("ignore", DeprecationWarning) 22 | 23 | from airflow.models.dagbag import DagBag 24 | 25 | # Check for DAG file argument 26 | if len(sys.argv) < 2: 27 | print("Usage: python validate_dag.py ") 28 | sys.exit(2) 29 | DAG_FILE = sys.argv[1] # DAG file to validate 30 | 31 | dag_bag = DagBag(dag_folder=DAG_FILE, include_examples=False) # Load DAG for validation 32 | 33 | errors = dag_bag.import_errors 34 | 35 | if errors: 36 | print("❌ DAG import errors:") 37 | for f, err in errors.items(): 38 | if DAG_FILE in f: 39 | print(f"\nFile: {f}\nError:\n{err}") 40 | sys.exit(1) # Exit with error code if import errors found 41 | else: 42 | print("✅ DAG parsed successfully.") 43 | sys.exit(0) # Exit with success code 44 | -------------------------------------------------------------------------------- /dlp-demo/app/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | {% block head %} 10 | {% endblock %} 11 | 12 | 13 | 14 |
15 | <header>
16 | <div>
17 | <div>
18 | <div>
19 | <span>ROI Training</span>
20 | </div>
21 | <h1>Google Cloud DLP Demo</h1>
22 | <p>ROI Training - Data Loss Prevention API</p>
23 | </div>
24 | </div>
25 | </header>
26 |
27 |
28 | <main>
29 | <div>
30 | <div>
31 | {% block content %}
32 | {% endblock %}
33 | </div>
34 | </div>
35 | </main>
36 |
37 | <footer>
38 | <div>
39 | <p>© 2025 ROI Training, Inc. All rights reserved.</p>
40 | </div>
41 | </footer>
42 |
43 |
44 | 45 | 46 | {% block scripts %} 47 | {% endblock %} 48 | 49 | 50 | -------------------------------------------------------------------------------- /ai/automl/README.md: -------------------------------------------------------------------------------- 1 | # AutoML demos 2 | 3 | ## Adoption Demo 4 | 5 | ## Forecasting Demo 6 | 7 | This demo shows building a liquor sales forecasting model using AutoML and 8 | tabular data. 9 | 10 | ### Outline 11 | 12 | 1. Initialize values and services 13 | 2. Create the dataset 14 | 3. Create and run model training job (creates model) 15 | 4. Make batch predictions 16 | 5. Create a Looker Studio dashboard to show prediction results 17 | 18 | ### Timing issues 19 | 20 | 1. Training the model takes 1.5-2 hours to complete, so waiting for the job to complete is generally not feasible 21 | 2. Doing batch prediction takes 25-30 minutes, so again, not feasible 22 | 23 | ### Workarounds 24 | 25 | 1. Run the cells in the workbook before class to create the dataset 26 | 27 | 28 | To use... 29 | 30 | 1. Click on the **automl_forecasting** notebook 31 | 2. Click the link to open in the console using Colab Enterprise 32 | 3. Replace the project ID placeholder with the correct value 33 | 4. Run the cells in the **Setting up** section 34 | 1. Skip the cell that deletes/creates the BigQuery dataset if you have already run predictions and want to use the results already in BigQuery 35 | 5. If you haven't created the dataset and the model previously, run the cells in the **Creating the model** section. 36 | 1. This takes a couple hours to run, so you might want to have a version of the dataset and model already created before class and just show the students 37 | 6. If you have the dataset and model already created, you can then run the cell in the **Using an existing model** section to get a reference to the existing model 38 | 7. Run the cells in the **Making predictions** section to do batch inference with the model 39 | 1. This sadly takes like 25-30 minutes 40 | 2. You might want to have already run the batch prior to class and skip actually running the inference here 41 | 8. Run the cells in the **Creating a dashboard** section and then demo the dashboard and results 42 | 9. 
Run the cell in the **Cleaning up** section 43 | -------------------------------------------------------------------------------- /dataplex/profiling/app_events_20240806_1400.jsonl: -------------------------------------------------------------------------------- 1 | {"event_id":"mob_1722963601001","timestamp":"2024-08-06T14:00:01.234Z","user_id":"user_mobile_456789","session_id":"mob_sess_def456ghi789","app_version":"2.1.3","platform":"iOS","os_version":"17.5.1","device_model":"iPhone 14","event_type":"app_open","screen_name":"home","previous_screen":null,"engagement_time_ms":null} 2 | {"event_id":"mob_1722963605002","timestamp":"2024-08-06T14:00:05.567Z","user_id":"user_mobile_456789","session_id":"mob_sess_def456ghi789","app_version":"2.1.3","platform":"iOS","os_version":"17.5.1","device_model":"iPhone 14","event_type":"screen_view","screen_name":"product_catalog","previous_screen":"home","engagement_time_ms":4333} 3 | {"event_id":"mob_1722963620003","timestamp":"2024-08-06T14:00:20.890Z","user_id":"user_mobile_456789","session_id":"mob_sess_def456ghi789","app_version":"2.1.3","platform":"iOS","os_version":"17.5.1","device_model":"iPhone 14","event_type":"product_view","screen_name":"product_detail","product_id":"prod_sneakers_042","product_name":"Athletic Running Shoes","product_category":"Footwear > Athletic","price":129.99,"currency":"USD","engagement_time_ms":15000} 4 | {"event_id":"mob_1722963635004","timestamp":"2024-08-06T14:00:35.123Z","user_id":"user_mobile_456789","session_id":"mob_sess_def456ghi789","app_version":"2.1.3","platform":"iOS","os_version":"17.5.1","device_model":"iPhone 14","event_type":"add_to_wishlist","product_id":"prod_sneakers_042","product_name":"Athletic Running Shoes","wishlist_count":3} 5 | {"event_id":"mob_1722963640005","timestamp":"2024-08-06T14:00:40.456Z","user_id":"user_android_123456","session_id":"mob_sess_jkl012mno345","app_version":"2.1.1","platform":"Android","os_version":"13","device_model":"Samsung Galaxy S23","event_type":"app_open","screen_name":"home","previous_screen":null,"engagement_time_ms":null} 6 | {"event_id":"mob_1722963645006","timestamp":"2024-08-06T14:00:45.789Z","user_id":"user_android_123456","session_id":"mob_sess_jkl012mno345","app_version":"2.1.1","platform":"Android","os_version":"13","device_model":"Samsung Galaxy S23","event_type":"search","search_query":"bluetooth speakers","search_results_count":24,"screen_name":"search_results"} -------------------------------------------------------------------------------- /dataplex/profiling/web_events_20240806_14.json: -------------------------------------------------------------------------------- 1 | { 2 | "events": [ 3 | { 4 | "event_id": "evt_1722963600001", 5 | "timestamp": "2024-08-06T14:00:00.123Z", 6 | "session_id": "sess_abc123def456", 7 | "user_id": "user_789012345", 8 | "event_type": "page_view", 9 | "page_url": "https://shop.example.com/products/wireless-headphones", 10 | "page_title": "Premium Wireless Headphones - Electronics Store", 11 | "referrer": "https://google.com/search", 12 | "user_agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15", 13 | "ip_address": "192.168.1.100", 14 | "country": "US", 15 | "state": "CA", 16 | "city": "San Francisco", 17 | "device_type": "mobile", 18 | "browser": "Safari", 19 | "utm_source": "google", 20 | "utm_medium": "cpc", 21 | "utm_campaign": "summer_electronics" 22 | }, 23 | { 24 | "event_id": "evt_1722963660002", 25 | "timestamp": "2024-08-06T14:01:00.456Z", 26 | "session_id": "sess_abc123def456", 
27 | "user_id": "user_789012345", 28 | "event_type": "add_to_cart", 29 | "product_id": "prod_headphones_001", 30 | "product_name": "Premium Wireless Headphones", 31 | "product_category": "Electronics > Audio", 32 | "price": 199.99, 33 | "quantity": 1, 34 | "currency": "USD", 35 | "cart_total": 199.99, 36 | "ip_address": "192.168.1.100", 37 | "device_type": "mobile" 38 | }, 39 | { 40 | "event_id": "evt_1722963720003", 41 | "timestamp": "2024-08-06T14:02:00.789Z", 42 | "session_id": "sess_xyz789ghi012", 43 | "user_id": null, 44 | "event_type": "page_view", 45 | "page_url": "https://shop.example.com/", 46 | "page_title": "Home - Electronics Store", 47 | "referrer": "https://facebook.com/", 48 | "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", 49 | "ip_address": "203.0.113.42", 50 | "country": "US", 51 | "state": "NY", 52 | "city": "New York", 53 | "device_type": "desktop", 54 | "browser": "Chrome", 55 | "utm_source": "facebook", 56 | "utm_medium": "social", 57 | "utm_campaign": null 58 | } 59 | ] 60 | } 61 | -------------------------------------------------------------------------------- /ai/automl/adoption_predict.py: -------------------------------------------------------------------------------- 1 | import google.auth 2 | import logging 3 | 4 | from google.cloud import aiplatform_v1, aiplatform 5 | 6 | logging.basicConfig(level=logging.INFO) 7 | logger = logging.getLogger(__name__) 8 | 9 | creds, project = google.auth.default() 10 | location = "us-central1" 11 | 12 | 13 | def find_endpoint_and_predict(display_prefix="adopted", location=location): 14 | if not project: 15 | logger.error( 16 | "No gcloud project found. Set with `gcloud config set project PROJECT` or pass project explicitly." 17 | ) 18 | return 19 | 20 | parent = f"projects/{project}/locations/{location}" 21 | 22 | endpoint_client = aiplatform_v1.EndpointServiceClient( 23 | client_options={"api_endpoint": f"{location}-aiplatform.googleapis.com"} 24 | ) 25 | 26 | # List endpoints and find first whose display_name starts with the prefix 27 | endpoints = endpoint_client.list_endpoints(request={"parent": parent}) 28 | target = None 29 | for ep in endpoints: 30 | if ep.display_name and ep.display_name.startswith(display_prefix): 31 | target = aiplatform.Endpoint(endpoint_name=ep.name) 32 | break 33 | 34 | if not target: 35 | logger.error( 36 | f"No endpoint found with display_name starting with '{display_prefix}' in {parent}" 37 | ) 38 | return 39 | 40 | # Prediction instances for adoption model 41 | instances = [ 42 | { 43 | "Type": "Cat", 44 | "Age": "3", 45 | "Breed1": "Tabby", 46 | "Gender": "Male", 47 | "Color1": "Black", 48 | "Color2": "White", 49 | "MaturitySize": "Small", 50 | "FurLength": "Short", 51 | "Vaccinated": "No", 52 | "Sterilized": "No", 53 | "Health": "Healthy", 54 | "Fee": "100", 55 | "PhotoAmt": "2", 56 | } 57 | ] 58 | 59 | try: 60 | prediction = target.predict(instances) 61 | logger.info(f"Prediction result: {prediction.predictions[0]}") 62 | except Exception as e: 63 | logger.error(f"Prediction call failed: {e}") 64 | raise 65 | 66 | 67 | if __name__ == "__main__": 68 | find_endpoint_and_predict() 69 | -------------------------------------------------------------------------------- /dataplex/lineage/lineage_tools.py: -------------------------------------------------------------------------------- 1 | from google.cloud import datacatalog_lineage_v1 2 | from datetime import datetime, timezone, timedelta 3 | 4 | 5 | def list_processes(): 6 | # Create a client 7 | client = 
datacatalog_lineage_v1.LineageClient() 8 | 9 | # Initialize request argument(s) 10 | request = datacatalog_lineage_v1.ListProcessesRequest( 11 | parent=parent, 12 | ) 13 | 14 | # Make the request 15 | page_result = client.list_processes(request=request) 16 | 17 | # Handle the response 18 | for response in page_result: 19 | yield response 20 | 21 | 22 | def list_lineage_events(event_name): 23 | # Create a client 24 | client = datacatalog_lineage_v1.LineageClient() 25 | 26 | # Initialize request argument(s) 27 | request = datacatalog_lineage_v1.ListLineageEventsRequest( 28 | parent=event_name, 29 | ) 30 | 31 | # Make the request 32 | page_result = client.list_lineage_events(request=request) 33 | 34 | # Handle the response 35 | for response in page_result: 36 | yield response 37 | 38 | 39 | def list_runs(process_name, num_days): 40 | 41 | now_utc = datetime.now(timezone.utc).replace(tzinfo=None) 42 | cutoff = now_utc + timedelta(days=-num_days) 43 | 44 | # Create a client 45 | client = datacatalog_lineage_v1.LineageClient() 46 | 47 | # Initialize request argument(s) 48 | request = datacatalog_lineage_v1.ListRunsRequest( 49 | parent=process_name, 50 | ) 51 | 52 | # Make the request 53 | page_result = client.list_runs(request=request) 54 | 55 | # Handle the response 56 | for response in page_result: 57 | dt = datetime.fromtimestamp(response.start_time.timestamp()) 58 | if dt >= cutoff: 59 | yield response 60 | 61 | 62 | if __name__ == "__main__": 63 | parent = "projects/jwd-gcp-demos/locations/us" 64 | processes = list_processes() 65 | for process in processes: 66 | print(process) 67 | print(f"Runs for {process.name}:") 68 | runs = list_runs(process_name=process.name, num_days=1) 69 | for run in runs: 70 | print("Run start time:", run.start_time) 71 | print(run) 72 | print(f"Events for {run.name}:") 73 | events = list_lineage_events(run.name) 74 | for event in events: 75 | print(f" {event}") 76 | -------------------------------------------------------------------------------- /dlp-demo/README.md: -------------------------------------------------------------------------------- 1 | # DLP Demo 2.0 2 | 3 | A Google Cloud Data Loss Prevention (DLP) demonstration application built for Cloud Run. 4 | 5 | ## Features 6 | 7 | - **Modern Python**: Built with Python 3.11+ and type hints 8 | - **Cloud Run Ready**: Containerized application with health checks 9 | - **Application Default Credentials**: No service account keys needed 10 | - **Modern Dependencies**: Up-to-date Flask, Google Cloud DLP, and other libraries 11 | - **uv Package Management**: Fast and reliable dependency management 12 | - **Best Practices**: Well-architected, documented, and tested 13 | 14 | ## Architecture 15 | 16 | The application provides a web interface for demonstrating Google Cloud DLP capabilities: 17 | 18 | - **Text Inspection**: Identify sensitive information in text 19 | - **Data Redaction**: Remove sensitive information 20 | - **Data Replacement**: Replace sensitive information with placeholders 21 | - **Data Masking**: Mask sensitive information with characters 22 | 23 | ## Development Setup 24 | 25 | 1. Install uv if not already installed: 26 | ```bash 27 | curl -LsSf https://astral.sh/uv/install.sh | sh 28 | ``` 29 | 30 | 2. Install dependencies: 31 | ```bash 32 | uv sync 33 | ``` 34 | 35 | 3. Set up Google Cloud authentication: 36 | ```bash 37 | gcloud auth application-default login 38 | ``` 39 | 40 | 4. 
Set environment variables: 41 | ```bash 42 | export GOOGLE_CLOUD_PROJECT=your-project-id 43 | export PORT=8080 44 | ``` 45 | 46 | 5. Run the development server: 47 | ```bash 48 | uv run python run.py 49 | ``` 50 | 51 | Or alternatively: 52 | ```bash 53 | uv run python -m app.main 54 | ``` 55 | 56 | ## Deployment 57 | 58 | ### Cloud Run 59 | 60 | 1. Build and deploy: 61 | ```bash 62 | gcloud run deploy dlp-demo \ 63 | --source . \ 64 | --platform managed \ 65 | --region us-central1 \ 66 | --allow-unauthenticated 67 | ``` 68 | 69 | 2. The service will automatically use the Cloud Run service account with appropriate permissions. 70 | 71 | ## Environment Variables 72 | 73 | - `GOOGLE_CLOUD_PROJECT`: Your Google Cloud project ID 74 | - `PORT`: Port to run the application on (default: 8080) 75 | - `FLASK_ENV`: Set to 'development' for debug mode 76 | 77 | ## API Endpoints 78 | 79 | - `GET /`: Main DLP demo interface 80 | - `POST /api/dlp`: Process text with DLP operations 81 | - `GET /health`: Health check endpoint 82 | 83 | ## License 84 | 85 | This project is for educational purposes as part of ROI Training's GCP demonstrations. 86 | -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/send_events.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | import datetime 4 | import time 5 | import googleapiclient.discovery 6 | import json 7 | 8 | from google.cloud import pubsub_v1 as pubsub 9 | from google.oauth2 import service_account 10 | 11 | # all transactions will occur in these zip codes 12 | ZIPS = ["95136", "95126", "95404", "94929"] 13 | 14 | # handle command-line arguments 15 | # must provide project_id 16 | # can specify values for topic, sub, and service account 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument( 19 | "--project_id", 20 | required=True) 21 | parser.add_argument( 22 | "--topic_id", 23 | default="demo_topic") 24 | parser.add_argument( 25 | "--sub_id", 26 | default="demo_sub") 27 | 28 | known_args, extra_args = parser.parse_known_args() 29 | 30 | # create the topic in the specified project 31 | publisher = pubsub.PublisherClient() 32 | topic_name = 'projects/{project_id}/topics/{topic}'.format( 33 | project_id=known_args.project_id, 34 | topic=known_args.topic_id, # Set this to something appropriate. 
35 | ) 36 | topic_list = publisher.list_topics(project=f"projects/{known_args.project_id}") 37 | if (next((True for x in topic_list if topic_name == x.name), False)): 38 | pass 39 | else: 40 | topic = publisher.create_topic(name=topic_name) 41 | 42 | # create the sub in the specified project 43 | subscriber = pubsub.SubscriberClient() 44 | topic_path = publisher.topic_path(known_args.project_id, known_args.topic_id) 45 | sub_path = subscriber.subscription_path(known_args.project_id, known_args.sub_id) 46 | sub_list = subscriber.list_subscriptions(project=f"projects/{known_args.project_id}") 47 | if (next((True for x in sub_list if sub_path == x.name), False)): 48 | pass 49 | else: 50 | subscriber.create_subscription(request={"name": sub_path, "topic": topic_path}) 51 | 52 | # send a message every second 53 | while True: 54 | time.sleep(1) 55 | pos_id = random.randint(1,9) # there are 10 pos terminals 56 | timestamp = datetime.datetime.now().isoformat() 57 | zip_code = ZIPS[random.randint(0,3)] 58 | amount = round(random.uniform(1.00, 1000.0),2) 59 | body_dict = {"pos_id": pos_id, 60 | "ts": timestamp, 61 | "zip": zip_code, 62 | "sale_amount": amount} 63 | body = json.dumps(body_dict).encode("utf-8") # create a byte array 64 | future = publisher.publish(topic_path, body) 65 | message_id = future.result() 66 | print("Message published") 67 | print(f" - ID: {message_id}") 68 | print(f" - BODY: {body}") -------------------------------------------------------------------------------- /bigquery/exported_billing_data_example.sql: -------------------------------------------------------------------------------- 1 | -- ------------------------------------------------------------ 2 | -- USE CASE (BigQuery-focused): 3 | -- This query helps you understand spend on BigQuery specifically by reading 4 | -- your Google Cloud billing export in BigQuery and subtotaling the different 5 | -- BigQuery cost components (for example: query/analysis, storage, streaming inserts, 6 | -- load jobs, copy jobs, and other SKU-level charges). 7 | -- 8 | -- WHAT IT RETURNS: 9 | -- It returns a subtotaled list of BigQuery cost components and their total USD 10 | -- cost over the selected time window (the query groups SKUs into human-friendly 11 | -- activity buckets and sums the cost for each bucket). 12 | -- 13 | -- HOW IT WORKS: 14 | -- The query reads rows from your billing export table filtered to service = 'BigQuery', 15 | -- extracts the SKU description, maps SKUs into activity groups (analysis/query, storage, 16 | -- streaming, load, copy, etc.), and then aggregates costs per activity. 17 | -- 18 | -- REQUIREMENTS TO USE: 19 | -- - Billing export must be enabled in Google Cloud. 20 | -- - You must use the resource-level billing export (resource-level exports include 21 | -- SKU and resource metadata that this query relies on). 22 | -- - Replace the table placeholder(s) below with your actual export table identifier, 23 | -- for example: `my-billing-project.my_dataset.gcp_billing_export_v1_012345_YYYYMMDD` or 24 | -- the wildcard export table pattern `my-billing-project.my_dataset.gcp_billing_export_v1_*`. 
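--
-- EXAMPLE OUTPUT (shape only -- the figures below are illustrative placeholders,
-- not real billing data):
--   activity            | total_cost_usd
--   Query (Analysis)    | 1234.56
--   Storage             | 210.42
--   Streaming Inserts   | 35.10
--   Other: <sku name>   | 8.75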
25 | -- ------------------------------------------------------------ 26 | WITH 27 | bq_usage AS ( 28 | SELECT 29 | cost, 30 | sku.description AS sku_desc, 31 | service.description AS service_desc, 32 | usage_start_time 33 | FROM 34 | `{{PROJECT_ID}}.{{DATASET}}.{{TABLE}}` 35 | WHERE 36 | service.description = 'BigQuery' 37 | AND usage_start_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 30 DAY) 38 | AND CURRENT_TIMESTAMP() ) 39 | SELECT 40 | activity, 41 | ROUND(SUM(cost), 2) AS total_cost_usd 42 | FROM ( 43 | SELECT 44 | cost, 45 | CASE 46 | WHEN LOWER(sku_desc) LIKE '%analysis%' THEN 'Query (Analysis)' 47 | WHEN LOWER(sku_desc) LIKE '%storage%' THEN 'Storage' 48 | WHEN LOWER(sku_desc) LIKE '%streaming insert%' THEN 'Streaming Inserts' 49 | WHEN LOWER(sku_desc) LIKE '%load job%' THEN 'Load Jobs' 50 | WHEN LOWER(sku_desc) LIKE '%copy%' THEN 'Copy Jobs' 51 | ELSE 'Other: ' || sku_desc 52 | END 53 | AS activity 54 | FROM 55 | bq_usage ) 56 | GROUP BY 57 | activity 58 | ORDER BY 59 | total_cost_usd DESC; -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/schema_defs.py: -------------------------------------------------------------------------------- 1 | from google.cloud import bigquery 2 | 3 | # python client library schema definitions 4 | ccl_messages_schema = [ 5 | bigquery.SchemaField('pos_id', 'INT64', mode='REQUIRED'), 6 | bigquery.SchemaField('ts', 'DATETIME', mode='REQUIRED'), 7 | bigquery.SchemaField('zip', 'STRING', mode='REQUIRED'), 8 | bigquery.SchemaField('sale_amount', 'FLOAT', mode='REQUIRED') 9 | ] 10 | 11 | ccl_messages_nested_schema = [ 12 | bigquery.SchemaField('window_ending', 'DATETIME', mode='REQUIRED'), 13 | bigquery.SchemaField('pos_id', 'INT64', mode='REQUIRED'), 14 | bigquery.SchemaField( 15 | 'transactions', 16 | 'RECORD', 17 | mode='REPEATED', 18 | fields=[ 19 | bigquery.SchemaField('ts', 'DATETIME', mode='REQUIRED'), 20 | bigquery.SchemaField('zip', 'STRING', mode='REQUIRED'), 21 | bigquery.SchemaField('sale_amount', 'FLOAT', mode='REQUIRED') 22 | ] 23 | ) 24 | ] 25 | 26 | # beam bigqueryio schema definitions 27 | beam_messages_schema = { 28 | "fields": [ 29 | { 30 | "name": "pos_id", 31 | "type": "INT64", 32 | "mode": 'REQUIRED' 33 | }, 34 | { 35 | "name": "ts", 36 | "type": "DATETIME", 37 | "mode": 'REQUIRED' 38 | }, 39 | { 40 | "name": "zip", 41 | "type": "STRING", 42 | "mode": 'REQUIRED' 43 | }, 44 | { 45 | "name": "sale_amount", 46 | "type": "FLOAT", 47 | "mode": 'REQUIRED' 48 | } 49 | ] 50 | } 51 | 52 | beam_messages_nested_schema = { 53 | "fields": [ 54 | { 55 | "name": "window_ending", 56 | "type": "DATETIME", 57 | "mode": 'REQUIRED' 58 | }, 59 | { 60 | "name": "pos_id", 61 | "type": "INT64", 62 | "mode": 'REQUIRED' 63 | }, 64 | { 65 | "name": "transactions", 66 | "type": "RECORD", 67 | "mode": 'REPEATED', 68 | "fields": [ 69 | { 70 | "name": "ts", 71 | "type": "DATETIME", 72 | "mode": 'REQUIRED' 73 | }, 74 | { 75 | "name": "zip", 76 | "type": "STRING", 77 | "mode": 'REQUIRED' 78 | }, 79 | { 80 | "name": "sale_amount", 81 | "type": "FLOAT", 82 | "mode": 'REQUIRED' 83 | }, 84 | ] 85 | } 86 | ] 87 | } -------------------------------------------------------------------------------- /bigquery/github_demo.sql: -------------------------------------------------------------------------------- 1 | -- standardSQL 2 | -- search based on array length 3 | -- display full struct and array of struct 4 | SELECT 5 | author, 6 | difference 7 | FROM 8 | `bigquery-public-data.github_repos.commits` 9 | 
WHERE 10 | array_length(difference) = 5 11 | LIMIT 10 12 | 13 | -- standardSQL 14 | -- search based on array length 15 | -- create separate columns from struct properties 16 | SELECT 17 | author.email, 18 | difference 19 | FROM 20 | `bigquery-public-data.github_repos.commits` 21 | WHERE 22 | array_length(difference) = 5 23 | LIMIT 10 24 | 25 | -- standardSQL 26 | -- show correlated cross join and unnest 27 | -- this one row per email/file combo 28 | -- but also include the entire array for each output row 29 | WITH 30 | sample AS ( 31 | SELECT 32 | author.email, 33 | difference 34 | FROM 35 | `bigquery-public-data.github_repos.commits` 36 | WHERE 37 | ARRAY_LENGTH(difference) = 5 38 | LIMIT 39 | 1) 40 | SELECT 41 | email, 42 | difference, 43 | diff.new_path as path 44 | from 45 | sample, 46 | unnest(difference) as diff 47 | 48 | -- standardSQL 49 | -- show correlated cross join and unnest 50 | -- this drop the difference column with the array 51 | WITH 52 | sample AS ( 53 | SELECT 54 | author.email, 55 | difference 56 | FROM 57 | `bigquery-public-data.github_repos.commits` 58 | WHERE 59 | ARRAY_LENGTH(difference) = 5 60 | LIMIT 61 | 1) 62 | SELECT 63 | email, 64 | diff.new_path as path 65 | from 66 | sample, 67 | unnest(difference) as diff 68 | 69 | -- standardSQL 70 | -- find commits where a particular file was touched 71 | -- this shows searching on values within an array 72 | -- by using correlated cross join and filter 73 | SELECT 74 | author, 75 | difference 76 | FROM 77 | `bigquery-public-data.github_repos.commits`, 78 | unnest(difference) as files 79 | WHERE 80 | files.new_path = "courses/data_analysis/lab2/python/is_popular.py" 81 | 82 | -- standardSQL 83 | -- this also shows searching on values within an array 84 | -- this time using subquery in where clause 85 | SELECT 86 | author, 87 | difference 88 | FROM 89 | `bigquery-public-data.github_repos.commits` 90 | WHERE 91 | "courses/data_analysis/lab2/python/is_popular.py" in (select f.new_path from unnest(difference) as f) 92 | 93 | -- standardSQL 94 | -- this is by far the fastest way of the three to search on values in array 95 | -- this avoids the cross join of #1. EXISTS is faster than IN 96 | SELECT 97 | author, 98 | difference 99 | FROM 100 | `bigquery-public-data.github_repos.commits` 101 | WHERE 102 | EXISTS ( 103 | SELECT 104 | * 105 | FROM 106 | UNNEST(difference) AS f 107 | WHERE 108 | f.new_path="courses/data_analysis/lab2/python/is_popular.py") -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .vscode* 3 | snippets* 4 | dflow-bq-stream-python/env/ 5 | dflow-bq-stream-python/env/ 6 | *secrets* 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | pip-wheel-metadata/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | snippets.txt 139 | .streamlit/secrets.toml 140 | .streamlit/streamlit-sa.json 141 | .DS_Store 142 | 143 | # Ignore Terraform state files 144 | *.tfstate 145 | *.tfstate.* 146 | .terraform/ 147 | *.tfvars -------------------------------------------------------------------------------- /bigquery/schema-demo/nested_queries.sql: -------------------------------------------------------------------------------- 1 | -- standardSQL 2 | -- find sales/zip for march from nested_once table 3 | WITH 4 | orders AS ( 5 | SELECT 6 | cust_zip, 7 | prod_price * qty AS line_item_subtotal 8 | FROM 9 | `.bq_demo.nested_once`, 10 | unnest(line_items) 11 | WHERE 12 | order_date >= "2018-03-01" 13 | AND order_date <= "2018-03-31") 14 | SELECT 15 | cust_zip, 16 | SUM(line_item_subtotal) as zip_sales 17 | FROM 18 | orders 19 | GROUP BY 20 | cust_zip 21 | order by 22 | zip_sales desc 23 | 24 | 25 | -- standardSQL 26 | -- find sales/zip for march from nested/partitioned 27 | WITH 28 | orders AS ( 29 | SELECT 30 | cust_zip, 31 | prod_price * qty AS line_item_subtotal 32 | FROM 33 | `.bq_demo.table_nested_partitioned`, 34 | unnest(line_items) 35 | WHERE 36 | order_date >= "2018-03-01" 37 | AND order_date <= "2018-03-31") 38 | SELECT 39 | cust_zip, 40 | SUM(line_item_subtotal) as zip_sales 41 | FROM 42 | orders 43 | GROUP BY 44 | cust_zip 45 | order by 46 | zip_sales desc 47 | 48 | -- standardSQL 49 | -- find sales for 6 months in 8754 from nested 50 | WITH 51 | orders AS ( 52 | SELECT 53 | cust_zip, 54 | prod_price * qty AS line_item_subtotal 55 | FROM 56 | `.bq_demo.nested_once`, 57 | UNNEST(line_items) 58 | WHERE 59 | order_date >= "2018-01-01" 60 | AND order_date <= "2018-06-30" 61 | AND cust_zip=8754) 62 | SELECT 63 
| cust_zip, 64 | SUM(line_item_subtotal) AS zip_sales 65 | FROM 66 | orders 67 | GROUP BY 68 | cust_zip 69 | ORDER BY 70 | zip_sales DESC 71 | 72 | -- standardSQL 73 | -- find for 6 months in 8754 from nested/partitioned 74 | WITH 75 | orders AS ( 76 | SELECT 77 | cust_zip, 78 | prod_price * qty AS line_item_subtotal 79 | FROM 80 | `.bq_demo.table_nested_partitioned`, 81 | UNNEST(line_items) 82 | WHERE 83 | order_date >= "2018-01-01" 84 | AND order_date <= "2018-06-30" 85 | AND cust_zip=8754) 86 | SELECT 87 | cust_zip, 88 | SUM(line_item_subtotal) AS zip_sales 89 | FROM 90 | orders 91 | GROUP BY 92 | cust_zip 93 | ORDER BY 94 | zip_sales DESC 95 | 96 | -- standardSQL 97 | -- find for 6 months in 8754 from nested/partitioned/clustered 98 | WITH 99 | orders AS ( 100 | SELECT 101 | cust_zip, 102 | prod_price * qty AS line_item_subtotal 103 | FROM 104 | `.bq_demo.table_nested_partitioned_clustered`, 105 | UNNEST(line_items) 106 | WHERE 107 | order_date >= "2018-01-01" 108 | AND order_date <= "2018-06-30" 109 | AND cust_zip=8754) 110 | SELECT 111 | cust_zip, 112 | SUM(line_item_subtotal) AS zip_sales 113 | FROM 114 | orders 115 | GROUP BY 116 | cust_zip 117 | ORDER BY 118 | zip_sales DESC -------------------------------------------------------------------------------- /utilities/shopping_list_api/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from fastapi import FastAPI, HTTPException 4 | from google import genai 5 | from pydantic import BaseModel 6 | from google.genai import types 7 | import dotenv 8 | from typing import List 9 | from pydantic import BaseModel 10 | 11 | dotenv.load_dotenv() 12 | 13 | 14 | class Item(BaseModel): 15 | name: str 16 | quantity: int 17 | aisle: str 18 | 19 | 20 | class ShoppingListResponse(BaseModel): 21 | items: List[Item] 22 | 23 | 24 | schema_config = { 25 | "response_mime_type": "application/json", 26 | "response_schema": list[Item], # <— Gemini will use this schema 27 | } 28 | 29 | 30 | from fastapi.responses import JSONResponse, FileResponse 31 | from fastapi.requests import Request 32 | from fastapi.staticfiles import StaticFiles 33 | 34 | # The app will listen on port 8080 (see Dockerfile CMD) 35 | app = FastAPI() 36 | 37 | # Mount static directory for favicon 38 | import pathlib 39 | 40 | static_dir = pathlib.Path(__file__).parent / "static" 41 | static_dir.mkdir(exist_ok=True) 42 | app.mount("/static", StaticFiles(directory=static_dir), name="static") 43 | 44 | 45 | @app.exception_handler(404) 46 | async def not_found_handler(request: Request, exc): 47 | return JSONResponse(status_code=404, content={"detail": "Not Found"}) 48 | 49 | 50 | # Initialize the Gen AI client for Vertex AI 51 | client = genai.Client( 52 | vertexai=True, 53 | project=os.environ["GOOGLE_CLOUD_PROJECT"], 54 | location=os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1"), 55 | ) 56 | 57 | 58 | @app.get("/get-list", response_model=ShoppingListResponse) 59 | async def shopping_list(): 60 | prompt = ( 61 | "Think of an American household with n adults and m children." 62 | "Think of an event that this household might be preparing for" 63 | "Generate a Costco shopping list of 5-10 items for that household" 64 | "The list should be in JSON format as an array of objects. " 65 | "Each object must have fields: name (string), quantity (integer), aisle (string)." 66 | "Return the list as a JSON array. Don't include any additional text or formatting." 
67 | ) 68 | 69 | # Call the Gemini model via the Gen AI SDK 70 | try: 71 | response = client.models.generate_content( 72 | model="gemini-2.5-flash", 73 | contents=prompt, 74 | config=schema_config, 75 | ) 76 | except Exception as e: 77 | raise HTTPException(status_code=500, detail=f"GenAI API error: {e}") 78 | 79 | # Parse the returned text as JSON 80 | raw = response.text 81 | try: 82 | items = json.loads(raw) 83 | except json.JSONDecodeError as e: 84 | raise HTTPException( 85 | status_code=500, 86 | detail=f"Failed to parse JSON from GenAI response: {e}", 87 | ) 88 | 89 | return {"items": items} 90 | -------------------------------------------------------------------------------- /functions/cat-bq-completions.py: -------------------------------------------------------------------------------- 1 | import json 2 | import base64 3 | from datetime import datetime 4 | from google.cloud import storage 5 | import functions_framework 6 | 7 | 8 | @functions_framework.cloud_event 9 | def process_bigquery_job_completion(cloud_event): 10 | """ 11 | Processes BigQuery job completion events from Pub/Sub and writes details to Cloud Storage. 12 | 13 | Triggered by Pub/Sub messages containing BigQuery audit logs. 14 | """ 15 | 16 | # Log the incoming event for debugging 17 | log_entry = None 18 | # Try to handle different event structures 19 | if "data" in cloud_event.data: 20 | # Standard Pub/Sub trigger 21 | try: 22 | message_data = base64.b64decode(cloud_event.data["data"]).decode("utf-8") 23 | log_entry = json.loads(message_data) 24 | except Exception as e: 25 | print(f"Error decoding message: {e}") 26 | return 27 | elif "message" in cloud_event.data and "data" in cloud_event.data["message"]: 28 | # Eventarc Pub/Sub trigger 29 | try: 30 | message_data = base64.b64decode(cloud_event.data["message"]["data"]).decode( 31 | "utf-8" 32 | ) 33 | log_entry = json.loads(message_data) 34 | except Exception as e: 35 | print(f"Error decoding message: {e}") 36 | return 37 | else: 38 | print("No data found in cloud event. Event structure:", cloud_event.data) 39 | return 40 | 41 | try: 42 | 43 | # Write to Cloud Storage 44 | write_to_cloud_storage( 45 | bucket_name="jwd-gcp-demos", 46 | file_path=f"bigquery-jobs/{datetime.now().strftime('%Y/%m/%d')}/job-completions.log", 47 | content=json.dumps(log_entry, indent=2), 48 | ) 49 | 50 | print("Successfully processed BigQuery job completion event") 51 | 52 | except Exception as e: 53 | print(f"Error processing BigQuery job completion: {str(e)}") 54 | print(f"Raw log entry: {json.dumps(log_entry, indent=2)}") 55 | 56 | 57 | def write_to_cloud_storage(bucket_name, file_path, content): 58 | """ 59 | Write content to Cloud Storage, overwriting the object at file_path if it already exists. 
60 | """ 61 | try: 62 | # Initialize the Cloud Storage client 63 | storage_client = storage.Client() 64 | bucket = storage_client.bucket(bucket_name) 65 | blob = bucket.blob(file_path) 66 | 67 | # Upload the updated content 68 | blob.upload_from_string(content, content_type="application/json") 69 | 70 | print(f"Successfully wrote job data to gs://{bucket_name}/{file_path}") 71 | 72 | except Exception as e: 73 | print(f"Error writing to Cloud Storage: {str(e)}") 74 | raise 75 | 76 | 77 | # Optional: Add requirements.txt content 78 | """ 79 | requirements.txt: 80 | google-cloud-storage>=2.10.0 81 | functions-framework>=3.0.0 82 | """ 83 | -------------------------------------------------------------------------------- /utilities/shopping_list_api/README.md: -------------------------------------------------------------------------------- 1 | # Shopping List API server 2 | 3 | Silly little demo API that returns a generated shopping list (JSON). The service 4 | calls Vertex AI's GenAI (Gemini) via the `google-genai` SDK and exposes a single 5 | HTTP GET endpoint that returns a list of items suitable for demo/testing. 6 | 7 | ## What this does 8 | 9 | - Starts a FastAPI server (listens on port 8080 by default). 10 | - Calls a Gemini model to generate a small shopping list in JSON. 11 | - Returns the list as: `{ "items": [ {"name":..., "quantity":..., "aisle":...}, ... ] }`. 12 | 13 | ## Files of interest 14 | 15 | - `main.py` - FastAPI app and GenAI client usage. 16 | - `pyproject.toml` - project metadata and dependencies. 17 | - `Dockerfile` - container image build for running the app in Docker. 18 | 19 | ## Prerequisites 20 | 21 | - Python 3.12 (the project `pyproject.toml` requests >=3.12) or Docker. 22 | - A Google Cloud project with Vertex AI enabled and a service account that has 23 | permission to call Vertex AI. 24 | - The `GOOGLE_CLOUD_PROJECT` environment variable must be set. 25 | 26 | Optional but recommended environment variables in a `.env` file: 27 | 28 | ``` 29 | GOOGLE_CLOUD_PROJECT=your-gcp-project-id 30 | GOOGLE_CLOUD_LOCATION=us-central1 31 | ``` 32 | 33 | If running locally, authenticate with `gcloud auth application-default login` 34 | or set `GOOGLE_APPLICATION_CREDENTIALS` to a service account JSON key. 35 | 36 | ## Run with Docker (recommended) 37 | 38 | Build the image from the `shopping_list_api` directory and run it: 39 | 40 | ```bash 41 | docker build -t shopping-list-api . 42 | docker run -p 8080:8080 \ 43 | -e GOOGLE_CLOUD_PROJECT=your-gcp-project-id \ 44 | -e GOOGLE_CLOUD_LOCATION=us-central1 \ 45 | shopping-list-api 46 | ``` 47 | 48 | The server will be reachable at `http://localhost:8080`. 49 | 50 | ## Run locally (development) 51 | 52 | 1. Export env vars (or create a `.env` file) and run: 53 | 54 | ```bash 55 | export GOOGLE_CLOUD_PROJECT=your-gcp-project-id 56 | export GOOGLE_CLOUD_LOCATION=us-central1 57 | uv run uvicorn main:app --host 0.0.0.0 --port 8080 --reload 58 | ``` 59 | 60 | ## API 61 | 62 | GET /get-list 63 | 64 | - Description: Generates a Costco-style shopping list using Gemini and returns 65 | it as JSON. 66 | - Response schema: 67 | 68 | ```json 69 | { 70 | "items": [ 71 | {"name": "string", "quantity": 1, "aisle": "string"} 72 | ] 73 | } 74 | ``` 75 | 76 | Example curl: 77 | 78 | ```bash 79 | curl -s http://localhost:8080/get-list | jq 80 | ``` 81 | 82 | Expected behaviors and error cases: 83 | 84 | - If the GenAI call fails, the server returns HTTP 500 with a message. 
85 | - If the model returns non-JSON or invalid JSON, the server returns HTTP 500. 86 | 87 | ## Environment / Authentication notes 88 | 89 | - When deployed on GCE/GKE/Cloud Run with the correct service account, the app 90 | will use Workload Identity / service account credentials automatically. 91 | - For local testing, set `GOOGLE_APPLICATION_CREDENTIALS` to a service account 92 | key file that has permission to call Vertex AI, or run 93 | `gcloud auth application-default login`. 94 | -------------------------------------------------------------------------------- /composer/dags/bq_export_strategies.py: -------------------------------------------------------------------------------- 1 | # Purpose: 2 | # Demonstrates multiple approaches to exporting BigQuery data to GCS in Airflow. 3 | # Highlights that there are often many ways to accomplish tasks in Airflow. 4 | # 5 | # Preparation Needed: 6 | # - Create an Airflow variable for your GCP project ID. 7 | # - Ensure a GCS bucket exists with the same name as your project. 8 | # - Install required dependencies in your Composer environment (BigQuery, GCS, pandas). 9 | 10 | from datetime import datetime, timedelta 11 | 12 | from airflow import DAG 13 | from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator 14 | from airflow.providers.google.cloud.transfers.bigquery_to_gcs import ( 15 | BigQueryToGCSOperator, 16 | ) 17 | from airflow.operators.python import PythonOperator 18 | from airflow.models import Variable 19 | from google.cloud import bigquery 20 | 21 | import pandas as pd 22 | import logging 23 | 24 | # Default arguments for the DAG 25 | default_args = { 26 | "owner": "data-team", 27 | "depends_on_past": False, 28 | "start_date": datetime(2024, 1, 1), 29 | } 30 | 31 | 32 | # Read GCP project and bucket from Airflow Variables 33 | SOURCE_PROJECT_ID = "roi-bq-demos" 34 | GCS_BUCKET = Variable.get("gcp_project_id") 35 | DATASET_ID = "bq_demo" 36 | TABLE_ID = "product" 37 | OUTPUT_FILE = "product_export.json" 38 | 39 | # DAG Definition 40 | dag = DAG( 41 | "export_product_table_strategies", 42 | default_args=default_args, 43 | description="Export product table from BigQuery to GCS using multiple strategies", 44 | schedule_interval=None, 45 | catchup=False, 46 | tags=["bigquery", "gcs", "product", "multi-strategy"], 47 | ) 48 | 49 | 50 | # 1. Export using BigQueryInsertJobOperator (EXPORT DATA) 51 | export_with_insertjob = BigQueryInsertJobOperator( 52 | task_id="export_with_insertjob", 53 | configuration={ 54 | "query": { 55 | "query": ( 56 | f""" 57 | EXPORT DATA OPTIONS( 58 | uri='gs://{GCS_BUCKET}/product/airflow_export_with_insertjob/*.json', 59 | format='JSON' 60 | ) AS 61 | SELECT * FROM `{SOURCE_PROJECT_ID}.{DATASET_ID}.{TABLE_ID}` 62 | """ 63 | ), 64 | "useLegacySql": False, 65 | } 66 | }, 67 | location="US", 68 | dag=dag, 69 | ) 70 | 71 | # 2. 
Export as Parquet using BigQueryToGCSOperator 72 | export_with_parquet = BigQueryToGCSOperator( 73 | task_id="export_with_parquet", 74 | source_project_dataset_table=f"{SOURCE_PROJECT_ID}.{DATASET_ID}.{TABLE_ID}", 75 | destination_cloud_storage_uris=[ 76 | f"gs://{GCS_BUCKET}/product/airflow_export_with_ToGCS.parquet" 77 | ], 78 | export_format="PARQUET", 79 | dag=dag, 80 | ) 81 | 82 | 83 | def export_with_custom_logic(**context): 84 | client = bigquery.Client() 85 | 86 | # Query the table 87 | query = f"SELECT * FROM `{SOURCE_PROJECT_ID}.{DATASET_ID}.{TABLE_ID}`" 88 | df = client.query(query).to_dataframe() 89 | 90 | # Custom processing 91 | # df_processed = df.apply() 92 | df_processed = df 93 | 94 | # Export to various destinations 95 | df_processed.to_parquet(f"gs://{GCS_BUCKET}/product/airflow_export_custom.parquet") 96 | 97 | 98 | # 3. Export as Parquet using BigQueryToGCSOperator 99 | custom_export = PythonOperator( 100 | task_id="custom_export", python_callable=export_with_custom_logic, dag=dag 101 | ) 102 | -------------------------------------------------------------------------------- /ai/del_endpoints.py: -------------------------------------------------------------------------------- 1 | import google.auth 2 | import logging 3 | from google.cloud import aiplatform_v1 4 | from google.api_core import exceptions as api_exceptions 5 | 6 | logging.basicConfig(level=logging.INFO) 7 | logger = logging.getLogger(__name__) 8 | 9 | creds, project = google.auth.default() 10 | location = "us-central1" 11 | 12 | 13 | def main(): 14 | client_options = {"api_endpoint": f"{location}-aiplatform.googleapis.com"} 15 | client = aiplatform_v1.EndpointServiceClient(client_options=client_options) 16 | 17 | parent = f"projects/{project}/locations/{location}" 18 | 19 | # List endpoints 20 | try: 21 | endpoints = client.list_endpoints(request={"parent": parent}) 22 | except api_exceptions.GoogleAPICallError as e: 23 | logger.error(f"Failed to list endpoints: {e}") 24 | return 25 | 26 | any_endpoints = False 27 | for ep in endpoints: 28 | any_endpoints = True 29 | logger.info( 30 | f"Processing endpoint: name={ep.name}, display_name={ep.display_name}" 31 | ) 32 | 33 | # Undeploy any deployed models from this endpoint 34 | deployed = getattr(ep, "deployed_models", None) or [] 35 | if deployed: 36 | for dm in deployed: 37 | # DeployedModel proto has an 'id' field which is the deployed_model_id 38 | deployed_model_id = getattr(dm, "id", None) or getattr( 39 | dm, "deployed_model_id", None 40 | ) 41 | if not deployed_model_id: 42 | logger.warning( 43 | f"Could not determine deployed model id for deployed model: {dm}" 44 | ) 45 | continue 46 | 47 | logger.info( 48 | f"Undeploying deployed_model_id={deployed_model_id} from endpoint={ep.name}" 49 | ) 50 | try: 51 | op = client.undeploy_model( 52 | request={ 53 | "endpoint": ep.name, 54 | "deployed_model_id": deployed_model_id, 55 | } 56 | ) 57 | logger.info("Waiting for undeploy operation to complete...") 58 | op.result(timeout=300) 59 | logger.info(f"Undeployed {deployed_model_id} from {ep.name}") 60 | except api_exceptions.GoogleAPICallError as e: 61 | logger.error( 62 | f"Failed to undeploy model {deployed_model_id} from {ep.name}: {e}" 63 | ) 64 | except Exception as e: 65 | logger.error( 66 | f"Unexpected error undeploying {deployed_model_id} from {ep.name}: {e}" 67 | ) 68 | else: 69 | logger.info("No deployed models found on this endpoint") 70 | 71 | # Delete the endpoint 72 | logger.info(f"Deleting endpoint: {ep.name}") 73 | try: 74 | del_op = 
client.delete_endpoint(request={"name": ep.name}) 75 | logger.info("Waiting for delete operation to complete...") 76 | del_op.result(timeout=300) 77 | logger.info(f"Deleted endpoint {ep.name}") 78 | except api_exceptions.GoogleAPICallError as e: 79 | logger.error(f"Failed to delete endpoint {ep.name}: {e}") 80 | except Exception as e: 81 | logger.error(f"Unexpected error deleting endpoint {ep.name}: {e}") 82 | 83 | if not any_endpoints: 84 | logger.info("No endpoints found.") 85 | 86 | 87 | if __name__ == "__main__": 88 | main() 89 | -------------------------------------------------------------------------------- /bigquery/schema-demo/README.md: -------------------------------------------------------------------------------- 1 | # ROI BigQuery Schema performance demo 2 | 3 | ## Goal 4 | The goal of this demo is to show the relative performance of different schema choices in BigQuery. 5 | 6 | After setup, the same data will be stored in tables with varying schemas: 7 | 8 | - Normalized w/multiple tables 9 | - Fully denormalized 10 | - Single-level nesting (one row per order) 11 | - Single-level nesting with partitioning 12 | - Single-level nesting with partitioning and clustering 13 | 14 | You can demo the changes in query performance and cost with these different schemas. 15 | 16 | Incidentally, this also provides the potential to demo Dataflow, BigQuery load techniques, doing intra-BigQuery ETL, etc. 17 | 18 | ## Setup 19 | 20 | This demo uses the normalized dataset **roi-bq-demos.bq_demo**, which has the following tables: 21 | - customer: 75M rows 22 | - product: 10K rows 23 | - order: 7.5B rows 24 | - line_item: 75B rows 25 | - order_part: 7.5B rows (partitioned on `order_date`) 26 | 27 | 28 | This is the largest dataset we can reasonably store for instructor use. If you want a larger dataset, you can create your own - the pieces you need are found in this directory. 29 | 30 | 1. Log into the cloud console, and select an appropriate project. Queries used to derive the non-normalized tables and do the demos are expensive, so choose your project wisely. 31 | 32 | 2. In your target project, make sure that there is a dataset named `bq_demo` (create it if necessary). 33 | 34 | 3. Run the query in **load_data.sql**. This will take about 70 minutes, cost $200, and create four new tables in your target project/dataset: 35 | - denorm 36 | - nested_once 37 | - table_nested_partitioned 38 | - table_nested_partitioned_clustered 39 | 40 | ## Demo 41 | 42 | Load the BQ user interface in the project where you have the dataset. 43 | 44 | 1. Run the `base normalized tables` query found in **norm_query.sql** 45 | * Note the amount of data processed 46 | * Note the structure of the query 47 | * Note the time taken to complete the query 48 | * Example run: 2TB, 140 seconds, $12.50 49 | 50 | 2. Run the `normalized tables with order_part table` query found in **norm_query.sql** 51 | * Note the amount of data processed 52 | * Note the structure of the query 53 | * Note the time taken to complete the query 54 | * Example run: 1.8TB, 125 seconds, $11.25 55 | * Note that partitioning did reduce the data read, and the time taken, but the win was minimal given the size of the line_item table -> this is the gating factor. You can show the execution graph to drive this home. 56 | 57 | 3. 
Run the query found in **denorm_query.sql** 58 | * Note the amount of data processed 59 | * Note the structure of the query 60 | * Note the time taken to complete the query 61 | * Note that you get the same results as in step 1 62 | * Example run: 2.2TB, 24 seconds, $13.75 63 | 64 | 4. Run the first query found in **nested_queries.sql** 65 | * Note the amount of data processed 66 | * Note the structure of the query 67 | * Note the time taken to complete the query 68 | * Note that you get the same results as in step 1 69 | * Example run: 1.2TB, 6.4 seconds, $7.50 70 | 71 | 5. Run the second query found in **nested_queries.sql** 72 | * Note the amount of data processed 73 | * Note the structure of the query 74 | * Note the time taken to complete the query 75 | * Note that you get the same results as in step 1 76 | * Example run: xTB, x seconds, $x 77 | 78 | 6. Run queries 3-4-5 from **nested_queries.sql** 79 | * Note the amount of data processed for each 80 | * Note the query time for each 81 | * You should see decreases for each one 82 | * Example runs 83 | * 1.2TB, 6.3 seconds, $7 84 | * 614G, 3.8 seconds, $3 85 | * 11GB, 3.2 seconds, $.06 -------------------------------------------------------------------------------- /composer/dags/export_top_customers.py: -------------------------------------------------------------------------------- 1 | # Example Scenario: 2 | # A marketing analyst wants to identify the top customers for the week of Christmas. 3 | # This DAG exports top customers from BigQuery to a GCS bucket to be fed to a marketing campaign. 4 | # 5 | # Prerequisites (see the setup sketch at the end of this header): 6 | # 1. Create an Airflow variable for your project: gcp_project_id 7 | # 2. Create a 'marketing' dataset in BigQuery. 8 | # 3. Ensure a GCS bucket exists with the project name. 9 | # 10 | # Demo Instructions: 11 | # 1. Place this DAG in your Airflow DAGs folder. 12 | # 2. Trigger the DAG in the Airflow UI. 13 | # 3. Check the output table in BigQuery. 14 | # 4. Verify the exported file in the GCS bucket.
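#
# For reference, a sketch of the prerequisite setup and post-run checks from a terminal
# (the project ID, dataset, and bucket names below are placeholders -- substitute your own):
#
#   airflow variables set gcp_project_id your-project-id
#   bq mk --dataset your-project-id:marketing
#   gsutil mb gs://your-project-id
#
#   # after a successful run:
#   bq head -n 5 your-project-id:marketing.high_value_customers
#   gsutil cat gs://your-project-id/high-value-customers.csv | head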
15 | 16 | from datetime import datetime 17 | 18 | from airflow import DAG 19 | from airflow.models import Variable 20 | from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator 21 | from airflow.providers.google.cloud.transfers.bigquery_to_gcs import ( 22 | BigQueryToGCSOperator, 23 | ) 24 | 25 | # Example 1: Export High-Value Customers to Cloud Storage 26 | high_value_customers_dag = DAG( 27 | "export_high_value_customers", 28 | default_args={"owner": "marketing-team", "start_date": datetime(2024, 1, 1)}, 29 | description="Find high-value customers and export to GCS for campaign targeting", 30 | catchup=False, 31 | tags=["marketing", "export"], 32 | ) 33 | 34 | # Read GCP project ID from Airflow Variable 35 | PROJECT_ID = Variable.get("gcp_project_id") 36 | 37 | # Query to identify high-value customers 38 | identify_high_value_customers = BigQueryInsertJobOperator( 39 | task_id="find_high_value_customers", 40 | configuration={ 41 | "query": { 42 | "query": ( 43 | """ 44 | WITH 45 | christmas_orders AS ( 46 | SELECT 47 | order_num, 48 | cust_id 49 | FROM 50 | `roi-bq-demos.bq_demo_small.order` o 51 | WHERE 52 | o.order_date >= "2018-12-18" 53 | AND o.order_date <= "2018-12-23" ) 54 | SELECT 55 | c.cust_id, 56 | SUM(li.qty * p.prod_price) AS total_purchases 57 | FROM 58 | christmas_orders co 59 | JOIN 60 | roi-bq-demos.bq_demo_small.line_item li 61 | ON 62 | co.order_num = li.order_num 63 | JOIN 64 | roi-bq-demos.bq_demo_small.customer c 65 | ON 66 | c.cust_id = co.cust_id 67 | JOIN 68 | roi-bq-demos.bq_demo_small.product p 69 | ON 70 | p.prod_code = li.prod_code 71 | GROUP BY 72 | c.cust_id 73 | ORDER BY 74 | total_purchases desc 75 | LIMIT 100 76 | """ 77 | ), 78 | "useLegacySql": False, 79 | "destinationTable": { 80 | "projectId": PROJECT_ID, 81 | "datasetId": "marketing", 82 | "tableId": "high_value_customers", 83 | }, 84 | "writeDisposition": "WRITE_TRUNCATE", 85 | } 86 | }, 87 | dag=high_value_customers_dag, 88 | ) 89 | 90 | # Export results to Cloud Storage for marketing campaign 91 | export_to_gcs = BigQueryToGCSOperator( 92 | task_id="export_customer_list", 93 | source_project_dataset_table=f"{PROJECT_ID}.marketing.high_value_customers", 94 | destination_cloud_storage_uris=[f"gs://{PROJECT_ID}/high-value-customers.csv"], 95 | export_format="CSV", 96 | dag=high_value_customers_dag, 97 | ) 98 | 99 | identify_high_value_customers >> export_to_gcs 100 | -------------------------------------------------------------------------------- /bigquery/schema-demo/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | read -p "project for data: " project 4 | read -p "source bucket: " bucket 5 | 6 | bq load --source_format=CSV --replace $project:bq_demo.customer gs://$bucket/bq-demo/customer* ./customer_schema.json 7 | bq load --source_format=CSV --replace $project:bq_demo.order gs://$bucket/bq-demo/order* ./order_schema.json 8 | bq load --source_format=CSV --replace $project:bq_demo.product gs://$bucket/bq-demo/product* ./product_schema.json 9 | bq load --source_format=CSV --replace $project:bq_demo.line_item gs://$bucket/bq-demo/line_item* ./line_item_schema.json 10 | 11 | bq query --use_legacy_sql=false --replace --destination_table=$project:bq_demo.denorm ' 12 | SELECT 13 | c.*, 14 | o.order_num as order_num, 15 | order_date, 16 | line_item_num, 17 | li.prod_code as prod_code, 18 | qty, 19 | prod_name, 20 | prod_desc, 21 | prod_price 22 | FROM 23 | '"\`$project.bq_demo.customer\`"' c 24 | LEFT JOIN 
25 | '"\`$project.bq_demo.order\`"' o 26 | ON 27 | c.cust_id = o.cust_id 28 | LEFT JOIN 29 | '"\`$project.bq_demo.line_item\`"' AS li 30 | ON 31 | o.order_num = li.order_num 32 | LEFT JOIN 33 | '"\`$project.bq_demo.product\`"' AS p 34 | ON 35 | li.prod_code = p.prod_code' 36 | 37 | bq query --use_legacy_sql=false --replace --destination_table=$project:bq_demo.nested_once ' 38 | WITH 39 | dlow AS ( 40 | SELECT 41 | * 42 | FROM 43 | '"\`$project.bq_demo.denorm\`"' 44 | ) 45 | SELECT 46 | cust_id, 47 | cust_name, 48 | cust_address, 49 | cust_state, 50 | cust_zip, 51 | cust_email, 52 | cust_phone, 53 | order_num, 54 | order_date, 55 | ARRAY_AGG( STRUCT(line_item_num, 56 | prod_code, 57 | qty, 58 | prod_name, 59 | prod_desc, 60 | prod_price)) as line_items 61 | FROM 62 | dlow 63 | GROUP BY 64 | order_num, 65 | order_date, 66 | cust_phone, 67 | cust_email, 68 | cust_zip, 69 | cust_state, 70 | cust_address, 71 | cust_name, 72 | cust_id' 73 | 74 | bq query --use_legacy_sql=false \ 75 | --replace \ 76 | --destination_table $project:bq_demo.table_nested_partitioned \ 77 | --time_partitioning_field order_date \ 78 | 'SELECT * FROM '"\`$project.bq_demo.nested_once\`" 79 | 80 | bq query --use_legacy_sql=false 'CREATE OR REPLACE TABLE 81 | '"\`$project.bq_demo.table_nested_partitioned_clustered\`"' 82 | PARTITION BY order_date 83 | CLUSTER BY cust_zip AS 84 | SELECT * FROM '"\`$project.bq_demo.nested_once\`" 85 | 86 | # bq query --use_legacy_sql=false \ 87 | # --replace \ 88 | # --destination_table=$project:bq_demo.nested_twice ' 89 | # WITH 90 | # dlow AS ( 91 | # SELECT 92 | # * 93 | # FROM 94 | # '"\`$project.bq_demo.denorm\`"' ), 95 | # orders AS ( 96 | # SELECT 97 | # cust_id, 98 | # cust_name, 99 | # cust_address, 100 | # cust_state, 101 | # cust_zip, 102 | # cust_email, 103 | # cust_phone, 104 | # order_num, 105 | # order_date, 106 | # ARRAY_AGG( STRUCT(line_item_num, 107 | # prod_code, 108 | # qty, 109 | # prod_name, 110 | # prod_desc, 111 | # prod_price)) AS line_items 112 | # FROM 113 | # dlow 114 | # GROUP BY 115 | # order_num, 116 | # order_date, 117 | # cust_phone, 118 | # cust_email, 119 | # cust_zip, 120 | # cust_state, 121 | # cust_address, 122 | # cust_name, 123 | # cust_id) 124 | # SELECT 125 | # cust_phone, 126 | # cust_email, 127 | # cust_zip, 128 | # cust_state, 129 | # cust_address, 130 | # cust_name, 131 | # cust_id, 132 | # ARRAY_AGG( STRUCT( order_num, 133 | # order_date, 134 | # line_items )) AS orders 135 | # FROM 136 | # orders 137 | # GROUP BY 138 | # cust_id, 139 | # cust_phone, 140 | # cust_email, 141 | # cust_zip, 142 | # cust_state, 143 | # cust_address, 144 | # cust_name' -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/README.md: -------------------------------------------------------------------------------- 1 | # Dataflow Python Streaming Demo 2 | 3 | ## Purpose 4 | 5 | On the surface, this is a very simple demo intended to show Dataflow streaming 6 | using the Python SDK. 
7 | 8 | There are a few fun little tricks and techniques involved, including: 9 | 10 | * Checking for topic/sub existence 11 | * Encoding objects into messages 12 | * Reading the end time of Beam windows 13 | * Checking for dataset/table existence in BQ 14 | * Streaming nested/repeated data into BQ 15 | 16 | ## Setup 17 | 18 | Setup will do the following: 19 | 20 | * Create and activate a python virtual environment 21 | * Install dependencies for both pieces of the demo code 22 | * Create a service account and make it an editor 23 | * Create and download a key file for the service account 24 | * Set an environment variable that will point the code to the service account keyfile 25 | * Create a bucket that is used by the dataflow job 26 | * Enable the BQ, PubSub, and Dataflow services 27 | 28 | 1. Clone the repo, and change directories to `dflow-bq-stream-python`: 29 | 30 | ```bash 31 | git clone https://github.com/roitraining/gcp-demos.git 32 | cd gcp-demos/dataflow/dflow-bq-stream-python 33 | ``` 34 | 35 | 2. Make sure that **gcloud** is configured to point to the project you want to work in. 36 | 37 | 3. Make sure you are running Python 3.12.x or earlier (there's a conflict between Beam and 3.13) 38 | ```bash 39 | pyenv local 3.12 40 | ``` 41 | 42 | 4. Run the `setup.sh` script providing the name of the service account you want the demo code to use: 43 | 44 | ```bash 45 | . ./setup.sh df-demo-sa 46 | ``` 47 | 48 | ## Sending events 49 | 50 | This part of the demo sends a stream of events to a Pub/Sub topic, 51 | one per second. 52 | 53 | 5. If the pipeline is running in Dataflow, run the `send_events` script: 54 | 55 | ```bash 56 | python send_events.py \ 57 | --project_id=$PROJECT_ID 58 | ``` 59 | 60 | You may optionally change the topic and subscription names if you like by 61 | specifying additional arguments: `topic_id` and `sub_id`. 62 | 63 | 64 | 65 | ## Starting the pipeline 66 | 67 | This section creates a Dataflow job which reads the messages from a PubSub 68 | subscription, writes all messages into a `messages` table, and also windows the 69 | messages and writes nested/repeated rows for each window into a 70 | `messages_nested` table. 71 | 72 | 73 | 6. Open a 2nd terminal window. 74 | 7. Submit the job to the Dataflow service: 75 | ```bash 76 | export PROJECT_ID=$(gcloud config get-value project) 77 | cd gcp-demos/dataflow/dflow-bq-stream-python 78 | source .venv/bin/activate 79 | python process_events.py \ 80 | --runner DataflowRunner \ 81 | --region us-central1 \ 82 | --project $PROJECT_ID \ 83 | --staging_location gs://$PROJECT_ID-dflow-demo/ \ 84 | --temp_location gs://$PROJECT_ID-dflow-demo/ 85 | ``` 86 | 87 | 88 | 89 | 90 | > [!NOTE] 91 | > If you want to adjust the fixed window size from 10 seconds, you can provide 92 | > an optional command-line argument `--window-size` (defined in seconds). 93 | > 94 | > If you want to run the pipeline locally, the command looks like this: 95 | > ```bash 96 | > python3 process_events.py \ 97 | > --project $PROJECT_ID 98 | > ``` 99 | 100 | ## Checking out results 101 | 102 | 1. It can take 5+ minutes until messages start flowing through the pipeline 103 | 1. Check out the pipeline in Dataflow (if running there) 104 | 1. Note the branch and two different sinks 105 | 2. Note that windowing happens after several transforms 106 | 3. Note the aggregation 107 | 108 | 1. Check out the **messages** table in BQ and see all the individual messages 109 | 1. 
Check out the **messages_nested** table in BQ and see the nested data 110 | 111 | ## Cleaning up 112 | 113 | 1. Stop the Dataflow job 114 | 2. Stop the process sending events 115 | 3. Delete the Pub/Sub assets 116 | 4. Delete the BigQuery assets 117 | 5. Delete your service account 118 | 6. Delete your bucket -------------------------------------------------------------------------------- /bigquery/elt_examples.sql: -------------------------------------------------------------------------------- 1 | -- This file contains a list of SQL snippets illustrating various data sanitization techniques 2 | -- that can be performed entirely within BigQuery. Each example demonstrates a different approach 3 | -- to cleaning and standardizing raw data using SQL functions and expressions. 4 | 5 | -- Cleans the sale_amount field by safely casting it to NUMERIC; invalid values become NULL. 6 | SELECT 7 | transaction_id, 8 | -- Attempt to cast to NUMERIC. If it fails, SAFE_CAST returns NULL. 9 | SAFE_CAST(sale_amount AS NUMERIC) AS clean_sale_amount, 10 | customer_email, 11 | order_date, 12 | product_code, 13 | quantity, 14 | status 15 | FROM 16 | my_dataset.raw_data_staging; 17 | 18 | -- Parses common order_date string formats into a single DATE column called clean_order_date. 19 | -- Returns NULL if none of the formats match; prefers ISO '%Y-%m-%d', then '%m/%d/%Y', then '%d-%b-%Y'. 20 | SELECT 21 | transaction_id, 22 | sale_amount, 23 | customer_email, 24 | -- Attempt to parse common date formats. 25 | -- Prioritize the most common/desired format. 26 | COALESCE( 27 | SAFE.PARSE_DATE('%Y-%m-%d', order_date), -- '2023-01-15' 28 | SAFE.PARSE_DATE('%m/%d/%Y', order_date), -- '01/15/2023' 29 | SAFE.PARSE_DATE('%d-%b-%Y', order_date) -- '15-Jan-2023' 30 | ) AS clean_order_date, 31 | product_code, 32 | quantity, 33 | status 34 | FROM 35 | my_dataset.raw_data_staging; 36 | 37 | -- Trims leading/trailing whitespace from string fields and converts empty strings to NULL 38 | -- (produces normalized clean_customer_email and clean_product_code columns). 39 | SELECT 40 | transaction_id, 41 | sale_amount, 42 | -- Trim whitespace and convert empty strings to NULL 43 | NULLIF(TRIM(customer_email), '') AS clean_customer_email, 44 | order_date, 45 | -- Trim whitespace and convert empty strings to NULL 46 | NULLIF(TRIM(product_code), '') AS clean_product_code, 47 | quantity, 48 | status 49 | FROM 50 | my_dataset.raw_data_staging; 51 | 52 | -- Validates and standardizes the status field: keeps expected values, maps 'Done' -> 'Completed', 53 | -- and falls back to 'Unknown' (or NULL if you prefer) for unexpected values. 54 | SELECT 55 | transaction_id, 56 | sale_amount, 57 | customer_email, 58 | order_date, 59 | product_code, 60 | quantity, 61 | -- Validate and standardize status values 62 | CASE 63 | WHEN status IN ('Completed', 'Pending', 'Cancelled') THEN status 64 | WHEN status = 'Done' THEN 'Completed' -- Standardize 'Done' to 'Completed' 65 | ELSE 'Unknown' -- Or NULL, depending on your requirement 66 | END AS clean_status 67 | FROM 68 | my_dataset.raw_data_staging; 69 | 70 | 71 | -- Combined example: composes earlier cleaning steps (SAFE_CAST for numbers, TRIM/NULLIF for strings, 72 | -- DATE parsing with SAFE.PARSE_DATE, and status normalization), then applies filtering and deduplication. 73 | -- Use this as a ready-to-run pattern for end-to-end in-query sanitization before loading or analytics. 
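--
-- For context, the staging table these examples assume looks roughly like the sketch below
-- (the column names come from the queries above; declaring every column as STRING is an
-- assumption meant to mimic messy, untyped source data):
--
-- CREATE TABLE IF NOT EXISTS my_dataset.raw_data_staging (
--   transaction_id STRING,
--   sale_amount    STRING,
--   customer_email STRING,
--   order_date     STRING,
--   product_code   STRING,
--   quantity       STRING,
--   status         STRING
-- );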
74 | SELECT 75 | transaction_id, 76 | SAFE_CAST(sale_amount AS NUMERIC) AS clean_sale_amount, 77 | NULLIF(TRIM(customer_email), '') AS clean_customer_email, 78 | COALESCE( 79 | SAFE.PARSE_DATE('%Y-%m-%d', order_date), 80 | SAFE.PARSE_DATE('%m/%d/%Y', order_date) 81 | ) AS clean_order_date, 82 | NULLIF(TRIM(product_code), '') AS clean_product_code, 83 | SAFE_CAST(quantity AS INT64) AS clean_quantity, 84 | CASE 85 | WHEN status IN ('Completed', 'Pending', 'Cancelled') THEN status 86 | WHEN status = 'Done' THEN 'Completed' 87 | ELSE 'Unknown' 88 | END AS clean_status 89 | FROM 90 | my_dataset.raw_data_staging 91 | -- Filter out rows where critical columns are NULL after cleaning attempts 92 | WHERE 93 | transaction_id IS NOT NULL 94 | AND SAFE_CAST(sale_amount AS NUMERIC) IS NOT NULL -- Ensures sale_amount is valid and not NULL 95 | AND SAFE_CAST(quantity AS INT64) >= 1 -- Quantity must be at least 1 96 | QUALIFY 97 | -- Deduplicate based on transaction_id, keeping the first encountered record 98 | ROW_NUMBER() OVER(PARTITION BY transaction_id ORDER BY order_date DESC) = 1; 99 | -- The ORDER BY in ROW_NUMBER() determines which duplicate to keep. 100 | -- Here, we prioritize the most recent order_date if duplicates exist. -------------------------------------------------------------------------------- /security/auth_examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | BigQuery Authentication Methods Demo 3 | 4 | This script demonstrates three different ways to authenticate with BigQuery: 5 | 6 | 1. Application Default Credentials (ADC). 7 | 2. Service Account Key File 8 | 3. Service Account Impersonation 9 | 10 | Requirements: 11 | - pip install google-cloud-bigquery google-auth 12 | - For ADC 13 | - Make sure your user account has the necessary permissions. 14 | - Run 'gcloud auth application-default login' or set GOOGLE_APPLICATION_CREDENTIALS 15 | - For Service Account key file 16 | - Create a service account with appropriate permissions 17 | - Create and download a keyfile 18 | - Update 'key_path' variable with your key file path 19 | - For Service Account Impersonation 20 | - Make sure your user account has the "Service Account Token Creator" role on the target service account 21 | - Update 'target_service_account' variable with the email of the service account to impersonate 22 | 23 | Variations 24 | - You can configure ADC to use a service account with key file 25 | - You can configure ADC to use impersonation as default 26 | """ 27 | 28 | import os 29 | from google.cloud import bigquery 30 | from google.auth import default, impersonated_credentials 31 | from google.oauth2 import service_account 32 | import json 33 | 34 | # Query to execute 35 | QUERY = "SELECT COUNT(*) as row_count FROM `roi-bq-demos.bq_demo.line_item`" 36 | 37 | 38 | def execute_query_with_client(client, method_name): 39 | """Execute the query and print results""" 40 | try: 41 | print(f"\n--- {method_name} ---") 42 | query_job = client.query(QUERY) 43 | results = query_job.result() 44 | 45 | for row in results: 46 | print(f"Row count: {row.row_count}") 47 | 48 | print(f"✓ {method_name} completed successfully") 49 | 50 | except Exception as e: 51 | print(f"✗ {method_name} failed: {str(e)}") 52 | 53 | 54 | def method_1_application_default_credentials(): 55 | """Method 1: Use Application Default Credentials (ADC)""" 56 | try: 57 | # This will use ADC - credentials from: 58 | # 1. GOOGLE_APPLICATION_CREDENTIALS environment variable 59 | # 2. 
gcloud auth application-default login 60 | # 3. Compute Engine/Cloud Run/GKE metadata service 61 | client = bigquery.Client() 62 | execute_query_with_client(client, "Application Default Credentials") 63 | 64 | except Exception as e: 65 | print(f"✗ Application Default Credentials setup failed: {str(e)}") 66 | 67 | 68 | def method_2_service_account_key(): 69 | """Method 2: Use Service Account Key File""" 70 | # Update this path to your service account key file 71 | key_path = "path/to/your/service-account-key.json" 72 | 73 | try: 74 | if not os.path.exists(key_path): 75 | print(f"✗ Service account key file not found: {key_path}") 76 | print(" Please update the 'key_path' variable with the correct path") 77 | return 78 | 79 | credentials = service_account.Credentials.from_service_account_file(key_path) 80 | client = bigquery.Client(credentials=credentials) 81 | execute_query_with_client(client, "Service Account Key") 82 | 83 | except Exception as e: 84 | print(f"✗ Service Account Key method failed: {str(e)}") 85 | 86 | 87 | def method_3_service_account_impersonation(): 88 | """Method 3: Use Service Account Impersonation""" 89 | # Update this with the service account email you want to impersonate 90 | target_service_account = "your-service-account@your-project.iam.gserviceaccount.com" 91 | 92 | try: 93 | # Get source credentials (usually from ADC) 94 | source_credentials, project = default() 95 | 96 | # Create impersonated credentials 97 | target_credentials = impersonated_credentials.Credentials( 98 | source_credentials=source_credentials, 99 | target_principal=target_service_account, 100 | target_scopes=["https://www.googleapis.com/auth/cloud-platform"], 101 | ) 102 | 103 | client = bigquery.Client(credentials=target_credentials) 104 | execute_query_with_client(client, "Service Account Impersonation") 105 | 106 | except Exception as e: 107 | print(f"✗ Service Account Impersonation failed: {str(e)}") 108 | 109 | 110 | def main(): 111 | """Main function to run all authentication methods""" 112 | print("BigQuery Authentication Methods Demo") 113 | print("=" * 50) 114 | 115 | # Method 1: Application Default Credentials 116 | method_1_application_default_credentials() 117 | 118 | # Method 2: Service Account Key 119 | method_2_service_account_key() 120 | 121 | # Method 3: Service Account Impersonation 122 | method_3_service_account_impersonation() 123 | 124 | print("\n" + "=" * 50) 125 | print("Demo completed!") 126 | 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /dataflow/simple_demos/beam_demo_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is intentionally small, focused code meant to illustrate a few key Apache Beam concepts 3 | in a minimal way. It is NOT production code; it's an educational demo. The key points shown here: 4 | 5 | - How to read data into a PCollection using I/O transforms (ReadFromText). 6 | - Element-wise transforms: Map (one-to-one), FlatMap (one-to-many), Filter (predicate). 7 | - How to write results with WriteToText sinks. 8 | 9 | Setup notes (local testing): 10 | - Use pyenv to install a supported Python version (3.11 or earlier). 
For example: 11 | # Install a specific Python version using pyenv (3.11.x or earlier) 12 | # $ pyenv install 3.11.2 13 | # Create and activate a virtualenv (venv or pyenv-virtualenv): 14 | # $ python -m venv .venv 15 | # $ source .venv/bin/activate 16 | # Install dependencies (for local runs): 17 | # $ pip install apache-beam 18 | # If you plan to run on Google Cloud Dataflow and access GCS/BQ, install GCP extras: 19 | # $ pip install "apache-beam[gcp]" 20 | """ 21 | 22 | import apache_beam as beam 23 | from apache_beam.io import ReadFromText, WriteToText 24 | import sys 25 | 26 | 27 | def count_words(line): 28 | """Count words in a single line. 29 | 30 | What it does: 31 | - Splits the incoming text line on whitespace and returns the word count (an int). 32 | 33 | Called by: 34 | - The 'CountWords' Map transform below. Beam calls this once per element in the input 35 | PCollection (one-to-one transform). 36 | """ 37 | return len(line.split()) 38 | 39 | 40 | def lear_there(line): 41 | """FlatMap helper: yield the line if it contains the string 'Lear'. 42 | 43 | What it does: 44 | - Checks if the substring 'Lear' appears in the line. If yes, yields the line. 45 | - Because this is used with FlatMap, it can emit zero or more outputs per input. 46 | 47 | Called by: 48 | - The 'FlatMapLear' FlatMap transform below. FlatMap expects an iterable/generator 49 | of zero-or-more output elements per input element. 50 | """ 51 | if "Lear" in line: 52 | yield line 53 | 54 | 55 | # Build a Pipeline object. This is the root of your Beam program. The Pipeline object 56 | # is used to apply transforms and then run the resulting graph. 57 | p = beam.Pipeline(argv=sys.argv) 58 | 59 | 60 | # Read: create an initial PCollection of text lines. 61 | # Concept: ReadFromText is an I/O transform that returns a PCollection where each 62 | # element is one line from the input file. Here we point to a sample file in GCS. 63 | lines = p | "Read" >> ReadFromText("gs://dataflow-samples/shakespeare/kinglear.txt") 64 | 65 | 66 | # Example sink: write the raw lines out to a file. This demonstrates a sink transform. 67 | # Concept: Sinks are also transforms that consume PCollections. 68 | _ = lines | "lines_out" >> WriteToText("beam_demo_1_lines.txt") 69 | 70 | 71 | # Map transform: apply a function to each element, producing a one-to-one mapping. 72 | # Concept: Map is useful for stateless, per-element computations. Note that you pass the function as an argument, 73 | # with no other arguments. The receiving function will receive the next element of the input pcollection as it's first 74 | # positional argument. 75 | word_counts = lines | "CountWords" >> beam.Map(count_words) 76 | _ = word_counts | "word_counts_out" >> WriteToText("beam_demo_1_word_counts.txt") 77 | 78 | 79 | # FlatMap example: the supplied function may return zero, one, or many outputs per input. 80 | # Concept: FlatMap is used when you need to expand or filter elements and potentially 81 | # emit multiple outputs for a single input element. Again, the function is passed as an argument 82 | # with no other arguments and will receive the next element of the input pcollection as it's first positional argument. 83 | lear_there_flatmap = lines | "FlatMapLear" >> beam.FlatMap(lear_there) 84 | _ = lear_there_flatmap | "lear_there_flatmap_out" >> WriteToText( 85 | "beam_demo_1_flatmap.txt" 86 | ) 87 | 88 | 89 | # Filter: keep elements where the predicate is True. 90 | # Lambda behavior: receives each element and returns True when 'Lear' is present. 
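# For example, given a line such as "KING LEAR, king of Britain", the lambda returns True and the
# line is kept; lines that do not contain the substring "Lear" return False and are dropped.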
91 | # Use a lambda for short, one-off predicates; prefer a top-level function or DoFn 92 | # when logic is non-trivial, reused, or to avoid potential runner/serialization issues. 93 | lear_there_filter = lines | "FilterLear" >> beam.Filter(lambda x: "Lear" in x) 94 | _ = lear_there_filter | "lear_there_filter_out" >> WriteToText("beam_demo_1_filter.txt") 95 | 96 | 97 | # Run: execute the pipeline. For the DirectRunner this runs locally 98 | p.run().wait_until_finish() 99 | 100 | 101 | # End-of-file notes: 102 | # - After running locally you should see files created in the current directory: 103 | # out_1-00000-of-00001, out_2-00000-of-00001, out_3-00000-of-00001, out_4-00000-of-00001 104 | # (WriteToText names are suffixed by shard information). 105 | # - Inspect those files to verify the transformations: raw lines, numeric word counts, 106 | # lines containing 'Lear' produced by FlatMap, and lines selected by Filter. 107 | -------------------------------------------------------------------------------- /bigquery/schema-demo/generate_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import apache_beam as beam 3 | import random 4 | from apache_beam.io import ReadFromText 5 | import datetime 6 | import argparse 7 | 8 | # from apache_beam.options.pipeline_options import PipelineOptions 9 | # from apache_beam.options.pipeline_options import SetupOptions 10 | 11 | 12 | states = ("AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL",\ 13 | "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME",\ 14 | "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH",\ 15 | "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI",\ 16 | "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI",\ 17 | "WY", "DC") 18 | 19 | # handle arguments 20 | parser = argparse.ArgumentParser() 21 | 22 | parser.add_argument( 23 | "--bucket", help="Name of the bucket where output files are written", required=True) 24 | parser.add_argument( 25 | "--products", help="Number of products to generate", default=10000) 26 | parser.add_argument( 27 | "--customers", help="Number of customer to generate", default=60000000) 28 | parser.add_argument( 29 | "--orders", help="Number of orders per customer", default=500) 30 | 31 | known_args, pipeline_args = parser.parse_known_args() 32 | 33 | def make_orders(customer): 34 | cust_id = customer.split(",")[0] 35 | for order_num in range(1, int(known_args.orders) + 1): 36 | order_date = str(datetime.date(2018, random.randint(1,12), random.randint(1,28))) 37 | order_num = "{}-{}".format(cust_id, order_num) 38 | row = [order_num, str(cust_id), order_date] 39 | yield ",".join(row) 40 | 41 | 42 | def make_lines(order_string): 43 | order = order_string.split(",") 44 | for line_item_num in range(1,11): 45 | order_num = order[0] 46 | line_item_num = str(line_item_num) 47 | prod_code = str(random.randint(0, int(known_args.products))) 48 | qty = str(random.randint(0,10)) 49 | row = [order_num, line_item_num, prod_code, qty] 50 | yield ",".join(row) 51 | 52 | 53 | def create_cust_ids(num_cust_ids): 54 | for cust_id in range(0,num_cust_ids): 55 | yield cust_id 56 | 57 | 58 | def make_customer(cust_id): 59 | cust_num = str(cust_id) 60 | cust_name = "Customer_" + cust_num + "_Name" 61 | phone = str(random.randint(100,999))\ 62 | + "-" + str(random.randint(100,999))\ 63 | + "-" + str(random.randint(0,9999)) 64 | cust_email = "Customer_" + cust_num + "_Email@{}.com".format(cust_name) 65 | cust_address = cust_num + " Main St." 
66 | cust_state = states[random.randint(0,50)] 67 | cust_zip = str(random.randint(0,99999)) 68 | row = [cust_num, cust_name, cust_address, cust_state, cust_zip, cust_email, phone] 69 | return ",".join(row) 70 | 71 | 72 | def create_pids(num_pids): 73 | for pid in range(0,num_pids): 74 | yield pid 75 | 76 | 77 | def make_product(pid): 78 | prod_code = str(pid) 79 | prod_name = "Product {}".format(prod_code) 80 | prod_desc = "The product that's perfect for {} stuff".format(prod_code) 81 | prod_price = str(random.randint(0,50) * pid) 82 | row = [prod_code, prod_name, prod_desc, prod_price] 83 | return ",".join(row) 84 | 85 | 86 | def run(): 87 | 88 | pipeline_args.append( 89 | '--job_name=bq-demo-data-{}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))) 90 | pipeline_args.append( 91 | '--staging_location=gs://{0}/bq-demo/staging/'.format(known_args.bucket)) 92 | pipeline_args.append( 93 | '--temp_location=gs://{0}/bq-demo/temp/'.format(known_args.bucket)) 94 | 95 | p1 = beam.Pipeline(argv=pipeline_args) 96 | # create the customer ids 97 | num_customers = p1 | "num_customers" >> beam.Create( 98 | [int(known_args.customers)]) 99 | cust_ids = num_customers | beam.FlatMap(create_cust_ids) 100 | 101 | # create the product ids 102 | num_products = p1 | "num_product" >> beam.Create( 103 | [int(known_args.products)]) 104 | pids = num_products | beam.FlatMap(create_pids) 105 | 106 | # create customers and products 107 | customers = cust_ids | "generate customer row" >> beam.Map(make_customer) 108 | products = pids | "generate product row" >> beam.Map(make_product) 109 | 110 | # output customer 111 | output = customers | "write customers to gcs" >> beam.io.WriteToText( 112 | "gs://{}/bq-demo/customer".format(known_args.bucket)) 113 | 114 | # output products 115 | output = products | "write products to gcs" >> beam.io.WriteToText( 116 | "gs://{}/bq-demo/product".format(known_args.bucket)) 117 | 118 | p1.run().wait_until_finish() 119 | 120 | p2 = beam.Pipeline(argv=pipeline_args) 121 | 122 | customers = p2 | 'read customer' >> ReadFromText( 123 | 'gs://{}/bq-demo/customer*'.format(known_args.bucket)) 124 | orders = customers | beam.FlatMap(make_orders) 125 | line_items = orders | beam.FlatMap(make_lines) 126 | output = orders | "write orders to gcs" >> beam.io.WriteToText("gs://{}/bq-demo/order".format(known_args.bucket)) 127 | output = line_items | "write line_items to gcs" >> beam.io.WriteToText("gs://{}/bq-demo/line_items".format(known_args.bucket)) 128 | 129 | p2.run() 130 | 131 | if __name__ == '__main__': 132 | run() 133 | -------------------------------------------------------------------------------- /dataflow/simple_demos/beam_demo_2.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains minimal, self-contained examples meant to illustrate 3 | several key Apache Beam concepts and transforms. It's intentionally small 4 | and readable so you can experiment locally. 5 | 6 | Key points illustrated: 7 | - Creating PCollections from in-memory data with Create 8 | - Grouping keyed data with GroupByKey / CoGroupByKey 9 | - Global and per-key aggregation with CombineGlobally and CombinePerKey 10 | 11 | Suggested local setup (macOS): 12 | 1. Use pyenv to install a compatible Python (3.11 or earlier is safe for 13 | most apache-beam releases). Example: 14 | - pyenv install 3.11.6 15 | - pyenv local 3.11.6 16 | 2. Create and activate a virtual environment (venv): 17 | - python -m venv .venv 18 | - source .venv/bin/activate 19 | 3. 
Upgrade pip and install dependencies: 20 | - pip install --upgrade pip 21 | - pip install apache-beam 22 | - If you plan to run on Google Cloud Dataflow, also install: 23 | pip install apache-beam[gcp] 24 | """ 25 | 26 | import apache_beam as beam 27 | from apache_beam.io import WriteToText 28 | import sys 29 | 30 | # --- Sample data: city -> zip codes (keyed tuples) --- 31 | city_zip_list = [ 32 | ("Lexington", "40513"), 33 | ("Nashville", "37027"), 34 | ("Lexington", "40502"), 35 | ("Seattle", "98125"), 36 | ("Mountain View", "94041"), 37 | ("Seattle", "98133"), 38 | ("Lexington", "40591"), 39 | ("Mountain View", "94085"), 40 | ] 41 | 42 | 43 | # --- Sample data: sales amounts (scalar numeric values) --- 44 | sales = [ 45 | 1200.50, 46 | 950.00, 47 | 300.75, 48 | 2100.00, 49 | 400.25, 50 | 1800.00, 51 | 500.00, 52 | 700.00, 53 | ] 54 | 55 | # --- Sample data: sales_rep_id -> sale amounts (keyed tuples) --- 56 | sales_and_reps = [ 57 | ("SP001", 1200.50), 58 | ("SP002", 950.00), 59 | ("SP001", 300.75), 60 | ("SP003", 2100.00), 61 | ("SP002", 400.25), 62 | ("SP004", 1800.00), 63 | ("SP003", 500.00), 64 | ("SP001", 700.00), 65 | ] 66 | 67 | # --- Sample data: order numbers and amounts (keyed tuples) --- 68 | order_numbers_amounts = [ 69 | ("ORD1001", 250.00), 70 | ("ORD1002", 120.50), 71 | ("ORD1003", 75.25), 72 | ("ORD1004", 600.00), 73 | ("ORD1005", 320.10), 74 | ("ORD1006", 150.75), 75 | ("ORD1007", 980.00), 76 | ("ORD1008", 45.00), 77 | ] 78 | 79 | # --- Sample data: order numbers and delivery dates (keyed tuples) --- 80 | order_numbers_delivery_dates = [ 81 | ("ORD1001", "2025-08-14"), 82 | ("ORD1002", "2025-08-15"), 83 | ("ORD1003", "2025-08-16"), 84 | ("ORD1004", "2025-08-17"), 85 | ("ORD1005", "2025-08-18"), 86 | ("ORD1006", "2025-08-19"), 87 | ("ORD1007", "2025-08-20"), 88 | ("ORD1008", "2025-08-21"), 89 | ] 90 | 91 | # --- Build and run the pipeline --- 92 | # Create a Pipeline object. In real projects you typically pass PipelineOptions 93 | # (for runner, project, temp_location, etc.). For this demo we use the default 94 | # direct runner which executes locally. 95 | p = beam.Pipeline() 96 | 97 | 98 | # Section: create a keyed PCollection and group by key 99 | # What it's doing: creates a PCollection of (city, zip) tuples and groups 100 | # all values by the city key. The result is a PCollection of 101 | # (city, iterable_of_zip_codes). 102 | citycodes = p | "CreateCityCodes" >> beam.Create(city_zip_list) 103 | grouped = citycodes | beam.GroupByKey() 104 | grouped | "write_city_grouped" >> WriteToText("beam_demo_2_city_grouped.txt") 105 | 106 | 107 | # Section: global aggregation 108 | # What it's doing: create a scalar PCollection and computing the global sum 109 | # across all elements using CombineGlobally. This returns a single-element 110 | # PCollection with the total sales. 111 | sales = p | "CreateSalesCollection" >> beam.Create(sales) 112 | sales_total = sales | beam.CombineGlobally(sum) 113 | sales_total | "write_sales_total" >> WriteToText("beam_demo_2_sales_total.txt") 114 | 115 | 116 | # Section: per-key aggregation 117 | # What it's doing: create a keyed PCollection (sales_and_reps) and computing 118 | # the sum per salesperson using CombinePerKey. Output is (salesperson, total). 
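# Conceptually, CombinePerKey(sum) behaves like GroupByKey() followed by summing each key's values,
# e.g. ("SP001", [1200.50, 300.75, 700.00]) becomes ("SP001", 2201.25), but it also lets the runner
# pre-combine values on workers before the shuffle, which scales better for large keyed datasets.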
119 | sales = p | "CreateSalesByRepCollection" >> beam.Create(sales_and_reps) 120 | sales_total_by_rep = sales | beam.CombinePerKey(sum) 121 | sales_total_by_rep | "write_sales_total_by_rep" >> WriteToText( 122 | "beam_demo_2_sales_total_by_rep.txt" 123 | ) 124 | 125 | 126 | # Section: CoGroupByKey (a form of join) 127 | # What it's doing: CoGroupByKey takes a dict of keyed PCollections and produces 128 | # for each key a dictionary-like result with lists of values from each input 129 | # PCollection. This demonstrates how to join related datasets by key. 130 | # Called by: the pipeline and then written to disk for inspection. 131 | orders_amounts = p | "CreateOrderNumbers" >> beam.Create(order_numbers_amounts) 132 | orders_delivery_dates = p | "CreateOrderDeliveryDates" >> beam.Create( 133 | order_numbers_delivery_dates 134 | ) 135 | joined = { 136 | "orders": orders_amounts, 137 | "shipping": orders_delivery_dates, 138 | } | beam.CoGroupByKey() 139 | joined | "write_joined_orders_shipping" >> WriteToText( 140 | "beam_demo_2_joined_orders_shipping.txt" 141 | ) 142 | 143 | p.run().wait_until_finish() 144 | -------------------------------------------------------------------------------- /bigquery/arrays_examples.sql: -------------------------------------------------------------------------------- 1 | -- populate arrays explicitly 2 | SELECT 3 | "row1" AS row_id, 4 | [1, 5 | 2, 6 | 3, 7 | 4] AS num_array 8 | UNION ALL 9 | SELECT 10 | "row2" AS row_id, 11 | [2, 12 | 4, 13 | 8, 14 | 16, 15 | 32] AS num_array 16 | UNION ALL 17 | SELECT 18 | "row3" AS row_id, 19 | [5, 20 | 10] AS num_array 21 | 22 | -- populate arrays using array_agg 23 | WITH 24 | c AS ( 25 | SELECT 26 | cust_id, 27 | cust_name, 28 | cust_zip 29 | FROM 30 | `roi-bq-demos.bq_demo.cp` 31 | WHERE 32 | cust_state = "AK") 33 | SELECT 34 | cust_name, 35 | ARRAY_AGG(order_num) as orders 36 | FROM 37 | c 38 | JOIN 39 | `roi-bq-demos.bq_demo.order` o 40 | ON 41 | o.cust_id = c.cust_id 42 | GROUP BY 43 | c.cust_name 44 | 45 | -- report array length 46 | SELECT 47 | `commit`, 48 | ARRAY_LENGTH(difference) AS arr_len, 49 | difference 50 | FROM 51 | `bigquery-public-data.github_repos.commits` 52 | WHERE 53 | author.email LIKE "%jwdavis.me" 54 | ORDER BY 55 | arr_len DESC 56 | LIMIT 57 | 5 58 | 59 | -- find by array length 60 | SELECT 61 | author, 62 | difference 63 | FROM 64 | `bigquery-public-data.github_repos.commits` 65 | WHERE 66 | array_length(difference) = 5 67 | LIMIT 10 68 | 69 | -- select basic array 70 | SELECT 71 | [1, 72 | 2, 73 | 3, 74 | 4] AS num_array 75 | 76 | -- select table from array 77 | SELECT 78 | * 79 | FROM 80 | UNNEST ( [1, 2, 3, 4]) AS num 81 | 82 | -- calculate average of array 83 | SELECT 84 | AVG(num) AS avg_num 85 | FROM 86 | UNNEST ( [1, 2, 3, 4]) AS num 87 | 88 | -- basic correlated cross join 89 | WITH 90 | arrays AS ( 91 | SELECT 92 | "row1" AS row_id, 93 | [1, 94 | 2, 95 | 3, 96 | 4] AS num_array 97 | UNION ALL 98 | SELECT 99 | "row2" AS row_id, 100 | [2, 101 | 4, 102 | 8, 103 | 16, 104 | 32] AS num_array) 105 | SELECT 106 | row_id, 107 | num_array, 108 | num 109 | FROM 110 | arrays 111 | CROSS JOIN 112 | UNNEST(num_array) AS num 113 | 114 | -- comma correlated cross join 115 | WITH 116 | arrays AS ( 117 | SELECT 118 | "row1" AS row_id, 119 | [1, 120 | 2, 121 | 3, 122 | 4] AS num_array 123 | UNION ALL 124 | SELECT 125 | "row2" AS row_id, 126 | [2, 127 | 4, 128 | 8, 129 | 16, 130 | 32] AS num_array) 131 | SELECT 132 | row_id, 133 | num_array, 134 | num 135 | FROM 136 | arrays, 137 | UNNEST(num_array) AS num 
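-- Note: a correlated CROSS JOIN (or comma join) against UNNEST drops rows whose array is empty or NULL;
-- to keep such rows with a NULL value, use a LEFT JOIN instead, e.g.:
--   SELECT row_id, num FROM arrays LEFT JOIN UNNEST(num_array) AS num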
138 | 139 | -- implicit unnest 140 | WITH 141 | arrays AS ( 142 | SELECT 143 | "row1" AS row_id, 144 | [1, 145 | 2, 146 | 3, 147 | 4] AS num_array 148 | UNION ALL 149 | SELECT 150 | "row2" AS row_id, 151 | [2, 152 | 4, 153 | 8, 154 | 16, 155 | 32] AS num_array) 156 | SELECT 157 | row_id, 158 | num_array, 159 | num 160 | FROM 161 | arrays, 162 | arrays.num_array AS num 163 | 164 | -- find row where num_array contains 2 - take 1 165 | WITH 166 | arrays AS ( 167 | SELECT 168 | "row1" AS row_id, 169 | [1, 170 | 2, 171 | 3, 172 | 4] AS num_array 173 | UNION ALL 174 | SELECT 175 | "row2" AS row_id, 176 | [2, 177 | 4, 178 | 8, 179 | 16, 180 | 32] AS num_array) 181 | SELECT 182 | row_id, 183 | num_array 184 | FROM 185 | arrays 186 | CROSS JOIN 187 | UNNEST(num_array) AS num 188 | WHERE 189 | num=2 190 | 191 | -- find row where num_array contains 2 - take 2 192 | WITH 193 | arrays AS ( 194 | SELECT 195 | "row1" AS row_id, 196 | [2, 197 | 2, 198 | 3, 199 | 4] AS num_array 200 | UNION ALL 201 | SELECT 202 | "row2" AS row_id, 203 | [2, 204 | 4, 205 | 8, 206 | 16, 207 | 32] AS num_array) 208 | SELECT 209 | row_id, 210 | num_array 211 | FROM 212 | arrays 213 | WHERE 214 | 2 in (select num from unnest(arrays.num_array) as num) 215 | 216 | -- find row where num_array contains 2 - take 3 217 | WITH 218 | arrays AS ( 219 | SELECT 220 | "row1" AS row_id, 221 | [2, 222 | 2, 223 | 3, 224 | 4] AS num_array 225 | UNION ALL 226 | SELECT 227 | "row2" AS row_id, 228 | [2, 229 | 4, 230 | 8, 231 | 16, 232 | 32] AS num_array) 233 | SELECT 234 | row_id, 235 | num_array 236 | FROM 237 | arrays 238 | WHERE 239 | EXISTS ( 240 | SELECT 241 | * 242 | FROM 243 | UNNEST(num_array) AS num 244 | WHERE 245 | num=2) 246 | 247 | -- find commits that touched a specific file - take 1 248 | SELECT 249 | author, 250 | difference 251 | FROM 252 | `bigquery-public-data.github_repos.commits`, 253 | unnest(difference) as files 254 | WHERE 255 | files.new_path = "courses/data_analysis/lab2/python/is_popular.py" 256 | 257 | -- find commits that touched a specific file - take 2 258 | SELECT 259 | author, 260 | difference 261 | FROM 262 | `bigquery-public-data.github_repos.commits` 263 | WHERE 264 | "courses/data_analysis/lab2/python/is_popular.py" in (select f.new_path from unnest(difference) as f) 265 | 266 | -- find commits that touched a specific file - take 3 267 | SELECT 268 | author, 269 | difference 270 | FROM 271 | `bigquery-public-data.github_repos.commits` 272 | WHERE 273 | EXISTS ( 274 | SELECT 275 | * 276 | FROM 277 | UNNEST(difference) AS f 278 | WHERE 279 | f.new_path="courses/data_analysis/lab2/python/is_popular.py") -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/process_events.py: -------------------------------------------------------------------------------- 1 | import os 2 | import apache_beam as beam 3 | import apache_beam.transforms.window as window 4 | import random 5 | import argparse 6 | import json 7 | import schema_defs 8 | 9 | from apache_beam.options.pipeline_options import PipelineOptions 10 | from apache_beam.options.pipeline_options import SetupOptions 11 | from apache_beam.options.pipeline_options import StandardOptions 12 | from apache_beam.transforms import window 13 | from datetime import datetime 14 | 15 | from google.cloud import bigquery 16 | from google.cloud.exceptions import NotFound 17 | 18 | # takes input element, and returns an array of one bq row 19 | # that includes the window end time 20 | class CreateBQRow(beam.DoFn): 21 | def 
process(self, element, window=beam.DoFn.WindowParam): 22 | window_end_ts = window.end.to_utc_datetime().isoformat() 23 | row = {"window_ending": window_end_ts, 24 | "pos_id": element[0], 25 | "transactions": element[1] 26 | } 27 | # print(f"Writing row:{row}") 28 | return [row] 29 | 30 | # convert message into a kv pair 31 | # with transaction info in an objet 32 | def make_kv(element): 33 | kv = ( 34 | element["pos_id"], 35 | { 36 | "ts": element["ts"], 37 | "zip": element["zip"], 38 | "sale_amount": element["sale_amount"] 39 | } 40 | ) 41 | return kv 42 | 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument( 45 | "--dataset_id", 46 | default='dflow_demo') 47 | parser.add_argument( 48 | "--table_id", 49 | default='messages') 50 | parser.add_argument( 51 | "--sub_id", 52 | default='demo_sub') 53 | parser.add_argument( 54 | "--window_size", 55 | default=10) 56 | 57 | known_args, pipeline_args = parser.parse_known_args() 58 | 59 | # check to see if user specified --project; quit if not 60 | pipeline_args_dict = {} 61 | for index in range(1, len(pipeline_args), 2): 62 | pipeline_args_dict[pipeline_args[index-1].replace("--", "", 1)] = pipeline_args[index] 63 | 64 | if not ("project" in pipeline_args_dict): 65 | print("project argument is missing") 66 | quit() 67 | 68 | sub_path = f"projects/{pipeline_args_dict['project']}/subscriptions/{known_args.sub_id}" 69 | 70 | # check to see if dataset exists, create if not 71 | bq_client = bigquery.Client() 72 | dataset_path = f"{pipeline_args_dict['project']}.{known_args.dataset_id}" 73 | try: 74 | dataset = bq_client.get_dataset(dataset_path) 75 | except NotFound: 76 | dataset = bigquery.Dataset(dataset_path) 77 | dataset.location = "US" 78 | dataset = bq_client.create_dataset(dataset, timeout=30) 79 | 80 | # check to see if messages table exists, create if not 81 | messages_table_path = f"{pipeline_args_dict['project']}.{known_args.dataset_id}.{known_args.table_id}" 82 | try: 83 | table = bq_client.get_table(messages_table_path) 84 | except NotFound: 85 | table_ref = dataset.table(known_args.table_id) 86 | table = bigquery.Table(table_ref, schema=schema_defs.ccl_messages_schema) 87 | table = bq_client.create_table(table) 88 | 89 | # check to see if nested table exists, create if not 90 | nested_table_path = f"{pipeline_args_dict['project']}.{known_args.dataset_id}.{known_args.table_id}_nested" 91 | try: 92 | table = bq_client.get_table(nested_table_path) 93 | except NotFound: 94 | table_ref = dataset.table(f"{known_args.table_id}_nested") 95 | table = bigquery.Table(table_ref, schema=schema_defs.ccl_messages_nested_schema) 96 | table = bq_client.create_table(table) 97 | 98 | pipeline_options = PipelineOptions(pipeline_args) 99 | pipeline_options.view_as(SetupOptions).save_main_session = False 100 | pipeline_options.view_as(StandardOptions).streaming = True 101 | 102 | p = beam.Pipeline(options=pipeline_options) 103 | messages = p | "read messages" >> beam.io.ReadFromPubSub(subscription = sub_path) 104 | decoded_messages = messages | "decode bytes" >> beam.Map(lambda x: x.decode('utf-8')) 105 | json_messages = decoded_messages | "convert to json" >> beam.Map(lambda x: json.loads(x)) 106 | 107 | # write the transactions as is into messages table 108 | json_messages | "write messages to BQ" >> beam.io.WriteToBigQuery( 109 | messages_table_path.replace(".", ":", 1), 110 | schema=schema_defs.beam_messages_schema, 111 | write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, 112 | 
create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED 113 | ) 114 | 115 | # convert rows into kv pairs, window them, group them, create BQ Row 116 | pos_sale_kvs = json_messages | "create key/value pairs" >> beam.Map(make_kv) 117 | windowed_kvs = pos_sale_kvs | "window elements" >> beam.WindowInto(window.FixedWindows(10)) 118 | nested_rows = windowed_kvs | "group per key/window" >> beam.GroupByKey() 119 | nested_labelled_rows = nested_rows | "create BQ nested row" >> beam.ParDo(CreateBQRow()) 120 | 121 | # then stream rows into BQ nested table 122 | nested_labelled_rows | "write nested rows to BQ" >> beam.io.WriteToBigQuery( 123 | nested_table_path.replace(".", ":", 1), 124 | schema=schema_defs.beam_messages_nested_schema, 125 | write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, 126 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED 127 | ) 128 | 129 | if "DataflowRunner" in pipeline_args: 130 | p.run() 131 | else: 132 | p.run().wait_until_finish() -------------------------------------------------------------------------------- /bigquery/information_schema_examples.sql: -------------------------------------------------------------------------------- 1 | -- This file contains a list of example queries illustrating the power and benefit of using BigQuery INFORMATION_SCHEMA views. 2 | 3 | -- Use case: Identify top users by query volume and bytes processed in the last 7 days. 4 | -- This query helps monitor user activity and analyze cost drivers, especially useful for environments with on-demand pricing. 5 | -- It works by aggregating the total bytes processed and number of queries per user from the JOBS_BY_PROJECT view, filtered to the past week, and sorts the results to show the most active users. 6 | SELECT 7 | user_email, 8 | SUM(total_bytes_processed) AS total_bytes_processed, 9 | COUNT(job_id) AS total_queries, 10 | FROM 11 | `region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT` 12 | WHERE 13 | job_type = 'QUERY' 14 | AND creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY) AND CURRENT_TIMESTAMP() 15 | GROUP BY 16 | user_email 17 | ORDER BY 18 | total_bytes_processed DESC 19 | LIMIT 10; 20 | 21 | 22 | -- Use case: Analyze table storage efficiency and compression to optimize costs and identify tables that might benefit from physical storage pricing. 23 | -- This query retrieves logical and physical storage metrics for each table, calculates compression ratios, and highlights tables with high or low compression. 24 | -- It also computes the proportion of data stored long-term, helping you spot tables that may benefit from partitioning or clustering. 25 | SELECT 26 | table_schema, 27 | table_name, 28 | total_logical_bytes, 29 | total_physical_bytes, 30 | IF 31 | (total_logical_bytes = 0, 0, (1-ROUND(total_physical_bytes/total_logical_bytes, 2)))*100 AS compression_ratio, 32 | active_logical_bytes, 33 | long_term_logical_bytes, 34 | IF 35 | (total_logical_bytes = 0, 0, (1-ROUND(active_logical_bytes/total_logical_bytes, 2)))*100 AS long_term_ratio, 36 | FROM 37 | `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_PROJECT` 38 | ORDER BY 39 | compression_ratio DESC; 40 | 41 | 42 | -- Use case: Identify recent queries that performed full table scans, which can be costly and indicate missing WHERE filters. 43 | -- This query finds SELECT statements executed in the last 24 hours that do not contain a WHERE clause, highlighting queries likely to scan entire tables. 
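-- Note: the REGEXP_CONTAINS(query, r'WHERE') check below is a case-sensitive heuristic; it can miss a
-- lowercase 'where' and does not consider partition filters, so treat matches as candidates for review.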
44 | -- It helps pinpoint opportunities to optimize queries and reduce costs by adding filters or partitioning. 45 | SELECT 46 | query, 47 | user_email, 48 | total_bytes_processed 49 | FROM 50 | `region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT` 51 | WHERE 52 | job_type = 'QUERY' 53 | AND statement_type = 'SELECT' 54 | AND total_bytes_processed > 0 55 | AND creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 24 HOUR) AND CURRENT_TIMESTAMP() 56 | AND NOT REGEXP_CONTAINS(query, r'WHERE') 57 | ORDER BY 58 | total_bytes_processed DESC 59 | LIMIT 10; 60 | 61 | -- Use case: Find tables that have not been accessed in the last 90 days to identify candidates for cleanup, archiving, or cost optimization. 62 | -- This query combines job history and table metadata to determine the last time each table was queried, then filters for tables with no recent access. 63 | WITH 64 | recent_access AS ( 65 | SELECT 66 | rt.project_id AS project_id, 67 | rt.dataset_id AS dataset_id, 68 | rt.table_id AS table_name, 69 | MAX(j.creation_time) AS last_access_time 70 | FROM 71 | `region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT` AS j 72 | CROSS JOIN 73 | UNNEST(j.referenced_tables) AS rt 74 | WHERE 75 | j.job_type = "QUERY" 76 | AND j.state = "DONE" 77 | GROUP BY 78 | project_id, 79 | dataset_id, 80 | table_name ) 81 | -- 2) List all tables in the project and left-join access info 82 | SELECT 83 | t.table_catalog AS project_id, 84 | t.table_schema AS dataset_id, 85 | t.table_name, 86 | r.last_access_time 87 | FROM 88 | `region-us.INFORMATION_SCHEMA.TABLES` AS t 89 | LEFT JOIN 90 | recent_access AS r 91 | ON 92 | t.table_catalog = r.project_id 93 | AND t.table_schema = r.dataset_id 94 | AND t.table_name = r.table_name 95 | -- 3) Filter to those not accessed in the last 90 days (or never accessed) 96 | WHERE 97 | COALESCE(r.last_access_time, TIMESTAMP '1970-01-01') < TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 90 DAY) 98 | ORDER BY 99 | r.last_access_time; 100 | 101 | 102 | -- Use case: Monitor recent slot usage for jobs to analyze resource consumption, do slot budgeting, allocate costs, etc.. 103 | -- This query lists jobs executed in the last hour in a given region/project that used slots, showing who ran them and how many slot milliseconds were consumed. 104 | -- It helps identify users or jobs with high resource usage 105 | SELECT 106 | creation_time, 107 | job_id, 108 | user_email, 109 | total_slot_ms, 110 | FROM 111 | `region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT` 112 | WHERE 113 | creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 HOUR) AND CURRENT_TIMESTAMP() 114 | AND total_slot_ms > 0 115 | ORDER BY 116 | total_slot_ms DESC; 117 | 118 | -- Use case: Audit and document all view definitions in a project for governance, troubleshooting, or migration. 119 | -- This query lists every view in the project/region along with its SQL definition, making it easy to review logic, dependencies, and ensure compliance. 120 | SELECT 121 | table_schema, 122 | table_name, 123 | view_definition 124 | FROM 125 | `region-us.INFORMATION_SCHEMA.VIEWS` 126 | ORDER BY 127 | table_schema, 128 | table_name; -------------------------------------------------------------------------------- /dlp-demo/app/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modern Google Cloud DLP demonstration application. 
3 | 4 | This module provides a Flask web application that demonstrates Google Cloud 5 | Data Loss Prevention (DLP) API capabilities including text inspection, 6 | redaction, replacement, and masking of sensitive information. 7 | """ 8 | 9 | import os 10 | import os 11 | import sys 12 | from typing import Any, Dict, Tuple 13 | 14 | from flask import Flask, render_template, request, jsonify 15 | from flask_cors import CORS 16 | from dotenv import load_dotenv 17 | 18 | # Handle imports for both direct execution and module import 19 | if __name__ == "__main__": 20 | # When run directly, add parent directory to path and use absolute imports 21 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 22 | from app.services.dlp_service import DLPService 23 | from app.config import Config 24 | else: 25 | # When imported as a module, use relative imports 26 | from .services.dlp_service import DLPService 27 | from .config import Config 28 | 29 | # Load environment variables 30 | load_dotenv() 31 | 32 | 33 | def create_app(config_class: type = Config) -> Flask: 34 | """Application factory pattern for creating Flask app instances. 35 | 36 | Args: 37 | config_class: Configuration class to use for the app 38 | 39 | Returns: 40 | Configured Flask application instance 41 | """ 42 | app = Flask(__name__) 43 | app.config.from_object(config_class) 44 | 45 | # Enable CORS for all domains and routes 46 | CORS(app, resources={r"/*": {"origins": "*"}}) 47 | 48 | # Initialize services with error handling 49 | dlp_service = None 50 | if app.config["GOOGLE_CLOUD_PROJECT"]: 51 | try: 52 | dlp_service = DLPService(project_id=app.config["GOOGLE_CLOUD_PROJECT"]) 53 | except Exception as e: 54 | app.logger.warning(f"Failed to initialize DLP service: {str(e)}") 55 | else: 56 | app.logger.warning( 57 | "GOOGLE_CLOUD_PROJECT not configured - DLP functionality will be disabled" 58 | ) 59 | 60 | @app.route("/") 61 | def index() -> str: 62 | """Serve the main DLP demo page.""" 63 | return render_template("dlp-demo.html", title="DLP Demo 2.0") 64 | 65 | @app.route("/health") 66 | def health() -> Tuple[Dict[str, str], int]: 67 | """Health check endpoint for Cloud Run.""" 68 | return {"status": "healthy"}, 200 69 | 70 | @app.route("/api/dlp", methods=["POST"]) 71 | def process_dlp() -> Tuple[Dict[str, Any], int]: 72 | """Process text with DLP operations. 
73 | 74 | Expected JSON payload: 75 | { 76 | "text": "Text to process", 77 | "action": "inspect|redact|replace|mask" 78 | } 79 | 80 | Returns: 81 | JSON response with DLP results 82 | """ 83 | try: 84 | data = request.get_json() 85 | if not data or "text" not in data or "action" not in data: 86 | return {"error": "Missing required fields: text, action"}, 400 87 | 88 | text = data["text"] 89 | action = data["action"] 90 | 91 | if not text.strip(): 92 | return {"error": "Text cannot be empty"}, 400 93 | 94 | # Check if DLP service is available 95 | if dlp_service is None: 96 | return { 97 | "error": ( 98 | "DLP service not available - check GOOGLE_CLOUD_PROJECT configuration" 99 | ) 100 | }, 503 101 | 102 | if action == "inspect": 103 | result = dlp_service.inspect_text(text) 104 | elif action in ["redact", "replace", "mask"]: 105 | result = dlp_service.deidentify_text(text, action) 106 | else: 107 | return {"error": f"Invalid action: {action}"}, 400 108 | 109 | return {"result": result}, 200 110 | 111 | except Exception as e: 112 | app.logger.error(f"Error processing DLP request: {str(e)}") 113 | return {"error": "Internal server error"}, 500 114 | 115 | @app.errorhandler(404) 116 | def not_found(error: Any) -> Tuple[str, int]: 117 | """Handle 404 errors with a custom page.""" 118 | return ( 119 | render_template( 120 | "message.html", 121 | headline="404 - Page Not Found", 122 | message_text="The page you're looking for doesn't exist.", 123 | title="Not Found", 124 | ), 125 | 404, 126 | ) 127 | 128 | @app.errorhandler(500) 129 | def internal_error(error: Any) -> Tuple[str, int]: 130 | """Handle 500 errors with a custom page.""" 131 | return ( 132 | render_template( 133 | "message.html", 134 | headline="500 - Internal Server Error", 135 | message_text="Something went wrong on our end.", 136 | title="Server Error", 137 | ), 138 | 500, 139 | ) 140 | 141 | return app 142 | 143 | 144 | # Create app instance for gunicorn 145 | app = create_app() 146 | 147 | 148 | if __name__ == "__main__": 149 | # Development server 150 | port = int(os.environ.get("PORT", 8080)) 151 | debug = os.environ.get("FLASK_ENV") == "development" 152 | 153 | app.run(host="0.0.0.0", port=port, debug=debug) 154 | -------------------------------------------------------------------------------- /dataplex/profiling/README.md: -------------------------------------------------------------------------------- 1 | ### Dataplex Data Profiling Demonstration 2 | 3 | This document provides a simplified, but realistic, demonstration of how Dataplex can be used for data profiling and quality checks in a production environment. 4 | 5 | #### 1. Simplified Data Engineer Workflow 6 | 7 | This workflow models a common scenario where a data engineer needs to validate new data from an external source before it can be used for downstream analytics. 8 | 9 | **Scenario:** A company receives daily sales data as a CSV file in a Google Cloud Storage (GCS) bucket. The data engineering team must ensure the data is complete, has the correct data types, and meets business rules before it's moved to a data warehouse. 10 | 11 | **Workflow Stages:** 12 | 13 | 1. **Ingestion:** A partner uploads a CSV file (`sales_data.csv`) to a designated **raw GCS bucket**. 14 | 2. **Data Discovery & Profiling (Dataplex):** A **Dataplex discovery job** automatically scans the GCS bucket. When a new file is detected, it triggers a **data profiling task**. 
15 | * **Goal:** Automatically analyze the structure, data types, value distributions, and potential anomalies in the new data. 16 | 3. **Quality Check & Validation (Dataplex Data Quality):** A **Dataplex data quality scan** runs on the profiled data to enforce business rules. 17 | * **Goal:** Validate specific rules, such as "the `transaction_id` column must not contain null values." 18 | 4. **Action:** 19 | * **Success:** If the data passes the quality checks, it's considered clean and ready for analysis. 20 | * **Failure:** If the data fails, a data engineer is alerted to investigate the data quality issue. 21 | 5. **Preparation for Analytics:** The validated, clean data is now ready for use by business analysts and data scientists. 22 | 23 | ----- 24 | 25 | #### 2\. Step-by-Step Demonstration Instructions 26 | 27 | This guide will walk you through setting up the core Dataplex components and then simulating the ingestion of data to see the process in action. 28 | 29 | **Prerequisites:** 30 | 31 | * A Google Cloud Project with billing enabled. 32 | * The `gcloud` CLI installed and authenticated. 33 | * Terraform installed. 34 | * Necessary IAM roles for your user account (e.g., `Owner` or `Project Editor` for a demo environment). 35 | 36 | **Step 1: Set Up the Project with Terraform** 37 | 38 | 1. Create a new directory for your Terraform code. 39 | 40 | 2. Create the `main.tf` and `dataplex.tf` files with the code provided below. 41 | 42 | 3. Replace `your-gcp-project-id` with your actual project ID in `main.tf`. 43 | 44 | 4. Open a terminal in your project directory and run the following commands: 45 | 46 | ```bash 47 | terraform init 48 | terraform apply 49 | ``` 50 | 51 | 5. When prompted, type `yes` to approve the creation of the resources. The output will show the names of the resources created, including the raw GCS bucket. 52 | 53 | **Step 2: Generate Sample Data** 54 | 55 | 1. Create a CSV file named `sales_data.csv` with the following content. This represents our "good" data. 56 | 57 | ```csv 58 | transaction_id,product_sku,sale_amount,sale_date 59 | 1001,SKU-A,12.50,2025-08-01 60 | 1002,SKU-B,25.00,2025-08-01 61 | 1003,SKU-A,12.50,2025-08-02 62 | 1004,SKU-C,75.25,2025-08-02 63 | ``` 64 | 65 | **Step 3: Trigger the Workflow (Simulated Ingestion)** 66 | 67 | 1. Upload the `sales_data.csv` file to the GCS bucket created by Terraform. Find the bucket name in the Terraform output (it will be `demo-raw-bucket-`). 68 | 69 | ```bash 70 | gcloud storage cp sales_data.csv gs:///sales/sales_data.csv 71 | ``` 72 | 73 | **Step 4: Observe Dataplex in the Console** 74 | 75 | 1. Navigate to the **Dataplex** UI in the Google Cloud Console. 76 | 2. Click on your lake, zone, and asset. 77 | 3. In the asset details, you'll see a **Profiling** tab. Within a few minutes, the discovery and profiling job will have run on your new file. 78 | 4. On the **Profiling** tab, you'll see a detailed analysis of your data: 79 | * **Data Types:** Confirm that `transaction_id` is an `INTEGER`, `sale_amount` is a `FLOAT`, and `sale_date` is a `DATE` or `TIMESTAMP`. 80 | * **Statistics:** See metrics like mean, min, max, and standard deviation. 81 | * **Value Distribution:** View the distribution of values for each column. 82 | * **Null Count:** Observe that the `transaction_id` column has a null count of 0. 83 | 84 | **Step 5: Inspect the Data Quality Scan Results** 85 | 86 | 1. Go to the **Data quality scans** section in the Dataplex UI. 87 | 2. Find the scan you created (`sales-data-quality-scan`). 88 | 3. 
Click on the scan to view the results. The scan should show a **Success** status, indicating that the `NOT_NULL` check passed. 89 | 90 | **Step 6: Demonstrate a Failure Scenario (Optional)** 91 | 92 | 1. Create a new CSV file named `bad_sales_data.csv` that contains a null value in the `transaction_id` column. 93 | 94 | ```csv 95 | transaction_id,product_sku,sale_amount,sale_date 96 | 1005,SKU-D,50.00,2025-08-03 97 | ,SKU-E,15.25,2025-08-03 98 | ``` 99 | 100 | 2. Upload this file to the GCS bucket: 101 | 102 | ```bash 103 | gcloud storage cp bad_sales_data.csv gs:///sales/bad_sales_data.csv 104 | ``` 105 | 106 | 3. Wait for the discovery and profiling jobs to run. 107 | 108 | 4. Check the **Data quality scans** again. The new run of the `sales-data-quality-scan` should show a **Failure** status due to the null value. 109 | -------------------------------------------------------------------------------- /dataplex/profiling/profile.md: -------------------------------------------------------------------------------- 1 | # E-commerce Data Pipeline with Dataplex Profiling 2 | 3 | ## Overview 4 | 5 | This document details a realistic data engineering pipeline that leverages Google Cloud Dataplex for data profiling, showing how profiling integrates into the data flow and drives automated decision-making. 6 | 7 | ## Example: E-commerce Customer Analytics Pipeline 8 | 9 | **Context**: An e-commerce company ingests customer transaction data from multiple sources (web, mobile app, point-of-sale systems) and needs to maintain data quality while building analytics datasets. 10 | 11 | ## The Pipeline Flow 12 | 13 | ### 1. Data Ingestion 14 | - Raw transaction data lands in Cloud Storage buckets (JSON, CSV, Parquet files) 15 | - Data comes from web analytics, mobile apps, and POS systems 16 | - Files arrive throughout the day with varying schemas and quality 17 | 18 | ### 2. Dataplex Discovery & Profiling 19 | - Dataplex automatically discovers new datasets in the storage buckets 20 | - Profiling jobs run on a schedule (e.g., every 4 hours for new data, daily for full datasets) 21 | - Profiles capture: 22 | - Schema drift detection 23 | - Null value percentages 24 | - Data type distributions 25 | - Value ranges and outliers 26 | - Duplicate records 27 | - Pattern matching for emails, phone numbers, etc. 28 | 29 | ### 3. Data Quality Assessment 30 | - Cloud Functions triggered by Dataplex profiling completion 31 | - Custom logic evaluates profiling results against business rules: 32 | - Email fields must be >95% valid format 33 | - Transaction amounts must be within expected ranges 34 | - Customer IDs must have <1% null values 35 | - Schema changes trigger alerts 36 | 37 | ### 4. Automated Responses 38 | - **Pass Quality Gates**: Data flows to BigQuery staging tables 39 | - **Fail Quality Gates**: 40 | - Quarantine bad data to separate storage 41 | - Send alerts to data engineering team 42 | - Create incident tickets automatically 43 | - Block downstream processing 44 | 45 | ### 5. Data Processing & Enrichment 46 | - Dataflow/Dataproc jobs process qualified data 47 | - Join with reference data, apply business logic 48 | - Create customer 360 views, product analytics 49 | 50 | ### 6. 
Consumption 51 | - Clean data lands in BigQuery data warehouse 52 | - Powers BI dashboards, ML models, and operational reports 53 | - Data lineage tracked through Dataplex 54 | 55 | ## How Profiling Results Drive Actions 56 | 57 | ### Schema Evolution 58 | When profiling detects new fields in mobile app data, the pipeline automatically updates BigQuery schemas and notifies analysts of new available data. 59 | 60 | ### Data Quality Monitoring 61 | If profiling shows transaction amounts with unusual spikes, the system quarantines that batch and alerts the team to investigate potential data corruption. 62 | 63 | ### Performance Optimization 64 | Profiling results showing high cardinality fields inform partitioning strategies in BigQuery. 65 | 66 | ## Pipeline Architecture Diagram 67 | 68 | ```mermaid 69 | graph TD 70 | A[Web Analytics] --> D[Cloud Storage Raw Zone] 71 | B[Mobile App] --> D 72 | C[POS Systems] --> D 73 | 74 | D --> E[Dataplex Discovery & Auto-Profiling] 75 | E --> F{Data Quality Assessment} 76 | 77 | F --> G[Profile Results Analysis
<br/>- Schema validation<br/>- Null value checks<br/>- Range validation<br/>- Pattern matching] 78 | 79 | G --> H{Quality Gates} 80 | 81 | H -->|Pass| I[Cloud Storage Curated Zone] 82 | H -->|Fail| J[Quarantine Storage] 83 | 84 | J --> K[Alert System<br/>- Slack notifications<br/>- Incident tickets<br/>- Data team alerts] 85 | 86 | I --> L[Dataflow Processing<br/>- Data cleansing<br/>- Business logic<br/>- Enrichment] 87 | 88 | L --> M[BigQuery Data Warehouse<br/>- Customer 360<br/>- Product analytics<br/>- Transaction history] 89 | 90 | M --> N[Analytics & ML<br/>- BI Dashboards<br/>- Recommendation engine<br/>- Fraud detection] 91 | 92 | E --> O[Dataplex Data Catalog<br/>- Schema registry<br/>- Data lineage<br/>- Quality metrics] 93 | 94 | O --> P[Data Governance<br/>- Policy enforcement<br/>- Access controls<br/>- Compliance reporting] 95 | 96 | Q[Scheduled Triggers<br/>- Every 4 hours for new data<br/>- Daily full profiling<br/>
- Weekly deep analysis] --> E 97 | 98 | style E fill:#e1f5fe 99 | style G fill:#fff3e0 100 | style H fill:#f3e5f5 101 | style J fill:#ffebee 102 | style O fill:#e8f5e8 103 | ``` 104 | 105 | ## Key Benefits 106 | 107 | ### Proactive Quality Management 108 | Rather than discovering data issues during analysis, profiling catches problems at ingestion time, preventing downstream corruption. 109 | 110 | ### Automated Decision Making 111 | Profile results directly trigger pipeline branching logic, reducing manual intervention and improving response time. 112 | 113 | ### Continuous Monitoring 114 | Regular profiling provides trend analysis on data quality metrics, helping teams identify degrading data sources before they become critical issues. 115 | 116 | ### Operational Efficiency 117 | By automatically quarantining bad data and alerting teams with specific profiling insights, the pipeline reduces time-to-resolution for data quality incidents. 118 | 119 | ## Conclusion 120 | 121 | The profiling essentially acts as a "data firewall" - ensuring only quality data flows through to expensive processing and storage systems while providing rich metadata for governance and optimization decisions. 122 | 123 | 124 | 125 | Demo 126 | 127 | gcloud storage cp web_events_20240806_14.json gs://jwd-gcp-demos-ecommerce-raw-dev/web-analytics/ 128 | gcloud storage cp store_transactions_20240806.csv gs://jwd-gcp-demos-ecommerce-raw-dev/pos-systems/ 129 | gcloud storage cp app_events_20240806_1400.jsonl gs://jwd-gcp-demos-ecommerce-raw-dev/mobile-app/ 130 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🚀 ROI GCP Training Demos 2 | 3 | Welcome to a collection of Google Cloud Platform demonstrations and hands-on examples brought to you by ROI Training! This repository contains demos designed to illustrate key GCP concepts, best practices, and common use cases across various Google Cloud services. 4 | 5 | Whether you're an instructor leading a training session or a student exploring GCP capabilities, these demos provide hands-on experience with the most important Google Cloud services and patterns. 6 | 7 | ## 1. Quick Start 8 | 9 | Get started in just a few steps: 10 | 11 | ```bash 12 | # Clone the repository 13 | cd ~ 14 | git clone https://github.com/roitraining/gcp-demos.git 15 | cd gcp-demos 16 | 17 | # Set your project (replace with your actual project ID) 18 | export GOOGLE_CLOUD_PROJECT=your-project-id 19 | gcloud config set project $GOOGLE_CLOUD_PROJECT 20 | ``` 21 | 22 | --- 23 | 24 | ## 2. 
BigQuery 25 | 26 | *Explore the power of Google's serverless data warehouse* 27 | 28 | #### 🔍 **SQL Examples Collection** 29 | The `bigquery/` directory contains a comprehensive set of SQL examples demonstrating: 30 | - **Array Functions**: Complex array manipulations and searching (`arrays_examples.sql`) 31 | - **Approximate Functions**: Using approximate functions for large-scale analytics (`approx_example.sql`) 32 | - **ELT Patterns**: Extract, Load, Transform patterns (`elt_examples.sql`) 33 | - **External Data**: Working with Hive-style external tables (`external_hive_example.sql`) 34 | - **Information Schema**: Metadata queries and system introspection (`information_schema_examples.sql`) 35 | - **Materialized Views**: Performance optimization with precomputed results (`mv_example.sql`) 36 | - **Time Travel**: Querying historical data snapshots (`time_travel_example.sql`) 37 | - **User-Defined Functions**: Custom SQL and JavaScript functions (`udf_examples.sql`) 38 | - **Views**: Creating and managing logical views (`views_example.sql`) 39 | 40 | #### 🏗️ **Schema Design Demo** 41 | The `bigquery/schema-demo/` directory provides a complete demonstration of schema design impact: 42 | - Compare normalized vs. denormalized table performance 43 | - Explore nested and repeated fields 44 | - Understand partitioning and clustering benefits 45 | - Generate sample datasets for testing 46 | 47 | #### 📚 **Interactive Do-It-Nows** 48 | Access 20+ hands-on BigQuery activities at: **https://roitraining.github.io/gcp-demos/#0** 49 | 50 | These self-paced exercises cover everything from basic queries to advanced analytics patterns. 51 | 52 | --- 53 | 54 | ## 3. Composer (Apache Airflow) 55 | 56 | #### 🛠️ **DAG Development** 57 | The `composer/dag_development/` directory contains DAG validation tools and scripts 58 | 59 | #### 📋 **Example DAGs** 60 | The `composer/dags/` directory includes simple but useful DAG examples 61 | 62 | --- 63 | 64 | ## 4. Dataflow 65 | 66 | #### 🔄 **Streaming Pipeline Demo** 67 | The `dataflow/dflow-bq-stream-python/` directory contains a complete streaming example: 68 | - Pub/Sub to BigQuery streaming pipeline 69 | - Window functions and aggregations 70 | - Nested/repeated data handling 71 | - Local and cloud execution patterns 72 | 73 | #### 🧪 **Simple Beam Examples** 74 | The `dataflow/simple_demos/` directory provides: 75 | - Basic Apache Beam concepts 76 | - Transform examples 77 | - Pipeline patterns and best practices 78 | 79 | --- 80 | 81 | ## 5. Data Loss Prevention (DLP) 82 | 83 | #### 🌐 **Interactive DLP Demo** 84 | Experience DLP capabilities firsthand: **https://bit.ly/roi-dlp-demo** 85 | 86 | 1. Enter text with various data types in the left pane 87 | 2. Watch DLP identify and classify sensitive information 88 | 3. Experiment with different remediation strategies 89 | 4. Explore contextual confidence ratings 90 | 91 | #### 💻 **Source Code** 92 | The `dlp-demo/` directory contains the complete application source: 93 | - Cloud Run deployment configuration 94 | - Python Flask application 95 | - DLP API integration examples 96 | - Docker containerization setup 97 | 98 | --- 99 | 100 | ## 6. Dataproc 101 | 102 | #### 📈 **Scaling Demonstrations** 103 | - **Manual Scaling**: Traditional cluster resizing (`dataproc_scale_demo.sh`) 104 | - **Autoscaling**: Dynamic resource allocation (`dataproc_autoscale_demo.sh`) 105 | 106 | --- 107 | 108 | ## 7. Dataform 109 | 110 | https://github.com/jwdavis/dataform-demo 111 | 112 | --- 113 | 114 | ## 8. 
Dataplex 115 | 116 | #### 📊 **Data Profiling** 117 | The `dataplex/profiling/` directory demonstrates: 118 | - Automated data quality assessment 119 | 120 | --- 121 | 122 | ## 9. Cloud Functions 123 | 124 | Examples include: 125 | - Sample function for processing log entries received via Pub/Sub 126 | 127 | --- 128 | 129 | ## 10. Security & IAM 130 | 131 | #### 🔑 **Authentication Examples** 132 | The `security/` directory contains: 133 | - Service account authentication patterns 134 | - OAuth and API key management 135 | - Organization policy examples and constraints 136 | 137 | --- 138 | 139 | ## 🚀 11. Coming Soon... 140 | 141 | The following areas are under active development: 142 | 143 | - **Pub/Sub**: Messaging and event streaming examples 144 | - **Terraform**: Infrastructure as Code templates 145 | - **Utilities**: Helper scripts and tools 146 | 147 | --- 148 | 149 | ## 📋 Quick Reference 150 | 151 | | Service | Directory | Key Features | 152 | | -------- | ----------- | ---------------------------------------------- | 153 | | BigQuery | `bigquery/` | SQL examples, schema design, analytics | 154 | | Composer | `composer/` | Airflow DAGs, workflow orchestration | 155 | | Dataflow | `dataflow/` | Streaming pipelines, Apache Beam | 156 | | DLP | `dlp-demo/` | Data classification, sensitive data protection | 157 | | Dataproc | `dataproc/` | Spark/Hadoop clusters, scaling demos | 158 | | Security | `security/` | IAM, authentication, policies | 159 | 160 | Happy learning! 🎓 161 | -------------------------------------------------------------------------------- /dlp-demo/app/services/dlp_service.py: -------------------------------------------------------------------------------- 1 | """ 2 | Google Cloud DLP service wrapper. 3 | 4 | This module provides a clean interface to Google Cloud DLP API for 5 | text inspection and de-identification operations. 6 | """ 7 | 8 | from typing import Dict, List, Any, Optional 9 | import logging 10 | 11 | from google.cloud import dlp_v2 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class DLPService: 17 | """Service class for Google Cloud DLP operations.""" 18 | 19 | # Standard info types to detect 20 | DEFAULT_INFO_TYPES = [ 21 | "EMAIL_ADDRESS", 22 | "CREDIT_CARD_NUMBER", 23 | "GENERIC_ID", 24 | "IP_ADDRESS", 25 | "PHONE_NUMBER", 26 | "US_DRIVERS_LICENSE_NUMBER", 27 | "US_SOCIAL_SECURITY_NUMBER", 28 | "PERSON_NAME", 29 | "US_PASSPORT", 30 | "DATE_OF_BIRTH", 31 | ] 32 | 33 | def __init__(self, project_id: str, info_types: Optional[List[str]] = None): 34 | """Initialize the DLP service. 35 | 36 | Args: 37 | project_id: Google Cloud project ID 38 | info_types: List of info types to detect (uses defaults if None) 39 | """ 40 | self.project_id = project_id 41 | self.info_types = info_types or self.DEFAULT_INFO_TYPES 42 | self.client = dlp_v2.DlpServiceClient() 43 | self.parent = f"projects/{project_id}" 44 | 45 | logger.info(f"Initialized DLP service for project: {project_id}") 46 | 47 | def inspect_text(self, text: str) -> str: 48 | """Inspect text for sensitive information. 
49 | 50 | Args: 51 | text: Text to inspect 52 | 53 | Returns: 54 | HTML-formatted string with inspection results 55 | """ 56 | try: 57 | # Configure inspection 58 | inspect_config = { 59 | "info_types": [{"name": info_type} for info_type in self.info_types], 60 | "include_quote": True, 61 | "min_likelihood": "POSSIBLE", 62 | } 63 | 64 | item = {"value": text} 65 | 66 | # Call the API 67 | response = self.client.inspect_content( 68 | request={ 69 | "parent": self.parent, 70 | "inspect_config": inspect_config, 71 | "item": item, 72 | } 73 | ) 74 | 75 | # Format results 76 | if response.result.findings: 77 | result_parts = [] 78 | for finding in response.result.findings: 79 | parts = [] 80 | if finding.quote: 81 | parts.append(f"Quote: {finding.quote}") 82 | parts.append( 83 | f"Info type: {finding.info_type.name}" 84 | ) 85 | parts.append( 86 | f"Likelihood: {finding.likelihood.name}" 87 | ) 88 | 89 | if finding.location.byte_range.start: 90 | start = finding.location.byte_range.start 91 | end = finding.location.byte_range.end 92 | parts.append(f"Location: {start}-{end}") 93 | 94 | result_parts.append("<br>".join(parts)) 95 | 96 | return "<br><br>".join(result_parts) 97 | else: 98 | return "No sensitive information detected." 99 | 100 | except Exception as e: 101 | logger.error(f"Error inspecting text: {str(e)}") 102 | return f"Error during inspection: {str(e)}" 103 | 104 | def deidentify_text(self, text: str, action: str) -> str: 105 | """De-identify sensitive information in text. 106 | 107 | Args: 108 | text: Text to de-identify 109 | action: Type of de-identification ('redact', 'replace', 'mask') 110 | 111 | Returns: 112 | HTML-formatted string with de-identified text 113 | """ 114 | try: 115 | # Configure inspection 116 | inspect_config = { 117 | "info_types": [{"name": info_type} for info_type in self.info_types] 118 | } 119 | 120 | # Configure transformation based on action 121 | if action == "redact": 122 | transformation = {"redact_config": {}} 123 | elif action == "replace": 124 | transformation = { 125 | "replace_config": {"new_value": {"string_value": "[REDACTED]"}} 126 | } 127 | elif action == "mask": 128 | transformation = { 129 | "character_mask_config": { 130 | "masking_character": "#", 131 | "number_to_mask": 0, # Mask all characters 132 | "characters_to_ignore": [{"characters_to_skip": "(),-/@."}], 133 | } 134 | } 135 | else: 136 | raise ValueError(f"Unsupported action: {action}") 137 | 138 | # Configure de-identification 139 | deidentify_config = { 140 | "info_type_transformations": { 141 | "transformations": [{"primitive_transformation": transformation}] 142 | } 143 | } 144 | 145 | item = {"value": text} 146 | 147 | # Call the API 148 | response = self.client.deidentify_content( 149 | request={ 150 | "parent": self.parent, 151 | "inspect_config": inspect_config, 152 | "deidentify_config": deidentify_config, 153 | "item": item, 154 | } 155 | ) 156 | 157 | # Return formatted result 158 | result_text = response.item.value 159 | return "<br>".join(result_text.split("\n")) 160 | 161 | except Exception as e: 162 | logger.error(f"Error de-identifying text with action '{action}': {str(e)}") 163 | return f"Error during {action}: {str(e)}" 164 | -------------------------------------------------------------------------------- /dlp-demo/app/templates/dlp-demo.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block head %} 4 | {{ title }} 5 | 6 | 7 | 41 | {% endblock %} 42 | 43 | {% block content %} 44 |
45 | [HTML markup for the remainder of the content block was stripped during extraction; the surviving text labels are: "Source Text", "Processing...", "Waiting for input to finish...", "✓ Processed", "Enter text above to see DLP results", "DLP Results"]
107 | {% endblock %} 108 | 109 | {% block scripts %} 110 | 186 | {% endblock %} 187 | -------------------------------------------------------------------------------- /bigquery/scds_examples.sql: -------------------------------------------------------------------------------- 1 | -- this is all very much in progress 2 | -- # dml to update array, match element in array, update element in array 3 | 4 | -- take 1 5 | -- rebuild the entire table 6 | -- takes about 18 minutes 7 | CREATE OR REPLACE TABLE 8 | `bq_demo.nested_once` AS ( 9 | WITH 10 | 11 | -- denormalize the nested table 12 | denorm AS ( 13 | SELECT 14 | * EXCEPT (line_items) 15 | FROM 16 | `bq_demo.nested_once` n, 17 | n.line_items ), 18 | 19 | -- update line_items with new price if there's an entry in price_updates 20 | updated AS ( 21 | SELECT 22 | denorm.* EXCEPT(prod_price), 23 | if(pu.prod_price is not null, pu.prod_price, denorm.prod_price) AS prod_price 24 | FROM 25 | denorm 26 | left JOIN 27 | `bq_demo.price_updates` pu 28 | ON 29 | denorm.prod_code = pu.prod_code) 30 | 31 | -- reconstitute the nested table 32 | SELECT 33 | * EXCEPT (line_item_num, 34 | prod_code, 35 | qty, 36 | prod_name, 37 | prod_desc, 38 | prod_price), 39 | ARRAY_AGG(STRUCT(line_item_num, 40 | prod_code, 41 | qty, 42 | prod_name, 43 | prod_desc, 44 | prod_price)) as line_items 45 | FROM 46 | updated 47 | GROUP BY 48 | order_num, 49 | order_date, 50 | cust_phone, 51 | cust_email, 52 | cust_zip, 53 | cust_state, 54 | cust_address, 55 | cust_name, 56 | cust_id) 57 | 58 | -- take 2 59 | -- stored procedure to update nested once 60 | -- uses temporary table, updates only rows that need to be updated 61 | -- takes about 19 minutes 62 | BEGIN 63 | -- generate the update table that has only updated order rows 64 | CREATE TEMPORARY TABLE m AS ( 65 | WITH 66 | -- denormalize the nested table 67 | denorm AS ( 68 | SELECT 69 | * EXCEPT(line_items) 70 | FROM 71 | `bq_demo.nested_once` o, 72 | UNNEST(line_items) l), 73 | 74 | -- get the order numbers that have rows that need to be updated 75 | order_numbers AS ( 76 | SELECT 77 | order_num 78 | FROM 79 | denorm d 80 | JOIN 81 | `bq_demo.price_updates` p 82 | ON 83 | d.prod_code = p.prod_code 84 | GROUP BY 85 | order_num), 86 | 87 | -- get the rows that need to be updated 88 | relevant AS ( 89 | SELECT 90 | d.* 91 | FROM 92 | denorm d 93 | JOIN 94 | order_numbers o 95 | ON 96 | o.order_num = d.order_num ), 97 | 98 | -- update line_items with new price if there's an entry in price_updates 99 | updated AS ( 100 | SELECT 101 | r.* EXCEPT (prod_price), 102 | IFNULL(p.prod_price,r.prod_price) AS prod_price 103 | FROM 104 | relevant r 105 | LEFT JOIN 106 | `bq_demo.price_updates` p 107 | ON 108 | r.prod_code = p.prod_code) 109 | 110 | -- reconstitute the nested table 111 | SELECT 112 | * EXCEPT (line_item_num, 113 | prod_code, 114 | qty, 115 | prod_name, 116 | prod_desc, 117 | prod_price), 118 | ARRAY_AGG(STRUCT(line_item_num, 119 | prod_code, 120 | qty, 121 | prod_name, 122 | prod_desc, 123 | prod_price)) AS line_items 124 | FROM 125 | updated 126 | GROUP BY 127 | order_num, 128 | order_date, 129 | cust_phone, 130 | cust_email, 131 | cust_zip, 132 | cust_state, 133 | cust_address, 134 | cust_name, 135 | cust_id); 136 | 137 | -- merge the updated order rows into the original table 138 | MERGE 139 | `bq_demo.nested_once` n 140 | USING 141 | m 142 | ON 143 | m.order_num = n.order_num 144 | WHEN MATCHED THEN UPDATE SET line_items = m.line_items; 145 | END 146 | 147 | -- take 3 148 | -- stored procedure to update nested once 
149 | -- does order search before denormalizing 150 | -- avoids temporary table 151 | -- takes like 12-13 minutes 152 | BEGIN 153 | -- create an array of price_updates 154 | -- we can use this to filter rows that need to be updated 155 | -- and avoid the join 156 | DECLARE 157 | prod_codes DEFAULT (array( 158 | SELECT 159 | prod_code 160 | FROM 161 | bq_demo.price_updates)); 162 | 163 | -- avoid the temp table, put everything into the merge 164 | MERGE 165 | bq_demo.nested_once n 166 | USING 167 | ( 168 | WITH 169 | 170 | -- get the rows that need to be updated 171 | relevant AS ( 172 | SELECT 173 | * 174 | FROM 175 | bq_demo.nested_once 176 | WHERE 177 | EXISTS ( 178 | SELECT 179 | * 180 | FROM 181 | UNNEST(line_items) as li 182 | WHERE 183 | prod_code IN unnest(prod_codes))), 184 | 185 | -- denormalize the rows that need to be updated 186 | denorm AS ( 187 | SELECT 188 | * EXCEPT (line_items) 189 | FROM 190 | relevant, 191 | relevant.line_items), 192 | 193 | -- update line_items with new price if there's an entry in price_updates 194 | updated AS ( 195 | SELECT 196 | d.* EXCEPT (prod_price), 197 | IFNULL(p.prod_price,d.prod_price) AS prod_price 198 | FROM 199 | denorm d 200 | LEFT JOIN 201 | `bq_demo.price_updates` p 202 | ON 203 | d.prod_code = p.prod_code) 204 | 205 | -- reconstitute the nested table 206 | SELECT 207 | * EXCEPT (line_item_num, 208 | prod_code, 209 | qty, 210 | prod_name, 211 | prod_desc, 212 | prod_price), 213 | ARRAY_AGG(STRUCT(line_item_num, 214 | prod_code, 215 | qty, 216 | prod_name, 217 | prod_desc, 218 | prod_price)) AS line_items 219 | FROM 220 | updated 221 | GROUP BY 222 | order_num, 223 | order_date, 224 | cust_phone, 225 | cust_email, 226 | cust_zip, 227 | cust_state, 228 | cust_address, 229 | cust_name, 230 | cust_id) u 231 | ON 232 | u.order_num = n.order_num 233 | -- replace array with new array with update values 234 | WHEN MATCHED THEN UPDATE SET line_items = u.line_items; 235 | END 236 | 237 | --take 4 238 | -- stored procedure to update nested once 239 | -- filters before denormalizing 240 | -- uses merge to update denorm 241 | -- uses merge to reconstitute rows then merge into source 242 | -- takes 12-13 minutes (12.5 with 5K slots) 243 | BEGIN 244 | 245 | -- create an array of price_updates 246 | DECLARE 247 | prod_codes DEFAULT (ARRAY( 248 | SELECT 249 | prod_code 250 | FROM 251 | bq_demo.price_updates)); 252 | -- find the rows that need to be updated 253 | -- denormalize them 254 | CREATE TEMPORARY TABLE denorm AS ( 255 | WITH 256 | rows_to_update AS ( 257 | SELECT 258 | * 259 | FROM 260 | bq_demo.nested_once 261 | WHERE 262 | EXISTS ( 263 | SELECT 264 | * 265 | FROM 266 | UNNEST(line_items) AS li 267 | WHERE 268 | prod_code IN UNNEST(prod_codes))) 269 | SELECT 270 | * EXCEPT (line_items) 271 | FROM 272 | rows_to_update, 273 | rows_to_update.line_items); 274 | -- update the denormalized rows 275 | MERGE 276 | denorm d 277 | USING 278 | bq_demo.price_updates p 279 | ON 280 | d.prod_code = p.prod_code 281 | WHEN MATCHED THEN 282 | UPDATE 283 | SET prod_price = p.prd_price; 284 | 285 | -- merge 286 | -- create nested rows, then replace the old rows 287 | MERGE bq_demo.nested_once n 288 | USING ( 289 | SELECT * EXCEPT (line_item_num, prod_code, qty, prod_name, prod_desc, prod_price), ARRAY_AGG(STRUCT(line_item_num, prod_code, qty, prod_name, prod_desc, prod_price)) AS line_items FROM denorm d GROUP BY order_num, order_date, cust_phone, cust_email, cust_zip, cust_state, cust_address, cust_name, cust_id) u 290 | ON u.order_num = n.order_num 291 | WHEN 
MATCHED THEN 292 | UPDATE 293 | SET 294 | line_items = u.line_items; 295 | END 296 | 297 | -- To dos 298 | -- # dml for type 1 dimension 299 | -- # dml for type 2 dimension 300 | -- # dml to update array, match col 301 | -- # dml to update array, match element in array, delete element in array 302 | -- # dml to update array, match element in array, update elements in array 303 | -- # dml to update array, match element in array, delete elements in array 304 | -- # dml to update array, match element in array, insert element into array 305 | -- # dml to update array, match element in array, insert elements into array -------------------------------------------------------------------------------- /dlp-demo/app/static/images/symbol-full-color.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /dataplex/profiling/main.tf: -------------------------------------------------------------------------------- 1 | # Variables 2 | variable "project_id" { 3 | description = "GCP Project ID" 4 | type = string 5 | } 6 | 7 | variable "region" { 8 | description = "GCP Region" 9 | type = string 10 | default = "us-central1" 11 | } 12 | 13 | variable "environment" { 14 | description = "Environment name (dev, staging, prod)" 15 | type = string 16 | default = "dev" 17 | } 18 | 19 | # Enable required APIs 20 | resource "google_project_service" "required_apis" { 21 | for_each = toset([ 22 | "dataplex.googleapis.com", 23 | "storage.googleapis.com", 24 | "bigquery.googleapis.com", 25 | "dataflow.googleapis.com", 26 | "cloudfunctions.googleapis.com", 27 | "cloudscheduler.googleapis.com", 28 | "pubsub.googleapis.com" 29 | ]) 30 | 31 | project = var.project_id 32 | service = each.value 33 | 34 | disable_dependent_services = false 35 | disable_on_destroy = false 36 | } 37 | 38 | # Cloud Storage Buckets for Data Zones 39 | resource "google_storage_bucket" "raw_zone" { 40 | name = "${var.project_id}-ecommerce-raw-${var.environment}" 41 | location = "US" # Multi-region for Dataplex compatibility 42 | project = var.project_id 43 | force_destroy = true 44 | 45 | uniform_bucket_level_access = true 46 | 47 | versioning { 48 | enabled = true 49 | } 50 | 51 | lifecycle_rule { 52 | condition { 53 | age = 90 54 | } 55 | action { 56 | type = "Delete" 57 | } 58 | } 59 | 60 | labels = { 61 | environment = var.environment 62 | zone = "raw" 63 | purpose = "data-ingestion" 64 | } 65 | 66 | depends_on = [google_project_service.required_apis] 67 | } 68 | 69 | resource "google_storage_bucket" "curated_zone" { 70 | name = "${var.project_id}-ecommerce-curated-${var.environment}" 71 | location = "US" # Multi-region for Dataplex compatibility 72 | project = var.project_id 73 | force_destroy = true 74 | 75 | uniform_bucket_level_access = true 76 | 77 | versioning { 78 | enabled = true 79 | } 80 | 81 | lifecycle_rule { 82 | condition { 83 | age = 365 84 | } 85 | action { 86 | type = "Delete" 87 | } 88 | } 89 | 90 | labels = { 91 | environment = var.environment 92 | zone = "curated" 93 | purpose = "processed-data" 94 | } 95 | 96 | depends_on = [google_project_service.required_apis] 97 | } 98 | 99 | resource "google_storage_bucket" "quarantine_zone" { 100 | name = "${var.project_id}-ecommerce-quarantine-${var.environment}" 101 | location = "US" # Multi-region for Dataplex compatibility 102 | project = var.project_id 103 | force_destroy = true 104 | 105 | uniform_bucket_level_access = true 106 | 107 | labels = { 108 | 
environment = var.environment 109 | zone = "quarantine" 110 | purpose = "failed-quality-checks" 111 | } 112 | 113 | depends_on = [google_project_service.required_apis] 114 | } 115 | 116 | # Create folder structure in raw zone bucket 117 | resource "google_storage_bucket_object" "raw_zone_folders" { 118 | for_each = toset([ 119 | "web-analytics/", 120 | "mobile-app/", 121 | "pos-systems/" 122 | ]) 123 | 124 | name = each.value 125 | bucket = google_storage_bucket.raw_zone.name 126 | source = "/dev/null" 127 | } 128 | 129 | # Dataplex Lake 130 | resource "google_dataplex_lake" "ecommerce_lake" { 131 | name = "ecommerce-data-lake-${var.environment}" 132 | location = var.region 133 | project = var.project_id 134 | display_name = "E-commerce Data Lake (${upper(var.environment)})" 135 | description = "Data lake for e-commerce customer analytics pipeline" 136 | 137 | labels = { 138 | environment = var.environment 139 | team = "data-engineering" 140 | } 141 | 142 | depends_on = [google_project_service.required_apis] 143 | } 144 | 145 | # Raw Data Zone 146 | resource "google_dataplex_zone" "raw_zone" { 147 | name = "raw-zone" 148 | location = var.region 149 | project = var.project_id 150 | lake = google_dataplex_lake.ecommerce_lake.name 151 | display_name = "Raw Data Zone" 152 | description = "Zone for ingested raw data from multiple sources" 153 | 154 | type = "RAW" 155 | 156 | discovery_spec { 157 | enabled = true 158 | schedule = "0 */4 * * *" # Every 4 hours 159 | 160 | include_patterns = [ 161 | "gs://${google_storage_bucket.raw_zone.name}/**" 162 | ] 163 | } 164 | 165 | resource_spec { 166 | location_type = "MULTI_REGION" 167 | } 168 | 169 | labels = { 170 | environment = var.environment 171 | data-tier = "raw" 172 | } 173 | } 174 | 175 | # Curated Data Zone 176 | resource "google_dataplex_zone" "curated_zone" { 177 | name = "curated-zone" 178 | location = var.region 179 | project = var.project_id 180 | lake = google_dataplex_lake.ecommerce_lake.name 181 | display_name = "Curated Data Zone" 182 | description = "Zone for processed, quality-assured data" 183 | 184 | type = "CURATED" 185 | 186 | discovery_spec { 187 | enabled = true 188 | schedule = "0 6 * * *" # Daily at 6 AM 189 | 190 | include_patterns = [ 191 | "gs://${google_storage_bucket.curated_zone.name}/**" 192 | ] 193 | } 194 | 195 | resource_spec { 196 | location_type = "MULTI_REGION" 197 | } 198 | 199 | labels = { 200 | environment = var.environment 201 | data-tier = "curated" 202 | } 203 | } 204 | 205 | # Raw Zone Asset (single asset for the entire raw bucket) 206 | resource "google_dataplex_asset" "raw_data" { 207 | name = "raw-data-asset" 208 | location = var.region 209 | project = var.project_id 210 | lake = google_dataplex_lake.ecommerce_lake.name 211 | dataplex_zone = google_dataplex_zone.raw_zone.name 212 | display_name = "Raw Data Storage" 213 | description = "Raw data from all sources: web analytics, mobile app, and POS systems" 214 | 215 | resource_spec { 216 | name = "projects/${var.project_id}/buckets/${google_storage_bucket.raw_zone.name}" 217 | type = "STORAGE_BUCKET" 218 | } 219 | 220 | discovery_spec { 221 | enabled = true 222 | schedule = "0 */2 * * *" # Every 2 hours 223 | 224 | # Include patterns to focus discovery on specific folders 225 | include_patterns = [ 226 | "gs://${google_storage_bucket.raw_zone.name}/web-analytics/**", 227 | "gs://${google_storage_bucket.raw_zone.name}/mobile-app/**", 228 | "gs://${google_storage_bucket.raw_zone.name}/pos-systems/**" 229 | ] 230 | 231 | # Exclude temporary or processing 
files 232 | exclude_patterns = [ 233 | "gs://${google_storage_bucket.raw_zone.name}/**/temp/**", 234 | "gs://${google_storage_bucket.raw_zone.name}/**/_processing/**" 235 | ] 236 | } 237 | 238 | labels = { 239 | zone = "raw" 240 | environment = var.environment 241 | sources = "web-mobile-pos" 242 | } 243 | } 244 | 245 | # Curated Zone Asset 246 | resource "google_dataplex_asset" "curated_data" { 247 | name = "curated-data-asset" 248 | location = var.region 249 | project = var.project_id 250 | lake = google_dataplex_lake.ecommerce_lake.name 251 | dataplex_zone = google_dataplex_zone.curated_zone.name 252 | display_name = "Curated Data Storage" 253 | description = "Processed and quality-assured data ready for analytics" 254 | 255 | resource_spec { 256 | name = "projects/${var.project_id}/buckets/${google_storage_bucket.curated_zone.name}" 257 | type = "STORAGE_BUCKET" 258 | } 259 | 260 | discovery_spec { 261 | enabled = true 262 | schedule = "0 6 * * *" # Daily at 6 AM 263 | } 264 | 265 | labels = { 266 | zone = "curated" 267 | environment = var.environment 268 | quality = "verified" 269 | } 270 | } 271 | 272 | # Data Quality Scan for automated profiling 273 | resource "google_dataplex_datascan" "data_quality_scan" { 274 | data_scan_id = "ecommerce-data-quality-scan" 275 | location = var.region 276 | project = var.project_id 277 | display_name = "E-commerce Data Quality Scan" 278 | description = "Automated data quality scanning and profiling" 279 | 280 | data { 281 | resource = google_dataplex_asset.raw_data.name 282 | } 283 | 284 | execution_spec { 285 | trigger { 286 | schedule { 287 | cron = "0 */4 * * *" # Every 4 hours 288 | } 289 | } 290 | } 291 | 292 | # Data profiling configuration 293 | data_profile_spec { 294 | sampling_percent = 100.0 295 | 296 | # Include specific fields for profiling 297 | include_fields { 298 | field_names = ["*"] # Profile all fields 299 | } 300 | } 301 | 302 | labels = { 303 | environment = var.environment 304 | scan-type = "data-profile" 305 | } 306 | } 307 | 308 | # Data Quality Rules Scan 309 | resource "google_dataplex_datascan" "data_quality_rules" { 310 | data_scan_id = "ecommerce-quality-rules-scan" 311 | location = var.region 312 | project = var.project_id 313 | display_name = "E-commerce Data Quality Rules" 314 | description = "Business rule validation for e-commerce data" 315 | 316 | data { 317 | resource = google_dataplex_asset.raw_data.name 318 | } 319 | 320 | execution_spec { 321 | trigger { 322 | schedule { 323 | cron = "0 */4 * * *" # Every 4 hours 324 | } 325 | } 326 | } 327 | 328 | # Data quality rules 329 | data_quality_spec { 330 | sampling_percent = 100.0 331 | 332 | # Rule: Check for null values in critical fields 333 | rules { 334 | column = "user_id" 335 | dimension = "COMPLETENESS" 336 | threshold = 0.7 # Allow 30% nulls (anonymous users) 337 | 338 | non_null_expectation {} 339 | } 340 | 341 | # Rule: Validate email format if present 342 | rules { 343 | column = "email" 344 | dimension = "VALIDITY" 345 | threshold = 0.95 346 | 347 | regex_expectation { 348 | regex = "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$" 349 | } 350 | } 351 | 352 | # Rule: Check timestamp format 353 | rules { 354 | column = "timestamp" 355 | dimension = "VALIDITY" 356 | threshold = 1.0 357 | 358 | non_null_expectation {} 359 | } 360 | 361 | # Rule: Validate transaction amounts are positive 362 | rules { 363 | column = "price" 364 | dimension = "VALIDITY" 365 | threshold = 0.99 366 | 367 | range_expectation { 368 | min_value = "0" 369 | max_value = "10000" 
370 | } 371 | } 372 | } 373 | 374 | labels = { 375 | environment = var.environment 376 | scan-type = "data-quality" 377 | } 378 | } 379 | 380 | # Service Account for Dataplex operations 381 | resource "google_service_account" "dataplex_sa" { 382 | account_id = "dataplex-service-${var.environment}" 383 | display_name = "Dataplex Service Account" 384 | description = "Service account for Dataplex operations and data profiling" 385 | project = var.project_id 386 | } 387 | 388 | # IAM bindings for Dataplex service account 389 | resource "google_project_iam_member" "dataplex_sa_roles" { 390 | for_each = toset([ 391 | "roles/dataplex.developer", 392 | "roles/dataplex.dataReader", 393 | "roles/dataplex.dataWriter", 394 | "roles/storage.objectViewer", 395 | "roles/storage.objectCreator", 396 | "roles/bigquery.dataEditor", 397 | "roles/bigquery.jobUser", 398 | "roles/dataproc.worker", 399 | "roles/dataproc.editor" 400 | ]) 401 | 402 | project = var.project_id 403 | role = each.value 404 | member = "serviceAccount:${google_service_account.dataplex_sa.email}" 405 | } 406 | 407 | # Pub/Sub topic for data quality notifications 408 | resource "google_pubsub_topic" "data_quality_alerts" { 409 | name = "data-quality-alerts-${var.environment}" 410 | project = var.project_id 411 | 412 | labels = { 413 | environment = var.environment 414 | purpose = "data-quality" 415 | } 416 | 417 | depends_on = [google_project_service.required_apis] 418 | } 419 | 420 | # BigQuery dataset for processed data 421 | resource "google_bigquery_dataset" "ecommerce_analytics" { 422 | dataset_id = "ecommerce_analytics_${var.environment}" 423 | project = var.project_id 424 | location = var.region 425 | description = "Analytics dataset for e-commerce customer data" 426 | 427 | labels = { 428 | environment = var.environment 429 | team = "analytics" 430 | } 431 | 432 | depends_on = [google_project_service.required_apis] 433 | } 434 | 435 | # Outputs 436 | output "lake_name" { 437 | description = "Name of the Dataplex lake" 438 | value = google_dataplex_lake.ecommerce_lake.name 439 | } 440 | 441 | output "raw_bucket_name" { 442 | description = "Name of the raw data bucket" 443 | value = google_storage_bucket.raw_zone.name 444 | } 445 | 446 | output "curated_bucket_name" { 447 | description = "Name of the curated data bucket" 448 | value = google_storage_bucket.curated_zone.name 449 | } 450 | 451 | output "quarantine_bucket_name" { 452 | description = "Name of the quarantine bucket" 453 | value = google_storage_bucket.quarantine_zone.name 454 | } 455 | 456 | output "dataplex_service_account" { 457 | description = "Email of the Dataplex service account" 458 | value = google_service_account.dataplex_sa.email 459 | } 460 | 461 | output "pubsub_topic" { 462 | description = "Pub/Sub topic for data quality alerts" 463 | value = google_pubsub_topic.data_quality_alerts.name 464 | } 465 | --------------------------------------------------------------------------------