├── dlp-demo ├── service.yaml ├── test_app.py ├── app │ ├── __init__.py │ ├── services │ │ ├── __init__.py │ │ └── dlp_service.py │ ├── static │ │ └── images │ │ │ ├── logo.png │ │ │ └── symbol-full-color.svg │ ├── templates │ │ ├── message.html │ │ ├── base.html │ │ └── dlp-demo.html │ ├── config.py │ └── main.py ├── .dockerignore ├── .gcloudignore ├── .env.example ├── run.py ├── Dockerfile ├── pyproject.toml ├── deploy.sh └── README.md ├── dataplex ├── lineage │ ├── requirements.txt │ └── lineage_tools.py └── profiling │ ├── store_transactions_20240806.csv │ ├── app_events_20240806_1400.jsonl │ ├── web_events_20240806_14.json │ ├── README.md │ ├── profile.md │ └── main.tf ├── dataflow ├── dflow-bq-stream-python │ ├── req1.txt │ ├── rows.png │ ├── send.png │ ├── pipeline.png │ ├── req2.txt │ ├── setup.sh │ ├── send_events.py │ ├── schema_defs.py │ ├── README.md │ └── process_events.py └── simple_demos │ ├── beam_demo_1.py │ └── beam_demo_2.py ├── bigquery ├── schema-demo │ ├── requirements.txt │ ├── order_schema.json │ ├── line_item_schema.json │ ├── product_schema.json │ ├── denorm_query.sql │ ├── customer_schema.json │ ├── norm_query.sql │ ├── load_data.sql │ ├── nested_queries.sql │ ├── README.md │ ├── load_data.sh │ └── generate_data.py ├── wiki_query_example.sql ├── external_hive_example.sql ├── time_travel_example.sql ├── approx_example.sql ├── mv_example.sql ├── udf_examples.sql ├── views_example.sql ├── time_travel.sh ├── exported_billing_data_example.sql ├── github_demo.sql ├── elt_examples.sql ├── arrays_examples.sql ├── information_schema_examples.sql └── scds_examples.sql ├── terraform └── exp_to_tf.sh ├── docs ├── img │ ├── 2072fd183685cee3.png │ ├── c2f8eeefc77bc843.png │ ├── e4b52eedaff69ff5.png │ └── f4db34e38b750e09.png └── codelab.json ├── security ├── org_policy │ ├── policy.yaml │ ├── constraint.sh │ └── constraint.yaml └── auth_examples.py ├── dataproc ├── autoscaling_policy.yaml ├── dataproc_scale_demo.sh └── dataproc_autoscale_demo.sh ├── utilities └── shopping_list_api │ ├── pyproject.toml │ ├── Dockerfile │ ├── main.py │ └── README.md ├── .gcloudignore ├── composer ├── dag_development │ ├── validate_dag.sh │ └── validate_dag.py └── dags │ ├── bq_export_strategies.py │ └── export_top_customers.py ├── ai ├── pipelines │ └── README.md ├── automl │ ├── salads_deploy.py │ ├── adoption_deploy.py │ ├── README.md │ └── adoption_predict.py └── del_endpoints.py ├── NEW_AUG25.md ├── .gitignore ├── functions └── cat-bq-completions.py └── README.md /dlp-demo/service.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlp-demo/test_app.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataplex/lineage/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-datacatalog-lineage -------------------------------------------------------------------------------- /dlp-demo/app/__init__.py: -------------------------------------------------------------------------------- 1 | """DLP Demo application package.""" 2 | -------------------------------------------------------------------------------- /dlp-demo/.dockerignore: -------------------------------------------------------------------------------- 1 | .venv/ 2 | .env 3 | __pycache__/ 4 | *.pyc 5 | 
-------------------------------------------------------------------------------- /dlp-demo/.gcloudignore: -------------------------------------------------------------------------------- 1 | .venv/ 2 | .env 3 | __pycache__/ 4 | *.pyc 5 | -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/req1.txt: -------------------------------------------------------------------------------- 1 | apache-beam[gcp] 2 | google-cloud-core -------------------------------------------------------------------------------- /bigquery/schema-demo/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam[gcp] 2 | google-cloud-core 3 | -------------------------------------------------------------------------------- /dlp-demo/app/services/__init__.py: -------------------------------------------------------------------------------- 1 | """Service package for DLP demo application.""" 2 | -------------------------------------------------------------------------------- /terraform/exp_to_tf.sh: -------------------------------------------------------------------------------- 1 | gcloud beta resource-config bulk-export --path=./proj_spec --resource-format=terraform -------------------------------------------------------------------------------- /docs/img/2072fd183685cee3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roitraining/gcp-demos/HEAD/docs/img/2072fd183685cee3.png -------------------------------------------------------------------------------- /docs/img/c2f8eeefc77bc843.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roitraining/gcp-demos/HEAD/docs/img/c2f8eeefc77bc843.png -------------------------------------------------------------------------------- /docs/img/e4b52eedaff69ff5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roitraining/gcp-demos/HEAD/docs/img/e4b52eedaff69ff5.png -------------------------------------------------------------------------------- /docs/img/f4db34e38b750e09.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roitraining/gcp-demos/HEAD/docs/img/f4db34e38b750e09.png -------------------------------------------------------------------------------- /dlp-demo/app/static/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roitraining/gcp-demos/HEAD/dlp-demo/app/static/images/logo.png -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/rows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roitraining/gcp-demos/HEAD/dataflow/dflow-bq-stream-python/rows.png -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/send.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roitraining/gcp-demos/HEAD/dataflow/dflow-bq-stream-python/send.png -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/pipeline.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/roitraining/gcp-demos/HEAD/dataflow/dflow-bq-stream-python/pipeline.png -------------------------------------------------------------------------------- /dlp-demo/.env.example: -------------------------------------------------------------------------------- 1 | # Environment variables for local development 2 | GOOGLE_CLOUD_PROJECT=your-project-id 3 | PORT=8080 4 | FLASK_ENV=development 5 | -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/req2.txt: -------------------------------------------------------------------------------- 1 | google-cloud-core 2 | google-cloud-pubsub 3 | google-api-python-client 4 | google-auth 5 | google-auth-httplib2 6 | google-cloud-bigquery -------------------------------------------------------------------------------- /security/org_policy/policy.yaml: -------------------------------------------------------------------------------- 1 | # replace PROJECT_ID 2 | 3 | name: projects/PROJECT_ID/policies/custom.gcsBucketLocationConstraint 4 | spec: 5 | rules: 6 | - enforce: true 7 | -------------------------------------------------------------------------------- /bigquery/schema-demo/order_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "order_num", 4 | "type": "STRING" 5 | }, 6 | { 7 | "name": "cust_id", 8 | "type": "INTEGER" 9 | }, 10 | { 11 | "name": "order_date", 12 | "type": "DATE" 13 | } 14 | ] 15 | -------------------------------------------------------------------------------- /dataproc/autoscaling_policy.yaml: -------------------------------------------------------------------------------- 1 | basicAlgorithm: 2 | yarnConfig: 3 | gracefulDecommissionTimeout: 30s 4 | scaleDownFactor: 0.5 5 | scaleUpFactor: 0.5 6 | workerConfig: 7 | minInstances: 2 8 | maxInstances: 10 9 | secondaryWorkerConfig: 10 | minInstances: 0 11 | maxInstances: 150 -------------------------------------------------------------------------------- /bigquery/wiki_query_example.sql: -------------------------------------------------------------------------------- 1 | -- StandardSQL 2 | -- wiki 1M 3 | SELECT 4 | title, 5 | SUM(views) AS views, 6 | COUNT(views) AS rows_summed 7 | FROM 8 | `bigquery-samples.wikipedia_benchmark.Wiki1M` 9 | WHERE 10 | REGEXP_CONTAINS(title,".*Davis.*") 11 | GROUP BY 12 | title 13 | ORDER BY 14 | views DESC -------------------------------------------------------------------------------- /bigquery/schema-demo/line_item_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "order_num", 4 | "type": "STRING" 5 | }, 6 | { 7 | "name": "line_item_num", 8 | "type": "INTEGER" 9 | }, 10 | { 11 | "name": "prod_code", 12 | "type": "STRING" 13 | }, 14 | { 15 | "name": "qty", 16 | "type": "INTEGER" 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /bigquery/schema-demo/product_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "prod_code", 4 | "type": "STRING" 5 | }, 6 | { 7 | "name": "prod_name", 8 | "type": "STRING" 9 | }, 10 | { 11 | "name": "prod_desc", 12 | "type": "STRING" 13 | }, 14 | { 15 | "name": "prod_price", 16 | "type": "FLOAT" 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /security/org_policy/constraint.sh: 
-------------------------------------------------------------------------------- 1 | gcloud org-policies set-custom-constraint constraint.yaml # make the custom constraint available for org policies 2 | gcloud org-policies set-policy policy.yaml # apply new constraint 3 | 4 | ## To remove constraint, run gcloud org-policies reset custom.gcsBucketLocationConstraint --project PROJECT_ID (with your project id) -------------------------------------------------------------------------------- /bigquery/external_hive_example.sql: -------------------------------------------------------------------------------- 1 | -- query the external table 2 | SELECT 3 | * 4 | FROM 5 | class.ext_part 6 | 7 | -- query external table with where clause 8 | SELECT 9 | * 10 | FROM 11 | class.ext_part 12 | WHERE 13 | order_num="68610383-54" 14 | 15 | --query external table on partition 16 | SELECT 17 | * 18 | FROM 19 | class.ext_part 20 | WHERE 21 | order_date="2018-01-01" -------------------------------------------------------------------------------- /utilities/shopping_list_api/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "shopping-list-api" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "dotenv>=0.9.9", 9 | "fastapi>=0.116.1", 10 | "google-auth>=2.40.3", 11 | "google-genai>=1.28.0", 12 | "openai>=1.98.0", 13 | "uvicorn>=0.35.0", 14 | ] 15 | -------------------------------------------------------------------------------- /security/org_policy/constraint.yaml: -------------------------------------------------------------------------------- 1 | # replace ORG_ID 2 | 3 | name: organizations/ORG_ID/customConstraints/custom.gcsBucketLocationConstraint 4 | displayName: Restrict GCS Bucket Location 5 | description: Restricts Cloud Storage buckets to be created only in the 'us-central1' region. 
6 | actionType: DENY 7 | condition: | 8 | resource.location.startsWith('us-central1') == false 9 | methodTypes: 10 | - CREATE 11 | resourceTypes: 12 | - storage.googleapis.com/Bucket 13 | -------------------------------------------------------------------------------- /bigquery/schema-demo/denorm_query.sql: -------------------------------------------------------------------------------- 1 | -- find sales/zip for march 2 | -- base denorm table 3 | WITH 4 | orders AS ( 5 | SELECT 6 | cust_zip, 7 | prod_price * qty AS line_item_subtotal 8 | FROM 9 | `.bq_demo.denorm` 10 | WHERE 11 | order_date >= "2018-03-01" 12 | AND order_date <= "2018-03-31") 13 | SELECT 14 | cust_zip, 15 | SUM(line_item_subtotal) as zip_sales 16 | FROM 17 | orders 18 | GROUP BY 19 | cust_zip 20 | order by 21 | zip_sales desc 22 | -------------------------------------------------------------------------------- /bigquery/schema-demo/customer_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "cust_id", 4 | "type": "INTEGER" 5 | }, 6 | { 7 | "name": "cust_name", 8 | "type": "STRING" 9 | }, 10 | { 11 | "name": "cust_address", 12 | "type": "STRING" 13 | }, 14 | { 15 | "name": "cust_state", 16 | "type": "STRING" 17 | }, 18 | { 19 | "name": "cust_zip", 20 | "type": "INTEGER" 21 | }, 22 | { 23 | "name": "cust_email", 24 | "type": "STRING" 25 | }, 26 | { 27 | "name": "cust_phone", 28 | "type": "STRING" 29 | } 30 | ] 31 | -------------------------------------------------------------------------------- /bigquery/time_travel_example.sql: -------------------------------------------------------------------------------- 1 | -- view up-to-date table 2 | SELECT 3 | month, 4 | COUNT(*) 5 | FROM 6 | `class.time_travel` 7 | GROUP BY 8 | month; 9 | 10 | -- view table with only initial load 11 | SELECT 12 | month, 13 | COUNT(*) 14 | FROM 15 | `class.time_travel` FOR SYSTEM_TIME AS OF TIMESTAMP_SECONDS(target) 16 | GROUP BY 17 | month; 18 | 19 | -- create restoration table 20 | CREATE OR REPLACE TABLE 21 | class.time_travel_restore AS ( 22 | SELECT 23 | * 24 | FROM 25 | `class.time_travel` FOR SYSTEM_TIME AS OF TIMESTAMP_SECONDS(target)) 26 | -------------------------------------------------------------------------------- /bigquery/approx_example.sql: -------------------------------------------------------------------------------- 1 | -- Run these within a QLabs projects 2 | -- Run the approx first 3 | -- When it's done, run the exact 4 | -- Compare times and % difference 5 | 6 | -- First query - exact 7 | -- StandardSQL 8 | -- wiki 1M 9 | SELECT 10 | COUNT(distinct title) AS articles 11 | FROM 12 | `bigquery-samples.wikipedia_benchmark.Wiki100B` 13 | ORDER BY 14 | articles DESC 15 | 16 | -- Second query - approx 17 | -- StandardSQL 18 | -- wiki 1M 19 | SELECT 20 | approx_count_distinct(title) AS articles 21 | FROM 22 | `bigquery-samples.wikipedia_benchmark.Wiki100B` 23 | ORDER BY 24 | articles DESC -------------------------------------------------------------------------------- /.gcloudignore: -------------------------------------------------------------------------------- 1 | # This file specifies files that are *not* uploaded to Google Cloud Platform 2 | # using gcloud. It follows the same syntax as .gitignore, with the addition of 3 | # "#!include" directives (which insert the entries of the given .gitignore-style 4 | # file at that point). 
5 | # 6 | # For more information, run: 7 | # $ gcloud topic gcloudignore 8 | # 9 | .gcloudignore 10 | # If you would like to upload your .git directory, .gitignore file or files 11 | # from your .gitignore file, remove the corresponding line 12 | # below: 13 | .git 14 | .gitignore 15 | 16 | node_modules 17 | #!include:.gitignore 18 | -------------------------------------------------------------------------------- /dlp-demo/app/templates/message.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block head %} 4 | {{ title }} 5 | {% endblock %} 6 | 7 | {% block content %} 8 |
9 | <div>
10 | <div>
11 | <div>
12 | <h1>{{ headline }}</h1>
13 | </div>
14 | <div>
15 | <p>{{ message_text }}</p>
16 | <a href="/">Go Home</a>
17 | </div>
18 | </div>
19 | </div>
20 |
21 | {% endblock %} 22 | -------------------------------------------------------------------------------- /docs/codelab.json: -------------------------------------------------------------------------------- 1 | { 2 | "environment": "web", 3 | "format": "html", 4 | "prefix": "https://storage.googleapis.com", 5 | "mainga": "UA-49880327-14", 6 | "updated": "2025-08-21T02:00:40Z", 7 | "id": "docs", 8 | "duration": 0, 9 | "title": "Do-It-Now activities (gcp-demos)", 10 | "summary": "Do It Now activities that go with the gcp-demos repo", 11 | "source": "14C6K5l2yJ1ZJToChz72lPq-pFiKm_WcRGcgjnf5VJO0", 12 | "theme": "", 13 | "status": [ 14 | "-" 15 | ], 16 | "category": [ 17 | "-" 18 | ], 19 | "tags": [ 20 | "detailed", 21 | "web" 22 | ], 23 | "feedback": "-", 24 | "url": "docs" 25 | } 26 | -------------------------------------------------------------------------------- /composer/dag_development/validate_dag.sh: -------------------------------------------------------------------------------- 1 | # In your environment's bucket, create a test directory and copy your DAGs to it. 2 | gcloud storage cp gs://us-central1-example-environment-a12bc345-bucket/dags \ 3 | gs://us-central1-example-environment-a12bc345-bucket/data/test --recursive 4 | 5 | # Test for errors in all your DAGs 6 | gcloud storage cp gs://us-central1-example-environment-a12bc345-bucket/dags \ 7 | gs://us-central1-example-environment-a12bc345-bucket/data/test --recursive 8 | 9 | # Test a task for errors 10 | gcloud composer environments run \ 11 | ENVIRONMENT_NAME \ 12 | --location ENVIRONMENT_LOCATION \ 13 | tasks test -- --subdir /home/airflow/gcs/data/test \ 14 | DAG_ID TASK_ID \ 15 | DAG_EXECUTION_DATE -------------------------------------------------------------------------------- /utilities/shopping_list_api/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-slim-bookworm AS base 2 | 3 | 4 | FROM base AS builder 5 | 6 | COPY --from=ghcr.io/astral-sh/uv:0.4.9 /uv /bin/uv 7 | 8 | ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy 9 | 10 | WORKDIR /app 11 | 12 | COPY uv.lock pyproject.toml /app/ 13 | 14 | RUN --mount=type=cache,target=/root/.cache/uv \ 15 | uv sync --frozen --no-install-project --no-dev 16 | 17 | COPY . /app 18 | 19 | RUN --mount=type=cache,target=/root/.cache/uv \ 20 | uv sync --frozen --no-dev 21 | 22 | FROM base 23 | 24 | WORKDIR /app 25 | 26 | COPY --from=builder /app . 27 | 28 | ENV PATH="/app/.venv/bin:$PATH" 29 | 30 | EXPOSE 8080 31 | 32 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"] 33 | -------------------------------------------------------------------------------- /dlp-demo/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Entry point for the DLP Demo application. 
4 | """ 5 | 6 | import os 7 | import sys 8 | 9 | # Add the project root to the Python path 10 | project_root = os.path.dirname(os.path.abspath(__file__)) 11 | sys.path.insert(0, project_root) 12 | 13 | from app.main import app 14 | 15 | if __name__ == "__main__": 16 | # Load environment variables 17 | from dotenv import load_dotenv 18 | 19 | load_dotenv() 20 | 21 | # Configuration 22 | port = int(os.environ.get("PORT", 8080)) 23 | debug = os.environ.get("FLASK_ENV") == "development" 24 | 25 | print(f"Starting DLP Demo application on port {port}") 26 | print(f"Debug mode: {debug}") 27 | 28 | app.run(host="0.0.0.0", port=port, debug=debug) 29 | -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PROJECT_ID=$(gcloud config get-value project) 4 | 5 | python -m venv .venv 6 | source .venv/bin/activate 7 | pip install -r req1.txt 8 | pip install -r req2.txt 9 | 10 | gcloud iam service-accounts create $1 \ 11 | --display-name="$1" 12 | sleep 2 13 | export sa_email=$(gcloud iam service-accounts list --filter="displayName:$1" --format="value(email)") 14 | gcloud projects add-iam-policy-binding $PROJECT_ID\ 15 | --member="serviceAccount:$sa_email" \ 16 | --role="roles/editor" 17 | gcloud iam service-accounts keys create $1.json --iam-account=$sa_email 18 | export GOOGLE_APPLICATION_CREDENTIALS=$1.json 19 | 20 | gsutil mb -l us-central1 "gs://$PROJECT_ID-dflow-demo" 21 | 22 | gcloud services disable dataflow 23 | sleep 5 24 | gcloud services enable bigquery pubsub dataflow -------------------------------------------------------------------------------- /bigquery/mv_example.sql: -------------------------------------------------------------------------------- 1 | -- this is for reference; cannot be run unless you're an admin on dataset 2 | CREATE MATERIALIZED VIEW 3 | roi-bq-demos.bq_demo.order_mv AS 4 | SELECT 5 | cust_zip, 6 | order_date, 7 | count(*) as orders 8 | FROM 9 | `roi-bq-demos.bq_demo.customer` c 10 | JOIN 11 | `roi-bq-demos.bq_demo.order` o 12 | ON 13 | o.cust_id= c.cust_id 14 | GROUP BY 15 | order_date, 16 | cust_zip 17 | 18 | -- query against materialized view 19 | SELECT 20 | * 21 | FROM 22 | `roi-bq-demos.bq_demo.order_mv` 23 | WHERE 24 | cust_zip<2000 25 | 26 | -- query against original tables that automatically uses the materialized view 27 | SELECT 28 | cust_zip, 29 | order_date, 30 | COUNT(*) AS orders 31 | FROM 32 | `roi-bq-demos.bq_demo.customer` c 33 | JOIN 34 | `roi-bq-demos.bq_demo.order` o 35 | ON 36 | o.cust_id = c.cust_id 37 | WHERE 38 | cust_zip<2000 39 | GROUP BY 40 | order_date, 41 | cust_zip -------------------------------------------------------------------------------- /bigquery/udf_examples.sql: -------------------------------------------------------------------------------- 1 | -- trim strings 2 | SELECT 3 | text AS messy, 4 | TRIM(REGEXP_REPLACE(LOWER(text), '[^a-zA-Z0-9 ]+', '')) AS tidy 5 | FROM 6 | `roi-bq-demos.bq_demo.messy_text` 7 | 8 | -- create udf 9 | CREATE OR REPLACE FUNCTION 10 | `class.tidy_string` (text STRING) 11 | RETURNS STRING AS (TRIM(REGEXP_REPLACE(LOWER(text), '[^a-zA-Z0-9 ]+', ''))); 12 | 13 | -- query with SQL UDF 14 | SELECT 15 | text AS messy, 16 | `class.tidy_string`(text) AS tidy 17 | FROM 18 | `roi-bq-demos.bq_demo.messy_text` 19 | 20 | -- create javascript udf 21 | CREATE OR REPLACE FUNCTION 22 | `class.get_numbers`(str STRING) 23 | RETURNS NUMERIC 24 | 
LANGUAGE js AS ''' 25 | return nlp(str).values(0).toNumber().out() 26 | ''' OPTIONS ( library="gs://fh-bigquery/js/compromise.min.11.14.0.js"); 27 | 28 | -- query with javascript udf 29 | SELECT 30 | text, 31 | `class.get_numbers`(text) AS number 32 | FROM 33 | `roi-bq-demos.bq_demo.number_strings` -------------------------------------------------------------------------------- /dlp-demo/Dockerfile: -------------------------------------------------------------------------------- 1 | # Cloud Run deployment configuration 2 | FROM python:3.11-slim 3 | 4 | # Set environment variables 5 | ENV PYTHONUNBUFFERED=1 6 | ENV PATH="/app/.venv/bin:$PATH" 7 | 8 | # Install uv 9 | COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ 10 | 11 | # Set working directory 12 | WORKDIR /app 13 | 14 | # Copy dependency files 15 | COPY pyproject.toml uv.lock ./ 16 | 17 | # Install dependencies 18 | RUN uv sync --frozen --no-cache 19 | 20 | # Copy application code 21 | COPY . . 22 | 23 | # Create non-root user 24 | RUN useradd --create-home --shell /bin/bash app && chown -R app:app /app 25 | USER app 26 | 27 | # Expose port 28 | EXPOSE 8080 29 | 30 | # Health check 31 | HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ 32 | CMD curl -f http://localhost:8080/health || exit 1 33 | 34 | # Run application 35 | CMD ["uv", "run", "gunicorn", "--bind", "0.0.0.0:8080", "--workers", "2", "--timeout", "60", "app.main:app"] 36 | -------------------------------------------------------------------------------- /dlp-demo/app/config.py: -------------------------------------------------------------------------------- 1 | """Configuration settings for the DLP demo application.""" 2 | 3 | import os 4 | 5 | 6 | class Config: 7 | """Base configuration class.""" 8 | 9 | # Google Cloud settings 10 | GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") 11 | 12 | # Flask settings 13 | SECRET_KEY = os.environ.get("SECRET_KEY", "dev-secret-key-change-in-production") 14 | 15 | # Application settings 16 | PORT = int(os.environ.get("PORT", 8080)) 17 | 18 | @classmethod 19 | def validate(cls) -> None: 20 | """Validate required configuration values.""" 21 | if not cls.GOOGLE_CLOUD_PROJECT: 22 | raise ValueError("GOOGLE_CLOUD_PROJECT environment variable is required") 23 | 24 | 25 | class DevelopmentConfig(Config): 26 | """Development configuration.""" 27 | 28 | DEBUG = True 29 | 30 | 31 | class ProductionConfig(Config): 32 | """Production configuration.""" 33 | 34 | DEBUG = False 35 | -------------------------------------------------------------------------------- /bigquery/views_example.sql: -------------------------------------------------------------------------------- 1 | -- query with subquery 2 | SELECT 3 | * 4 | FROM ( 5 | SELECT 6 | cust_id, 7 | cust_name, 8 | cust_address, 9 | cust_zip 10 | FROM 11 | `roi-bq-demos.bq_demo.customer` 12 | WHERE 13 | cust_state="NJ") 14 | WHERE 15 | cust_id <100000 16 | 17 | -- with clause 18 | WITH 19 | nj AS ( 20 | SELECT 21 | cust_id, 22 | cust_name, 23 | cust_address, 24 | cust_zip 25 | FROM 26 | `roi-bq-demos.bq_demo.customer` 27 | WHERE 28 | cust_state="NJ") 29 | SELECT 30 | * 31 | FROM 32 | nj 33 | WHERE 34 | cust_id<100000 35 | 36 | -- create view 37 | -- assumes dataset in current project named class 38 | CREATE OR REPLACE VIEW 39 | `class.nj_view` AS 40 | SELECT 41 | cust_id, 42 | cust_name, 43 | cust_address, 44 | cust_zip 45 | FROM 46 | `roi-bq-demos.bq_demo.customer` 47 | WHERE 48 | cust_state="NJ" 49 | 50 | -- query view 51 | SELECT 52 | * 53 | FROM 54 | 
`class.nj_view` 55 | WHERE 56 | cust_id < 100000 -------------------------------------------------------------------------------- /ai/pipelines/README.md: -------------------------------------------------------------------------------- 1 | # Simple Vertex AI Pipeline demo 2 | 3 | ## Setup 4 | 1. Load [notebook](https://github.com/roitraining/challenge-labs-public/blob/main/data%20science/challenge-labs-pipelines.ipynb) in a Vertex AI Workbench instance 5 | 1. You can rull in Collab Enterprise, but you need to change `serviceAccount:` to `user:` as the code will run as you rather than the service account of an instance VM. 6 | 7 | ## Demo 8 | 9 | 1. Run all the cells in the notebook 10 | 2. Talk the students through the pipeline definition 11 | 3. Show the students where the source data is 12 | 4. Show the students the pipeline graph and discuss the nodes 13 | 5. Show the students the dataset and the endpoint created 14 | 6. Discuss the compilation and job submission steps 15 | 7. Highlight that the actual training will take 2+hours 16 | 8. You can always do a pipeline execution ahead of time to show results 17 | 18 | ## Teardown 19 | 20 | 1. Manually undeploy the model from the endpoint 21 | 2. Delete the endpoint 22 | 3. Optionally, delete the model and the dataset -------------------------------------------------------------------------------- /dlp-demo/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "dlp-demo" 3 | version = "2.0.0" 4 | description = "Modern Google Cloud DLP demonstration application" 5 | authors = [ 6 | {name = "ROI Training", email = "info@roitraining.com"} 7 | ] 8 | requires-python = ">=3.11" 9 | dependencies = [ 10 | "flask>=3.0.0", 11 | "flask-cors>=4.0.0", 12 | "google-cloud-dlp>=3.18.0", 13 | "gunicorn>=21.2.0", 14 | "python-dotenv>=1.0.0", 15 | ] 16 | 17 | [project.optional-dependencies] 18 | dev = [ 19 | "pytest>=7.4.0", 20 | "black>=23.0.0", 21 | "ruff>=0.1.0", 22 | "mypy>=1.5.0", 23 | ] 24 | 25 | [build-system] 26 | requires = ["hatchling"] 27 | build-backend = "hatchling.build" 28 | 29 | [tool.hatch.build.targets.wheel] 30 | packages = ["app"] 31 | 32 | [tool.black] 33 | line-length = 88 34 | target-version = ['py311'] 35 | 36 | [tool.ruff] 37 | line-length = 88 38 | target-version = "py311" 39 | select = ["E", "F", "W", "I", "N", "B", "A", "S", "UP"] 40 | 41 | [tool.mypy] 42 | python_version = "3.11" 43 | warn_return_any = true 44 | warn_unused_configs = true 45 | disallow_untyped_defs = true 46 | -------------------------------------------------------------------------------- /bigquery/time_travel.sh: -------------------------------------------------------------------------------- 1 | # create the initial time travel table 2 | bq query \ 3 | --use_legacy_sql=false \ 4 | --destination_table=class.time_travel \ 5 | --replace \ 6 | 'SELECT 7 | c.*, 8 | o.order_num, 9 | o.order_date, 10 | FORMAT_DATETIME("%B", DATETIME(order_date)) AS month 11 | FROM 12 | `roi-bq-demos.bq_demo_small.customer` c 13 | JOIN 14 | `roi-bq-demos.bq_demo_small.order` o 15 | ON 16 | c.cust_id = o.cust_id 17 | WHERE 18 | cust_state = "CA" 19 | AND order_date BETWEEN "2018-01-01" 20 | AND "2018-01-31"' 21 | 22 | # grab the time of completion for the table update 23 | export TARGET=$(date +"%s") 24 | 25 | bq query \ 26 | --use_legacy_sql=false \ 27 | --destination_table=class.time_travel \ 28 | --append_table \ 29 | 'SELECT 30 | c.*, 31 | o.order_num, 32 | o.order_date, 33 | FORMAT_DATETIME("%B", 
DATETIME(order_date)) AS month 34 | FROM 35 | `roi-bq-demos.bq_demo_small.customer` c 36 | JOIN 37 | `roi-bq-demos.bq_demo_small.order` o 38 | ON 39 | c.cust_id = o.cust_id 40 | WHERE 41 | cust_state = "CA" 42 | AND order_date BETWEEN "2018-02-01" 43 | AND "2018-02-28"' 44 | 45 | echo "Your time travel table is ready!" 46 | echo "Your time travel target is $TARGET" -------------------------------------------------------------------------------- /bigquery/schema-demo/norm_query.sql: -------------------------------------------------------------------------------- 1 | -- find sales/zip for march 2 | -- base normalized tables 3 | SELECT 4 | c.cust_zip, 5 | SUM(li.qty * p.prod_price) AS zip_sales 6 | FROM 7 | `roi-bq-demos.bq_demo.order` o 8 | JOIN 9 | `roi-bq-demos.bq_demo.line_item` li 10 | ON 11 | o.order_num = li.order_num 12 | JOIN 13 | `roi-bq-demos.bq_demo.customer` c 14 | ON 15 | o.cust_id = c.cust_id 16 | JOIN 17 | `roi-bq-demos.bq_demo.product` p 18 | ON 19 | p.prod_code = li.prod_code 20 | WHERE 21 | o.order_date >= "2018-03-01" 22 | AND o.order_date <= "2018-03-31" 23 | GROUP BY 24 | c.cust_zip 25 | ORDER BY 26 | zip_sales DESC 27 | 28 | -- find sales/zip for march 29 | -- normalized tables with order_part table 30 | SELECT 31 | c.cust_zip, 32 | SUM(li.qty * p.prod_price) AS zip_sales 33 | FROM 34 | `roi-bq-demos.bq_demo.order_part` o 35 | JOIN 36 | `roi-bq-demos.bq_demo.line_item` li 37 | ON 38 | o.order_num = li.order_num 39 | JOIN 40 | `roi-bq-demos.bq_demo.customer` c 41 | ON 42 | o.cust_id = c.cust_id 43 | JOIN 44 | `roi-bq-demos.bq_demo.product` p 45 | ON 46 | p.prod_code = li.prod_code 47 | WHERE 48 | o.order_date >= "2018-03-01" 49 | AND o.order_date <= "2018-03-31" 50 | GROUP BY 51 | c.cust_zip 52 | ORDER BY 53 | zip_sales DESC -------------------------------------------------------------------------------- /dataproc/dataproc_scale_demo.sh: -------------------------------------------------------------------------------- 1 | # run in cloud shell to create the demo cluster 2 | gcloud dataproc clusters create demo-cluster \ 3 | --region us-central1 \ 4 | --zone us-central1-a \ 5 | --worker-machine-type=n1-standard-8 \ 6 | --num-workers=2 \ 7 | --num-secondary-workers=0 \ 8 | --secondary-worker-boot-disk-size=30 \ 9 | --delete-max-age=10m \ 10 | --verbosity=error 11 | 12 | # run in cloud shell to submit a job to the cluster 13 | # show the progress rate (about 1% map per 30 seconds) 14 | export PROJECT_ID=$(gcloud config get-value project) 15 | gsutil mb gs://$PROJECT_ID 16 | gcloud dataproc jobs submit hadoop \ 17 | --cluster=demo-cluster \ 18 | --region=us-central1 \ 19 | --class=org.apache.hadoop.examples.terasort.TeraGen \ 20 | --jars=file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar \ 21 | -- -D mapreduce.job.maps=800 10000000000 gs://$PROJECT_ID/tg_n/$(date +%s) 22 | 23 | # run in cloud shell to add pre-emptible instances 24 | # adjust the number of nodes to something within your quota 25 | # show new progress rate (2-3%/sec) 26 | gcloud dataproc clusters update demo-cluster \ 27 | --num-secondary-workers 150 \ 28 | --region us-central1 29 | 30 | # delete the cluster 31 | gcloud dataproc clusters delete demo-cluster \ 32 | --region us-central1 -------------------------------------------------------------------------------- /NEW_AUG25.md: -------------------------------------------------------------------------------- 1 | # What's new in the August 25 update 2 | > [!NOTE] 3 | > Updated August 20, 2025 4 | 5 | ## BigQuery demos 6 | 1. 
SQL transform examples 7 | 2. Exported billing data example 8 | 3. Information schema examples 9 | 10 | ## Composer demos 11 | 4. DAG check scripts (python and bash) 12 | 5. DAG that shows different Airflow strategies for BigQuery exports 13 | 6. DAG that shows executing a query and exporting the results 14 | 15 | ## Dataflow demos 16 | 7. Simple demo pipeline that illustrates read, write, branch, Map, FlatMap and Filter 17 | 8. Simple demo pipeline that illustrates Create, GroupByKey, CombineGlobally, CombinePerKey, and CoGroupByKey 18 | 19 | ## Dataproc demos 20 | 9. Autoscaling cluster demo to go with manual scaling demo 21 | 22 | ## DLP-demo 23 | 10. Revised app and source code for demoing DLP abilities 24 | 25 | ## Do it nows 26 | 11. Fixes to the Do It Now instructions and code 27 | 28 | ## Security 29 | 12. Assets for creating and applying a custom constraint 30 | 13. A python script that illustrates multiple ways of authenticating 31 | 32 | ## Terraform 33 | 14. Bash script that exports a project resources as Terraform HCL 34 | 35 | ## Utilities 36 | 15. Source code for a silly web service that generates Costco shopping lists and returns them in JSON payloads 37 | 38 | ## Works in Progress 39 | - Finishing Dataform demo 40 | - Finishing Dataplex demo 41 | - Finishing custom log-sink -> pub/sub -> cloud function -> interesting action demo 42 | -------------------------------------------------------------------------------- /dlp-demo/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Deploy script for DLP Demo to Cloud Run 4 | set -e 5 | 6 | # Configuration 7 | PROJECT_ID=${GOOGLE_CLOUD_PROJECT:-"your-project-id"} 8 | REGION=${REGION:-"us-central1"} 9 | SERVICE_NAME="dlp-demo" 10 | IMAGE_NAME="us-central1-docker.pkg.dev/${PROJECT_ID}/${SERVICE_NAME}/${SERVICE_NAME}:latest" 11 | 12 | echo "Deploying DLP Demo to Cloud Run..." 13 | echo "Project ID: ${PROJECT_ID}" 14 | echo "Region: ${REGION}" 15 | echo "Service Name: ${SERVICE_NAME}" 16 | 17 | # Ensure we're using the correct project 18 | gcloud config set project ${PROJECT_ID} 19 | 20 | # Build and push the container image 21 | echo "Building container image..." 22 | gcloud builds submit --tag ${IMAGE_NAME} 23 | 24 | # Deploy to Cloud Run 25 | echo "Deploying to Cloud Run..." 26 | gcloud run deploy ${SERVICE_NAME} \ 27 | --image ${IMAGE_NAME} \ 28 | --platform managed \ 29 | --region ${REGION} \ 30 | --allow-unauthenticated \ 31 | --set-env-vars GOOGLE_CLOUD_PROJECT=${PROJECT_ID} \ 32 | --memory 512Mi \ 33 | --cpu 1 \ 34 | --min-instances 0 \ 35 | --max-instances 10 \ 36 | --port 8080 37 | 38 | # Get the service URL 39 | SERVICE_URL=$(gcloud run services describe ${SERVICE_NAME} \ 40 | --platform managed \ 41 | --region ${REGION} \ 42 | --format 'value(status.url)') 43 | 44 | echo "" 45 | echo "Deployment complete!" 
46 | echo "Service URL: ${SERVICE_URL}" 47 | echo "" 48 | echo "To test the service:" 49 | echo "curl ${SERVICE_URL}/health" 50 | -------------------------------------------------------------------------------- /ai/automl/salads_deploy.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import google.auth 3 | from google.cloud import aiplatform, aiplatform_v1 4 | 5 | logging.basicConfig(level=logging.INFO) 6 | logger = logging.getLogger(__name__) 7 | 8 | creds, project = google.auth.default() 9 | location = "us-central1" 10 | 11 | 12 | def deploy_model_to_endpoint(model_prefix): 13 | if not project: 14 | raise RuntimeError( 15 | "No GCP project found. Set with `gcloud config set project PROJECT`." 16 | ) 17 | 18 | if not model_prefix: 19 | raise RuntimeError("No model prefix specified.") 20 | 21 | endpoint_display_name = f"{model_prefix}_prediction_endpoint" 22 | parent = f"projects/{project}/locations/{location}" 23 | api_endpoint = f"{location}-aiplatform.googleapis.com" 24 | 25 | model_client = aiplatform_v1.ModelServiceClient( 26 | client_options={"api_endpoint": api_endpoint} 27 | ) 28 | 29 | target_model = None 30 | for model in model_client.list_models(request={"parent": parent}): 31 | if model.display_name and model.display_name.startswith(model_prefix): 32 | target_model = model 33 | break 34 | if not target_model: 35 | raise RuntimeError( 36 | f"No model found with display_name starting with '{model_prefix}' in {parent}" 37 | ) 38 | endpoint = aiplatform.Model(model_name=target_model.name).deploy() 39 | return {"endpoint": endpoint} 40 | 41 | 42 | if __name__ == "__main__": 43 | deploy_model_to_endpoint("salads") 44 | -------------------------------------------------------------------------------- /dataproc/dataproc_autoscale_demo.sh: -------------------------------------------------------------------------------- 1 | # run in cloud shell to create the demo cluster with autoscaling enabled 2 | # This version uses autoscaling for secondary workers instead of manual scaling 3 | 4 | # create the autoscaling policy 5 | gcloud dataproc autoscaling-policies import autoscaling_demo_policy \ 6 | --region=us-central1 \ 7 | --source="./autoscaling_policy.yaml" 8 | 9 | 10 | # create the autoscaling cluster 11 | gcloud dataproc clusters create demo-cluster-autoscale \ 12 | --region us-central1 \ 13 | --zone us-central1-a \ 14 | --worker-machine-type=n1-standard-8 \ 15 | --num-workers=2 \ 16 | --autoscaling-policy=autoscaling_demo_policy \ 17 | --secondary-worker-type=spot \ 18 | --secondary-worker-machine-types=type=n1-standard-8 \ 19 | --secondary-worker-boot-disk-size=30 \ 20 | --verbosity=error 21 | 22 | # run in cloud shell to submit a job to the cluster 23 | # The cluster will automatically scale secondary workers based on job demand 24 | export PROJECT_ID=$(gcloud config get-value project) 25 | gsutil mb gs://$PROJECT_ID 26 | gcloud dataproc jobs submit hadoop \ 27 | --cluster=demo-cluster-autoscale \ 28 | --region=us-central1 \ 29 | --class=org.apache.hadoop.examples.terasort.TeraGen \ 30 | --jars=file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar \ 31 | -- -D mapreduce.job.maps=800 10000000000 gs://$PROJECT_ID/tg_n/$(date +%s) 32 | 33 | # delete the cluster 34 | gcloud dataproc clusters delete demo-cluster-autoscale \ 35 | --region us-central1 36 | -------------------------------------------------------------------------------- /ai/automl/adoption_deploy.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | import google.auth 3 | from google.cloud import aiplatform, aiplatform_v1 4 | 5 | logging.basicConfig(level=logging.INFO) 6 | logger = logging.getLogger(__name__) 7 | 8 | creds, project = google.auth.default() 9 | location = "us-central1" 10 | 11 | 12 | def deploy_model_to_endpoint(model_prefix): 13 | if not project: 14 | raise RuntimeError( 15 | "No GCP project found. Set with `gcloud config set project PROJECT`." 16 | ) 17 | 18 | if not model_prefix: 19 | raise RuntimeError("No model prefix specified.") 20 | 21 | endpoint_display_name = f"{model_prefix}_prediction_endpoint" 22 | parent = f"projects/{project}/locations/{location}" 23 | api_endpoint = f"{location}-aiplatform.googleapis.com" 24 | 25 | model_client = aiplatform_v1.ModelServiceClient( 26 | client_options={"api_endpoint": api_endpoint} 27 | ) 28 | 29 | target_model = None 30 | for model in model_client.list_models(request={"parent": parent}): 31 | if model.display_name and model.display_name.startswith(model_prefix): 32 | target_model = model 33 | break 34 | if not target_model: 35 | raise RuntimeError( 36 | f"No model found with display_name starting with '{model_prefix}' in {parent}" 37 | ) 38 | endpoint = aiplatform.Model(model_name=target_model.name).deploy( 39 | machine_type="n1-standard-4" 40 | ) 41 | return {"endpoint": endpoint} 42 | 43 | 44 | if __name__ == "__main__": 45 | deploy_model_to_endpoint("adopted") 46 | -------------------------------------------------------------------------------- /dataplex/profiling/store_transactions_20240806.csv: -------------------------------------------------------------------------------- 1 | transaction_id,store_id,register_id,timestamp,cashier_id,customer_id,payment_method,subtotal,tax_amount,discount_amount,total_amount,currency,receipt_number,status 2 | txn_pos_20240806_001,store_sf_001,reg_003,2024-08-06 14:00:15,cashier_101,cust_loyal_789456,credit_card,85.47,7.69,0.00,93.16,USD,RCP_20240806_001,completed 3 | txn_pos_20240806_002,store_sf_001,reg_001,2024-08-06 14:02:33,cashier_205,cust_walk_in,cash,24.99,2.25,2.50,24.74,USD,RCP_20240806_002,completed 4 | txn_pos_20240806_003,store_ny_002,reg_002,2024-08-06 14:03:45,cashier_302,,debit_card,156.78,14.11,15.68,155.21,USD,RCP_20240806_003,completed 5 | txn_pos_20240806_004,store_sf_001,reg_003,2024-08-06 14:05:12,cashier_101,cust_loyal_123789,credit_card,299.99,27.00,30.00,296.99,USD,RCP_20240806_004,refunded 6 | txn_pos_20240806_005,store_la_003,reg_001,2024-08-06 14:07:28,cashier_401,cust_loyal_456123,gift_card,67.45,6.07,0.00,73.52,USD,RCP_20240806_005,completed 7 | txn_pos_20240806_006,store_ny_002,reg_001,2024-08-06 14:08:19,cashier_301,cust_walk_in,cash,12.50,1.13,0.00,13.63,USD,RCP_20240806_006,void 8 | txn_pos_20240806_007,store_sf_001,reg_002,2024-08-06 14:10:41,cashier_206,cust_loyal_987654,credit_card,445.20,40.07,44.52,440.75,USD,RCP_20240806_007,completed 9 | txn_pos_20240806_008,store_la_003,reg_003,2024-08-06 14:12:05,,cust_walk_in,cash,89.99,8.10,0.00,98.09,USD,RCP_20240806_008,completed 10 | txn_pos_20240806_009,store_sf_001,reg_001,2024-08-06 14:14:22,cashier_205,cust_loyal_555888,debit_card,178.34,16.05,0.00,194.39,USD,RCP_20240806_009,completed 11 | txn_pos_20240806_010,store_ny_002,reg_002,2024-08-06 14:15:33,cashier_302,cust_walk_in,credit_card,INVALID_AMOUNT,5.42,0.00,ERROR,USD,RCP_20240806_010,error -------------------------------------------------------------------------------- 
/bigquery/schema-demo/load_data.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE 2 | `bq_demo.denorm` AS ( 3 | SELECT 4 | c.*, 5 | o.order_num AS order_num, 6 | order_date, 7 | line_item_num, 8 | li.prod_code AS prod_code, 9 | qty, 10 | prod_name, 11 | prod_desc, 12 | prod_price 13 | FROM 14 | `roi-bq-demos.bq_demo.customer` c 15 | LEFT JOIN 16 | `roi-bq-demos.bq_demo.order` o 17 | ON 18 | c.cust_id = o.cust_id 19 | LEFT JOIN 20 | `roi-bq-demos.bq_demo.line_item` AS li 21 | ON 22 | o.order_num = li.order_num 23 | LEFT JOIN 24 | `roi-bq-demos.bq_demo.product` AS p 25 | ON 26 | li.prod_code = p.prod_code); 27 | 28 | CREATE OR REPLACE TABLE 29 | `bq_demo.nested_once` AS ( 30 | WITH 31 | dlow AS ( 32 | SELECT 33 | * 34 | FROM 35 | `bq_demo.denorm` ) 36 | SELECT 37 | cust_id, 38 | cust_name, 39 | cust_address, 40 | cust_state, 41 | cust_zip, 42 | cust_email, 43 | cust_phone, 44 | order_num, 45 | order_date, 46 | ARRAY_AGG( STRUCT(line_item_num, 47 | prod_code, 48 | qty, 49 | prod_name, 50 | prod_desc, 51 | prod_price)) AS line_items 52 | FROM 53 | dlow 54 | GROUP BY 55 | order_num, 56 | order_date, 57 | cust_phone, 58 | cust_email, 59 | cust_zip, 60 | cust_state, 61 | cust_address, 62 | cust_name, 63 | cust_id); 64 | 65 | CREATE OR REPLACE TABLE 66 | `bq_demo.table_nested_partitioned` 67 | PARTITION BY 68 | order_date AS ( 69 | SELECT 70 | * 71 | FROM 72 | `bq_demo.nested_once`); 73 | 74 | CREATE OR REPLACE TABLE 75 | `bq_demo.table_nested_partitioned_clustered` 76 | PARTITION BY 77 | order_date 78 | CLUSTER BY 79 | cust_zip AS ( 80 | SELECT 81 | * 82 | FROM 83 | `bq_demo.nested_once`) -------------------------------------------------------------------------------- /composer/dag_development/validate_dag.py: -------------------------------------------------------------------------------- 1 | # validate_dag.py 2 | # --------------------------------------------- 3 | # Use case: Validates an Airflow DAG file for import errors before deploying to Composer/Airflow. 4 | # How it works: Takes a DAG Python file as a command line argument, attempts to import it using Airflow's DagBag, 5 | # and reports any import errors found. Exits with code 0 if successful, 1 if errors, 2 if no argument provided. 6 | # Setup: Recommended to use Python 3.11 via pyenv. Install Apache Airflow in your environment. 
7 | # You'll also need to install any provider modules your DAG uses 8 | # Example setup: 9 | # pyenv local 3.11 10 | # uv venv .venv 11 | # source .venv/bin/activate 12 | # uv pip install apache-airflow apache-airflow-providers-google 13 | # Usage: 14 | # uv run validate_dag.py 15 | 16 | import sys 17 | import os 18 | import warnings 19 | 20 | warnings.filterwarnings("ignore", category=FutureWarning, module="airflow") 21 | warnings.simplefilter("ignore", DeprecationWarning) 22 | 23 | from airflow.models.dagbag import DagBag 24 | 25 | # Check for DAG file argument 26 | if len(sys.argv) < 2: 27 | print("Usage: python validate_dag.py ") 28 | sys.exit(2) 29 | DAG_FILE = sys.argv[1] # DAG file to validate 30 | 31 | dag_bag = DagBag(dag_folder=DAG_FILE, include_examples=False) # Load DAG for validation 32 | 33 | errors = dag_bag.import_errors 34 | 35 | if errors: 36 | print("❌ DAG import errors:") 37 | for f, err in errors.items(): 38 | if DAG_FILE in f: 39 | print(f"\nFile: {f}\nError:\n{err}") 40 | sys.exit(1) # Exit with error code if import errors found 41 | else: 42 | print("✅ DAG parsed successfully.") 43 | sys.exit(0) # Exit with success code 44 | -------------------------------------------------------------------------------- /dlp-demo/app/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | {% block head %} 10 | {% endblock %} 11 | 12 | 13 | 14 |
15 | <header>
16 | <div>
17 | <div>
18 | <div>
19 | <span>ROI Training</span>
20 | </div>
21 | <h1>Google Cloud DLP Demo</h1>
22 | <p>ROI Training - Data Loss Prevention API</p>
23 | </div>
24 | </div>
25 | </header>
26 |
27 |
28 | <main>
29 | <div>
30 | <div>
31 | {% block content %}
32 | {% endblock %}
33 | </div>
34 | </div>
35 | </main>
36 |
37 | <footer>
38 | <div>
39 | <p>© 2025 ROI Training, Inc. All rights reserved.</p>
40 | </div>
41 | </footer>
42 |
43 |
44 | 45 | 46 | {% block scripts %} 47 | {% endblock %} 48 | 49 | 50 | -------------------------------------------------------------------------------- /ai/automl/README.md: -------------------------------------------------------------------------------- 1 | # AutoML demos 2 | 3 | ## Adoption Demo 4 | 5 | ## Forecasting Demo 6 | 7 | This demo shows building a liquor sales forecasting model using AutoML and 8 | tabular data. 9 | 10 | ### Outline 11 | 12 | 1. Initialize values and services 13 | 2. Create the dataset 14 | 3. Create and run model training job (creates model) 15 | 4. Make batch predictions 16 | 5. Create a Looker Studio dashboard to show prediction results 17 | 18 | ### Timing issues 19 | 20 | 1. Training the model takes 1.5-2 hours to complete, so waiting for the job to complete is generally not feasible 21 | 2. Doing batch prediction takes 25-30 minutes, so again, not feasible 22 | 23 | ### Workarounds 24 | 25 | 1. Run the cells in the workbook before class to create the dataset 26 | 27 | 28 | To use... 29 | 30 | 1. Click on the **automl_forecasting** notebook 31 | 2. Click the link to open in the console using Colab Enterprise 32 | 3. Replace the project ID placeholder with the correct value 33 | 4. Run the cells in the **Setting up** section 34 | 1. Skip the cell that deletes/creates the BigQuery dataset if you have already run predictions and want to use the results already in BigQuery 35 | 5. If you haven't created the dataset and the model previously, run the cells in the **Creating the model** section. 36 | 1. This takes a couple hours to run, so you might want to have a version of the dataset and model already created before class and just show the students 37 | 6. If you have the dataset and model already created, you can then run the cell in the **Using an existing model** section to get a reference to the existing model 38 | 7. Run the cells in the **Making predictions** section to do batch inference with the model 39 | 1. This sadly takes like 25-30 minutes 40 | 2. You might want to have already run the batch prior to class and skip actually running the inference here 41 | 8. Run the cells in the **Creating a dashboard** section and then demo the dashboard and results 42 | 9. 
Run the cell in the **Cleaning up** section 43 | -------------------------------------------------------------------------------- /dataplex/profiling/app_events_20240806_1400.jsonl: -------------------------------------------------------------------------------- 1 | {"event_id":"mob_1722963601001","timestamp":"2024-08-06T14:00:01.234Z","user_id":"user_mobile_456789","session_id":"mob_sess_def456ghi789","app_version":"2.1.3","platform":"iOS","os_version":"17.5.1","device_model":"iPhone 14","event_type":"app_open","screen_name":"home","previous_screen":null,"engagement_time_ms":null} 2 | {"event_id":"mob_1722963605002","timestamp":"2024-08-06T14:00:05.567Z","user_id":"user_mobile_456789","session_id":"mob_sess_def456ghi789","app_version":"2.1.3","platform":"iOS","os_version":"17.5.1","device_model":"iPhone 14","event_type":"screen_view","screen_name":"product_catalog","previous_screen":"home","engagement_time_ms":4333} 3 | {"event_id":"mob_1722963620003","timestamp":"2024-08-06T14:00:20.890Z","user_id":"user_mobile_456789","session_id":"mob_sess_def456ghi789","app_version":"2.1.3","platform":"iOS","os_version":"17.5.1","device_model":"iPhone 14","event_type":"product_view","screen_name":"product_detail","product_id":"prod_sneakers_042","product_name":"Athletic Running Shoes","product_category":"Footwear > Athletic","price":129.99,"currency":"USD","engagement_time_ms":15000} 4 | {"event_id":"mob_1722963635004","timestamp":"2024-08-06T14:00:35.123Z","user_id":"user_mobile_456789","session_id":"mob_sess_def456ghi789","app_version":"2.1.3","platform":"iOS","os_version":"17.5.1","device_model":"iPhone 14","event_type":"add_to_wishlist","product_id":"prod_sneakers_042","product_name":"Athletic Running Shoes","wishlist_count":3} 5 | {"event_id":"mob_1722963640005","timestamp":"2024-08-06T14:00:40.456Z","user_id":"user_android_123456","session_id":"mob_sess_jkl012mno345","app_version":"2.1.1","platform":"Android","os_version":"13","device_model":"Samsung Galaxy S23","event_type":"app_open","screen_name":"home","previous_screen":null,"engagement_time_ms":null} 6 | {"event_id":"mob_1722963645006","timestamp":"2024-08-06T14:00:45.789Z","user_id":"user_android_123456","session_id":"mob_sess_jkl012mno345","app_version":"2.1.1","platform":"Android","os_version":"13","device_model":"Samsung Galaxy S23","event_type":"search","search_query":"bluetooth speakers","search_results_count":24,"screen_name":"search_results"} -------------------------------------------------------------------------------- /dataplex/profiling/web_events_20240806_14.json: -------------------------------------------------------------------------------- 1 | { 2 | "events": [ 3 | { 4 | "event_id": "evt_1722963600001", 5 | "timestamp": "2024-08-06T14:00:00.123Z", 6 | "session_id": "sess_abc123def456", 7 | "user_id": "user_789012345", 8 | "event_type": "page_view", 9 | "page_url": "https://shop.example.com/products/wireless-headphones", 10 | "page_title": "Premium Wireless Headphones - Electronics Store", 11 | "referrer": "https://google.com/search", 12 | "user_agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15", 13 | "ip_address": "192.168.1.100", 14 | "country": "US", 15 | "state": "CA", 16 | "city": "San Francisco", 17 | "device_type": "mobile", 18 | "browser": "Safari", 19 | "utm_source": "google", 20 | "utm_medium": "cpc", 21 | "utm_campaign": "summer_electronics" 22 | }, 23 | { 24 | "event_id": "evt_1722963660002", 25 | "timestamp": "2024-08-06T14:01:00.456Z", 26 | "session_id": "sess_abc123def456", 
27 | "user_id": "user_789012345", 28 | "event_type": "add_to_cart", 29 | "product_id": "prod_headphones_001", 30 | "product_name": "Premium Wireless Headphones", 31 | "product_category": "Electronics > Audio", 32 | "price": 199.99, 33 | "quantity": 1, 34 | "currency": "USD", 35 | "cart_total": 199.99, 36 | "ip_address": "192.168.1.100", 37 | "device_type": "mobile" 38 | }, 39 | { 40 | "event_id": "evt_1722963720003", 41 | "timestamp": "2024-08-06T14:02:00.789Z", 42 | "session_id": "sess_xyz789ghi012", 43 | "user_id": null, 44 | "event_type": "page_view", 45 | "page_url": "https://shop.example.com/", 46 | "page_title": "Home - Electronics Store", 47 | "referrer": "https://facebook.com/", 48 | "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", 49 | "ip_address": "203.0.113.42", 50 | "country": "US", 51 | "state": "NY", 52 | "city": "New York", 53 | "device_type": "desktop", 54 | "browser": "Chrome", 55 | "utm_source": "facebook", 56 | "utm_medium": "social", 57 | "utm_campaign": null 58 | } 59 | ] 60 | } 61 | -------------------------------------------------------------------------------- /ai/automl/adoption_predict.py: -------------------------------------------------------------------------------- 1 | import google.auth 2 | import logging 3 | 4 | from google.cloud import aiplatform_v1, aiplatform 5 | 6 | logging.basicConfig(level=logging.INFO) 7 | logger = logging.getLogger(__name__) 8 | 9 | creds, project = google.auth.default() 10 | location = "us-central1" 11 | 12 | 13 | def find_endpoint_and_predict(display_prefix="adopted", location=location): 14 | if not project: 15 | logger.error( 16 | "No gcloud project found. Set with `gcloud config set project PROJECT` or pass project explicitly." 17 | ) 18 | return 19 | 20 | parent = f"projects/{project}/locations/{location}" 21 | 22 | endpoint_client = aiplatform_v1.EndpointServiceClient( 23 | client_options={"api_endpoint": f"{location}-aiplatform.googleapis.com"} 24 | ) 25 | 26 | # List endpoints and find first whose display_name starts with the prefix 27 | endpoints = endpoint_client.list_endpoints(request={"parent": parent}) 28 | target = None 29 | for ep in endpoints: 30 | if ep.display_name and ep.display_name.startswith(display_prefix): 31 | target = aiplatform.Endpoint(endpoint_name=ep.name) 32 | break 33 | 34 | if not target: 35 | logger.error( 36 | f"No endpoint found with display_name starting with '{display_prefix}' in {parent}" 37 | ) 38 | return 39 | 40 | # Prediction instances for adoption model 41 | instances = [ 42 | { 43 | "Type": "Cat", 44 | "Age": "3", 45 | "Breed1": "Tabby", 46 | "Gender": "Male", 47 | "Color1": "Black", 48 | "Color2": "White", 49 | "MaturitySize": "Small", 50 | "FurLength": "Short", 51 | "Vaccinated": "No", 52 | "Sterilized": "No", 53 | "Health": "Healthy", 54 | "Fee": "100", 55 | "PhotoAmt": "2", 56 | } 57 | ] 58 | 59 | try: 60 | prediction = target.predict(instances) 61 | logger.info(f"Prediction result: {prediction.predictions[0]}") 62 | except Exception as e: 63 | logger.error(f"Prediction call failed: {e}") 64 | raise 65 | 66 | 67 | if __name__ == "__main__": 68 | find_endpoint_and_predict() 69 | -------------------------------------------------------------------------------- /dataplex/lineage/lineage_tools.py: -------------------------------------------------------------------------------- 1 | from google.cloud import datacatalog_lineage_v1 2 | from datetime import datetime, timezone, timedelta 3 | 4 | 5 | def list_processes(): 6 | # Create a client 7 | client = 
datacatalog_lineage_v1.LineageClient() 8 | 9 | # Initialize request argument(s) 10 | request = datacatalog_lineage_v1.ListProcessesRequest( 11 | parent=parent, 12 | ) 13 | 14 | # Make the request 15 | page_result = client.list_processes(request=request) 16 | 17 | # Handle the response 18 | for response in page_result: 19 | yield response 20 | 21 | 22 | def list_lineage_events(event_name): 23 | # Create a client 24 | client = datacatalog_lineage_v1.LineageClient() 25 | 26 | # Initialize request argument(s) 27 | request = datacatalog_lineage_v1.ListLineageEventsRequest( 28 | parent=event_name, 29 | ) 30 | 31 | # Make the request 32 | page_result = client.list_lineage_events(request=request) 33 | 34 | # Handle the response 35 | for response in page_result: 36 | yield response 37 | 38 | 39 | def list_runs(process_name, num_days): 40 | 41 | now_utc = datetime.now(timezone.utc).replace(tzinfo=None) 42 | cutoff = now_utc + timedelta(days=-num_days) 43 | 44 | # Create a client 45 | client = datacatalog_lineage_v1.LineageClient() 46 | 47 | # Initialize request argument(s) 48 | request = datacatalog_lineage_v1.ListRunsRequest( 49 | parent=process_name, 50 | ) 51 | 52 | # Make the request 53 | page_result = client.list_runs(request=request) 54 | 55 | # Handle the response 56 | for response in page_result: 57 | dt = datetime.fromtimestamp(response.start_time.timestamp()) 58 | if dt >= cutoff: 59 | yield response 60 | 61 | 62 | if __name__ == "__main__": 63 | parent = "projects/jwd-gcp-demos/locations/us" 64 | processes = list_processes() 65 | for process in processes: 66 | print(process) 67 | print(f"Runs for {process.name}:") 68 | runs = list_runs(process_name=process.name, num_days=1) 69 | for run in runs: 70 | print("Run start time:", run.start_time) 71 | print(run) 72 | print(f"Events for {run.name}:") 73 | events = list_lineage_events(run.name) 74 | for event in events: 75 | print(f" {event}") 76 | -------------------------------------------------------------------------------- /dlp-demo/README.md: -------------------------------------------------------------------------------- 1 | # DLP Demo 2.0 2 | 3 | A Google Cloud Data Loss Prevention (DLP) demonstration application built for Cloud Run. 4 | 5 | ## Features 6 | 7 | - **Modern Python**: Built with Python 3.11+ and type hints 8 | - **Cloud Run Ready**: Containerized application with health checks 9 | - **Application Default Credentials**: No service account keys needed 10 | - **Modern Dependencies**: Up-to-date Flask, Google Cloud DLP, and other libraries 11 | - **uv Package Management**: Fast and reliable dependency management 12 | - **Best Practices**: Well-architected, documented, and tested 13 | 14 | ## Architecture 15 | 16 | The application provides a web interface for demonstrating Google Cloud DLP capabilities: 17 | 18 | - **Text Inspection**: Identify sensitive information in text 19 | - **Data Redaction**: Remove sensitive information 20 | - **Data Replacement**: Replace sensitive information with placeholders 21 | - **Data Masking**: Mask sensitive information with characters 22 | 23 | ## Development Setup 24 | 25 | 1. Install uv if not already installed: 26 | ```bash 27 | curl -LsSf https://astral.sh/uv/install.sh | sh 28 | ``` 29 | 30 | 2. Install dependencies: 31 | ```bash 32 | uv sync 33 | ``` 34 | 35 | 3. Set up Google Cloud authentication: 36 | ```bash 37 | gcloud auth application-default login 38 | ``` 39 | 40 | 4. 
Set environment variables: 41 | ```bash 42 | export GOOGLE_CLOUD_PROJECT=your-project-id 43 | export PORT=8080 44 | ``` 45 | 46 | 5. Run the development server: 47 | ```bash 48 | uv run python run.py 49 | ``` 50 | 51 | Or alternatively: 52 | ```bash 53 | uv run python -m app.main 54 | ``` 55 | 56 | ## Deployment 57 | 58 | ### Cloud Run 59 | 60 | 1. Build and deploy: 61 | ```bash 62 | gcloud run deploy dlp-demo \ 63 | --source . \ 64 | --platform managed \ 65 | --region us-central1 \ 66 | --allow-unauthenticated 67 | ``` 68 | 69 | 2. The service will automatically use the Cloud Run service account with appropriate permissions. 70 | 71 | ## Environment Variables 72 | 73 | - `GOOGLE_CLOUD_PROJECT`: Your Google Cloud project ID 74 | - `PORT`: Port to run the application on (default: 8080) 75 | - `FLASK_ENV`: Set to 'development' for debug mode 76 | 77 | ## API Endpoints 78 | 79 | - `GET /`: Main DLP demo interface 80 | - `POST /api/dlp`: Process text with DLP operations 81 | - `GET /health`: Health check endpoint 82 | 83 | ## License 84 | 85 | This project is for educational purposes as part of ROI Training's GCP demonstrations. 86 | -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/send_events.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | import datetime 4 | import time 5 | import googleapiclient.discovery 6 | import json 7 | 8 | from google.cloud import pubsub_v1 as pubsub 9 | from google.oauth2 import service_account 10 | 11 | # all transactions will occur in these zip codes 12 | ZIPS = ["95136", "95126", "95404", "94929"] 13 | 14 | # handle command-line arguments 15 | # must provide project_id 16 | # can specify values for topic, sub, and service account 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument( 19 | "--project_id", 20 | required=True) 21 | parser.add_argument( 22 | "--topic_id", 23 | default="demo_topic") 24 | parser.add_argument( 25 | "--sub_id", 26 | default="demo_sub") 27 | 28 | known_args, extra_args = parser.parse_known_args() 29 | 30 | # create the topic in the specified project 31 | publisher = pubsub.PublisherClient() 32 | topic_name = 'projects/{project_id}/topics/{topic}'.format( 33 | project_id=known_args.project_id, 34 | topic=known_args.topic_id, # Set this to something appropriate. 
35 | ) 36 | topic_list = publisher.list_topics(project=f"projects/{known_args.project_id}") 37 | if (next((True for x in topic_list if topic_name == x.name), False)): 38 | pass 39 | else: 40 | topic = publisher.create_topic(name=topic_name) 41 | 42 | # create the sub in the specified project 43 | subscriber = pubsub.SubscriberClient() 44 | topic_path = publisher.topic_path(known_args.project_id, known_args.topic_id) 45 | sub_path = subscriber.subscription_path(known_args.project_id, known_args.sub_id) 46 | sub_list = subscriber.list_subscriptions(project=f"projects/{known_args.project_id}") 47 | if (next((True for x in sub_list if sub_path == x.name), False)): 48 | pass 49 | else: 50 | subscriber.create_subscription(request={"name": sub_path, "topic": topic_path}) 51 | 52 | # send a message every second 53 | while True: 54 | time.sleep(1) 55 | pos_id = random.randint(1,9) # there are 10 pos terminals 56 | timestamp = datetime.datetime.now().isoformat() 57 | zip_code = ZIPS[random.randint(0,3)] 58 | amount = round(random.uniform(1.00, 1000.0),2) 59 | body_dict = {"pos_id": pos_id, 60 | "ts": timestamp, 61 | "zip": zip_code, 62 | "sale_amount": amount} 63 | body = json.dumps(body_dict).encode("utf-8") # create a byte array 64 | future = publisher.publish(topic_path, body) 65 | message_id = future.result() 66 | print("Message published") 67 | print(f" - ID: {message_id}") 68 | print(f" - BODY: {body}") -------------------------------------------------------------------------------- /bigquery/exported_billing_data_example.sql: -------------------------------------------------------------------------------- 1 | -- ------------------------------------------------------------ 2 | -- USE CASE (BigQuery-focused): 3 | -- This query helps you understand spend on BigQuery specifically by reading 4 | -- your Google Cloud billing export in BigQuery and subtotaling the different 5 | -- BigQuery cost components (for example: query/analysis, storage, streaming inserts, 6 | -- load jobs, copy jobs, and other SKU-level charges). 7 | -- 8 | -- WHAT IT RETURNS: 9 | -- It returns a subtotaled list of BigQuery cost components and their total USD 10 | -- cost over the selected time window (the query groups SKUs into human-friendly 11 | -- activity buckets and sums the cost for each bucket). 12 | -- 13 | -- HOW IT WORKS: 14 | -- The query reads rows from your billing export table filtered to service = 'BigQuery', 15 | -- extracts the SKU description, maps SKUs into activity groups (analysis/query, storage, 16 | -- streaming, load, copy, etc.), and then aggregates costs per activity. 17 | -- 18 | -- REQUIREMENTS TO USE: 19 | -- - Billing export must be enabled in Google Cloud. 20 | -- - You must use the resource-level billing export (resource-level exports include 21 | -- SKU and resource metadata that this query relies on). 22 | -- - Replace the table placeholder(s) below with your actual export table identifier, 23 | -- for example: `my-billing-project.my_dataset.gcp_billing_export_v1_012345_YYYYMMDD` or 24 | -- the wildcard export table pattern `my-billing-project.my_dataset.gcp_billing_export_v1_*`. 
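--
-- EXAMPLE OUTPUT (shape only -- the figures below are illustrative placeholders,
-- not real billing data):
--   activity            | total_cost_usd
--   Query (Analysis)    | 1234.56
--   Storage             | 210.42
--   Streaming Inserts   | 35.10
--   Other: <sku name>   | 8.75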
25 | -- ------------------------------------------------------------ 26 | WITH 27 | bq_usage AS ( 28 | SELECT 29 | cost, 30 | sku.description AS sku_desc, 31 | service.description AS service_desc, 32 | usage_start_time 33 | FROM 34 | `{{PROJECT_ID}}.{{DATASET}}.{{TABLE}}` 35 | WHERE 36 | service.description = 'BigQuery' 37 | AND usage_start_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 30 DAY) 38 | AND CURRENT_TIMESTAMP() ) 39 | SELECT 40 | activity, 41 | ROUND(SUM(cost), 2) AS total_cost_usd 42 | FROM ( 43 | SELECT 44 | cost, 45 | CASE 46 | WHEN LOWER(sku_desc) LIKE '%analysis%' THEN 'Query (Analysis)' 47 | WHEN LOWER(sku_desc) LIKE '%storage%' THEN 'Storage' 48 | WHEN LOWER(sku_desc) LIKE '%streaming insert%' THEN 'Streaming Inserts' 49 | WHEN LOWER(sku_desc) LIKE '%load job%' THEN 'Load Jobs' 50 | WHEN LOWER(sku_desc) LIKE '%copy%' THEN 'Copy Jobs' 51 | ELSE 'Other: ' || sku_desc 52 | END 53 | AS activity 54 | FROM 55 | bq_usage ) 56 | GROUP BY 57 | activity 58 | ORDER BY 59 | total_cost_usd DESC; -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/schema_defs.py: -------------------------------------------------------------------------------- 1 | from google.cloud import bigquery 2 | 3 | # python client library schema definitions 4 | ccl_messages_schema = [ 5 | bigquery.SchemaField('pos_id', 'INT64', mode='REQUIRED'), 6 | bigquery.SchemaField('ts', 'DATETIME', mode='REQUIRED'), 7 | bigquery.SchemaField('zip', 'STRING', mode='REQUIRED'), 8 | bigquery.SchemaField('sale_amount', 'FLOAT', mode='REQUIRED') 9 | ] 10 | 11 | ccl_messages_nested_schema = [ 12 | bigquery.SchemaField('window_ending', 'DATETIME', mode='REQUIRED'), 13 | bigquery.SchemaField('pos_id', 'INT64', mode='REQUIRED'), 14 | bigquery.SchemaField( 15 | 'transactions', 16 | 'RECORD', 17 | mode='REPEATED', 18 | fields=[ 19 | bigquery.SchemaField('ts', 'DATETIME', mode='REQUIRED'), 20 | bigquery.SchemaField('zip', 'STRING', mode='REQUIRED'), 21 | bigquery.SchemaField('sale_amount', 'FLOAT', mode='REQUIRED') 22 | ] 23 | ) 24 | ] 25 | 26 | # beam bigqueryio schema definitions 27 | beam_messages_schema = { 28 | "fields": [ 29 | { 30 | "name": "pos_id", 31 | "type": "INT64", 32 | "mode": 'REQUIRED' 33 | }, 34 | { 35 | "name": "ts", 36 | "type": "DATETIME", 37 | "mode": 'REQUIRED' 38 | }, 39 | { 40 | "name": "zip", 41 | "type": "STRING", 42 | "mode": 'REQUIRED' 43 | }, 44 | { 45 | "name": "sale_amount", 46 | "type": "FLOAT", 47 | "mode": 'REQUIRED' 48 | } 49 | ] 50 | } 51 | 52 | beam_messages_nested_schema = { 53 | "fields": [ 54 | { 55 | "name": "window_ending", 56 | "type": "DATETIME", 57 | "mode": 'REQUIRED' 58 | }, 59 | { 60 | "name": "pos_id", 61 | "type": "INT64", 62 | "mode": 'REQUIRED' 63 | }, 64 | { 65 | "name": "transactions", 66 | "type": "RECORD", 67 | "mode": 'REPEATED', 68 | "fields": [ 69 | { 70 | "name": "ts", 71 | "type": "DATETIME", 72 | "mode": 'REQUIRED' 73 | }, 74 | { 75 | "name": "zip", 76 | "type": "STRING", 77 | "mode": 'REQUIRED' 78 | }, 79 | { 80 | "name": "sale_amount", 81 | "type": "FLOAT", 82 | "mode": 'REQUIRED' 83 | }, 84 | ] 85 | } 86 | ] 87 | } -------------------------------------------------------------------------------- /bigquery/github_demo.sql: -------------------------------------------------------------------------------- 1 | -- standardSQL 2 | -- search based on array length 3 | -- display full struct and array of struct 4 | SELECT 5 | author, 6 | difference 7 | FROM 8 | `bigquery-public-data.github_repos.commits` 9 | 
WHERE 10 | array_length(difference) = 5 11 | LIMIT 10 12 | 13 | -- standardSQL 14 | -- search based on array length 15 | -- create separate columns from struct properties 16 | SELECT 17 | author.email, 18 | difference 19 | FROM 20 | `bigquery-public-data.github_repos.commits` 21 | WHERE 22 | array_length(difference) = 5 23 | LIMIT 10 24 | 25 | -- standardSQL 26 | -- show correlated cross join and unnest 27 | -- this one row per email/file combo 28 | -- but also include the entire array for each output row 29 | WITH 30 | sample AS ( 31 | SELECT 32 | author.email, 33 | difference 34 | FROM 35 | `bigquery-public-data.github_repos.commits` 36 | WHERE 37 | ARRAY_LENGTH(difference) = 5 38 | LIMIT 39 | 1) 40 | SELECT 41 | email, 42 | difference, 43 | diff.new_path as path 44 | from 45 | sample, 46 | unnest(difference) as diff 47 | 48 | -- standardSQL 49 | -- show correlated cross join and unnest 50 | -- this drop the difference column with the array 51 | WITH 52 | sample AS ( 53 | SELECT 54 | author.email, 55 | difference 56 | FROM 57 | `bigquery-public-data.github_repos.commits` 58 | WHERE 59 | ARRAY_LENGTH(difference) = 5 60 | LIMIT 61 | 1) 62 | SELECT 63 | email, 64 | diff.new_path as path 65 | from 66 | sample, 67 | unnest(difference) as diff 68 | 69 | -- standardSQL 70 | -- find commits where a particular file was touched 71 | -- this shows searching on values within an array 72 | -- by using correlated cross join and filter 73 | SELECT 74 | author, 75 | difference 76 | FROM 77 | `bigquery-public-data.github_repos.commits`, 78 | unnest(difference) as files 79 | WHERE 80 | files.new_path = "courses/data_analysis/lab2/python/is_popular.py" 81 | 82 | -- standardSQL 83 | -- this also shows searching on values within an array 84 | -- this time using subquery in where clause 85 | SELECT 86 | author, 87 | difference 88 | FROM 89 | `bigquery-public-data.github_repos.commits` 90 | WHERE 91 | "courses/data_analysis/lab2/python/is_popular.py" in (select f.new_path from unnest(difference) as f) 92 | 93 | -- standardSQL 94 | -- this is by far the fastest way of the three to search on values in array 95 | -- this avoids the cross join of #1. EXISTS is faster than IN 96 | SELECT 97 | author, 98 | difference 99 | FROM 100 | `bigquery-public-data.github_repos.commits` 101 | WHERE 102 | EXISTS ( 103 | SELECT 104 | * 105 | FROM 106 | UNNEST(difference) AS f 107 | WHERE 108 | f.new_path="courses/data_analysis/lab2/python/is_popular.py") -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .vscode* 3 | snippets* 4 | dflow-bq-stream-python/env/ 5 | dflow-bq-stream-python/env/ 6 | *secrets* 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | pip-wheel-metadata/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | snippets.txt 139 | .streamlit/secrets.toml 140 | .streamlit/streamlit-sa.json 141 | .DS_Store 142 | 143 | # Ignore Terraform state files 144 | *.tfstate 145 | *.tfstate.* 146 | .terraform/ 147 | *.tfvars -------------------------------------------------------------------------------- /bigquery/schema-demo/nested_queries.sql: -------------------------------------------------------------------------------- 1 | -- standardSQL 2 | -- find sales/zip for march from nested_once table 3 | WITH 4 | orders AS ( 5 | SELECT 6 | cust_zip, 7 | prod_price * qty AS line_item_subtotal 8 | FROM 9 | `.bq_demo.nested_once`, 10 | unnest(line_items) 11 | WHERE 12 | order_date >= "2018-03-01" 13 | AND order_date <= "2018-03-31") 14 | SELECT 15 | cust_zip, 16 | SUM(line_item_subtotal) as zip_sales 17 | FROM 18 | orders 19 | GROUP BY 20 | cust_zip 21 | order by 22 | zip_sales desc 23 | 24 | 25 | -- standardSQL 26 | -- find sales/zip for march from nested/partitioned 27 | WITH 28 | orders AS ( 29 | SELECT 30 | cust_zip, 31 | prod_price * qty AS line_item_subtotal 32 | FROM 33 | `.bq_demo.table_nested_partitioned`, 34 | unnest(line_items) 35 | WHERE 36 | order_date >= "2018-03-01" 37 | AND order_date <= "2018-03-31") 38 | SELECT 39 | cust_zip, 40 | SUM(line_item_subtotal) as zip_sales 41 | FROM 42 | orders 43 | GROUP BY 44 | cust_zip 45 | order by 46 | zip_sales desc 47 | 48 | -- standardSQL 49 | -- find sales for 6 months in 8754 from nested 50 | WITH 51 | orders AS ( 52 | SELECT 53 | cust_zip, 54 | prod_price * qty AS line_item_subtotal 55 | FROM 56 | `.bq_demo.nested_once`, 57 | UNNEST(line_items) 58 | WHERE 59 | order_date >= "2018-01-01" 60 | AND order_date <= "2018-06-30" 61 | AND cust_zip=8754) 62 | SELECT 63 
| cust_zip, 64 | SUM(line_item_subtotal) AS zip_sales 65 | FROM 66 | orders 67 | GROUP BY 68 | cust_zip 69 | ORDER BY 70 | zip_sales DESC 71 | 72 | -- standardSQL 73 | -- find for 6 months in 8754 from nested/partitioned 74 | WITH 75 | orders AS ( 76 | SELECT 77 | cust_zip, 78 | prod_price * qty AS line_item_subtotal 79 | FROM 80 | `.bq_demo.table_nested_partitioned`, 81 | UNNEST(line_items) 82 | WHERE 83 | order_date >= "2018-01-01" 84 | AND order_date <= "2018-06-30" 85 | AND cust_zip=8754) 86 | SELECT 87 | cust_zip, 88 | SUM(line_item_subtotal) AS zip_sales 89 | FROM 90 | orders 91 | GROUP BY 92 | cust_zip 93 | ORDER BY 94 | zip_sales DESC 95 | 96 | -- standardSQL 97 | -- find for 6 months in 8754 from nested/partitioned/clustered 98 | WITH 99 | orders AS ( 100 | SELECT 101 | cust_zip, 102 | prod_price * qty AS line_item_subtotal 103 | FROM 104 | `.bq_demo.table_nested_partitioned_clustered`, 105 | UNNEST(line_items) 106 | WHERE 107 | order_date >= "2018-01-01" 108 | AND order_date <= "2018-06-30" 109 | AND cust_zip=8754) 110 | SELECT 111 | cust_zip, 112 | SUM(line_item_subtotal) AS zip_sales 113 | FROM 114 | orders 115 | GROUP BY 116 | cust_zip 117 | ORDER BY 118 | zip_sales DESC -------------------------------------------------------------------------------- /utilities/shopping_list_api/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from fastapi import FastAPI, HTTPException 4 | from google import genai 5 | from pydantic import BaseModel 6 | from google.genai import types 7 | import dotenv 8 | from typing import List 9 | from pydantic import BaseModel 10 | 11 | dotenv.load_dotenv() 12 | 13 | 14 | class Item(BaseModel): 15 | name: str 16 | quantity: int 17 | aisle: str 18 | 19 | 20 | class ShoppingListResponse(BaseModel): 21 | items: List[Item] 22 | 23 | 24 | schema_config = { 25 | "response_mime_type": "application/json", 26 | "response_schema": list[Item], # <— Gemini will use this schema 27 | } 28 | 29 | 30 | from fastapi.responses import JSONResponse, FileResponse 31 | from fastapi.requests import Request 32 | from fastapi.staticfiles import StaticFiles 33 | 34 | # The app will listen on port 8080 (see Dockerfile CMD) 35 | app = FastAPI() 36 | 37 | # Mount static directory for favicon 38 | import pathlib 39 | 40 | static_dir = pathlib.Path(__file__).parent / "static" 41 | static_dir.mkdir(exist_ok=True) 42 | app.mount("/static", StaticFiles(directory=static_dir), name="static") 43 | 44 | 45 | @app.exception_handler(404) 46 | async def not_found_handler(request: Request, exc): 47 | return JSONResponse(status_code=404, content={"detail": "Not Found"}) 48 | 49 | 50 | # Initialize the Gen AI client for Vertex AI 51 | client = genai.Client( 52 | vertexai=True, 53 | project=os.environ["GOOGLE_CLOUD_PROJECT"], 54 | location=os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1"), 55 | ) 56 | 57 | 58 | @app.get("/get-list", response_model=ShoppingListResponse) 59 | async def shopping_list(): 60 | prompt = ( 61 | "Think of an American household with n adults and m children." 62 | "Think of an event that this household might be preparing for" 63 | "Generate a Costco shopping list of 5-10 items for that household" 64 | "The list should be in JSON format as an array of objects. " 65 | "Each object must have fields: name (string), quantity (integer), aisle (string)." 66 | "Return the list as a JSON array. Don't include any additional text or formatting." 
67 | ) 68 | 69 | # Call the Gemini model via the Gen AI SDK 70 | try: 71 | response = client.models.generate_content( 72 | model="gemini-2.5-flash", 73 | contents=prompt, 74 | config=schema_config, 75 | ) 76 | except Exception as e: 77 | raise HTTPException(status_code=500, detail=f"GenAI API error: {e}") 78 | 79 | # Parse the returned text as JSON 80 | raw = response.text 81 | try: 82 | items = json.loads(raw) 83 | except json.JSONDecodeError as e: 84 | raise HTTPException( 85 | status_code=500, 86 | detail=f"Failed to parse JSON from GenAI response: {e}", 87 | ) 88 | 89 | return {"items": items} 90 | -------------------------------------------------------------------------------- /functions/cat-bq-completions.py: -------------------------------------------------------------------------------- 1 | import json 2 | import base64 3 | from datetime import datetime 4 | from google.cloud import storage 5 | import functions_framework 6 | 7 | 8 | @functions_framework.cloud_event 9 | def process_bigquery_job_completion(cloud_event): 10 | """ 11 | Processes BigQuery job completion events from Pub/Sub and writes details to Cloud Storage. 12 | 13 | Triggered by Pub/Sub messages containing BigQuery audit logs. 14 | """ 15 | 16 | # Log the incoming event for debugging 17 | log_entry = None 18 | # Try to handle different event structures 19 | if "data" in cloud_event.data: 20 | # Standard Pub/Sub trigger 21 | try: 22 | message_data = base64.b64decode(cloud_event.data["data"]).decode("utf-8") 23 | log_entry = json.loads(message_data) 24 | except Exception as e: 25 | print(f"Error decoding message: {e}") 26 | return 27 | elif "message" in cloud_event.data and "data" in cloud_event.data["message"]: 28 | # Eventarc Pub/Sub trigger 29 | try: 30 | message_data = base64.b64decode(cloud_event.data["message"]["data"]).decode( 31 | "utf-8" 32 | ) 33 | log_entry = json.loads(message_data) 34 | except Exception as e: 35 | print(f"Error decoding message: {e}") 36 | return 37 | else: 38 | print("No data found in cloud event. Event structure:", cloud_event.data) 39 | return 40 | 41 | try: 42 | 43 | # Write to Cloud Storage 44 | write_to_cloud_storage( 45 | bucket_name="jwd-gcp-demos", 46 | file_path=f"bigquery-jobs/{datetime.now().strftime('%Y/%m/%d')}/job-completions.log", 47 | content=json.dumps(log_entry, indent=2), 48 | ) 49 | 50 | print("Successfully processed BigQuery job completion event") 51 | 52 | except Exception as e: 53 | print(f"Error processing BigQuery job completion: {str(e)}") 54 | print(f"Raw log entry: {json.dumps(log_entry, indent=2)}") 55 | 56 | 57 | def write_to_cloud_storage(bucket_name, file_path, content): 58 | """ 59 | Write content to Cloud Storage, overwriting the object at file_path if it already exists. 
60 | """ 61 | try: 62 | # Initialize the Cloud Storage client 63 | storage_client = storage.Client() 64 | bucket = storage_client.bucket(bucket_name) 65 | blob = bucket.blob(file_path) 66 | 67 | # Upload the updated content 68 | blob.upload_from_string(content, content_type="application/json") 69 | 70 | print(f"Successfully wrote job data to gs://{bucket_name}/{file_path}") 71 | 72 | except Exception as e: 73 | print(f"Error writing to Cloud Storage: {str(e)}") 74 | raise 75 | 76 | 77 | # Optional: Add requirements.txt content 78 | """ 79 | requirements.txt: 80 | google-cloud-storage>=2.10.0 81 | functions-framework>=3.0.0 82 | """ 83 | -------------------------------------------------------------------------------- /utilities/shopping_list_api/README.md: -------------------------------------------------------------------------------- 1 | # Shopping List API server 2 | 3 | Silly little demo API that returns a generated shopping list (JSON). The service 4 | calls Vertex AI's GenAI (Gemini) via the `google-genai` SDK and exposes a single 5 | HTTP GET endpoint that returns a list of items suitable for demo/testing. 6 | 7 | ## What this does 8 | 9 | - Starts a FastAPI server (listens on port 8080 by default). 10 | - Calls a Gemini model to generate a small shopping list in JSON. 11 | - Returns the list as: `{ "items": [ {"name":..., "quantity":..., "aisle":...}, ... ] }`. 12 | 13 | ## Files of interest 14 | 15 | - `main.py` - FastAPI app and GenAI client usage. 16 | - `pyproject.toml` - project metadata and dependencies. 17 | - `Dockerfile` - container image build for running the app in Docker. 18 | 19 | ## Prerequisites 20 | 21 | - Python 3.12 (the project `pyproject.toml` requests >=3.12) or Docker. 22 | - A Google Cloud project with Vertex AI enabled and a service account that has 23 | permission to call Vertex AI. 24 | - The `GOOGLE_CLOUD_PROJECT` environment variable must be set. 25 | 26 | Optional but recommended environment variables in a `.env` file: 27 | 28 | ``` 29 | GOOGLE_CLOUD_PROJECT=your-gcp-project-id 30 | GOOGLE_CLOUD_LOCATION=us-central1 31 | ``` 32 | 33 | If running locally, authenticate with `gcloud auth application-default login` 34 | or set `GOOGLE_APPLICATION_CREDENTIALS` to a service account JSON key. 35 | 36 | ## Run with Docker (recommended) 37 | 38 | Build the image from the `shopping_list_api` directory and run it: 39 | 40 | ```bash 41 | docker build -t shopping-list-api . 42 | docker run -p 8080:8080 \ 43 | -e GOOGLE_CLOUD_PROJECT=your-gcp-project-id \ 44 | -e GOOGLE_CLOUD_LOCATION=us-central1 \ 45 | shopping-list-api 46 | ``` 47 | 48 | The server will be reachable at `http://localhost:8080`. 49 | 50 | ## Run locally (development) 51 | 52 | 1. Export env vars (or create a `.env` file) and run: 53 | 54 | ```bash 55 | export GOOGLE_CLOUD_PROJECT=your-gcp-project-id 56 | export GOOGLE_CLOUD_LOCATION=us-central1 57 | uv run uvicorn main:app --host 0.0.0.0 --port 8080 --reload 58 | ``` 59 | 60 | ## API 61 | 62 | GET /get-list 63 | 64 | - Description: Generates a Costco-style shopping list using Gemini and returns 65 | it as JSON. 66 | - Response schema: 67 | 68 | ```json 69 | { 70 | "items": [ 71 | {"name": "string", "quantity": 1, "aisle": "string"} 72 | ] 73 | } 74 | ``` 75 | 76 | Example curl: 77 | 78 | ```bash 79 | curl -s http://localhost:8080/get-list | jq 80 | ``` 81 | 82 | Expected behaviors and error cases: 83 | 84 | - If the GenAI call fails, the server returns HTTP 500 with a message. 
85 | - If the model returns non-JSON or invalid JSON, the server returns HTTP 500. 86 | 87 | ## Environment / Authentication notes 88 | 89 | - When deployed on GCE/GKE/Cloud Run with the correct service account, the app 90 | will use Workload Identity / service account credentials automatically. 91 | - For local testing, set `GOOGLE_APPLICATION_CREDENTIALS` to a service account 92 | key file that has permission to call Vertex AI, or run 93 | `gcloud auth application-default login`. 94 | -------------------------------------------------------------------------------- /composer/dags/bq_export_strategies.py: -------------------------------------------------------------------------------- 1 | # Purpose: 2 | # Demonstrates multiple approaches to exporting BigQuery data to GCS in Airflow. 3 | # Highlights that there are often many ways to accomplish tasks in Airflow. 4 | # 5 | # Preparation Needed: 6 | # - Create an Airflow variable for your GCP project ID. 7 | # - Ensure a GCS bucket exists with the same name as your project. 8 | # - Install required dependencies in your Composer environment (BigQuery, GCS, pandas). 9 | 10 | from datetime import datetime, timedelta 11 | 12 | from airflow import DAG 13 | from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator 14 | from airflow.providers.google.cloud.transfers.bigquery_to_gcs import ( 15 | BigQueryToGCSOperator, 16 | ) 17 | from airflow.operators.python import PythonOperator 18 | from airflow.models import Variable 19 | from google.cloud import bigquery 20 | 21 | import pandas as pd 22 | import logging 23 | 24 | # Default arguments for the DAG 25 | default_args = { 26 | "owner": "data-team", 27 | "depends_on_past": False, 28 | "start_date": datetime(2024, 1, 1), 29 | } 30 | 31 | 32 | # Read GCP project and bucket from Airflow Variables 33 | SOURCE_PROJECT_ID = "roi-bq-demos" 34 | GCS_BUCKET = Variable.get("gcp_project_id") 35 | DATASET_ID = "bq_demo" 36 | TABLE_ID = "product" 37 | OUTPUT_FILE = "product_export.json" 38 | 39 | # DAG Definition 40 | dag = DAG( 41 | "export_product_table_strategies", 42 | default_args=default_args, 43 | description="Export product table from BigQuery to GCS using multiple strategies", 44 | schedule_interval=None, 45 | catchup=False, 46 | tags=["bigquery", "gcs", "product", "multi-strategy"], 47 | ) 48 | 49 | 50 | # 1. Export using BigQueryInsertJobOperator (EXPORT DATA) 51 | export_with_insertjob = BigQueryInsertJobOperator( 52 | task_id="export_with_insertjob", 53 | configuration={ 54 | "query": { 55 | "query": ( 56 | f""" 57 | EXPORT DATA OPTIONS( 58 | uri='gs://{GCS_BUCKET}/product/airflow_export_with_insertjob/*.json', 59 | format='JSON' 60 | ) AS 61 | SELECT * FROM `{SOURCE_PROJECT_ID}.{DATASET_ID}.{TABLE_ID}` 62 | """ 63 | ), 64 | "useLegacySql": False, 65 | } 66 | }, 67 | location="US", 68 | dag=dag, 69 | ) 70 | 71 | # 2. 
Export as Parquet using BigQueryToGCSOperator 72 | export_with_parquet = BigQueryToGCSOperator( 73 | task_id="export_with_parquet", 74 | source_project_dataset_table=f"{SOURCE_PROJECT_ID}.{DATASET_ID}.{TABLE_ID}", 75 | destination_cloud_storage_uris=[ 76 | f"gs://{GCS_BUCKET}/product/airflow_export_with_ToGCS.parquet" 77 | ], 78 | export_format="PARQUET", 79 | dag=dag, 80 | ) 81 | 82 | 83 | def export_with_custom_logic(**context): 84 | client = bigquery.Client() 85 | 86 | # Query the table 87 | query = f"SELECT * FROM `{SOURCE_PROJECT_ID}.{DATASET_ID}.{TABLE_ID}`" 88 | df = client.query(query).to_dataframe() 89 | 90 | # Custom processing 91 | # df_processed = df.apply() 92 | df_processed = df 93 | 94 | # Export to various destinations 95 | df_processed.to_parquet(f"gs://{GCS_BUCKET}/product/airflow_export_custom.parquet") 96 | 97 | 98 | # 3. Export as Parquet using BigQueryToGCSOperator 99 | custom_export = PythonOperator( 100 | task_id="custom_export", python_callable=export_with_custom_logic, dag=dag 101 | ) 102 | -------------------------------------------------------------------------------- /ai/del_endpoints.py: -------------------------------------------------------------------------------- 1 | import google.auth 2 | import logging 3 | from google.cloud import aiplatform_v1 4 | from google.api_core import exceptions as api_exceptions 5 | 6 | logging.basicConfig(level=logging.INFO) 7 | logger = logging.getLogger(__name__) 8 | 9 | creds, project = google.auth.default() 10 | location = "us-central1" 11 | 12 | 13 | def main(): 14 | client_options = {"api_endpoint": f"{location}-aiplatform.googleapis.com"} 15 | client = aiplatform_v1.EndpointServiceClient(client_options=client_options) 16 | 17 | parent = f"projects/{project}/locations/{location}" 18 | 19 | # List endpoints 20 | try: 21 | endpoints = client.list_endpoints(request={"parent": parent}) 22 | except api_exceptions.GoogleAPICallError as e: 23 | logger.error(f"Failed to list endpoints: {e}") 24 | return 25 | 26 | any_endpoints = False 27 | for ep in endpoints: 28 | any_endpoints = True 29 | logger.info( 30 | f"Processing endpoint: name={ep.name}, display_name={ep.display_name}" 31 | ) 32 | 33 | # Undeploy any deployed models from this endpoint 34 | deployed = getattr(ep, "deployed_models", None) or [] 35 | if deployed: 36 | for dm in deployed: 37 | # DeployedModel proto has an 'id' field which is the deployed_model_id 38 | deployed_model_id = getattr(dm, "id", None) or getattr( 39 | dm, "deployed_model_id", None 40 | ) 41 | if not deployed_model_id: 42 | logger.warning( 43 | f"Could not determine deployed model id for deployed model: {dm}" 44 | ) 45 | continue 46 | 47 | logger.info( 48 | f"Undeploying deployed_model_id={deployed_model_id} from endpoint={ep.name}" 49 | ) 50 | try: 51 | op = client.undeploy_model( 52 | request={ 53 | "endpoint": ep.name, 54 | "deployed_model_id": deployed_model_id, 55 | } 56 | ) 57 | logger.info("Waiting for undeploy operation to complete...") 58 | op.result(timeout=300) 59 | logger.info(f"Undeployed {deployed_model_id} from {ep.name}") 60 | except api_exceptions.GoogleAPICallError as e: 61 | logger.error( 62 | f"Failed to undeploy model {deployed_model_id} from {ep.name}: {e}" 63 | ) 64 | except Exception as e: 65 | logger.error( 66 | f"Unexpected error undeploying {deployed_model_id} from {ep.name}: {e}" 67 | ) 68 | else: 69 | logger.info("No deployed models found on this endpoint") 70 | 71 | # Delete the endpoint 72 | logger.info(f"Deleting endpoint: {ep.name}") 73 | try: 74 | del_op = 
client.delete_endpoint(request={"name": ep.name}) 75 | logger.info("Waiting for delete operation to complete...") 76 | del_op.result(timeout=300) 77 | logger.info(f"Deleted endpoint {ep.name}") 78 | except api_exceptions.GoogleAPICallError as e: 79 | logger.error(f"Failed to delete endpoint {ep.name}: {e}") 80 | except Exception as e: 81 | logger.error(f"Unexpected error deleting endpoint {ep.name}: {e}") 82 | 83 | if not any_endpoints: 84 | logger.info("No endpoints found.") 85 | 86 | 87 | if __name__ == "__main__": 88 | main() 89 | -------------------------------------------------------------------------------- /bigquery/schema-demo/README.md: -------------------------------------------------------------------------------- 1 | # ROI BigQuery Schema performance demo 2 | 3 | ## Goal 4 | The goal of this demo is to show the relative performance of different schema choices in BigQuery. 5 | 6 | After setup, the same data will be stored in tables with varying schemas: 7 | 8 | - Normalized w/multiple tables 9 | - Fully denormalized 10 | - Single-level nesting (one row per order) 11 | - Single-level nesting with partitioning 12 | - Single-level nesting with partitioning and clustering 13 | 14 | You can demo the changes in query performance and cost with these different schemas. 15 | 16 | Incidentally, this also provides the potential to demo Dataflow, BigQuery load techniques, doing intra-BigQuery ETL, etc. 17 | 18 | ## Setup 19 | 20 | This demo uses the normalized dataset **roi-bq-demos.bq_demo**, which has the following tables: 21 | - customer: 75M rows 22 | - product: 10K rows 23 | - order: 7.5B rows 24 | - line_item: 75B rows 25 | - order_part: 7.5B rows (partitioned on `order_date`) 26 | 27 | 28 | This is the largest dataset we can reasonably store for instructor use. If you want a larger dataset, you can create your own - the pieces you need are found in this directory. 29 | 30 | 1. Log into the cloud console, and select an appropriate project. Queries used to derive the non-normalized tables and do the demos are expensive, so choose your project wisely. 31 | 32 | 2. In your target project, make sure that there is a dataset named `bq_demo` (create it if necessary). 33 | 34 | 3. Run the query in **load_data.sql**. This will take about 70 minutes, cost $200, and create four new tables in your target project/dataset: 35 | - denorm 36 | - nested_once 37 | - table_nested_partitioned 38 | - table_nested_partitioned_clustered 39 | 40 | ## Demo 41 | 42 | Load the BQ user interface in the project where you have the dataset. 43 | 44 | 1. Run the `base normalized tables` query found in **norm_query.sql** 45 | * Note the amount of data processed 46 | * Note the structure of the query 47 | * Note the time taken to complete the query 48 | * Example run: 2TB, 140 seconds, $12.50 49 | 50 | 2. Run the `normalized tables with order_part table` query found in **norm_query.sql** 51 | * Note the amount of data processed 52 | * Note the structure of the query 53 | * Note the time taken to complete the query 54 | * Example run: 1.8TB, 125 seconds, $11.25 55 | * Note that partitioning did reduce the data read, and the time taken, but the win was minimal given the size of the line_item table -> this is the gating factor. You can show the execution graph to drive this home. 56 | 57 | 3. 
Run the query found in **denorm_query.sql** 58 | * Note the amount of data processed 59 | * Note the structure of the query 60 | * Note the time taken to complete the query 61 | * Note that you get the same results as in step 1 62 | * Example run: 2.2TB, 24 seconds, $13.75 63 | 64 | 4. Run the first query found in **nested_queries.sql** 65 | * Note the amount of data processed 66 | * Note the structure of the query 67 | * Note the time taken to complete the query 68 | * Note that you get the same results as in step 1 69 | * Example run: 1.2TB, 6.4 seconds, $7.50 70 | 71 | 5. Run the second query found in **nested_queries.sql** 72 | * Note the amount of data processed 73 | * Note the structure of the query 74 | * Note the time taken to complete the query 75 | * Note that you get the same results as in step 1 76 | * Example run: xTB, x seconds, $x 77 | 78 | 6. Run queries 3-4-5 from **nested_queries.sql** 79 | * Note the amount of data processed for each 80 | * Note the query time for each 81 | * You should see decreases for each one 82 | * Example runs 83 | * 1.2TB, 6.3 seconds, $7 84 | * 614G, 3.8 seconds, $3 85 | * 11GB, 3.2 seconds, $.06 -------------------------------------------------------------------------------- /composer/dags/export_top_customers.py: -------------------------------------------------------------------------------- 1 | # Example Scenario: 2 | # A marketing analyst wants to identify the top customers for the week of Christmas. 3 | # This DAG exports top customers from BigQuery to a GCS bucket to be fed to a marketing campaign. 4 | # 5 | # Prerequisites (see the setup sketch at the end of this header): 6 | # 1. Create an Airflow variable for your project: gcp_project_id 7 | # 2. Create a 'marketing' dataset in BigQuery. 8 | # 3. Ensure a GCS bucket exists with the project name. 9 | # 10 | # Demo Instructions: 11 | # 1. Place this DAG in your Airflow DAGs folder. 12 | # 2. Trigger the DAG in the Airflow UI. 13 | # 3. Check the output table in BigQuery. 14 | # 4. Verify the exported file in the GCS bucket.
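#
# For reference, a sketch of the prerequisite setup and post-run checks from a terminal
# (the project ID, dataset, and bucket names below are placeholders -- substitute your own):
#
#   airflow variables set gcp_project_id your-project-id
#   bq mk --dataset your-project-id:marketing
#   gsutil mb gs://your-project-id
#
#   # after a successful run:
#   bq head -n 5 your-project-id:marketing.high_value_customers
#   gsutil cat gs://your-project-id/high-value-customers.csv | head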
15 | 16 | from datetime import datetime 17 | 18 | from airflow import DAG 19 | from airflow.models import Variable 20 | from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator 21 | from airflow.providers.google.cloud.transfers.bigquery_to_gcs import ( 22 | BigQueryToGCSOperator, 23 | ) 24 | 25 | # Example 1: Export High-Value Customers to Cloud Storage 26 | high_value_customers_dag = DAG( 27 | "export_high_value_customers", 28 | default_args={"owner": "marketing-team", "start_date": datetime(2024, 1, 1)}, 29 | description="Find high-value customers and export to GCS for campaign targeting", 30 | catchup=False, 31 | tags=["marketing", "export"], 32 | ) 33 | 34 | # Read GCP project ID from Airflow Variable 35 | PROJECT_ID = Variable.get("gcp_project_id") 36 | 37 | # Query to identify high-value customers 38 | identify_high_value_customers = BigQueryInsertJobOperator( 39 | task_id="find_high_value_customers", 40 | configuration={ 41 | "query": { 42 | "query": ( 43 | """ 44 | WITH 45 | christmas_orders AS ( 46 | SELECT 47 | order_num, 48 | cust_id 49 | FROM 50 | `roi-bq-demos.bq_demo_small.order` o 51 | WHERE 52 | o.order_date >= "2018-12-18" 53 | AND o.order_date <= "2018-12-23" ) 54 | SELECT 55 | c.cust_id, 56 | SUM(li.qty * p.prod_price) AS total_purchases 57 | FROM 58 | christmas_orders co 59 | JOIN 60 | roi-bq-demos.bq_demo_small.line_item li 61 | ON 62 | co.order_num = li.order_num 63 | JOIN 64 | roi-bq-demos.bq_demo_small.customer c 65 | ON 66 | c.cust_id = co.cust_id 67 | JOIN 68 | roi-bq-demos.bq_demo_small.product p 69 | ON 70 | p.prod_code = li.prod_code 71 | GROUP BY 72 | c.cust_id 73 | ORDER BY 74 | total_purchases desc 75 | LIMIT 100 76 | """ 77 | ), 78 | "useLegacySql": False, 79 | "destinationTable": { 80 | "projectId": PROJECT_ID, 81 | "datasetId": "marketing", 82 | "tableId": "high_value_customers", 83 | }, 84 | "writeDisposition": "WRITE_TRUNCATE", 85 | } 86 | }, 87 | dag=high_value_customers_dag, 88 | ) 89 | 90 | # Export results to Cloud Storage for marketing campaign 91 | export_to_gcs = BigQueryToGCSOperator( 92 | task_id="export_customer_list", 93 | source_project_dataset_table=f"{PROJECT_ID}.marketing.high_value_customers", 94 | destination_cloud_storage_uris=[f"gs://{PROJECT_ID}/high-value-customers.csv"], 95 | export_format="CSV", 96 | dag=high_value_customers_dag, 97 | ) 98 | 99 | identify_high_value_customers >> export_to_gcs 100 | -------------------------------------------------------------------------------- /bigquery/schema-demo/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | read -p "project for data: " project 4 | read -p "source bucket: " bucket 5 | 6 | bq load --source_format=CSV --replace $project:bq_demo.customer gs://$bucket/bq-demo/customer* ./customer_schema.json 7 | bq load --source_format=CSV --replace $project:bq_demo.order gs://$bucket/bq-demo/order* ./order_schema.json 8 | bq load --source_format=CSV --replace $project:bq_demo.product gs://$bucket/bq-demo/product* ./product_schema.json 9 | bq load --source_format=CSV --replace $project:bq_demo.line_item gs://$bucket/bq-demo/line_item* ./line_item_schema.json 10 | 11 | bq query --use_legacy_sql=false --replace --destination_table=$project:bq_demo.denorm ' 12 | SELECT 13 | c.*, 14 | o.order_num as order_num, 15 | order_date, 16 | line_item_num, 17 | li.prod_code as prod_code, 18 | qty, 19 | prod_name, 20 | prod_desc, 21 | prod_price 22 | FROM 23 | '"\`$project.bq_demo.customer\`"' c 24 | LEFT JOIN 
25 | '"\`$project.bq_demo.order\`"' o 26 | ON 27 | c.cust_id = o.cust_id 28 | LEFT JOIN 29 | '"\`$project.bq_demo.line_item\`"' AS li 30 | ON 31 | o.order_num = li.order_num 32 | LEFT JOIN 33 | '"\`$project.bq_demo.product\`"' AS p 34 | ON 35 | li.prod_code = p.prod_code' 36 | 37 | bq query --use_legacy_sql=false --replace --destination_table=$project:bq_demo.nested_once ' 38 | WITH 39 | dlow AS ( 40 | SELECT 41 | * 42 | FROM 43 | '"\`$project.bq_demo.denorm\`"' 44 | ) 45 | SELECT 46 | cust_id, 47 | cust_name, 48 | cust_address, 49 | cust_state, 50 | cust_zip, 51 | cust_email, 52 | cust_phone, 53 | order_num, 54 | order_date, 55 | ARRAY_AGG( STRUCT(line_item_num, 56 | prod_code, 57 | qty, 58 | prod_name, 59 | prod_desc, 60 | prod_price)) as line_items 61 | FROM 62 | dlow 63 | GROUP BY 64 | order_num, 65 | order_date, 66 | cust_phone, 67 | cust_email, 68 | cust_zip, 69 | cust_state, 70 | cust_address, 71 | cust_name, 72 | cust_id' 73 | 74 | bq query --use_legacy_sql=false \ 75 | --replace \ 76 | --destination_table $project:bq_demo.table_nested_partitioned \ 77 | --time_partitioning_field order_date \ 78 | 'SELECT * FROM '"\`$project.bq_demo.nested_once\`" 79 | 80 | bq query --use_legacy_sql=false 'CREATE OR REPLACE TABLE 81 | '"\`$project.bq_demo.table_nested_partitioned_clustered\`"' 82 | PARTITION BY order_date 83 | CLUSTER BY cust_zip AS 84 | SELECT * FROM '"\`$project.bq_demo.nested_once\`" 85 | 86 | # bq query --use_legacy_sql=false \ 87 | # --replace \ 88 | # --destination_table=$project:bq_demo.nested_twice ' 89 | # WITH 90 | # dlow AS ( 91 | # SELECT 92 | # * 93 | # FROM 94 | # '"\`$project.bq_demo.denorm\`"' ), 95 | # orders AS ( 96 | # SELECT 97 | # cust_id, 98 | # cust_name, 99 | # cust_address, 100 | # cust_state, 101 | # cust_zip, 102 | # cust_email, 103 | # cust_phone, 104 | # order_num, 105 | # order_date, 106 | # ARRAY_AGG( STRUCT(line_item_num, 107 | # prod_code, 108 | # qty, 109 | # prod_name, 110 | # prod_desc, 111 | # prod_price)) AS line_items 112 | # FROM 113 | # dlow 114 | # GROUP BY 115 | # order_num, 116 | # order_date, 117 | # cust_phone, 118 | # cust_email, 119 | # cust_zip, 120 | # cust_state, 121 | # cust_address, 122 | # cust_name, 123 | # cust_id) 124 | # SELECT 125 | # cust_phone, 126 | # cust_email, 127 | # cust_zip, 128 | # cust_state, 129 | # cust_address, 130 | # cust_name, 131 | # cust_id, 132 | # ARRAY_AGG( STRUCT( order_num, 133 | # order_date, 134 | # line_items )) AS orders 135 | # FROM 136 | # orders 137 | # GROUP BY 138 | # cust_id, 139 | # cust_phone, 140 | # cust_email, 141 | # cust_zip, 142 | # cust_state, 143 | # cust_address, 144 | # cust_name' -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/README.md: -------------------------------------------------------------------------------- 1 | # Dataflow Python Streaming Demo 2 | 3 | ## Purpose 4 | 5 | On the surface, this is a very simple demo intended to show Dataflow streaming 6 | using the Python SDK. 
7 | 8 | There are a few fun little tricks and techniques involved, including: 9 | 10 | * Checking for topic/sub existence 11 | * Encoding objects into messages 12 | * Reading the end time of Beam windows 13 | * Checking for dataset/table existence in BQ 14 | * Streaming nested/repeated data into BQ 15 | 16 | ## Setup 17 | 18 | Setup will do the following: 19 | 20 | * Create and activate a python virtual environment 21 | * Install dependencies for both pieces of the demo code 22 | * Create a service account and make it an editor 23 | * Create and download a key file for the service account 24 | * Set an environment variable that will point the code to the service account keyfile 25 | * Create a bucket that is used by the dataflow job 26 | * Enable the BQ, PubSub, and Dataflow services 27 | 28 | 1. Clone the repo, and change directories to `dflow-bq-stream-python`: 29 | 30 | ```bash 31 | git clone https://github.com/roitraining/gcp-demos.git 32 | cd gcp-demos/dataflow/dflow-bq-stream-python 33 | ``` 34 | 35 | 2. Make sure that **gcloud** is configured to point to the project you want to work in. 36 | 37 | 3. Make sure you are running Python 3.12.x or earlier (there's a conflict between Beam and 3.13) 38 | ```bash 39 | pyenv local 3.12 40 | ``` 41 | 42 | 4. Run the `setup.sh` script providing the name of the service account you want the demo code to use: 43 | 44 | ```bash 45 | . ./setup.sh df-demo-sa 46 | ``` 47 | 48 | ## Sending events 49 | 50 | This part of the demo sends a stream of events to a Pub/Sub topic, 51 | one per second. 52 | 53 | 5. If the pipeline is running in Dataflow, run the `send_events` script: 54 | 55 | ```bash 56 | python send_events.py \ 57 | --project_id=$PROJECT_ID 58 | ``` 59 | 60 | You may optionally change the topic and subscription names if you like by 61 | specifying additional arguments: `topic_id` and `sub_id`. 62 | 63 | 64 | 65 | ## Starting the pipeline 66 | 67 | This section creates a Dataflow job which reads the messages from a PubSub 68 | subscription, writes all messages into a `messages` table, and also windows the 69 | messages and writes nested/repeated rows for each window into a 70 | `messages_nested` table. 71 | 72 | 73 | 6. Open a 2nd terminal window. 74 | 7. Submit the job to the Dataflow service: 75 | ```bash 76 | export PROJECT_ID=$(gcloud config get-value project) 77 | cd gcp-demos/dataflow/dflow-bq-stream-python 78 | source .venv/bin/activate 79 | python process_events.py \ 80 | --runner DataflowRunner \ 81 | --region us-central1 \ 82 | --project $PROJECT_ID \ 83 | --staging_location gs://$PROJECT_ID-dflow-demo/ \ 84 | --temp_location gs://$PROJECT_ID-dflow-demo/ 85 | ``` 86 | 87 | 88 | 89 | 90 | > [!NOTE] 91 | > If you want to adjust the fixed window size from 10 seconds, you can provide 92 | > an optional command-line argument `--window-size` (defined in seconds). 93 | > 94 | > If you want to run the pipeline locally, the command looks like this: 95 | > ```bash 96 | > python3 process_events.py \ 97 | > --project $PROJECT_ID 98 | > ``` 99 | 100 | ## Checking out results 101 | 102 | 1. It can take 5+ minutes until messages start flowing through the pipeline 103 | 1. Check out the pipeline in Dataflow (if running there) 104 | 1. Note the branch and two different sinks 105 | 2. Note that windowing happens after several transforms 106 | 3. Note the aggregation 107 | 108 | 1. Check out the **messages** table in BQ and see all the individual messages 109 | 1. 
Check out the **messages_nested** table in BQ and see the nested data 110 | 111 | ## Cleaning up 112 | 113 | 1. Stop the Dataflow job 114 | 2. Stop the process sending events 115 | 3. Delete the Pub/Sub assets 116 | 4. Delete the BigQuery assets 117 | 5. Delete your service account 118 | 6. Delete your bucket -------------------------------------------------------------------------------- /bigquery/elt_examples.sql: -------------------------------------------------------------------------------- 1 | -- This file contains a list of SQL snippets illustrating various data sanitization techniques 2 | -- that can be performed entirely within BigQuery. Each example demonstrates a different approach 3 | -- to cleaning and standardizing raw data using SQL functions and expressions. 4 | 5 | -- Cleans the sale_amount field by safely casting it to NUMERIC; invalid values become NULL. 6 | SELECT 7 | transaction_id, 8 | -- Attempt to cast to NUMERIC. If it fails, SAFE_CAST returns NULL. 9 | SAFE_CAST(sale_amount AS NUMERIC) AS clean_sale_amount, 10 | customer_email, 11 | order_date, 12 | product_code, 13 | quantity, 14 | status 15 | FROM 16 | my_dataset.raw_data_staging; 17 | 18 | -- Parses common order_date string formats into a single DATE column called clean_order_date. 19 | -- Returns NULL if none of the formats match; prefers ISO '%Y-%m-%d', then '%m/%d/%Y', then '%d-%b-%Y'. 20 | SELECT 21 | transaction_id, 22 | sale_amount, 23 | customer_email, 24 | -- Attempt to parse common date formats. 25 | -- Prioritize the most common/desired format. 26 | COALESCE( 27 | SAFE.PARSE_DATE('%Y-%m-%d', order_date), -- '2023-01-15' 28 | SAFE.PARSE_DATE('%m/%d/%Y', order_date), -- '01/15/2023' 29 | SAFE.PARSE_DATE('%d-%b-%Y', order_date) -- '15-Jan-2023' 30 | ) AS clean_order_date, 31 | product_code, 32 | quantity, 33 | status 34 | FROM 35 | my_dataset.raw_data_staging; 36 | 37 | -- Trims leading/trailing whitespace from string fields and converts empty strings to NULL 38 | -- (produces normalized clean_customer_email and clean_product_code columns). 39 | SELECT 40 | transaction_id, 41 | sale_amount, 42 | -- Trim whitespace and convert empty strings to NULL 43 | NULLIF(TRIM(customer_email), '') AS clean_customer_email, 44 | order_date, 45 | -- Trim whitespace and convert empty strings to NULL 46 | NULLIF(TRIM(product_code), '') AS clean_product_code, 47 | quantity, 48 | status 49 | FROM 50 | my_dataset.raw_data_staging; 51 | 52 | -- Validates and standardizes the status field: keeps expected values, maps 'Done' -> 'Completed', 53 | -- and falls back to 'Unknown' (or NULL if you prefer) for unexpected values. 54 | SELECT 55 | transaction_id, 56 | sale_amount, 57 | customer_email, 58 | order_date, 59 | product_code, 60 | quantity, 61 | -- Validate and standardize status values 62 | CASE 63 | WHEN status IN ('Completed', 'Pending', 'Cancelled') THEN status 64 | WHEN status = 'Done' THEN 'Completed' -- Standardize 'Done' to 'Completed' 65 | ELSE 'Unknown' -- Or NULL, depending on your requirement 66 | END AS clean_status 67 | FROM 68 | my_dataset.raw_data_staging; 69 | 70 | 71 | -- Combined example: composes earlier cleaning steps (SAFE_CAST for numbers, TRIM/NULLIF for strings, 72 | -- DATE parsing with SAFE.PARSE_DATE, and status normalization), then applies filtering and deduplication. 73 | -- Use this as a ready-to-run pattern for end-to-end in-query sanitization before loading or analytics. 
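--
-- For context, the staging table these examples assume looks roughly like the sketch below
-- (the column names come from the queries above; declaring every column as STRING is an
-- assumption meant to mimic messy, untyped source data):
--
-- CREATE TABLE IF NOT EXISTS my_dataset.raw_data_staging (
--   transaction_id STRING,
--   sale_amount    STRING,
--   customer_email STRING,
--   order_date     STRING,
--   product_code   STRING,
--   quantity       STRING,
--   status         STRING
-- );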
74 | SELECT 75 | transaction_id, 76 | SAFE_CAST(sale_amount AS NUMERIC) AS clean_sale_amount, 77 | NULLIF(TRIM(customer_email), '') AS clean_customer_email, 78 | COALESCE( 79 | SAFE.PARSE_DATE('%Y-%m-%d', order_date), 80 | SAFE.PARSE_DATE('%m/%d/%Y', order_date) 81 | ) AS clean_order_date, 82 | NULLIF(TRIM(product_code), '') AS clean_product_code, 83 | SAFE_CAST(quantity AS INT64) AS clean_quantity, 84 | CASE 85 | WHEN status IN ('Completed', 'Pending', 'Cancelled') THEN status 86 | WHEN status = 'Done' THEN 'Completed' 87 | ELSE 'Unknown' 88 | END AS clean_status 89 | FROM 90 | my_dataset.raw_data_staging 91 | -- Filter out rows where critical columns are NULL after cleaning attempts 92 | WHERE 93 | transaction_id IS NOT NULL 94 | AND SAFE_CAST(sale_amount AS NUMERIC) IS NOT NULL -- Ensures sale_amount is valid and not NULL 95 | AND SAFE_CAST(quantity AS INT64) >= 1 -- Quantity must be at least 1 96 | QUALIFY 97 | -- Deduplicate based on transaction_id, keeping the first encountered record 98 | ROW_NUMBER() OVER(PARTITION BY transaction_id ORDER BY order_date DESC) = 1; 99 | -- The ORDER BY in ROW_NUMBER() determines which duplicate to keep. 100 | -- Here, we prioritize the most recent order_date if duplicates exist. -------------------------------------------------------------------------------- /security/auth_examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | BigQuery Authentication Methods Demo 3 | 4 | This script demonstrates three different ways to authenticate with BigQuery: 5 | 6 | 1. Application Default Credentials (ADC). 7 | 2. Service Account Key File 8 | 3. Service Account Impersonation 9 | 10 | Requirements: 11 | - pip install google-cloud-bigquery google-auth 12 | - For ADC 13 | - Make sure your user account has the necessary permissions. 14 | - Run 'gcloud auth application-default login' or set GOOGLE_APPLICATION_CREDENTIALS 15 | - For Service Account key file 16 | - Create a service account with appropriate permissions 17 | - Create and download a keyfile 18 | - Update 'key_path' variable with your key file path 19 | - For Service Account Impersonation 20 | - Make sure your user account has the "Service Account Token Creator" role on the target service account 21 | - Update 'target_service_account' variable with the email of the service account to impersonate 22 | 23 | Variations 24 | - You can configure ADC to use a service account with key file 25 | - You can configure ADC to use impersonation as default 26 | """ 27 | 28 | import os 29 | from google.cloud import bigquery 30 | from google.auth import default, impersonated_credentials 31 | from google.oauth2 import service_account 32 | import json 33 | 34 | # Query to execute 35 | QUERY = "SELECT COUNT(*) as row_count FROM `roi-bq-demos.bq_demo.line_item`" 36 | 37 | 38 | def execute_query_with_client(client, method_name): 39 | """Execute the query and print results""" 40 | try: 41 | print(f"\n--- {method_name} ---") 42 | query_job = client.query(QUERY) 43 | results = query_job.result() 44 | 45 | for row in results: 46 | print(f"Row count: {row.row_count}") 47 | 48 | print(f"✓ {method_name} completed successfully") 49 | 50 | except Exception as e: 51 | print(f"✗ {method_name} failed: {str(e)}") 52 | 53 | 54 | def method_1_application_default_credentials(): 55 | """Method 1: Use Application Default Credentials (ADC)""" 56 | try: 57 | # This will use ADC - credentials from: 58 | # 1. GOOGLE_APPLICATION_CREDENTIALS environment variable 59 | # 2. 
gcloud auth application-default login 60 | # 3. Compute Engine/Cloud Run/GKE metadata service 61 | client = bigquery.Client() 62 | execute_query_with_client(client, "Application Default Credentials") 63 | 64 | except Exception as e: 65 | print(f"✗ Application Default Credentials setup failed: {str(e)}") 66 | 67 | 68 | def method_2_service_account_key(): 69 | """Method 2: Use Service Account Key File""" 70 | # Update this path to your service account key file 71 | key_path = "path/to/your/service-account-key.json" 72 | 73 | try: 74 | if not os.path.exists(key_path): 75 | print(f"✗ Service account key file not found: {key_path}") 76 | print(" Please update the 'key_path' variable with the correct path") 77 | return 78 | 79 | credentials = service_account.Credentials.from_service_account_file(key_path) 80 | client = bigquery.Client(credentials=credentials) 81 | execute_query_with_client(client, "Service Account Key") 82 | 83 | except Exception as e: 84 | print(f"✗ Service Account Key method failed: {str(e)}") 85 | 86 | 87 | def method_3_service_account_impersonation(): 88 | """Method 3: Use Service Account Impersonation""" 89 | # Update this with the service account email you want to impersonate 90 | target_service_account = "your-service-account@your-project.iam.gserviceaccount.com" 91 | 92 | try: 93 | # Get source credentials (usually from ADC) 94 | source_credentials, project = default() 95 | 96 | # Create impersonated credentials 97 | target_credentials = impersonated_credentials.Credentials( 98 | source_credentials=source_credentials, 99 | target_principal=target_service_account, 100 | target_scopes=["https://www.googleapis.com/auth/cloud-platform"], 101 | ) 102 | 103 | client = bigquery.Client(credentials=target_credentials) 104 | execute_query_with_client(client, "Service Account Impersonation") 105 | 106 | except Exception as e: 107 | print(f"✗ Service Account Impersonation failed: {str(e)}") 108 | 109 | 110 | def main(): 111 | """Main function to run all authentication methods""" 112 | print("BigQuery Authentication Methods Demo") 113 | print("=" * 50) 114 | 115 | # Method 1: Application Default Credentials 116 | method_1_application_default_credentials() 117 | 118 | # Method 2: Service Account Key 119 | method_2_service_account_key() 120 | 121 | # Method 3: Service Account Impersonation 122 | method_3_service_account_impersonation() 123 | 124 | print("\n" + "=" * 50) 125 | print("Demo completed!") 126 | 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /dataflow/simple_demos/beam_demo_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is intentionally small, focused code meant to illustrate a few key Apache Beam concepts 3 | in a minimal way. It is NOT production code; it's an educational demo. The key points shown here: 4 | 5 | - How to read data into a PCollection using I/O transforms (ReadFromText). 6 | - Element-wise transforms: Map (one-to-one), FlatMap (one-to-many), Filter (predicate). 7 | - How to write results with WriteToText sinks. 8 | 9 | Setup notes (local testing): 10 | - Use pyenv to install a supported Python version (3.11 or earlier). 
For example: 11 | # Install a specific Python version using pyenv (3.11.x or earlier) 12 | # $ pyenv install 3.11.2 13 | # Create and activate a virtualenv (venv or pyenv-virtualenv): 14 | # $ python -m venv .venv 15 | # $ source .venv/bin/activate 16 | # Install dependencies (for local runs): 17 | # $ pip install apache-beam 18 | # If you plan to run on Google Cloud Dataflow and access GCS/BQ, install GCP extras: 19 | # $ pip install "apache-beam[gcp]" 20 | """ 21 | 22 | import apache_beam as beam 23 | from apache_beam.io import ReadFromText, WriteToText 24 | import sys 25 | 26 | 27 | def count_words(line): 28 | """Count words in a single line. 29 | 30 | What it does: 31 | - Splits the incoming text line on whitespace and returns the word count (an int). 32 | 33 | Called by: 34 | - The 'CountWords' Map transform below. Beam calls this once per element in the input 35 | PCollection (one-to-one transform). 36 | """ 37 | return len(line.split()) 38 | 39 | 40 | def lear_there(line): 41 | """FlatMap helper: yield the line if it contains the string 'Lear'. 42 | 43 | What it does: 44 | - Checks if the substring 'Lear' appears in the line. If yes, yields the line. 45 | - Because this is used with FlatMap, it can emit zero or more outputs per input. 46 | 47 | Called by: 48 | - The 'FlatMapLear' FlatMap transform below. FlatMap expects an iterable/generator 49 | of zero-or-more output elements per input element. 50 | """ 51 | if "Lear" in line: 52 | yield line 53 | 54 | 55 | # Build a Pipeline object. This is the root of your Beam program. The Pipeline object 56 | # is used to apply transforms and then run the resulting graph. 57 | p = beam.Pipeline(argv=sys.argv) 58 | 59 | 60 | # Read: create an initial PCollection of text lines. 61 | # Concept: ReadFromText is an I/O transform that returns a PCollection where each 62 | # element is one line from the input file. Here we point to a sample file in GCS. 63 | lines = p | "Read" >> ReadFromText("gs://dataflow-samples/shakespeare/kinglear.txt") 64 | 65 | 66 | # Example sink: write the raw lines out to a file. This demonstrates a sink transform. 67 | # Concept: Sinks are also transforms that consume PCollections. 68 | _ = lines | "lines_out" >> WriteToText("beam_demo_1_lines.txt") 69 | 70 | 71 | # Map transform: apply a function to each element, producing a one-to-one mapping. 72 | # Concept: Map is useful for stateless, per-element computations. Note that you pass the function as an argument, 73 | # with no other arguments. The receiving function will receive the next element of the input pcollection as it's first 74 | # positional argument. 75 | word_counts = lines | "CountWords" >> beam.Map(count_words) 76 | _ = word_counts | "word_counts_out" >> WriteToText("beam_demo_1_word_counts.txt") 77 | 78 | 79 | # FlatMap example: the supplied function may return zero, one, or many outputs per input. 80 | # Concept: FlatMap is used when you need to expand or filter elements and potentially 81 | # emit multiple outputs for a single input element. Again, the function is passed as an argument 82 | # with no other arguments and will receive the next element of the input pcollection as it's first positional argument. 83 | lear_there_flatmap = lines | "FlatMapLear" >> beam.FlatMap(lear_there) 84 | _ = lear_there_flatmap | "lear_there_flatmap_out" >> WriteToText( 85 | "beam_demo_1_flatmap.txt" 86 | ) 87 | 88 | 89 | # Filter: keep elements where the predicate is True. 90 | # Lambda behavior: receives each element and returns True when 'Lear' is present. 
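# For example, given a line such as "KING LEAR, king of Britain", the lambda returns True and the
# line is kept; lines that do not contain the substring "Lear" return False and are dropped.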
91 | # Use a lambda for short, one-off predicates; prefer a top-level function or DoFn 92 | # when logic is non-trivial, reused, or to avoid potential runner/serialization issues. 93 | lear_there_filter = lines | "FilterLear" >> beam.Filter(lambda x: "Lear" in x) 94 | _ = lear_there_filter | "lear_there_filter_out" >> WriteToText("beam_demo_1_filter.txt") 95 | 96 | 97 | # Run: execute the pipeline. For the DirectRunner this runs locally 98 | p.run().wait_until_finish() 99 | 100 | 101 | # End-of-file notes: 102 | # - After running locally you should see files created in the current directory: 103 | # out_1-00000-of-00001, out_2-00000-of-00001, out_3-00000-of-00001, out_4-00000-of-00001 104 | # (WriteToText names are suffixed by shard information). 105 | # - Inspect those files to verify the transformations: raw lines, numeric word counts, 106 | # lines containing 'Lear' produced by FlatMap, and lines selected by Filter. 107 | -------------------------------------------------------------------------------- /bigquery/schema-demo/generate_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import apache_beam as beam 3 | import random 4 | from apache_beam.io import ReadFromText 5 | import datetime 6 | import argparse 7 | 8 | # from apache_beam.options.pipeline_options import PipelineOptions 9 | # from apache_beam.options.pipeline_options import SetupOptions 10 | 11 | 12 | states = ("AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL",\ 13 | "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME",\ 14 | "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH",\ 15 | "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI",\ 16 | "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI",\ 17 | "WY", "DC") 18 | 19 | # handle arguments 20 | parser = argparse.ArgumentParser() 21 | 22 | parser.add_argument( 23 | "--bucket", help="Name of the bucket where output files are written", required=True) 24 | parser.add_argument( 25 | "--products", help="Number of products to generate", default=10000) 26 | parser.add_argument( 27 | "--customers", help="Number of customer to generate", default=60000000) 28 | parser.add_argument( 29 | "--orders", help="Number of orders per customer", default=500) 30 | 31 | known_args, pipeline_args = parser.parse_known_args() 32 | 33 | def make_orders(customer): 34 | cust_id = customer.split(",")[0] 35 | for order_num in range(1, int(known_args.orders) + 1): 36 | order_date = str(datetime.date(2018, random.randint(1,12), random.randint(1,28))) 37 | order_num = "{}-{}".format(cust_id, order_num) 38 | row = [order_num, str(cust_id), order_date] 39 | yield ",".join(row) 40 | 41 | 42 | def make_lines(order_string): 43 | order = order_string.split(",") 44 | for line_item_num in range(1,11): 45 | order_num = order[0] 46 | line_item_num = str(line_item_num) 47 | prod_code = str(random.randint(0, int(known_args.products))) 48 | qty = str(random.randint(0,10)) 49 | row = [order_num, line_item_num, prod_code, qty] 50 | yield ",".join(row) 51 | 52 | 53 | def create_cust_ids(num_cust_ids): 54 | for cust_id in range(0,num_cust_ids): 55 | yield cust_id 56 | 57 | 58 | def make_customer(cust_id): 59 | cust_num = str(cust_id) 60 | cust_name = "Customer_" + cust_num + "_Name" 61 | phone = str(random.randint(100,999))\ 62 | + "-" + str(random.randint(100,999))\ 63 | + "-" + str(random.randint(0,9999)) 64 | cust_email = "Customer_" + cust_num + "_Email@{}.com".format(cust_name) 65 | cust_address = cust_num + " Main St." 
66 | cust_state = states[random.randint(0,50)] 67 | cust_zip = str(random.randint(0,99999)) 68 | row = [cust_num, cust_name, cust_address, cust_state, cust_zip, cust_email, phone] 69 | return ",".join(row) 70 | 71 | 72 | def create_pids(num_pids): 73 | for pid in range(0,num_pids): 74 | yield pid 75 | 76 | 77 | def make_product(pid): 78 | prod_code = str(pid) 79 | prod_name = "Product {}".format(prod_code) 80 | prod_desc = "The product that's perfect for {} stuff".format(prod_code) 81 | prod_price = str(random.randint(0,50) * pid) 82 | row = [prod_code, prod_name, prod_desc, prod_price] 83 | return ",".join(row) 84 | 85 | 86 | def run(): 87 | 88 | pipeline_args.append( 89 | '--job_name=bq-demo-data-{}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))) 90 | pipeline_args.append( 91 | '--staging_location=gs://{0}/bq-demo/staging/'.format(known_args.bucket)) 92 | pipeline_args.append( 93 | '--temp_location=gs://{0}/bq-demo/temp/'.format(known_args.bucket)) 94 | 95 | p1 = beam.Pipeline(argv=pipeline_args) 96 | # create the customer ids 97 | num_customers = p1 | "num_customers" >> beam.Create( 98 | [int(known_args.customers)]) 99 | cust_ids = num_customers | beam.FlatMap(create_cust_ids) 100 | 101 | # create the product ids 102 | num_products = p1 | "num_product" >> beam.Create( 103 | [int(known_args.products)]) 104 | pids = num_products | beam.FlatMap(create_pids) 105 | 106 | # create customers and products 107 | customers = cust_ids | "generate customer row" >> beam.Map(make_customer) 108 | products = pids | "generate product row" >> beam.Map(make_product) 109 | 110 | # output customer 111 | output = customers | "write customers to gcs" >> beam.io.WriteToText( 112 | "gs://{}/bq-demo/customer".format(known_args.bucket)) 113 | 114 | # output products 115 | output = products | "write products to gcs" >> beam.io.WriteToText( 116 | "gs://{}/bq-demo/product".format(known_args.bucket)) 117 | 118 | p1.run().wait_until_finish() 119 | 120 | p2 = beam.Pipeline(argv=pipeline_args) 121 | 122 | customers = p2 | 'read customer' >> ReadFromText( 123 | 'gs://{}/bq-demo/customer*'.format(known_args.bucket)) 124 | orders = customers | beam.FlatMap(make_orders) 125 | line_items = orders | beam.FlatMap(make_lines) 126 | output = orders | "write orders to gcs" >> beam.io.WriteToText("gs://{}/bq-demo/order".format(known_args.bucket)) 127 | output = line_items | "write line_items to gcs" >> beam.io.WriteToText("gs://{}/bq-demo/line_items".format(known_args.bucket)) 128 | 129 | p2.run() 130 | 131 | if __name__ == '__main__': 132 | run() 133 | -------------------------------------------------------------------------------- /dataflow/simple_demos/beam_demo_2.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains minimal, self-contained examples meant to illustrate 3 | several key Apache Beam concepts and transforms. It's intentionally small 4 | and readable so you can experiment locally. 5 | 6 | Key points illustrated: 7 | - Creating PCollections from in-memory data with Create 8 | - Grouping keyed data with GroupByKey / CoGroupByKey 9 | - Global and per-key aggregation with CombineGlobally and CombinePerKey 10 | 11 | Suggested local setup (macOS): 12 | 1. Use pyenv to install a compatible Python (3.11 or earlier is safe for 13 | most apache-beam releases). Example: 14 | - pyenv install 3.11.6 15 | - pyenv local 3.11.6 16 | 2. Create and activate a virtual environment (venv): 17 | - python -m venv .venv 18 | - source .venv/bin/activate 19 | 3. 
Upgrade pip and install dependencies: 20 | - pip install --upgrade pip 21 | - pip install apache-beam 22 | - If you plan to run on Google Cloud Dataflow, also install: 23 | pip install apache-beam[gcp] 24 | """ 25 | 26 | import apache_beam as beam 27 | from apache_beam.io import WriteToText 28 | import sys 29 | 30 | # --- Sample data: city -> zip codes (keyed tuples) --- 31 | city_zip_list = [ 32 | ("Lexington", "40513"), 33 | ("Nashville", "37027"), 34 | ("Lexington", "40502"), 35 | ("Seattle", "98125"), 36 | ("Mountain View", "94041"), 37 | ("Seattle", "98133"), 38 | ("Lexington", "40591"), 39 | ("Mountain View", "94085"), 40 | ] 41 | 42 | 43 | # --- Sample data: sales amounts (scalar numeric values) --- 44 | sales = [ 45 | 1200.50, 46 | 950.00, 47 | 300.75, 48 | 2100.00, 49 | 400.25, 50 | 1800.00, 51 | 500.00, 52 | 700.00, 53 | ] 54 | 55 | # --- Sample data: sales_rep_id -> sale amounts (keyed tuples) --- 56 | sales_and_reps = [ 57 | ("SP001", 1200.50), 58 | ("SP002", 950.00), 59 | ("SP001", 300.75), 60 | ("SP003", 2100.00), 61 | ("SP002", 400.25), 62 | ("SP004", 1800.00), 63 | ("SP003", 500.00), 64 | ("SP001", 700.00), 65 | ] 66 | 67 | # --- Sample data: order numbers and amounts (keyed tuples) --- 68 | order_numbers_amounts = [ 69 | ("ORD1001", 250.00), 70 | ("ORD1002", 120.50), 71 | ("ORD1003", 75.25), 72 | ("ORD1004", 600.00), 73 | ("ORD1005", 320.10), 74 | ("ORD1006", 150.75), 75 | ("ORD1007", 980.00), 76 | ("ORD1008", 45.00), 77 | ] 78 | 79 | # --- Sample data: order numbers and delivery dates (keyed tuples) --- 80 | order_numbers_delivery_dates = [ 81 | ("ORD1001", "2025-08-14"), 82 | ("ORD1002", "2025-08-15"), 83 | ("ORD1003", "2025-08-16"), 84 | ("ORD1004", "2025-08-17"), 85 | ("ORD1005", "2025-08-18"), 86 | ("ORD1006", "2025-08-19"), 87 | ("ORD1007", "2025-08-20"), 88 | ("ORD1008", "2025-08-21"), 89 | ] 90 | 91 | # --- Build and run the pipeline --- 92 | # Create a Pipeline object. In real projects you typically pass PipelineOptions 93 | # (for runner, project, temp_location, etc.). For this demo we use the default 94 | # direct runner which executes locally. 95 | p = beam.Pipeline() 96 | 97 | 98 | # Section: create a keyed PCollection and group by key 99 | # What it's doing: creates a PCollection of (city, zip) tuples and groups 100 | # all values by the city key. The result is a PCollection of 101 | # (city, iterable_of_zip_codes). 102 | citycodes = p | "CreateCityCodes" >> beam.Create(city_zip_list) 103 | grouped = citycodes | beam.GroupByKey() 104 | grouped | "write_city_grouped" >> WriteToText("beam_demo_2_city_grouped.txt") 105 | 106 | 107 | # Section: global aggregation 108 | # What it's doing: create a scalar PCollection and computing the global sum 109 | # across all elements using CombineGlobally. This returns a single-element 110 | # PCollection with the total sales. 111 | sales = p | "CreateSalesCollection" >> beam.Create(sales) 112 | sales_total = sales | beam.CombineGlobally(sum) 113 | sales_total | "write_sales_total" >> WriteToText("beam_demo_2_sales_total.txt") 114 | 115 | 116 | # Section: per-key aggregation 117 | # What it's doing: create a keyed PCollection (sales_and_reps) and computing 118 | # the sum per salesperson using CombinePerKey. Output is (salesperson, total). 
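# Conceptually, CombinePerKey(sum) behaves like GroupByKey() followed by summing each key's values,
# e.g. ("SP001", [1200.50, 300.75, 700.00]) becomes ("SP001", 2201.25), but it also lets the runner
# pre-combine values on workers before the shuffle, which scales better for large keyed datasets.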
119 | sales = p | "CreateSalesByRepCollection" >> beam.Create(sales_and_reps) 120 | sales_total_by_rep = sales | beam.CombinePerKey(sum) 121 | sales_total_by_rep | "write_sales_total_by_rep" >> WriteToText( 122 | "beam_demo_2_sales_total_by_rep.txt" 123 | ) 124 | 125 | 126 | # Section: CoGroupByKey (a form of join) 127 | # What it's doing: CoGroupByKey takes a dict of keyed PCollections and produces 128 | # for each key a dictionary-like result with lists of values from each input 129 | # PCollection. This demonstrates how to join related datasets by key. 130 | # Called by: the pipeline and then written to disk for inspection. 131 | orders_amounts = p | "CreateOrderNumbers" >> beam.Create(order_numbers_amounts) 132 | orders_delivery_dates = p | "CreateOrderDeliveryDates" >> beam.Create( 133 | order_numbers_delivery_dates 134 | ) 135 | joined = { 136 | "orders": orders_amounts, 137 | "shipping": orders_delivery_dates, 138 | } | beam.CoGroupByKey() 139 | joined | "write_joined_orders_shipping" >> WriteToText( 140 | "beam_demo_2_joined_orders_shipping.txt" 141 | ) 142 | 143 | p.run().wait_until_finish() 144 | -------------------------------------------------------------------------------- /bigquery/arrays_examples.sql: -------------------------------------------------------------------------------- 1 | -- populate arrays explicitly 2 | SELECT 3 | "row1" AS row_id, 4 | [1, 5 | 2, 6 | 3, 7 | 4] AS num_array 8 | UNION ALL 9 | SELECT 10 | "row2" AS row_id, 11 | [2, 12 | 4, 13 | 8, 14 | 16, 15 | 32] AS num_array 16 | UNION ALL 17 | SELECT 18 | "row3" AS row_id, 19 | [5, 20 | 10] AS num_array 21 | 22 | -- populate arrays using array_agg 23 | WITH 24 | c AS ( 25 | SELECT 26 | cust_id, 27 | cust_name, 28 | cust_zip 29 | FROM 30 | `roi-bq-demos.bq_demo.cp` 31 | WHERE 32 | cust_state = "AK") 33 | SELECT 34 | cust_name, 35 | ARRAY_AGG(order_num) as orders 36 | FROM 37 | c 38 | JOIN 39 | `roi-bq-demos.bq_demo.order` o 40 | ON 41 | o.cust_id = c.cust_id 42 | GROUP BY 43 | c.cust_name 44 | 45 | -- report array length 46 | SELECT 47 | `commit`, 48 | ARRAY_LENGTH(difference) AS arr_len, 49 | difference 50 | FROM 51 | `bigquery-public-data.github_repos.commits` 52 | WHERE 53 | author.email LIKE "%jwdavis.me" 54 | ORDER BY 55 | arr_len DESC 56 | LIMIT 57 | 5 58 | 59 | -- find by array length 60 | SELECT 61 | author, 62 | difference 63 | FROM 64 | `bigquery-public-data.github_repos.commits` 65 | WHERE 66 | array_length(difference) = 5 67 | LIMIT 10 68 | 69 | -- select basic array 70 | SELECT 71 | [1, 72 | 2, 73 | 3, 74 | 4] AS num_array 75 | 76 | -- select table from array 77 | SELECT 78 | * 79 | FROM 80 | UNNEST ( [1, 2, 3, 4]) AS num 81 | 82 | -- calculate average of array 83 | SELECT 84 | AVG(num) AS avg_num 85 | FROM 86 | UNNEST ( [1, 2, 3, 4]) AS num 87 | 88 | -- basic correlated cross join 89 | WITH 90 | arrays AS ( 91 | SELECT 92 | "row1" AS row_id, 93 | [1, 94 | 2, 95 | 3, 96 | 4] AS num_array 97 | UNION ALL 98 | SELECT 99 | "row2" AS row_id, 100 | [2, 101 | 4, 102 | 8, 103 | 16, 104 | 32] AS num_array) 105 | SELECT 106 | row_id, 107 | num_array, 108 | num 109 | FROM 110 | arrays 111 | CROSS JOIN 112 | UNNEST(num_array) AS num 113 | 114 | -- comma correlated cross join 115 | WITH 116 | arrays AS ( 117 | SELECT 118 | "row1" AS row_id, 119 | [1, 120 | 2, 121 | 3, 122 | 4] AS num_array 123 | UNION ALL 124 | SELECT 125 | "row2" AS row_id, 126 | [2, 127 | 4, 128 | 8, 129 | 16, 130 | 32] AS num_array) 131 | SELECT 132 | row_id, 133 | num_array, 134 | num 135 | FROM 136 | arrays, 137 | UNNEST(num_array) AS num 
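-- Note: a correlated CROSS JOIN (or comma join) against UNNEST drops rows whose array is empty or NULL;
-- to keep such rows with a NULL value, use a LEFT JOIN instead, e.g.:
--   SELECT row_id, num FROM arrays LEFT JOIN UNNEST(num_array) AS num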
138 | 139 | -- implicit unnest 140 | WITH 141 | arrays AS ( 142 | SELECT 143 | "row1" AS row_id, 144 | [1, 145 | 2, 146 | 3, 147 | 4] AS num_array 148 | UNION ALL 149 | SELECT 150 | "row2" AS row_id, 151 | [2, 152 | 4, 153 | 8, 154 | 16, 155 | 32] AS num_array) 156 | SELECT 157 | row_id, 158 | num_array, 159 | num 160 | FROM 161 | arrays, 162 | arrays.num_array AS num 163 | 164 | -- find row where num_array contains 2 - take 1 165 | WITH 166 | arrays AS ( 167 | SELECT 168 | "row1" AS row_id, 169 | [1, 170 | 2, 171 | 3, 172 | 4] AS num_array 173 | UNION ALL 174 | SELECT 175 | "row2" AS row_id, 176 | [2, 177 | 4, 178 | 8, 179 | 16, 180 | 32] AS num_array) 181 | SELECT 182 | row_id, 183 | num_array 184 | FROM 185 | arrays 186 | CROSS JOIN 187 | UNNEST(num_array) AS num 188 | WHERE 189 | num=2 190 | 191 | -- find row where num_array contains 2 - take 2 192 | WITH 193 | arrays AS ( 194 | SELECT 195 | "row1" AS row_id, 196 | [2, 197 | 2, 198 | 3, 199 | 4] AS num_array 200 | UNION ALL 201 | SELECT 202 | "row2" AS row_id, 203 | [2, 204 | 4, 205 | 8, 206 | 16, 207 | 32] AS num_array) 208 | SELECT 209 | row_id, 210 | num_array 211 | FROM 212 | arrays 213 | WHERE 214 | 2 in (select num from unnest(arrays.num_array) as num) 215 | 216 | -- find row where num_array contains 2 - take 3 217 | WITH 218 | arrays AS ( 219 | SELECT 220 | "row1" AS row_id, 221 | [2, 222 | 2, 223 | 3, 224 | 4] AS num_array 225 | UNION ALL 226 | SELECT 227 | "row2" AS row_id, 228 | [2, 229 | 4, 230 | 8, 231 | 16, 232 | 32] AS num_array) 233 | SELECT 234 | row_id, 235 | num_array 236 | FROM 237 | arrays 238 | WHERE 239 | EXISTS ( 240 | SELECT 241 | * 242 | FROM 243 | UNNEST(num_array) AS num 244 | WHERE 245 | num=2) 246 | 247 | -- find commits that touched a specific file - take 1 248 | SELECT 249 | author, 250 | difference 251 | FROM 252 | `bigquery-public-data.github_repos.commits`, 253 | unnest(difference) as files 254 | WHERE 255 | files.new_path = "courses/data_analysis/lab2/python/is_popular.py" 256 | 257 | -- find commits that touched a specific file - take 2 258 | SELECT 259 | author, 260 | difference 261 | FROM 262 | `bigquery-public-data.github_repos.commits` 263 | WHERE 264 | "courses/data_analysis/lab2/python/is_popular.py" in (select f.new_path from unnest(difference) as f) 265 | 266 | -- find commits that touched a specific file - take 3 267 | SELECT 268 | author, 269 | difference 270 | FROM 271 | `bigquery-public-data.github_repos.commits` 272 | WHERE 273 | EXISTS ( 274 | SELECT 275 | * 276 | FROM 277 | UNNEST(difference) AS f 278 | WHERE 279 | f.new_path="courses/data_analysis/lab2/python/is_popular.py") -------------------------------------------------------------------------------- /dataflow/dflow-bq-stream-python/process_events.py: -------------------------------------------------------------------------------- 1 | import os 2 | import apache_beam as beam 3 | import apache_beam.transforms.window as window 4 | import random 5 | import argparse 6 | import json 7 | import schema_defs 8 | 9 | from apache_beam.options.pipeline_options import PipelineOptions 10 | from apache_beam.options.pipeline_options import SetupOptions 11 | from apache_beam.options.pipeline_options import StandardOptions 12 | from apache_beam.transforms import window 13 | from datetime import datetime 14 | 15 | from google.cloud import bigquery 16 | from google.cloud.exceptions import NotFound 17 | 18 | # takes input element, and returns an array of one bq row 19 | # that includes the window end time 20 | class CreateBQRow(beam.DoFn): 21 | def 
process(self, element, window=beam.DoFn.WindowParam): 22 | window_end_ts = window.end.to_utc_datetime().isoformat() 23 | row = {"window_ending": window_end_ts, 24 | "pos_id": element[0], 25 | "transactions": element[1] 26 | } 27 | # print(f"Writing row:{row}") 28 | return [row] 29 | 30 | # convert message into a kv pair 31 | # with transaction info in an objet 32 | def make_kv(element): 33 | kv = ( 34 | element["pos_id"], 35 | { 36 | "ts": element["ts"], 37 | "zip": element["zip"], 38 | "sale_amount": element["sale_amount"] 39 | } 40 | ) 41 | return kv 42 | 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument( 45 | "--dataset_id", 46 | default='dflow_demo') 47 | parser.add_argument( 48 | "--table_id", 49 | default='messages') 50 | parser.add_argument( 51 | "--sub_id", 52 | default='demo_sub') 53 | parser.add_argument( 54 | "--window_size", 55 | default=10) 56 | 57 | known_args, pipeline_args = parser.parse_known_args() 58 | 59 | # check to see if user specified --project; quit if not 60 | pipeline_args_dict = {} 61 | for index in range(1, len(pipeline_args), 2): 62 | pipeline_args_dict[pipeline_args[index-1].replace("--", "", 1)] = pipeline_args[index] 63 | 64 | if not ("project" in pipeline_args_dict): 65 | print("project argument is missing") 66 | quit() 67 | 68 | sub_path = f"projects/{pipeline_args_dict['project']}/subscriptions/{known_args.sub_id}" 69 | 70 | # check to see if dataset exists, create if not 71 | bq_client = bigquery.Client() 72 | dataset_path = f"{pipeline_args_dict['project']}.{known_args.dataset_id}" 73 | try: 74 | dataset = bq_client.get_dataset(dataset_path) 75 | except NotFound: 76 | dataset = bigquery.Dataset(dataset_path) 77 | dataset.location = "US" 78 | dataset = bq_client.create_dataset(dataset, timeout=30) 79 | 80 | # check to see if messages table exists, create if not 81 | messages_table_path = f"{pipeline_args_dict['project']}.{known_args.dataset_id}.{known_args.table_id}" 82 | try: 83 | table = bq_client.get_table(messages_table_path) 84 | except NotFound: 85 | table_ref = dataset.table(known_args.table_id) 86 | table = bigquery.Table(table_ref, schema=schema_defs.ccl_messages_schema) 87 | table = bq_client.create_table(table) 88 | 89 | # check to see if nested table exists, create if not 90 | nested_table_path = f"{pipeline_args_dict['project']}.{known_args.dataset_id}.{known_args.table_id}_nested" 91 | try: 92 | table = bq_client.get_table(nested_table_path) 93 | except NotFound: 94 | table_ref = dataset.table(f"{known_args.table_id}_nested") 95 | table = bigquery.Table(table_ref, schema=schema_defs.ccl_messages_nested_schema) 96 | table = bq_client.create_table(table) 97 | 98 | pipeline_options = PipelineOptions(pipeline_args) 99 | pipeline_options.view_as(SetupOptions).save_main_session = False 100 | pipeline_options.view_as(StandardOptions).streaming = True 101 | 102 | p = beam.Pipeline(options=pipeline_options) 103 | messages = p | "read messages" >> beam.io.ReadFromPubSub(subscription = sub_path) 104 | decoded_messages = messages | "decode bytes" >> beam.Map(lambda x: x.decode('utf-8')) 105 | json_messages = decoded_messages | "convert to json" >> beam.Map(lambda x: json.loads(x)) 106 | 107 | # write the transactions as is into messages table 108 | json_messages | "write messages to BQ" >> beam.io.WriteToBigQuery( 109 | messages_table_path.replace(".", ":", 1), 110 | schema=schema_defs.beam_messages_schema, 111 | write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, 112 | 
create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED 113 | ) 114 | 115 | # convert rows into kv pairs, window them, group them, create BQ Row 116 | pos_sale_kvs = json_messages | "create key/value pairs" >> beam.Map(make_kv) 117 | windowed_kvs = pos_sale_kvs | "window elements" >> beam.WindowInto(window.FixedWindows(10)) 118 | nested_rows = windowed_kvs | "group per key/window" >> beam.GroupByKey() 119 | nested_labelled_rows = nested_rows | "create BQ nested row" >> beam.ParDo(CreateBQRow()) 120 | 121 | # then stream rows into BQ nested table 122 | nested_labelled_rows | "write nested rows to BQ" >> beam.io.WriteToBigQuery( 123 | nested_table_path.replace(".", ":", 1), 124 | schema=schema_defs.beam_messages_nested_schema, 125 | write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, 126 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED 127 | ) 128 | 129 | if "DataflowRunner" in pipeline_args: 130 | p.run() 131 | else: 132 | p.run().wait_until_finish() -------------------------------------------------------------------------------- /bigquery/information_schema_examples.sql: -------------------------------------------------------------------------------- 1 | -- This file contains a list of example queries illustrating the power and benefit of using BigQuery INFORMATION_SCHEMA views. 2 | 3 | -- Use case: Identify top users by query volume and bytes processed in the last 7 days. 4 | -- This query helps monitor user activity and analyze cost drivers, especially useful for environments with on-demand pricing. 5 | -- It works by aggregating the total bytes processed and number of queries per user from the JOBS_BY_PROJECT view, filtered to the past week, and sorts the results to show the most active users. 6 | SELECT 7 | user_email, 8 | SUM(total_bytes_processed) AS total_bytes_processed, 9 | COUNT(job_id) AS total_queries, 10 | FROM 11 | `region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT` 12 | WHERE 13 | job_type = 'QUERY' 14 | AND creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY) AND CURRENT_TIMESTAMP() 15 | GROUP BY 16 | user_email 17 | ORDER BY 18 | total_bytes_processed DESC 19 | LIMIT 10; 20 | 21 | 22 | -- Use case: Analyze table storage efficiency and compression to optimize costs and identify tables that might benefit from physical storage pricing. 23 | -- This query retrieves logical and physical storage metrics for each table, calculates compression ratios, and highlights tables with high or low compression. 24 | -- It also computes the proportion of data stored long-term, helping you spot tables that may benefit from partitioning or clustering. 25 | SELECT 26 | table_schema, 27 | table_name, 28 | total_logical_bytes, 29 | total_physical_bytes, 30 | IF 31 | (total_logical_bytes = 0, 0, (1-ROUND(total_physical_bytes/total_logical_bytes, 2)))*100 AS compression_ratio, 32 | active_logical_bytes, 33 | long_term_logical_bytes, 34 | IF 35 | (total_logical_bytes = 0, 0, (1-ROUND(active_logical_bytes/total_logical_bytes, 2)))*100 AS long_term_ratio, 36 | FROM 37 | `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_PROJECT` 38 | ORDER BY 39 | compression_ratio DESC; 40 | 41 | 42 | -- Use case: Identify recent queries that performed full table scans, which can be costly and indicate missing WHERE filters. 43 | -- This query finds SELECT statements executed in the last 24 hours that do not contain a WHERE clause, highlighting queries likely to scan entire tables. 
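-- Note: the REGEXP_CONTAINS(query, r'WHERE') check below is a case-sensitive heuristic; it can miss a
-- lowercase 'where' and does not consider partition filters, so treat matches as candidates for review.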
44 | -- It helps pinpoint opportunities to optimize queries and reduce costs by adding filters or partitioning. 45 | SELECT 46 | query, 47 | user_email, 48 | total_bytes_processed 49 | FROM 50 | `region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT` 51 | WHERE 52 | job_type = 'QUERY' 53 | AND statement_type = 'SELECT' 54 | AND total_bytes_processed > 0 55 | AND creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 24 HOUR) AND CURRENT_TIMESTAMP() 56 | AND NOT REGEXP_CONTAINS(query, r'WHERE') 57 | ORDER BY 58 | total_bytes_processed DESC 59 | LIMIT 10; 60 | 61 | -- Use case: Find tables that have not been accessed in the last 90 days to identify candidates for cleanup, archiving, or cost optimization. 62 | -- This query combines job history and table metadata to determine the last time each table was queried, then filters for tables with no recent access. 63 | WITH 64 | recent_access AS ( 65 | SELECT 66 | rt.project_id AS project_id, 67 | rt.dataset_id AS dataset_id, 68 | rt.table_id AS table_name, 69 | MAX(j.creation_time) AS last_access_time 70 | FROM 71 | `region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT` AS j 72 | CROSS JOIN 73 | UNNEST(j.referenced_tables) AS rt 74 | WHERE 75 | j.job_type = "QUERY" 76 | AND j.state = "DONE" 77 | GROUP BY 78 | project_id, 79 | dataset_id, 80 | table_name ) 81 | -- 2) List all tables in the project and left-join access info 82 | SELECT 83 | t.table_catalog AS project_id, 84 | t.table_schema AS dataset_id, 85 | t.table_name, 86 | r.last_access_time 87 | FROM 88 | `region-us.INFORMATION_SCHEMA.TABLES` AS t 89 | LEFT JOIN 90 | recent_access AS r 91 | ON 92 | t.table_catalog = r.project_id 93 | AND t.table_schema = r.dataset_id 94 | AND t.table_name = r.table_name 95 | -- 3) Filter to those not accessed in the last 90 days (or never accessed) 96 | WHERE 97 | COALESCE(r.last_access_time, TIMESTAMP '1970-01-01') < TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 90 DAY) 98 | ORDER BY 99 | r.last_access_time; 100 | 101 | 102 | -- Use case: Monitor recent slot usage for jobs to analyze resource consumption, do slot budgeting, allocate costs, etc.. 103 | -- This query lists jobs executed in the last hour in a given region/project that used slots, showing who ran them and how many slot milliseconds were consumed. 104 | -- It helps identify users or jobs with high resource usage 105 | SELECT 106 | creation_time, 107 | job_id, 108 | user_email, 109 | total_slot_ms, 110 | FROM 111 | `region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT` 112 | WHERE 113 | creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 HOUR) AND CURRENT_TIMESTAMP() 114 | AND total_slot_ms > 0 115 | ORDER BY 116 | total_slot_ms DESC; 117 | 118 | -- Use case: Audit and document all view definitions in a project for governance, troubleshooting, or migration. 119 | -- This query lists every view in the project/region along with its SQL definition, making it easy to review logic, dependencies, and ensure compliance. 120 | SELECT 121 | table_schema, 122 | table_name, 123 | view_definition 124 | FROM 125 | `region-us.INFORMATION_SCHEMA.VIEWS` 126 | ORDER BY 127 | table_schema, 128 | table_name; -------------------------------------------------------------------------------- /dlp-demo/app/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modern Google Cloud DLP demonstration application. 
3 | 4 | This module provides a Flask web application that demonstrates Google Cloud 5 | Data Loss Prevention (DLP) API capabilities including text inspection, 6 | redaction, replacement, and masking of sensitive information. 7 | """ 8 | 9 | import os 10 | import os 11 | import sys 12 | from typing import Any, Dict, Tuple 13 | 14 | from flask import Flask, render_template, request, jsonify 15 | from flask_cors import CORS 16 | from dotenv import load_dotenv 17 | 18 | # Handle imports for both direct execution and module import 19 | if __name__ == "__main__": 20 | # When run directly, add parent directory to path and use absolute imports 21 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 22 | from app.services.dlp_service import DLPService 23 | from app.config import Config 24 | else: 25 | # When imported as a module, use relative imports 26 | from .services.dlp_service import DLPService 27 | from .config import Config 28 | 29 | # Load environment variables 30 | load_dotenv() 31 | 32 | 33 | def create_app(config_class: type = Config) -> Flask: 34 | """Application factory pattern for creating Flask app instances. 35 | 36 | Args: 37 | config_class: Configuration class to use for the app 38 | 39 | Returns: 40 | Configured Flask application instance 41 | """ 42 | app = Flask(__name__) 43 | app.config.from_object(config_class) 44 | 45 | # Enable CORS for all domains and routes 46 | CORS(app, resources={r"/*": {"origins": "*"}}) 47 | 48 | # Initialize services with error handling 49 | dlp_service = None 50 | if app.config["GOOGLE_CLOUD_PROJECT"]: 51 | try: 52 | dlp_service = DLPService(project_id=app.config["GOOGLE_CLOUD_PROJECT"]) 53 | except Exception as e: 54 | app.logger.warning(f"Failed to initialize DLP service: {str(e)}") 55 | else: 56 | app.logger.warning( 57 | "GOOGLE_CLOUD_PROJECT not configured - DLP functionality will be disabled" 58 | ) 59 | 60 | @app.route("/") 61 | def index() -> str: 62 | """Serve the main DLP demo page.""" 63 | return render_template("dlp-demo.html", title="DLP Demo 2.0") 64 | 65 | @app.route("/health") 66 | def health() -> Tuple[Dict[str, str], int]: 67 | """Health check endpoint for Cloud Run.""" 68 | return {"status": "healthy"}, 200 69 | 70 | @app.route("/api/dlp", methods=["POST"]) 71 | def process_dlp() -> Tuple[Dict[str, Any], int]: 72 | """Process text with DLP operations. 
73 | 74 | Expected JSON payload: 75 | { 76 | "text": "Text to process", 77 | "action": "inspect|redact|replace|mask" 78 | } 79 | 80 | Returns: 81 | JSON response with DLP results 82 | """ 83 | try: 84 | data = request.get_json() 85 | if not data or "text" not in data or "action" not in data: 86 | return {"error": "Missing required fields: text, action"}, 400 87 | 88 | text = data["text"] 89 | action = data["action"] 90 | 91 | if not text.strip(): 92 | return {"error": "Text cannot be empty"}, 400 93 | 94 | # Check if DLP service is available 95 | if dlp_service is None: 96 | return { 97 | "error": ( 98 | "DLP service not available - check GOOGLE_CLOUD_PROJECT configuration" 99 | ) 100 | }, 503 101 | 102 | if action == "inspect": 103 | result = dlp_service.inspect_text(text) 104 | elif action in ["redact", "replace", "mask"]: 105 | result = dlp_service.deidentify_text(text, action) 106 | else: 107 | return {"error": f"Invalid action: {action}"}, 400 108 | 109 | return {"result": result}, 200 110 | 111 | except Exception as e: 112 | app.logger.error(f"Error processing DLP request: {str(e)}") 113 | return {"error": "Internal server error"}, 500 114 | 115 | @app.errorhandler(404) 116 | def not_found(error: Any) -> Tuple[str, int]: 117 | """Handle 404 errors with a custom page.""" 118 | return ( 119 | render_template( 120 | "message.html", 121 | headline="404 - Page Not Found", 122 | message_text="The page you're looking for doesn't exist.", 123 | title="Not Found", 124 | ), 125 | 404, 126 | ) 127 | 128 | @app.errorhandler(500) 129 | def internal_error(error: Any) -> Tuple[str, int]: 130 | """Handle 500 errors with a custom page.""" 131 | return ( 132 | render_template( 133 | "message.html", 134 | headline="500 - Internal Server Error", 135 | message_text="Something went wrong on our end.", 136 | title="Server Error", 137 | ), 138 | 500, 139 | ) 140 | 141 | return app 142 | 143 | 144 | # Create app instance for gunicorn 145 | app = create_app() 146 | 147 | 148 | if __name__ == "__main__": 149 | # Development server 150 | port = int(os.environ.get("PORT", 8080)) 151 | debug = os.environ.get("FLASK_ENV") == "development" 152 | 153 | app.run(host="0.0.0.0", port=port, debug=debug) 154 | -------------------------------------------------------------------------------- /dataplex/profiling/README.md: -------------------------------------------------------------------------------- 1 | ### Dataplex Data Profiling Demonstration 2 | 3 | This document provides a simplified, but realistic, demonstration of how Dataplex can be used for data profiling and quality checks in a production environment. 4 | 5 | #### 1. Simplified Data Engineer Workflow 6 | 7 | This workflow models a common scenario where a data engineer needs to validate new data from an external source before it can be used for downstream analytics. 8 | 9 | **Scenario:** A company receives daily sales data as a CSV file in a Google Cloud Storage (GCS) bucket. The data engineering team must ensure the data is complete, has the correct data types, and meets business rules before it's moved to a data warehouse. 10 | 11 | **Workflow Stages:** 12 | 13 | 1. **Ingestion:** A partner uploads a CSV file (`sales_data.csv`) to a designated **raw GCS bucket**. 14 | 2. **Data Discovery & Profiling (Dataplex):** A **Dataplex discovery job** automatically scans the GCS bucket. When a new file is detected, it triggers a **data profiling task**. 
15 | * **Goal:** Automatically analyze the structure, data types, value distributions, and potential anomalies in the new data. 16 | 3. **Quality Check & Validation (Dataplex Data Quality):** A **Dataplex data quality scan** runs on the profiled data to enforce business rules. 17 | * **Goal:** Validate specific rules, such as "the `transaction_id` column must not contain null values." 18 | 4. **Action:** 19 | * **Success:** If the data passes the quality checks, it's considered clean and ready for analysis. 20 | * **Failure:** If the data fails, a data engineer is alerted to investigate the data quality issue. 21 | 5. **Preparation for Analytics:** The validated, clean data is now ready for use by business analysts and data scientists. 22 | 23 | ----- 24 | 25 | #### 2\. Step-by-Step Demonstration Instructions 26 | 27 | This guide will walk you through setting up the core Dataplex components and then simulating the ingestion of data to see the process in action. 28 | 29 | **Prerequisites:** 30 | 31 | * A Google Cloud Project with billing enabled. 32 | * The `gcloud` CLI installed and authenticated. 33 | * Terraform installed. 34 | * Necessary IAM roles for your user account (e.g., `Owner` or `Project Editor` for a demo environment). 35 | 36 | **Step 1: Set Up the Project with Terraform** 37 | 38 | 1. Create a new directory for your Terraform code. 39 | 40 | 2. Create the `main.tf` and `dataplex.tf` files with the code provided below. 41 | 42 | 3. Replace `your-gcp-project-id` with your actual project ID in `main.tf`. 43 | 44 | 4. Open a terminal in your project directory and run the following commands: 45 | 46 | ```bash 47 | terraform init 48 | terraform apply 49 | ``` 50 | 51 | 5. When prompted, type `yes` to approve the creation of the resources. The output will show the names of the resources created, including the raw GCS bucket. 52 | 53 | **Step 2: Generate Sample Data** 54 | 55 | 1. Create a CSV file named `sales_data.csv` with the following content. This represents our "good" data. 56 | 57 | ```csv 58 | transaction_id,product_sku,sale_amount,sale_date 59 | 1001,SKU-A,12.50,2025-08-01 60 | 1002,SKU-B,25.00,2025-08-01 61 | 1003,SKU-A,12.50,2025-08-02 62 | 1004,SKU-C,75.25,2025-08-02 63 | ``` 64 | 65 | **Step 3: Trigger the Workflow (Simulated Ingestion)** 66 | 67 | 1. Upload the `sales_data.csv` file to the GCS bucket created by Terraform. Find the bucket name in the Terraform output (it will be `demo-raw-bucket-`). 68 | 69 | ```bash 70 | gcloud storage cp sales_data.csv gs:///sales/sales_data.csv 71 | ``` 72 | 73 | **Step 4: Observe Dataplex in the Console** 74 | 75 | 1. Navigate to the **Dataplex** UI in the Google Cloud Console. 76 | 2. Click on your lake, zone, and asset. 77 | 3. In the asset details, you'll see a **Profiling** tab. Within a few minutes, the discovery and profiling job will have run on your new file. 78 | 4. On the **Profiling** tab, you'll see a detailed analysis of your data: 79 | * **Data Types:** Confirm that `transaction_id` is an `INTEGER`, `sale_amount` is a `FLOAT`, and `sale_date` is a `DATE` or `TIMESTAMP`. 80 | * **Statistics:** See metrics like mean, min, max, and standard deviation. 81 | * **Value Distribution:** View the distribution of values for each column. 82 | * **Null Count:** Observe that the `transaction_id` column has a null count of 0. 83 | 84 | **Step 5: Inspect the Data Quality Scan Results** 85 | 86 | 1. Go to the **Data quality scans** section in the Dataplex UI. 87 | 2. Find the scan you created (`sales-data-quality-scan`). 88 | 3. 
Click on the scan to view the results. The scan should show a **Success** status, indicating that the `NOT_NULL` check passed. 89 | 90 | **Step 6: Demonstrate a Failure Scenario (Optional)** 91 | 92 | 1. Create a new CSV file named `bad_sales_data.csv` that contains a null value in the `transaction_id` column. 93 | 94 | ```csv 95 | transaction_id,product_sku,sale_amount,sale_date 96 | 1005,SKU-D,50.00,2025-08-03 97 | ,SKU-E,15.25,2025-08-03 98 | ``` 99 | 100 | 2. Upload this file to the GCS bucket: 101 | 102 | ```bash 103 | gcloud storage cp bad_sales_data.csv gs:///sales/bad_sales_data.csv 104 | ``` 105 | 106 | 3. Wait for the discovery and profiling jobs to run. 107 | 108 | 4. Check the **Data quality scans** again. The new run of the `sales-data-quality-scan` should show a **Failure** status due to the null value. 109 | -------------------------------------------------------------------------------- /dataplex/profiling/profile.md: -------------------------------------------------------------------------------- 1 | # E-commerce Data Pipeline with Dataplex Profiling 2 | 3 | ## Overview 4 | 5 | This document details a realistic data engineering pipeline that leverages Google Cloud Dataplex for data profiling, showing how profiling integrates into the data flow and drives automated decision-making. 6 | 7 | ## Example: E-commerce Customer Analytics Pipeline 8 | 9 | **Context**: An e-commerce company ingests customer transaction data from multiple sources (web, mobile app, point-of-sale systems) and needs to maintain data quality while building analytics datasets. 10 | 11 | ## The Pipeline Flow 12 | 13 | ### 1. Data Ingestion 14 | - Raw transaction data lands in Cloud Storage buckets (JSON, CSV, Parquet files) 15 | - Data comes from web analytics, mobile apps, and POS systems 16 | - Files arrive throughout the day with varying schemas and quality 17 | 18 | ### 2. Dataplex Discovery & Profiling 19 | - Dataplex automatically discovers new datasets in the storage buckets 20 | - Profiling jobs run on a schedule (e.g., every 4 hours for new data, daily for full datasets) 21 | - Profiles capture: 22 | - Schema drift detection 23 | - Null value percentages 24 | - Data type distributions 25 | - Value ranges and outliers 26 | - Duplicate records 27 | - Pattern matching for emails, phone numbers, etc. 28 | 29 | ### 3. Data Quality Assessment 30 | - Cloud Functions triggered by Dataplex profiling completion 31 | - Custom logic evaluates profiling results against business rules: 32 | - Email fields must be >95% valid format 33 | - Transaction amounts must be within expected ranges 34 | - Customer IDs must have <1% null values 35 | - Schema changes trigger alerts 36 | 37 | ### 4. Automated Responses 38 | - **Pass Quality Gates**: Data flows to BigQuery staging tables 39 | - **Fail Quality Gates**: 40 | - Quarantine bad data to separate storage 41 | - Send alerts to data engineering team 42 | - Create incident tickets automatically 43 | - Block downstream processing 44 | 45 | ### 5. Data Processing & Enrichment 46 | - Dataflow/Dataproc jobs process qualified data 47 | - Join with reference data, apply business logic 48 | - Create customer 360 views, product analytics 49 | 50 | ### 6. 
Consumption 51 | - Clean data lands in BigQuery data warehouse 52 | - Powers BI dashboards, ML models, and operational reports 53 | - Data lineage tracked through Dataplex 54 | 55 | ## How Profiling Results Drive Actions 56 | 57 | ### Schema Evolution 58 | When profiling detects new fields in mobile app data, the pipeline automatically updates BigQuery schemas and notifies analysts of new available data. 59 | 60 | ### Data Quality Monitoring 61 | If profiling shows transaction amounts with unusual spikes, the system quarantines that batch and alerts the team to investigate potential data corruption. 62 | 63 | ### Performance Optimization 64 | Profiling results showing high cardinality fields inform partitioning strategies in BigQuery. 65 | 66 | ## Pipeline Architecture Diagram 67 | 68 | ```mermaid 69 | graph TD 70 | A[Web Analytics] --> D[Cloud Storage Raw Zone] 71 | B[Mobile App] --> D 72 | C[POS Systems] --> D 73 | 74 | D --> E[Dataplex Discovery & Auto-Profiling] 75 | E --> F{Data Quality Assessment} 76 | 77 | F --> G[Profile Results Analysis
<br/>- Schema validation<br/>- Null value checks<br/>- Range validation<br/>- Pattern matching] 78 | 79 | G --> H{Quality Gates} 80 | 81 | H -->|Pass| I[Cloud Storage Curated Zone] 82 | H -->|Fail| J[Quarantine Storage] 83 | 84 | J --> K[Alert System<br/>- Slack notifications<br/>- Incident tickets<br/>- Data team alerts] 85 | 86 | I --> L[Dataflow Processing<br/>- Data cleansing<br/>- Business logic<br/>- Enrichment] 87 | 88 | L --> M[BigQuery Data Warehouse<br/>- Customer 360<br/>- Product analytics<br/>- Transaction history] 89 | 90 | M --> N[Analytics & ML<br/>- BI Dashboards<br/>- Recommendation engine<br/>- Fraud detection] 91 | 92 | E --> O[Dataplex Data Catalog<br/>- Schema registry<br/>- Data lineage<br/>- Quality metrics] 93 | 94 | O --> P[Data Governance<br/>- Policy enforcement<br/>- Access controls<br/>- Compliance reporting] 95 | 96 | Q[Scheduled Triggers<br/>- Every 4 hours for new data<br/>- Daily full profiling<br/>
- Weekly deep analysis] --> E 97 | 98 | style E fill:#e1f5fe 99 | style G fill:#fff3e0 100 | style H fill:#f3e5f5 101 | style J fill:#ffebee 102 | style O fill:#e8f5e8 103 | ``` 104 | 105 | ## Key Benefits 106 | 107 | ### Proactive Quality Management 108 | Rather than discovering data issues during analysis, profiling catches problems at ingestion time, preventing downstream corruption. 109 | 110 | ### Automated Decision Making 111 | Profile results directly trigger pipeline branching logic, reducing manual intervention and improving response time. 112 | 113 | ### Continuous Monitoring 114 | Regular profiling provides trend analysis on data quality metrics, helping teams identify degrading data sources before they become critical issues. 115 | 116 | ### Operational Efficiency 117 | By automatically quarantining bad data and alerting teams with specific profiling insights, the pipeline reduces time-to-resolution for data quality incidents. 118 | 119 | ## Conclusion 120 | 121 | The profiling essentially acts as a "data firewall" - ensuring only quality data flows through to expensive processing and storage systems while providing rich metadata for governance and optimization decisions. 122 | 123 | 124 | 125 | Demo 126 | 127 | gcloud storage cp web_events_20240806_14.json gs://jwd-gcp-demos-ecommerce-raw-dev/web-analytics/ 128 | gcloud storage cp store_transactions_20240806.csv gs://jwd-gcp-demos-ecommerce-raw-dev/pos-systems/ 129 | gcloud storage cp app_events_20240806_1400.jsonl gs://jwd-gcp-demos-ecommerce-raw-dev/mobile-app/ 130 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🚀 ROI GCP Training Demos 2 | 3 | Welcome to a collection of Google Cloud Platform demonstrations and hands-on examples brought to you by ROI Training! This repository contains demos designed to illustrate key GCP concepts, best practices, and common use cases across various Google Cloud services. 4 | 5 | Whether you're an instructor leading a training session or a student exploring GCP capabilities, these demos provide hands-on experience with the most important Google Cloud services and patterns. 6 | 7 | ## 1. Quick Start 8 | 9 | Get started in just a few steps: 10 | 11 | ```bash 12 | # Clone the repository 13 | cd ~ 14 | git clone https://github.com/roitraining/gcp-demos.git 15 | cd gcp-demos 16 | 17 | # Set your project (replace with your actual project ID) 18 | export GOOGLE_CLOUD_PROJECT=your-project-id 19 | gcloud config set project $GOOGLE_CLOUD_PROJECT 20 | ``` 21 | 22 | --- 23 | 24 | ## 2. 
BigQuery 25 | 26 | *Explore the power of Google's serverless data warehouse* 27 | 28 | #### 🔍 **SQL Examples Collection** 29 | The `bigquery/` directory contains a comprehensive set of SQL examples demonstrating: 30 | - **Array Functions**: Complex array manipulations and searching (`arrays_examples.sql`) 31 | - **Approximate Functions**: Using approximate functions for large-scale analytics (`approx_example.sql`) 32 | - **ELT Patterns**: Extract, Load, Transform patterns (`elt_examples.sql`) 33 | - **External Data**: Working with Hive-style external tables (`external_hive_example.sql`) 34 | - **Information Schema**: Metadata queries and system introspection (`information_schema_examples.sql`) 35 | - **Materialized Views**: Performance optimization with precomputed results (`mv_example.sql`) 36 | - **Time Travel**: Querying historical data snapshots (`time_travel_example.sql`) 37 | - **User-Defined Functions**: Custom SQL and JavaScript functions (`udf_examples.sql`) 38 | - **Views**: Creating and managing logical views (`views_example.sql`) 39 | 40 | #### 🏗️ **Schema Design Demo** 41 | The `bigquery/schema-demo/` directory provides a complete demonstration of schema design impact: 42 | - Compare normalized vs. denormalized table performance 43 | - Explore nested and repeated fields 44 | - Understand partitioning and clustering benefits 45 | - Generate sample datasets for testing 46 | 47 | #### 📚 **Interactive Do-It-Nows** 48 | Access 20+ hands-on BigQuery activities at: **https://roitraining.github.io/gcp-demos/#0** 49 | 50 | These self-paced exercises cover everything from basic queries to advanced analytics patterns. 51 | 52 | --- 53 | 54 | ## 3. Composer (Apache Airflow) 55 | 56 | #### 🛠️ **DAG Development** 57 | The `composer/dag_development/` directory contains DAG validation tools and scripts 58 | 59 | #### 📋 **Example DAGs** 60 | The `composer/dags/` directory includes simple but useful DAG examples 61 | 62 | --- 63 | 64 | ## 4. Dataflow 65 | 66 | #### 🔄 **Streaming Pipeline Demo** 67 | The `dataflow/dflow-bq-stream-python/` directory contains a complete streaming example: 68 | - Pub/Sub to BigQuery streaming pipeline 69 | - Window functions and aggregations 70 | - Nested/repeated data handling 71 | - Local and cloud execution patterns 72 | 73 | #### 🧪 **Simple Beam Examples** 74 | The `dataflow/simple_demos/` directory provides: 75 | - Basic Apache Beam concepts 76 | - Transform examples 77 | - Pipeline patterns and best practices 78 | 79 | --- 80 | 81 | ## 5. Data Loss Prevention (DLP) 82 | 83 | #### 🌐 **Interactive DLP Demo** 84 | Experience DLP capabilities firsthand: **https://bit.ly/roi-dlp-demo** 85 | 86 | 1. Enter text with various data types in the left pane 87 | 2. Watch DLP identify and classify sensitive information 88 | 3. Experiment with different remediation strategies 89 | 4. Explore contextual confidence ratings 90 | 91 | #### 💻 **Source Code** 92 | The `dlp-demo/` directory contains the complete application source: 93 | - Cloud Run deployment configuration 94 | - Python Flask application 95 | - DLP API integration examples 96 | - Docker containerization setup 97 | 98 | --- 99 | 100 | ## 6. Dataproc 101 | 102 | #### 📈 **Scaling Demonstrations** 103 | - **Manual Scaling**: Traditional cluster resizing (`dataproc_scale_demo.sh`) 104 | - **Autoscaling**: Dynamic resource allocation (`dataproc_autoscale_demo.sh`) 105 | 106 | --- 107 | 108 | ## 7. Dataform 109 | 110 | https://github.com/jwdavis/dataform-demo 111 | 112 | --- 113 | 114 | ## 8. 
Dataplex 115 | 116 | #### 📊 **Data Profiling** 117 | The `dataplex/profiling/` directory demonstrates: 118 | - Automated data quality assessment 119 | 120 | --- 121 | 122 | ## 9. Cloud Functions 123 | 124 | Examples include: 125 | - Sample function for processing log entries received via Pub/Sub 126 | 127 | --- 128 | 129 | ## 10. Security & IAM 130 | 131 | #### 🔑 **Authentication Examples** 132 | The `security/` directory contains: 133 | - Service account authentication patterns 134 | - OAuth and API key management 135 | - Organization policy examples and constraints 136 | 137 | --- 138 | 139 | ## 🚀 11. Coming Soon... 140 | 141 | The following areas are under active development: 142 | 143 | - **Pub/Sub**: Messaging and event streaming examples 144 | - **Terraform**: Infrastructure as Code templates 145 | - **Utilities**: Helper scripts and tools 146 | 147 | --- 148 | 149 | ## 📋 Quick Reference 150 | 151 | | Service | Directory | Key Features | 152 | | -------- | ----------- | ---------------------------------------------- | 153 | | BigQuery | `bigquery/` | SQL examples, schema design, analytics | 154 | | Composer | `composer/` | Airflow DAGs, workflow orchestration | 155 | | Dataflow | `dataflow/` | Streaming pipelines, Apache Beam | 156 | | DLP | `dlp-demo/` | Data classification, sensitive data protection | 157 | | Dataproc | `dataproc/` | Spark/Hadoop clusters, scaling demos | 158 | | Security | `security/` | IAM, authentication, policies | 159 | 160 | Happy learning! 🎓 161 | -------------------------------------------------------------------------------- /dlp-demo/app/services/dlp_service.py: -------------------------------------------------------------------------------- 1 | """ 2 | Google Cloud DLP service wrapper. 3 | 4 | This module provides a clean interface to Google Cloud DLP API for 5 | text inspection and de-identification operations. 6 | """ 7 | 8 | from typing import Dict, List, Any, Optional 9 | import logging 10 | 11 | from google.cloud import dlp_v2 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class DLPService: 17 | """Service class for Google Cloud DLP operations.""" 18 | 19 | # Standard info types to detect 20 | DEFAULT_INFO_TYPES = [ 21 | "EMAIL_ADDRESS", 22 | "CREDIT_CARD_NUMBER", 23 | "GENERIC_ID", 24 | "IP_ADDRESS", 25 | "PHONE_NUMBER", 26 | "US_DRIVERS_LICENSE_NUMBER", 27 | "US_SOCIAL_SECURITY_NUMBER", 28 | "PERSON_NAME", 29 | "US_PASSPORT", 30 | "DATE_OF_BIRTH", 31 | ] 32 | 33 | def __init__(self, project_id: str, info_types: Optional[List[str]] = None): 34 | """Initialize the DLP service. 35 | 36 | Args: 37 | project_id: Google Cloud project ID 38 | info_types: List of info types to detect (uses defaults if None) 39 | """ 40 | self.project_id = project_id 41 | self.info_types = info_types or self.DEFAULT_INFO_TYPES 42 | self.client = dlp_v2.DlpServiceClient() 43 | self.parent = f"projects/{project_id}" 44 | 45 | logger.info(f"Initialized DLP service for project: {project_id}") 46 | 47 | def inspect_text(self, text: str) -> str: 48 | """Inspect text for sensitive information. 
49 | 50 | Args: 51 | text: Text to inspect 52 | 53 | Returns: 54 | HTML-formatted string with inspection results 55 | """ 56 | try: 57 | # Configure inspection 58 | inspect_config = { 59 | "info_types": [{"name": info_type} for info_type in self.info_types], 60 | "include_quote": True, 61 | "min_likelihood": "POSSIBLE", 62 | } 63 | 64 | item = {"value": text} 65 | 66 | # Call the API 67 | response = self.client.inspect_content( 68 | request={ 69 | "parent": self.parent, 70 | "inspect_config": inspect_config, 71 | "item": item, 72 | } 73 | ) 74 | 75 | # Format results 76 | if response.result.findings: 77 | result_parts = [] 78 | for finding in response.result.findings: 79 | parts = [] 80 | if finding.quote: 81 | parts.append(f"Quote: {finding.quote}") 82 | parts.append( 83 | f"Info type: {finding.info_type.name}" 84 | ) 85 | parts.append( 86 | f"Likelihood: {finding.likelihood.name}" 87 | ) 88 | 89 | if finding.location.byte_range.start: 90 | start = finding.location.byte_range.start 91 | end = finding.location.byte_range.end 92 | parts.append(f"Location: {start}-{end}") 93 | 94 | result_parts.append("<br>".join(parts)) 95 | 96 | return "<br><br>".join(result_parts) 97 | else: 98 | return "No sensitive information detected." 99 | 100 | except Exception as e: 101 | logger.error(f"Error inspecting text: {str(e)}") 102 | return f"Error during inspection: {str(e)}" 103 | 104 | def deidentify_text(self, text: str, action: str) -> str: 105 | """De-identify sensitive information in text. 106 | 107 | Args: 108 | text: Text to de-identify 109 | action: Type of de-identification ('redact', 'replace', 'mask') 110 | 111 | Returns: 112 | HTML-formatted string with de-identified text 113 | """ 114 | try: 115 | # Configure inspection 116 | inspect_config = { 117 | "info_types": [{"name": info_type} for info_type in self.info_types] 118 | } 119 | 120 | # Configure transformation based on action 121 | if action == "redact": 122 | transformation = {"redact_config": {}} 123 | elif action == "replace": 124 | transformation = { 125 | "replace_config": {"new_value": {"string_value": "[REDACTED]"}} 126 | } 127 | elif action == "mask": 128 | transformation = { 129 | "character_mask_config": { 130 | "masking_character": "#", 131 | "number_to_mask": 0, # Mask all characters 132 | "characters_to_ignore": [{"characters_to_skip": "(),-/@."}], 133 | } 134 | } 135 | else: 136 | raise ValueError(f"Unsupported action: {action}") 137 | 138 | # Configure de-identification 139 | deidentify_config = { 140 | "info_type_transformations": { 141 | "transformations": [{"primitive_transformation": transformation}] 142 | } 143 | } 144 | 145 | item = {"value": text} 146 | 147 | # Call the API 148 | response = self.client.deidentify_content( 149 | request={ 150 | "parent": self.parent, 151 | "inspect_config": inspect_config, 152 | "deidentify_config": deidentify_config, 153 | "item": item, 154 | } 155 | ) 156 | 157 | # Return formatted result 158 | result_text = response.item.value 159 | return "<br>".join(result_text.split("\n")) 160 | 161 | except Exception as e: 162 | logger.error(f"Error de-identifying text with action '{action}': {str(e)}") 163 | return f"Error during {action}: {str(e)}" 164 | -------------------------------------------------------------------------------- /dlp-demo/app/templates/dlp-demo.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block head %} 4 | {{ title }} 5 | 6 | 7 | 41 | {% endblock %} 42 | 43 | {% block content %} 44 |
45 | [HTML markup for the remainder of the content block was stripped during extraction; the surviving text labels are: "Source Text", "Processing...", "Waiting for input to finish...", "✓ Processed", "Enter text above to see DLP results", "DLP Results"]
107 | {% endblock %} 108 | 109 | {% block scripts %} 110 | 186 | {% endblock %} 187 | -------------------------------------------------------------------------------- /bigquery/scds_examples.sql: -------------------------------------------------------------------------------- 1 | -- this is all very much in progress 2 | -- # dml to update array, match element in array, update element in array 3 | 4 | -- take 1 5 | -- rebuild the entire table 6 | -- takes about 18 minutes 7 | CREATE OR REPLACE TABLE 8 | `bq_demo.nested_once` AS ( 9 | WITH 10 | 11 | -- denormalize the nested table 12 | denorm AS ( 13 | SELECT 14 | * EXCEPT (line_items) 15 | FROM 16 | `bq_demo.nested_once` n, 17 | n.line_items ), 18 | 19 | -- update line_items with new price if there's an entry in price_updates 20 | updated AS ( 21 | SELECT 22 | denorm.* EXCEPT(prod_price), 23 | if(pu.prod_price is not null, pu.prod_price, denorm.prod_price) AS prod_price 24 | FROM 25 | denorm 26 | left JOIN 27 | `bq_demo.price_updates` pu 28 | ON 29 | denorm.prod_code = pu.prod_code) 30 | 31 | -- reconstitute the nested table 32 | SELECT 33 | * EXCEPT (line_item_num, 34 | prod_code, 35 | qty, 36 | prod_name, 37 | prod_desc, 38 | prod_price), 39 | ARRAY_AGG(STRUCT(line_item_num, 40 | prod_code, 41 | qty, 42 | prod_name, 43 | prod_desc, 44 | prod_price)) as line_items 45 | FROM 46 | updated 47 | GROUP BY 48 | order_num, 49 | order_date, 50 | cust_phone, 51 | cust_email, 52 | cust_zip, 53 | cust_state, 54 | cust_address, 55 | cust_name, 56 | cust_id) 57 | 58 | -- take 2 59 | -- stored procedure to update nested once 60 | -- uses temporary table, updates only rows that need to be updated 61 | -- takes about 19 minutes 62 | BEGIN 63 | -- generate the update table that has only updated order rows 64 | CREATE TEMPORARY TABLE m AS ( 65 | WITH 66 | -- denormalize the nested table 67 | denorm AS ( 68 | SELECT 69 | * EXCEPT(line_items) 70 | FROM 71 | `bq_demo.nested_once` o, 72 | UNNEST(line_items) l), 73 | 74 | -- get the order numbers that have rows that need to be updated 75 | order_numbers AS ( 76 | SELECT 77 | order_num 78 | FROM 79 | denorm d 80 | JOIN 81 | `bq_demo.price_updates` p 82 | ON 83 | d.prod_code = p.prod_code 84 | GROUP BY 85 | order_num), 86 | 87 | -- get the rows that need to be updated 88 | relevant AS ( 89 | SELECT 90 | d.* 91 | FROM 92 | denorm d 93 | JOIN 94 | order_numbers o 95 | ON 96 | o.order_num = d.order_num ), 97 | 98 | -- update line_items with new price if there's an entry in price_updates 99 | updated AS ( 100 | SELECT 101 | r.* EXCEPT (prod_price), 102 | IFNULL(p.prod_price,r.prod_price) AS prod_price 103 | FROM 104 | relevant r 105 | LEFT JOIN 106 | `bq_demo.price_updates` p 107 | ON 108 | r.prod_code = p.prod_code) 109 | 110 | -- reconstitute the nested table 111 | SELECT 112 | * EXCEPT (line_item_num, 113 | prod_code, 114 | qty, 115 | prod_name, 116 | prod_desc, 117 | prod_price), 118 | ARRAY_AGG(STRUCT(line_item_num, 119 | prod_code, 120 | qty, 121 | prod_name, 122 | prod_desc, 123 | prod_price)) AS line_items 124 | FROM 125 | updated 126 | GROUP BY 127 | order_num, 128 | order_date, 129 | cust_phone, 130 | cust_email, 131 | cust_zip, 132 | cust_state, 133 | cust_address, 134 | cust_name, 135 | cust_id); 136 | 137 | -- merge the updated order rows into the original table 138 | MERGE 139 | `bq_demo.nested_once` n 140 | USING 141 | m 142 | ON 143 | m.order_num = n.order_num 144 | WHEN MATCHED THEN UPDATE SET line_items = m.line_items; 145 | END 146 | 147 | -- take 3 148 | -- stored procedure to update nested once 
149 | -- does order search before denormalizing 150 | -- avoids temporary table 151 | -- takes like 12-13 minutes 152 | BEGIN 153 | -- create an array of price_updates 154 | -- we can use this to filter rows that need to be updated 155 | -- and avoid the join 156 | DECLARE 157 | prod_codes DEFAULT (array( 158 | SELECT 159 | prod_code 160 | FROM 161 | bq_demo.price_updates)); 162 | 163 | -- avoid the temp table, put everything into the merge 164 | MERGE 165 | bq_demo.nested_once n 166 | USING 167 | ( 168 | WITH 169 | 170 | -- get the rows that need to be updated 171 | relevant AS ( 172 | SELECT 173 | * 174 | FROM 175 | bq_demo.nested_once 176 | WHERE 177 | EXISTS ( 178 | SELECT 179 | * 180 | FROM 181 | UNNEST(line_items) as li 182 | WHERE 183 | prod_code IN unnest(prod_codes))), 184 | 185 | -- denormalize the rows that need to be updated 186 | denorm AS ( 187 | SELECT 188 | * EXCEPT (line_items) 189 | FROM 190 | relevant, 191 | relevant.line_items), 192 | 193 | -- update line_items with new price if there's an entry in price_updates 194 | updated AS ( 195 | SELECT 196 | d.* EXCEPT (prod_price), 197 | IFNULL(p.prod_price,d.prod_price) AS prod_price 198 | FROM 199 | denorm d 200 | LEFT JOIN 201 | `bq_demo.price_updates` p 202 | ON 203 | d.prod_code = p.prod_code) 204 | 205 | -- reconstitute the nested table 206 | SELECT 207 | * EXCEPT (line_item_num, 208 | prod_code, 209 | qty, 210 | prod_name, 211 | prod_desc, 212 | prod_price), 213 | ARRAY_AGG(STRUCT(line_item_num, 214 | prod_code, 215 | qty, 216 | prod_name, 217 | prod_desc, 218 | prod_price)) AS line_items 219 | FROM 220 | updated 221 | GROUP BY 222 | order_num, 223 | order_date, 224 | cust_phone, 225 | cust_email, 226 | cust_zip, 227 | cust_state, 228 | cust_address, 229 | cust_name, 230 | cust_id) u 231 | ON 232 | u.order_num = n.order_num 233 | -- replace array with new array with update values 234 | WHEN MATCHED THEN UPDATE SET line_items = u.line_items; 235 | END 236 | 237 | --take 4 238 | -- stored procedure to update nested once 239 | -- filters before denormalizing 240 | -- uses merge to update denorm 241 | -- uses merge to reconstitute rows then merge into source 242 | -- takes 12-13 minutes (12.5 with 5K slots) 243 | BEGIN 244 | 245 | -- create an array of price_updates 246 | DECLARE 247 | prod_codes DEFAULT (ARRAY( 248 | SELECT 249 | prod_code 250 | FROM 251 | bq_demo.price_updates)); 252 | -- find the rows that need to be updated 253 | -- denormalize them 254 | CREATE TEMPORARY TABLE denorm AS ( 255 | WITH 256 | rows_to_update AS ( 257 | SELECT 258 | * 259 | FROM 260 | bq_demo.nested_once 261 | WHERE 262 | EXISTS ( 263 | SELECT 264 | * 265 | FROM 266 | UNNEST(line_items) AS li 267 | WHERE 268 | prod_code IN UNNEST(prod_codes))) 269 | SELECT 270 | * EXCEPT (line_items) 271 | FROM 272 | rows_to_update, 273 | rows_to_update.line_items); 274 | -- update the denormalized rows 275 | MERGE 276 | denorm d 277 | USING 278 | bq_demo.price_updates p 279 | ON 280 | d.prod_code = p.prod_code 281 | WHEN MATCHED THEN 282 | UPDATE 283 | SET prod_price = p.prd_price; 284 | 285 | -- merge 286 | -- create nested rows, then replace the old rows 287 | MERGE bq_demo.nested_once n 288 | USING ( 289 | SELECT * EXCEPT (line_item_num, prod_code, qty, prod_name, prod_desc, prod_price), ARRAY_AGG(STRUCT(line_item_num, prod_code, qty, prod_name, prod_desc, prod_price)) AS line_items FROM denorm d GROUP BY order_num, order_date, cust_phone, cust_email, cust_zip, cust_state, cust_address, cust_name, cust_id) u 290 | ON u.order_num = n.order_num 291 | WHEN 
MATCHED THEN 292 | UPDATE 293 | SET 294 | line_items = u.line_items; 295 | END 296 | 297 | -- To dos 298 | -- # dml for type 1 dimension 299 | -- # dml for type 2 dimension 300 | -- # dml to update array, match col 301 | -- # dml to update array, match element in array, delete element in array 302 | -- # dml to update array, match element in array, update elements in array 303 | -- # dml to update array, match element in array, delete elements in array 304 | -- # dml to update array, match element in array, insert element into array 305 | -- # dml to update array, match element in array, insert elements into array -------------------------------------------------------------------------------- /dlp-demo/app/static/images/symbol-full-color.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /dataplex/profiling/main.tf: -------------------------------------------------------------------------------- 1 | # Variables 2 | variable "project_id" { 3 | description = "GCP Project ID" 4 | type = string 5 | } 6 | 7 | variable "region" { 8 | description = "GCP Region" 9 | type = string 10 | default = "us-central1" 11 | } 12 | 13 | variable "environment" { 14 | description = "Environment name (dev, staging, prod)" 15 | type = string 16 | default = "dev" 17 | } 18 | 19 | # Enable required APIs 20 | resource "google_project_service" "required_apis" { 21 | for_each = toset([ 22 | "dataplex.googleapis.com", 23 | "storage.googleapis.com", 24 | "bigquery.googleapis.com", 25 | "dataflow.googleapis.com", 26 | "cloudfunctions.googleapis.com", 27 | "cloudscheduler.googleapis.com", 28 | "pubsub.googleapis.com" 29 | ]) 30 | 31 | project = var.project_id 32 | service = each.value 33 | 34 | disable_dependent_services = false 35 | disable_on_destroy = false 36 | } 37 | 38 | # Cloud Storage Buckets for Data Zones 39 | resource "google_storage_bucket" "raw_zone" { 40 | name = "${var.project_id}-ecommerce-raw-${var.environment}" 41 | location = "US" # Multi-region for Dataplex compatibility 42 | project = var.project_id 43 | force_destroy = true 44 | 45 | uniform_bucket_level_access = true 46 | 47 | versioning { 48 | enabled = true 49 | } 50 | 51 | lifecycle_rule { 52 | condition { 53 | age = 90 54 | } 55 | action { 56 | type = "Delete" 57 | } 58 | } 59 | 60 | labels = { 61 | environment = var.environment 62 | zone = "raw" 63 | purpose = "data-ingestion" 64 | } 65 | 66 | depends_on = [google_project_service.required_apis] 67 | } 68 | 69 | resource "google_storage_bucket" "curated_zone" { 70 | name = "${var.project_id}-ecommerce-curated-${var.environment}" 71 | location = "US" # Multi-region for Dataplex compatibility 72 | project = var.project_id 73 | force_destroy = true 74 | 75 | uniform_bucket_level_access = true 76 | 77 | versioning { 78 | enabled = true 79 | } 80 | 81 | lifecycle_rule { 82 | condition { 83 | age = 365 84 | } 85 | action { 86 | type = "Delete" 87 | } 88 | } 89 | 90 | labels = { 91 | environment = var.environment 92 | zone = "curated" 93 | purpose = "processed-data" 94 | } 95 | 96 | depends_on = [google_project_service.required_apis] 97 | } 98 | 99 | resource "google_storage_bucket" "quarantine_zone" { 100 | name = "${var.project_id}-ecommerce-quarantine-${var.environment}" 101 | location = "US" # Multi-region for Dataplex compatibility 102 | project = var.project_id 103 | force_destroy = true 104 | 105 | uniform_bucket_level_access = true 106 | 107 | labels = { 108 | 
environment = var.environment 109 | zone = "quarantine" 110 | purpose = "failed-quality-checks" 111 | } 112 | 113 | depends_on = [google_project_service.required_apis] 114 | } 115 | 116 | # Create folder structure in raw zone bucket 117 | resource "google_storage_bucket_object" "raw_zone_folders" { 118 | for_each = toset([ 119 | "web-analytics/", 120 | "mobile-app/", 121 | "pos-systems/" 122 | ]) 123 | 124 | name = each.value 125 | bucket = google_storage_bucket.raw_zone.name 126 | source = "/dev/null" 127 | } 128 | 129 | # Dataplex Lake 130 | resource "google_dataplex_lake" "ecommerce_lake" { 131 | name = "ecommerce-data-lake-${var.environment}" 132 | location = var.region 133 | project = var.project_id 134 | display_name = "E-commerce Data Lake (${upper(var.environment)})" 135 | description = "Data lake for e-commerce customer analytics pipeline" 136 | 137 | labels = { 138 | environment = var.environment 139 | team = "data-engineering" 140 | } 141 | 142 | depends_on = [google_project_service.required_apis] 143 | } 144 | 145 | # Raw Data Zone 146 | resource "google_dataplex_zone" "raw_zone" { 147 | name = "raw-zone" 148 | location = var.region 149 | project = var.project_id 150 | lake = google_dataplex_lake.ecommerce_lake.name 151 | display_name = "Raw Data Zone" 152 | description = "Zone for ingested raw data from multiple sources" 153 | 154 | type = "RAW" 155 | 156 | discovery_spec { 157 | enabled = true 158 | schedule = "0 */4 * * *" # Every 4 hours 159 | 160 | include_patterns = [ 161 | "gs://${google_storage_bucket.raw_zone.name}/**" 162 | ] 163 | } 164 | 165 | resource_spec { 166 | location_type = "MULTI_REGION" 167 | } 168 | 169 | labels = { 170 | environment = var.environment 171 | data-tier = "raw" 172 | } 173 | } 174 | 175 | # Curated Data Zone 176 | resource "google_dataplex_zone" "curated_zone" { 177 | name = "curated-zone" 178 | location = var.region 179 | project = var.project_id 180 | lake = google_dataplex_lake.ecommerce_lake.name 181 | display_name = "Curated Data Zone" 182 | description = "Zone for processed, quality-assured data" 183 | 184 | type = "CURATED" 185 | 186 | discovery_spec { 187 | enabled = true 188 | schedule = "0 6 * * *" # Daily at 6 AM 189 | 190 | include_patterns = [ 191 | "gs://${google_storage_bucket.curated_zone.name}/**" 192 | ] 193 | } 194 | 195 | resource_spec { 196 | location_type = "MULTI_REGION" 197 | } 198 | 199 | labels = { 200 | environment = var.environment 201 | data-tier = "curated" 202 | } 203 | } 204 | 205 | # Raw Zone Asset (single asset for the entire raw bucket) 206 | resource "google_dataplex_asset" "raw_data" { 207 | name = "raw-data-asset" 208 | location = var.region 209 | project = var.project_id 210 | lake = google_dataplex_lake.ecommerce_lake.name 211 | dataplex_zone = google_dataplex_zone.raw_zone.name 212 | display_name = "Raw Data Storage" 213 | description = "Raw data from all sources: web analytics, mobile app, and POS systems" 214 | 215 | resource_spec { 216 | name = "projects/${var.project_id}/buckets/${google_storage_bucket.raw_zone.name}" 217 | type = "STORAGE_BUCKET" 218 | } 219 | 220 | discovery_spec { 221 | enabled = true 222 | schedule = "0 */2 * * *" # Every 2 hours 223 | 224 | # Include patterns to focus discovery on specific folders 225 | include_patterns = [ 226 | "gs://${google_storage_bucket.raw_zone.name}/web-analytics/**", 227 | "gs://${google_storage_bucket.raw_zone.name}/mobile-app/**", 228 | "gs://${google_storage_bucket.raw_zone.name}/pos-systems/**" 229 | ] 230 | 231 | # Exclude temporary or processing 
files 232 | exclude_patterns = [ 233 | "gs://${google_storage_bucket.raw_zone.name}/**/temp/**", 234 | "gs://${google_storage_bucket.raw_zone.name}/**/_processing/**" 235 | ] 236 | } 237 | 238 | labels = { 239 | zone = "raw" 240 | environment = var.environment 241 | sources = "web-mobile-pos" 242 | } 243 | } 244 | 245 | # Curated Zone Asset 246 | resource "google_dataplex_asset" "curated_data" { 247 | name = "curated-data-asset" 248 | location = var.region 249 | project = var.project_id 250 | lake = google_dataplex_lake.ecommerce_lake.name 251 | dataplex_zone = google_dataplex_zone.curated_zone.name 252 | display_name = "Curated Data Storage" 253 | description = "Processed and quality-assured data ready for analytics" 254 | 255 | resource_spec { 256 | name = "projects/${var.project_id}/buckets/${google_storage_bucket.curated_zone.name}" 257 | type = "STORAGE_BUCKET" 258 | } 259 | 260 | discovery_spec { 261 | enabled = true 262 | schedule = "0 6 * * *" # Daily at 6 AM 263 | } 264 | 265 | labels = { 266 | zone = "curated" 267 | environment = var.environment 268 | quality = "verified" 269 | } 270 | } 271 | 272 | # Data Quality Scan for automated profiling 273 | resource "google_dataplex_datascan" "data_quality_scan" { 274 | data_scan_id = "ecommerce-data-quality-scan" 275 | location = var.region 276 | project = var.project_id 277 | display_name = "E-commerce Data Quality Scan" 278 | description = "Automated data quality scanning and profiling" 279 | 280 | data { 281 | resource = google_dataplex_asset.raw_data.name 282 | } 283 | 284 | execution_spec { 285 | trigger { 286 | schedule { 287 | cron = "0 */4 * * *" # Every 4 hours 288 | } 289 | } 290 | } 291 | 292 | # Data profiling configuration 293 | data_profile_spec { 294 | sampling_percent = 100.0 295 | 296 | # Include specific fields for profiling 297 | include_fields { 298 | field_names = ["*"] # Profile all fields 299 | } 300 | } 301 | 302 | labels = { 303 | environment = var.environment 304 | scan-type = "data-profile" 305 | } 306 | } 307 | 308 | # Data Quality Rules Scan 309 | resource "google_dataplex_datascan" "data_quality_rules" { 310 | data_scan_id = "ecommerce-quality-rules-scan" 311 | location = var.region 312 | project = var.project_id 313 | display_name = "E-commerce Data Quality Rules" 314 | description = "Business rule validation for e-commerce data" 315 | 316 | data { 317 | resource = google_dataplex_asset.raw_data.name 318 | } 319 | 320 | execution_spec { 321 | trigger { 322 | schedule { 323 | cron = "0 */4 * * *" # Every 4 hours 324 | } 325 | } 326 | } 327 | 328 | # Data quality rules 329 | data_quality_spec { 330 | sampling_percent = 100.0 331 | 332 | # Rule: Check for null values in critical fields 333 | rules { 334 | column = "user_id" 335 | dimension = "COMPLETENESS" 336 | threshold = 0.7 # Allow 30% nulls (anonymous users) 337 | 338 | non_null_expectation {} 339 | } 340 | 341 | # Rule: Validate email format if present 342 | rules { 343 | column = "email" 344 | dimension = "VALIDITY" 345 | threshold = 0.95 346 | 347 | regex_expectation { 348 | regex = "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$" 349 | } 350 | } 351 | 352 | # Rule: Check timestamp format 353 | rules { 354 | column = "timestamp" 355 | dimension = "VALIDITY" 356 | threshold = 1.0 357 | 358 | non_null_expectation {} 359 | } 360 | 361 | # Rule: Validate transaction amounts are positive 362 | rules { 363 | column = "price" 364 | dimension = "VALIDITY" 365 | threshold = 0.99 366 | 367 | range_expectation { 368 | min_value = "0" 369 | max_value = "10000" 
370 | } 371 | } 372 | } 373 | 374 | labels = { 375 | environment = var.environment 376 | scan-type = "data-quality" 377 | } 378 | } 379 | 380 | # Service Account for Dataplex operations 381 | resource "google_service_account" "dataplex_sa" { 382 | account_id = "dataplex-service-${var.environment}" 383 | display_name = "Dataplex Service Account" 384 | description = "Service account for Dataplex operations and data profiling" 385 | project = var.project_id 386 | } 387 | 388 | # IAM bindings for Dataplex service account 389 | resource "google_project_iam_member" "dataplex_sa_roles" { 390 | for_each = toset([ 391 | "roles/dataplex.developer", 392 | "roles/dataplex.dataReader", 393 | "roles/dataplex.dataWriter", 394 | "roles/storage.objectViewer", 395 | "roles/storage.objectCreator", 396 | "roles/bigquery.dataEditor", 397 | "roles/bigquery.jobUser", 398 | "roles/dataproc.worker", 399 | "roles/dataproc.editor" 400 | ]) 401 | 402 | project = var.project_id 403 | role = each.value 404 | member = "serviceAccount:${google_service_account.dataplex_sa.email}" 405 | } 406 | 407 | # Pub/Sub topic for data quality notifications 408 | resource "google_pubsub_topic" "data_quality_alerts" { 409 | name = "data-quality-alerts-${var.environment}" 410 | project = var.project_id 411 | 412 | labels = { 413 | environment = var.environment 414 | purpose = "data-quality" 415 | } 416 | 417 | depends_on = [google_project_service.required_apis] 418 | } 419 | 420 | # BigQuery dataset for processed data 421 | resource "google_bigquery_dataset" "ecommerce_analytics" { 422 | dataset_id = "ecommerce_analytics_${var.environment}" 423 | project = var.project_id 424 | location = var.region 425 | description = "Analytics dataset for e-commerce customer data" 426 | 427 | labels = { 428 | environment = var.environment 429 | team = "analytics" 430 | } 431 | 432 | depends_on = [google_project_service.required_apis] 433 | } 434 | 435 | # Outputs 436 | output "lake_name" { 437 | description = "Name of the Dataplex lake" 438 | value = google_dataplex_lake.ecommerce_lake.name 439 | } 440 | 441 | output "raw_bucket_name" { 442 | description = "Name of the raw data bucket" 443 | value = google_storage_bucket.raw_zone.name 444 | } 445 | 446 | output "curated_bucket_name" { 447 | description = "Name of the curated data bucket" 448 | value = google_storage_bucket.curated_zone.name 449 | } 450 | 451 | output "quarantine_bucket_name" { 452 | description = "Name of the quarantine bucket" 453 | value = google_storage_bucket.quarantine_zone.name 454 | } 455 | 456 | output "dataplex_service_account" { 457 | description = "Email of the Dataplex service account" 458 | value = google_service_account.dataplex_sa.email 459 | } 460 | 461 | output "pubsub_topic" { 462 | description = "Pub/Sub topic for data quality alerts" 463 | value = google_pubsub_topic.data_quality_alerts.name 464 | } 465 | --------------------------------------------------------------------------------