├── .github
├── pull_request_template.md
└── workflows
│ ├── pull_request.yml
│ └── terraform_plan.yml
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── pipelines
├── .DS_Store
├── .gitignore
├── README.md
├── anomaly_detection
│ ├── Dockerfile
│ ├── LICENSE
│ ├── MANIFEST.in
│ ├── README.md
│ ├── anomaly_detection_pipeline
│ │ ├── __init__.py
│ │ ├── options.py
│ │ └── pipeline.py
│ ├── cloudbuild.yaml
│ ├── main.py
│ ├── requirements-dev.txt
│ ├── requirements.txt
│ └── setup.py
├── cdp
│ ├── Dockerfile
│ ├── LICENSE
│ ├── MANIFEST.in
│ ├── README.md
│ ├── cdp_pipeline
│ │ ├── __init__.py
│ │ ├── customer_data_platform.py
│ │ ├── generate_transaction_data.py
│ │ └── options.py
│ ├── cloudbuild.yaml
│ ├── input_data
│ │ ├── coupon_redempt.csv
│ │ └── transaction_data.csv
│ ├── main.py
│ ├── requirements.txt
│ ├── schema
│ │ └── unified_table.json
│ ├── scripts
│ │ ├── 01_cloudbuild_and_push_container.sh
│ │ ├── 02_run_dataflow_job.sh
│ │ └── run.sh
│ └── setup.py
├── clickstream_analytics_java
│ ├── .gitattributes
│ ├── .gitignore
│ ├── README.md
│ ├── build.gradle
│ ├── gradle
│ │ └── wrapper
│ │ │ ├── gradle-wrapper.jar
│ │ │ └── gradle-wrapper.properties
│ ├── gradlew
│ ├── gradlew.bat
│ ├── scripts
│ │ └── 01_launch_pipeline.sh
│ └── src
│ │ └── main
│ │ ├── java
│ │ └── com
│ │ │ └── google
│ │ │ └── cloud
│ │ │ └── dataflow
│ │ │ └── solutions
│ │ │ └── clickstream_analytics
│ │ │ ├── BigTableEnrichment.java
│ │ │ ├── ClickstreamPubSubToBq.java
│ │ │ ├── JsonToTableRows.java
│ │ │ └── Metrics.java
│ │ └── resources
│ │ └── streaming_source_deadletter_table_schema.json
├── etl_integration_java
│ ├── .gitignore
│ ├── README.md
│ ├── build.gradle
│ ├── gradle
│ │ └── wrapper
│ │ │ ├── gradle-wrapper.jar
│ │ │ └── gradle-wrapper.properties
│ ├── gradlew
│ ├── gradlew.bat
│ ├── imgs
│ │ └── etl_integration.png
│ ├── scripts
│ │ ├── .gitignore
│ │ ├── 02_run_publisher_dataflow.sh
│ │ └── 03_run_changestream_template.sh
│ └── src
│ │ └── main
│ │ └── java
│ │ └── com
│ │ └── google
│ │ └── cloud
│ │ └── dataflow
│ │ └── solutions
│ │ ├── ETLIntegration.java
│ │ ├── data
│ │ ├── SchemaUtils.java
│ │ └── TaxiObjects.java
│ │ ├── load
│ │ └── Spanner.java
│ │ ├── options
│ │ └── SpannerPublisherOptions.java
│ │ └── transform
│ │ ├── RowToError.java
│ │ └── TaxiEventProcessor.java
├── imgs
│ ├── anomaly_detect_arch.png
│ ├── cdp.png
│ ├── iot_analytics.png
│ ├── log_replication.png
│ ├── market_intel.png
│ └── ml_ai_arch.png
├── iot_analytics
│ ├── Dockerfile
│ ├── LICENCE
│ ├── MANIFEST.in
│ ├── cloudbuild.yaml
│ ├── iot_analytics_pipeline
│ │ ├── __init__.py
│ │ ├── aggregate_metrics.py
│ │ ├── maintenance_model.pkl
│ │ ├── options.py
│ │ ├── parse_timestamp.py
│ │ ├── pipeline.py
│ │ └── trigger_inference.py
│ ├── main.py
│ ├── maintenance_model.pkl
│ ├── readme.md
│ ├── requirements.txt
│ ├── scripts
│ │ ├── 01_cloud_build_and_push.sh
│ │ ├── 02_submit_job.sh
│ │ ├── create_and_populate_bigtable.py
│ │ ├── create_data.py
│ │ ├── maintenance_data.jsonl
│ │ ├── model.py
│ │ ├── publish_on_pubsub.py
│ │ └── vehicle_data.jsonl
│ └── setup.py
├── log_replication_splunk
│ ├── README.md
│ └── scripts
│ │ ├── .gitignore
│ │ └── 01_launch_ps_to_splunk.sh
├── marketing_intelligence
│ ├── Dockerfile
│ ├── LICENSE
│ ├── MANIFEST.in
│ ├── README.md
│ ├── cloudbuild.yaml
│ ├── main.py
│ ├── marketing_intelligence_pipeline
│ │ ├── __init__.py
│ │ ├── options.py
│ │ └── pipeline.py
│ ├── requirements.txt
│ ├── scripts
│ │ ├── .gitignore
│ │ ├── 01_build_and_push_container.sh
│ │ └── 02_run_dataflow.sh
│ └── setup.py
├── ml_ai_python
│ ├── Dockerfile
│ ├── LICENSE
│ ├── MANIFEST.in
│ ├── README.md
│ ├── cloudbuild.yaml
│ ├── main.py
│ ├── ml_ai_pipeline
│ │ ├── __init__.py
│ │ ├── model_handlers.py
│ │ ├── options.py
│ │ └── pipeline.py
│ ├── requirements-dev.txt
│ ├── requirements.txt
│ ├── scripts
│ │ ├── .gitignore
│ │ ├── 01_build_and_push_container.sh
│ │ └── 02_run_dataflow.sh
│ └── setup.py
└── pylintrc
├── renovate.json
├── terraform
├── .gitignore
├── README.md
├── anomaly_detection
│ ├── README.md
│ ├── main.tf
│ └── variables.tf
├── cdp
│ ├── README.md
│ ├── main.tf
│ └── variables.tf
├── clickstream_analytics
│ ├── README.md
│ ├── main.tf
│ └── variables.tf
├── etl_integration
│ ├── README.md
│ ├── main.tf
│ └── variables.tf
├── iot_analytics
│ ├── README.md
│ ├── main.tf
│ └── variables.tf
├── log_replication_splunk
│ ├── README.md
│ ├── main.tf
│ └── variables.tf
├── marketing_intelligence
│ ├── README.md
│ ├── main.tf
│ └── variables.tf
└── ml_ai
│ ├── README.md
│ ├── main.tf
│ └── variables.tf
└── use_cases
├── Anomaly_Detection.md
├── CDP.md
├── Clickstream_Analytics.md
├── ETL_integration.md
├── GenAI_ML.md
├── IoT_Analytics.md
├── Log_replication.md
├── Marketing_Intelligence.md
├── guides
├── ads_analytics_dataflow_guide.pdf
├── anomaly_detection_dataflow_guide.pdf
├── cdp_dataflow_guide.pdf
├── clickstream_analytics_dataflow_guide.pdf
├── etl_dataflow_guide.pdf
├── gaming_analytics_dataflow_guide.pdf
├── genai_ml_dataflow_guide.pdf
├── iot_analytics_dataflow_guide.pdf
├── log_replication_dataflow_guide.pdf
└── market_intel_dataflow_guide.pdf
└── one_pagers
├── anomaly_detection_dataflow_onepager.pdf
├── clickstream_dataflow_onepager.pdf
├── etl_dataflow_onepager.pdf
├── genai_ml_dataflow_onepager.pdf
├── iot_analytics_dataflowonepager.pdf
├── log_replication_dataflow_onepager.pdf
└── market_intel_dataflow_onepager.pdf
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | Thanks for sending your first pull request. Please remove this text before submitting the pull request.
2 |
3 | Make sure that:
4 | * Read 📰
5 | * You have read the [CONTRIBUTING.md](https://github.com/GoogleCloudPlatform/dataflow-solution-guides/blob/main/CONTRIBUTING.md) file.
6 | * Run in the cloud ☁️
7 | * You have run all your code in Google Cloud and it is working (even if it is not complete yet)
8 | * Code style 🎩
9 | * You have run the code style and quality commands given in the CONTRIBUTING.md file, and your code passes those checks.
10 | * Using Python? 🔍
11 | * If you are submitting a Python pipeline, it needs to have a `setup.py` file in the top level directory of your pipeline.
12 | * Using Java? 🔍
13 | * If you are submitting a Java pipeline, please use Gradle with `spotless` and `errorprone`. Use the `etl_integration_java` pipeline as an example (the `CONTRIBUTING.md` file has the details of the files you can copy to use as a template for your pipeline build).
14 |
15 | Please remove all this text before sending your pull request.
16 |
17 | Thanks for your contribution!
18 |
--------------------------------------------------------------------------------
/.github/workflows/terraform_plan.yml:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | name: 'Terraform deploy'
16 | on:
17 | workflow_dispatch:
18 | inputs:
19 | prnumber:
20 | type: string
21 | description: PR number to build
22 | permissions:
23 | contents: read
24 | id-token: write
25 | jobs:
26 | terraform-plan:
27 | name: 'Terraform plan with Google Cloud'
28 | runs-on: ubuntu-latest
29 | concurrency:
30 | group: terraform-plan-group
31 | cancel-in-progress: true
32 | defaults:
33 | run:
34 | shell: bash
35 | steps:
36 | - name: Checkout
37 | uses: actions/checkout@v4
38 | with:
39 | ref: 'refs/pull/${{ github.event.inputs.prnumber }}/head'
40 | - uses: dorny/paths-filter@v3
41 | id: filter
42 | with:
43 | filters: "Terraform:\n - 'terraform/**' \n"
44 | - name: 'Google Cloud auth'
45 | uses: 'google-github-actions/auth@v2'
46 | with:
47 | project_id: '${{ secrets.TESTING_PROJECT }}'
48 | workload_identity_provider: '${{ secrets.WIF_PROVIDER }}'
49 | service_account: '${{ secrets.WIF_SERVICE_ACCOUNT }}'
50 | - name: Setup Terraform
51 | uses: hashicorp/setup-terraform@v3
52 | - name: Terraform Init
53 | working-directory: terraform
54 | run: |
55 | ls -d */ | while read d
56 | do
57 | echo "Running tf init in directory: $d"
58 | cd $d && terraform init && cd ..
59 | done
60 | - name: Terraform Plan
61 | working-directory: terraform
62 | run: |-
63 | ls -d */ | while read d
64 | do
65 | echo "Running tf plan in directory: $d"
66 | cd $d
67 | echo 'project_create = "false"' > terraform.tfvars
68 | echo 'project_id = "${{ secrets.TESTING_PROJECT }}"' >> terraform.tfvars
69 | echo 'region = "us-central1"' >> terraform.tfvars
70 | terraform plan -input=false
71 | cd ..
72 | done
73 | - name: Terraform Apply and destroy
74 | working-directory: terraform
75 | run: |-
76 | ls -d */ | while read d
77 | do
78 | echo "Running tf plan in directory: $d"
79 | cd $d
80 | echo 'project_create = "false"' > terraform.tfvars
81 | echo 'project_id = "${{ secrets.TESTING_PROJECT }}"' >> terraform.tfvars
82 | echo 'region = "us-central1"' >> terraform.tfvars
83 | terraform apply -input=false -auto-approve
84 | terraform destroy -input=false -auto-approve
85 | cd ..
86 | done
87 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement. You (or your employer) retain the copyright to your contribution;
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 |
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 |
18 | ## Run in Dataflow and Google Cloud
19 |
20 | Before submitting your contribution, make sure that all your code runs correctly
21 | in Google Cloud, including any Terraform code and any pipeline you write.
22 |
23 | ## Code Quality Checks
24 |
25 | ### For Python code
26 |
27 | You normally will write Python code in a subdirectory of the `pipelines` folder.
28 | Install `yapf` and run the following command in the top level directory of your
29 | pipeline, to reformat your code:
30 |
31 | ```shell
32 | yapf -i -r --style yapf .
33 | ```
34 |
35 | If you install `pylint`, you can check if your code will pass the build with the
36 | following command:
37 |
38 | ```shell
39 | pylint --rcfile ../pylintrc .
40 | ```
41 |
42 | Please note that the configuration file `../pylintrc` is located in the
43 | `pipelines` folder.
44 |
45 | ### For Java code
46 |
47 | Make sure you are using Gradle with the same settings as the existing pipelines
48 | (e.g. use `pipelines/etl_integration_java` as an example), and run the following
49 | command to make sure that your build passes:
50 |
51 | ```shell
52 | ./gradlew build
53 | ```
54 |
55 | If you find code style issues, run this command to fix them:
56 |
57 | ```shell
58 | ./gradlew spotlessApply
59 | ```
60 |
61 |
62 | You can use the following files to copy the Gradle settings to your pipeline:
63 | * `build.gradle`
64 | * `gradlew` and `gradlew.bat`
65 | * The directory `gradle` and all its contents.
66 |
67 | ### For Terraform code
68 |
69 | Run the following command in the top level directory where your Terraform code is located:
70 |
71 | ```shell
72 | terraform fmt
73 | ```
74 |
75 | You can also check for other types of issues with your Terraform code by using the
76 | `terraform validate` command (but bear in mind that you need to run `terraform init` first).
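
For example, assuming your Terraform code lives in a subdirectory of `terraform` (the directory name below is only a placeholder), a quick local check could look like this:

```shell
cd terraform/my_use_case
terraform init
terraform fmt
terraform validate
```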
77 |
78 | ## Code Reviews
79 |
80 | All submissions, including submissions by project members, require review. We
81 | use GitHub pull requests for this purpose. Consult
82 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
83 | information on using pull requests.
84 |
85 | ## Community Guidelines
86 |
87 | This project follows [Google's Open Source Community
88 | Guidelines](https://opensource.google/conduct/).
89 |
90 | ## Contributor Guide
91 |
92 | If you are new to contributing to open source, you can find helpful information in this contributor guide.
93 |
94 | You may follow these steps to contribute (see the example git commands after the list):
95 |
96 | 1. **Fork the official repository.** This will create a copy of the official repository in your own account.
97 | 2. **Sync the branches.** This will ensure that your copy of the repository is up-to-date with the latest changes from the official repository.
98 | 3. **Work on your forked repository's feature branch.** This is where you will make your changes to the code.
99 | 4. **Commit your updates on your forked repository's feature branch.** This will save your changes to your copy of the repository.
100 | 5. **Submit a pull request to the official repository's main branch.** This will request that your changes be merged into the official repository.
101 | 6. **Resolve any lint errors.** This will ensure that your changes are formatted correctly.
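
As a rough sketch, the git commands for this workflow typically look like the following (`<your-username>` and `my-feature` are placeholders):

```shell
# 1. Fork the repository on GitHub, then clone your fork.
git clone https://github.com/<your-username>/dataflow-solution-guides.git
cd dataflow-solution-guides

# 2. Keep your fork in sync with the official repository.
git remote add upstream https://github.com/GoogleCloudPlatform/dataflow-solution-guides.git
git fetch upstream
git merge upstream/main

# 3-4. Make and commit your changes on a feature branch.
git checkout -b my-feature
git commit -am "Describe your change"

# 5. Push the branch and open a pull request against the main branch.
git push origin my-feature
```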
102 |
103 | Here are some additional things to keep in mind during the process:
104 |
105 | - **Read [Google's Open Source Community Guidelines](https://opensource.google/conduct/).** These guidelines will provide you with more information about the project and how to contribute.
106 | - **Test your changes.** Before you submit a pull request, make sure that your changes work as expected.
107 | - **Be patient.** It may take some time for your pull request to be reviewed and merged.
108 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Dataflow Solution Guides
2 |
3 | [](https://github.com/GoogleCloudPlatform/dataflow-solution-guides/actions/workflows/pull_request.yml) [](LICENSE)
4 |
5 | Welcome to the Dataflow Solution Guides!
6 |
7 | The Dataflow Solution Guides offer full end-to-end deployment for the most
8 | common streaming solutions to run
9 | on [Dataflow](https://cloud.google.com/dataflow/).
10 |
11 | This repository contains the following assets for each guide:
12 |
13 | - Full Terraform code to spawn all the necessary Google Cloud infrastructure
14 | - Pipelines code in Python, Java and Go (coming soon) for a
15 | sample pipeline for each use case
16 |
17 | ## Solution guides
18 |
19 | This is the list of solution guides available at this moment:
20 |
21 | | Guide | Description | Development status |
22 | | :-------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------: | :-----------------------: |
23 | | [GenAI & Machine Learning Inference](./use_cases/GenAI_ML.md) | Real-time inference with local GenAI models, using a GPU | Ready :white_check_mark: |
24 | | [ETL / Integration](./use_cases/ETL_integration.md) | Real-time change data capture from a Spanner database to BigQuery | Ready :white_check_mark: |
25 | | [Log Replication & Analytics](./use_cases/Log_replication.md) | Real-time log replication into Splunk | Beta :factory: |
26 | | [Marketing Intelligence](./use_cases/Marketing_Intelligence.md) | Real-time marketing intelligence, using an AutoML model deployed in Vertex | Beta :factory: |
27 | | [Clickstream Analytics](./use_cases/Clickstream_Analytics.md) | Real-time clickstream analytics with Bigtable enrichment / data hydration | Work in progress :hammer: |
28 | | [IoT Analytics](./use_cases/IoT_Analytics.md) | Real-time Internet of Things (IoT) analytics with Bigtable enrichment & models deployed in Vertex AI | Work in progress :hammer: |
29 | | [Anomaly Detection](./use_cases/Anomaly_Detection.md) |Real-time detection of anomalies in a stream of data leveraging GenAI with models deployed in Vertex AI | Beta :factory: |
30 | | [Customer Data Platform](./use_cases/CDP.md) | Real-time customer data platform that unifies a customer view from different sources. | Beta :factory: |
31 | | [Gaming Analytics](./use_cases/gaming_analytics.md) | Real-time analysis of gaming data to enhance live gameplay & offer targeting | Beta :factory: |
32 |
33 |
34 |
35 | ## Repository structure
36 |
37 | - `terraform`: This directory contains the Terraform code for deploying the
38 | necessary Google Cloud
39 | infrastructure for each use case.
40 | - `pipelines`: This directory contains the Python, Java, and Go code for the
41 | sample pipelines.
42 | - `use_cases`: This directory contains the documentation of each use case.
43 |
44 | ## Getting help
45 |
46 | - GitHub Issues: Report any issues or ask questions on the GitHub repository.
47 | - https://github.com/GoogleCloudPlatform/dataflow-solution-guides/issues
48 | - Stack Overflow: Search for existing solutions or ask questions on Stack
49 | Overflow using the `google-cloud-dataflow` tag:
50 | - https://stackoverflow.com/questions/tagged/google-cloud-dataflow
51 |
52 | ## Contributing
53 |
54 | Your contributions to this repository are welcome.
55 |
56 | - Fork and Pull Request: Fork the repository and submit a pull request with your
57 | changes.
58 | - Follow the Contribution Guidelines: Please follow the contribution guidelines
59 | outlined in the
60 | [CONTRIBUTING.md](CONTRIBUTING.md) file.
61 |
62 | ## Disclaimer
63 |
64 | This is not an officially supported Google product. The code in this repository
65 | is for demonstrative purposes only.
66 |
--------------------------------------------------------------------------------
/pipelines/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/.DS_Store
--------------------------------------------------------------------------------
/pipelines/.gitignore:
--------------------------------------------------------------------------------
1 | # Gemma model
2 | ./gemma_2b
3 |
4 | # IDEs
5 | .vscode/
6 | .idea/
7 |
8 | # Byte-compiled / optimized / DLL files
9 | __pycache__/
10 | *.py[cod]
11 | *$py.class
12 |
13 | # C extensions
14 | *.so
15 |
16 | # Distribution / packaging
17 | .Python
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | lib/
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | wheels/
30 | share/python-wheels/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 | MANIFEST
35 |
36 | # Virtual environments.
37 | .env
38 | .venv
39 | env/
40 | venv/
41 | ENV/
42 | env.bak/
43 | venv.bak/
44 |
--------------------------------------------------------------------------------
/pipelines/README.md:
--------------------------------------------------------------------------------
1 | # Pipelines
2 |
3 | This directory contains sample pipelines for the solution guides. These
4 | pipelines demonstrate how to use Dataflow to process streaming data for
5 | each of the use cases.
6 |
7 | The pipelines are written in Python, Java, and Go (coming soon).
8 | Each pipeline includes a README file that provides a detailed
9 | description of the pipeline, including its purpose, inputs,
10 | outputs, and configuration options.
11 |
12 |
13 | ## Getting Started
14 |
15 | To get started with the pipelines, follow these steps:
16 |
17 | 1. Choose the pipeline that best suits your needs.
18 | 2. Read the README file for the pipeline to understand its purpose, inputs,
19 | outputs,
20 | and configuration options. Make sure that you have the necessary
21 | infrastructure ready, using the
22 | corresponding deployment scripts in the `terraform` directory.
23 | 3. Modify the pipeline code to meet your specific requirements.
24 | 4. Run the pipeline using the provided scripts.
25 |
26 | ## Pipelines
27 |
28 | These are the pipelines included in this directory:
29 |
30 | | Use case | Programming language | Location |
31 | | :--------------------: | :------------------: | :---------------------------------------------------------: |
32 | | ML & GenAI | Python | [ml_ai_python](./ml_ai_python) |
33 | | ETL & Integration | Java | [etl_integration_java](./etl_integration_java) |
34 | | Customer Data Platform | Python | [cdp](./cdp) |
35 | | Anomaly detection | Python | [anomaly_detection](./anomaly_detection) |
36 | | Marketing Intelligence | Python | [marketing_intelligence](./marketing_intelligence/) |
37 | | Log replication | Dataflow template | [log_replication_splunk](./log_replication_splunk/) |
38 | | Clickstream Analytics | Java | [clickstream_analytics_java](./clickstream_analytics_java/) |
39 |
--------------------------------------------------------------------------------
/pipelines/anomaly_detection/Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | ARG SERVING_BUILD_IMAGE=tensorflow/tensorflow:2.18.0-gpu
16 | FROM ${SERVING_BUILD_IMAGE}
17 | WORKDIR /workspace
18 |
19 | RUN apt-get update -y && apt-get install -y \
20 | cmake
21 |
22 | COPY requirements.txt requirements.txt
23 | COPY main.py main.py
24 | COPY anomaly_detection_pipeline anomaly_detection_pipeline
25 | COPY MANIFEST.in MANIFEST.in
26 | COPY setup.py setup.py
27 |
28 | RUN pip install --upgrade --no-cache-dir pip \
29 | && pip install --no-cache-dir -r requirements.txt \
30 | && pip install --no-cache-dir -e .
31 |
32 | # Copy files from official SDK image, including script/dependencies.
33 | COPY --from=apache/beam_python3.11_sdk:2.63.0 /opt/apache/beam /opt/apache/beam
34 |
35 |
36 | ENV KERAS_BACKEND="tensorflow"
37 |
38 | # Set the entrypoint to Apache Beam SDK launcher.
39 | ENTRYPOINT ["/opt/apache/beam/boot"]
--------------------------------------------------------------------------------
/pipelines/anomaly_detection/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
--------------------------------------------------------------------------------
/pipelines/anomaly_detection/README.md:
--------------------------------------------------------------------------------
1 | # Anomaly Detection sample pipeline (Python)
2 | This sample pipeline demonstrates how to use Dataflow to process data, and detect anomalies
3 | using GenAI.
4 | This pipeline is written in Python.
5 |
6 | This pipeline is part of the [Dataflow Anomaly Detection solution guide](../../use_cases/Anomaly_Detection.md).
7 |
8 | ## Architecture
9 |
10 | The generic architecture for an anomaly detection pipeline looks as follows:
11 |
12 | 
13 |
14 | In this directory, you will find a specific implementation of the above architecture, with the
15 | following stages:
16 |
17 | 1. **Data ingestion:** Reads data from a Pub/Sub topic.
18 | 2. **Data preprocessing:** The sample pipeline does not do any transformation, but it is trivial
19 | to add a preprocessing step leveraging
20 | [the Enrichment transform](https://cloud.google.com/dataflow/docs/guides/enrichment) to perform
21 | feature engineering before calling the model.
22 | 3. **Inference:** Uses the RunInference transform with a model handler, using Keras and Tensorflow, to call the fraud detection model. The pipeline uses a GPU with the Dataflow worker, to speed up the inference.
23 | 4. **Detections:** The detections are sent to another Pub/Sub topic as output.
24 |
25 |
26 | ## Selecting the cloud region
27 |
28 | Not all the resources may be available in all the regions. The default values included in this
29 | directory have been tested using `us-central1` as region.
30 |
31 | The file `cloudbuild.yaml` uses `E2_HIGHCPU_8` as the default machine type for the Cloud Build job. If
32 | that type is not available in your preferred region, try one of the other machine types available
33 | in Cloud Build:
34 | * https://cloud.google.com/build/docs/api/reference/rest/v1/projects.builds#machinetype
35 |
36 | Moreover, the file `scripts/00_set_environment.sh` specifies a machine type for the Dataflow workers.
37 | The selected machine type, `g2-standard-4`, is the recommended one for inference with GPU. If that
38 | type is not available in your region, you can check what machines are available to use with the
39 | following command:
40 |
41 | ```sh
42 | gcloud compute machine-types list --zones=,,...
43 | ```
44 |
45 | See more info about selecting the right type of machine in the following link:
46 | * https://cloud.google.com/compute/docs/machine-resource
47 |
48 | ## How to launch the pipeline
49 |
50 | All the scripts are located in the `scripts` directory and prepared to be launched from the top
51 | sources directory.
52 |
53 | In the script `scripts/00_set_environment.sh`, define the value of the project id and the region variable:
54 |
55 | ```
56 | export PROJECT=
57 | export REGION=
58 | ```
59 |
60 | Leave the rest of the variables untouched, although you can override them if you prefer.
61 |
62 | After you edit the script, load those variables into the environment:
63 |
64 | ```sh
65 | source scripts/00_set_environment.sh
66 | ```
67 |
68 | And then run the script that builds and publishes the custom Dataflow container. This container will
69 | contain all the required dependencies.
70 |
71 | ```sh
72 | ./scripts/01_build_and_push_container.sh
73 | ```
74 |
75 | This will create a Cloud Build job that can take a few minutes to complete. Once it completes, you
76 | can trigger the pipeline with the following:
77 |
78 | ```sh
79 | ./scripts/02_run_dataflow.sh
80 | ```
81 |
82 | ## Input data
83 |
84 | To send data into the pipeline, you need to publish messages in the `messages` topic. Those
85 | messages are passed "as is" to the model.
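
As an illustration, you could publish a quick test message with the `gcloud` CLI (the payload below is arbitrary; use whatever input your model expects):

```sh
gcloud pubsub topics publish messages --message='{"transaction": "sample payload"}'
```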
86 |
87 | ## Output data
88 |
89 | The predictions are published into the topic `detections`, and can be observed using the
90 | subscription `detections-sub`.
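
For example, you can pull a few of the published predictions with the `gcloud` CLI:

```sh
gcloud pubsub subscriptions pull detections-sub --auto-ack --limit=5
```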
--------------------------------------------------------------------------------
/pipelines/anomaly_detection/anomaly_detection_pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/pipelines/anomaly_detection/anomaly_detection_pipeline/options.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Options class for the Anomaly Detection pipeline.
16 | """
17 |
18 | from argparse import ArgumentParser
19 |
20 | from apache_beam.options.pipeline_options import PipelineOptions
21 |
22 |
23 | class MyPipelineOptions(PipelineOptions):
24 |
25 | @classmethod
26 | def _add_argparse_args(cls, parser: ArgumentParser):
27 | parser.add_argument("--messages_subscription", type=str)
28 | parser.add_argument("--model_endpoint", type=str)
29 | parser.add_argument("--project", type=str)
30 | parser.add_argument("--location", type=str)
31 | parser.add_argument("--responses_topic", type=str)
32 |
--------------------------------------------------------------------------------
/pipelines/anomaly_detection/anomaly_detection_pipeline/pipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Anomaly Detection Apache Beam pipeline.
16 | """
17 |
18 | from apache_beam import Pipeline, PCollection
19 | from apache_beam.ml.inference import RunInference
20 | from apache_beam.io.gcp import pubsub
21 |
22 | import apache_beam as beam
23 | from apache_beam.ml.inference.base import PredictionResult
24 | from apache_beam.ml.inference.vertex_ai_inference import VertexAIModelHandlerJSON
25 |
26 | from .options import MyPipelineOptions
27 |
28 |
29 | def _format_output(element: PredictionResult) -> str:
30 | return f"Input: \n{element.example}, \n\n\nOutput: \n{element.inference}"
31 |
32 |
33 | @beam.ptransform_fn
34 | def _extract(p: Pipeline, subscription: str) -> PCollection[str]:
35 | msgs: PCollection[bytes] = p | "Read subscription" >> beam.io.ReadFromPubSub(
36 | subscription=subscription)
37 | return msgs | "Parse" >> beam.Map(lambda x: x.decode("utf-8"))
38 |
39 |
40 | @beam.ptransform_fn
41 | def _transform(msgs: PCollection[str], model_endpoint: str, project: str,
42 | location: str) -> PCollection[str]:
43 | model_handler = VertexAIModelHandlerJSON(
44 | endpoint_id=model_endpoint, project=project, location=location)
45 | preds: PCollection[
46 | PredictionResult] = msgs | "RunInference-vertexai" >> RunInference(
47 | model_handler)
48 | return preds | "Format Output" >> beam.Map(_format_output)
49 |
50 |
51 | def create_pipeline(options: MyPipelineOptions) -> Pipeline:
52 | """ Create the pipeline object.
53 |
54 | Args:
55 | options: The pipeline options, with type `MyPipelineOptions`.
56 |
57 | Returns:
58 | The pipeline object.
59 | """
60 | pipeline = beam.Pipeline(options=options)
61 | # Extract
62 | transactions: PCollection[str] = pipeline | "Read" >> _extract(
63 | subscription=options.messages_subscription)
64 | # Transform
65 | responses: PCollection[str] = transactions | "Transform" >> _transform(
66 | model_endpoint=options.model_endpoint,
67 | project=options.project,
68 | location=options.location)
69 | # Load
70 | responses | "Publish Result" >> pubsub.WriteStringsToPubSub(
71 | topic=options.responses_topic)
72 |
73 | return pipeline
74 |
--------------------------------------------------------------------------------
/pipelines/anomaly_detection/cloudbuild.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | steps:
16 | - name: 'gcr.io/cloud-builders/docker'
17 | script: |
18 | docker build -t ${_TAG} .
19 | substitutions:
20 | _TAG: unset
21 | options:
22 | substitutionOption: 'ALLOW_LOOSE'
23 | automapSubstitutions: true
24 | machineType: E2_HIGHCPU_8
25 | images:
26 | - ${_TAG}
--------------------------------------------------------------------------------
/pipelines/anomaly_detection/main.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | An Anomaly Detection example for the Dataflow Solution Guides.
17 | """
18 |
19 | import time
20 |
21 | from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions
22 |
23 | from anomaly_detection_pipeline.options import MyPipelineOptions
24 | from anomaly_detection_pipeline.pipeline import create_pipeline
25 |
26 |
27 | def main(options: MyPipelineOptions):
28 | pipeline = create_pipeline(options)
29 | pipeline.run()
30 |
31 |
32 | if __name__ == "__main__":
33 | pipeline_options: PipelineOptions = PipelineOptions()
34 | dataflow_options: GoogleCloudOptions = pipeline_options.view_as(
35 | GoogleCloudOptions)
36 | now_epoch_ms = int(time.time() * 1000)
37 | dataflow_options.job_name = f"anomaly-detection-pipeline-{now_epoch_ms}"
38 | custom_options: MyPipelineOptions = pipeline_options.view_as(
39 | MyPipelineOptions)
40 | main(custom_options)
41 |
--------------------------------------------------------------------------------
/pipelines/anomaly_detection/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | tensorflow==2.18.0
--------------------------------------------------------------------------------
/pipelines/anomaly_detection/requirements.txt:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | apache-beam[gcp]==2.63.0
16 | keras_nlp==0.19.2
17 | keras==3.9.0
18 | protobuf==4.25.6
--------------------------------------------------------------------------------
/pipelines/anomaly_detection/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Setup file for the Anomaly Detection pipeline.
16 | """
17 |
18 | from setuptools import setup, find_packages
19 |
20 | with open("requirements.txt", encoding="utf-8") as f:
21 | requirements = f.readlines()
22 |
23 | setup(
24 | name="Dataflow Solution for Anomaly Detection pipelines",
25 | version="0.1",
26 | description="Anomaly Detection example for the Dataflow Solution Guides.",
27 | packages=find_packages(),
28 | install_requires=requirements,
29 | )
30 |
--------------------------------------------------------------------------------
/pipelines/cdp/Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | FROM apache/beam_python3.11_sdk:2.63.0
16 | WORKDIR /workspace
17 |
18 | RUN apt-get update -y && apt-get install -y \
19 | cmake
20 |
21 | COPY requirements.txt requirements.txt
22 | COPY main.py main.py
23 | COPY cdp_pipeline cdp_pipeline
24 | COPY schema schema
25 | COPY MANIFEST.in MANIFEST.in
26 | COPY setup.py setup.py
27 |
28 | RUN pip install --upgrade --no-cache-dir pip \
29 | && pip install --no-cache-dir -r requirements.txt \
30 | && pip install --no-cache-dir -e .
31 |
32 | # Copy files from official SDK image, including script/dependencies.
33 | COPY --from=apache/beam_python3.11_sdk:2.63.0 /opt/apache/beam /opt/apache/beam
34 |
35 | # Set the entrypoint to Apache Beam SDK launcher.
36 | ENTRYPOINT ["/opt/apache/beam/boot"]
--------------------------------------------------------------------------------
/pipelines/cdp/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
--------------------------------------------------------------------------------
/pipelines/cdp/README.md:
--------------------------------------------------------------------------------
1 | # Customer Data Platform sample pipeline (Python)
2 |
3 | This sample pipeline demonstrates how to use Dataflow to process streaming data in order to build a Customer Data Platform. It reads data from multiple streaming sources (two Pub/Sub topics in this sample pipeline), joins the data, and writes it to a BigQuery table for later analytics.
4 |
5 | This pipeline is part of the [Dataflow Customer Data Platform solution guide](../../use_cases/CDP.md).
6 |
7 | ## Architecture
8 |
9 | The generic architecture for a customer data platform pipeline looks as follows:
10 |
11 | 
12 |
13 | In this directory, you will find a specific implementation of the above architecture, with the
14 | following stages:
15 |
16 | 1. **Data ingestion:** Reads data from two Pub/Sub topics.
17 | 2. **Data preprocessing:** The sample pipeline joins the data from the two Pub/Sub topics based on key fields. This showcases the unification of customer data from different sources into one place.
18 | 3. **Output Data:** The final processed data is then appended to the BigQuery table.
19 |
20 | ## Selecting the cloud region
21 |
22 | Not all the resources may be available in all the regions. The default values included in this
23 | directory have been tested using `us-central1` as region.
24 |
25 | Moreover, the file `scripts/00_set_variables.sh` specifies a machine type for the Dataflow workers.
26 | The selected machine type, `e2-standard-8`, is the one that we used for unification of data. If that
27 | type is not available in your region, you can check what machines are available to use with the
28 | following command:
29 |
30 | ```sh
31 | gcloud compute machine-types list --zones=,,...
32 | ```
33 |
34 | See more info about selecting the right type of machine in the following link:
35 | * https://cloud.google.com/compute/docs/machine-resource
36 |
37 | ## How to launch the pipeline
38 |
39 | All the scripts are located in the `scripts` directory and prepared to be launched from the top
40 | sources directory.
41 |
42 | In the script `scripts/00_set_variables.sh`, define the values of the project ID and region variables:
43 |
44 | ```
45 | export PROJECT=
46 | export REGION=
47 | ```
48 |
49 | Leave the rest of the variables untouched, although you can override them if you prefer.
50 |
51 | After you edit the script, load those variables into the environment:
52 |
53 | ```sh
54 | source scripts/00_set_variables.sh
55 | ```
56 |
57 | And then run the script that builds and publishes the custom Dataflow container. This container will
58 | contain all the required dependencies.
59 |
60 | ```sh
61 | ./scripts/01_cloudbuild_and_push_container.sh
62 | ```
63 |
64 | This will create a Cloud Build job that can take a few minutes to complete. Once it completes, you
65 | can trigger the pipeline with the following:
66 |
67 | ```sh
68 | ./scripts/02_run_dataflow_job.sh
69 | ```
70 | You can also directly run the script below instead of the three steps above.
71 |
72 | ```sh
73 | ./scripts/run.sh
74 | ```
75 |
76 | ## Input data
77 |
78 | To send data into the pipeline, you need to publish messages to the `transactions` and `coupon-redemption` topics.
79 | Run the Python script below to publish data to these Pub/Sub topics. The script reads sample data from GCS buckets and publishes it to the Pub/Sub topics, to create a real-time streaming environment for this use case. You can update the GCS bucket location to match your environment. For reference, the input files are included in the `./input_data/` folder.
80 |
81 | ```sh
82 | python3 ./cdp_pipeline/generate_transaction_data.py
83 | ```
84 |
85 | ## Output data
86 |
87 | The unified data from the two Pub/Sub topics is written to the BigQuery table `output_dataset.unified-table`.
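
For example, you can inspect the unified rows with the `bq` CLI (adjust the dataset and table names if you changed them in the Terraform configuration):

```sh
bq query --use_legacy_sql=false 'SELECT * FROM `output_dataset.unified-table` LIMIT 10'
```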
--------------------------------------------------------------------------------
/pipelines/cdp/cdp_pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/pipelines/cdp/cdp_pipeline/generate_transaction_data.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | A data generator for the Customer Data Platform analytics pipeline.
16 | """
17 |
18 | from google.cloud import pubsub_v1
19 | import json
20 | import pandas as pd
21 | import asyncio
22 |
23 |
24 | async def publish_coupons_to_pubsub():
25 | bucket_name = ""
26 | project_id = ""
27 |
28 | # Example: ["27601281299","27757099033","28235291311","27021203242","27101290145","27853175697"]
29 | transactions_id = [
30 | ""
31 | ]
32 | transactions_topic_name = "transactions"
33 | # Reference example - "dataflow-solution-guide-cdp/input_data/transaction_data.csv"
34 | transactions_data = ""
35 |
36 | coupons_topic_name = "coupon_redemption"
37 | # reference example - "dataflow-solution-guide-cdp/input_data/coupon_redempt.csv"
38 | coupons_data = ""
39 |
40 | transactions_df = pd.read_csv(
41 | f"gs://{bucket_name}/{transactions_data}", dtype=str)
42 | coupons_df = pd.read_csv(f"gs://{bucket_name}/{coupons_data}", dtype=str)
43 | publisher = pubsub_v1.PublisherClient()
44 |
45 | transactions_topic_path = publisher.topic_path(project_id,
46 | transactions_topic_name)
47 | coupons_topic_path = publisher.topic_path(project_id, coupons_topic_name)
48 | filtered_trans_df = transactions_df[transactions_df["transaction_id"].isin(
49 | transactions_id)]
50 | filtered_coupons_df = coupons_df[coupons_df["transaction_id"].isin(
51 | transactions_id)]
52 | await asyncio.gather(
53 | publish_coupons(filtered_coupons_df, publisher, coupons_topic_path),
54 | publish_transactions(filtered_trans_df, publisher,
55 | transactions_topic_path))
56 |
57 |
58 | async def publish_coupons(filtered_coupons_df, publisher, coupons_topic_path):
59 | for _, row in filtered_coupons_df.iterrows():
60 | coupon_message = json.dumps(row.to_dict()).encode("utf-8")
61 | print(coupon_message)
62 | future = publisher.publish(coupons_topic_path, coupon_message)
63 | print(f"Published coupon message ID: {future.result()}")
64 | await asyncio.sleep(3)
65 |
66 |
67 | async def publish_transactions(filtered_trans_df, publisher,
68 | transactions_topic_path):
69 | for _, row in filtered_trans_df.iterrows():
70 | transaction_message = json.dumps(row.to_dict()).encode("utf-8")
71 | print(transaction_message)
72 | future = publisher.publish(transactions_topic_path, transaction_message)
73 | print(f"Published transaction message ID: {future.result()}")
74 | await asyncio.sleep(1)
75 |
76 |
77 | if __name__ == "__main__":
78 | asyncio.run(publish_coupons_to_pubsub())
79 |
--------------------------------------------------------------------------------
/pipelines/cdp/cdp_pipeline/options.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Option class for Customer Data Platform pipeline.
16 | """
17 |
18 | from argparse import ArgumentParser
19 |
20 | from apache_beam.options.pipeline_options import PipelineOptions
21 |
22 |
23 | class MyPipelineOptions(PipelineOptions):
24 |
25 | @classmethod
26 | def _add_argparse_args(cls, parser: ArgumentParser):
27 | parser.add_argument("--transactions_topic", type=str)
28 | parser.add_argument("--coupons_redemption_topic", type=str)
29 | parser.add_argument("--project_id", type=str)
30 | parser.add_argument("--location", type=str)
31 | parser.add_argument("--output_dataset", type=str)
32 | parser.add_argument("--output_table", type=str)
33 |
--------------------------------------------------------------------------------
/pipelines/cdp/cloudbuild.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | steps:
16 | - name: 'gcr.io/cloud-builders/docker'
17 | script: |
18 | docker build -t ${_TAG} .
19 | substitutions:
20 | _TAG: unset
21 | options:
22 | substitutionOption: 'ALLOW_LOOSE'
23 | automapSubstitutions: true
24 | images:
25 | - ${_TAG}
--------------------------------------------------------------------------------
/pipelines/cdp/input_data/coupon_redempt.csv:
--------------------------------------------------------------------------------
1 | household_key,day,coupon_upc,campaign,transaction_id
2 | 1,421,10000085364,2200,27601281299
3 | 1,421,51700010076,2200,27601281299
4 | 13,609,10000089277,18,28571755990
--------------------------------------------------------------------------------
/pipelines/cdp/input_data/transaction_data.csv:
--------------------------------------------------------------------------------
1 | household_key,transaction_id,day,product_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2 | 1,27601281299,51,941769,1,3.99,436,0,1456,8,0,0
3 | 1,27601281299,51,910635,1,2.99,436,0,1456,8,0,0
--------------------------------------------------------------------------------
/pipelines/cdp/main.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Customer Data Platform analytics pipeline for the Dataflow Solution Guides.
16 | """
17 |
18 | import time
19 |
20 | from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions
21 |
22 | from cdp_pipeline.options import MyPipelineOptions
23 | from cdp_pipeline.customer_data_platform import create_and_run_pipeline
24 |
25 |
26 | def main(options: MyPipelineOptions):
27 | create_and_run_pipeline(options)
28 |
29 |
30 | if __name__ == "__main__":
31 | pipeline_options: PipelineOptions = PipelineOptions()
32 | dataflow_options: GoogleCloudOptions = pipeline_options.view_as(
33 | GoogleCloudOptions)
34 | now_epoch_ms = int(time.time() * 1000)
35 | dataflow_options.job_name = f"customer-data-platform-{now_epoch_ms}"
36 | custom_options: MyPipelineOptions = pipeline_options.view_as(
37 | MyPipelineOptions)
38 | main(custom_options)
39 |
--------------------------------------------------------------------------------
/pipelines/cdp/requirements.txt:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | apache-beam[gcp]==2.63.0 # Example, use your actual versions
16 | ## Below dependencies are required if you have to run script /cdp_pipeline/generate_transaction_data.py
17 | pandas
18 | fsspec
19 | gcsfs
20 |
--------------------------------------------------------------------------------
/pipelines/cdp/schema/unified_table.json:
--------------------------------------------------------------------------------
1 | {
2 | "fields": [
3 | {
4 | "name": "transaction_id",
5 | "type": "STRING",
6 | "mode": "REQUIRED"
7 | },
8 | {
9 | "name": "household_key",
10 | "type": "STRING",
11 | "mode": "NULLABLE"
12 | },
13 | {
14 | "name": "coupon_upc",
15 | "type": "STRING",
16 | "mode": "NULLABLE"
17 | },
18 | {
19 | "name": "product_id",
20 | "type": "STRING",
21 | "mode": "NULLABLE"
22 | },
23 | {
24 | "name": "coupon_discount",
25 | "type": "STRING",
26 | "mode": "NULLABLE"
27 | }
28 | ]
29 | }
--------------------------------------------------------------------------------
/pipelines/cdp/scripts/01_cloudbuild_and_push_container.sh:
--------------------------------------------------------------------------------
1 | gcloud builds submit \
2 | --region=$REGION \
3 | --default-buckets-behavior=regional-user-owned-bucket \
4 | --substitutions _TAG=$CONTAINER_URI \
5 | .
6 |
--------------------------------------------------------------------------------
/pipelines/cdp/scripts/02_run_dataflow_job.sh:
--------------------------------------------------------------------------------
1 | python3 -m main \
2 | --streaming \
3 | --runner=DataflowRunner \
4 | --project=$PROJECT \
5 | --temp_location=gs://$PROJECT/tmp \
6 | --region=$REGION \
7 | --save_main_session \
8 | --service_account_email=$SERVICE_ACCOUNT \
9 | --subnetwork=$SUBNETWORK \
10 | --sdk_container_image=$CONTAINER_URI \
11 | --max_workers=$MAX_DATAFLOW_WORKERS \
12 | --disk_size_gb=$DISK_SIZE_GB \
13 | --machine_type=$MACHINE_TYPE \
14 | --transactions_topic=$TRANSACTIONS_TOPIC \
15 | --coupons_redemption_topic=$COUPON_REDEMPTION_TOPIC \
16 | --output_dataset=$BQ_DATASET \
17 | --output_table=$BQ_UNIFIED_TABLE \
18 | --project_id=$PROJECT \
19 | --enable_streaming_engine
20 |
--------------------------------------------------------------------------------
/pipelines/cdp/scripts/run.sh:
--------------------------------------------------------------------------------
1 | source ./scripts/00_set_variables.sh
2 | sh ./scripts/01_cloudbuild_and_push_container.sh
3 | sh ./scripts/02_run_dataflow_job.sh
--------------------------------------------------------------------------------
/pipelines/cdp/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Setup file for Customer Data Platform analytics pipeline.
16 | """
17 |
18 | from setuptools import setup, find_packages
19 |
20 | with open("requirements.txt", encoding="utf-8") as f:
21 | requirements = f.readlines()
22 |
23 | setup(
24 | name="Dataflow Solution for Customer Data Platform",
25 | version="0.1",
26 | description="Customer Data Platform example for the Dataflow Solution Guides",
27 | packages=find_packages(),
28 | install_requires=requirements,
29 | )
30 |
--------------------------------------------------------------------------------
/pipelines/clickstream_analytics_java/.gitattributes:
--------------------------------------------------------------------------------
1 | #
2 | # https://help.github.com/articles/dealing-with-line-endings/
3 | #
4 | # Linux start script should use lf
5 | /gradlew text eol=lf
6 |
7 | # These are Windows script files and should use crlf
8 | *.bat text eol=crlf
9 |
10 |
--------------------------------------------------------------------------------
/pipelines/clickstream_analytics_java/.gitignore:
--------------------------------------------------------------------------------
1 | ### Java template
2 | # Compiled class file
3 | *.class
4 |
5 | # Log file
6 | *.log
7 |
8 | # BlueJ files
9 | *.ctxt
10 |
11 | # Mobile Tools for Java (J2ME)
12 | .mtj.tmp/
13 |
14 | # Package Files #
15 | *.jar
16 | *.war
17 | *.nar
18 | *.ear
19 | *.zip
20 | *.tar.gz
21 | *.rar
22 |
23 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
24 | hs_err_pid*
25 | replay_pid*
26 |
27 | ### Gradle template
28 | .gradle
29 | **/build/
30 | !src/**/build/
31 |
32 | # Ignore Gradle GUI config
33 | gradle-app.setting
34 |
35 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored)
36 | !gradle-wrapper.jar
37 |
38 | # Avoid ignoring Gradle wrapper properties
39 | !gradle-wrapper.properties
40 |
41 | # Cache of project
42 | .gradletasknamecache
43 |
44 | # Eclipse Gradle plugin generated files
45 | # Eclipse Core
46 | .project
47 | # JDT-specific (Eclipse Java Development Tools)
48 | .classpath
49 |
50 | # Sources generated by VS Code
51 | bin
--------------------------------------------------------------------------------
/pipelines/clickstream_analytics_java/README.md:
--------------------------------------------------------------------------------
1 | ## ClickStream Dataflow Code
2 |
3 | This Dataflow pipeline processes clickstream analytics data, using session windowing to group events into user sessions, and then writes the aggregated session data to BigQuery. The pipeline is written in Java and uses Pub/Sub as the input source.
4 |
5 | **This pipeline is still under development**.
6 |
7 | This pipeline is part of the [Dataflow Clickstream analytics solution guide](../../use_cases/Clickstream_Analytics.md).
8 |
9 | ## Pipeline Architecture
10 |
11 | 1. **Pub/Sub Subscription:** The pipeline reads clickstream events from a Pub/Sub subscription.
12 |
13 | 2. **Dataflow Pipeline:**
14 |
15 | - **Event Parsing:** Incoming Pub/Sub messages are parsed into structured clickstream event objects.
16 | - **Bigtable Enrichment (TODO):** Enrich session data with additional information from Bigtable (code implementation pending).
17 | - **Session Windowing (TODO):** Events are grouped into sessions using a session windowing strategy (e.g., 30-minute inactivity gap).
18 | - **BigQuery Write:** Aggregated session data is written to BigQuery tables.
19 | - **Dead-letter Queue:** Failed records are written to a BigQuery dead-letter table for further analysis and error handling.
20 |
21 | ## TODO
22 |
23 | The Bigtable enrichment and the session windowing analytics steps are not yet implemented; a sketch of the session windowing step is included at the end of this README.
24 |
25 | ## Pipeline Code
26 |
27 | - To build the project, run `./gradlew build`
28 |
29 | ## How to launch the pipelines
30 |
31 | All the scripts are located in the `scripts` directory and are meant to be launched from the top-level
32 | sources directory.
33 |
34 | The Terraform code generates a file with all the necessary variables in the location `./scripts/00_set_variables.sh`.
35 |
36 | Run the following command to apply that configuration:
37 |
38 | ```sh
39 | source scripts/00_set_variables.sh
40 | ```
41 |
42 | Then run the analytics pipeline. This pipeline will take data from the input
43 | topic and write it to BigQuery, enriching it with metadata available in Bigtable and applying session analytics.
44 |
45 | ```sh
46 | ./scripts/01_launch_pipeline.sh
47 | ```
48 |
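As a reference for the pending session windowing step, below is a minimal Beam sketch that groups per-user events into sessions with a 30-minute inactivity gap. The element type (`KV<String, TableRow>`, keyed by a user or session ID) and the gap duration are illustrative assumptions, not code from this pipeline.

```java
package com.google.cloud.dataflow.solutions.clickstream_analytics;

import com.google.api.services.bigquery.model.TableRow;
import org.apache.beam.sdk.transforms.windowing.Sessions;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;

/** Sketch of the pending session windowing step (illustrative only). */
class SessionWindowingSketch {
  // Groups per-user events into sessions that close after 30 minutes of
  // inactivity; downstream aggregations then run once per session.
  static PCollection<KV<String, TableRow>> sessionize(
      PCollection<KV<String, TableRow>> eventsByUser) {
    return eventsByUser.apply(
        "Sessionize",
        Window.<KV<String, TableRow>>into(
            Sessions.withGapDuration(Duration.standardMinutes(30))));
  }
}
```

Any aggregation applied after this transform (for example, a per-key combine) would then produce one result per user session.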
--------------------------------------------------------------------------------
/pipelines/clickstream_analytics_java/build.gradle:
--------------------------------------------------------------------------------
1 | // Copyright 2024 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
6 | // option. This file may not be copied, modified, or distributed
7 | // except according to those terms.
8 | plugins {
9 | id 'application'
10 | id "maven-publish"
11 | id "com.diffplug.spotless" version "7.0.2"
12 | id 'com.palantir.git-version' version '3.1.0'
13 | id 'net.ltgt.errorprone' version "4.1.0"
14 | }
15 | ext {
16 | packageName = "clickstream-analytics-java"
17 | javaPackagePath = "com.google.cloud.dataflow.solutions.clickstream_analytics"
18 | appName = "ClickStreamPubSubToBq"
19 | appVersion = "${gitVersion()}-SNAPSHOT"
20 | beamVersion = "2.63.0"
21 | slf4jVersion = "1.7.36"
22 | junitVersion = "4.13.2"
23 | hamcrestVersion = "3.0"
24 | googleJavaFormat = '1.24.0'
25 | errorProneCoreVersion = '2.26.1'
26 |
27 | }
28 | repositories {
29 | mavenCentral()
30 | maven { // Apache Snapshots repository
31 | url "https://repository.apache.org/content/repositories/snapshots/"
32 | }
33 | }
34 | application {
35 | mainClass = "${javaPackagePath}.ClickstreamPubSubToBq"
36 | version = appVersion
37 | }
38 | test {
39 | // JUnit 4.
40 | useJUnit()
41 | dependsOn cleanTest
42 | testLogging.showStandardStreams = true
43 | }
44 | compileJava {
45 | options.compilerArgs.addAll(['-Xlint:deprecation', '-Xlint:unchecked'])
46 | }
47 | run {
48 | if (project.hasProperty('args')) {
49 | args project.args.split('\\s')
50 | }
51 | }
52 | dependencies {
53 | // App dependencies.
54 | implementation "org.apache.beam:beam-sdks-java-core:${beamVersion}"
55 | runtimeOnly "org.apache.beam:beam-runners-direct-java:${beamVersion}"
56 | implementation "org.apache.beam:beam-runners-google-cloud-dataflow-java:${beamVersion}"
57 | implementation "org.apache.beam:beam-sdks-java-io-google-cloud-platform:${beamVersion}"
58 | // Tests dependencies.
59 | testImplementation "junit:junit:${junitVersion}"
60 | testImplementation "org.hamcrest:hamcrest:${hamcrestVersion}"
61 | testImplementation "org.apache.beam:beam-sdks-java-test-utils:${beamVersion}"
62 | implementation 'org.checkerframework:checker-qual:3.49.1'
63 | errorprone "com.google.errorprone:error_prone_core:${errorProneCoreVersion}"
64 |
65 | // Google Java format for Gradle
66 | implementation "com.google.googlejavaformat:google-java-format:${googleJavaFormat}"
67 | }
68 |
69 | // Package a self-contained jar file.
70 | jar {
71 | archiveBaseName = packageName
72 | destinationDirectory = file('build')
73 | manifest {
74 | attributes 'Main-Class': "${javaPackagePath}.ClickstreamPubSubToBq"
75 | }
76 | exclude 'META-INF/*.SF'
77 | exclude 'META-INF/*.DSA'
78 | exclude 'META-INF/*.RSA'
79 | duplicatesStrategy = DuplicatesStrategy.INCLUDE
80 | from {
81 | configurations.runtimeClasspath.collect { it.isDirectory() ? it : zipTree(it) }
82 | }
83 | zip64 true
84 | }
85 | spotless {
86 | format 'misc', {
87 | // define the files to apply `misc` to
88 | target '*.gradle', '*.md', '.gitignore'
89 | // define the steps to apply to those files
90 | trimTrailingWhitespace()
91 | leadingTabsToSpaces(2)
92 | endWithNewline()
93 | }
94 | java {
95 | target project.fileTree(project.rootDir) {
96 | include '**/*.java'
97 | exclude 'build/*'
98 | }
99 | // apply a specific flavor of google-java-format
100 | googleJavaFormat("${googleJavaFormat}").aosp().reflowLongStrings()
101 | // fix formatting of type annotations
102 | formatAnnotations()
103 | // make sure every file has the following copyright header.
104 | licenseHeader '''/*
105 | * Copyright $YEAR Google.
106 | *
107 | * Licensed under the Apache License, Version 2.0 (the "License");
108 | * you may not use this file except in compliance with the License.
109 | * You may obtain a copy of the License at
110 | *
111 | * http://www.apache.org/licenses/LICENSE-2.0
112 | *
113 | * Unless required by applicable law or agreed to in writing, software
114 | * distributed under the License is distributed on an "AS IS" BASIS,
115 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
116 | * See the License for the specific language governing permissions and
117 | * limitations under the License.
118 | */
119 | '''
120 | }
121 | }
122 |
--------------------------------------------------------------------------------
/pipelines/clickstream_analytics_java/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/clickstream_analytics_java/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/pipelines/clickstream_analytics_java/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-8.13-bin.zip
4 | networkTimeout=10000
5 | validateDistributionUrl=true
6 | zipStoreBase=GRADLE_USER_HOME
7 | zipStorePath=wrapper/dists
8 |
--------------------------------------------------------------------------------
/pipelines/clickstream_analytics_java/gradlew.bat:
--------------------------------------------------------------------------------
1 | @rem
2 | @rem Copyright 2015 the original author or authors.
3 | @rem
4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
5 | @rem you may not use this file except in compliance with the License.
6 | @rem You may obtain a copy of the License at
7 | @rem
8 | @rem https://www.apache.org/licenses/LICENSE-2.0
9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 | @rem SPDX-License-Identifier: Apache-2.0
17 | @rem
18 |
19 | @if "%DEBUG%"=="" @echo off
20 | @rem ##########################################################################
21 | @rem
22 | @rem Gradle startup script for Windows
23 | @rem
24 | @rem ##########################################################################
25 |
26 | @rem Set local scope for the variables with windows NT shell
27 | if "%OS%"=="Windows_NT" setlocal
28 |
29 | set DIRNAME=%~dp0
30 | if "%DIRNAME%"=="" set DIRNAME=.
31 | @rem This is normally unused
32 | set APP_BASE_NAME=%~n0
33 | set APP_HOME=%DIRNAME%
34 |
35 | @rem Resolve any "." and ".." in APP_HOME to make it shorter.
36 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
37 |
38 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
39 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
40 |
41 | @rem Find java.exe
42 | if defined JAVA_HOME goto findJavaFromJavaHome
43 |
44 | set JAVA_EXE=java.exe
45 | %JAVA_EXE% -version >NUL 2>&1
46 | if %ERRORLEVEL% equ 0 goto execute
47 |
48 | echo. 1>&2
49 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2
50 | echo. 1>&2
51 | echo Please set the JAVA_HOME variable in your environment to match the 1>&2
52 | echo location of your Java installation. 1>&2
53 |
54 | goto fail
55 |
56 | :findJavaFromJavaHome
57 | set JAVA_HOME=%JAVA_HOME:"=%
58 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
59 |
60 | if exist "%JAVA_EXE%" goto execute
61 |
62 | echo. 1>&2
63 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2
64 | echo. 1>&2
65 | echo Please set the JAVA_HOME variable in your environment to match the 1>&2
66 | echo location of your Java installation. 1>&2
67 |
68 | goto fail
69 |
70 | :execute
71 | @rem Setup the command line
72 |
73 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
74 |
75 |
76 | @rem Execute Gradle
77 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
78 |
79 | :end
80 | @rem End local scope for the variables with windows NT shell
81 | if %ERRORLEVEL% equ 0 goto mainEnd
82 |
83 | :fail
84 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
85 | rem the _cmd.exe /c_ return code!
86 | set EXIT_CODE=%ERRORLEVEL%
87 | if %EXIT_CODE% equ 0 set EXIT_CODE=1
88 | if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE%
89 | exit /b %EXIT_CODE%
90 |
91 | :mainEnd
92 | if "%OS%"=="Windows_NT" endlocal
93 |
94 | :omega
95 |
--------------------------------------------------------------------------------
/pipelines/clickstream_analytics_java/scripts/01_launch_pipeline.sh:
--------------------------------------------------------------------------------
1 | ./gradlew run -Pargs="
2 | --runner=DataflowRunner \
3 | --region=$REGION \
4 | --project=$PROJECT \
5 | --gcpTempLocation=$TEMP_LOCATION \
6 | --bqProjectId=$PROJECT \
7 | --bqDataset=$BQ_DATASET \
8 | --bqTable=$BQ_TABLE \
9 | --pubsubSubscription=$SUBSCRIPTION \
10 | --btInstance=$BIGTABLE_INSTANCE \
11 | --btTable=$BIGTABLE_TABLE \
12 | --outputDeadletterTable=$BQ_DEADLETTER_TABLE \
13 | --btLookupKey=$BT_LOOKUP_KEY"
14 |
--------------------------------------------------------------------------------
/pipelines/clickstream_analytics_java/src/main/java/com/google/cloud/dataflow/solutions/clickstream_analytics/BigTableEnrichment.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2024 Google.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.google.cloud.dataflow.solutions.clickstream_analytics;
17 |
18 | public class BigTableEnrichment {
19 |
20 | /*** TODO ***/
21 |
22 | }
23 |
--------------------------------------------------------------------------------
/pipelines/clickstream_analytics_java/src/main/java/com/google/cloud/dataflow/solutions/clickstream_analytics/JsonToTableRows.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2024 Google.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.google.cloud.dataflow.solutions.clickstream_analytics;
17 |
18 | import com.google.api.services.bigquery.model.TableRow;
19 | import java.io.ByteArrayInputStream;
20 | import java.io.IOException;
21 | import java.io.InputStream;
22 | import java.nio.charset.StandardCharsets;
23 | import org.apache.beam.sdk.coders.Coder.Context;
24 | import org.apache.beam.sdk.io.gcp.bigquery.TableRowJsonCoder;
25 | import org.apache.beam.sdk.transforms.DoFn;
26 | import org.apache.beam.sdk.transforms.PTransform;
27 | import org.apache.beam.sdk.transforms.ParDo;
28 | import org.apache.beam.sdk.values.KV;
29 | import org.apache.beam.sdk.values.PCollection;
30 | import org.apache.beam.sdk.values.PCollectionTuple;
31 | import org.apache.beam.sdk.values.TupleTag;
32 | import org.apache.beam.sdk.values.TupleTagList;
33 | import org.slf4j.Logger;
34 | import org.slf4j.LoggerFactory;
35 |
36 | public class JsonToTableRows {
37 |
38 | private static int MESSAGE_LIMIT_SIZE = 10 * 1024 * 1024;
39 |
40 | public static PTransform<PCollection<String>, PCollectionTuple> run() {
41 | return new JsonToTableRows.JsonToTableRow();
42 | }
43 |
44 | static final TupleTag<TableRow> SUCCESS_TAG = new TupleTag<TableRow>() {};
45 | static final TupleTag<KV<String, String>> FAILURE_TAG = new TupleTag<KV<String, String>>() {};
46 |
47 | private static class JsonToTableRow extends PTransform<PCollection<String>, PCollectionTuple> {
48 |
49 | @Override
50 | public PCollectionTuple expand(PCollection<String> jsonStrings) {
51 | return jsonStrings.apply(
52 | ParDo.of(new ToJsonDoFn())
53 | .withOutputTags(SUCCESS_TAG, TupleTagList.of(FAILURE_TAG)));
54 | }
55 | }
56 |
57 | private static class ToJsonDoFn extends DoFn<String, TableRow> {
58 | public static final Logger LOG = LoggerFactory.getLogger(ToJsonDoFn.class);
59 |
60 | @ProcessElement
61 | public void processElement(ProcessContext context) {
62 | String jsonString = context.element();
63 |
64 | byte[] message_in_bytes = jsonString.getBytes(StandardCharsets.UTF_8);
65 |
66 | if (message_in_bytes.length >= JsonToTableRows.MESSAGE_LIMIT_SIZE) {
67 | LOG.error("Row is too big row, size {} bytes", message_in_bytes.length);
68 | Metrics.tooBigMessages.inc();
69 | context.output(FAILURE_TAG, KV.of("TooBigRow", jsonString));
70 | }
71 |
72 | TableRow row;
73 | try (InputStream inputStream = new ByteArrayInputStream(message_in_bytes)) {
74 | row = TableRowJsonCoder.of().decode(inputStream, Context.OUTER);
75 | Metrics.successfulMessages.inc();
76 | context.output(row);
77 |
78 | } catch (IOException e) {
79 | LOG.error(e.getMessage());
80 | Metrics.jsonParseErrorMessages.inc();
81 | context.output(FAILURE_TAG, KV.of("JsonParseError", jsonString));
82 | }
83 | }
84 | }
85 | }
86 |
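The transform above emits a `PCollectionTuple`; a caller splits it back into the success and failure outputs using the two tuple tags defined in this class. A minimal consumption sketch, assuming a `jsonPayloads` collection of raw Pub/Sub message bodies (this helper class is illustrative and not part of the repository):

```java
package com.google.cloud.dataflow.solutions.clickstream_analytics;

import com.google.api.services.bigquery.model.TableRow;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;

/** Illustrative consumer of JsonToTableRows, using the tags it defines. */
class JsonToTableRowsUsageSketch {
  // Splits successfully parsed rows from failed payloads.
  static PCollection<TableRow> parsedRows(PCollection<String> jsonPayloads) {
    PCollectionTuple results = jsonPayloads.apply("JSON to TableRows", JsonToTableRows.run());
    PCollection<KV<String, String>> failures = results.get(JsonToTableRows.FAILURE_TAG);
    // `failures` would typically be routed to the BigQuery dead-letter table.
    return results.get(JsonToTableRows.SUCCESS_TAG);
  }
}
```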
--------------------------------------------------------------------------------
/pipelines/clickstream_analytics_java/src/main/java/com/google/cloud/dataflow/solutions/clickstream_analytics/Metrics.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2024 Google.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.google.cloud.dataflow.solutions.clickstream_analytics;
17 |
18 | import org.apache.beam.sdk.metrics.Counter;
19 |
20 | public final class Metrics {
21 | public static Counter pubsubMessages = counter("pub-sub-messages");
22 | public static Counter successfulMessages = counter("successful-messages");
23 | public static Counter jsonParseErrorMessages = counter("json-parse-failed-messages");
24 | public static Counter tooBigMessages = counter("too-big-messages");
25 | public static Counter failedInsertMessages = counter("failed-insert-messages");
26 |
27 | static Counter counter(String name) {
28 | return org.apache.beam.sdk.metrics.Metrics.counter(Metrics.class, name);
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/pipelines/clickstream_analytics_java/src/main/resources/streaming_source_deadletter_table_schema.json:
--------------------------------------------------------------------------------
1 | {
2 | "fields": [
3 | {
4 | "name": "timestamp",
5 | "type": "TIMESTAMP",
6 | "mode": "REQUIRED"
7 | },
8 | {
9 | "name": "payloadString",
10 | "type": "STRING",
11 | "mode": "REQUIRED"
12 | },
13 | {
14 | "name": "payloadBytes",
15 | "type": "BYTES",
16 | "mode": "REQUIRED"
17 | },
18 | {
19 | "name": "attributes",
20 | "type": "RECORD",
21 | "mode": "REPEATED",
22 | "fields": [
23 | {
24 | "name": "key",
25 | "type": "STRING",
26 | "mode": "NULLABLE"
27 | },
28 | {
29 | "name": "value",
30 | "type": "STRING",
31 | "mode": "NULLABLE"
32 | }
33 | ]
34 | },
35 | {
36 | "name": "errorMessage",
37 | "type": "STRING",
38 | "mode": "NULLABLE"
39 | },
40 | {
41 | "name": "stacktrace",
42 | "type": "STRING",
43 | "mode": "NULLABLE"
44 | }
45 | ]
46 | }
--------------------------------------------------------------------------------
/pipelines/etl_integration_java/.gitignore:
--------------------------------------------------------------------------------
1 | ### Java template
2 | # Compiled class file
3 | *.class
4 |
5 | # Log file
6 | *.log
7 |
8 | # BlueJ files
9 | *.ctxt
10 |
11 | # Mobile Tools for Java (J2ME)
12 | .mtj.tmp/
13 |
14 | # Package Files #
15 | *.jar
16 | *.war
17 | *.nar
18 | *.ear
19 | *.zip
20 | *.tar.gz
21 | *.rar
22 |
23 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
24 | hs_err_pid*
25 | replay_pid*
26 |
27 | ### Gradle template
28 | .gradle
29 | **/build/
30 | !src/**/build/
31 |
32 | # Ignore Gradle GUI config
33 | gradle-app.setting
34 |
35 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored)
36 | !gradle-wrapper.jar
37 |
38 | # Avoid ignoring Gradle wrapper properties
39 | !gradle-wrapper.properties
40 |
41 | # Cache of project
42 | .gradletasknamecache
43 |
44 | # Eclipse Gradle plugin generated files
45 | # Eclipse Core
46 | .project
47 | # JDT-specific (Eclipse Java Development Tools)
48 | .classpath
49 |
50 | # Sources generated by VS Code
51 | bin
--------------------------------------------------------------------------------
/pipelines/etl_integration_java/README.md:
--------------------------------------------------------------------------------
1 | # ETL & integration sample pipeline (Java)
2 |
3 | This sample pipeline demonstrates how to use Dataflow and change streams to create and maintain constantly
4 | updated replicas of a transactional database. The pipeline is written in Java.
5 |
6 | This pipeline is part of the [Dataflow ETL & integration solution guide](../../use_cases/ETL_integration.md).
7 |
8 | ## Architecture
9 |
10 | The generic architecture for both pipelines looks like this:
11 |
12 | 
13 |
14 | There are two pipelines in this repository. The first pipeline reads from a Pub/Sub topic of public data, and writes
15 | to a Spanner database. This pipeline's purpose is to keep Spanner constantly updated. The data is written to an
16 | `events` table.
17 |
18 | The second pipeline reads from a Spanner change stream and replicates the `events` table in BigQuery. The table
19 | in BigQuery receives updates continuously and has the same data as the Spanner table, with minimal latency.
20 |
21 | The infrastructure required to launch the pipelines is deployed
22 | through [the accompanying Terraform scripts in this solution guide](../../terraform/etl_integration/README.md).
23 |
24 | ## How to launch the pipelines
25 |
26 | All the scripts are located in the `scripts` directory and are meant to be launched from the top-level
27 | sources directory.
28 |
29 | The Terraform code generates a file with all the necessary variables in the location `./scripts/01_set_variables.sh`.
30 | Run the following command to apply that configuration:
31 |
32 | ```sh
33 | source scripts/01_set_variables.sh
34 | ```
35 |
36 | Then run the publisher pipeline. This pipeline will take data from the input
37 | topic and write it to Spanner. This pipeline is meant only to put
38 | some data into the Spanner change streams for the sake of running this guide
39 | as an example; in a real setting, your data would land in Spanner by
40 | other means:
41 |
42 | ```sh
43 | ./scripts/02_run_publisher_dataflow.sh
44 | ```
45 |
46 | Once you have the publisher pipeline populating some data into Spanner, you
47 | can read from the change streams to replicate the database into BigQuery.
48 | For that, execute the following:
49 |
50 | ```sh
51 | ./scripts/03_run_changestream_template.sh
52 | ```
53 |
54 | ## Input data
55 |
56 | All the input data is taken by default from the following public Pub/Sub topic:
57 | * `projects/pubsub-public-data/topics/taxirides-realtime`
58 |
59 | So you don't need to send any data anywhere to run this guide as an example.
60 |
61 | ## Output data
62 |
63 | The BigQuery dataset (by default, `replica`) will contain a table (by default
64 | called `events`, mirroring the table of the same name in the `taxis` Spanner
65 | database) with the same contents as the Spanner table. This replication happens
66 | in real time with low latency, as new data lands in the Spanner table (or as any
67 | existing record is modified or deleted).
--------------------------------------------------------------------------------
/pipelines/etl_integration_java/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/etl_integration_java/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/pipelines/etl_integration_java/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-8.13-bin.zip
4 | networkTimeout=10000
5 | validateDistributionUrl=true
6 | zipStoreBase=GRADLE_USER_HOME
7 | zipStorePath=wrapper/dists
8 |
--------------------------------------------------------------------------------
/pipelines/etl_integration_java/gradlew.bat:
--------------------------------------------------------------------------------
1 | @rem
2 | @rem Copyright 2015 the original author or authors.
3 | @rem
4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
5 | @rem you may not use this file except in compliance with the License.
6 | @rem You may obtain a copy of the License at
7 | @rem
8 | @rem https://www.apache.org/licenses/LICENSE-2.0
9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 | @rem SPDX-License-Identifier: Apache-2.0
17 | @rem
18 |
19 | @if "%DEBUG%"=="" @echo off
20 | @rem ##########################################################################
21 | @rem
22 | @rem Gradle startup script for Windows
23 | @rem
24 | @rem ##########################################################################
25 |
26 | @rem Set local scope for the variables with windows NT shell
27 | if "%OS%"=="Windows_NT" setlocal
28 |
29 | set DIRNAME=%~dp0
30 | if "%DIRNAME%"=="" set DIRNAME=.
31 | @rem This is normally unused
32 | set APP_BASE_NAME=%~n0
33 | set APP_HOME=%DIRNAME%
34 |
35 | @rem Resolve any "." and ".." in APP_HOME to make it shorter.
36 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
37 |
38 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
39 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
40 |
41 | @rem Find java.exe
42 | if defined JAVA_HOME goto findJavaFromJavaHome
43 |
44 | set JAVA_EXE=java.exe
45 | %JAVA_EXE% -version >NUL 2>&1
46 | if %ERRORLEVEL% equ 0 goto execute
47 |
48 | echo. 1>&2
49 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2
50 | echo. 1>&2
51 | echo Please set the JAVA_HOME variable in your environment to match the 1>&2
52 | echo location of your Java installation. 1>&2
53 |
54 | goto fail
55 |
56 | :findJavaFromJavaHome
57 | set JAVA_HOME=%JAVA_HOME:"=%
58 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
59 |
60 | if exist "%JAVA_EXE%" goto execute
61 |
62 | echo. 1>&2
63 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2
64 | echo. 1>&2
65 | echo Please set the JAVA_HOME variable in your environment to match the 1>&2
66 | echo location of your Java installation. 1>&2
67 |
68 | goto fail
69 |
70 | :execute
71 | @rem Setup the command line
72 |
73 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
74 |
75 |
76 | @rem Execute Gradle
77 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
78 |
79 | :end
80 | @rem End local scope for the variables with windows NT shell
81 | if %ERRORLEVEL% equ 0 goto mainEnd
82 |
83 | :fail
84 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
85 | rem the _cmd.exe /c_ return code!
86 | set EXIT_CODE=%ERRORLEVEL%
87 | if %EXIT_CODE% equ 0 set EXIT_CODE=1
88 | if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE%
89 | exit /b %EXIT_CODE%
90 |
91 | :mainEnd
92 | if "%OS%"=="Windows_NT" endlocal
93 |
94 | :omega
95 |
--------------------------------------------------------------------------------
/pipelines/etl_integration_java/imgs/etl_integration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/etl_integration_java/imgs/etl_integration.png
--------------------------------------------------------------------------------
/pipelines/etl_integration_java/scripts/.gitignore:
--------------------------------------------------------------------------------
1 | 01_set_variables.sh
2 |
--------------------------------------------------------------------------------
/pipelines/etl_integration_java/scripts/02_run_publisher_dataflow.sh:
--------------------------------------------------------------------------------
1 | ./gradlew run -Pargs="
2 | --pipeline=PUBSUB_TO_SPANNER \
3 | --streaming \
4 | --enableStreamingEngine \
5 | --autoscalingAlgorithm=THROUGHPUT_BASED \
6 | --runner=DataflowRunner \
7 | --project=$PROJECT \
8 | --tempLocation=$TEMP_LOCATION \
9 | --region=$REGION \
10 | --serviceAccount=$SERVICE_ACCOUNT \
11 | --subnetwork=$NETWORK \
12 | --maxNumWorkers=$MAX_DATAFLOW_WORKERS \
13 | --experiments=enable_data_sampling;use_network_tags=ssh;dataflow \
14 | --usePublicIps=false \
15 | --pubsubTopic=$TOPIC \
16 | --spannerInstance=$SPANNER_INSTANCE \
17 | --spannerDatabase=$SPANNER_DATABASE \
18 | --spannerTable=$SPANNER_TABLE"
--------------------------------------------------------------------------------
/pipelines/etl_integration_java/scripts/03_run_changestream_template.sh:
--------------------------------------------------------------------------------
1 | gcloud dataflow flex-template run spanner-change-streams \
2 | --template-file-gcs-location=gs://dataflow-templates-$REGION/latest/flex/Spanner_Change_Streams_to_BigQuery \
3 | --project=$PROJECT \
4 | --region $REGION \
5 | --temp-location=$TEMP_LOCATION \
6 | --service-account-email=$SERVICE_ACCOUNT \
7 | --subnetwork=$NETWORK \
8 | --max-workers=$MAX_DATAFLOW_WORKERS \
9 | --worker-machine-type=$WORKER_TYPE \
10 | --disable-public-ips \
11 | --parameters \
12 | spannerInstanceId=$SPANNER_INSTANCE,\
13 | spannerDatabase=$SPANNER_DATABASE,\
14 | spannerMetadataInstanceId=$SPANNER_INSTANCE,\
15 | spannerMetadataDatabase=$SPANNER_METADATA_DB,\
16 | spannerChangeStreamName=$SPANNER_CHANGE_STREAM,\
17 | bigQueryDataset=$BIGQUERY_DATASET
--------------------------------------------------------------------------------
/pipelines/etl_integration_java/src/main/java/com/google/cloud/dataflow/solutions/ETLIntegration.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2024 Google LLC
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.google.cloud.dataflow.solutions;
18 |
19 | import com.google.cloud.dataflow.solutions.data.TaxiObjects;
20 | import com.google.cloud.dataflow.solutions.load.Spanner;
21 | import com.google.cloud.dataflow.solutions.options.SpannerPublisherOptions;
22 | import com.google.cloud.dataflow.solutions.transform.TaxiEventProcessor;
23 | import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
24 | import org.apache.beam.sdk.Pipeline;
25 | import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
26 | import org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage;
27 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
28 | import org.apache.beam.sdk.values.PCollection;
29 |
30 | public class ETLIntegration {
31 | public static void main(String[] args) {
32 | String jobName = "pubsub-to-spanner";
33 | SpannerPublisherOptions spannerPublisherOptions =
34 | PipelineOptionsFactory.fromArgs(args)
35 | .withoutStrictParsing()
36 | .as(SpannerPublisherOptions.class);
37 |
38 | Pipeline p = createPipeline(spannerPublisherOptions);
39 | p.getOptions().setJobName(jobName);
40 | p.run();
41 | }
42 |
43 | public static Pipeline createPipeline(SpannerPublisherOptions options) {
44 | String projectId = options.as(DataflowPipelineOptions.class).getProject();
45 |
46 | Pipeline p = Pipeline.create(options);
47 |
48 | PCollection<PubsubMessage> msgs =
49 | p.apply("Read topic", PubsubIO.readMessages().fromTopic(options.getPubsubTopic()));
50 |
51 | TaxiEventProcessor.ParsingOutput parsed =
52 | msgs.apply("Parse", TaxiEventProcessor.FromPubsubMessage.parse());
53 | PCollection<TaxiObjects.TaxiEvent> taxiEvents = parsed.getParsedData();
54 |
55 | taxiEvents.apply(
56 | "Write",
57 | Spanner.Writer.builder()
58 | .projectId(projectId)
59 | .instanceId(options.getSpannerInstance())
60 | .databaseId(options.getSpannerDatabase())
61 | .tableName(options.getSpannerTable())
62 | .build());
63 |
64 | return p;
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/pipelines/etl_integration_java/src/main/java/com/google/cloud/dataflow/solutions/data/SchemaUtils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2024 Google LLC
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.google.cloud.dataflow.solutions.data;
18 |
19 | import org.apache.beam.sdk.Pipeline;
20 | import org.apache.beam.sdk.schemas.NoSuchSchemaException;
21 | import org.apache.beam.sdk.schemas.Schema;
22 | import org.slf4j.Logger;
23 | import org.slf4j.LoggerFactory;
24 |
25 | public class SchemaUtils {
26 |
27 | private static final Logger LOG = LoggerFactory.getLogger(SchemaUtils.class);
28 |
29 | public static Schema getSchemaForType(Pipeline p, Class<?> classType) {
30 | Schema schema;
31 |
32 | try {
33 | schema = p.getSchemaRegistry().getSchema(classType);
34 | } catch (NoSuchSchemaException e) {
35 | LOG.error(e.getMessage());
36 | throw new IllegalArgumentException(
37 | String.format(
38 | "Could not find schema for %s",
39 | TaxiObjects.TaxiEvent.class.getCanonicalName()));
40 | }
41 |
42 | return schema;
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/pipelines/etl_integration_java/src/main/java/com/google/cloud/dataflow/solutions/data/TaxiObjects.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2024 Google LLC
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.google.cloud.dataflow.solutions.data;
18 |
19 | import com.google.auto.value.AutoValue;
20 | import org.apache.beam.sdk.schemas.AutoValueSchema;
21 | import org.apache.beam.sdk.schemas.annotations.DefaultSchema;
22 | import org.apache.beam.sdk.schemas.annotations.SchemaFieldName;
23 | import org.joda.time.Instant;
24 |
25 | public class TaxiObjects {
26 |
27 | /** Represents Taxi Ride Event */
28 | @DefaultSchema(AutoValueSchema.class)
29 | @AutoValue
30 | public abstract static class TaxiEvent {
31 |
32 | @SchemaFieldName("ride_id")
33 | public abstract String getRideId();
34 |
35 | @SchemaFieldName("point_idx")
36 | public abstract Integer getPointIdx();
37 |
38 | @SchemaFieldName("latitude")
39 | public abstract Double getLatitude();
40 |
41 | @SchemaFieldName("longitude")
42 | public abstract Double getLongitude();
43 |
44 | @SchemaFieldName("timestamp")
45 | public abstract String getTimeStamp();
46 |
47 | @SchemaFieldName("meter_reading")
48 | public abstract Double getMeterReading();
49 |
50 | @SchemaFieldName("meter_increment")
51 | public abstract Double getMeterIncrement();
52 |
53 | @SchemaFieldName("ride_status")
54 | public abstract String getRideStatus();
55 |
56 | @SchemaFieldName("passenger_count")
57 | public abstract Integer getPassengerCount();
58 |
59 | public static Builder builder() {
60 | return new AutoValue_TaxiObjects_TaxiEvent.Builder();
61 | }
62 |
63 | @AutoValue.Builder
64 | public abstract static class Builder {
65 | public abstract Builder setRideId(String value);
66 |
67 | public abstract Builder setPointIdx(Integer value);
68 |
69 | public abstract Builder setLatitude(Double latitude);
70 |
71 | public abstract Builder setLongitude(Double longitude);
72 |
73 | public abstract Builder setTimeStamp(String value);
74 |
75 | public abstract Builder setMeterReading(Double value);
76 |
77 | public abstract Builder setMeterIncrement(Double value);
78 |
79 | public abstract Builder setRideStatus(String value);
80 |
81 | public abstract Builder setPassengerCount(Integer value);
82 |
83 | public abstract TaxiEvent build();
84 | }
85 | }
86 |
87 | @AutoValue
88 | @DefaultSchema(AutoValueSchema.class)
89 | /* Represents a parsing error message event */
90 | public abstract static class ParsingError {
91 | // These field names are determined by the @SchemaFieldName annotations below
92 | @SchemaFieldName("input_data")
93 | public abstract String getInputData();
94 |
95 | @SchemaFieldName("error_message")
96 | public abstract String getErrorMessage();
97 |
98 | @SchemaFieldName("timestamp")
99 | public abstract Instant getTimestamp();
100 |
101 | public static Builder builder() {
102 | return new AutoValue_TaxiObjects_ParsingError.Builder();
103 | }
104 |
105 | @AutoValue.Builder
106 | public abstract static class Builder {
107 | public abstract Builder setInputData(String i);
108 |
109 | public abstract Builder setErrorMessage(String e);
110 |
111 | public abstract Builder setTimestamp(Instant t);
112 |
113 | public abstract ParsingError build();
114 | }
115 | }
116 | }
117 |
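For reference, the AutoValue builder defined above can be used as follows; all field values are illustrative, and this helper class is not part of the repository.

```java
import com.google.cloud.dataflow.solutions.data.TaxiObjects.TaxiEvent;

/** Illustrative construction of a TaxiEvent via its AutoValue builder. */
class TaxiEventBuilderSketch {
  // Field names mirror the taxirides-realtime JSON payload; values are made up.
  static TaxiEvent sample() {
    return TaxiEvent.builder()
        .setRideId("ride-0001")
        .setPointIdx(1)
        .setLatitude(40.7128)
        .setLongitude(-74.0060)
        .setTimeStamp("2024-01-01T00:00:00.000Z")
        .setMeterReading(12.5)
        .setMeterIncrement(0.025)
        .setRideStatus("enroute")
        .setPassengerCount(2)
        .build();
  }
}
```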
--------------------------------------------------------------------------------
/pipelines/etl_integration_java/src/main/java/com/google/cloud/dataflow/solutions/options/SpannerPublisherOptions.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2024 Google LLC
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.google.cloud.dataflow.solutions.options;
18 |
19 | import org.apache.beam.sdk.options.Description;
20 | import org.apache.beam.sdk.options.PipelineOptions;
21 | import org.apache.beam.sdk.options.Validation;
22 |
23 | public interface SpannerPublisherOptions extends PipelineOptions {
24 | @Validation.Required()
25 | @Description("Input topic with data to replicate in Spanner")
26 | void setPubsubTopic(String t);
27 |
28 | String getPubsubTopic();
29 |
30 | @Validation.Required()
31 | @Description("Spanner table to write the data to")
32 | void setSpannerTable(String t);
33 |
34 | String getSpannerTable();
35 |
36 | @Validation.Required()
37 | @Description("Spanner instance to write the data to")
38 | void setSpannerInstance(String s);
39 |
40 | String getSpannerInstance();
41 |
42 | @Validation.Required()
43 | @Description("Spanner database to write the data to")
44 | void setSpannerDatabase(String d);
45 |
46 | String getSpannerDatabase();
47 | }
48 |
--------------------------------------------------------------------------------
/pipelines/etl_integration_java/src/main/java/com/google/cloud/dataflow/solutions/transform/RowToError.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2024 Google LLC
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * https://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.google.cloud.dataflow.solutions.transform;
18 |
19 | import com.google.cloud.dataflow.solutions.data.SchemaUtils;
20 | import com.google.cloud.dataflow.solutions.data.TaxiObjects;
21 | import org.apache.beam.sdk.coders.SerializableCoder;
22 | import org.apache.beam.sdk.schemas.Schema;
23 | import org.apache.beam.sdk.schemas.transforms.Convert;
24 | import org.apache.beam.sdk.transforms.DoFn;
25 | import org.apache.beam.sdk.transforms.PTransform;
26 | import org.apache.beam.sdk.transforms.ParDo;
27 | import org.apache.beam.sdk.values.PCollection;
28 | import org.apache.beam.sdk.values.Row;
29 | import org.joda.time.Instant;
30 | import org.slf4j.Logger;
31 | import org.slf4j.LoggerFactory;
32 |
33 | class RowToError extends PTransform<PCollection<Row>, PCollection<TaxiObjects.ParsingError>> {
34 | public static final Logger LOG = LoggerFactory.getLogger(RowToError.class);
35 |
36 | @Override
37 | public PCollection<TaxiObjects.ParsingError> expand(PCollection<Row> errorRows) {
38 | // Create ErrorMessage events for incompatible schema (Failed records from JsonToRow)
39 | Schema errorMessageSchema =
40 | SchemaUtils.getSchemaForType(
41 | errorRows.getPipeline(), TaxiObjects.ParsingError.class);
42 |
43 | return errorRows
44 | .apply(
45 | "Error Message Events",
46 | ParDo.of(new GenerateJsonToRowErrorMsgDoFn(errorMessageSchema)))
47 | .setCoder(SerializableCoder.of(Row.class))
48 | .setRowSchema(errorMessageSchema)
49 | .apply("Error Messages to Row", Convert.fromRows(TaxiObjects.ParsingError.class));
50 | }
51 |
52 | private static class GenerateJsonToRowErrorMsgDoFn extends DoFn<Row, Row> {
53 | final Schema errorMessageSchema;
54 |
55 | public GenerateJsonToRowErrorMsgDoFn(Schema errorMessageSchema) {
56 | this.errorMessageSchema = errorMessageSchema;
57 | }
58 |
59 | @ProcessElement
60 | public void processElement(
61 | @FieldAccess("line") String inputData,
62 | @FieldAccess("err") String errorMessage,
63 | @Timestamp Instant timestamp,
64 | OutputReceiver<Row> out) {
65 |
66 | out.output(
67 | Row.withSchema(errorMessageSchema)
68 | .withFieldValue("input_data", inputData)
69 | .withFieldValue("error_message", errorMessage)
70 | .withFieldValue("timestamp", timestamp)
71 | .build());
72 | }
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/pipelines/imgs/anomaly_detect_arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/imgs/anomaly_detect_arch.png
--------------------------------------------------------------------------------
/pipelines/imgs/cdp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/imgs/cdp.png
--------------------------------------------------------------------------------
/pipelines/imgs/iot_analytics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/imgs/iot_analytics.png
--------------------------------------------------------------------------------
/pipelines/imgs/log_replication.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/imgs/log_replication.png
--------------------------------------------------------------------------------
/pipelines/imgs/market_intel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/imgs/market_intel.png
--------------------------------------------------------------------------------
/pipelines/imgs/ml_ai_arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/imgs/ml_ai_arch.png
--------------------------------------------------------------------------------
/pipelines/iot_analytics/Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | FROM apache/beam_python3.11_sdk:2.63.0
16 | WORKDIR /workspace
17 |
18 | RUN apt-get update -y && apt-get install -y \
19 | cmake
20 |
21 | COPY requirements.txt requirements.txt
22 | COPY main.py main.py
23 | COPY iot_analytics_pipeline iot_analytics_pipeline
24 | COPY maintenance_model.pkl maintenance_model.pkl
25 | COPY MANIFEST.in MANIFEST.in
26 | COPY setup.py setup.py
27 |
28 | RUN pip install --upgrade --no-cache-dir pip \
29 | && pip install --no-cache-dir -r requirements.txt \
30 | && pip install --no-cache-dir -e .
31 |
32 | # Copy files from official SDK image, including script/dependencies.
33 | COPY --from=apache/beam_python3.11_sdk:2.63.0 /opt/apache/beam /opt/apache/beam
34 |
35 | # Set the entrypoint to Apache Beam SDK launcher.
36 | ENTRYPOINT ["/opt/apache/beam/boot"]
--------------------------------------------------------------------------------
/pipelines/iot_analytics/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
--------------------------------------------------------------------------------
/pipelines/iot_analytics/cloudbuild.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | steps:
16 | - name: 'gcr.io/cloud-builders/docker'
17 | script: |
18 | docker build -t ${_TAG} .
19 | substitutions:
20 | _TAG: unset
21 | options:
22 | substitutionOption: 'ALLOW_LOOSE'
23 | automapSubstitutions: true
24 | images:
25 | - ${_TAG}
--------------------------------------------------------------------------------
/pipelines/iot_analytics/iot_analytics_pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/pipelines/iot_analytics/iot_analytics_pipeline/maintenance_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/iot_analytics/iot_analytics_pipeline/maintenance_model.pkl
--------------------------------------------------------------------------------
/pipelines/iot_analytics/iot_analytics_pipeline/options.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Options class for the IoT Analytics pipeline.
16 | """
17 |
18 | from argparse import ArgumentParser
19 | from apache_beam.options.pipeline_options import PipelineOptions
20 |
21 |
22 | class MyPipelineOptions(PipelineOptions):
23 | """
24 | Options class for the IoT Analytics pipeline.
25 | """
26 |
27 | @classmethod
28 | def _add_argparse_args(cls, parser: ArgumentParser):
29 | parser.add_argument(
30 | '--topic',
31 | dest='topic',
32 | help='Pub/Sub topic name: "projects/your_project_id/topics/topic_name"')
33 | parser.add_argument(
34 | '--project_id', dest='project', help='Your Google Cloud project ID')
35 | parser.add_argument(
36 | '--dataset', dest='dataset', help='Enter BigQuery Dataset Id')
37 | parser.add_argument('--table', dest='table', help='Enter BigQuery Table Id')
38 | parser.add_argument(
39 | '--bigtable_instance_id',
40 | dest='bigtable_instance_id',
41 | help='Enter BigTable Instance Id')
42 | parser.add_argument(
43 | '--bigtable_table_id',
44 | dest='bigtable_table_id',
45 | help='Enter BigTable Table Id')
46 | parser.add_argument(
47 | '--row_key', dest='row_key', help='Enter BigTable row key')
48 |
--------------------------------------------------------------------------------
/pipelines/iot_analytics/iot_analytics_pipeline/parse_timestamp.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Vehicle state event type and timestamp parsing for the IoT Analytics Dataflow Solution guide.
16 | """
17 | import typing
18 | import datetime
19 | from apache_beam.transforms.window import TimestampedValue
20 |
21 |
22 | class VehicleStateEvent(typing.NamedTuple):
23 | """
24 | Class to create VehicleState TimestampedValue
25 | """
26 | vehicle_id: str
27 | timestamp: datetime.datetime
28 | temperature: int
29 | rpm: int
30 | vibration: float
31 | fuel_level: int
32 | mileage: int
33 |
34 | @staticmethod
35 | def convert_json_to_vehicleobj(input_json):
36 | dt_object = datetime.datetime.strptime(input_json["timestamp"],
37 | "%Y-%m-%dT%H:%M:%SZ")
38 | event = VehicleStateEvent(
39 | vehicle_id=input_json["vehicle_id"],
40 | timestamp=dt_object,
41 | temperature=input_json["temperature"],
42 | rpm=input_json["rpm"],
43 | vibration=input_json["vibration"],
44 | fuel_level=input_json["fuel_level"],
45 | mileage=input_json["mileage"])
46 | return TimestampedValue(event, dt_object.timestamp())
47 |
--------------------------------------------------------------------------------
/pipelines/iot_analytics/iot_analytics_pipeline/pipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Pipeline of the IoT Analytics Dataflow Solution guide.
16 | """
17 | import apache_beam as beam
18 | from apache_beam import Pipeline
19 | from .options import MyPipelineOptions
20 | import json
21 | import pickle
22 | from .aggregate_metrics import AggregateMetrics
23 | from .parse_timestamp import VehicleStateEvent
24 | from .trigger_inference import RunInference
25 | from apache_beam.transforms.window import FixedWindows
26 | from apache_beam.transforms.trigger import AccumulationMode, AfterWatermark
27 | from typing import Any, Dict, Tuple
28 |
29 | from apache_beam.transforms.enrichment import Enrichment
30 | from apache_beam.transforms.enrichment_handlers.bigtable import BigTableEnrichmentHandler
31 |
32 |
33 | def custom_join(left: Dict[str, Any], right: Dict[str, Any]):
34 | enriched = {}
35 | enriched["vehicle_id"] = left["vehicle_id"]
36 | enriched["max_temperature"] = left["max_temperature"]
37 | enriched["max_vibration"] = left["max_vibration"]
38 | enriched["latest_timestamp"] = left["max_timestamp"]
39 | enriched["avg_mileage"] = left["avg_mileage"]
40 | enriched["last_service_date"] = right["maintenance"]["last_service_date"]
41 | enriched["maintenance_type"] = right["maintenance"]["maintenance_type"]
42 | enriched["model"] = right["maintenance"]["model"]
43 | return enriched
44 |
45 |
46 | with open("maintenance_model.pkl", "rb") as model_file:
47 |   sklearn_model_handler = pickle.load(model_file)  # the pre-trained scikit-learn model
48 |
49 |
50 | def create_pipeline(pipeline_options: MyPipelineOptions) -> Pipeline:
51 | """ Create the pipeline object.
52 |
53 | Args:
54 |         pipeline_options: The pipeline options, with type `MyPipelineOptions`.
55 |
56 | Returns:
57 | The pipeline object.
58 | """
59 |   # Bigtable enrichment handler used to join maintenance data with each vehicle record.
60 | bigtable_handler = BigTableEnrichmentHandler(
61 | project_id=pipeline_options.project,
62 | instance_id=pipeline_options.bigtable_instance_id,
63 | table_id=pipeline_options.bigtable_table_id,
64 | row_key=pipeline_options.row_key)
65 | bq_schema = "vehicle_id:STRING, \
66 | max_temperature:INTEGER, \
67 | max_vibration:FLOAT, \
68 | latest_timestamp:TIMESTAMP, \
69 | last_service_date:STRING, \
70 | maintenance_type:STRING, \
71 | model:STRING, \
72 | needs_maintenance:INTEGER"
73 |
74 | pipeline = beam.Pipeline(options=pipeline_options)
75 | enriched_data = pipeline \
76 | | "ReadFromPubSub" >> beam.io.ReadFromPubSub(topic=pipeline_options.topic) \
77 | | "Read JSON" >> beam.Map(json.loads) \
78 | | "Parse&EventTimestamp" >> beam.Map(
79 | VehicleStateEvent.convert_json_to_vehicleobj).with_output_types(
80 | VehicleStateEvent) \
81 | | "AddKeys" >> beam.WithKeys(lambda event: event.vehicle_id).with_output_types(
82 | Tuple[str, VehicleStateEvent]) \
83 | | "Window" >> beam.WindowInto(
84 | FixedWindows(60),
85 | trigger=AfterWatermark(),
86 | accumulation_mode=AccumulationMode.ACCUMULATING) \
87 | | "AggregateMetrics" >> beam.ParDo(AggregateMetrics()).with_output_types(
88 | VehicleStateEvent).with_input_types(Tuple[str, VehicleStateEvent]) \
89 | | "EnrichWithBigtable" >> Enrichment(
90 | bigtable_handler, join_fn=custom_join, timeout=10)
91 | predictions = enriched_data | "RunInference" >> beam.ParDo(
92 | RunInference(model=sklearn_model_handler))
93 | predictions | "WriteToBigQuery" >> beam.io.gcp.bigquery.WriteToBigQuery(
94 | method=beam.io.WriteToBigQuery.Method.STORAGE_WRITE_API,
95 | project=pipeline_options.project,
96 | dataset=pipeline_options.dataset,
97 | table=pipeline_options.table,
98 | schema=bq_schema,
99 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
100 | write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
101 | return pipeline
102 |
--------------------------------------------------------------------------------
/pipelines/iot_analytics/iot_analytics_pipeline/trigger_inference.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Custom inference DoFn for the IoT Analytics Dataflow Solution guide.
16 | """
17 | import apache_beam as beam
18 | import pandas as pd
19 |
20 |
21 | class RunInference(beam.DoFn):
22 |   """
23 |   A custom DoFn that uses a pre-trained model to predict whether a vehicle
24 |   needs maintenance.
25 |   """
26 |
27 |   def __init__(self, model):
28 |     super().__init__()
29 |     # The scikit-learn model passed in by the pipeline.
30 |     self.model = model
31 |
32 |   def process(self, element):
33 |     df = pd.DataFrame([element])
34 |     df["last_service_date"] = (
35 |         pd.to_datetime(df["last_service_date"]) -
36 |         pd.to_datetime(df["last_service_date"]).min()).dt.days
37 |     prediction = self.model.predict(
38 |         df[["max_temperature", "max_vibration", "last_service_date"]])
39 |     results = beam.Row(
40 |         vehicle_id=str(element["vehicle_id"]),
41 |         max_temperature=float(element["max_temperature"]),
42 |         max_vibration=float(element["max_vibration"]),
43 |         latest_timestamp=element["latest_timestamp"],
44 |         last_service_date=element["last_service_date"],
45 |         maintenance_type=element["maintenance_type"],
46 |         model=element["model"],
47 |         needs_maintenance=int(prediction[0]))
48 |     yield results._asdict()
49 |
--------------------------------------------------------------------------------
/pipelines/iot_analytics/main.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | An IoT Analytics example for the Dataflow Solution Guides.
16 | """
17 |
18 | import time
19 |
20 | from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions
21 |
22 | from iot_analytics_pipeline.options import MyPipelineOptions
23 | from iot_analytics_pipeline.pipeline import create_pipeline
24 |
25 |
26 | def main(options: MyPipelineOptions):
27 | pipeline = create_pipeline(options)
28 | pipeline.run()
29 |
30 |
31 | if __name__ == "__main__":
32 | pipeline_options: PipelineOptions = PipelineOptions()
33 | dataflow_options: GoogleCloudOptions = pipeline_options.view_as(
34 | GoogleCloudOptions)
35 | now_epoch_ms = int(time.time() * 1000)
36 | dataflow_options.job_name = f"iot-analytics-pipeline-{now_epoch_ms}"
37 | custom_options: MyPipelineOptions = pipeline_options.view_as(
38 | MyPipelineOptions)
39 | main(custom_options)
40 |
--------------------------------------------------------------------------------
/pipelines/iot_analytics/maintenance_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/iot_analytics/maintenance_model.pkl
--------------------------------------------------------------------------------
/pipelines/iot_analytics/requirements.txt:
--------------------------------------------------------------------------------
1 | apache-beam[gcp]==2.63.0
2 | pandas
3 | scikit-learn
4 | # The dependencies below are only needed to run the helper scripts in scripts/
5 | google-cloud-bigtable
6 | google-cloud-pubsub
7 | tabulate
--------------------------------------------------------------------------------
/pipelines/iot_analytics/scripts/01_cloud_build_and_push.sh:
--------------------------------------------------------------------------------
1 | gcloud builds submit \
2 | --region=$REGION \
3 | --default-buckets-behavior=regional-user-owned-bucket \
4 | --substitutions _TAG=$CONTAINER_URI \
5 | .
--------------------------------------------------------------------------------
/pipelines/iot_analytics/scripts/02_submit_job.sh:
--------------------------------------------------------------------------------
1 | python3 -m main \
2 | --streaming \
3 | --runner=DataflowRunner \
4 | --project=$PROJECT_ID \
5 | --temp_location=gs://$PROJECT_ID/tmp \
6 | --region=$REGION \
7 | --save_main_session \
8 | --service_account_email=$SERVICE_ACCOUNT \
9 | --subnetwork=$SUBNETWORK \
10 | --sdk_container_image=$CONTAINER_URI \
11 | --max_workers=$MAX_DATAFLOW_WORKERS \
12 | --topic=$TOPIC_ID \
13 | --dataset=$DATASET \
14 | --table=$TABLE \
15 | --bigtable_instance_id=$INSTANCE_ID \
16 | --bigtable_table_id=$BIGTABLE_TABLE_ID \
17 | --row_key=$ROW_KEY \
18 | --project_id=$PROJECT_ID \
19 | --enable_streaming_engine
--------------------------------------------------------------------------------
/pipelines/iot_analytics/scripts/create_and_populate_bigtable.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Creates and populates the Bigtable maintenance table for the IoT Analytics Dataflow Solution guide.
16 | """
17 |
18 | # Create a Bigtable table and populate it with vehicle maintenance data
19 | from google.cloud.bigtable import column_family
20 | from google.cloud.bigtable import row
21 | from google.cloud.bigtable import Client
22 | from datetime import datetime
23 | import os
24 | import json
25 |
26 | # Create Bigtable data (maintenance data) and load records
27 | current_directory = os.getcwd()
28 | PROJECT_ID = os.environ.get("PROJECT_ID")
29 | INSTANCE_ID = os.environ.get("BIGTABLE_INSTANCE_ID")
30 | TABLE_ID = os.environ.get("BIGTABLE_TABLE_ID")
31 | MAINTENANCE_DATA_PATH = os.environ.get("MAINTENANCE_DATA_PATH")
32 |
33 | # Create a Bigtable client
34 | client = Client(project=PROJECT_ID, admin=True)
35 | instance = client.instance(INSTANCE_ID)
36 |
37 | # Create a column family.
38 | column_family_id = "maintenance"
39 | max_versions_rule = column_family.MaxVersionsGCRule(2)
40 | column_families = {column_family_id: max_versions_rule}
41 |
42 | # Create a table.
43 | table = instance.table(TABLE_ID)
44 |
45 | # You need admin access to use `.exists()`. If you don't have the admin access, then
46 | # comment out the if-else block.
47 | if not table.exists():
48 | table.create(column_families=column_families)
49 | else:
50 | print(f"Table {TABLE_ID} already exists in {PROJECT_ID}:{INSTANCE_ID}")
51 |
52 | # Define column names for the table.
53 | vehicle_id = "vehicle_id"
54 | last_service_date = "last_service_date"
55 | maintenance_type = "maintenance_type"
56 | make = "make"
57 | model = "model"
58 |
59 | # Load the sample maintenance data
60 | maintenance_data = []
61 | try:
62 | with open(MAINTENANCE_DATA_PATH, "r", encoding="utf-8") as f:
63 | for line in f:
64 | try:
65 | data = json.loads(line)
66 | maintenance_data.append(data)
67 | except json.JSONDecodeError as e:
68 | print(f"Error decoding JSON from line: {line.strip()}")
69 | print(f"Error message: {e}")
70 | # Handle the error (e.g., log it, skip the line, or raise an exception)
71 |
72 | except FileNotFoundError:
73 | print(f"File not found: {MAINTENANCE_DATA_PATH}")
74 |
75 | # Populate Bigtable
76 | for record in maintenance_data:
77 | row_key = str(record[vehicle_id]).encode()
78 | row = table.direct_row(row_key)
79 | row.set_cell(
80 | column_family_id,
81 | vehicle_id.encode(),
82 | str(record[vehicle_id]),
83 | timestamp=datetime.utcnow())
84 | row.set_cell(
85 | column_family_id,
86 | last_service_date.encode(),
87 | str(record[last_service_date]),
88 | timestamp=datetime.utcnow())
89 | row.set_cell(
90 | column_family_id,
91 | maintenance_type.encode(),
92 | str(record[maintenance_type]),
93 | timestamp=datetime.utcnow())
94 | row.set_cell(
95 | column_family_id,
96 | make.encode(),
97 | str(record[make]),
98 | timestamp=datetime.utcnow())
99 | row.set_cell(
100 | column_family_id,
101 | model.encode(),
102 | str(record[model]),
103 | timestamp=datetime.utcnow())
104 | row.commit()
105 | print(f"Inserted row for key: {record[vehicle_id]}")
106 |
107 | print("Bigtable populated with sample maintenance information.")
108 |
--------------------------------------------------------------------------------
/pipelines/iot_analytics/scripts/create_data.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Generates sample vehicle and maintenance data for the IoT Analytics Dataflow Solution guide.
16 | """
17 |
18 | import random
19 | import datetime
20 | import pandas as pd
21 | import os
22 |
23 | # Get Env variables
24 | current_directory = os.getcwd()
25 | VEHICLE_DATA_PATH = os.environ.get("VEHICLE_DATA_PATH")
26 | MAINTENANCE_DATA_PATH = os.environ.get("MAINTENANCE_DATA_PATH")
27 |
28 |
29 | # Function to generate random vehicle data
30 | def generate_vehicle_data(vehicle_id):
31 | return {
32 | "vehicle_id": vehicle_id,
33 | "timestamp": datetime.datetime.now().isoformat(timespec="seconds") + "Z",
34 | "temperature": random.randint(65, 85),
35 | "rpm": random.randint(1500, 3500),
36 | "vibration": round(random.uniform(0.1, 0.5), 2),
37 | "fuel_level": random.randint(50, 90),
38 | "mileage": random.randint(40000, 60000)
39 | }
40 |
41 |
42 | # Function to generate random maintenance data
43 | def generate_maintenance_data(vehicle_id):
44 | return {
45 | "vehicle_id":
46 | vehicle_id,
47 | "last_service_date": (datetime.datetime.now() - datetime.timedelta(
48 | days=random.randint(30, 365))).strftime("%Y-%m-%d"),
49 | "maintenance_type":
50 | random.choice([
51 | "oil_change", "tire_rotation", "brake_check", "filter_replacement"
52 | ]),
53 | "make":
54 | "Ford",
55 | "model":
56 | "F-150"
57 | }
58 |
59 |
60 | # Generate 10 unique vehicle IDs
61 | vehicle_ids = [str(i) for i in range(1000, 1010)]
62 |
63 | # Create vehicle data and maintenance data lists
64 | vehicle_data = [generate_vehicle_data(vehicle_id) for vehicle_id in vehicle_ids]
65 | maintenance_data = [
66 | generate_maintenance_data(vehicle_id) for vehicle_id in vehicle_ids
67 | ]
68 |
69 | # Convert lists to Pandas DataFrames
70 | df_vehicle_data = pd.DataFrame(vehicle_data)
71 | df_maintenance_data = pd.DataFrame(maintenance_data)
72 |
73 | df_vehicle_data.to_json(VEHICLE_DATA_PATH, orient="records", lines=True)
74 | df_maintenance_data.to_json(MAINTENANCE_DATA_PATH, orient="records", lines=True)
75 |
--------------------------------------------------------------------------------
/pipelines/iot_analytics/scripts/maintenance_data.jsonl:
--------------------------------------------------------------------------------
1 | {"vehicle_id":"1000","last_service_date":"2024-12-15","maintenance_type":"oil_change","make":"Ford","model":"F-150"}
2 | {"vehicle_id":"1001","last_service_date":"2024-07-24","maintenance_type":"filter_replacement","make":"Ford","model":"F-150"}
3 | {"vehicle_id":"1002","last_service_date":"2024-02-06","maintenance_type":"tire_rotation","make":"Ford","model":"F-150"}
4 | {"vehicle_id":"1003","last_service_date":"2024-01-27","maintenance_type":"filter_replacement","make":"Ford","model":"F-150"}
5 | {"vehicle_id":"1004","last_service_date":"2024-10-06","maintenance_type":"filter_replacement","make":"Ford","model":"F-150"}
6 | {"vehicle_id":"1005","last_service_date":"2024-06-07","maintenance_type":"brake_check","make":"Ford","model":"F-150"}
7 | {"vehicle_id":"1006","last_service_date":"2024-03-11","maintenance_type":"brake_check","make":"Ford","model":"F-150"}
8 | {"vehicle_id":"1007","last_service_date":"2024-08-15","maintenance_type":"brake_check","make":"Ford","model":"F-150"}
9 | {"vehicle_id":"1008","last_service_date":"2024-07-29","maintenance_type":"tire_rotation","make":"Ford","model":"F-150"}
10 | {"vehicle_id":"1009","last_service_date":"2024-12-15","maintenance_type":"brake_check","make":"Ford","model":"F-150"}
11 |
--------------------------------------------------------------------------------
/pipelines/iot_analytics/scripts/model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Creates the maintenance prediction model for the IoT Analytics Dataflow Solution guide.
16 | """
17 |
18 | import pandas as pd
19 | import numpy as np
20 | from datetime import datetime, timedelta
21 | from sklearn.model_selection import train_test_split
22 | from sklearn.linear_model import LogisticRegression
23 | import pickle
24 |
25 |
26 | def create_sample_data(num_samples):
27 | data = {
28 | "vehicle_id": [],
29 | "max_temperature": [],
30 | "max_vibration": [],
31 | "last_service_date": [],
32 | "needs_maintenance": []
33 | }
34 |
35 | for i in range(num_samples):
36 | vehicle_id = str(1000 + i)
37 | max_temperature = np.random.randint(50, 100)
38 | max_vibration = np.random.uniform(0, 1)
39 | last_service_date = datetime.now() - timedelta(
40 | days=np.random.randint(0, 365))
41 | last_service_date_str = last_service_date.strftime("%Y-%m-%d")
42 |
43 | needs_maintenance = (max_temperature > 75) or (max_vibration > 0.5) or (
44 | last_service_date < datetime.now() - timedelta(days=180))
45 |
46 | data["vehicle_id"].append(vehicle_id)
47 | data["max_temperature"].append(max_temperature)
48 | data["max_vibration"].append(max_vibration)
49 | data["last_service_date"].append(last_service_date_str)
50 | data["needs_maintenance"].append(needs_maintenance)
51 |
52 | return pd.DataFrame(data)
53 |
54 |
55 | # Create a sample dataset with 100 samples
56 | df = create_sample_data(100)
57 | print(df.head(n=10).to_markdown())
58 |
59 | # Convert the last_service_date to a datetime object
60 | df["last_service_date"] = pd.to_datetime(df["last_service_date"])
61 |
62 | # Features and target variable
63 | X = df[["max_temperature", "max_vibration", "last_service_date"]].copy()
64 | y = df["needs_maintenance"].astype(int)
65 |
66 | # Convert last_service_date to numeric for modeling
67 | X["last_service_date"] = (X["last_service_date"] -
68 | X["last_service_date"].min()).dt.days
69 |
70 | # Split the dataset
71 | X_train, X_test, y_train, y_test = train_test_split(
72 | X, y, test_size=0.2, random_state=42)
73 |
74 | # Create and train the model
75 | model = LogisticRegression()
76 | model.fit(X_train, y_train)
77 |
78 | # Save the model to a local file
79 | with open("maintenance_model.pkl", "wb") as f:
80 |   print("Saving model to maintenance_model.pkl")
81 | pickle.dump(model, f)
82 |
--------------------------------------------------------------------------------
/pipelines/iot_analytics/scripts/publish_on_pubsub.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Publishes sample vehicle data to Pub/Sub for the IoT Analytics Dataflow Solution guide.
16 | """
17 |
18 | import json
19 | from google.cloud import pubsub_v1
20 | import os
21 |
22 |
23 | def publish_messages(project, topic, data_path):
24 | """
25 | Publishes JSON messages from a file to a Pub/Sub topic.
26 |
27 | Args:
28 | project: The ID of the Google Cloud project.
29 | topic: The ID of the Pub/Sub topic.
30 | data_path: The path to the JSON data file.
31 | """
32 |
33 | publisher = pubsub_v1.PublisherClient()
34 | topic_path = publisher.topic_path(project, topic)
35 |
36 | with open(data_path, "r", encoding="utf-8") as f:
37 | for line in f:
38 | try:
39 | # Parse each line as a JSON object
40 | json_data = json.loads(line)
41 |
42 | # Publish the JSON data as a message
43 | message_data = json.dumps(json_data).encode("utf-8")
44 | future = publisher.publish(topic_path, message_data)
45 | print(f"Published message ID: {future.result()}")
46 |
47 | except json.JSONDecodeError as e:
48 | print(f"Error decoding JSON: {e}")
49 |
50 |
51 | if __name__ == "__main__":
52 | current_directory = os.getcwd()
53 | publish_messages(
54 | os.environ.get("PROJECT_ID"), os.environ.get("PUBSUB_TOPIC_ID"),
55 | os.environ.get("VEHICLE_DATA_PATH"))
56 |
--------------------------------------------------------------------------------
/pipelines/iot_analytics/scripts/vehicle_data.jsonl:
--------------------------------------------------------------------------------
1 | {"vehicle_id":"1000","timestamp":"2025-01-18T15:56:41Z","temperature":85,"rpm":1797,"vibration":0.11,"fuel_level":64,"mileage":52571}
2 | {"vehicle_id":"1001","timestamp":"2025-01-18T15:56:41Z","temperature":74,"rpm":1967,"vibration":0.37,"fuel_level":67,"mileage":46017}
3 | {"vehicle_id":"1002","timestamp":"2025-01-18T15:56:41Z","temperature":80,"rpm":2529,"vibration":0.31,"fuel_level":59,"mileage":44782}
4 | {"vehicle_id":"1003","timestamp":"2025-01-18T15:56:41Z","temperature":67,"rpm":3312,"vibration":0.23,"fuel_level":62,"mileage":59421}
5 | {"vehicle_id":"1004","timestamp":"2025-01-18T15:56:41Z","temperature":77,"rpm":3206,"vibration":0.27,"fuel_level":74,"mileage":52049}
6 | {"vehicle_id":"1005","timestamp":"2025-01-18T15:56:41Z","temperature":66,"rpm":3091,"vibration":0.31,"fuel_level":80,"mileage":52200}
7 | {"vehicle_id":"1006","timestamp":"2025-01-18T15:56:41Z","temperature":81,"rpm":2883,"vibration":0.46,"fuel_level":85,"mileage":40869}
8 | {"vehicle_id":"1007","timestamp":"2025-01-18T15:56:41Z","temperature":69,"rpm":1696,"vibration":0.12,"fuel_level":79,"mileage":46986}
9 | {"vehicle_id":"1008","timestamp":"2025-01-18T15:56:41Z","temperature":69,"rpm":3308,"vibration":0.14,"fuel_level":58,"mileage":47238}
10 | {"vehicle_id":"1009","timestamp":"2025-01-18T15:56:41Z","temperature":83,"rpm":2238,"vibration":0.3,"fuel_level":74,"mileage":44609}
11 |
--------------------------------------------------------------------------------
/pipelines/iot_analytics/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Setup file for the IoT Analytics pipeline.
16 | """
17 |
18 | from setuptools import setup, find_packages
19 |
20 | with open("requirements.txt", encoding="utf-8") as f:
21 | requirements = f.readlines()
22 |
23 | setup(
24 |     name="IoT Dataflow Analytics Pipeline",
25 | version="0.1",
26 | packages=find_packages(),
27 | install_requires=requirements,
28 | package_data={"iot_analytics_pipeline": ["maintenance_model.pkl"]})
29 |
--------------------------------------------------------------------------------
/pipelines/log_replication_splunk/README.md:
--------------------------------------------------------------------------------
1 | # Log replication sample pipeline (Dataflow template)
2 |
3 | This sample pipeline reads log entries, together with their metadata, from a
4 | Pub/Sub topic and forwards them to the corresponding log collector (HEC
5 | endpoint) in Splunk. The pipeline leverages the
6 | [Google-provided Dataflow template](https://cloud.google.com/dataflow/docs/guides/templates/provided/pubsub-to-splunk).
7 |
8 |
9 | This pipeline is part of the [Dataflow log replication & analytics solution
10 | guide](../../use_cases/Log_replication.md).
11 |
12 | ## Architecture
13 |
14 | The generic architecture of this pipeline looks like this:
15 |
16 | 
17 |
18 | The Terraform code configures a Cloud Logging sink that makes sure that all
19 | logs are sent to the `all-logs` Pub/Sub topic.
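
For reference, the sink that Terraform creates is roughly equivalent to running the
commands below by hand (the sink name `all-logs-sink` is just an illustrative
placeholder; the Terraform scripts remain the source of truth):

```sh
# Route the project logs to the all-logs Pub/Sub topic (illustrative sketch only;
# the Terraform scripts already create an equivalent sink).
gcloud logging sinks create all-logs-sink \
  pubsub.googleapis.com/projects/$PROJECT/topics/all-logs \
  --project $PROJECT

# The sink writes with its own service account; grant it permission to publish.
gcloud pubsub topics add-iam-policy-binding all-logs \
  --member="$(gcloud logging sinks describe all-logs-sink --project $PROJECT --format='value(writerIdentity)')" \
  --role="roles/pubsub.publisher" \
  --project $PROJECT
```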
20 |
21 | The infrastructure required to launch the pipelines is deployed
22 | through [the accompanying Terraform scripts in this solution guide](../../terraform/log_replication_splunk/README.md).
23 |
24 | ## How to launch the pipeline
25 |
26 | All the scripts are located in the `scripts` directory and prepared to be launched from the top
27 | sources directory.
28 |
29 | The Terraform code generates a file with all the necessary variables in the
30 | location `./scripts/00_set_variables.sh`. Run the following command to
31 | apply that configuration:
32 |
33 | ```sh
34 | source scripts/00_set_variables.sh
35 | ```
36 |
37 | Now you can run the pipeline, which takes logs from Pub/Sub and sends them
38 | to Splunk. You need to ensure that there is network connectivity from
39 | Dataflow to Splunk (e.g. Internet access, if necessary), and that you have
40 | set the required credentials in the Terraform config, so that Dataflow has
41 | the permissions it needs to publish into Splunk:
42 |
43 | ```sh
44 | ./scripts/01_launch_ps_to_splunk.sh
45 | ```
46 |
47 | ## Input data
48 |
49 | All the logs produced in the project are being redirected to the Pub/Sub
50 | topic `all-logs`. The pipeline uses a Pub/Sub subscription, `all-logs-sub`,
51 | so no logs are lost if the pipeline is stopped (during the retention period
52 | of the subscription, which is 30 days by default).
53 |
54 | The regular operation of the project (e.g. launching Dataflow) should
55 | already produce enough logs to observe some output in Splunk for testing
56 | purposes.
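
If you want to generate a test log entry on demand, you can write one directly to
Cloud Logging (the log name `replication-test` below is just an arbitrary example);
the sink will route it to the `all-logs` topic and, from there, to the pipeline:

```sh
# Write a test log entry; the Cloud Logging sink routes it to the all-logs topic.
gcloud logging write replication-test \
  "Test message for the Splunk log replication pipeline" \
  --project $PROJECT
```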
57 |
58 | ## Output data
59 |
60 | There are two outputs in this pipeline:
61 | * Splunk, written to the HEC endpoint
62 | * Dead letter queue, the `deadletter-topic` Pub/Sub topic
63 |
64 | When Splunk rejects messages for whatever reason, they are sent to the
65 | `deadletter-topic`.
66 |
67 | If the Splunk endpoint rejects messages because it is overloaded, times out,
68 | etc., Dataflow will retry publishing those messages to Splunk. Only the
69 | messages that are rejected by Splunk due to non-transient errors are sent
70 | to the dead letter queue.
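
To inspect the messages that end up in the dead letter queue, you can attach a
subscription to that topic and pull from it. The subscription name below is just an
example; check the Terraform config in case it already creates one:

```sh
# Attach a subscription to the dead letter topic and pull a few messages.
gcloud pubsub subscriptions create deadletter-sub \
  --topic deadletter-topic --project $PROJECT
gcloud pubsub subscriptions pull deadletter-sub \
  --limit 10 --auto-ack --project $PROJECT
```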
--------------------------------------------------------------------------------
/pipelines/log_replication_splunk/scripts/.gitignore:
--------------------------------------------------------------------------------
1 | 00_set_variables.sh
2 |
--------------------------------------------------------------------------------
/pipelines/log_replication_splunk/scripts/01_launch_ps_to_splunk.sh:
--------------------------------------------------------------------------------
1 | gcloud dataflow jobs run logs-to-splunk \
2 | --gcs-location gs://dataflow-templates-$REGION/latest/Cloud_PubSub_to_Splunk \
3 | --region $REGION \
4 | --project $PROJECT \
5 | --service-account-email $SERVICE_ACCOUNT \
6 | --staging-location $TEMP_LOCATION \
7 | --subnetwork $NETWORK \
8 | --enable-streaming-engine \
9 | --disable-public-ips \
10 | --max-workers=$MAX_DATAFLOW_WORKERS \
11 | --parameters \
12 | inputSubscription=$INPUT_SUBSCRIPTION,\
13 | url=$SPLUNK_HEC_URL,\
14 | disableCertificateValidation=false,\
15 | includePubsubMessage=false,\
16 | tokenSecretId=$TOKEN_SECRET_ID,\
17 | tokenSource=SECRET_MANAGER,\
18 | enableBatchLogs=true,\
19 | enableGzipHttpCompression=true,\
20 | outputDeadletterTopic=$DEADLETTER_TOPIC
21 |
--------------------------------------------------------------------------------
/pipelines/marketing_intelligence/Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | ARG SERVING_BUILD_IMAGE=tensorflow/tensorflow:2.18.0-gpu
16 | FROM ${SERVING_BUILD_IMAGE}
17 | WORKDIR /workspace
18 |
19 | RUN apt-get update -y && apt-get install -y \
20 | cmake
21 |
22 | COPY requirements.txt requirements.txt
23 | COPY main.py main.py
24 | COPY marketing_intelligence_pipeline marketing_intelligence_pipeline
25 | COPY MANIFEST.in MANIFEST.in
26 | COPY setup.py setup.py
27 |
28 | RUN pip install --upgrade --no-cache-dir pip \
29 | && pip install --no-cache-dir -r requirements.txt \
30 | && pip install --no-cache-dir -e .
31 |
32 | # Copy files from official SDK image, including script/dependencies.
33 | COPY --from=apache/beam_python3.11_sdk:2.63.0 /opt/apache/beam /opt/apache/beam
34 |
35 |
36 | ENV KERAS_BACKEND="tensorflow"
37 |
38 | # Set the entrypoint to Apache Beam SDK launcher.
39 | ENTRYPOINT ["/opt/apache/beam/boot"]
40 |
--------------------------------------------------------------------------------
/pipelines/marketing_intelligence/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
--------------------------------------------------------------------------------
/pipelines/marketing_intelligence/cloudbuild.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | steps:
16 | # - name: 'gcr.io/cloud-builders/gsutil'
17 | # script: |
18 | # echo Copying Gemma model from $_GCS_GEMMA_PATH
19 | # gsutil -m -q cp -r $_GCS_GEMMA_PATH /workspace
20 | # echo All files copied.
21 | - name: 'gcr.io/cloud-builders/docker'
22 | script: |
23 | docker build -t ${_TAG} .
24 | substitutions:
25 | _TAG: unset
26 | options:
27 | substitutionOption: 'ALLOW_LOOSE'
28 | automapSubstitutions: true
29 | machineType: E2_HIGHCPU_8
30 | images:
31 | - ${_TAG}
--------------------------------------------------------------------------------
/pipelines/marketing_intelligence/main.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | A product prediction example for the Dataflow Solution Guides.
16 | """
17 |
18 | import time
19 |
20 | from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions
21 |
22 | from marketing_intelligence_pipeline.options import MyPipelineOptions
23 | from marketing_intelligence_pipeline.pipeline import create_pipeline
24 |
25 |
26 | def main(options: MyPipelineOptions):
27 | pipeline = create_pipeline(options)
28 | pipeline.run()
29 |
30 |
31 | if __name__ == "__main__":
32 | pipeline_options: PipelineOptions = PipelineOptions()
33 | dataflow_options: GoogleCloudOptions = pipeline_options.view_as(
34 | GoogleCloudOptions)
35 | now_epoch_ms = int(time.time() * 1000)
36 | dataflow_options.job_name = f"marketing-intelligence-pipeline-{now_epoch_ms}"
37 | custom_options: MyPipelineOptions = pipeline_options.view_as(
38 | MyPipelineOptions)
39 | main(custom_options)
40 |
--------------------------------------------------------------------------------
/pipelines/marketing_intelligence/marketing_intelligence_pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/pipelines/marketing_intelligence/marketing_intelligence_pipeline/options.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Options class for the Marketing Intelligence pipeline.
16 | """
17 |
18 | from argparse import ArgumentParser
19 |
20 | from apache_beam.options.pipeline_options import PipelineOptions
21 |
22 |
23 | class MyPipelineOptions(PipelineOptions):
24 |
25 | @classmethod
26 | def _add_argparse_args(cls, parser: ArgumentParser):
27 | parser.add_argument("--messages_subscription", type=str)
28 | parser.add_argument("--model_endpoint", type=str)
29 | parser.add_argument("--project_id", type=str)
30 | parser.add_argument("--location", type=str)
31 | parser.add_argument("--responses_topic", type=str)
32 |
--------------------------------------------------------------------------------
/pipelines/marketing_intelligence/marketing_intelligence_pipeline/pipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Pipeline of the Marketing Intelligence Dataflow Solution guide.
16 | """
17 |
18 | from apache_beam import Pipeline, PCollection
19 | from apache_beam.ml.inference import RunInference
20 | from apache_beam.io.gcp import pubsub
21 | import json
22 | import apache_beam as beam
23 | from apache_beam.ml.inference.base import PredictionResult
24 | from apache_beam.ml.inference.vertex_ai_inference import VertexAIModelHandlerJSON
25 | from .options import MyPipelineOptions
26 |
27 |
28 | # Format the predictions sent by the Vertex AI Endpoint
29 | def _format_output(element: PredictionResult) -> str:
30 |   return f"Input: \n{element.example}, \n\n\nOutput: \n{element.inference}"
31 |
32 |
33 | # Format the input and send each input as a dictionary
34 | def _format_input(x: bytes) -> dict:
35 | instance_dict = json.loads(x.decode("utf-8"))
36 | return instance_dict
37 |
38 |
39 | # Read input from Pub/Sub (all input data to be sent in String) and format it
40 | @beam.ptransform_fn
41 | def _extract(p: Pipeline, subscription: str) -> PCollection[str]:
42 | msgs: PCollection[bytes] = p | "Read subscription" >> beam.io.ReadFromPubSub(
43 | subscription=subscription)
44 | return msgs | "Parse and format Input" >> beam.Map(_format_input)
45 |
46 |
47 | # TODO Add transformation for BigTable Enrichment
48 |
49 |
50 | # Request predictions from the Vertex AI endpoint by sending the formatted input
51 | @beam.ptransform_fn
52 | def _transform(msgs: PCollection[str], model_endpoint: str, project: str,
53 | location: str) -> PCollection[str]:
54 | model_handler = VertexAIModelHandlerJSON(
55 | endpoint_id=model_endpoint, project=project, location=location)
56 | preds: PCollection[
57 | PredictionResult] = msgs | "RunInference-vertexai" >> RunInference(
58 | model_handler)
59 | return preds | "Format Output" >> beam.Map(_format_output)
60 |
61 |
62 | def create_pipeline(options: MyPipelineOptions) -> Pipeline:
63 | """ Create the pipeline object.
64 |
65 | Args:
66 | options: The pipeline options, with type `MyPipelineOptions`.
67 |
68 | Returns:
69 | The pipeline object.
70 | """
71 | pipeline = beam.Pipeline(options=options)
72 | # Extract
73 | messages: PCollection[str] = pipeline | "Read" >> _extract(
74 | subscription=options.messages_subscription)
75 | # Transform
76 | predictions: PCollection[str] = messages | "Transform" >> _transform(
77 | model_endpoint=options.model_endpoint,
78 | project=options.project_id,
79 | location=options.location)
80 | # Load
81 | predictions | "Publish Result" >> pubsub.WriteStringsToPubSub(
82 | topic=options.responses_topic)
83 |
84 | return pipeline
85 |
--------------------------------------------------------------------------------
/pipelines/marketing_intelligence/requirements.txt:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | apache-beam[gcp]==2.63.0
--------------------------------------------------------------------------------
/pipelines/marketing_intelligence/scripts/.gitignore:
--------------------------------------------------------------------------------
1 | 00_set_variables.sh
--------------------------------------------------------------------------------
/pipelines/marketing_intelligence/scripts/01_build_and_push_container.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2024 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | gcloud builds submit \
16 | --region=$REGION \
17 | --default-buckets-behavior=regional-user-owned-bucket \
18 | --substitutions _TAG=$CONTAINER_URI\
19 | .
--------------------------------------------------------------------------------
/pipelines/marketing_intelligence/scripts/02_run_dataflow.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2024 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | python main.py \
16 | --runner=DataflowRunner \
17 | --project=$PROJECT \
18 | --temp_location=gs://$PROJECT/tmp \
19 | --region=$REGION \
20 | --save_main_session \
21 | --machine_type=$MACHINE_TYPE \
22 | --num_workers=1 \
23 | --disk_size_gb=$DISK_SIZE_GB \
24 | --max_num_workers=$MAX_DATAFLOW_WORKERS \
25 | --no_use_public_ip \
26 | --service_account_email=$SERVICE_ACCOUNT \
27 | --subnetwork=$SUBNETWORK \
28 | --sdk_container_image=$CONTAINER_URI \
29 | --dataflow_service_options="worker_accelerator=type:nvidia-l4;count:1;install-nvidia-driver:5xx" \
30 | --messages_subscription=projects/$PROJECT/subscriptions/dataflow-solutions-guide-market-intelligence-input-sub \
31 | --responses_topic=projects/$PROJECT/topics/dataflow-solutions-guide-market-intelligence-output \
32 | --project_id=$PROJECT \
33 | --model_endpoint=""
34 |
35 |
--------------------------------------------------------------------------------
/pipelines/marketing_intelligence/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Setup file for Market Intelligence pipeline.
16 | """
17 |
18 | from setuptools import setup, find_packages
19 |
20 | with open("requirements.txt", encoding="utf-8") as f:
21 | requirements = f.readlines()
22 |
23 | setup(
24 | name="Dataflow Solution for Market Intelligence pipelines",
25 | version="0.1",
26 |     description="A product prediction example for the Dataflow Solution Guides.",
27 | packages=find_packages(),
28 | install_requires=requirements,
29 | )
30 |
--------------------------------------------------------------------------------
/pipelines/ml_ai_python/Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | ARG SERVING_BUILD_IMAGE=tensorflow/tensorflow:2.18.0-gpu
16 | FROM ${SERVING_BUILD_IMAGE}
17 | WORKDIR /workspace
18 |
19 | RUN apt-get update -y && apt-get install -y \
20 | cmake
21 |
22 | COPY requirements.txt requirements.txt
23 | COPY main.py main.py
24 | COPY ml_ai_pipeline ml_ai_pipeline
25 | COPY MANIFEST.in MANIFEST.in
26 | COPY setup.py setup.py
27 |
28 | RUN pip install --upgrade --no-cache-dir pip \
29 | && pip install --no-cache-dir -r requirements.txt \
30 | && pip install --no-cache-dir -e .
31 |
32 | # Copy files from official SDK image, including script/dependencies.
33 | COPY --from=apache/beam_python3.11_sdk:2.63.0 /opt/apache/beam /opt/apache/beam
34 |
35 | COPY gemma_2B gemma_2B
36 |
37 | ENV KERAS_BACKEND="tensorflow"
38 |
39 | # Set the entrypoint to Apache Beam SDK launcher.
40 | ENTRYPOINT ["/opt/apache/beam/boot"]
--------------------------------------------------------------------------------
/pipelines/ml_ai_python/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
--------------------------------------------------------------------------------
/pipelines/ml_ai_python/README.md:
--------------------------------------------------------------------------------
1 | # GenAI & Machine Learning inference sample pipeline (Python)
2 |
3 | This sample pipeline demonstrates how to use Dataflow to process data, and calculate predictions
4 | using GenAI, specifically the [Google open source Gemma model](https://ai.google.dev/gemma).
5 | This pipeline is written in Python.
6 |
7 | This pipeline is part of the [Dataflow Gen AI & ML solution guide](../../use_cases/GenAI_ML.md).
8 |
9 | ## Architecture
10 |
11 | The generic architecture for an inference pipeline looks as follows:
12 |
13 | 
14 |
15 | In this directory, you will find a specific implementation of the above architecture, with the
16 | following stages:
17 |
18 | 1. **Data ingestion:** Reads data from a Pub/Sub topic.
19 | 2. **Data preprocessing:** The sample pipeline does not do any transformation, but it is trivial
20 | to add a preprocessing step leveraging
21 | [the Enrichment transform](https://cloud.google.com/dataflow/docs/guides/enrichment) to perform
22 | feature engineering before calling the model.
23 | 3. **Inference:** Uses the RunInference transform with a custom model handler, using Keras and Tensorflow, to call the Gemma model. The pipeline uses a GPU with the Dataflow worker, to speed up the inference.
24 | 4. **Predictions:** The predictions are sent to another Pub/Sub topic as output.
25 |
26 | ## Gemma model
27 |
28 | The model needs to be uploaded to GCS in a directory named `gemma_2B` in the bucket created by
29 | Terraform (same name as project id).
30 |
31 | For that, please first [download the Gemma model from Kaggle](https://www.kaggle.com/models/google/gemma),
32 | uncompress it, and then upload it with a command similar to this one:
33 |
34 | ```sh
35 | gcloud storage cp -r LOCAL_DIRECTORY gs:///gemma_2B
36 | ```
37 |
38 | That command will do parallel composite uploads to speed up the uploading of the largest files in
39 | the model.
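
To double-check that the upload finished correctly, you can list the contents of the
directory. The bucket created by Terraform has the same name as the project id, so
replace `<YOUR_PROJECT_ID>` accordingly:

```sh
# List the uploaded model files (the bucket name matches the project id).
gcloud storage ls --recursive gs://<YOUR_PROJECT_ID>/gemma_2B
```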
40 |
41 | ## Selecting the cloud region
42 |
43 | Not all the resources may be available in all the regions. The default values included in this
44 | directory have been tested using `us-central1` as region.
45 |
46 | The file `cloudbuild.yaml` is using the machine type `E2_HIGHCPU_8` as the default machine type. If
47 | that's not available in your preferred region, try with other machine types that are available
48 | in Cloud Build:
49 | * https://cloud.google.com/build/docs/api/reference/rest/v1/projects.builds#machinetype
50 |
51 | Moreover, the file `scripts/00_set_environment.sh` specifies a machine type for the Dataflow workers.
52 | The selected machine type, `g2-standard-4`, is the recommended one for inference with GPU. If that
53 | type is not available in your region, you can check what machines are available to use with the
54 | following command:
55 |
56 | ```sh
57 | gcloud compute machine-types list --zones=,,...
58 | ```
59 |
60 | See more info about selecting the right type of machine in the following link:
61 | * https://cloud.google.com/compute/docs/machine-resource
62 |
63 | ## How to launch the pipeline
64 |
65 | All the scripts are located in the `scripts` directory and prepared to be launched from the top
66 | sources directory.
67 |
68 | In the script `scripts/00_set_environment.sh`, define the value of the project id and the region variable:
69 |
70 | ```
71 | export PROJECT=
72 | export REGION=
73 | ```
74 |
75 | Leave the rest of the variables untouched, although you can override them if you prefer.
76 |
77 | After you edit the script, load those variables into the environment:
78 |
79 | ```sh
80 | source scripts/00_set_environment.sh
81 | ```
82 |
83 | And then run the script that builds and publishes the custom Dataflow container. This container will
84 | contain the Gemma model, and all the required dependencies.
85 |
86 | ```sh
87 | ./scripts/01_build_and_push_container.sh
88 | ```
89 |
90 | This will create a Cloud Build job that can take a few minutes to complete. Once it completes, you
91 | can trigger the pipeline with the following:
92 |
93 | ```sh
94 | ./scripts/02_run_dataflow.sh
95 | ```
96 |
97 | ## Input data
98 |
99 | To send data into the pipeline, you need to publish messages in the `messages` topic. Those
100 | messages are passed "as is" to Gemma, so you may want to add some prompting to the question.
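
For example, you can publish a test prompt with the gcloud CLI:

```sh
# Publish a prompt to the input topic; the pipeline passes it verbatim to Gemma.
gcloud pubsub topics publish messages \
  --message "Answer briefly: what is Apache Beam?" \
  --project $PROJECT
```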
101 |
102 | ## Output data
103 |
104 | The predictions are published into the topic `predictions`, and can be observed using the
105 | subscription `predictions-sub`.
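
For example, you can pull a few predictions with:

```sh
# Pull some predictions from the output subscription.
gcloud pubsub subscriptions pull predictions-sub \
  --limit 5 --auto-ack --project $PROJECT
```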
--------------------------------------------------------------------------------
/pipelines/ml_ai_python/cloudbuild.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | steps:
16 | - name: 'gcr.io/cloud-builders/gsutil'
17 | script: |
18 | echo Copying Gemma model from $_GCS_GEMMA_PATH
19 | gsutil -m -q cp -r $_GCS_GEMMA_PATH /workspace
20 | echo All files copied.
21 | - name: 'gcr.io/cloud-builders/docker'
22 | script: |
23 | docker build -t ${_TAG} .
24 | substitutions:
25 | _GCS_GEMMA_PATH: unset
26 | _TAG: unset
27 | options:
28 | substitutionOption: 'ALLOW_LOOSE'
29 | automapSubstitutions: true
30 | machineType: E2_HIGHCPU_8
31 | images:
32 | - ${_TAG}
--------------------------------------------------------------------------------
/pipelines/ml_ai_python/main.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | A machine learning streaming inference example for the Dataflow Solution Guides.
16 | """
17 |
18 | import time
19 |
20 | from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions
21 |
22 | from ml_ai_pipeline.options import MyPipelineOptions
23 | from ml_ai_pipeline.pipeline import create_pipeline
24 |
25 |
26 | def main(options: MyPipelineOptions):
27 | pipeline = create_pipeline(options)
28 | pipeline.run()
29 |
30 |
31 | if __name__ == "__main__":
32 | pipeline_options: PipelineOptions = PipelineOptions()
33 | dataflow_options: GoogleCloudOptions = pipeline_options.view_as(
34 | GoogleCloudOptions)
35 | now_epoch_ms = int(time.time() * 1000)
36 | dataflow_options.job_name = f"gemma-inference-pipeline-{now_epoch_ms}"
37 | custom_options: MyPipelineOptions = pipeline_options.view_as(
38 | MyPipelineOptions)
39 | main(custom_options)
40 |
--------------------------------------------------------------------------------
/pipelines/ml_ai_python/ml_ai_pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/pipelines/ml_ai_python/ml_ai_pipeline/model_handlers.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Custom model handlers to be used with RunInference.
16 | """
17 |
18 | from typing import Sequence, Optional, Any, Iterable
19 |
20 | import keras_nlp
21 | from apache_beam.ml.inference.base import ModelHandler, PredictionResult
22 | from keras_nlp.src.models import GemmaCausalLM
23 |
24 |
25 | class GemmaModelHandler(ModelHandler[str, PredictionResult, GemmaCausalLM]):
26 | """
27 | A RunInference model handler for the Gemma model.
28 | """
29 |
30 | def __init__(self, model_name: str = "gemma_2B"):
31 | """ Implementation of the ModelHandler interface for Gemma using text as input.
32 |
33 | Example Usage::
34 |
35 | pcoll | RunInference(GemmaModelHandler())
36 |
37 | Args:
38 | model_name: The Gemma model name. Default is gemma_2B.
39 | """
40 | super().__init__()
41 | self._model_name = model_name
42 | self._env_vars = {}
43 |
44 | def share_model_across_processes(self) -> bool:
45 | """ Indicates if the model should be loaded once-per-VM rather than
46 | once-per-worker-process on a VM. Because Gemma is a large language model,
47 | this will always return True to avoid OOM errors.
48 | """
49 | return True
50 |
51 | def load_model(self) -> GemmaCausalLM:
52 | """Loads and initializes a model for processing."""
53 | return keras_nlp.models.GemmaCausalLM.from_preset(self._model_name)
54 |
55 | def run_inference(
56 | self,
57 | batch: Sequence[str],
58 | model: GemmaCausalLM,
59 | unused: Optional[dict[str, Any]] = None) -> Iterable[PredictionResult]:
60 | """Runs inferences on a batch of text strings.
61 |
62 | Args:
63 | batch: A sequence of examples as text strings.
64 | model: The Gemma model being used.
65 |
66 | Returns:
67 | An Iterable of type PredictionResult.
68 | """
69 | _ = unused # for interface compatibility with Model Handler
70 | # Loop each text string, and use a tuple to store the inference results.
71 | for one_text in batch:
72 | result = model.generate(one_text, max_length=64)
73 | yield PredictionResult(one_text, result, self._model_name)
74 |
--------------------------------------------------------------------------------
/pipelines/ml_ai_python/ml_ai_pipeline/options.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Options class for the streaming inference pipeline.
16 | """
17 |
18 | from argparse import ArgumentParser
19 |
20 | from apache_beam.options.pipeline_options import PipelineOptions
21 |
22 |
23 | class MyPipelineOptions(PipelineOptions):
24 |
25 | @classmethod
26 | def _add_argparse_args(cls, parser: ArgumentParser):
27 | parser.add_argument("--messages_subscription", type=str)
28 | parser.add_argument("--model_path", type=str)
29 | parser.add_argument("--responses_topic", type=str)
30 |
--------------------------------------------------------------------------------
/pipelines/ml_ai_python/ml_ai_pipeline/pipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | A machine learning streaming inference pipeline for the Dataflow Solution Guides.
16 | """
17 |
18 | from apache_beam import Pipeline, PCollection
19 | from apache_beam.ml.inference import RunInference
20 | from apache_beam.io.gcp import pubsub
21 |
22 | import apache_beam as beam
23 | from apache_beam.ml.inference.base import PredictionResult
24 |
25 | from .model_handlers import GemmaModelHandler
26 | from .options import MyPipelineOptions
27 |
28 |
29 | def _format_output(element: PredictionResult) -> str:
30 | return f"Input: \n{element.example}, \n\n\nOutput: \n{element.inference}"
31 |
32 |
33 | @beam.ptransform_fn
34 | def _extract(p: Pipeline, subscription: str) -> PCollection[str]:
35 | msgs: PCollection[bytes] = p | "Read subscription" >> beam.io.ReadFromPubSub(
36 | subscription=subscription)
37 | return msgs | "Parse" >> beam.Map(lambda x: x.decode("utf-8"))
38 |
39 |
40 | @beam.ptransform_fn
41 | def _transform(msgs: PCollection[str], model_path: str) -> PCollection[str]:
42 | preds: PCollection[
43 | PredictionResult] = msgs | "RunInference-Gemma" >> RunInference(
44 | GemmaModelHandler(model_path))
45 | return preds | "Format Output" >> beam.Map(_format_output)
46 |
47 |
48 | def create_pipeline(options: MyPipelineOptions) -> Pipeline:
49 | """ Create the pipeline object.
50 |
51 | Args:
52 | options: The pipeline options, with type `MyPipelineOptions`.
53 |
54 | Returns:
55 | The pipeline object.
56 | """
57 | pipeline = beam.Pipeline(options=options)
58 | # Extract
59 | msgs: PCollection[str] = pipeline | "Read" >> _extract(
60 | subscription=options.messages_subscription)
61 | # Transform
62 | responses: PCollection[str] = msgs | "Transform" >> _transform(
63 | model_path=options.model_path)
64 | # Load
65 | responses | "Publish Result" >> pubsub.WriteStringsToPubSub(
66 | topic=options.responses_topic)
67 |
68 | return pipeline
69 |
--------------------------------------------------------------------------------
/pipelines/ml_ai_python/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | tensorflow==2.18.0
16 |
--------------------------------------------------------------------------------
/pipelines/ml_ai_python/requirements.txt:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | apache-beam[gcp]==2.63.0
16 | keras_nlp==0.19.2
17 | keras==3.9.0
18 | protobuf==4.25.6
--------------------------------------------------------------------------------
/pipelines/ml_ai_python/scripts/.gitignore:
--------------------------------------------------------------------------------
1 | 00_set_variables.sh
2 |
--------------------------------------------------------------------------------
/pipelines/ml_ai_python/scripts/01_build_and_push_container.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2024 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | gcloud builds submit \
16 | --region=$REGION \
17 | --default-buckets-behavior=regional-user-owned-bucket \
18 | --substitutions _TAG=$CONTAINER_URI,_GCS_GEMMA_PATH=$GCS_GEMMA_PATH \
19 | .
--------------------------------------------------------------------------------
/pipelines/ml_ai_python/scripts/02_run_dataflow.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2024 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | python main.py \
16 | --runner=DataflowRunner \
17 | --project=$PROJECT \
18 | --temp_location=gs://$PROJECT/tmp \
19 | --region=$REGION \
20 | --save_main_session \
21 | --machine_type=$MACHINE_TYPE \
22 | --num_workers=1 \
23 | --disk_size_gb=$DISK_SIZE_GB \
24 | --max_num_workers=$MAX_DATAFLOW_WORKERS \
25 | --no_use_public_ip \
26 | --service_account_email=$SERVICE_ACCOUNT \
27 | --subnetwork=$SUBNETWORK \
28 | --sdk_container_image=$CONTAINER_URI \
29 | --dataflow_service_options="worker_accelerator=type:nvidia-l4;count:1;install-nvidia-driver:5xx" \
30 | --messages_subscription=projects/$PROJECT/subscriptions/messages-sub \
31 | --responses_topic=projects/$PROJECT/topics/predictions \
32 | --model_path="gemma_2B"
33 |
34 |
--------------------------------------------------------------------------------
/pipelines/ml_ai_python/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Setup file for the machine learning streaming inference pipeline.
16 | """
17 |
18 | from setuptools import setup, find_packages
19 |
20 | with open("requirements.txt", encoding="utf-8") as f:
21 | requirements = f.readlines()
22 |
23 | setup(
24 | name="Dataflow Solution for ML/AI pipelines",
25 | version="0.1",
26 | description="A ML/AI pipeline example for the Dataflow Solution Guides.",
27 | packages=find_packages(),
28 | install_requires=requirements,
29 | )
30 |
--------------------------------------------------------------------------------
/renovate.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json",
3 | "extends": [
4 | "config:recommended"
5 | ]
6 | }
7 |
--------------------------------------------------------------------------------
/terraform/.gitignore:
--------------------------------------------------------------------------------
1 | .terraform*
2 | terraform.tfstate*
3 | .idea
4 | terraform.tfvars
5 | backend.tf
--------------------------------------------------------------------------------
/terraform/README.md:
--------------------------------------------------------------------------------
1 | # Deployment of the solution guides
2 |
3 | In this directory, you will find all the Terraform code to provision the
4 | infrastructure required in Google Cloud to deploy each one of the solution
5 | guides.
6 |
7 | Please refer to [the main documentation in this repo for a full list of all
8 | the use cases](../README.md).
9 |
10 | ## Google Cloud security foundations
11 |
12 | The deployments in this directory follow all the recommendations given in the
13 | [Google Cloud Security Foundations](https://cloud.google.com/architecture/security-foundations).
14 |
15 | Some of the features of the Terraform deployments in this directory are the following:
16 | * **Identity and Access Management (IAM):**
17 | * All resources are created with the minimum required permissions.
18 | * Service accounts are used for all deployments.
19 | * IAM policies are used to restrict access to resources.
20 | * **Network security:**
21 | * All resources are deployed using private IPs only.
22 | * Firewalls are used to restrict access to resources, including network tags for `ssh`, `http-server` and `https-server` access.
23 | * If the project is created by the Terraform scripts, the default network is removed.
24 |
--------------------------------------------------------------------------------
/terraform/anomaly_detection/variables.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "billing_account" {
16 | description = "Billing account for the projects/resources"
17 | type = string
18 | default = null
19 | }
20 |
21 | variable "destroy_all_resources" {
22 | description = "Destroy all resources when calling tf destroy. Use false for production deployments. For test environments, set to true to remove all buckets and bigtable instances."
23 | type = bool
24 | default = true
25 | }
26 |
27 | variable "internet_access" {
28 | description = "Set to true to create a NAT for Dataflow workers to access Internet."
29 | type = bool
30 | default = false
31 | }
32 |
33 | variable "network_prefix" {
34 | description = "Prefix to be used for networks and subnetworks"
35 | type = string
36 | default = "dataflow"
37 | }
38 |
39 | variable "organization" {
40 | description = "Organization for the project/resources"
41 | type = string
42 | default = null
43 | }
44 |
45 | variable "project_create" {
46 | description = "True if you want to create a new project. False to reuse an existing project."
47 | type = bool
48 | }
49 |
50 | variable "project_id" {
51 | description = "Project ID for the project/resources"
52 | type = string
53 | }
54 |
55 | variable "region" {
56 | description = "The region for resources and networking"
57 | type = string
58 | }
59 |
60 | variable "zone" {
61 |   description = "The zone for Bigtable. Just a single letter specifying a zone in the region. The default is zone a."
62 | type = string
63 | default = "a"
64 | }
65 |
--------------------------------------------------------------------------------
/terraform/cdp/variables.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "billing_account" {
16 | description = "Billing account for the projects/resources"
17 | type = string
18 | default = null
19 | }
20 |
21 | variable "destroy_all_resources" {
22 | description = "Destroy all resources when calling tf destroy. Use false for production deployments. For test environments, set to true to remove all buckets and Spanner instances."
23 | type = bool
24 | default = true
25 | }
26 |
27 | variable "internet_access" {
28 | description = "Set to true to create a NAT for Dataflow workers to access Internet."
29 | type = bool
30 | default = false
31 | }
32 |
33 | variable "network_prefix" {
34 | description = "Prefix to be used for networks and subnetworks"
35 | type = string
36 | default = "dataflow"
37 | }
38 |
39 | variable "organization" {
40 | description = "Organization for the project/resources"
41 | type = string
42 | default = null
43 | }
44 |
45 | variable "project_create" {
46 | description = "True if you want to create a new project. False to reuse an existing project."
47 | type = bool
48 | }
49 |
50 | variable "project_id" {
51 | description = "Project ID for the project/resources"
52 | type = string
53 | }
54 |
55 | variable "region" {
56 | description = "The region for resources and networking"
57 | type = string
58 | }
59 |
60 | variable "bq_dataset" {
61 | description = "The output bq dataset"
62 | type = string
63 | default = "output_dataset"
64 | }
65 |
66 | variable "bq_table" {
67 | description = "The output bq table"
68 | type = string
69 | default = "unified_data"
70 | }
71 |
72 |
73 |
74 |
75 |
--------------------------------------------------------------------------------
/terraform/clickstream_analytics/variables.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "billing_account" {
16 | description = "Billing account for the projects/resources"
17 | type = string
18 | default = null
19 | }
20 |
21 | variable "destroy_all_resources" {
22 | description = "Destroy all resources when calling tf destroy. Use false for production deployments. For test environments, set to true to remove all buckets and Spanner instances."
23 | type = bool
24 | default = true
25 | }
26 |
27 | variable "network_prefix" {
28 | description = "Prefix to be used for networks and subnetworks"
29 | type = string
30 | default = "dataflow"
31 | }
32 |
33 | variable "organization" {
34 | description = "Organization for the project/resources"
35 | type = string
36 | default = null
37 | }
38 |
39 | variable "project_create" {
40 | description = "True if you want to create a new project. False to reuse an existing project."
41 | type = bool
42 | }
43 |
44 | variable "project_id" {
45 | description = "Project ID for the project/resources"
46 | type = string
47 | }
48 |
49 | variable "region" {
50 | description = "The region for resources and networking"
51 | type = string
52 | }
53 |
--------------------------------------------------------------------------------
/terraform/etl_integration/variables.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "billing_account" {
16 | description = "Billing account for the projects/resources"
17 | type = string
18 | default = null
19 | }
20 |
21 | variable "destroy_all_resources" {
22 | description = "Destroy all resources when calling tf destroy. Use false for production deployments. For test environments, set to true to remove all buckets and Spanner instances."
23 | type = bool
24 | default = true
25 | }
26 |
27 | variable "internet_access" {
28 | description = "Set to true to create a NAT for Dataflow workers to access Internet."
29 | type = bool
30 | default = false
31 | }
32 |
33 | variable "network_prefix" {
34 | description = "Prefix to be used for networks and subnetworks"
35 | type = string
36 | default = "dataflow"
37 | }
38 |
39 | variable "organization" {
40 | description = "Organization for the project/resources"
41 | type = string
42 | default = null
43 | }
44 |
45 | variable "project_create" {
46 | description = "True if you want to create a new project. False to reuse an existing project."
47 | type = bool
48 | }
49 |
50 | variable "project_id" {
51 | description = "Project ID for the project/resources"
52 | type = string
53 | }
54 |
55 | variable "region" {
56 | description = "The region for resources and networking"
57 | type = string
58 | }
59 |
60 |
--------------------------------------------------------------------------------
/terraform/iot_analytics/variables.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2024 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "billing_account" {
16 | description = "Billing account for the projects/resources"
17 | type = string
18 | default = null
19 | }
20 |
21 | variable "destroy_all_resources" {
22 | description = "Destroy all resources when calling tf destroy. Use false for production deployments. For test environments, set to true to remove all buckets and Spanner instances."
23 | type = bool
24 | default = true
25 | }
26 |
27 | variable "network_prefix" {
28 | description = "Prefix to be used for networks and subnetworks"
29 | type = string
30 | default = "dataflow"
31 | }
32 |
33 | variable "organization" {
34 | description = "Organization for the project/resources"
35 | type = string
36 | default = null
37 | }
38 |
39 | variable "project_create" {
40 | description = "True if you want to create a new project. False to reuse an existing project."
41 | type = bool
42 | }
43 |
44 | variable "project_id" {
45 | description = "Project ID for the project/resources"
46 | type = string
47 | }
48 |
49 | variable "region" {
50 | description = "The region for resources and networking"
51 | type = string
52 | }
53 |
54 | variable "pubsub_topic" {
55 |   description = "Name for your Pub/Sub topic"
56 | type = string
57 | default = "maintenance-data"
58 | }
--------------------------------------------------------------------------------
/terraform/log_replication_splunk/variables.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "billing_account" {
16 | description = "Billing account for the projects/resources"
17 | type = string
18 | default = null
19 | }
20 |
21 | variable "destroy_all_resources" {
22 | description = "Destroy all resources when calling tf destroy. Use false for production deployments. For test environments, set to true to remove all buckets and Spanner instances."
23 | type = bool
24 | default = true
25 | }
26 |
27 | variable "internet_access" {
28 | description = "Set to true to create a NAT for Dataflow workers to access Internet."
29 | type = bool
30 | default = false
31 | }
32 |
33 | variable "network_prefix" {
34 | description = "Prefix to be used for networks and subnetworks"
35 | type = string
36 | default = "dataflow"
37 | }
38 |
39 | variable "organization" {
40 | description = "Organization for the project/resources"
41 | type = string
42 | default = null
43 | }
44 |
45 | variable "project_create" {
46 | description = "True if you want to create a new project. False to reuse an existing project."
47 | type = bool
48 | }
49 |
50 | variable "project_id" {
51 | description = "Project ID for the project/resources"
52 | type = string
53 | }
54 |
55 | variable "region" {
56 | description = "The region for resources and networking"
57 | type = string
58 | }
59 |
60 | variable "splunk_hec_url" {
61 | description = "The URL for the Splunk HEC endpoint"
62 | type = string
63 | default = "http://some-endpoint:8088"
64 |
65 | }
66 |
67 | variable "splunk_token" {
68 | description = "The token for the Splunk HEC endpoint. It will be stored in Secret Manager"
69 | type = string
70 | default = "WRITE_YOUR_TOKEN_HERE"
71 | }
72 |
73 |
--------------------------------------------------------------------------------
/terraform/marketing_intelligence/variables.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "billing_account" {
16 | description = "Billing account for the projects/resources"
17 | type = string
18 | default = null
19 | }
20 |
21 | variable "destroy_all_resources" {
22 | description = "Destroy all resources when calling tf destroy. Use false for production deployments. For test environments, set to true to remove all buckets and bigtable instances."
23 | type = bool
24 | default = true
25 | }
26 |
27 | variable "internet_access" {
28 | description = "Set to true to create a NAT for Dataflow workers to access Internet."
29 | type = bool
30 | default = false
31 | }
32 |
33 | variable "network_prefix" {
34 | description = "Prefix to be used for networks and subnetworks"
35 | type = string
36 | default = "dataflow"
37 | }
38 |
39 | variable "organization" {
40 | description = "Organization for the project/resources"
41 | type = string
42 | default = null
43 | }
44 |
45 | variable "project_create" {
46 | description = "True if you want to create a new project. False to reuse an existing project."
47 | type = bool
48 | }
49 |
50 | variable "project_id" {
51 | description = "Project ID for the project/resources"
52 | type = string
53 | }
54 |
55 | variable "region" {
56 | description = "The region for resources and networking"
57 | type = string
58 | }
59 |
60 | variable "zone" {
61 | description = "The zone for Bigtable. Just a single lower case letter for the zone. Default is a."
62 | type = string
63 | default = "a"
64 | }
65 |
--------------------------------------------------------------------------------
/terraform/ml_ai/variables.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "billing_account" {
16 | description = "Billing account for the projects/resources"
17 | type = string
18 | default = null
19 | }
20 |
21 | variable "destroy_all_resources" {
22 | description = "Destroy all resources when calling tf destroy. Use false for production deployments. For test environments, set to true to remove all buckets and Spanner instances."
23 | type = bool
24 | default = true
25 | }
26 |
27 | variable "network_prefix" {
28 | description = "Prefix to be used for networks and subnetworks"
29 | type = string
30 | default = "dataflow"
31 | }
32 |
33 | variable "organization" {
34 | description = "Organization for the project/resources"
35 | type = string
36 | default = null
37 | }
38 |
39 | variable "project_create" {
40 | description = "True if you want to create a new project. False to reuse an existing project."
41 | type = bool
42 | }
43 |
44 | variable "project_id" {
45 | description = "Project ID for the project/resources"
46 | type = string
47 | }
48 |
49 | variable "region" {
50 | description = "The region for resources and networking"
51 | type = string
52 | }
53 |
54 |
--------------------------------------------------------------------------------
/use_cases/Anomaly_Detection.md:
--------------------------------------------------------------------------------
1 | # Real-Time Anomaly Detection
2 |
3 | Real-time anomaly detection refers to stream processing workloads that identify abnormal events in-flight and
4 | potentially respond with a relevant measure. Incoming events are analyzed and/or compared against a reference/benchmark
5 | that validates whether a record is irregular or not. Anomaly detection architectures can enhance the security
6 | posture of a company’s infrastructure or mitigate the threat of malicious actors in a value chain.
7 | Companies are increasingly adding proprietary machine learning models to augment their anomaly detection capabilities.
8 | Low latency is normally a requirement for these kinds of workloads, given the time-sensitive nature of these adverse events.
9 |
10 | ## Documentation
11 |
12 | - [One pager: Real-time Anomaly Detection with Dataflow (PDF)](./one_pagers/anomaly_detection_dataflow_onepager.pdf)
13 | - [Real-time Anomaly Detection Solution Guide and Architecture (PDF)](./guides/anomaly_detection_dataflow_guide.pdf)
14 |
15 | ## Assets included in this repository
16 |
17 | - [Terraform code to deploy a project for real-time anomaly detection](../terraform/anomaly_detection/)
18 | - [Sample pipeline in Python for leveraging the Gemma open LLM with Dataflow](../pipelines/anomaly_detection/)
19 |
20 | ## Technical benefits
21 |
22 | Dataflow is the best platform for building real-time
23 | applications. Several unique capabilities make Dataflow the leading choice:
24 |
25 | - **Integrated ML**: Combine your ML models with your streaming pipeline using Dataflow ML.
26 | RunInference helps you seamlessly call models hosted on Vertex AI from your Dataflow
27 | pipelines without the overhead of maintaining ML infrastructure. Dataflow ML also
28 | decouples your prediction loop from your main application, thus
29 | eliminating the risk that a stuck prediction loop brings down your entire application.
30 | - **Low latency**: Dataflow’s at-least-once delivery mode can help your pipeline achieve sub-second
31 | processing latencies, crucial to responding to threats as quickly as possible.
32 |
33 | - **Integrated alerting**: Dataflow’s suite of observability tools enhances your ability to identify
34 | and respond to anomalous events. Create an alert from a Dataflow monitoring dashboard in a matter of a few clicks.
35 |
36 | - **Advanced stream processing**: Apache Beam’s state and timer APIs enable data engineers to manipulate and
37 | analyze state in-flight (a brief sketch follows this list). These primitives give you maximum flexibility to express the business logic
38 | that your application requires.
39 |
40 | - **Scalable infrastructure**: Pipelines scale up and down to meet your resourcing requirements.
41 | Powered by battle-tested backends in Shuffle & Streaming Engine, Dataflow is fit to support pipelines
42 | of virtually any size, with minimal tuning needed.
43 |
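
As a brief, hedged illustration of the state API mentioned above (not taken from this repository's pipelines; the key scheme and threshold are assumptions), a stateful `DoFn` can keep a per-key event count and flag keys that exceed a threshold:

```python
# Illustrative sketch only: flag a key as anomalous once its event count
# exceeds an assumed threshold. Input must be keyed, e.g. (user_id, event).
import apache_beam as beam
from apache_beam.coders import VarIntCoder
from apache_beam.transforms.userstate import CombiningValueStateSpec

THRESHOLD = 100  # assumed value; tune for your workload


class FlagHighVolumeKeys(beam.DoFn):
  COUNT = CombiningValueStateSpec('count', VarIntCoder(), sum)

  def process(self, element, count=beam.DoFn.StateParam(COUNT)):
    key, _ = element
    count.add(1)
    if count.read() > THRESHOLD:
      yield (key, 'anomalous')


# Usage: flagged = keyed_events | beam.ParDo(FlagHighVolumeKeys())
```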
--------------------------------------------------------------------------------
/use_cases/CDP.md:
--------------------------------------------------------------------------------
1 | # Customer Data Platform
2 |
3 | At its core, a real-time CDP is a sophisticated software solution designed to unify customer data from various sources, providing a single, comprehensive view of each individual customer. The "real-time" element is crucial: it emphasizes the ability to collect, process, and analyze customer data as events occur, enabling businesses to respond instantly to changing customer behaviors and preferences.
4 | Real-time Customer Data Platforms represent a powerful tool for businesses seeking to create more personalized, engaging, and effective customer experiences. By centralizing customer data and enabling real-time analysis, CDPs unlock a new level of customer understanding and responsiveness, leading to better marketing outcomes and stronger customer relationships.
5 |
6 | ## Documentation
7 |
8 | - [Real-time Customer Data Platform Solution Guide and Architecture (PDF)](./guides/cdp_dataflow_guide.pdf)
9 |
10 | ## Assets included in this repository
11 |
12 | - [Terraform code to deploy a project for Customer Data Platform](../terraform/cdp/)
13 | - [Sample pipelines in Python for Customer Data Platform](../pipelines/cdp/)
14 |
15 | ## Technical benefits
16 |
17 | Dataflow provides enormous advantages as a platform for your Customer Data Platform use
18 | cases:
19 |
20 | - **Real-Time Data Ingestion and Processing**: Dataflow enables the seamless and efficient movement of customer data from various sources into the CDP in real-time. This ensures that the CDP is always working with the most up-to-date information, allowing for timely insights and actions.
21 |
22 | - **Enhanced Data Transformation and Enrichment**: Dataflow pipelines can perform complex transformations on incoming data, ensuring it is clean, standardized, and formatted correctly for the CDP.
23 | Additionally, Dataflow can enrich customer data with additional context or attributes from external sources, leading to more complete and valuable customer profiles.
24 |
25 | - **Scalability and Flexibility**: Dataflow solutions are designed to handle large volumes of data and can scale effortlessly to accommodate growing data needs. They offer flexibility in terms of data sources, processing logic, and output destinations, making them adaptable to evolving business requirements.
26 |
27 | - **Automation and Efficiency**: Dataflow pipelines can automate data ingestion, transformation, and delivery processes, reducing manual effort and minimizing errors. This streamlines data management, freeing up resources for more strategic tasks.
28 |
29 | - **Improved Data Quality and Governance**: Dataflow enables data validation and cleansing during the ingestion process, ensuring data accuracy and consistency. Data lineage and audit capabilities within Dataflow help track data transformations and maintain data governance standards.
30 |
31 | - **Actionable Insights and Personalization**: By feeding clean and enriched data into the CDP in real-time, Dataflow enables the CDP to generate more accurate and timely insights. These insights can be used to trigger personalized marketing campaigns, recommendations, and customer interactions, leading to improved engagement and conversions.
32 |
33 | - **Omnichannel Customer Experiences**: Dataflow supports the seamless integration of customer data across various touchpoints and channels. This allows the CDP to orchestrate consistent and personalized customer experiences across the entire customer journey.
34 |
--------------------------------------------------------------------------------
/use_cases/Clickstream_Analytics.md:
--------------------------------------------------------------------------------
1 | # Clickstream analytics
2 |
3 | In the fast-paced digital landscape, understanding user behavior is crucial for optimizing websites, apps,
4 | and marketing campaigns. Clickstream analytics provides a continuous stream of data on how users interact
5 | with digital platforms. But to truly capitalize on this information, businesses need insights in real time,
6 | not days later.
7 |
8 | For the full version of this solution guide, please refer to the documentation below.
9 |
10 | ## Documentation
11 |
12 | - [One pager: Clickstream analytics in real-time with Dataflow (PDF)](./one_pagers/clickstream_dataflow_onepager.pdf)
13 |
14 | ## Assets included in this repository
15 |
16 | - [Terraform code to deploy a project for Clickstream Analytics](../terraform/clickstream_analytics/)
17 | - [Sample pipeline in Java for clickstream analytics with Dataflow](../pipelines/clickstream_analytics_java/)
18 |
19 | ## Technical benefits
20 |
21 | Dataflow provides a robust platform for building and scaling real-time clickstream analytics solutions.
22 | Key capabilities make it the ideal choice for extracting maximum value from user interaction data:
23 |
24 | - Streamlined Clickstream Processing: Dataflow's Apache Beam SDK simplifies the development of complex
25 | clickstream pipelines. Pre-built transforms, state management, and windowing functions make it easy to
26 | aggregate, filter, and enrich clickstream events in real time (a brief Python sketch follows this list).
27 | - Clickstream Enrichment: Enrich raw clickstream data with external data sources (e.g., user demographics,
28 | product catalogs) to gain deeper insights into user behavior and preferences. Side inputs and joins in
29 | Dataflow enable seamless data enrichment within your pipelines.
30 | - Real-Time Dashboards and Alerts: Integrate Dataflow with real-time visualization tools and alerting systems
31 | to monitor clickstream metrics, detect anomalies, and trigger actions based on user interactions. Dataflow's
32 | low-latency processing ensures that insights are delivered within seconds.
33 | - Scalability and Cost Efficiency: Dataflow automatically scales to handle fluctuating clickstream volumes.
34 | Pay only for the resources you use, avoiding overprovisioning and unnecessary costs. Right-fitting capabilities
35 | allow you to allocate resources optimally across different pipeline stages.
36 | - Flexible Deployment: Deploy clickstream pipelines on various infrastructure options, including VMs and serverless
37 | options like Cloud Run or Cloud Functions. This flexibility allows you to tailor your deployment to your specific
38 | needs and budget.
39 | - Open-Source Ecosystem: Leverage the power of the Apache Beam ecosystem, including a vast library of I/O
40 | connectors for various data sources and sinks. Dataflow's compatibility with open-source tools ensures flexibility
41 | and avoids vendor lock-in.
42 |
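
As a brief, hedged Python sketch of the windowed aggregation described above (the sample pipeline in this repository is written in Java; the subscription name and event fields below are assumptions), counting clicks per page over one-minute windows might look like:

```python
# Illustrative sketch only: count clicks per page over 1-minute fixed windows.
# The subscription name and the "page" field are assumptions.
import json

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms.window import FixedWindows

options = PipelineOptions(streaming=True)  # Pub/Sub sources require streaming mode

with beam.Pipeline(options=options) as p:
  (p
   | 'Read clicks' >> beam.io.ReadFromPubSub(
       subscription='projects/your-project/subscriptions/clicks-sub')
   | 'Parse JSON' >> beam.Map(lambda b: json.loads(b.decode('utf-8')))
   | 'Key by page' >> beam.Map(lambda event: (event['page'], 1))
   | 'Window' >> beam.WindowInto(FixedWindows(60))
   | 'Count per page' >> beam.CombinePerKey(sum)
   | 'Format' >> beam.Map(lambda kv: f'{kv[0]}: {kv[1]} clicks'))
```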
--------------------------------------------------------------------------------
/use_cases/ETL_integration.md:
--------------------------------------------------------------------------------
1 | # ETL / Integration
2 |
3 | Real-time extract-transform-load (ETL) & integration describes systems that process & write
4 | data as soon as it becomes available. This allows for near-instant analysis and decision-making
5 | based on the most up-to-date information. ETL patterns refer to the continuous processing of data,
6 | while integration broadly refers to writing the results of these pipelines to various systems (e.g.
7 | data warehouses, transactional databases, messaging queues). Adopting real-time ETL & integration
8 | architectures is generally regarded as an essential part of modernizing your data systems, and
9 | confers a number of competitive advantages on the companies that adopt them.
10 |
11 | For the full version of this solution guide, please refer to:
12 | * https://solutions.cloud.google.com/app/solutions/dataflow-real-time-etl-integration
13 |
14 | ## Documentation
15 |
16 | * [One pager: ETL & reverse ETL in real-time with Dataflow (PDF)](./one_pagers/etl_dataflow_onepager.pdf)
17 | * [ETL & reverse ETL Solution Guide and Architecture (PDF)](./guides/etl_dataflow_guide.pdf)
18 |
19 | ## Assets included in this repository
20 |
21 | * [Terraform code to deploy a project for ETL integration](../terraform/etl_integration/)
22 | * [Sample pipelines in Java for ETL / Integration](../pipelines/etl_integration_java/)
23 |
24 | ## Technical benefits
25 |
26 | Dataflow provides enormous advantages as a platform for your real-time ETL and integration use
27 | cases:
28 |
29 | * **Resource efficiency**: Increased resource efficiency with horizontal & vertical autoscaling
30 | * **Unified batch & streaming**: Dataflow’s underlying SDK, Apache Beam, allows developers to
31 | express batch & streaming pipelines with the same SDK, with minor modifications required to turn
32 | a batch pipeline into a streaming one (see the sketch after this list). This simplifies the
33 | traditionally accepted practice of maintaining two separate systems for batch & stream
34 | processing.
35 | * **Limitless scalability**: Dataflow offers two service backends for batch and streaming called
36 | Shuffle
37 | and Streaming Engine, respectively. These backends have scaled
38 |
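
As a hedged sketch of the unified batch & streaming point above (names, schema, and cleaning logic are assumptions, not code from this repository), the same transform chain can be shared by both modes; only the source and windowing change:

```python
# Illustrative sketch: one set of transforms reused for batch and streaming.
import json

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms.window import FixedWindows


def clean(record: dict) -> dict:
  # Assumed, placeholder cleaning logic shared by both modes.
  record['amount'] = float(record.get('amount', 0))
  return record


def apply_business_logic(events):
  return (events
          | 'Parse' >> beam.Map(json.loads)
          | 'Clean' >> beam.Map(clean))


# Batch: read newline-delimited JSON files from Cloud Storage.
with beam.Pipeline() as p:
  apply_business_logic(p | beam.io.ReadFromText('gs://your-bucket/events/*.json'))

# Streaming: read from Pub/Sub and window; the business logic is unchanged.
with beam.Pipeline(options=PipelineOptions(streaming=True)) as p:
  (apply_business_logic(
       p
       | beam.io.ReadFromPubSub(topic='projects/your-project/topics/events')
       | beam.Map(lambda b: b.decode('utf-8')))
   | beam.WindowInto(FixedWindows(60)))
```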
--------------------------------------------------------------------------------
/use_cases/GenAI_ML.md:
--------------------------------------------------------------------------------
1 | # GenAI & machine learning inference
2 |
3 | Machine learning (ML) and artificial intelligence (AI) empower businesses to respond to evolving
4 | market conditions and tailor their offerings to users and customers. However, decision cycles
5 | involving AI and ML can span days or even weeks, particularly when dealing with larger models
6 | (model retraining, large inference batch pipelines, etc). This solution guide introduces an
7 | architecture designed for real-time predictions, guaranteeing low latency outcomes with both custom
8 | and third-party models. Leveraging the capabilities of graphics processing units (GPUs),
9 | the proposed architecture effectively reduces prediction times to seconds.
10 |
11 | ## Documentation
12 |
13 | - [One pager: GenAI & ML inference in real-time with Dataflow (PDF)](./one_pagers/genai_ml_dataflow_onepager.pdf)
14 | - [Gen AI & ML inference Solution Guide and Architecture (PDF)](./guides/genai_ml_dataflow_guide.pdf)
15 |
16 | For the full documentation of this solution guide, please refer to:
17 |
18 | - https://solutions.cloud.google.com/app/solutions/data-flow-real-time-ml-and-genai
19 |
20 | ## Assets included in this repository
21 |
22 | - [Terraform code to deploy a project for GenAI & ML inference](../terraform/ml_ai/)
23 | - [Sample pipeline in Python for leveraging the Gemma open LLM with Dataflow](../pipelines/ml_ai_python/)
24 |
25 | ## Technical benefits
26 |
27 | Dataflow is the best platform for building real-time ML & generative AI
28 | applications. Several unique capabilities make Dataflow the leading choice:
29 |
30 | - **Developer ease of use with turnkey transforms:** Author complex ML
31 |   pipelines using utility transforms that can reduce lines of code by orders of magnitude
32 | - [MLTransform](https://cloud.google.com/dataflow/docs/machine-learning/ml-preprocess-data)
33 | helps you prepare your data for training machine learning models without
34 | writing complex code or managing underlying libraries. ML Transforms can
35 | generate embeddings that can push data into vector databases to run
36 | inference.
37 | - [RunInference](https://beam.apache.org/documentation/ml/about-ml/#use-runinference)
38 |     lets you efficiently use ML models in your pipelines, and contains a
39 |     number of optimizations under the hood that make it an essential part
40 |     of any streaming AI pipeline (a minimal sketch follows this list).
41 | - **Advanced stream processing**: Customers can implement advanced streaming
42 | architectures using the open-source
43 | [Apache Beam SDK](https://beam.apache.org/get-started/), which provides a rich
44 | set of capabilities, including state & timer APIs, transformations, side
45 | inputs, enrichment, and a broad list of I/O connectors.
46 | - **Notebooks integration**: Develop your streaming AI pipeline in a
47 | notebook environment, which allows for interactive development and
48 | sampling unbounded data sources.
49 | - **Cost efficiency**: Run pipelines without wasting precious resources or
50 |   incurring cost overruns.
51 | - [GPU support](https://cloud.google.com/dataflow/docs/gpu/gpu-support)
52 | Accelerate your processing with GPUs, which can return results faster
53 |     for your most computationally demanding pipelines.
54 | - [Right-fitting](https://cloud.google.com/dataflow/docs/guides/right-fitting)
55 |     Deploy pipelines on heterogeneous worker pools. Right-fitting allows you
56 | to allocate additional resources to individual stages in your pipeline,
57 | which prevents wasteful utilization for stages that don’t require the
58 | same compute.
59 | - **Open-source compatibility**: Dataflow has support for
60 | [running inference with Gemma](https://cloud.google.com/dataflow/docs/machine-learning/gemma)
61 | as well as a strong integration with
62 |     [Tensorflow Extended](https://www.tensorflow.org/tfx).
63 | Customers should feel comfortable that these pipelines can be ported to
64 | any other execution engine with Apache Beam support.
65 |
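
As a minimal sketch of the RunInference pattern (it mirrors the Python sample pipeline in this repository, which defines `GemmaModelHandler` in `pipelines/ml_ai_python/ml_ai_pipeline/model_handlers.py`):

```python
# Minimal sketch of RunInference with the custom Gemma handler from this repo.
import apache_beam as beam
from apache_beam.ml.inference.base import RunInference

from ml_ai_pipeline.model_handlers import GemmaModelHandler

with beam.Pipeline() as p:
  (p
   | beam.Create(["Summarize what Dataflow does, in one sentence."])
   | RunInference(GemmaModelHandler("gemma_2B"))
   | beam.Map(lambda result: print(result.inference)))
```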
--------------------------------------------------------------------------------
/use_cases/IoT_Analytics.md:
--------------------------------------------------------------------------------
1 | # IoT Analytics
2 |
3 | Organizations employ Internet of Things (IoT) sensors to monitor their production lines in real-time. These sensors gather critical data on metrics essential to the manufacturing process. This data is operational in nature and is often not yet utilized for analytical purposes. With data warehouses, companies have leveraged low-granularity operational data for analytics, enabling them to make more informed decisions using large volumes of data. The use case described herein demonstrates how to replicate the same pattern (analytics on large volumes of low-granularity operational data) but with a crucial additional advantage: low latency. The value of data, and consequently of the decisions made based on that data, diminishes over time. Real-time analytics significantly enhance the value of such decisions.
4 |
5 | ## Documentation
6 |
7 | - [One pager: IoT analytics in real-time with Dataflow (PDF)](./one_pagers/iot_analytics_dataflowonepager.pdf)
8 | - [IoT Analytics Solution Guide & Architecture (PDF)](./guides/iot_analytics_dataflow_guide.pdf)
9 |
10 | ## Assets included in this repository
11 | - [Terraform code to deploy a project for IoT Analytics](../terraform/iot_analytics/)
12 | - [Sample pipeline in Python for IoT analytics with Dataflow](../pipelines/iot_analytics/)
13 |
14 | ## Technical benefits
15 | - **Serverless experience:** Data volume can vary widely from connected devices and IoT appliances, which introduces significant overhead when managing infrastructure. Dataflow removes that overhead entirely. Dataflow’s service layer goes beyond auto-provisioning. Features like dynamic work rebalancing, autoscaling, and service backends like Streaming Engine are built to handle your workload at any scale without needing user intervention.
16 | - **Streaming AI & ML:** Dataflow’s suite of ML capabilities enables you to evolve your batch ML systems to streaming ML, enabling a world of real-time features and real-time predictions. Apache Beam and Dataflow include several capabilities that simplify the end-to-end machine learning lifecycle. MLTransform makes data processing for AI easier. Use RunInference to get predictions from your model of choice, whether it be scikit-learn, PyTorch, Vertex AI, or Gemma (a brief sketch follows this list). Dataflow’s integration with Vertex AI alleviates the need to manage complex computing requirements for your machine learning use cases.
17 | - **Extensible connector framework:** Apache Beam provides more than 60 out of the box connectors that support the majority of your I/O needs, including support for popular messaging platforms like Kafka and Pub/Sub and messaging brokers like JMS and MQTT. If your desired input is not supported, Beam also offers a flexible framework that allows you to build a connector for your own source systems.
18 | - **Open & portable:** For IoT use cases, it is a common requirement to process data in both on-device and multi-cloud environments. Beam allows you the flexibility to run your business logic in the environment of your choice. Execution engines include the Direct Runner (for local execution), Spark and Flink (for your own self-managed & multi-cloud computing environments), and Dataflow (the preferred execution engine for Google Cloud).
19 |
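
As a brief, hedged sketch of the RunInference pattern mentioned above (the model path and feature layout are assumptions; the sample IoT pipeline in this repository follows a similar approach with a pickled maintenance model):

```python
# Illustrative sketch: scoring telemetry with a scikit-learn model via RunInference.
# The model URI and the feature vector layout are assumptions.
import numpy as np

import apache_beam as beam
from apache_beam.ml.inference.base import RunInference
from apache_beam.ml.inference.sklearn_inference import (
    ModelFileType, SklearnModelHandlerNumpy)

handler = SklearnModelHandlerNumpy(
    model_uri='gs://your-bucket/maintenance_model.pkl',
    model_file_type=ModelFileType.PICKLE)

with beam.Pipeline() as p:
  (p
   | beam.Create([np.array([0.1, 3.2, 70.0])])  # one assumed feature vector
   | RunInference(handler)
   | beam.Map(lambda result: print(result.inference)))
```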
--------------------------------------------------------------------------------
/use_cases/Log_replication.md:
--------------------------------------------------------------------------------
1 | # Log replication & analytics
2 |
3 | Google Cloud produces all kinds of logs that are automatically sent to Cloud
4 | Logging. However, in some situations, you may want to use a third party such
5 | as Splunk for log processing and analytics. This solution presents an
6 | architecture to replicate logs from Cloud Logging to a third-party service,
7 | using Dataflow. The solution ensures that all logs produced upstream are
8 | promptly replicated to the destination analytics platform,
9 | with minimal delay (on the order of single-digit seconds).
10 |
11 | ## Documentation
12 |
13 | - [One pager: Log replication and analytics in real-time with Dataflow (PDF)](./one_pagers/log_replication_dataflow_onepager.pdf)
14 | - [Log replication and analytics Solution Guide and Architecture (PDF)](./guides/log_replication_dataflow_guide.pdf)
15 |
16 | ## Assets included in this repository
17 |
18 | - [Terraform code to deploy a project for log replication into Splunk](../terraform/log_replication_splunk/)
19 | - [Use Google-provided templates to run a job that replicates logs to Splunk](../pipelines/log_replication_splunk/)
20 |
21 | ## Technical benefits
22 |
23 | - **Serverless experience**: Data volume can vary widely from logging
24 | applications or transactional databases, which normally introduces significant
25 | infrastructure-management overhead. Dataflow removes that overhead entirely. Dataflow’s service layer goes beyond auto-provisioning. Features
26 | like dynamic work rebalancing, autoscaling, and service backends like
27 | Streaming Engine are built to handle your workload at any scale without
28 | needing user intervention.
29 | - **Easy operations**: Dataflow offers several features that help
30 | organizations ensure the uptime of their pipelines. Snapshots preserve the
31 | state of your pipeline for high availability / disaster recovery scenarios,
32 | while in-place streaming update can seamlessly migrate your pipeline to a
33 | new version without any data loss or downtime.
34 | - **Google-provided Templates**: Google-provided Dataflow templates make
35 | deployment as easy as filling out a web form. Send logs to Splunk,
36 | Elasticsearch, or Datadog with our partner-provided templates.
37 | - **Low latency**: Dataflow’s at-least-once delivery mode can help your
38 | pipeline achieve sub-second processing latencies, essential for your
39 | mission-critical logging applications.
40 | - **Monitoring tools**: In-line logging, job visualizers, monitoring charts,
41 | integrated error reporting and smart insights help you optimize the
42 | performance of your pipeline, and can catch any stuckness or slowness
43 | issues before they turn into outages.
44 |
--------------------------------------------------------------------------------
/use_cases/Marketing_Intelligence.md:
--------------------------------------------------------------------------------
1 | # Marketing Intelligence
2 |
3 | Real-time marketing intelligence describes the practice of collecting and analyzing data about your market, customers, and competitors as it happens. This enables you to make informed, agile decisions and respond swiftly to emerging trends, customer behaviors, and competitive moves. The advent of data-driven marketing has transformed the way companies approach their marketing activities, and real-time marketing intelligence requires these companies to accelerate their response times to marketing moments. This reference architecture describes how you can combine data from your various marketing data sources, common patterns for analyzing that data, and how to integrate it with your data warehouse for faster analysis and with operational databases for faster responses.
4 |
5 | ## Documentation
6 |
7 | - [One pager: Marketing intelligence in real-time with Dataflow (PDF)](./one_pagers/market_intel_dataflow_onepager.pdf)
8 | - [Marketing Intelligence Solution Guide and Architecture (PDF)](./guides/market_intel_dataflow_guide.pdf)
9 |
10 | ## Assets included in this repository
11 |
12 | - [Terraform code to deploy a project for Market Intelligence inference](../terraform/marketing_intelligence/)
13 | - [Sample pipeline in Python for leveraging the Gemma open LLM with Dataflow](../pipelines/marketing_intelligence/)
14 |
15 | ## Technical benefits
16 |
17 | Dataflow is the best platform for building real-time ML & generative AI
18 | applications. Several unique capabilities make Dataflow the leading choice:
19 |
20 | - **Developer ease of use with turnkey transforms:** Author complex ML pipelines
21 |   using utility transforms that can reduce lines of code by orders of magnitude (see the sketch after this list):
22 |   - [MLTransform](https://cloud.google.com/dataflow/docs/machine-learning/ml-preprocess-data)
23 |     helps you prepare your data for training machine learning models without
24 |     writing complex code or managing underlying libraries. MLTransform can
25 |     also generate embeddings that you can push into vector databases to run
26 |     inference.
27 |   - [RunInference](https://beam.apache.org/documentation/ml/about-ml/#use-runinference)
28 |     lets you efficiently use ML models in your pipelines, and includes a number
29 |     of optimizations under the hood that make it an essential part of any
30 |     streaming AI pipeline.
31 | - **Advanced stream processing**: Customers can implement advanced streaming
32 |   architectures using the open-source
33 |   [Apache Beam SDK](https://beam.apache.org/get-started/), which provides a rich
34 |   set of capabilities including state & timer APIs, transformations, side
35 |   inputs, enrichment, and a broad list of I/O connectors.
36 | - **Notebooks integration**: Develop your streaming AI pipeline in a
37 | notebook environment, which allows for interactive development and
38 | sampling unbounded data sources.
39 | - **Cost efficiency**: Run pipelines without wasting resources or incurring
40 |   cost overruns.
41 |   - [GPU support](https://cloud.google.com/dataflow/docs/gpu/gpu-support):
42 |     Accelerate your processing with GPUs, which can return results faster
43 |     for your most computationally demanding pipelines.
44 |   - [Right-fitting](https://cloud.google.com/dataflow/docs/guides/right-fitting):
45 |     Deploy pipelines on heterogeneous worker pools. Right-fitting lets you
46 |     allocate additional resources to individual stages in your pipeline,
47 |     avoiding wasteful over-provisioning for stages that don’t require the
48 |     same compute.
49 | - **Open-source compatibility**: Dataflow supports
50 |   [running inference with Gemma](https://cloud.google.com/dataflow/docs/machine-learning/gemma)
51 |   and integrates closely with
52 |   [TensorFlow Extended](https://www.tensorflow.org/tfx).
53 |   Customers can be confident that these pipelines can be ported to
54 |   any other execution engine with Apache Beam support.
55 |
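Below is a hedged sketch of how the two turnkey transforms named above fit into a Beam Python pipeline: `MLTransform` generates sentence embeddings and `RunInference` scores the same text with a Hugging Face model. The model names, columns, and artifact location are illustrative assumptions, and this is not the pipeline shipped in [`../pipelines/marketing_intelligence/`](../pipelines/marketing_intelligence/).

```python
# Hedged sketch of MLTransform and RunInference; model names, columns, and the
# artifact bucket are placeholders, not this repository's pipeline.
import apache_beam as beam
from apache_beam.ml.inference.base import RunInference
from apache_beam.ml.inference.huggingface_inference import (
    HuggingFacePipelineModelHandler)
from apache_beam.ml.transforms.base import MLTransform
from apache_beam.ml.transforms.embeddings.huggingface import (
    SentenceTransformerEmbeddings)

# RunInference model handler wrapping a Hugging Face text-classification pipeline.
sentiment_handler = HuggingFacePipelineModelHandler(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english")  # placeholder model

with beam.Pipeline() as p:
    reviews = p | "CreateReviews" >> beam.Create([
        {"review": "Loved the spring campaign, ordered twice."},
        {"review": "The discount code never worked for me."},
    ])

    # MLTransform: generate embeddings that could be pushed to a vector database.
    _ = (
        reviews
        | "Embed" >> MLTransform(
              write_artifact_location="gs://my-bucket/ml-artifacts"  # placeholder
          ).with_transform(SentenceTransformerEmbeddings(
              model_name="all-MiniLM-L6-v2",  # placeholder embedding model
              columns=["review"]))
        | "PrintEmbeddings" >> beam.Map(print)
    )

    # RunInference: score the raw review text with the sentiment model.
    _ = (
        reviews
        | "ExtractText" >> beam.Map(lambda row: row["review"])
        | "Sentiment" >> RunInference(sentiment_handler)
        | "PrintPredictions" >> beam.Map(print)
    )
```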
--------------------------------------------------------------------------------
/use_cases/guides/ads_analytics_dataflow_guide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/ads_analytics_dataflow_guide.pdf
--------------------------------------------------------------------------------
/use_cases/guides/anomaly_detection_dataflow_guide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/anomaly_detection_dataflow_guide.pdf
--------------------------------------------------------------------------------
/use_cases/guides/cdp_dataflow_guide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/cdp_dataflow_guide.pdf
--------------------------------------------------------------------------------
/use_cases/guides/clickstream_analytics_dataflow_guide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/clickstream_analytics_dataflow_guide.pdf
--------------------------------------------------------------------------------
/use_cases/guides/etl_dataflow_guide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/etl_dataflow_guide.pdf
--------------------------------------------------------------------------------
/use_cases/guides/gaming_analytics_dataflow_guide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/gaming_analytics_dataflow_guide.pdf
--------------------------------------------------------------------------------
/use_cases/guides/genai_ml_dataflow_guide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/genai_ml_dataflow_guide.pdf
--------------------------------------------------------------------------------
/use_cases/guides/iot_analytics_dataflow_guide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/iot_analytics_dataflow_guide.pdf
--------------------------------------------------------------------------------
/use_cases/guides/log_replication_dataflow_guide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/log_replication_dataflow_guide.pdf
--------------------------------------------------------------------------------
/use_cases/guides/market_intel_dataflow_guide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/market_intel_dataflow_guide.pdf
--------------------------------------------------------------------------------
/use_cases/one_pagers/anomaly_detection_dataflow_onepager.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/one_pagers/anomaly_detection_dataflow_onepager.pdf
--------------------------------------------------------------------------------
/use_cases/one_pagers/clickstream_dataflow_onepager.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/one_pagers/clickstream_dataflow_onepager.pdf
--------------------------------------------------------------------------------
/use_cases/one_pagers/etl_dataflow_onepager.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/one_pagers/etl_dataflow_onepager.pdf
--------------------------------------------------------------------------------
/use_cases/one_pagers/genai_ml_dataflow_onepager.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/one_pagers/genai_ml_dataflow_onepager.pdf
--------------------------------------------------------------------------------
/use_cases/one_pagers/iot_analytics_dataflowonepager.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/one_pagers/iot_analytics_dataflowonepager.pdf
--------------------------------------------------------------------------------
/use_cases/one_pagers/log_replication_dataflow_onepager.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/one_pagers/log_replication_dataflow_onepager.pdf
--------------------------------------------------------------------------------
/use_cases/one_pagers/market_intel_dataflow_onepager.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/one_pagers/market_intel_dataflow_onepager.pdf
--------------------------------------------------------------------------------