├── .github ├── pull_request_template.md └── workflows │ ├── pull_request.yml │ └── terraform_plan.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── pipelines ├── .DS_Store ├── .gitignore ├── README.md ├── anomaly_detection │ ├── Dockerfile │ ├── LICENSE │ ├── MANIFEST.in │ ├── README.md │ ├── anomaly_detection_pipeline │ │ ├── __init__.py │ │ ├── options.py │ │ └── pipeline.py │ ├── cloudbuild.yaml │ ├── main.py │ ├── requirements-dev.txt │ ├── requirements.txt │ └── setup.py ├── cdp │ ├── Dockerfile │ ├── LICENSE │ ├── MANIFEST.in │ ├── README.md │ ├── cdp_pipeline │ │ ├── __init__.py │ │ ├── customer_data_platform.py │ │ ├── generate_transaction_data.py │ │ └── options.py │ ├── cloudbuild.yaml │ ├── input_data │ │ ├── coupon_redempt.csv │ │ └── transaction_data.csv │ ├── main.py │ ├── requirements.txt │ ├── schema │ │ └── unified_table.json │ ├── scripts │ │ ├── 01_cloudbuild_and_push_container.sh │ │ ├── 02_run_dataflow_job.sh │ │ └── run.sh │ └── setup.py ├── clickstream_analytics_java │ ├── .gitattributes │ ├── .gitignore │ ├── README.md │ ├── build.gradle │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ ├── gradlew │ ├── gradlew.bat │ ├── scripts │ │ └── 01_launch_pipeline.sh │ └── src │ │ └── main │ │ ├── java │ │ └── com │ │ │ └── google │ │ │ └── cloud │ │ │ └── dataflow │ │ │ └── solutions │ │ │ └── clickstream_analytics │ │ │ ├── BigTableEnrichment.java │ │ │ ├── ClickstreamPubSubToBq.java │ │ │ ├── JsonToTableRows.java │ │ │ └── Metrics.java │ │ └── resources │ │ └── streaming_source_deadletter_table_schema.json ├── etl_integration_java │ ├── .gitignore │ ├── README.md │ ├── build.gradle │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ ├── gradlew │ ├── gradlew.bat │ ├── imgs │ │ └── etl_integration.png │ ├── scripts │ │ ├── .gitignore │ │ ├── 02_run_publisher_dataflow.sh │ │ └── 03_run_changestream_template.sh │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── google │ │ └── cloud │ │ └── dataflow │ │ └── solutions │ │ ├── ETLIntegration.java │ │ ├── data │ │ ├── SchemaUtils.java │ │ └── TaxiObjects.java │ │ ├── load │ │ └── Spanner.java │ │ ├── options │ │ └── SpannerPublisherOptions.java │ │ └── transform │ │ ├── RowToError.java │ │ └── TaxiEventProcessor.java ├── imgs │ ├── anomaly_detect_arch.png │ ├── cdp.png │ ├── iot_analytics.png │ ├── log_replication.png │ ├── market_intel.png │ └── ml_ai_arch.png ├── iot_analytics │ ├── Dockerfile │ ├── LICENCE │ ├── MANIFEST.in │ ├── cloudbuild.yaml │ ├── iot_analytics_pipeline │ │ ├── __init__.py │ │ ├── aggregate_metrics.py │ │ ├── maintenance_model.pkl │ │ ├── options.py │ │ ├── parse_timestamp.py │ │ ├── pipeline.py │ │ └── trigger_inference.py │ ├── main.py │ ├── maintenance_model.pkl │ ├── readme.md │ ├── requirements.txt │ ├── scripts │ │ ├── 01_cloud_build_and_push.sh │ │ ├── 02_submit_job.sh │ │ ├── create_and_populate_bigtable.py │ │ ├── create_data.py │ │ ├── maintenance_data.jsonl │ │ ├── model.py │ │ ├── publish_on_pubsub.py │ │ └── vehicle_data.jsonl │ └── setup.py ├── log_replication_splunk │ ├── README.md │ └── scripts │ │ ├── .gitignore │ │ └── 01_launch_ps_to_splunk.sh ├── marketing_intelligence │ ├── Dockerfile │ ├── LICENSE │ ├── MANIFEST.in │ ├── README.md │ ├── cloudbuild.yaml │ ├── main.py │ ├── marketing_intelligence_pipeline │ │ ├── __init__.py │ │ ├── options.py │ │ └── pipeline.py │ ├── requirements.txt │ ├── scripts │ │ ├── .gitignore │ │ ├── 01_build_and_push_container.sh │ │ └── 
02_run_dataflow.sh │ └── setup.py ├── ml_ai_python │ ├── Dockerfile │ ├── LICENSE │ ├── MANIFEST.in │ ├── README.md │ ├── cloudbuild.yaml │ ├── main.py │ ├── ml_ai_pipeline │ │ ├── __init__.py │ │ ├── model_handlers.py │ │ ├── options.py │ │ └── pipeline.py │ ├── requirements-dev.txt │ ├── requirements.txt │ ├── scripts │ │ ├── .gitignore │ │ ├── 01_build_and_push_container.sh │ │ └── 02_run_dataflow.sh │ └── setup.py └── pylintrc ├── renovate.json ├── terraform ├── .gitignore ├── README.md ├── anomaly_detection │ ├── README.md │ ├── main.tf │ └── variables.tf ├── cdp │ ├── README.md │ ├── main.tf │ └── variables.tf ├── clickstream_analytics │ ├── README.md │ ├── main.tf │ └── variables.tf ├── etl_integration │ ├── README.md │ ├── main.tf │ └── variables.tf ├── iot_analytics │ ├── README.md │ ├── main.tf │ └── variables.tf ├── log_replication_splunk │ ├── README.md │ ├── main.tf │ └── variables.tf ├── marketing_intelligence │ ├── README.md │ ├── main.tf │ └── variables.tf └── ml_ai │ ├── README.md │ ├── main.tf │ └── variables.tf └── use_cases ├── Anomaly_Detection.md ├── CDP.md ├── Clickstream_Analytics.md ├── ETL_integration.md ├── GenAI_ML.md ├── IoT_Analytics.md ├── Log_replication.md ├── Marketing_Intelligence.md ├── guides ├── ads_analytics_dataflow_guide.pdf ├── anomaly_detection_dataflow_guide.pdf ├── cdp_dataflow_guide.pdf ├── clickstream_analytics_dataflow_guide.pdf ├── etl_dataflow_guide.pdf ├── gaming_analytics_dataflow_guide.pdf ├── genai_ml_dataflow_guide.pdf ├── iot_analytics_dataflow_guide.pdf ├── log_replication_dataflow_guide.pdf └── market_intel_dataflow_guide.pdf └── one_pagers ├── anomaly_detection_dataflow_onepager.pdf ├── clickstream_dataflow_onepager.pdf ├── etl_dataflow_onepager.pdf ├── genai_ml_dataflow_onepager.pdf ├── iot_analytics_dataflowonepager.pdf ├── log_replication_dataflow_onepager.pdf └── market_intel_dataflow_onepager.pdf /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | Thanks for sending your first pull request. Please remove this text before submitting the pull request. 2 | 3 | Make sure that: 4 | * Read 📰 5 | * You have read the [CONTRIBUTING.md]([url](https://github.com/GoogleCloudPlatform/dataflow-solution-guides/blob/main/CONTRIBUTING.md)) file. 6 | * Run in the cloud ☁️ 7 | * You have run all your code in Google Cloud and it is working (even if it is not complete yet) 8 | * Code style 🎩 9 | * You have check the code style and quality commands given in the CONTRIBUTING.md file, and your code passess those checks. 10 | * Using Python? 🔍 11 | * If you are submitting a Python pipeline, it needs to have a `setup.py` file in the top level directory of your pipeline. 12 | * Using Java? 🔍 13 | * If you are submitting a Java pipeline, please use Gradle with `spotless` and `errorprone`. Use the `etl_integration_java` pipeline as an example (the `CONTRIBUTING.md` file has the details of the files you can copy to use as a template for your pipeline build). 14 | 15 | Please remove all this text before sending your pull request. 16 | 17 | Thanks for your contribution! 18 | -------------------------------------------------------------------------------- /.github/workflows/terraform_plan.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | name: 'Terraform deploy' 16 | on: 17 | workflow_dispatch: 18 | inputs: 19 | prnumber: 20 | type: string 21 | description: PR number to build 22 | permissions: 23 | contents: read 24 | id-token: write 25 | jobs: 26 | terraform-plan: 27 | name: 'Terraform plan with Google Cloud' 28 | runs-on: ubuntu-latest 29 | concurrency: 30 | group: terraform-plan-group 31 | cancel-in-progress: true 32 | defaults: 33 | run: 34 | shell: bash 35 | steps: 36 | - name: Checkout 37 | uses: actions/checkout@v4 38 | with: 39 | ref: 'refs/pull/${{ github.event.inputs.prnumber }}/head' 40 | - uses: dorny/paths-filter@v3 41 | id: filter 42 | with: 43 | filters: "Terraform:\n - 'terraform/**' \n" 44 | - name: 'Google Cloud auth' 45 | uses: 'google-github-actions/auth@v2' 46 | with: 47 | project_id: '${{ secrets.TESTING_PROJECT }}' 48 | workload_identity_provider: '${{ secrets.WIF_PROVIDER }}' 49 | service_account: '${{ secrets.WIF_SERVICE_ACCOUNT }}' 50 | - name: Setup Terraform 51 | uses: hashicorp/setup-terraform@v3 52 | - name: Terraform Init 53 | working-directory: terraform 54 | run: | 55 | ls -d */ | while read d 56 | do 57 | echo "Running tf init in directory: $d" 58 | cd $d && terraform init && cd .. 59 | done 60 | - name: Terraform Plan 61 | working-directory: terraform 62 | run: |- 63 | ls -d */ | while read d 64 | do 65 | echo "Running tf plan in directory: $d" 66 | cd $d 67 | echo 'project_create = "false"' > terraform.tfvars 68 | echo 'project_id = "${{ secrets.TESTING_PROJECT }}"' >> terraform.tfvars 69 | echo 'region = "us-central1"' >> terraform.tfvars 70 | terraform plan -input=false 71 | cd .. 72 | done 73 | - name: Terraform Apply and destroy 74 | working-directory: terraform 75 | run: |- 76 | ls -d */ | while read d 77 | do 78 | echo "Running tf apply in directory: $d" 79 | cd $d 80 | echo 'project_create = "false"' > terraform.tfvars 81 | echo 'project_id = "${{ secrets.TESTING_PROJECT }}"' >> terraform.tfvars 82 | echo 'region = "us-central1"' >> terraform.tfvars 83 | terraform apply -input=false -auto-approve 84 | terraform destroy -input=false -auto-approve 85 | cd .. 86 | done 87 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to <https://cla.developers.google.com/> to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again.
17 | 18 | ## Run in Dataflow and Google Cloud 19 | 20 | Before submitting your contribution, make sure that all your code runs correctly 21 | in Google Cloud, including any Terraform code and any pipeline you write. 22 | 23 | ## Code Quality Checks 24 | 25 | ### For Python code 26 | 27 | You normally will write Python code in a subdirectory of the `pipelines` folder. 28 | Install `yapf` and run the following command in the top level directory of your 29 | pipeline, to reformat your code: 30 | 31 | ```shell 32 | yapf -i -r --style yapf . 33 | ``` 34 | 35 | If you install `pylint`, you can check if your code will pass the build with the 36 | following command: 37 | 38 | ```shell 39 | pylint --rcfile ../pylintrc . 40 | ``` 41 | 42 | Please note that the configuration file `../pylintrc` is located in the 43 | `pipelines` folder. 44 | 45 | ### For Java code 46 | 47 | Make sure you are using Gradle with the same settings as the existing pipelines 48 | (e.g. use `pipelines/etl_integration_java` as an example), and run the following 49 | command to make sure your build passes: 50 | 51 | ```shell 52 | ./gradlew build 53 | ``` 54 | 55 | If you find code style issues, run this command to fix them: 56 | 57 | ```shell 58 | ./gradlew spotlessApply 59 | ``` 60 | 61 | 62 | You can use the following files to copy the Gradle settings to your pipeline: 63 | * `build.gradle` 64 | * `gradlew` and `gradlew.bat` 65 | * The directory `gradle` and all its contents. 66 | 67 | ### For Terraform code 68 | 69 | Run the following command in the top level directory where your Terraform code is located: 70 | 71 | ```shell 72 | terraform fmt 73 | ``` 74 | 75 | You can also check for other types of issues with your Terraform code by using the 76 | `terraform validate` command (but bear in mind that you need to run the `terraform init` command first). 77 | 78 | ## Code Reviews 79 | 80 | All submissions, including submissions by project members, require review. We 81 | use GitHub pull requests for this purpose. Consult 82 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 83 | information on using pull requests. 84 | 85 | ## Community Guidelines 86 | 87 | This project follows [Google's Open Source Community 88 | Guidelines](https://opensource.google/conduct/). 89 | 90 | ## Contributor Guide 91 | 92 | If you are new to contributing to open source, you can find helpful information in this contributor guide. 93 | 94 | You may follow these steps to contribute: 95 | 96 | 1. **Fork the official repository.** This will create a copy of the official repository in your own account. 97 | 2. **Sync the branches.** This will ensure that your copy of the repository is up-to-date with the latest changes from the official repository. 98 | 3. **Work on your forked repository's feature branch.** This is where you will make your changes to the code. 99 | 4. **Commit your updates on your forked repository's feature branch.** This will save your changes to your copy of the repository. 100 | 5. **Submit a pull request to the official repository's main branch.** This will request that your changes be merged into the official repository. 101 | 6. **Resolve any lint errors.** This will ensure that your changes are formatted correctly. 102 | 103 | Here are some additional things to keep in mind during the process: 104 | 105 | - **Read [Google's Open Source Community Guidelines](https://opensource.google/conduct/).** These guidelines will give you more information about the project and how to contribute.
106 | - **Test your changes.** Before you submit a pull request, make sure that your changes work as expected. 107 | - **Be patient.** It may take some time for your pull request to be reviewed and merged. 108 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dataflow Solution Guides 2 | 3 | [![Build and validation](https://github.com/GoogleCloudPlatform/dataflow-solution-guides/actions/workflows/pull_request.yml/badge.svg)](https://github.com/GoogleCloudPlatform/dataflow-solution-guides/actions/workflows/pull_request.yml) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) 4 | 5 | Welcome to the Dataflow Solution Guides! 6 | 7 | The Dataflow Solution Guides offer full end-to-end deployment for the most 8 | common streaming solutions to run 9 | on [Dataflow](https://cloud.google.com/dataflow/). 10 | 11 | This repository contains the following assets for each guide: 12 | 13 | - Full Terraform code to spawn all the necessary Google Cloud infrastructure 14 | - Pipelines code in Python, Java and Go (coming soon) for a 15 | sample pipeline for each use case 16 | 17 | ## Solution guides 18 | 19 | This the list of solution guides available at this moment: 20 | 21 | | Guide | Description | Development status | 22 | | :-------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------: | :-----------------------: | 23 | | [GenAI & Machine Learning Inference](./use_cases/GenAI_ML.md) | Real-time inference with local GenAI models, using a GPU | Ready :white_check_mark: | 24 | | [ETL / Integration](./use_cases/ETL_integration.md) | Real-time change data capture from a Spanner database to BigQuery | Ready :white_check_mark: | 25 | | [Log Replication & Analytics](./use_cases/Log_replication.md) | Real-time log replication into Splunk | Beta :factory: | 26 | | [Marketing Intelligence](./use_cases/Marketing_Intelligence.md) | Real-time marketing intelligence, using an AutoML model deployed in Vertex | Beta :factory: | 27 | | [Clickstream Analytics](./use_cases/Clickstream_Analytics.md) | Real-time clickstream analytics with Bigtable enrichment / data hydration | Work in progress :hammer: | 28 | | [IoT Analytics](./use_cases/IoT_Analytics.md) | Real-time Internet of Things (IoT) analytics with Bigtable enrichment & models deployed in Vertex AI | Work in progress :hammer: | 29 | | [Anomaly Detection](./use_cases/Anomaly_Detection.md) |Real-time detection of anomalies in a stream of data leveraging GenAI with models deployed in Vertex AI | Beta :factory: | 30 | | [Customer Data Platform](./use_cases/CDP.md) | Real-time customer data platform that unifies a customer view from different sources. | Beta :factory: | 31 | | [Gaming Analytics](./use_cases/gaming_analytics.md) | Real-time analyis of gaming data to enhance live gameplay & offer targeting | Beta :factory: | 32 | 33 | 34 | 35 | ## Repository structure 36 | 37 | - `terraform`: This directory contains the Terraform code for deploying the 38 | necessary Google Cloud 39 | infrastructure for each use case. 40 | - `pipelines`: This directory contains the Python, Java, and Go code for the 41 | sample pipelines. 42 | - `use_cases`: This directory contains the documentation of each use case 43 | 44 | ## Getting help 45 | 46 | - GitHub Issues: Report any issues or ask questions on the GitHub repository. 
47 | - https://github.com/GoogleCloudPlatform/dataflow-solution-guides/issues 48 | - Stack Overflow: Search for existing solutions or ask questions on Stack 49 | Overflow using the `google-cloud-dataflow` tag: 50 | - https://stackoverflow.com/questions/tagged/google-cloud-dataflow 51 | 52 | ## Contributing 53 | 54 | Your contributions to this repository are welcome. 55 | 56 | - Fork and Pull Request: Fork the repository and submit a pull request with your 57 | changes. 58 | - Follow the Contribution Guidelines: Please follow the contribution guidelines 59 | outlined in the 60 | [CONTRIBUTING.md](CONTRIBUTING.md) file. 61 | 62 | ## Disclaimer 63 | 64 | This is not an officially supported Google product. The code in this repository 65 | is for demonstrative purposes only. 66 | -------------------------------------------------------------------------------- /pipelines/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/.DS_Store -------------------------------------------------------------------------------- /pipelines/.gitignore: -------------------------------------------------------------------------------- 1 | # Gemma model 2 | ./gemma_2b 3 | 4 | # IDEs 5 | .vscode/ 6 | .idea/ 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # Virtual environments. 37 | .env 38 | .venv 39 | env/ 40 | venv/ 41 | ENV/ 42 | env.bak/ 43 | venv.bak/ 44 | -------------------------------------------------------------------------------- /pipelines/README.md: -------------------------------------------------------------------------------- 1 | ## Pipelines 2 | 3 | This directory contains sample pipelines for the solution guides. These 4 | pipelines demonstrate how 5 | to use Dataflow to process data in streaming for each one of the use cases. 6 | 7 | The pipelines are written in Python, Java (coming soon), and Go (coming soon). 8 | Each pipeline 9 | includes a README file that provides a detailed description of the pipeline, 10 | including its purpose, 11 | inputs, outputs, and configuration options. 12 | 13 | ## Getting Started 14 | 15 | To get started with the pipelines, follow these steps: 16 | 17 | 1. Choose the pipeline that best suits your needs. 18 | 2. Read the README file for the pipeline to understand its purpose, inputs, 19 | outputs, 20 | and configuration options. MAke sure that you have the necessary 21 | infrastructure ready, using the 22 | corresponding deployment scripts in the `terraform` directory. 23 | 3. Modify the pipeline code to meet your specific requirements. 24 | 4. Run the pipeline using the provided scripts. 
25 | 26 | ## Pipelines 27 | 28 | These are the pipelines included in this directory 29 | 30 | | Use case | Programming language | Location | 31 | | :--------------------: | :------------------: | :---------------------------------------------------------: | 32 | | ML & GenAI | Python | [ml_ai_python](./ml_ai_python) | 33 | | ETL & Integration | Java | [etl_integration_java](./etl_integration_java) | 34 | | Customer Data Platform | Python | [cdp](./cdp) | 35 | | Anomaly detection | Python | [anomaly_detection](./anomaly_detection) | 36 | | Marketing Intelligence | Python | [marketing_intelligence](./marketing_intelligence/) | 37 | | Log replication | Dataflow template | [log_replication_splunk](./log_replication_splunk/) | 38 | | Clickstream Analytics | Java | [clickstream_analytics_java](./clickstream_analytics_java/) | 39 | -------------------------------------------------------------------------------- /pipelines/anomaly_detection/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ARG SERVING_BUILD_IMAGE=tensorflow/tensorflow:2.18.0-gpu 16 | FROM ${SERVING_BUILD_IMAGE} 17 | WORKDIR /workspace 18 | 19 | RUN apt-get update -y && apt-get install -y \ 20 | cmake 21 | 22 | COPY requirements.txt requirements.txt 23 | COPY main.py main.py 24 | COPY anomaly_detection_pipeline anomaly_detection_pipeline 25 | COPY MANIFEST.in MANIFEST.in 26 | COPY setup.py setup.py 27 | 28 | RUN pip install --upgrade --no-cache-dir pip \ 29 | && pip install --no-cache-dir -r requirements.txt \ 30 | && pip install --no-cache-dir -e . 31 | 32 | # Copy files from official SDK image, including script/dependencies. 33 | COPY --from=apache/beam_python3.11_sdk:2.63.0 /opt/apache/beam /opt/apache/beam 34 | 35 | 36 | ENV KERAS_BACKEND="tensorflow" 37 | 38 | # Set the entrypoint to Apache Beam SDK launcher. 39 | ENTRYPOINT ["/opt/apache/beam/boot"] -------------------------------------------------------------------------------- /pipelines/anomaly_detection/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt -------------------------------------------------------------------------------- /pipelines/anomaly_detection/README.md: -------------------------------------------------------------------------------- 1 | # Anomaly Detection sample pipeline (Python) 2 | This sample pipeline demonstrates how to use Dataflow to process data, and detect anomalies 3 | using GenAI. 4 | This pipeline is written in Python. 5 | 6 | This pipeline is part of the [Dataflow Anomaly Detection solution guide](../../use_cases/Anomaly_Detection.md). 
7 | 8 | ## Architecture 9 | 10 | The generic architecture for an anomaly detection pipeline looks like as follows: 11 | 12 | ![Architecture](../imgs/anomaly_detect_arch.png) 13 | 14 | In this directory, you will find a specific implementation of the above architecture, with the 15 | following stages: 16 | 17 | 1. **Data ingestion:** Reads data from a Pub/Sub topic. 18 | 2. **Data preprocessing:** The sample pipeline does not do any transformation, but it is trivial 19 | to add a preprocessing step leveraging 20 | [the Enrichment transform](https://cloud.google.com/dataflow/docs/guides/enrichment) to perform 21 | feature engineering before calling the model. 22 | 3. **Inference:** Uses the RunInference transform with a model handler, using Keras and Tensorflow, to call the fraud detection model. The pipeline uses a GPU with the Dataflow worker, to speed up the inference. 23 | 4. **Detections:** The detections are sent to another Pub/Sub topic as output. 24 | 25 | 26 | ## Selecting the cloud region 27 | 28 | Not all the resources may be available in all the regions. The default values included in this 29 | directory have been tested using `us-central1` as region. 30 | 31 | The file `cloudbuild.yaml` is using the machine type `E2_HIGHCPU_8` as the default machine type. If 32 | that's not available in your preferred region, try with other machine types that are available 33 | in Cloud Build: 34 | * https://cloud.google.com/build/docs/api/reference/rest/v1/projects.builds#machinetype 35 | 36 | Moreover, the file `scripts/00_set_environment.sh` specifies a machine type for the Datalow workers. 37 | The selected machine type, `g2-standard-4`, is the recommended one for inference with GPU. If that 38 | type is not available in your region, you can check what machines are available to use with the 39 | following command: 40 | 41 | ```sh 42 | gcloud compute machine-types list --zones=,,... 43 | ``` 44 | 45 | See more info about selecting the right type of machine in the following link: 46 | * https://cloud.google.com/compute/docs/machine-resource 47 | 48 | ## How to launch the pipeline 49 | 50 | All the scripts are located in the `scripts` directory and prepared to be launched from the top 51 | sources directory. 52 | 53 | In the script `scripts/00_set_environment.sh`, define the value of the project id and the region variable: 54 | 55 | ``` 56 | export PROJECT= 57 | export REGION= 58 | ``` 59 | 60 | Leave the rest of variables untouched, although you can override them if you prefer. 61 | 62 | After you edit the script, load those variables into the environment 63 | 64 | ```sh 65 | source scripts/00_set_environment.sh 66 | ``` 67 | 68 | And then run the script that builds and publishes the custom Dataflow container. This container will 69 | contain all the required dependencies. 70 | 71 | ```sh 72 | ./scripts/01_build_and_push_container.sh 73 | ``` 74 | 75 | This will create a Cloud Build job that can take a few minutes to complete. Once it completes, you 76 | can trigger the pipeline with the following: 77 | 78 | ```sh 79 | ./scripts/02_run_dataflow.sh 80 | ``` 81 | 82 | ## Input data 83 | 84 | To send data into the pipeline, you need to publish messages in the `messages` topic. Those 85 | messages are passed "as is" to the model. 86 | 87 | ## Output data 88 | 89 | The predictions are published into the topic `detections`, and can be observed using the 90 | subscription `detections-sub`. 
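
If you want to smoke-test the pipeline end to end, the sketch below publishes one test message and then pulls any detections. It is only an illustration: the payload shown is an arbitrary example (the pipeline forwards messages to the model unchanged), and it assumes the `messages` topic and `detections-sub` subscription created by the Terraform code of this guide, plus the `PROJECT` variable exported by `scripts/00_set_environment.sh`.

```python
# Minimal smoke test: publish one message and read back any detections.
# The payload below is just an example; the pipeline passes it to the model as-is.
import json
import os

from google.cloud import pubsub_v1

project = os.environ["PROJECT"]  # exported by scripts/00_set_environment.sh

# Publish a test message to the input topic.
publisher = pubsub_v1.PublisherClient()
topic_path = publisher.topic_path(project, "messages")
payload = json.dumps({"transaction_id": "tx-0001", "amount": 42.0}).encode("utf-8")
publisher.publish(topic_path, payload).result()

# Pull any detections produced by the pipeline.
subscriber = pubsub_v1.SubscriberClient()
subscription_path = subscriber.subscription_path(project, "detections-sub")
response = subscriber.pull(
    request={"subscription": subscription_path, "max_messages": 10}, timeout=30)
for received in response.received_messages:
    print(received.message.data.decode("utf-8"))
if response.received_messages:
    subscriber.acknowledge(
        request={
            "subscription": subscription_path,
            "ack_ids": [m.ack_id for m in response.received_messages],
        })
```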
-------------------------------------------------------------------------------- /pipelines/anomaly_detection/anomaly_detection_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /pipelines/anomaly_detection/anomaly_detection_pipeline/options.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Options class for the Anomaly Detection pipeline. 16 | """ 17 | 18 | from argparse import ArgumentParser 19 | 20 | from apache_beam.options.pipeline_options import PipelineOptions 21 | 22 | 23 | class MyPipelineOptions(PipelineOptions): 24 | 25 | @classmethod 26 | def _add_argparse_args(cls, parser: ArgumentParser): 27 | parser.add_argument("--messages_subscription", type=str) 28 | parser.add_argument("--model_endpoint", type=str) 29 | parser.add_argument("--project", type=str) 30 | parser.add_argument("--location", type=str) 31 | parser.add_argument("--responses_topic", type=str) 32 | -------------------------------------------------------------------------------- /pipelines/anomaly_detection/anomaly_detection_pipeline/pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Anomaly Detection Apache Beam pipeline. 
16 | """ 17 | 18 | from apache_beam import Pipeline, PCollection 19 | from apache_beam.ml.inference import RunInference 20 | from apache_beam.io.gcp import pubsub 21 | 22 | import apache_beam as beam 23 | from apache_beam.ml.inference.base import PredictionResult 24 | from apache_beam.ml.inference.vertex_ai_inference import VertexAIModelHandlerJSON 25 | 26 | from .options import MyPipelineOptions 27 | 28 | 29 | def _format_output(element: PredictionResult) -> str: 30 | return f"Input: \n{element.example}, \n\n\nOutput: \n{element.inference}" 31 | 32 | 33 | @beam.ptransform_fn 34 | def _extract(p: Pipeline, subscription: str) -> PCollection[str]: 35 | msgs: PCollection[bytes] = p | "Read subscription" >> beam.io.ReadFromPubSub( 36 | subscription=subscription) 37 | return msgs | "Parse" >> beam.Map(lambda x: x.decode("utf-8")) 38 | 39 | 40 | @beam.ptransform_fn 41 | def _transform(msgs: PCollection[str], model_endpoint: str, project: str, 42 | location: str) -> PCollection[str]: 43 | model_handler = VertexAIModelHandlerJSON( 44 | endpoint_id=model_endpoint, project=project, location=location) 45 | preds: PCollection[ 46 | PredictionResult] = msgs | "RunInference-vertexai" >> RunInference( 47 | model_handler) 48 | return preds | "Format Output" >> beam.Map(_format_output) 49 | 50 | 51 | def create_pipeline(options: MyPipelineOptions) -> Pipeline: 52 | """ Create the pipeline object. 53 | 54 | Args: 55 | options: The pipeline options, with type `MyPipelineOptions`. 56 | 57 | Returns: 58 | The pipeline object. 59 | """ 60 | pipeline = beam.Pipeline(options=options) 61 | # Extract 62 | transactions: PCollection[str] = pipeline | "Read" >> _extract( 63 | subscription=options.messages_subscription) 64 | # Transform 65 | responses: PCollection[str] = transactions | "Transform" >> _transform( 66 | model_endpoint=options.model_endpoint, 67 | project=options.project, 68 | location=options.location) 69 | # Load 70 | responses | "Publish Result" >> pubsub.WriteStringsToPubSub( 71 | topic=options.responses_topic) 72 | 73 | return pipeline 74 | -------------------------------------------------------------------------------- /pipelines/anomaly_detection/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | steps: 16 | - name: 'gcr.io/cloud-builders/docker' 17 | script: | 18 | docker build -t ${_TAG} . 
19 | substitutions: 20 | _TAG: unset 21 | options: 22 | substitutionOption: 'ALLOW_LOOSE' 23 | automapSubstitutions: true 24 | machineType: E2_HIGHCPU_8 25 | images: 26 | - ${_TAG} -------------------------------------------------------------------------------- /pipelines/anomaly_detection/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | An Anomaly Detection example for the Dataflow Solution Guides. 17 | """ 18 | 19 | import time 20 | 21 | from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions 22 | 23 | from anomaly_detection_pipeline.options import MyPipelineOptions 24 | from anomaly_detection_pipeline.pipeline import create_pipeline 25 | 26 | 27 | def main(options: MyPipelineOptions): 28 | pipeline = create_pipeline(options) 29 | pipeline.run() 30 | 31 | 32 | if __name__ == "__main__": 33 | pipeline_options: PipelineOptions = PipelineOptions() 34 | dataflow_options: GoogleCloudOptions = pipeline_options.view_as( 35 | GoogleCloudOptions) 36 | now_epoch_ms = int(time.time() * 1000) 37 | dataflow_options.job_name = f"anomaly-detection-pipeline-{now_epoch_ms}" 38 | custom_options: MyPipelineOptions = pipeline_options.view_as( 39 | MyPipelineOptions) 40 | main(custom_options) 41 | -------------------------------------------------------------------------------- /pipelines/anomaly_detection/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | tensorflow==2.18.0 -------------------------------------------------------------------------------- /pipelines/anomaly_detection/requirements.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apache-beam[gcp]==2.63.0 16 | keras_nlp==0.19.2 17 | keras==3.9.0 18 | protobuf==4.25.6 -------------------------------------------------------------------------------- /pipelines/anomaly_detection/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Setup file for the Anomaly Detection pipeline. 16 | """ 17 | 18 | from setuptools import setup, find_packages 19 | 20 | with open("requirements.txt", encoding="utf-8") as f: 21 | requirements = f.readlines() 22 | 23 | setup( 24 | name="Dataflow Solution for Anomaly Detection pipelines", 25 | version="0.1", 26 | description="Anomaly Detection example for the Dataflow Solution Guides.", 27 | packages=find_packages(), 28 | install_requires=requirements, 29 | ) 30 | -------------------------------------------------------------------------------- /pipelines/cdp/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM apache/beam_python3.11_sdk:2.63.0 16 | WORKDIR /workspace 17 | 18 | RUN apt-get update -y && apt-get install -y \ 19 | cmake 20 | 21 | COPY requirements.txt requirements.txt 22 | COPY main.py main.py 23 | COPY cdp_pipeline cdp_pipeline 24 | COPY schema schema 25 | COPY MANIFEST.in MANIFEST.in 26 | COPY setup.py setup.py 27 | 28 | RUN pip install --upgrade --no-cache-dir pip \ 29 | && pip install --no-cache-dir -r requirements.txt \ 30 | && pip install --no-cache-dir -e . 31 | 32 | # Copy files from official SDK image, including script/dependencies. 33 | COPY --from=apache/beam_python3.11_sdk:2.63.0 /opt/apache/beam /opt/apache/beam 34 | 35 | # Set the entrypoint to Apache Beam SDK launcher. 
36 | ENTRYPOINT ["/opt/apache/beam/boot"] -------------------------------------------------------------------------------- /pipelines/cdp/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt -------------------------------------------------------------------------------- /pipelines/cdp/README.md: -------------------------------------------------------------------------------- 1 | # Customer Data Platform sample pipeline (Python) 2 | 3 | This sample pipeline demonstrates how to use Dataflow to process streaming data in order to build a customer data platform. The pipeline reads data from multiple streaming sources (two Pub/Sub topics in this sample), joins the data, and writes it to a BigQuery table for later analytics. 4 | 5 | This pipeline is part of the [Dataflow Customer Data Platform solution guide](../../use_cases/CDP.md). 6 | 7 | ## Architecture 8 | 9 | The generic architecture for a customer data platform pipeline looks as follows: 10 | 11 | ![Architecture](../imgs/cdp.png) 12 | 13 | In this directory, you will find a specific implementation of the above architecture, with the 14 | following stages: 15 | 16 | 1. **Data ingestion:** Reads data from two Pub/Sub topics. 17 | 2. **Data preprocessing:** The sample pipeline joins the data from the two Pub/Sub topics on key fields, to showcase how customer data from different sources is unified and stored in one place. 18 | 3. **Output data:** The final processed data is appended to a BigQuery table. 19 | 20 | ## Selecting the cloud region 21 | 22 | Not all the resources may be available in all the regions. The default values included in this 23 | directory have been tested using `us-central1` as region. 24 | 25 | Moreover, the file `scripts/00_set_variables.sh` specifies a machine type for the Dataflow workers. 26 | The selected machine type, `e2-standard-8`, is the one that we used for the data unification. If that 27 | type is not available in your region, you can check what machines are available to use with the 28 | following command: 29 | 30 | ```sh 31 | gcloud compute machine-types list --zones=,,... 32 | ``` 33 | 34 | See more info about selecting the right type of machine in the following link: 35 | * https://cloud.google.com/compute/docs/machine-resource 36 | 37 | ## How to launch the pipeline 38 | 39 | All the scripts are located in the `scripts` directory and prepared to be launched from the top 40 | sources directory. 41 | 42 | In the script `scripts/00_set_variables.sh`, define the values of the project id and the region variables: 43 | 44 | ``` 45 | export PROJECT= 46 | export REGION= 47 | ``` 48 | 49 | Leave the rest of the variables untouched, although you can override them if you prefer. 50 | 51 | After you edit the script, load those variables into the environment: 52 | 53 | ```sh 54 | source scripts/00_set_variables.sh 55 | ``` 56 | 57 | Then run the script that builds and publishes the custom Dataflow container. This container will 58 | contain all the required dependencies. 59 | 60 | ```sh 61 | ./scripts/01_cloudbuild_and_push_container.sh 62 | ``` 63 | 64 | This will create a Cloud Build job that can take a few minutes to complete. Once it completes, you 65 | can trigger the pipeline with the following: 66 | 67 | ```sh 68 | ./scripts/02_run_dataflow_job.sh 69 | ``` 70 | You can also run the script below instead of the three steps above.
71 | 72 | ```sh 73 | ./scripts/run.sh 74 | ``` 75 | 76 | ## Input data 77 | 78 | To send data into the pipeline, you need to publish messages to the `transactions` and `coupon_redemption` topics. 79 | Run the Python script below to publish data to these Pub/Sub topics. The script reads sample data from GCS buckets and publishes it to the Pub/Sub topics, to create a real-time streaming environment for this use case. You can update the GCS bucket location to match your environment. For reference, the input files are included in the folder ./input_data/. 80 | 81 | ```sh 82 | python3 ./cdp_pipeline/generate_transaction_data.py 83 | ``` 84 | 85 | ## Output data 86 | 87 | The unified data from the two Pub/Sub topics is written to the BigQuery table `output_dataset.unified-table`. -------------------------------------------------------------------------------- /pipelines/cdp/cdp_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /pipelines/cdp/cdp_pipeline/generate_transaction_data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | A data generator for the Customer Data Platform analytics pipeline.
16 | """ 17 | 18 | from google.cloud import pubsub_v1 19 | import json 20 | import pandas as pd 21 | import asyncio 22 | 23 | 24 | async def publish_coupons_to_pubsub(): 25 | bucket_name = "" 26 | project_id = "" 27 | 28 | # Example: ["27601281299","27757099033","28235291311","27021203242","27101290145","27853175697"] 29 | transactions_id = [ 30 | "" 31 | ] 32 | transactions_topic_name = "transactions" 33 | # Reference example - "dataflow-solution-guide-cdp/input_data/transaction_data.csv" 34 | transactions_data = "" 35 | 36 | coupons_topic_name = "coupon_redemption" 37 | # reference example - "dataflow-solution-guide-cdp/input_data/coupon_redempt.csv" 38 | coupons_data = "" 39 | 40 | transactions_df = pd.read_csv( 41 | f"gs://{bucket_name}/{transactions_data}", dtype=str) 42 | coupons_df = pd.read_csv(f"gs://{bucket_name}/{coupons_data}", dtype=str) 43 | publisher = pubsub_v1.PublisherClient() 44 | 45 | transactions_topic_path = publisher.topic_path(project_id, 46 | transactions_topic_name) 47 | coupons_topic_path = publisher.topic_path(project_id, coupons_topic_name) 48 | filtered_trans_df = transactions_df[transactions_df["transaction_id"].isin( 49 | transactions_id)] 50 | filtered_coupons_df = coupons_df[coupons_df["transaction_id"].isin( 51 | transactions_id)] 52 | await asyncio.gather( 53 | publish_coupons(filtered_coupons_df, publisher, coupons_topic_path), 54 | publish_transactions(filtered_trans_df, publisher, 55 | transactions_topic_path)) 56 | 57 | 58 | async def publish_coupons(filtered_coupons_df, publisher, coupons_topic_path): 59 | for _, row in filtered_coupons_df.iterrows(): 60 | coupon_message = json.dumps(row.to_dict()).encode("utf-8") 61 | print(coupon_message) 62 | future = publisher.publish(coupons_topic_path, coupon_message) 63 | print(f"Published coupon message ID: {future.result()}") 64 | await asyncio.sleep(3) 65 | 66 | 67 | async def publish_transactions(filtered_trans_df, publisher, 68 | transactions_topic_path): 69 | for _, row in filtered_trans_df.iterrows(): 70 | transaction_message = json.dumps(row.to_dict()).encode("utf-8") 71 | print(transaction_message) 72 | future = publisher.publish(transactions_topic_path, transaction_message) 73 | print(f"Published transaction message ID: {future.result()}") 74 | await asyncio.sleep(1) 75 | 76 | 77 | if __name__ == "__main__": 78 | asyncio.run(publish_coupons_to_pubsub()) 79 | -------------------------------------------------------------------------------- /pipelines/cdp/cdp_pipeline/options.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Option class for Customer Data Platform pipeline. 
16 | """ 17 | 18 | from argparse import ArgumentParser 19 | 20 | from apache_beam.options.pipeline_options import PipelineOptions 21 | 22 | 23 | class MyPipelineOptions(PipelineOptions): 24 | 25 | @classmethod 26 | def _add_argparse_args(cls, parser: ArgumentParser): 27 | parser.add_argument("--transactions_topic", type=str) 28 | parser.add_argument("--coupons_redemption_topic", type=str) 29 | parser.add_argument("--project_id", type=str) 30 | parser.add_argument("--location", type=str) 31 | parser.add_argument("--output_dataset", type=str) 32 | parser.add_argument("--output_table", type=str) 33 | -------------------------------------------------------------------------------- /pipelines/cdp/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | steps: 16 | - name: 'gcr.io/cloud-builders/docker' 17 | script: | 18 | docker build -t ${_TAG} . 19 | substitutions: 20 | _TAG: unset 21 | options: 22 | substitutionOption: 'ALLOW_LOOSE' 23 | automapSubstitutions: true 24 | images: 25 | - ${_TAG} -------------------------------------------------------------------------------- /pipelines/cdp/input_data/coupon_redempt.csv: -------------------------------------------------------------------------------- 1 | household_key,day,coupon_upc,campaign,transaction_id 2 | 1,421,10000085364,2200,27601281299 3 | 1,421,51700010076,2200,27601281299 4 | 13,609,10000089277,18,28571755990 -------------------------------------------------------------------------------- /pipelines/cdp/input_data/transaction_data.csv: -------------------------------------------------------------------------------- 1 | household_key,transaction_id,day,product_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc 2 | 1,27601281299,51,941769,1,3.99,436,0,1456,8,0,0 3 | 1,27601281299,51,910635,1,2.99,436,0,1456,8,0,0 -------------------------------------------------------------------------------- /pipelines/cdp/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Customer Data Platform analytics pipeline for the Dataflow Solution Guides. 
16 | """ 17 | 18 | import time 19 | 20 | from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions 21 | 22 | from cdp_pipeline.options import MyPipelineOptions 23 | from cdp_pipeline.customer_data_platform import create_and_run_pipeline 24 | 25 | 26 | def main(options: MyPipelineOptions): 27 | create_and_run_pipeline(options) 28 | 29 | 30 | if __name__ == "__main__": 31 | pipeline_options: PipelineOptions = PipelineOptions() 32 | dataflow_options: GoogleCloudOptions = pipeline_options.view_as( 33 | GoogleCloudOptions) 34 | now_epoch_ms = int(time.time() * 1000) 35 | dataflow_options.job_name = f"customer-data-platform-{now_epoch_ms}" 36 | custom_options: MyPipelineOptions = pipeline_options.view_as( 37 | MyPipelineOptions) 38 | main(custom_options) 39 | -------------------------------------------------------------------------------- /pipelines/cdp/requirements.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apache-beam[gcp]==2.63.0 # Example, use your actual versions 16 | ## Below dependencies are required if you have to run script /cdp_pipeline/generate_transaction_data.py 17 | pandas 18 | fsspec 19 | gcsfs 20 | -------------------------------------------------------------------------------- /pipelines/cdp/schema/unified_table.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "transaction_id", 5 | "type": "STRING", 6 | "mode": "REQUIRED" 7 | }, 8 | { 9 | "name": "household_key", 10 | "type": "STRING", 11 | "mode": "NULLABLE" 12 | }, 13 | { 14 | "name": "coupon_upc", 15 | "type": "STRING", 16 | "mode": "NULLABLE" 17 | }, 18 | { 19 | "name": "product_id", 20 | "type": "STRING", 21 | "mode": "NULLABLE" 22 | }, 23 | { 24 | "name": "coupon_discount", 25 | "type": "STRING", 26 | "mode": "NULLABLE" 27 | } 28 | ] 29 | } -------------------------------------------------------------------------------- /pipelines/cdp/scripts/01_cloudbuild_and_push_container.sh: -------------------------------------------------------------------------------- 1 | gcloud builds submit \ 2 | --region=$REGION \ 3 | --default-buckets-behavior=regional-user-owned-bucket \ 4 | --substitutions _TAG=$CONTAINER_URI \ 5 | . 
6 | -------------------------------------------------------------------------------- /pipelines/cdp/scripts/02_run_dataflow_job.sh: -------------------------------------------------------------------------------- 1 | python3 -m main \ 2 | --streaming \ 3 | --runner=DataflowRunner \ 4 | --project=$PROJECT \ 5 | --temp_location=gs://$PROJECT/tmp \ 6 | --region=$REGION \ 7 | --save_main_session \ 8 | --service_account_email=$SERVICE_ACCOUNT \ 9 | --subnetwork=$SUBNETWORK \ 10 | --sdk_container_image=$CONTAINER_URI \ 11 | --max_workers=$MAX_DATAFLOW_WORKERS \ 12 | --disk_size_gb=$DISK_SIZE_GB \ 13 | --machine_type=$MACHINE_TYPE \ 14 | --transactions_topic=$TRANSACTIONS_TOPIC \ 15 | --coupons_redemption_topic=$COUPON_REDEMPTION_TOPIC \ 16 | --output_dataset=$BQ_DATASET \ 17 | --output_table=$BQ_UNIFIED_TABLE \ 18 | --project_id=$PROJECT \ 19 | --enable_streaming_engine 20 | -------------------------------------------------------------------------------- /pipelines/cdp/scripts/run.sh: -------------------------------------------------------------------------------- 1 | source ./scripts/00_set_variables.sh 2 | sh ./scripts/01_cloudbuild_and_push_container.sh 3 | sh ./scripts/02_run_dataflow_job.sh -------------------------------------------------------------------------------- /pipelines/cdp/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Setup file for Customer Data Platform analytics pipeline. 
16 | """ 17 | 18 | from setuptools import setup, find_packages 19 | 20 | with open("requirements.txt", encoding="utf-8") as f: 21 | requirements = f.readlines() 22 | 23 | setup( 24 | name="Dataflow Solution for Customer Data Platform", 25 | version="0.1", 26 | description="Customer Data Platform example for the Dataflow Solution Guides", 27 | packages=find_packages(), 28 | install_requires=requirements, 29 | ) 30 | -------------------------------------------------------------------------------- /pipelines/clickstream_analytics_java/.gitattributes: -------------------------------------------------------------------------------- 1 | # 2 | # https://help.github.com/articles/dealing-with-line-endings/ 3 | # 4 | # Linux start script should use lf 5 | /gradlew text eol=lf 6 | 7 | # These are Windows script files and should use crlf 8 | *.bat text eol=crlf 9 | 10 | -------------------------------------------------------------------------------- /pipelines/clickstream_analytics_java/.gitignore: -------------------------------------------------------------------------------- 1 | ### Java template 2 | # Compiled class file 3 | *.class 4 | 5 | # Log file 6 | *.log 7 | 8 | # BlueJ files 9 | *.ctxt 10 | 11 | # Mobile Tools for Java (J2ME) 12 | .mtj.tmp/ 13 | 14 | # Package Files # 15 | *.jar 16 | *.war 17 | *.nar 18 | *.ear 19 | *.zip 20 | *.tar.gz 21 | *.rar 22 | 23 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 24 | hs_err_pid* 25 | replay_pid* 26 | 27 | ### Gradle template 28 | .gradle 29 | **/build/ 30 | !src/**/build/ 31 | 32 | # Ignore Gradle GUI config 33 | gradle-app.setting 34 | 35 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 36 | !gradle-wrapper.jar 37 | 38 | # Avoid ignore Gradle wrappper properties 39 | !gradle-wrapper.properties 40 | 41 | # Cache of project 42 | .gradletasknamecache 43 | 44 | # Eclipse Gradle plugin generated files 45 | # Eclipse Core 46 | .project 47 | # JDT-specific (Eclipse Java Development Tools) 48 | .classpath 49 | 50 | # Sources generated by VS Code 51 | bin -------------------------------------------------------------------------------- /pipelines/clickstream_analytics_java/README.md: -------------------------------------------------------------------------------- 1 | ## ClickStream Dataflow Code 2 | 3 | This Dataflow pipeline processes clickstream analytics data, using session windowing to group events into user sessions, and then writes the aggregated session data to BigQuery. The pipeline is written in Java and uses Pub/Sub as the input source. 4 | 5 | **This pipeline is still under development**. 6 | 7 | This pipeline is part of the [Dataflow Clickstream analytics solution guide](../../use_cases/Clickstream_Analytics.md). 8 | 9 | ## Pipeline Architecture 10 | 11 | 1. **Pub/Sub Subscription:** The pipeline reads clickstream events from a Pub/Sub subscription. 12 | 13 | 2. **Dataflow Pipeline:** 14 | 15 | - **Event Parsing:** Incoming Pub/Sub messages are parsed into structured clickstream event objects. 16 | - **Bigtable Enrichment (TODO):** Enrich session data with additional information from Bigtable (code implementation pending). 17 | - **Session Windowing (TODO):** Events are grouped into sessions using a session windowing strategy (e.g., 30-minute inactivity gap). 18 | - **BigQuery Write:** Aggregated session data is written to BigQuery tables. 19 | - **Dead-letter Queue:** Failed records are written to a BigQuery dead-letter table for further analysis and error handling. 
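When the session windowing step is implemented, it is likely to be a standard Beam `Sessions` window applied to the keyed events before aggregation. The snippet below is an illustrative sketch only (not code from this repository): `keyedRows` is a hypothetical `PCollection` of parsed events keyed by user ID, and the 30-minute gap matches the inactivity gap mentioned above.

```java
import com.google.api.services.bigquery.model.TableRow;
import org.apache.beam.sdk.transforms.windowing.Sessions;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;

// Hypothetical sketch: group parsed clickstream events (keyed by user ID) into
// sessions separated by 30 minutes of inactivity. A GroupByKey or Combine.perKey
// applied afterwards would produce the per-session aggregates written to BigQuery.
PCollection<KV<String, TableRow>> sessioned =
    keyedRows.apply(
        "SessionWindow",
        Window.<KV<String, TableRow>>into(
            Sessions.withGapDuration(Duration.standardMinutes(30))));
```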
20 | 21 | ## TODO 22 | 23 | The Bigtable enrichment and the session windowing analytics steps are not implemented at this moment. 24 | 25 | ## Pipeline Code 26 | 27 | - To build the project, run `./gradlew build` 28 | 29 | ## How to launch the pipelines 30 | 31 | All the scripts are located in the `scripts` directory and prepared to be launched from the top 32 | sources directory. 33 | 34 | The Terraform code generates a file with all the necessary variables in the location `./scripts/00_set_variables.sh`. 35 | 36 | Run the following command to apply that configuration: 37 | 38 | ```sh 39 | source scripts/00_set_variables.sh 40 | ``` 41 | 42 | Then run the analytics pipeline. This pipeline will take data from the input 43 | topic, and will write it to BigQuery, enriching with metadata available in Bigtable, and applying session analytics. 44 | 45 | ```sh 46 | ./scripts/01_launch_pipeline.sh 47 | ``` 48 | -------------------------------------------------------------------------------- /pipelines/clickstream_analytics_java/build.gradle: -------------------------------------------------------------------------------- 1 | // Copyright 2024 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 8 | plugins { 9 | id 'application' 10 | id "maven-publish" 11 | id "com.diffplug.spotless" version "7.0.2" 12 | id 'com.palantir.git-version' version '3.1.0' 13 | id 'net.ltgt.errorprone' version "4.1.0" 14 | } 15 | ext { 16 | packageName = "clickstream-analytics-java" 17 | javaPackagePath = "com.google.cloud.dataflow.solutions.clickstream_analytics" 18 | appName = "ClickStreamPubSubToBq" 19 | appVersion = "${gitVersion()}-SNAPSHOT" 20 | beamVersion = "2.63.0" 21 | slf4jVersion = "1.7.36" 22 | junitVersion = "4.13.2" 23 | hamcrestVersion = "3.0" 24 | googleJavaFormat = '1.24.0' 25 | errorProneCoreVersion = '2.26.1' 26 | 27 | } 28 | repositories { 29 | mavenCentral() 30 | maven { // Apache Snapshots repository 31 | url "https://repository.apache.org/content/repositories/snapshots/" 32 | } 33 | } 34 | application { 35 | mainClass = "${javaPackagePath}.ClickstreamPubSubToBq" 36 | version = appVersion 37 | } 38 | test { 39 | // JUnit 4. 40 | useJUnit() 41 | dependsOn cleanTest 42 | testLogging.showStandardStreams = true 43 | } 44 | compileJava { 45 | options.compilerArgs.addAll(['-Xlint:deprecation', '-Xlint:unchecked']) 46 | } 47 | run { 48 | if (project.hasProperty('args')) { 49 | args project.args.split('\\s') 50 | } 51 | } 52 | dependencies { 53 | // App dependencies. 54 | implementation "org.apache.beam:beam-sdks-java-core:${beamVersion}" 55 | runtimeOnly "org.apache.beam:beam-runners-direct-java:${beamVersion}" 56 | implementation "org.apache.beam:beam-runners-google-cloud-dataflow-java:${beamVersion}" 57 | implementation "org.apache.beam:beam-sdks-java-io-google-cloud-platform:${beamVersion}" 58 | // Tests dependencies. 
59 | testImplementation "junit:junit:${junitVersion}" 60 | testImplementation "org.hamcrest:hamcrest:${hamcrestVersion}" 61 | testImplementation "org.apache.beam:beam-sdks-java-test-utils:${beamVersion}" 62 | implementation 'org.checkerframework:checker-qual:3.49.1' 63 | errorprone "com.google.errorprone:error_prone_core:${errorProneCoreVersion}" 64 | 65 | // Google Java format for Gradle 66 | implementation "com.google.googlejavaformat:google-java-format:${googleJavaFormat}" 67 | } 68 | 69 | // Package a self-contained jar file. 70 | jar { 71 | archiveBaseName = packageName 72 | destinationDirectory = file('build') 73 | manifest { 74 | attributes 'Main-Class': "${javaPackagePath}.ClickstreamPubSubToBq" 75 | } 76 | exclude 'META-INF/*.SF' 77 | exclude 'META-INF/*.DSA' 78 | exclude 'META-INF/*.RSA' 79 | duplicatesStrategy = DuplicatesStrategy.INCLUDE 80 | from { 81 | configurations.runtimeClasspath.collect { it.isDirectory() ? it : zipTree(it) } 82 | } 83 | zip64 true 84 | } 85 | spotless { 86 | format 'misc', { 87 | // define the files to apply `misc` to 88 | target '*.gradle', '*.md', '.gitignore' 89 | // define the steps to apply to those files 90 | trimTrailingWhitespace() 91 | leadingTabsToSpaces(2) 92 | endWithNewline() 93 | } 94 | java { 95 | target project.fileTree(project.rootDir) { 96 | include '**/*.java' 97 | exclude 'build/*' 98 | } 99 | // apply a specific flavor of google-java-format 100 | googleJavaFormat("${googleJavaFormat}").aosp().reflowLongStrings() 101 | // fix formatting of type annotations 102 | formatAnnotations() 103 | // make sure every file has the following copyright header. 104 | licenseHeader '''/* 105 | * Copyright $YEAR Google. 106 | * 107 | * Licensed under the Apache License, Version 2.0 (the "License"); 108 | * you may not use this file except in compliance with the License. 109 | * You may obtain a copy of the License at 110 | * 111 | * http://www.apache.org/licenses/LICENSE-2.0 112 | * 113 | * Unless required by applicable law or agreed to in writing, software 114 | * distributed under the License is distributed on an "AS IS" BASIS, 115 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 116 | * See the License for the specific language governing permissions and 117 | * limitations under the License. 
118 | */ 119 | ''' 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /pipelines/clickstream_analytics_java/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/clickstream_analytics_java/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /pipelines/clickstream_analytics_java/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-8.13-bin.zip 4 | networkTimeout=10000 5 | validateDistributionUrl=true 6 | zipStoreBase=GRADLE_USER_HOME 7 | zipStorePath=wrapper/dists 8 | -------------------------------------------------------------------------------- /pipelines/clickstream_analytics_java/gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | @rem SPDX-License-Identifier: Apache-2.0 17 | @rem 18 | 19 | @if "%DEBUG%"=="" @echo off 20 | @rem ########################################################################## 21 | @rem 22 | @rem Gradle startup script for Windows 23 | @rem 24 | @rem ########################################################################## 25 | 26 | @rem Set local scope for the variables with windows NT shell 27 | if "%OS%"=="Windows_NT" setlocal 28 | 29 | set DIRNAME=%~dp0 30 | if "%DIRNAME%"=="" set DIRNAME=. 31 | @rem This is normally unused 32 | set APP_BASE_NAME=%~n0 33 | set APP_HOME=%DIRNAME% 34 | 35 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 36 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 37 | 38 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 39 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 40 | 41 | @rem Find java.exe 42 | if defined JAVA_HOME goto findJavaFromJavaHome 43 | 44 | set JAVA_EXE=java.exe 45 | %JAVA_EXE% -version >NUL 2>&1 46 | if %ERRORLEVEL% equ 0 goto execute 47 | 48 | echo. 1>&2 49 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2 50 | echo. 1>&2 51 | echo Please set the JAVA_HOME variable in your environment to match the 1>&2 52 | echo location of your Java installation. 1>&2 53 | 54 | goto fail 55 | 56 | :findJavaFromJavaHome 57 | set JAVA_HOME=%JAVA_HOME:"=% 58 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 59 | 60 | if exist "%JAVA_EXE%" goto execute 61 | 62 | echo. 1>&2 63 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2 64 | echo. 
1>&2 65 | echo Please set the JAVA_HOME variable in your environment to match the 1>&2 66 | echo location of your Java installation. 1>&2 67 | 68 | goto fail 69 | 70 | :execute 71 | @rem Setup the command line 72 | 73 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 74 | 75 | 76 | @rem Execute Gradle 77 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 78 | 79 | :end 80 | @rem End local scope for the variables with windows NT shell 81 | if %ERRORLEVEL% equ 0 goto mainEnd 82 | 83 | :fail 84 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 85 | rem the _cmd.exe /c_ return code! 86 | set EXIT_CODE=%ERRORLEVEL% 87 | if %EXIT_CODE% equ 0 set EXIT_CODE=1 88 | if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% 89 | exit /b %EXIT_CODE% 90 | 91 | :mainEnd 92 | if "%OS%"=="Windows_NT" endlocal 93 | 94 | :omega 95 | -------------------------------------------------------------------------------- /pipelines/clickstream_analytics_java/scripts/01_launch_pipeline.sh: -------------------------------------------------------------------------------- 1 | ./gradlew run -Pargs=" 2 | --runner=DataflowRunner \ 3 | --region=$REGION \ 4 | --project=$PROJECT \ 5 | --gcpTempLocation=$TEMP_LOCATION \ 6 | --bqProjectId=$PROJECT \ 7 | --bqDataset=$BQ_DATASET \ 8 | --bqTable=$BQ_TABLE \ 9 | --pubsubSubscription=$SUBSCRIPTION \ 10 | --btInstance=$BIGTABLE_INSTANCE \ 11 | --btTable=$BIGTABLE_TABLE \ 12 | --outputDeadletterTable=$BQ_DEADLETTER_TABLE \ 13 | --btLookupKey=$BT_LOOKUP_KEY" 14 | -------------------------------------------------------------------------------- /pipelines/clickstream_analytics_java/src/main/java/com/google/cloud/dataflow/solutions/clickstream_analytics/BigTableEnrichment.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.dataflow.solutions.clickstream_analytics; 17 | 18 | public class BigTableEnrichment { 19 | 20 | /*** TODO ***/ 21 | 22 | } 23 | -------------------------------------------------------------------------------- /pipelines/clickstream_analytics_java/src/main/java/com/google/cloud/dataflow/solutions/clickstream_analytics/JsonToTableRows.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.dataflow.solutions.clickstream_analytics; 17 | 18 | import com.google.api.services.bigquery.model.TableRow; 19 | import java.io.ByteArrayInputStream; 20 | import java.io.IOException; 21 | import java.io.InputStream; 22 | import java.nio.charset.StandardCharsets; 23 | import org.apache.beam.sdk.coders.Coder.Context; 24 | import org.apache.beam.sdk.io.gcp.bigquery.TableRowJsonCoder; 25 | import org.apache.beam.sdk.transforms.DoFn; 26 | import org.apache.beam.sdk.transforms.PTransform; 27 | import org.apache.beam.sdk.transforms.ParDo; 28 | import org.apache.beam.sdk.values.KV; 29 | import org.apache.beam.sdk.values.PCollection; 30 | import org.apache.beam.sdk.values.PCollectionTuple; 31 | import org.apache.beam.sdk.values.TupleTag; 32 | import org.apache.beam.sdk.values.TupleTagList; 33 | import org.slf4j.Logger; 34 | import org.slf4j.LoggerFactory; 35 | 36 | public class JsonToTableRows { 37 | 38 | private static final int MESSAGE_LIMIT_SIZE = 10 * 1024 * 1024; 39 | 40 | public static PTransform<PCollection<String>, PCollectionTuple> run() { 41 | return new JsonToTableRows.JsonToTableRow(); 42 | } 43 | 44 | static final TupleTag<TableRow> SUCCESS_TAG = new TupleTag<TableRow>() {}; 45 | static final TupleTag<KV<String, String>> FAILURE_TAG = new TupleTag<KV<String, String>>() {}; 46 | 47 | private static class JsonToTableRow extends PTransform<PCollection<String>, PCollectionTuple> { 48 | 49 | @Override 50 | public PCollectionTuple expand(PCollection<String> jsonStrings) { 51 | return jsonStrings.apply( 52 | ParDo.of(new ToJsonDoFn()) 53 | .withOutputTags(SUCCESS_TAG, TupleTagList.of(FAILURE_TAG))); 54 | } 55 | } 56 | 57 | private static class ToJsonDoFn extends DoFn<String, TableRow> { 58 | public static final Logger LOG = LoggerFactory.getLogger(ToJsonDoFn.class); 59 | 60 | @ProcessElement 61 | public void processElement(ProcessContext context) { 62 | String jsonString = context.element(); 63 | 64 | byte[] message_in_bytes = jsonString.getBytes(StandardCharsets.UTF_8); 65 | 66 | if (message_in_bytes.length >= JsonToTableRows.MESSAGE_LIMIT_SIZE) { 67 | LOG.error("Row is too big, size {} bytes", message_in_bytes.length); 68 | Metrics.tooBigMessages.inc(); 69 | context.output(FAILURE_TAG, KV.of("TooBigRow", jsonString)); return; 70 | } 71 | 72 | TableRow row; 73 | try (InputStream inputStream = new ByteArrayInputStream(message_in_bytes)) { 74 | row = TableRowJsonCoder.of().decode(inputStream, Context.OUTER); 75 | Metrics.successfulMessages.inc(); 76 | context.output(row); 77 | 78 | } catch (IOException e) { 79 | LOG.error(e.getMessage()); 80 | Metrics.jsonParseErrorMessages.inc(); 81 | context.output(FAILURE_TAG, KV.of("JsonParseError", jsonString)); 82 | } 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /pipelines/clickstream_analytics_java/src/main/java/com/google/cloud/dataflow/solutions/clickstream_analytics/Metrics.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.dataflow.solutions.clickstream_analytics; 17 | 18 | import org.apache.beam.sdk.metrics.Counter; 19 | 20 | public final class Metrics { 21 | public static Counter pubsubMessages = counter("pub-sub-messages"); 22 | public static Counter successfulMessages = counter("successful-messages"); 23 | public static Counter jsonParseErrorMessages = counter("json-parse-failed-messages"); 24 | public static Counter tooBigMessages = counter("too-big-messages"); 25 | public static Counter failedInsertMessages = counter("failed-insert-messages"); 26 | 27 | static Counter counter(String name) { 28 | return org.apache.beam.sdk.metrics.Metrics.counter(Metrics.class, name); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /pipelines/clickstream_analytics_java/src/main/resources/streaming_source_deadletter_table_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "timestamp", 5 | "type": "TIMESTAMP", 6 | "mode": "REQUIRED" 7 | }, 8 | { 9 | "name": "payloadString", 10 | "type": "STRING", 11 | "mode": "REQUIRED" 12 | }, 13 | { 14 | "name": "payloadBytes", 15 | "type": "BYTES", 16 | "mode": "REQUIRED" 17 | }, 18 | { 19 | "name": "attributes", 20 | "type": "RECORD", 21 | "mode": "REPEATED", 22 | "fields": [ 23 | { 24 | "name": "key", 25 | "type": "STRING", 26 | "mode": "NULLABLE" 27 | }, 28 | { 29 | "name": "value", 30 | "type": "STRING", 31 | "mode": "NULLABLE" 32 | } 33 | ] 34 | }, 35 | { 36 | "name": "errorMessage", 37 | "type": "STRING", 38 | "mode": "NULLABLE" 39 | }, 40 | { 41 | "name": "stacktrace", 42 | "type": "STRING", 43 | "mode": "NULLABLE" 44 | } 45 | ] 46 | } -------------------------------------------------------------------------------- /pipelines/etl_integration_java/.gitignore: -------------------------------------------------------------------------------- 1 | ### Java template 2 | # Compiled class file 3 | *.class 4 | 5 | # Log file 6 | *.log 7 | 8 | # BlueJ files 9 | *.ctxt 10 | 11 | # Mobile Tools for Java (J2ME) 12 | .mtj.tmp/ 13 | 14 | # Package Files # 15 | *.jar 16 | *.war 17 | *.nar 18 | *.ear 19 | *.zip 20 | *.tar.gz 21 | *.rar 22 | 23 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 24 | hs_err_pid* 25 | replay_pid* 26 | 27 | ### Gradle template 28 | .gradle 29 | **/build/ 30 | !src/**/build/ 31 | 32 | # Ignore Gradle GUI config 33 | gradle-app.setting 34 | 35 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 36 | !gradle-wrapper.jar 37 | 38 | # Avoid ignore Gradle wrappper properties 39 | !gradle-wrapper.properties 40 | 41 | # Cache of project 42 | .gradletasknamecache 43 | 44 | # Eclipse Gradle plugin generated files 45 | # Eclipse Core 46 | .project 47 | # JDT-specific (Eclipse Java Development Tools) 48 | .classpath 49 | 50 | # Sources generated by VS Code 51 | bin -------------------------------------------------------------------------------- /pipelines/etl_integration_java/README.md: -------------------------------------------------------------------------------- 1 | # ETL & integration sample pipeline (Java) 2 | 3 | This sample pipeline demonstrates how to use Dataflow to create replicas of transactional databases, using change 4 | streams, to create and maintain constantly updated replicas of the database. 
This pipeline is written in Java. 5 | 6 | This pipeline is part of the [Dataflow ETL & integration solution guide](../../use_cases/ETL_integration.md). 7 | 8 | ## Architecture 9 | 10 | The generic architecture for both looks like this: 11 | 12 | ![Architecture](./imgs/etl_integration.png) 13 | 14 | There are two pipelines in this repository. The first pipeline reads from a Pub/Sub topic of public data, and writes 15 | to a Spanner database. This pipeline's purpose is to keep Spanner with constant updates. The data is written in an 16 | `events` table. 17 | 18 | The second pipeline reads from a change stream from Spanner, and replicates the `events` table in BigQuery. The table 19 | in BigQuery receives updates continuously and has the same data as the Spanner table, with a minimal latency. 20 | 21 | The infrastructure required to launch the pipelines is deployed 22 | through [the accompanying Terraform scripts in this solution guide](../../terraform/etl_integration/README.md). 23 | 24 | ## How to launch the pipelines 25 | 26 | All the scripts are located in the `scripts` directory and prepared to be launched from the top 27 | sources directory. 28 | 29 | The Terraform code generates a file with all the necessary variables in the location `./scripts/01_set_variables.sh`. 30 | Run the following command to apply that configuration: 31 | 32 | ```sh 33 | source scripts/01_set_variables.sh 34 | ``` 35 | 36 | Then run the publisher pipeline. This pipeline will take data from the input 37 | topic, and will write it to Spanner. This pipeline is meant only to have 38 | some data in the Spanner change streams for the sake of running this guide 39 | as an example, in a real setting your data would land in Spanner by many 40 | other different means: 41 | 42 | ```sh 43 | ./scripts/02_run_publisher_dataflow.sh 44 | ``` 45 | 46 | Once you have the publisher pipeline populating some data into Spanner, you 47 | can read from the change streams to replicate the database into BigQuery. 48 | For that, execute the following: 49 | 50 | ```sh 51 | ./scripts/03_run_changestream_template.sh 52 | ``` 53 | 54 | ## Input data 55 | 56 | All the input data is taken by default from the following public Pub/Sub topic: 57 | * `projects/pubsub-public-data/topics/taxirides-realtime` 58 | 59 | So you don't need to send any data anywhere to run this guide as an example. 60 | 61 | ## Output data 62 | 63 | The BigQuery dataset (by default, `replica`) will contain a table (by default, 64 | called `events`, in the `taxis` database), with the same contents as the 65 | Spanner table. This replication will happen in real time with low latency, 66 | as new data lands in the Spanner table (or if any existing record is 67 | modified or deleted). 
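## Data model (reference)

The publisher pipeline parses every Pub/Sub message into a `TaxiEvent` object before writing it to Spanner (see `TaxiObjects.java`). The snippet below is an illustrative sketch only, for example for use in a unit test: the builder setters mirror the schema fields declared in that class, but the literal values are invented.

```java
import com.google.cloud.dataflow.solutions.data.TaxiObjects.TaxiEvent;

// Illustrative only: the field values are invented, the setters mirror TaxiObjects.TaxiEvent.
TaxiEvent event =
    TaxiEvent.builder()
        .setRideId("ride-0001")
        .setPointIdx(1)
        .setLatitude(40.7410)
        .setLongitude(-73.9896)
        .setTimeStamp("2025-01-01T00:00:00.00000-04:00")
        .setMeterReading(5.25)
        .setMeterIncrement(0.05)
        .setRideStatus("enroute")
        .setPassengerCount(2)
        .build();
```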
68 | -------------------------------------------------------------------------------- /pipelines/etl_integration_java/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/etl_integration_java/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /pipelines/etl_integration_java/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-8.13-bin.zip 4 | networkTimeout=10000 5 | validateDistributionUrl=true 6 | zipStoreBase=GRADLE_USER_HOME 7 | zipStorePath=wrapper/dists 8 | -------------------------------------------------------------------------------- /pipelines/etl_integration_java/gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | @rem SPDX-License-Identifier: Apache-2.0 17 | @rem 18 | 19 | @if "%DEBUG%"=="" @echo off 20 | @rem ########################################################################## 21 | @rem 22 | @rem Gradle startup script for Windows 23 | @rem 24 | @rem ########################################################################## 25 | 26 | @rem Set local scope for the variables with windows NT shell 27 | if "%OS%"=="Windows_NT" setlocal 28 | 29 | set DIRNAME=%~dp0 30 | if "%DIRNAME%"=="" set DIRNAME=. 31 | @rem This is normally unused 32 | set APP_BASE_NAME=%~n0 33 | set APP_HOME=%DIRNAME% 34 | 35 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 36 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 37 | 38 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 39 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 40 | 41 | @rem Find java.exe 42 | if defined JAVA_HOME goto findJavaFromJavaHome 43 | 44 | set JAVA_EXE=java.exe 45 | %JAVA_EXE% -version >NUL 2>&1 46 | if %ERRORLEVEL% equ 0 goto execute 47 | 48 | echo. 1>&2 49 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2 50 | echo. 1>&2 51 | echo Please set the JAVA_HOME variable in your environment to match the 1>&2 52 | echo location of your Java installation. 1>&2 53 | 54 | goto fail 55 | 56 | :findJavaFromJavaHome 57 | set JAVA_HOME=%JAVA_HOME:"=% 58 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 59 | 60 | if exist "%JAVA_EXE%" goto execute 61 | 62 | echo. 1>&2 63 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2 64 | echo. 
1>&2 65 | echo Please set the JAVA_HOME variable in your environment to match the 1>&2 66 | echo location of your Java installation. 1>&2 67 | 68 | goto fail 69 | 70 | :execute 71 | @rem Setup the command line 72 | 73 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 74 | 75 | 76 | @rem Execute Gradle 77 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 78 | 79 | :end 80 | @rem End local scope for the variables with windows NT shell 81 | if %ERRORLEVEL% equ 0 goto mainEnd 82 | 83 | :fail 84 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 85 | rem the _cmd.exe /c_ return code! 86 | set EXIT_CODE=%ERRORLEVEL% 87 | if %EXIT_CODE% equ 0 set EXIT_CODE=1 88 | if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% 89 | exit /b %EXIT_CODE% 90 | 91 | :mainEnd 92 | if "%OS%"=="Windows_NT" endlocal 93 | 94 | :omega 95 | -------------------------------------------------------------------------------- /pipelines/etl_integration_java/imgs/etl_integration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/etl_integration_java/imgs/etl_integration.png -------------------------------------------------------------------------------- /pipelines/etl_integration_java/scripts/.gitignore: -------------------------------------------------------------------------------- 1 | 01_set_variables.sh 2 | -------------------------------------------------------------------------------- /pipelines/etl_integration_java/scripts/02_run_publisher_dataflow.sh: -------------------------------------------------------------------------------- 1 | ./gradlew run -Pargs=" 2 | --pipeline=PUBSUB_TO_SPANNER \ 3 | --streaming \ 4 | --enableStreamingEngine \ 5 | --autoscalingAlgorithm=THROUGHPUT_BASED \ 6 | --runner=DataflowRunner \ 7 | --project=$PROJECT \ 8 | --tempLocation=$TEMP_LOCATION \ 9 | --region=$REGION \ 10 | --serviceAccount=$SERVICE_ACCOUNT \ 11 | --subnetwork=$NETWORK \ 12 | --maxNumWorkers=$MAX_DATAFLOW_WORKERS \ 13 | --experiments=enable_data_sampling;use_network_tags=ssh;dataflow \ 14 | --usePublicIps=false \ 15 | --pubsubTopic=$TOPIC \ 16 | --spannerInstance=$SPANNER_INSTANCE \ 17 | --spannerDatabase=$SPANNER_DATABASE \ 18 | --spannerTable=$SPANNER_TABLE" -------------------------------------------------------------------------------- /pipelines/etl_integration_java/scripts/03_run_changestream_template.sh: -------------------------------------------------------------------------------- 1 | gcloud dataflow flex-template run spanner-change-streams \ 2 | --template-file-gcs-location=gs://dataflow-templates-$REGION/latest/flex/Spanner_Change_Streams_to_BigQuery \ 3 | --project=$PROJECT \ 4 | --region $REGION \ 5 | --temp-location=$TEMP_LOCATION \ 6 | --service-account-email=$SERVICE_ACCOUNT \ 7 | --subnetwork=$NETWORK \ 8 | --max-workers=$MAX_DATAFLOW_WORKERS \ 9 | --worker-machine-type=$WORKER_TYPE \ 10 | --disable-public-ips \ 11 | --parameters \ 12 | spannerInstanceId=$SPANNER_INSTANCE,\ 13 | spannerDatabase=$SPANNER_DATABASE,\ 14 | spannerMetadataInstanceId=$SPANNER_INSTANCE,\ 15 | spannerMetadataDatabase=$SPANNER_METADATA_DB,\ 16 | spannerChangeStreamName=$SPANNER_CHANGE_STREAM,\ 17 | bigQueryDataset=$BIGQUERY_DATASET -------------------------------------------------------------------------------- 
/pipelines/etl_integration_java/src/main/java/com/google/cloud/dataflow/solutions/ETLIntegration.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.dataflow.solutions; 18 | 19 | import com.google.cloud.dataflow.solutions.data.TaxiObjects; 20 | import com.google.cloud.dataflow.solutions.load.Spanner; 21 | import com.google.cloud.dataflow.solutions.options.SpannerPublisherOptions; 22 | import com.google.cloud.dataflow.solutions.transform.TaxiEventProcessor; 23 | import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions; 24 | import org.apache.beam.sdk.Pipeline; 25 | import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO; 26 | import org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage; 27 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 28 | import org.apache.beam.sdk.values.PCollection; 29 | 30 | public class ETLIntegration { 31 | public static void main(String[] args) { 32 | String jobName = "pubsub-to-spanner"; 33 | SpannerPublisherOptions spannerPublisherOptions = 34 | PipelineOptionsFactory.fromArgs(args) 35 | .withoutStrictParsing() 36 | .as(SpannerPublisherOptions.class); 37 | 38 | Pipeline p = createPipeline(spannerPublisherOptions); 39 | p.getOptions().setJobName(jobName); 40 | p.run(); 41 | } 42 | 43 | public static Pipeline createPipeline(SpannerPublisherOptions options) { 44 | String projectId = options.as(DataflowPipelineOptions.class).getProject(); 45 | 46 | Pipeline p = Pipeline.create(options); 47 | 48 | PCollection msgs = 49 | p.apply("Read topic", PubsubIO.readMessages().fromTopic(options.getPubsubTopic())); 50 | 51 | TaxiEventProcessor.ParsingOutput parsed = 52 | msgs.apply("Parse", TaxiEventProcessor.FromPubsubMessage.parse()); 53 | PCollection taxiEvents = parsed.getParsedData(); 54 | 55 | taxiEvents.apply( 56 | "Write", 57 | Spanner.Writer.builder() 58 | .projectId(projectId) 59 | .instanceId(options.getSpannerInstance()) 60 | .databaseId(options.getSpannerDatabase()) 61 | .tableName(options.getSpannerTable()) 62 | .build()); 63 | 64 | return p; 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /pipelines/etl_integration_java/src/main/java/com/google/cloud/dataflow/solutions/data/SchemaUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.dataflow.solutions.data; 18 | 19 | import org.apache.beam.sdk.Pipeline; 20 | import org.apache.beam.sdk.schemas.NoSuchSchemaException; 21 | import org.apache.beam.sdk.schemas.Schema; 22 | import org.slf4j.Logger; 23 | import org.slf4j.LoggerFactory; 24 | 25 | public class SchemaUtils { 26 | 27 | private static final Logger LOG = LoggerFactory.getLogger(SchemaUtils.class); 28 | 29 | public static Schema getSchemaForType(Pipeline p, Class classType) { 30 | Schema schema; 31 | 32 | try { 33 | schema = p.getSchemaRegistry().getSchema(classType); 34 | } catch (NoSuchSchemaException e) { 35 | LOG.error(e.getMessage()); 36 | throw new IllegalArgumentException( 37 | String.format( 38 | "Could not find schema for %s", 39 | TaxiObjects.TaxiEvent.class.getCanonicalName())); 40 | } 41 | 42 | return schema; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /pipelines/etl_integration_java/src/main/java/com/google/cloud/dataflow/solutions/data/TaxiObjects.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.cloud.dataflow.solutions.data; 18 | 19 | import com.google.auto.value.AutoValue; 20 | import org.apache.beam.sdk.schemas.AutoValueSchema; 21 | import org.apache.beam.sdk.schemas.annotations.DefaultSchema; 22 | import org.apache.beam.sdk.schemas.annotations.SchemaFieldName; 23 | import org.joda.time.Instant; 24 | 25 | public class TaxiObjects { 26 | 27 | /** Represents Taxi Ride Event */ 28 | @DefaultSchema(AutoValueSchema.class) 29 | @AutoValue 30 | public abstract static class TaxiEvent { 31 | 32 | @SchemaFieldName("ride_id") 33 | public abstract String getRideId(); 34 | 35 | @SchemaFieldName("point_idx") 36 | public abstract Integer getPointIdx(); 37 | 38 | @SchemaFieldName("latitude") 39 | public abstract Double getLatitude(); 40 | 41 | @SchemaFieldName("longitude") 42 | public abstract Double getLongitude(); 43 | 44 | @SchemaFieldName("timestamp") 45 | public abstract String getTimeStamp(); 46 | 47 | @SchemaFieldName("meter_reading") 48 | public abstract Double getMeterReading(); 49 | 50 | @SchemaFieldName("meter_increment") 51 | public abstract Double getMeterIncrement(); 52 | 53 | @SchemaFieldName("ride_status") 54 | public abstract String getRideStatus(); 55 | 56 | @SchemaFieldName("passenger_count") 57 | public abstract Integer getPassengerCount(); 58 | 59 | public static Builder builder() { 60 | return new AutoValue_TaxiObjects_TaxiEvent.Builder(); 61 | } 62 | 63 | @AutoValue.Builder 64 | public abstract static class Builder { 65 | public abstract Builder setRideId(String value); 66 | 67 | public abstract Builder setPointIdx(Integer value); 68 | 69 | public abstract Builder setLatitude(Double latitude); 70 | 71 | public abstract Builder setLongitude(Double longitude); 72 | 73 | public abstract Builder setTimeStamp(String value); 74 | 75 | public abstract Builder setMeterReading(Double value); 76 | 77 | public abstract Builder setMeterIncrement(Double value); 78 | 79 | public abstract Builder setRideStatus(String value); 80 | 81 | public abstract Builder setPassengerCount(Integer value); 82 | 83 | public abstract TaxiEvent build(); 84 | } 85 | } 86 | 87 | @AutoValue 88 | @DefaultSchema(AutoValueSchema.class) 89 | /* Represents a parsing error message event */ 90 | public abstract static class ParsingError { 91 | // These field names are determined 92 | @SchemaFieldName("input_data") 93 | public abstract String getInputData(); 94 | 95 | @SchemaFieldName("error_message") 96 | public abstract String getErrorMessage(); 97 | 98 | @SchemaFieldName("timestamp") 99 | public abstract Instant getTimestamp(); 100 | 101 | public static Builder builder() { 102 | return new AutoValue_TaxiObjects_ParsingError.Builder(); 103 | } 104 | 105 | @AutoValue.Builder 106 | public abstract static class Builder { 107 | public abstract Builder setInputData(String i); 108 | 109 | public abstract Builder setErrorMessage(String e); 110 | 111 | public abstract Builder setTimestamp(Instant t); 112 | 113 | public abstract ParsingError build(); 114 | } 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /pipelines/etl_integration_java/src/main/java/com/google/cloud/dataflow/solutions/options/SpannerPublisherOptions.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.dataflow.solutions.options; 18 | 19 | import org.apache.beam.sdk.options.Description; 20 | import org.apache.beam.sdk.options.PipelineOptions; 21 | import org.apache.beam.sdk.options.Validation; 22 | 23 | public interface SpannerPublisherOptions extends PipelineOptions { 24 | @Validation.Required() 25 | @Description("Input topic with data to replicate in Spanner") 26 | void setPubsubTopic(String t); 27 | 28 | String getPubsubTopic(); 29 | 30 | @Validation.Required() 31 | @Description("Spanner table to write the data to") 32 | void setSpannerTable(String t); 33 | 34 | String getSpannerTable(); 35 | 36 | @Validation.Required() 37 | @Description("Spanner instance to write the data to") 38 | void setSpannerInstance(String s); 39 | 40 | String getSpannerInstance(); 41 | 42 | @Validation.Required() 43 | @Description("Spanner database to write the data to") 44 | void setSpannerDatabase(String d); 45 | 46 | String getSpannerDatabase(); 47 | } 48 | -------------------------------------------------------------------------------- /pipelines/etl_integration_java/src/main/java/com/google/cloud/dataflow/solutions/transform/RowToError.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.cloud.dataflow.solutions.transform; 18 | 19 | import com.google.cloud.dataflow.solutions.data.SchemaUtils; 20 | import com.google.cloud.dataflow.solutions.data.TaxiObjects; 21 | import org.apache.beam.sdk.coders.SerializableCoder; 22 | import org.apache.beam.sdk.schemas.Schema; 23 | import org.apache.beam.sdk.schemas.transforms.Convert; 24 | import org.apache.beam.sdk.transforms.DoFn; 25 | import org.apache.beam.sdk.transforms.PTransform; 26 | import org.apache.beam.sdk.transforms.ParDo; 27 | import org.apache.beam.sdk.values.PCollection; 28 | import org.apache.beam.sdk.values.Row; 29 | import org.joda.time.Instant; 30 | import org.slf4j.Logger; 31 | import org.slf4j.LoggerFactory; 32 | 33 | class RowToError extends PTransform, PCollection> { 34 | public static final Logger LOG = LoggerFactory.getLogger(RowToError.class); 35 | 36 | @Override 37 | public PCollection expand(PCollection errorRows) { 38 | // Create ErrorMessage events for incompatible schema (Failed records from JsonToRow) 39 | Schema errorMessageSchema = 40 | SchemaUtils.getSchemaForType( 41 | errorRows.getPipeline(), TaxiObjects.ParsingError.class); 42 | 43 | return errorRows 44 | .apply( 45 | "Error Message Events", 46 | ParDo.of(new GenerateJsonToRowErrorMsgDoFn(errorMessageSchema))) 47 | .setCoder(SerializableCoder.of(Row.class)) 48 | .setRowSchema(errorMessageSchema) 49 | .apply("Error Messages to Row", Convert.fromRows(TaxiObjects.ParsingError.class)); 50 | } 51 | 52 | private static class GenerateJsonToRowErrorMsgDoFn extends DoFn { 53 | final Schema errorMessageSchema; 54 | 55 | public GenerateJsonToRowErrorMsgDoFn(Schema errorMessageSchema) { 56 | this.errorMessageSchema = errorMessageSchema; 57 | } 58 | 59 | @ProcessElement 60 | public void processElement( 61 | @FieldAccess("line") String inputData, 62 | @FieldAccess("err") String errorMessage, 63 | @Timestamp Instant timestamp, 64 | OutputReceiver out) { 65 | 66 | out.output( 67 | Row.withSchema(errorMessageSchema) 68 | .withFieldValue("input_data", inputData) 69 | .withFieldValue("error_message", errorMessage) 70 | .withFieldValue("timestamp", timestamp) 71 | .build()); 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /pipelines/imgs/anomaly_detect_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/imgs/anomaly_detect_arch.png -------------------------------------------------------------------------------- /pipelines/imgs/cdp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/imgs/cdp.png -------------------------------------------------------------------------------- /pipelines/imgs/iot_analytics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/imgs/iot_analytics.png -------------------------------------------------------------------------------- /pipelines/imgs/log_replication.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/imgs/log_replication.png -------------------------------------------------------------------------------- /pipelines/imgs/market_intel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/imgs/market_intel.png -------------------------------------------------------------------------------- /pipelines/imgs/ml_ai_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/imgs/ml_ai_arch.png -------------------------------------------------------------------------------- /pipelines/iot_analytics/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM apache/beam_python3.11_sdk:2.63.0 16 | WORKDIR /workspace 17 | 18 | RUN apt-get update -y && apt-get install -y \ 19 | cmake 20 | 21 | COPY requirements.txt requirements.txt 22 | COPY main.py main.py 23 | COPY iot_analytics_pipeline iot_analytics_pipeline 24 | COPY maintenance_model.pkl maintenance_model.pkl 25 | COPY MANIFEST.in MANIFEST.in 26 | COPY setup.py setup.py 27 | 28 | RUN pip install --upgrade --no-cache-dir pip \ 29 | && pip install --no-cache-dir -r requirements.txt \ 30 | && pip install --no-cache-dir -e . 31 | 32 | # Copy files from official SDK image, including script/dependencies. 33 | COPY --from=apache/beam_python3.11_sdk:2.63.0 /opt/apache/beam /opt/apache/beam 34 | 35 | # Set the entrypoint to Apache Beam SDK launcher. 36 | ENTRYPOINT ["/opt/apache/beam/boot"] -------------------------------------------------------------------------------- /pipelines/iot_analytics/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt -------------------------------------------------------------------------------- /pipelines/iot_analytics/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
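# Builds the custom Beam SDK container image for the IoT Analytics pipeline from the
# Dockerfile above. The _TAG substitution is expected to be the full container image URI
# (for example, the $CONTAINER_URI exported by the pipeline's setup scripts; this is an
# assumption based on how the other pipelines in this repository call cloudbuild.yaml).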
14 | 15 | steps: 16 | - name: 'gcr.io/cloud-builders/docker' 17 | script: | 18 | docker build -t ${_TAG} . 19 | substitutions: 20 | _TAG: unset 21 | options: 22 | substitutionOption: 'ALLOW_LOOSE' 23 | automapSubstitutions: true 24 | images: 25 | - ${_TAG} -------------------------------------------------------------------------------- /pipelines/iot_analytics/iot_analytics_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /pipelines/iot_analytics/iot_analytics_pipeline/maintenance_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/iot_analytics/iot_analytics_pipeline/maintenance_model.pkl -------------------------------------------------------------------------------- /pipelines/iot_analytics/iot_analytics_pipeline/options.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Options class for the IoT Analytics pipeline. 16 | """ 17 | 18 | from argparse import ArgumentParser 19 | from apache_beam.options.pipeline_options import PipelineOptions 20 | 21 | 22 | class MyPipelineOptions(PipelineOptions): 23 | """ 24 | Options class for the IoT Analytics pipeline. 
25 | """ 26 | 27 | @classmethod 28 | def _add_argparse_args(cls, parser: ArgumentParser): 29 | parser.add_argument( 30 | '--topic', 31 | dest='topic', 32 | help='Pub/sub topic name :"projects/your_project_id/topics/topic_name"') 33 | parser.add_argument( 34 | '--project_id', dest='project', help='Your Google Cloud project ID') 35 | parser.add_argument( 36 | '--dataset', dest='dataset', help='Enter BigQuery Dataset Id') 37 | parser.add_argument('--table', dest='table', help='Enter BigQuery Table Id') 38 | parser.add_argument( 39 | '--bigtable_instance_id', 40 | dest='bigtable_instance_id', 41 | help='Enter BigTable Instance Id') 42 | parser.add_argument( 43 | '--bigtable_table_id', 44 | dest='bigtable_table_id', 45 | help='Enter BigTable Table Id') 46 | parser.add_argument( 47 | '--row_key', dest='row_key', help='Enter BigTable row key') 48 | -------------------------------------------------------------------------------- /pipelines/iot_analytics/iot_analytics_pipeline/parse_timestamp.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Pipeline of the IoT Analytics Dataflow Solution guide. 16 | """ 17 | import typing 18 | import datetime 19 | from apache_beam.transforms.window import TimestampedValue 20 | 21 | 22 | class VehicleStateEvent(typing.NamedTuple): 23 | """ 24 | Class to create VehicleState TimestampedValue 25 | """ 26 | vehicle_id: str 27 | timestamp: datetime.datetime 28 | temperature: int 29 | rpm: int 30 | vibration: float 31 | fuel_level: int 32 | mileage: int 33 | 34 | @staticmethod 35 | def convert_json_to_vehicleobj(input_json): 36 | dt_object = datetime.datetime.strptime(input_json["timestamp"], 37 | "%Y-%m-%dT%H:%M:%SZ") 38 | event = VehicleStateEvent( 39 | vehicle_id=input_json["vehicle_id"], 40 | timestamp=dt_object, 41 | temperature=input_json["temperature"], 42 | rpm=input_json["rpm"], 43 | vibration=input_json["vibration"], 44 | fuel_level=input_json["fuel_level"], 45 | mileage=input_json["mileage"]) 46 | return TimestampedValue(event, dt_object.timestamp()) 47 | -------------------------------------------------------------------------------- /pipelines/iot_analytics/iot_analytics_pipeline/pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
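# Pipeline overview (summary of the code below): read vehicle telemetry from a Pub/Sub
# topic, parse each JSON message into a timestamped VehicleStateEvent, key the events by
# vehicle_id, aggregate them over 60-second fixed windows, enrich the aggregates with
# maintenance records from Bigtable, score them with the pickled maintenance model, and
# append the predictions to BigQuery using the Storage Write API.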
14 | """ 15 | Pipeline of the IoT Analytics Dataflow Solution guide. 16 | """ 17 | import apache_beam as beam 18 | from apache_beam import Pipeline 19 | from .options import MyPipelineOptions 20 | import json 21 | import pickle 22 | from .aggregate_metrics import AggregateMetrics 23 | from .parse_timestamp import VehicleStateEvent 24 | from .trigger_inference import RunInference 25 | from apache_beam.transforms.window import FixedWindows 26 | from apache_beam.transforms.trigger import AccumulationMode, AfterWatermark 27 | from typing import Any, Dict, Tuple 28 | 29 | from apache_beam.transforms.enrichment import Enrichment 30 | from apache_beam.transforms.enrichment_handlers.bigtable import BigTableEnrichmentHandler 31 | 32 | 33 | def custom_join(left: Dict[str, Any], right: Dict[str, Any]): 34 | enriched = {} 35 | enriched["vehicle_id"] = left["vehicle_id"] 36 | enriched["max_temperature"] = left["max_temperature"] 37 | enriched["max_vibration"] = left["max_vibration"] 38 | enriched["latest_timestamp"] = left["max_timestamp"] 39 | enriched["avg_mileage"] = left["avg_mileage"] 40 | enriched["last_service_date"] = right["maintenance"]["last_service_date"] 41 | enriched["maintenance_type"] = right["maintenance"]["maintenance_type"] 42 | enriched["model"] = right["maintenance"]["model"] 43 | return enriched 44 | 45 | 46 | with open("maintenance_model.pkl", "rb") as model_file: 47 | sklearn_model_handler = pickle.load(model_file) 48 | 49 | 50 | def create_pipeline(pipeline_options: MyPipelineOptions) -> Pipeline: 51 | """ Create the pipeline object. 52 | 53 | Args: 54 | options: The pipeline options, with type `MyPipelineOptions`. 55 | 56 | Returns: 57 | The pipeline object. 58 | """ 59 | # Define your pipeline options 60 | bigtable_handler = BigTableEnrichmentHandler( 61 | project_id=pipeline_options.project, 62 | instance_id=pipeline_options.bigtable_instance_id, 63 | table_id=pipeline_options.bigtable_table_id, 64 | row_key=pipeline_options.row_key) 65 | bq_schema = "vehicle_id:STRING, \ 66 | max_temperature:INTEGER, \ 67 | max_vibration:FLOAT, \ 68 | latest_timestamp:TIMESTAMP, \ 69 | last_service_date:STRING, \ 70 | maintenance_type:STRING, \ 71 | model:STRING, \ 72 | needs_maintenance:INTEGER" 73 | 74 | pipeline = beam.Pipeline(options=pipeline_options) 75 | enriched_data = pipeline \ 76 | | "ReadFromPubSub" >> beam.io.ReadFromPubSub(topic=pipeline_options.topic) \ 77 | | "Read JSON" >> beam.Map(json.loads) \ 78 | | "Parse&EventTimestamp" >> beam.Map( 79 | VehicleStateEvent.convert_json_to_vehicleobj).with_output_types( 80 | VehicleStateEvent) \ 81 | | "AddKeys" >> beam.WithKeys(lambda event: event.vehicle_id).with_output_types( 82 | Tuple[str, VehicleStateEvent]) \ 83 | | "Window" >> beam.WindowInto( 84 | FixedWindows(60), 85 | trigger=AfterWatermark(), 86 | accumulation_mode=AccumulationMode.ACCUMULATING) \ 87 | | "AggregateMetrics" >> beam.ParDo(AggregateMetrics()).with_output_types( 88 | VehicleStateEvent).with_input_types(Tuple[str, VehicleStateEvent]) \ 89 | | "EnrichWithBigtable" >> Enrichment( 90 | bigtable_handler, join_fn=custom_join, timeout=10) 91 | predictions = enriched_data | "RunInference" >> beam.ParDo( 92 | RunInference(model=sklearn_model_handler)) 93 | predictions | "WriteToBigQuery" >> beam.io.gcp.bigquery.WriteToBigQuery( 94 | method=beam.io.WriteToBigQuery.Method.STORAGE_WRITE_API, 95 | project=pipeline_options.project, 96 | dataset=pipeline_options.dataset, 97 | table=pipeline_options.table, 98 | schema=bq_schema, 99 | 
create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, 100 | write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND) 101 | return pipeline 102 | -------------------------------------------------------------------------------- /pipelines/iot_analytics/iot_analytics_pipeline/trigger_inference.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Pipeline of the IoT Analytics Dataflow Solution guide. 16 | """ 17 | import apache_beam as beam 18 | import pandas as pd 19 | 20 | 21 | class RunInference(beam.DoFn): 22 | """ 23 | A custom DoFn that uses the pre-trained scikit-learn model to predict whether a vehicle needs maintenance. 24 | """ 25 | 26 | def __init__(self, model): 27 | # Store the unpickled scikit-learn model passed in by the pipeline. 28 | self.model = model 29 | 30 | def process(self, element): 31 | df = pd.DataFrame([element]) 32 | df["last_service_date"] = ( 33 | pd.to_datetime(df["last_service_date"]) - 34 | pd.to_datetime(df["last_service_date"]).min()).dt.days 35 | prediction = self.model.predict( 36 | df[["max_temperature", "max_vibration", "last_service_date"]]) 37 | results = beam.Row( 38 | vehicle_id=str(element["vehicle_id"]), 39 | max_temperature=float(element["max_temperature"]), 40 | max_vibration=float(element["max_vibration"]), 41 | latest_timestamp=element["latest_timestamp"], 42 | last_service_date=element["last_service_date"], 43 | maintenance_type=element["maintenance_type"], 44 | model=element["model"], 45 | needs_maintenance=prediction[0]) 46 | yield results._asdict() 47 | -------------------------------------------------------------------------------- /pipelines/iot_analytics/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | An IoT Analytics example for the Dataflow Solution Guides. 
16 | """ 17 | 18 | import time 19 | 20 | from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions 21 | 22 | from iot_analytics_pipeline.options import MyPipelineOptions 23 | from iot_analytics_pipeline.pipeline import create_pipeline 24 | 25 | 26 | def main(options: MyPipelineOptions): 27 | pipeline = create_pipeline(options) 28 | pipeline.run() 29 | 30 | 31 | if __name__ == "__main__": 32 | pipeline_options: PipelineOptions = PipelineOptions() 33 | dataflow_options: GoogleCloudOptions = pipeline_options.view_as( 34 | GoogleCloudOptions) 35 | now_epoch_ms = int(time.time() * 1000) 36 | dataflow_options.job_name = f"iot-analytics-pipeline-{now_epoch_ms}" 37 | custom_options: MyPipelineOptions = pipeline_options.view_as( 38 | MyPipelineOptions) 39 | main(custom_options) 40 | -------------------------------------------------------------------------------- /pipelines/iot_analytics/maintenance_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/pipelines/iot_analytics/maintenance_model.pkl -------------------------------------------------------------------------------- /pipelines/iot_analytics/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam[gcp]==2.63.0 2 | pandas 3 | scikit-learn 4 | ## Below dependencies are required if you have to run script / 5 | google-cloud-bigtable 6 | pandas 7 | google-cloud-pubsub 8 | tabulate -------------------------------------------------------------------------------- /pipelines/iot_analytics/scripts/01_cloud_build_and_push.sh: -------------------------------------------------------------------------------- 1 | gcloud builds submit \ 2 | --region=$REGION \ 3 | --default-buckets-behavior=regional-user-owned-bucket \ 4 | --substitutions _TAG=$CONTAINER_URI \ 5 | . -------------------------------------------------------------------------------- /pipelines/iot_analytics/scripts/02_submit_job.sh: -------------------------------------------------------------------------------- 1 | python3 -m main \ 2 | --streaming \ 3 | --runner=DataflowRunner \ 4 | --project=$PROJECT_ID \ 5 | --temp_location=gs://$PROJECT_ID/tmp \ 6 | --region=$REGION \ 7 | --save_main_session \ 8 | --service_account_email=$SERVICE_ACCOUNT \ 9 | --subnetwork=$SUBNETWORK \ 10 | --sdk_container_image=$CONTAINER_URI \ 11 | --max_workers=$MAX_DATAFLOW_WORKERS \ 12 | --topic=$TOPIC_ID \ 13 | --dataset=$DATASET \ 14 | --table=$TABLE \ 15 | --bigtable_instance_id=$INSTANCE_ID \ 16 | --bigtable_table_id=$BIGTABLE_TABLE_ID \ 17 | --row_key=$ROW_KEY \ 18 | --project_id=$PROJECT_ID \ 19 | --enable_streaming_engine -------------------------------------------------------------------------------- /pipelines/iot_analytics/scripts/create_and_populate_bigtable.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Script that creates a Bigtable table and populates it with vehicle maintenance data for the IoT Analytics Dataflow Solution guide. 16 | """ 17 | 18 | # Create a Bigtable table and populate it with the vehicle maintenance data 19 | from google.cloud.bigtable import column_family 20 | from google.cloud.bigtable import row 21 | from google.cloud.bigtable import Client 22 | from datetime import datetime 23 | import os 24 | import json 25 | 26 | # Read the environment configuration used to create and load the maintenance data 27 | current_directory = os.getcwd() 28 | PROJECT_ID = os.environ.get("PROJECT_ID") 29 | INSTANCE_ID = os.environ.get("BIGTABLE_INSTANCE_ID") 30 | TABLE_ID = os.environ.get("BIGTABLE_TABLE_ID") 31 | MAINTENANCE_DATA_PATH = os.environ.get("MAINTENANCE_DATA_PATH") 32 | 33 | # Create a Bigtable client 34 | client = Client(project=PROJECT_ID, admin=True) 35 | instance = client.instance(INSTANCE_ID) 36 | 37 | # Create a column family. 38 | column_family_id = "maintenance" 39 | max_versions_rule = column_family.MaxVersionsGCRule(2) 40 | column_families = {column_family_id: max_versions_rule} 41 | 42 | # Create a table. 43 | table = instance.table(TABLE_ID) 44 | 45 | # You need admin access to use `.exists()`. If you don't have admin access, then 46 | # comment out the if-else block. 47 | if not table.exists(): 48 | table.create(column_families=column_families) 49 | else: 50 | print(f"Table {TABLE_ID} already exists in {PROJECT_ID}:{INSTANCE_ID}") 51 | 52 | # Define column names for the table. 53 | vehicle_id = "vehicle_id" 54 | last_service_date = "last_service_date" 55 | maintenance_type = "maintenance_type" 56 | make = "make" 57 | model = "model" 58 | 59 | # Sample maintenance data 60 | maintenance_data = [] 61 | try: 62 | with open(MAINTENANCE_DATA_PATH, "r", encoding="utf-8") as f: 63 | for line in f: 64 | try: 65 | data = json.loads(line) 66 | maintenance_data.append(data) 67 | except json.JSONDecodeError as e: 68 | print(f"Error decoding JSON from line: {line.strip()}") 69 | print(f"Error message: {e}") 70 | # Handle the error (e.g., log it, skip the line, or raise an exception) 71 | 72 | except FileNotFoundError: 73 | print(f"File not found: {MAINTENANCE_DATA_PATH}") 74 | 75 | # Populate Bigtable 76 | for record in maintenance_data: 77 | row_key = str(record[vehicle_id]).encode() 78 | row = table.direct_row(row_key) 79 | row.set_cell( 80 | column_family_id, 81 | vehicle_id.encode(), 82 | str(record[vehicle_id]), 83 | timestamp=datetime.utcnow()) 84 | row.set_cell( 85 | column_family_id, 86 | last_service_date.encode(), 87 | str(record[last_service_date]), 88 | timestamp=datetime.utcnow()) 89 | row.set_cell( 90 | column_family_id, 91 | maintenance_type.encode(), 92 | str(record[maintenance_type]), 93 | timestamp=datetime.utcnow()) 94 | row.set_cell( 95 | column_family_id, 96 | make.encode(), 97 | str(record[make]), 98 | timestamp=datetime.utcnow()) 99 | row.set_cell( 100 | column_family_id, 101 | model.encode(), 102 | str(record[model]), 103 | timestamp=datetime.utcnow()) 104 | row.commit() 105 | print(f"Inserted row for key: {record[vehicle_id]}") 106 | 107 | print("Bigtable populated with sample maintenance information.") 108 | -------------------------------------------------------------------------------- /pipelines/iot_analytics/scripts/create_data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this 
file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Pipeline of the IoT Analytics Dataflow Solution guide. 16 | """ 17 | 18 | import random 19 | import datetime 20 | import pandas as pd 21 | import os 22 | 23 | # Get Env variables 24 | current_directory = os.getcwd() 25 | VEHICLE_DATA_PATH = os.environ.get("VEHICLE_DATA_PATH") 26 | MAINTENANCE_DATA_PATH = os.environ.get("MAINTENANCE_DATA_PATH") 27 | 28 | 29 | # Function to generate random vehicle data 30 | def generate_vehicle_data(vehicle_id): 31 | return { 32 | "vehicle_id": vehicle_id, 33 | "timestamp": datetime.datetime.now().isoformat(timespec="seconds") + "Z", 34 | "temperature": random.randint(65, 85), 35 | "rpm": random.randint(1500, 3500), 36 | "vibration": round(random.uniform(0.1, 0.5), 2), 37 | "fuel_level": random.randint(50, 90), 38 | "mileage": random.randint(40000, 60000) 39 | } 40 | 41 | 42 | # Function to generate random maintenance data 43 | def generate_maintenance_data(vehicle_id): 44 | return { 45 | "vehicle_id": 46 | vehicle_id, 47 | "last_service_date": (datetime.datetime.now() - datetime.timedelta( 48 | days=random.randint(30, 365))).strftime("%Y-%m-%d"), 49 | "maintenance_type": 50 | random.choice([ 51 | "oil_change", "tire_rotation", "brake_check", "filter_replacement" 52 | ]), 53 | "make": 54 | "Ford", 55 | "model": 56 | "F-150" 57 | } 58 | 59 | 60 | # Generate 10 unique vehicle IDs 61 | vehicle_ids = [str(i) for i in range(1000, 1010)] 62 | 63 | # Create vehicle data and maintenance data lists 64 | vehicle_data = [generate_vehicle_data(vehicle_id) for vehicle_id in vehicle_ids] 65 | maintenance_data = [ 66 | generate_maintenance_data(vehicle_id) for vehicle_id in vehicle_ids 67 | ] 68 | 69 | # Convert lists to Pandas DataFrames 70 | df_vehicle_data = pd.DataFrame(vehicle_data) 71 | df_maintenance_data = pd.DataFrame(maintenance_data) 72 | 73 | df_vehicle_data.to_json(VEHICLE_DATA_PATH, orient="records", lines=True) 74 | df_maintenance_data.to_json(MAINTENANCE_DATA_PATH, orient="records", lines=True) 75 | -------------------------------------------------------------------------------- /pipelines/iot_analytics/scripts/maintenance_data.jsonl: -------------------------------------------------------------------------------- 1 | {"vehicle_id":"1000","last_service_date":"2024-12-15","maintenance_type":"oil_change","make":"Ford","model":"F-150"} 2 | {"vehicle_id":"1001","last_service_date":"2024-07-24","maintenance_type":"filter_replacement","make":"Ford","model":"F-150"} 3 | {"vehicle_id":"1002","last_service_date":"2024-02-06","maintenance_type":"tire_rotation","make":"Ford","model":"F-150"} 4 | {"vehicle_id":"1003","last_service_date":"2024-01-27","maintenance_type":"filter_replacement","make":"Ford","model":"F-150"} 5 | {"vehicle_id":"1004","last_service_date":"2024-10-06","maintenance_type":"filter_replacement","make":"Ford","model":"F-150"} 6 | {"vehicle_id":"1005","last_service_date":"2024-06-07","maintenance_type":"brake_check","make":"Ford","model":"F-150"} 7 | 
{"vehicle_id":"1006","last_service_date":"2024-03-11","maintenance_type":"brake_check","make":"Ford","model":"F-150"} 8 | {"vehicle_id":"1007","last_service_date":"2024-08-15","maintenance_type":"brake_check","make":"Ford","model":"F-150"} 9 | {"vehicle_id":"1008","last_service_date":"2024-07-29","maintenance_type":"tire_rotation","make":"Ford","model":"F-150"} 10 | {"vehicle_id":"1009","last_service_date":"2024-12-15","maintenance_type":"brake_check","make":"Ford","model":"F-150"} 11 | -------------------------------------------------------------------------------- /pipelines/iot_analytics/scripts/model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Creates model for IoT Analytics Solution Dataflow Solution guide. 16 | """ 17 | 18 | import pandas as pd 19 | import numpy as np 20 | from datetime import datetime, timedelta 21 | from sklearn.model_selection import train_test_split 22 | from sklearn.linear_model import LogisticRegression 23 | import pickle 24 | 25 | 26 | def create_sample_data(num_samples): 27 | data = { 28 | "vehicle_id": [], 29 | "max_temperature": [], 30 | "max_vibration": [], 31 | "last_service_date": [], 32 | "needs_maintenance": [] 33 | } 34 | 35 | for i in range(num_samples): 36 | vehicle_id = str(1000 + i) 37 | max_temperature = np.random.randint(50, 100) 38 | max_vibration = np.random.uniform(0, 1) 39 | last_service_date = datetime.now() - timedelta( 40 | days=np.random.randint(0, 365)) 41 | last_service_date_str = last_service_date.strftime("%Y-%m-%d") 42 | 43 | needs_maintenance = (max_temperature > 75) or (max_vibration > 0.5) or ( 44 | last_service_date < datetime.now() - timedelta(days=180)) 45 | 46 | data["vehicle_id"].append(vehicle_id) 47 | data["max_temperature"].append(max_temperature) 48 | data["max_vibration"].append(max_vibration) 49 | data["last_service_date"].append(last_service_date_str) 50 | data["needs_maintenance"].append(needs_maintenance) 51 | 52 | return pd.DataFrame(data) 53 | 54 | 55 | # Create a sample dataset with 100 samples 56 | df = create_sample_data(100) 57 | print(df.head(n=10).to_markdown()) 58 | 59 | # Convert the last_service_date to a datetime object 60 | df["last_service_date"] = pd.to_datetime(df["last_service_date"]) 61 | 62 | # Features and target variable 63 | X = df[["max_temperature", "max_vibration", "last_service_date"]] 64 | y = df["needs_maintenance"].astype(int) 65 | 66 | # Convert last_service_date to numeric for modeling 67 | X["last_service_date"] = (X["last_service_date"] - 68 | X["last_service_date"].min()).dt.days 69 | 70 | # Split the dataset 71 | X_train, X_test, y_train, y_test = train_test_split( 72 | X, y, test_size=0.2, random_state=42) 73 | 74 | # Create and train the model 75 | model = LogisticRegression() 76 | model.fit(X_train, y_train) 77 | 78 | # Save the model to a local file 79 | with open("maintenance_model.pkl", "wb") as f: 80 | 
print("Added Model") 81 | pickle.dump(model, f) 82 | -------------------------------------------------------------------------------- /pipelines/iot_analytics/scripts/publish_on_pubsub.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Pipeline of the IoT Analytics Dataflow Solution guide. 16 | """ 17 | 18 | import json 19 | from google.cloud import pubsub_v1 20 | import os 21 | 22 | 23 | def publish_messages(project, topic, data_path): 24 | """ 25 | Publishes JSON messages from a file to a Pub/Sub topic. 26 | 27 | Args: 28 | project: The ID of the Google Cloud project. 29 | topic: The ID of the Pub/Sub topic. 30 | data_path: The path to the JSON data file. 31 | """ 32 | 33 | publisher = pubsub_v1.PublisherClient() 34 | topic_path = publisher.topic_path(project, topic) 35 | 36 | with open(data_path, "r", encoding="utf-8") as f: 37 | for line in f: 38 | try: 39 | # Parse each line as a JSON object 40 | json_data = json.loads(line) 41 | 42 | # Publish the JSON data as a message 43 | message_data = json.dumps(json_data).encode("utf-8") 44 | future = publisher.publish(topic_path, message_data) 45 | print(f"Published message ID: {future.result()}") 46 | 47 | except json.JSONDecodeError as e: 48 | print(f"Error decoding JSON: {e}") 49 | 50 | 51 | if __name__ == "__main__": 52 | current_directory = os.getcwd() 53 | publish_messages( 54 | os.environ.get("PROJECT_ID"), os.environ.get("PUBSUB_TOPIC_ID"), 55 | os.environ.get("VEHICLE_DATA_PATH")) 56 | -------------------------------------------------------------------------------- /pipelines/iot_analytics/scripts/vehicle_data.jsonl: -------------------------------------------------------------------------------- 1 | {"vehicle_id":"1000","timestamp":"2025-01-18T15:56:41Z","temperature":85,"rpm":1797,"vibration":0.11,"fuel_level":64,"mileage":52571} 2 | {"vehicle_id":"1001","timestamp":"2025-01-18T15:56:41Z","temperature":74,"rpm":1967,"vibration":0.37,"fuel_level":67,"mileage":46017} 3 | {"vehicle_id":"1002","timestamp":"2025-01-18T15:56:41Z","temperature":80,"rpm":2529,"vibration":0.31,"fuel_level":59,"mileage":44782} 4 | {"vehicle_id":"1003","timestamp":"2025-01-18T15:56:41Z","temperature":67,"rpm":3312,"vibration":0.23,"fuel_level":62,"mileage":59421} 5 | {"vehicle_id":"1004","timestamp":"2025-01-18T15:56:41Z","temperature":77,"rpm":3206,"vibration":0.27,"fuel_level":74,"mileage":52049} 6 | {"vehicle_id":"1005","timestamp":"2025-01-18T15:56:41Z","temperature":66,"rpm":3091,"vibration":0.31,"fuel_level":80,"mileage":52200} 7 | {"vehicle_id":"1006","timestamp":"2025-01-18T15:56:41Z","temperature":81,"rpm":2883,"vibration":0.46,"fuel_level":85,"mileage":40869} 8 | {"vehicle_id":"1007","timestamp":"2025-01-18T15:56:41Z","temperature":69,"rpm":1696,"vibration":0.12,"fuel_level":79,"mileage":46986} 9 | 
{"vehicle_id":"1008","timestamp":"2025-01-18T15:56:41Z","temperature":69,"rpm":3308,"vibration":0.14,"fuel_level":58,"mileage":47238} 10 | {"vehicle_id":"1009","timestamp":"2025-01-18T15:56:41Z","temperature":83,"rpm":2238,"vibration":0.3,"fuel_level":74,"mileage":44609} 11 | -------------------------------------------------------------------------------- /pipelines/iot_analytics/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Setup file for the IoT Analytics pipeline. 16 | """ 17 | 18 | from setuptools import setup, find_packages 19 | 20 | with open("requirements.txt", encoding="utf-8") as f: 21 | requirements = f.readlines() 22 | 23 | setup( 24 | name="IoT Dataflow Anaytics Pipeline", 25 | version="0.1", 26 | packages=find_packages(), 27 | install_requires=requirements, 28 | package_data={"iot_analytics_pipeline": ["maintenance_model.pkl"]}) 29 | -------------------------------------------------------------------------------- /pipelines/log_replication_splunk/README.md: -------------------------------------------------------------------------------- 1 | # Log replication sample pipeline (Dataflow template) 2 | 3 | This sample pipeline reads log lines with additional metadata from a Pub/Sub 4 | topic, and it redirects to the corresponding log collector in Splunk. The 5 | pipeline leverages the 6 | [Google-provided Dataflow template](https://cloud.google.com/dataflow/docs/guides/templates/provided/pubsub-to-splunk). 7 | 8 | 9 | This pipeline is part of the [Dataflow log replication & analytics solution 10 | guide](../../use_cases/Log_replication.md). 11 | 12 | ## Architecture 13 | 14 | The generic architecture for both looks like this: 15 | 16 | ![Architecture](../imgs/log_replication.png) 17 | 18 | The Terraform code configures a Cloud Logging sink that makes sure that all 19 | logs are sent to the `all-logs` Pub/Sub topic. 20 | 21 | The infrastructure required to launch the pipelines is deployed 22 | through [the accompanying Terraform scripts in this solution guide](../../terraform/log_replication_splunk/README.md). 23 | 24 | ## How to launch the pipeline 25 | 26 | All the scripts are located in the `scripts` directory and prepared to be launched from the top 27 | sources directory. 28 | 29 | The Terraform code generates a file with all the necessary variables in the 30 | location `./scripts/00_set_variables.sh`. Run the following command to 31 | apply that configuration: 32 | 33 | ```sh 34 | source scripts/01_set_variables.sh 35 | ``` 36 | 37 | Now you can run the pipeline that will take logs from Pub/Sub and will send 38 | them to Splunk. You need to ensure that there is network connectivity to 39 | access Splunk from Dataflow (e.g. 
Internet access, if necessary), and that 40 | you have set the required credentials in the Terraform config, so Dataflow 41 | has the required permissions to publish into Splunk: 42 | 43 | ```sh 44 | ./scripts/01_launch_ps_to_splunk.sh 45 | ``` 46 | 47 | ## Input data 48 | 49 | All the logs produced in the project are redirected to the Pub/Sub 50 | topic `all-logs`. The pipeline uses a Pub/Sub subscription, `all-logs-sub`, 51 | so no logs are lost if the pipeline is stopped (during the retention period 52 | of the subscription, which is 30 days by default). 53 | 54 | The regular operation of the project (e.g. launching Dataflow) should 55 | already produce enough logs to observe some output in Splunk for testing 56 | purposes. 57 | 58 | ## Output data 59 | 60 | There are two outputs in this pipeline: 61 | * Splunk, written to the HEC endpoint 62 | * Dead letter queue, the `deadletter-topic` Pub/Sub topic 63 | 64 | When Splunk rejects messages for whatever reason, they are sent to the 65 | `deadletter-topic`. 66 | 67 | If the Splunk endpoint rejects messages because it is overloaded, times out, 68 | etc., Dataflow will retry publishing those messages to Splunk. Only the 69 | messages that are rejected by Splunk due to non-transient errors are sent 70 | to the dead letter queue. -------------------------------------------------------------------------------- /pipelines/log_replication_splunk/scripts/.gitignore: -------------------------------------------------------------------------------- 1 | 00_set_variables.sh 2 | -------------------------------------------------------------------------------- /pipelines/log_replication_splunk/scripts/01_launch_ps_to_splunk.sh: -------------------------------------------------------------------------------- 1 | gcloud dataflow jobs run logs-to-splunk \ 2 | --gcs-location gs://dataflow-templates-$REGION/latest/Cloud_PubSub_to_Splunk \ 3 | --region $REGION \ 4 | --project $PROJECT \ 5 | --service-account-email $SERVICE_ACCOUNT \ 6 | --staging-location $TEMP_LOCATION \ 7 | --subnetwork $NETWORK \ 8 | --enable-streaming-engine \ 9 | --disable-public-ips \ 10 | --max-workers=$MAX_DATAFLOW_WORKERS \ 11 | --parameters \ 12 | inputSubscription=$INPUT_SUBSCRIPTION,\ 13 | url=$SPLUNK_HEC_URL,\ 14 | disableCertificateValidation=false,\ 15 | includePubsubMessage=false,\ 16 | tokenSecretId=$TOKEN_SECRET_ID,\ 17 | tokenSource=SECRET_MANAGER,\ 18 | enableBatchLogs=true,\ 19 | enableGzipHttpCompression=true,\ 20 | outputDeadletterTopic=$DEADLETTER_TOPIC 21 | -------------------------------------------------------------------------------- /pipelines/marketing_intelligence/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
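# Custom Dataflow worker container for the Marketing Intelligence pipeline:
# it starts from a TensorFlow GPU base image, installs the pipeline package
# and its requirements, and copies the Apache Beam SDK launcher from the
# official Beam SDK image. It is typically built with Cloud Build (see
# cloudbuild.yaml and scripts/01_build_and_push_container.sh), for example:
#   gcloud builds submit --region=$REGION --substitutions _TAG=$CONTAINER_URI .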
14 | 15 | ARG SERVING_BUILD_IMAGE=tensorflow/tensorflow:2.18.0-gpu 16 | FROM ${SERVING_BUILD_IMAGE} 17 | WORKDIR /workspace 18 | 19 | RUN apt-get update -y && apt-get install -y \ 20 | cmake 21 | 22 | COPY requirements.txt requirements.txt 23 | COPY main.py main.py 24 | COPY marketing_intelligence_pipeline marketing_intelligence_pipeline 25 | COPY MANIFEST.in MANIFEST.in 26 | COPY setup.py setup.py 27 | 28 | RUN pip install --upgrade --no-cache-dir pip \ 29 | && pip install --no-cache-dir -r requirements.txt \ 30 | && pip install --no-cache-dir -e . 31 | 32 | # Copy files from official SDK image, including script/dependencies. 33 | COPY --from=apache/beam_python3.11_sdk:2.63.0 /opt/apache/beam /opt/apache/beam 34 | 35 | 36 | ENV KERAS_BACKEND="tensorflow" 37 | 38 | # Set the entrypoint to Apache Beam SDK launcher. 39 | ENTRYPOINT ["/opt/apache/beam/boot"] 40 | -------------------------------------------------------------------------------- /pipelines/marketing_intelligence/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt -------------------------------------------------------------------------------- /pipelines/marketing_intelligence/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | steps: 16 | # - name: 'gcr.io/cloud-builders/gsutil' 17 | # script: | 18 | # echo Copying Gemma model from $_GCS_GEMMA_PATH 19 | # gsutil -m -q cp -r $_GCS_GEMMA_PATH /workspace 20 | # echo All files copied. 21 | - name: 'gcr.io/cloud-builders/docker' 22 | script: | 23 | docker build -t ${_TAG} . 24 | substitutions: 25 | _TAG: unset 26 | options: 27 | substitutionOption: 'ALLOW_LOOSE' 28 | automapSubstitutions: true 29 | machineType: E2_HIGHCPU_8 30 | images: 31 | - ${_TAG} -------------------------------------------------------------------------------- /pipelines/marketing_intelligence/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | A Product predictor example for the Dataflow Solution Guides. 
16 | """ 17 | 18 | import time 19 | 20 | from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions 21 | 22 | from marketing_intelligence_pipeline.options import MyPipelineOptions 23 | from marketing_intelligence_pipeline.pipeline import create_pipeline 24 | 25 | 26 | def main(options: MyPipelineOptions): 27 | pipeline = create_pipeline(options) 28 | pipeline.run() 29 | 30 | 31 | if __name__ == "__main__": 32 | pipeline_options: PipelineOptions = PipelineOptions() 33 | dataflow_options: GoogleCloudOptions = pipeline_options.view_as( 34 | GoogleCloudOptions) 35 | now_epoch_ms = int(time.time() * 1000) 36 | dataflow_options.job_name = f"marketing-intelligence-pipeline-{now_epoch_ms}" 37 | custom_options: MyPipelineOptions = pipeline_options.view_as( 38 | MyPipelineOptions) 39 | main(custom_options) 40 | -------------------------------------------------------------------------------- /pipelines/marketing_intelligence/marketing_intelligence_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /pipelines/marketing_intelligence/marketing_intelligence_pipeline/options.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Options class for the Marketing Intelligence pipeline. 
16 | """ 17 | 18 | from argparse import ArgumentParser 19 | 20 | from apache_beam.options.pipeline_options import PipelineOptions 21 | 22 | 23 | class MyPipelineOptions(PipelineOptions): 24 | 25 | @classmethod 26 | def _add_argparse_args(cls, parser: ArgumentParser): 27 | parser.add_argument("--messages_subscription", type=str) 28 | parser.add_argument("--model_endpoint", type=str) 29 | parser.add_argument("--project_id", type=str) 30 | parser.add_argument("--location", type=str) 31 | parser.add_argument("--responses_topic", type=str) 32 | -------------------------------------------------------------------------------- /pipelines/marketing_intelligence/marketing_intelligence_pipeline/pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Pipeline of the Marketing Intelligence Dataflow Solution guide. 16 | """ 17 | 18 | from apache_beam import Pipeline, PCollection 19 | from apache_beam.ml.inference import RunInference 20 | from apache_beam.io.gcp import pubsub 21 | import json 22 | import apache_beam as beam 23 | from apache_beam.ml.inference.base import PredictionResult 24 | from apache_beam.ml.inference.vertex_ai_inference import VertexAIModelHandlerJSON 25 | from .options import MyPipelineOptions 26 | 27 | 28 | # Format the predictions sent by the Vertex AI Endpoint 29 | def _format_output(element: PredictionResult) -> str: 30 | return f"Input: \n{element.example}, \n\n\nOutput: \n{element.infernece}" 31 | 32 | 33 | # Format the input and send each input as a dictionary 34 | def _format_input(x: bytes) -> dict: 35 | instance_dict = json.loads(x.decode("utf-8")) 36 | return instance_dict 37 | 38 | 39 | # Read input from Pub/Sub (all input data to be sent in String) and format it 40 | @beam.ptransform_fn 41 | def _extract(p: Pipeline, subscription: str) -> PCollection[str]: 42 | msgs: PCollection[bytes] = p | "Read subscription" >> beam.io.ReadFromPubSub( 43 | subscription=subscription) 44 | return msgs | "Parse and format Input" >> beam.Map(_format_input) 45 | 46 | 47 | # TODO Add transformation for BigTable Enrichment 48 | 49 | 50 | # Request predictions from the Vertex AI endpoint by sending the formatted input 51 | @beam.ptransform_fn 52 | def _transform(msgs: PCollection[str], model_endpoint: str, project: str, 53 | location: str) -> PCollection[str]: 54 | model_handler = VertexAIModelHandlerJSON( 55 | endpoint_id=model_endpoint, project=project, location=location) 56 | preds: PCollection[ 57 | PredictionResult] = msgs | "RunInference-vertexai" >> RunInference( 58 | model_handler) 59 | return preds | "Format Output" >> beam.Map(_format_output) 60 | 61 | 62 | def create_pipeline(options: MyPipelineOptions) -> Pipeline: 63 | """ Create the pipeline object. 64 | 65 | Args: 66 | options: The pipeline options, with type `MyPipelineOptions`. 67 | 68 | Returns: 69 | The pipeline object. 
70 | """ 71 | pipeline = beam.Pipeline(options=options) 72 | # Extract 73 | messages: PCollection[str] = pipeline | "Read" >> _extract( 74 | subscription=options.messages_subscription) 75 | # Transform 76 | predictions: PCollection[str] = messages | "Transform" >> _transform( 77 | model_endpoint=options.model_endpoint, 78 | project=options.project_id, 79 | location=options.location) 80 | # Load 81 | predictions | "Publish Result" >> pubsub.WriteStringsToPubSub( 82 | topic=options.responses_topic) 83 | 84 | return pipeline 85 | -------------------------------------------------------------------------------- /pipelines/marketing_intelligence/requirements.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apache-beam[gcp]==2.63.0 -------------------------------------------------------------------------------- /pipelines/marketing_intelligence/scripts/.gitignore: -------------------------------------------------------------------------------- 1 | 00_set_variables.sh -------------------------------------------------------------------------------- /pipelines/marketing_intelligence/scripts/01_build_and_push_container.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | gcloud builds submit \ 16 | --region=$REGION \ 17 | --default-buckets-behavior=regional-user-owned-bucket \ 18 | --substitutions _TAG=$CONTAINER_URI\ 19 | . -------------------------------------------------------------------------------- /pipelines/marketing_intelligence/scripts/02_run_dataflow.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | python main.py \ 16 | --runner=DataflowRunner \ 17 | --project=$PROJECT \ 18 | --temp_location=gs://$PROJECT/tmp \ 19 | --region=$REGION \ 20 | --save_main_session \ 21 | --machine_type=$MACHINE_TYPE \ 22 | --num_workers=1 \ 23 | --disk_size_gb=$DISK_SIZE_GB \ 24 | --max_num_workers=$MAX_DATAFLOW_WORKERS \ 25 | --no_use_public_ips \ 26 | --service_account_email=$SERVICE_ACCOUNT \ 27 | --subnetwork=$SUBNETWORK \ 28 | --sdk_container_image=$CONTAINER_URI \ 29 | --dataflow_service_options="worker_accelerator=type:nvidia-l4;count:1;install-nvidia-driver:5xx" \ 30 | --messages_subscription=projects/$PROJECT/subscriptions/dataflow-solutions-guide-market-intelligence-input-sub \ 31 | --responses_topic=projects/$PROJECT/topics/dataflow-solutions-guide-market-intelligence-output \ 32 | --project_id=$PROJECT \ 33 | --location=$REGION \ 34 | --model_endpoint="" 35 | 36 | -------------------------------------------------------------------------------- /pipelines/marketing_intelligence/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Setup file for Market Intelligence pipeline. 16 | """ 17 | 18 | from setuptools import setup, find_packages 19 | 20 | with open("requirements.txt", encoding="utf-8") as f: 21 | requirements = f.readlines() 22 | 23 | setup( 24 | name="Dataflow Solution for Market Intelligence pipelines", 25 | version="0.1", 26 | description="A Product predictor example for the Dataflow Solution Guides.", 27 | packages=find_packages(), 28 | install_requires=requirements, 29 | ) 30 | -------------------------------------------------------------------------------- /pipelines/ml_ai_python/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ARG SERVING_BUILD_IMAGE=tensorflow/tensorflow:2.18.0-gpu 16 | FROM ${SERVING_BUILD_IMAGE} 17 | WORKDIR /workspace 18 | 19 | RUN apt-get update -y && apt-get install -y \ 20 | cmake 21 | 22 | COPY requirements.txt requirements.txt 23 | COPY main.py main.py 24 | COPY ml_ai_pipeline ml_ai_pipeline 25 | COPY MANIFEST.in MANIFEST.in 26 | COPY setup.py setup.py 27 | 28 | RUN pip install --upgrade --no-cache-dir pip \ 29 | && pip install --no-cache-dir -r requirements.txt \ 30 | && pip install --no-cache-dir -e . 
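# The gemma_2B directory copied below is not part of the repository; the Cloud
# Build job (cloudbuild.yaml) downloads it into the build context from the GCS
# path given in the _GCS_GEMMA_PATH substitution before running `docker build`.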
31 | 32 | # Copy files from official SDK image, including script/dependencies. 33 | COPY --from=apache/beam_python3.11_sdk:2.63.0 /opt/apache/beam /opt/apache/beam 34 | 35 | COPY gemma_2B gemma_2B 36 | 37 | ENV KERAS_BACKEND="tensorflow" 38 | 39 | # Set the entrypoint to Apache Beam SDK launcher. 40 | ENTRYPOINT ["/opt/apache/beam/boot"] -------------------------------------------------------------------------------- /pipelines/ml_ai_python/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt -------------------------------------------------------------------------------- /pipelines/ml_ai_python/README.md: -------------------------------------------------------------------------------- 1 | # GenAI & Machine Learning inference sample pipeline (Python) 2 | 3 | This sample pipeline demonstrates how to use Dataflow to process data and calculate predictions 4 | using GenAI, specifically the [Google open source Gemma model](https://ai.google.dev/gemma). 5 | This pipeline is written in Python. 6 | 7 | This pipeline is part of the [Dataflow Gen AI & ML solution guide](../../use_cases/GenAI_ML.md). 8 | 9 | ## Architecture 10 | 11 | The generic architecture for an inference pipeline looks as follows: 12 | 13 | ![Architecture](../imgs/ml_ai_arch.png) 14 | 15 | In this directory, you will find a specific implementation of the above architecture, with the 16 | following stages: 17 | 18 | 1. **Data ingestion:** Reads data from a Pub/Sub topic. 19 | 2. **Data preprocessing:** The sample pipeline does not do any transformation, but it is trivial 20 | to add a preprocessing step leveraging 21 | [the Enrichment transform](https://cloud.google.com/dataflow/docs/guides/enrichment) to perform 22 | feature engineering before calling the model. 23 | 3. **Inference:** Uses the RunInference transform with a custom model handler, using Keras and TensorFlow, to call the Gemma model. The pipeline uses a GPU on the Dataflow workers to speed up inference. 24 | 4. **Predictions:** The predictions are sent to another Pub/Sub topic as output. 25 | 26 | ## Gemma model 27 | 28 | The model needs to be uploaded to GCS in a directory named `gemma_2B` in the bucket created by 29 | Terraform (same name as project id). 30 | 31 | For that, please first [download the Gemma model from Kaggle](https://www.kaggle.com/models/google/gemma), 32 | uncompress it, and then upload it with a command similar to this one: 33 | 34 | ```sh 35 | gcloud storage cp -r LOCAL_DIRECTORY gs://<YOUR_PROJECT_ID>/gemma_2B 36 | ``` 37 | 38 | That command will do parallel composite uploads to speed up the uploading of the largest files in 39 | the model. 40 | 41 | ## Selecting the cloud region 42 | 43 | Not all the resources may be available in all the regions. The default values included in this 44 | directory have been tested using `us-central1` as region. 45 | 46 | The file `cloudbuild.yaml` uses `E2_HIGHCPU_8` as the default machine type. If 47 | that's not available in your preferred region, try with other machine types that are available 48 | in Cloud Build: 49 | * https://cloud.google.com/build/docs/api/reference/rest/v1/projects.builds#machinetype 50 | 51 | Moreover, the file `scripts/00_set_environment.sh` specifies a machine type for the Dataflow workers. 52 | The selected machine type, `g2-standard-4`, is the recommended one for inference with GPU. 
If that 53 | type is not available in your region, you can check what machines are available to use with the 54 | following command: 55 | 56 | ```sh 57 | gcloud compute machine-types list --zones=<zone1>,<zone2>,... 58 | ``` 59 | 60 | See more info about selecting the right type of machine in the following link: 61 | * https://cloud.google.com/compute/docs/machine-resource 62 | 63 | ## How to launch the pipeline 64 | 65 | All the scripts are located in the `scripts` directory and prepared to be launched from the top 66 | sources directory. 67 | 68 | In the script `scripts/00_set_environment.sh`, define the value of the project id and the region variable: 69 | 70 | ``` 71 | export PROJECT=<PROJECT_ID> 72 | export REGION=<REGION> 73 | ``` 74 | 75 | Leave the rest of the variables untouched, although you can override them if you prefer. 76 | 77 | After you edit the script, load those variables into the environment: 78 | 79 | ```sh 80 | source scripts/00_set_environment.sh 81 | ``` 82 | 83 | And then run the script that builds and publishes the custom Dataflow container. This container will 84 | contain the Gemma model, and all the required dependencies. 85 | 86 | ```sh 87 | ./scripts/01_build_and_push_container.sh 88 | ``` 89 | 90 | This will create a Cloud Build job that can take a few minutes to complete. Once it completes, you 91 | can trigger the pipeline with the following: 92 | 93 | ```sh 94 | ./scripts/02_run_dataflow.sh 95 | ``` 96 | 97 | ## Input data 98 | 99 | To send data into the pipeline, you need to publish messages in the `messages` topic. Those 100 | messages are passed "as is" to Gemma, so you may want to add some prompting to the question. 101 | 102 | ## Output data 103 | 104 | The predictions are published into the topic `predictions`, and can be observed using the 105 | subscription `predictions-sub`. -------------------------------------------------------------------------------- /pipelines/ml_ai_python/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | steps: 16 | - name: 'gcr.io/cloud-builders/gsutil' 17 | script: | 18 | echo Copying Gemma model from $_GCS_GEMMA_PATH 19 | gsutil -m -q cp -r $_GCS_GEMMA_PATH /workspace 20 | echo All files copied. 21 | - name: 'gcr.io/cloud-builders/docker' 22 | script: | 23 | docker build -t ${_TAG} . 24 | substitutions: 25 | _GCS_GEMMA_PATH: unset 26 | _TAG: unset 27 | options: 28 | substitutionOption: 'ALLOW_LOOSE' 29 | automapSubstitutions: true 30 | machineType: E2_HIGHCPU_8 31 | images: 32 | - ${_TAG} -------------------------------------------------------------------------------- /pipelines/ml_ai_python/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | A machine learning streaming inference example for the Dataflow Solution Guides. 16 | """ 17 | 18 | import time 19 | 20 | from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions 21 | 22 | from ml_ai_pipeline.options import MyPipelineOptions 23 | from ml_ai_pipeline.pipeline import create_pipeline 24 | 25 | 26 | def main(options: MyPipelineOptions): 27 | pipeline = create_pipeline(options) 28 | pipeline.run() 29 | 30 | 31 | if __name__ == "__main__": 32 | pipeline_options: PipelineOptions = PipelineOptions() 33 | dataflow_options: GoogleCloudOptions = pipeline_options.view_as( 34 | GoogleCloudOptions) 35 | now_epoch_ms = int(time.time() * 1000) 36 | dataflow_options.job_name = f"gemma-inference-pipeline-{now_epoch_ms}" 37 | custom_options: MyPipelineOptions = pipeline_options.view_as( 38 | MyPipelineOptions) 39 | main(custom_options) 40 | -------------------------------------------------------------------------------- /pipelines/ml_ai_python/ml_ai_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /pipelines/ml_ai_python/ml_ai_pipeline/model_handlers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Custom model handlers to be used with RunInference. 16 | """ 17 | 18 | from typing import Sequence, Optional, Any, Iterable 19 | 20 | import keras_nlp 21 | from apache_beam.ml.inference.base import ModelHandler, PredictionResult 22 | from keras_nlp.src.models import GemmaCausalLM 23 | 24 | 25 | class GemmaModelHandler(ModelHandler[str, PredictionResult, GemmaCausalLM]): 26 | """ 27 | A RunInference model handler for the Gemma model. 
28 | """ 29 | 30 | def __init__(self, model_name: str = "gemma_2B"): 31 | """ Implementation of the ModelHandler interface for Gemma using text as input. 32 | 33 | Example Usage:: 34 | 35 | pcoll | RunInference(GemmaModelHandler()) 36 | 37 | Args: 38 | model_name: The Gemma model name. Default is gemma_2B. 39 | """ 40 | super().__init__() 41 | self._model_name = model_name 42 | self._env_vars = {} 43 | 44 | def share_model_across_processes(self) -> bool: 45 | """ Indicates if the model should be loaded once-per-VM rather than 46 | once-per-worker-process on a VM. Because Gemma is a large language model, 47 | this will always return True to avoid OOM errors. 48 | """ 49 | return True 50 | 51 | def load_model(self) -> GemmaCausalLM: 52 | """Loads and initializes a model for processing.""" 53 | return keras_nlp.models.GemmaCausalLM.from_preset(self._model_name) 54 | 55 | def run_inference( 56 | self, 57 | batch: Sequence[str], 58 | model: GemmaCausalLM, 59 | unused: Optional[dict[str, Any]] = None) -> Iterable[PredictionResult]: 60 | """Runs inferences on a batch of text strings. 61 | 62 | Args: 63 | batch: A sequence of examples as text strings. 64 | model: The Gemma model being used. 65 | 66 | Returns: 67 | An Iterable of type PredictionResult. 68 | """ 69 | _ = unused # for interface compatibility with Model Handler 70 | # Loop each text string, and use a tuple to store the inference results. 71 | for one_text in batch: 72 | result = model.generate(one_text, max_length=64) 73 | yield PredictionResult(one_text, result, self._model_name) 74 | -------------------------------------------------------------------------------- /pipelines/ml_ai_python/ml_ai_pipeline/options.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Options class for the streaming inference pipeline. 16 | """ 17 | 18 | from argparse import ArgumentParser 19 | 20 | from apache_beam.options.pipeline_options import PipelineOptions 21 | 22 | 23 | class MyPipelineOptions(PipelineOptions): 24 | 25 | @classmethod 26 | def _add_argparse_args(cls, parser: ArgumentParser): 27 | parser.add_argument("--messages_subscription", type=str) 28 | parser.add_argument("--model_path", type=str) 29 | parser.add_argument("--responses_topic", type=str) 30 | -------------------------------------------------------------------------------- /pipelines/ml_ai_python/ml_ai_pipeline/pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | A machine learning streaming inference pipeline for the Dataflow Solution Guides. 16 | """ 17 | 18 | from apache_beam import Pipeline, PCollection 19 | from apache_beam.ml.inference import RunInference 20 | from apache_beam.io.gcp import pubsub 21 | 22 | import apache_beam as beam 23 | from apache_beam.ml.inference.base import PredictionResult 24 | 25 | from .model_handlers import GemmaModelHandler 26 | from .options import MyPipelineOptions 27 | 28 | 29 | def _format_output(element: PredictionResult) -> str: 30 | return f"Input: \n{element.example}, \n\n\nOutput: \n{element.inference}" 31 | 32 | 33 | @beam.ptransform_fn 34 | def _extract(p: Pipeline, subscription: str) -> PCollection[str]: 35 | msgs: PCollection[bytes] = p | "Read subscription" >> beam.io.ReadFromPubSub( 36 | subscription=subscription) 37 | return msgs | "Parse" >> beam.Map(lambda x: x.decode("utf-8")) 38 | 39 | 40 | @beam.ptransform_fn 41 | def _transform(msgs: PCollection[str], model_path: str) -> PCollection[str]: 42 | preds: PCollection[ 43 | PredictionResult] = msgs | "RunInference-Gemma" >> RunInference( 44 | GemmaModelHandler(model_path)) 45 | return preds | "Format Output" >> beam.Map(_format_output) 46 | 47 | 48 | def create_pipeline(options: MyPipelineOptions) -> Pipeline: 49 | """ Create the pipeline object. 50 | 51 | Args: 52 | options: The pipeline options, with type `MyPipelineOptions`. 53 | 54 | Returns: 55 | The pipeline object. 56 | """ 57 | pipeline = beam.Pipeline(options=options) 58 | # Extract 59 | msgs: PCollection[str] = pipeline | "Read" >> _extract( 60 | subscription=options.messages_subscription) 61 | # Transform 62 | responses: PCollection[str] = msgs | "Transform" >> _transform( 63 | model_path=options.model_path) 64 | # Load 65 | responses | "Publish Result" >> pubsub.WriteStringsToPubSub( 66 | topic=options.responses_topic) 67 | 68 | return pipeline 69 | -------------------------------------------------------------------------------- /pipelines/ml_ai_python/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | tensorflow==2.18.0 16 | -------------------------------------------------------------------------------- /pipelines/ml_ai_python/requirements.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apache-beam[gcp]==2.63.0 16 | keras_nlp==0.19.2 17 | keras==3.9.0 18 | protobuf==4.25.6 -------------------------------------------------------------------------------- /pipelines/ml_ai_python/scripts/.gitignore: -------------------------------------------------------------------------------- 1 | 00_set_variables.sh 2 | -------------------------------------------------------------------------------- /pipelines/ml_ai_python/scripts/01_build_and_push_container.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | gcloud builds submit \ 16 | --region=$REGION \ 17 | --default-buckets-behavior=regional-user-owned-bucket \ 18 | --substitutions _TAG=$CONTAINER_URI,_GCS_GEMMA_PATH=$GCS_GEMMA_PATH \ 19 | . -------------------------------------------------------------------------------- /pipelines/ml_ai_python/scripts/02_run_dataflow.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
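#
# Note: this script expects the environment variables referenced below
# (PROJECT, REGION, MACHINE_TYPE, DISK_SIZE_GB, MAX_DATAFLOW_WORKERS,
# SERVICE_ACCOUNT, SUBNETWORK and CONTAINER_URI) to already be exported in
# the current shell, for example by sourcing the environment script described
# in the pipeline README before running it.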
14 | 15 | python main.py \ 16 | --runner=DataflowRunner \ 17 | --project=$PROJECT \ 18 | --temp_location=gs://$PROJECT/tmp \ 19 | --region=$REGION \ 20 | --save_main_session \ 21 | --machine_type=$MACHINE_TYPE \ 22 | --num_workers=1 \ 23 | --disk_size_gb=$DISK_SIZE_GB \ 24 | --max_num_workers=$MAX_DATAFLOW_WORKERS \ 25 | --no_use_public_ip \ 26 | --service_account_email=$SERVICE_ACCOUNT \ 27 | --subnetwork=$SUBNETWORK \ 28 | --sdk_container_image=$CONTAINER_URI \ 29 | --dataflow_service_options="worker_accelerator=type:nvidia-l4;count:1;install-nvidia-driver:5xx" \ 30 | --messages_subscription=projects/$PROJECT/subscriptions/messages-sub \ 31 | --responses_topic=projects/$PROJECT/topics/predictions \ 32 | --model_path="gemma_2B" 33 | 34 | -------------------------------------------------------------------------------- /pipelines/ml_ai_python/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Setup file for the machine learning streaming inference pipeline. 16 | """ 17 | 18 | from setuptools import setup, find_packages 19 | 20 | with open("requirements.txt", encoding="utf-8") as f: 21 | requirements = f.readlines() 22 | 23 | setup( 24 | name="Dataflow Solution for ML/AI pipelines", 25 | version="0.1", 26 | description="A ML/AI pipeline example for the Dataflow Solution Guides.", 27 | packages=find_packages(), 28 | install_requires=requirements, 29 | ) 30 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "config:recommended" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /terraform/.gitignore: -------------------------------------------------------------------------------- 1 | .terraform* 2 | terraform.tfstate* 3 | .idea 4 | terraform.tfvars 5 | backend.tf -------------------------------------------------------------------------------- /terraform/README.md: -------------------------------------------------------------------------------- 1 | # Deployment of the solution guides 2 | 3 | In this directory, you will find all the Terraform code to spawn all the 4 | necessary infrastructure in Google Cloud to deploy each one of the solution 5 | guides. 6 | 7 | Please refer to [the main documentation in this repo for a full list of all 8 | the use cases](../README.md). 9 | 10 | ## Google Cloud security foundations 11 | 12 | The deployments in this directory follow all the recommendations given in the 13 | [Google Cloud Security Foundations](https://cloud.google.com/architecture/security-foundations). 
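Each guide's subdirectory defines its input variables in a `variables.tf` (project ID, region, whether to create a new project, and so on). As a quick orientation before the security-related features listed below, here is a minimal sketch of how one of these deployments is typically applied with the standard Terraform CLI workflow; the directory name (`ml_ai`) and all of the values shown are placeholders you should adapt to the guide you are deploying.

```sh
# Example only: apply one of the solution guide deployments (here, ml_ai)
# with the standard Terraform workflow. Replace the placeholder values with
# your own; the variable names come from that guide's variables.tf.
cd terraform/ml_ai

cat > terraform.tfvars <<EOF
project_id      = "my-dataflow-project"    # placeholder project ID
region          = "us-central1"            # placeholder region
project_create  = true                     # set to false to reuse an existing project
billing_account = "000000-000000-000000"   # assumption: typically only needed when Terraform creates the project
EOF

terraform init
terraform plan
terraform apply
```

Note that `terraform.tfvars` (together with `backend.tf` and the Terraform state files) is already listed in this directory's `.gitignore`, so locally defined values are not committed to the repository.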
14 | 15 | Some of the features of the Terraform deployments in this directory are the following: 16 | * **Identity and Access Management (IAM):** 17 | * All resources are created with the minimum required permissions. 18 | * Service accounts are used for all deployments. 19 | * IAM policies are used to restrict access to resources. 20 | * **Network security:** 21 | * All resources are deployed using private IPs only. 22 | * Firewalls are used to restrict access to resources, including network tags for `ssh`, `http-server` and `https-server` access. 23 | * If the project is created by the Terraform scripts, the default network is removed. 24 | -------------------------------------------------------------------------------- /terraform/anomaly_detection/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "billing_account" { 16 | description = "Billing account for the projects/resources" 17 | type = string 18 | default = null 19 | } 20 | 21 | variable "destroy_all_resources" { 22 | description = "Destroy all resources when calling tf destroy. Use false for production deployments. For test environments, set to true to remove all buckets and bigtable instances." 23 | type = bool 24 | default = true 25 | } 26 | 27 | variable "internet_access" { 28 | description = "Set to true to create a NAT for Dataflow workers to access Internet." 29 | type = bool 30 | default = false 31 | } 32 | 33 | variable "network_prefix" { 34 | description = "Prefix to be used for networks and subnetworks" 35 | type = string 36 | default = "dataflow" 37 | } 38 | 39 | variable "organization" { 40 | description = "Organization for the project/resources" 41 | type = string 42 | default = null 43 | } 44 | 45 | variable "project_create" { 46 | description = "True if you want to create a new project. False to reuse an existing project." 47 | type = bool 48 | } 49 | 50 | variable "project_id" { 51 | description = "Project ID for the project/resources" 52 | type = string 53 | } 54 | 55 | variable "region" { 56 | description = "The region for resources and networking" 57 | type = string 58 | } 59 | 60 | variable "zone" { 61 | description = "The zone for big table. Just a single letter specifying a zone in the region. The default is zone a" 62 | type = string 63 | default = "a" 64 | } 65 | -------------------------------------------------------------------------------- /terraform/cdp/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "billing_account" { 16 | description = "Billing account for the projects/resources" 17 | type = string 18 | default = null 19 | } 20 | 21 | variable "destroy_all_resources" { 22 | description = "Destroy all resources when calling tf destroy. Use false for production deployments. For test environments, set to true to remove all buckets and Spanner instances." 23 | type = bool 24 | default = true 25 | } 26 | 27 | variable "internet_access" { 28 | description = "Set to true to create a NAT for Dataflow workers to access Internet." 29 | type = bool 30 | default = false 31 | } 32 | 33 | variable "network_prefix" { 34 | description = "Prefix to be used for networks and subnetworks" 35 | type = string 36 | default = "dataflow" 37 | } 38 | 39 | variable "organization" { 40 | description = "Organization for the project/resources" 41 | type = string 42 | default = null 43 | } 44 | 45 | variable "project_create" { 46 | description = "True if you want to create a new project. False to reuse an existing project." 47 | type = bool 48 | } 49 | 50 | variable "project_id" { 51 | description = "Project ID for the project/resources" 52 | type = string 53 | } 54 | 55 | variable "region" { 56 | description = "The region for resources and networking" 57 | type = string 58 | } 59 | 60 | variable "bq_dataset" { 61 | description = "The output bq dataset" 62 | type = string 63 | default = "output_dataset" 64 | } 65 | 66 | variable "bq_table" { 67 | description = "The output bq table" 68 | type = string 69 | default = "unified_data" 70 | } 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /terraform/clickstream_analytics/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "billing_account" { 16 | description = "Billing account for the projects/resources" 17 | type = string 18 | default = null 19 | } 20 | 21 | variable "destroy_all_resources" { 22 | description = "Destroy all resources when calling tf destroy. Use false for production deployments. For test environments, set to true to remove all buckets and Spanner instances." 
23 | type = bool 24 | default = true 25 | } 26 | 27 | variable "network_prefix" { 28 | description = "Prefix to be used for networks and subnetworks" 29 | type = string 30 | default = "dataflow" 31 | } 32 | 33 | variable "organization" { 34 | description = "Organization for the project/resources" 35 | type = string 36 | default = null 37 | } 38 | 39 | variable "project_create" { 40 | description = "True if you want to create a new project. False to reuse an existing project." 41 | type = bool 42 | } 43 | 44 | variable "project_id" { 45 | description = "Project ID for the project/resources" 46 | type = string 47 | } 48 | 49 | variable "region" { 50 | description = "The region for resources and networking" 51 | type = string 52 | } 53 | -------------------------------------------------------------------------------- /terraform/etl_integration/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "billing_account" { 16 | description = "Billing account for the projects/resources" 17 | type = string 18 | default = null 19 | } 20 | 21 | variable "destroy_all_resources" { 22 | description = "Destroy all resources when calling tf destroy. Use false for production deployments. For test environments, set to true to remove all buckets and Spanner instances." 23 | type = bool 24 | default = true 25 | } 26 | 27 | variable "internet_access" { 28 | description = "Set to true to create a NAT for Dataflow workers to access Internet." 29 | type = bool 30 | default = false 31 | } 32 | 33 | variable "network_prefix" { 34 | description = "Prefix to be used for networks and subnetworks" 35 | type = string 36 | default = "dataflow" 37 | } 38 | 39 | variable "organization" { 40 | description = "Organization for the project/resources" 41 | type = string 42 | default = null 43 | } 44 | 45 | variable "project_create" { 46 | description = "True if you want to create a new project. False to reuse an existing project." 47 | type = bool 48 | } 49 | 50 | variable "project_id" { 51 | description = "Project ID for the project/resources" 52 | type = string 53 | } 54 | 55 | variable "region" { 56 | description = "The region for resources and networking" 57 | type = string 58 | } 59 | 60 | -------------------------------------------------------------------------------- /terraform/iot_analytics/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "billing_account" { 16 | description = "Billing account for the projects/resources" 17 | type = string 18 | default = null 19 | } 20 | 21 | variable "destroy_all_resources" { 22 | description = "Destroy all resources when calling tf destroy. Use false for production deployments. For test environments, set to true to remove all buckets and Spanner instances." 23 | type = bool 24 | default = true 25 | } 26 | 27 | variable "network_prefix" { 28 | description = "Prefix to be used for networks and subnetworks" 29 | type = string 30 | default = "dataflow" 31 | } 32 | 33 | variable "organization" { 34 | description = "Organization for the project/resources" 35 | type = string 36 | default = null 37 | } 38 | 39 | variable "project_create" { 40 | description = "True if you want to create a new project. False to reuse an existing project." 41 | type = bool 42 | } 43 | 44 | variable "project_id" { 45 | description = "Project ID for the project/resources" 46 | type = string 47 | } 48 | 49 | variable "region" { 50 | description = "The region for resources and networking" 51 | type = string 52 | } 53 | 54 | variable "pubsub_topic" { 55 | description = "Name for your pub sub topic" 56 | type = string 57 | default = "maintenance-data" 58 | } -------------------------------------------------------------------------------- /terraform/log_replication_splunk/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "billing_account" { 16 | description = "Billing account for the projects/resources" 17 | type = string 18 | default = null 19 | } 20 | 21 | variable "destroy_all_resources" { 22 | description = "Destroy all resources when calling tf destroy. Use false for production deployments. For test environments, set to true to remove all buckets and Spanner instances." 23 | type = bool 24 | default = true 25 | } 26 | 27 | variable "internet_access" { 28 | description = "Set to true to create a NAT for Dataflow workers to access Internet." 29 | type = bool 30 | default = false 31 | } 32 | 33 | variable "network_prefix" { 34 | description = "Prefix to be used for networks and subnetworks" 35 | type = string 36 | default = "dataflow" 37 | } 38 | 39 | variable "organization" { 40 | description = "Organization for the project/resources" 41 | type = string 42 | default = null 43 | } 44 | 45 | variable "project_create" { 46 | description = "True if you want to create a new project. 
False to reuse an existing project." 47 | type = bool 48 | } 49 | 50 | variable "project_id" { 51 | description = "Project ID for the project/resources" 52 | type = string 53 | } 54 | 55 | variable "region" { 56 | description = "The region for resources and networking" 57 | type = string 58 | } 59 | 60 | variable "splunk_hec_url" { 61 | description = "The URL for the Splunk HEC endpoint" 62 | type = string 63 | default = "http://some-endpoint:8088" 64 | 65 | } 66 | 67 | variable "splunk_token" { 68 | description = "The token for the Splunk HEC endpoint. It will be stored in Secret Manager" 69 | type = string 70 | default = "WRITE_YOUR_TOKEN_HERE" 71 | } 72 | 73 | -------------------------------------------------------------------------------- /terraform/marketing_intelligence/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "billing_account" { 16 | description = "Billing account for the projects/resources" 17 | type = string 18 | default = null 19 | } 20 | 21 | variable "destroy_all_resources" { 22 | description = "Destroy all resources when calling tf destroy. Use false for production deployments. For test environments, set to true to remove all buckets and bigtable instances." 23 | type = bool 24 | default = true 25 | } 26 | 27 | variable "internet_access" { 28 | description = "Set to true to create a NAT for Dataflow workers to access Internet." 29 | type = bool 30 | default = false 31 | } 32 | 33 | variable "network_prefix" { 34 | description = "Prefix to be used for networks and subnetworks" 35 | type = string 36 | default = "dataflow" 37 | } 38 | 39 | variable "organization" { 40 | description = "Organization for the project/resources" 41 | type = string 42 | default = null 43 | } 44 | 45 | variable "project_create" { 46 | description = "True if you want to create a new project. False to reuse an existing project." 47 | type = bool 48 | } 49 | 50 | variable "project_id" { 51 | description = "Project ID for the project/resources" 52 | type = string 53 | } 54 | 55 | variable "region" { 56 | description = "The region for resources and networking" 57 | type = string 58 | } 59 | 60 | variable "zone" { 61 | description = "The zone for Bigtable. Just a single lower case letter for the zone. Default is a." 62 | type = string 63 | default = "a" 64 | } 65 | -------------------------------------------------------------------------------- /terraform/ml_ai/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "billing_account" { 16 | description = "Billing account for the projects/resources" 17 | type = string 18 | default = null 19 | } 20 | 21 | variable "destroy_all_resources" { 22 | description = "Destroy all resources when calling tf destroy. Use false for production deployments. For test environments, set to true to remove all buckets and Spanner instances." 23 | type = bool 24 | default = true 25 | } 26 | 27 | variable "network_prefix" { 28 | description = "Prefix to be used for networks and subnetworks" 29 | type = string 30 | default = "dataflow" 31 | } 32 | 33 | variable "organization" { 34 | description = "Organization for the project/resources" 35 | type = string 36 | default = null 37 | } 38 | 39 | variable "project_create" { 40 | description = "True if you want to create a new project. False to reuse an existing project." 41 | type = bool 42 | } 43 | 44 | variable "project_id" { 45 | description = "Project ID for the project/resources" 46 | type = string 47 | } 48 | 49 | variable "region" { 50 | description = "The region for resources and networking" 51 | type = string 52 | } 53 | 54 | -------------------------------------------------------------------------------- /use_cases/Anomaly_Detection.md: -------------------------------------------------------------------------------- 1 | # Real-Time Anomaly Detection 2 | 3 | Real-time anomaly detection refers to stream processing workloads that identify abnormal events in-flight and 4 | potentially respond with a relevant measure. Incoming events are analyzed and/or compared against a reference/benchmark 5 | that validates whether a record is irregular or not. Anomaly detection architectures can enhance the security 6 | posture of a company’s infrastructure or mitigate against the threat of malicious actors in a value chain. 7 | Companies are increasingly adding proprietary machine learning models to augment their anomaly detection capabilities. 8 | Low latency is normally a requirement for these kinds of workloads given the inherent nature of these adverse events. 9 | 10 | ## Documentation 11 | 12 | - [One pager: Real-time Anomaly Detection with Dataflow (PDF)](./one_pagers/anomaly_detection_dataflow_onepager.pdf) 13 | - [Real-time Anomaly Detection Solution Guide and Architecture (PDF)](./guides/anomaly_detection_dataflow_guide.pdf) 14 | 15 | ## Assets included in this repository 16 | 17 | - [Terraform code to deploy a project for real-time anomaly detection](../terraform/anomaly_detection/) 18 | - [Sample pipeline in Python for leveraging the Gemma open LLM with Dataflow](../pipelines/anomaly_detection/) 19 | 20 | ## Technical benefits 21 | 22 | Dataflow is the best platform for building real-time 23 | applications. Several unique capabilities make Dataflow the leading choice: 24 | 25 | - **Integrated ML**: Combine your intelligence with your streaming pipeline using Dataflow ML. 26 | RunInference helps you seamlessly call models hosted on Vertex AI from your Dataflow 27 | pipelines without the overhead of maintaining ML infrastructure.
Dataflow ML comes with 28 | the combined benefit of decoupling your prediction loop from your main application, thus 29 | eliminating the risk that pipeline stuckness can bring down your application. 30 | - **Low latency**: Dataflow’s at-least-once delivery mode can help your pipeline achieve sub-second 31 | processing latencies, crucial to responding to threats as quickly as possible. 32 | 33 | - **Integrated alerting**: Dataflow’s suite of observability tools enhances your ability to identify 34 | and respond to anomalous events. Create an alert from a Dataflow monitoring dashboard in just a few clicks. 35 | 36 | - **Advanced stream processing**: Apache Beam’s state and timer APIs enable data engineers to manipulate and 37 | analyze state in-flight. These primitives give you maximum flexibility to express the business logic 38 | that your application requires. 39 | 40 | - **Scalable infrastructure**: Pipelines scale up and down to meet your resourcing requirements. 41 | Powered by battle-tested backends in Shuffle & Streaming Engine, Dataflow can support pipelines 42 | of virtually any size, with minimal tuning needed. 43 | -------------------------------------------------------------------------------- /use_cases/CDP.md: -------------------------------------------------------------------------------- 1 | # Customer Data Platform 2 | 3 | At its core, a real-time CDP is a sophisticated software solution designed to unify customer data from various sources, providing a single, comprehensive view of each individual customer. The "real-time" element is crucial: it emphasizes the ability to collect, process, and analyze customer data as events occur, enabling businesses to respond instantly to changing customer behaviors and preferences. 4 | Real-time Customer Data Platforms represent a powerful tool for businesses seeking to create more personalized, engaging, and effective customer experiences. By centralizing customer data and enabling real-time analysis, CDPs unlock a new level of customer understanding and responsiveness, leading to better marketing outcomes and stronger customer relationships. 5 | 6 | ## Documentation 7 | 8 | - [Real-time Customer Data Platform Solution Guide and Architecture (PDF)](./guides/cdp_dataflow_guide.pdf) 9 | 10 | ## Assets included in this repository 11 | 12 | - [Terraform code to deploy a project for Customer Data Platform](../terraform/cdp/) 13 | - [Sample pipelines in Python for Customer Data Platform](../pipelines/cdp/) 14 | 15 | ## Technical benefits 16 | 17 | Dataflow provides enormous advantages as a platform for your Customer Data Platform use 18 | cases: 19 | 20 | - **Real-Time Data Ingestion and Processing**: Dataflow enables the seamless and efficient movement of customer data from various sources into the CDP in real-time. This ensures that the CDP is always working with the most up-to-date information, allowing for timely insights and actions. 21 | 22 | - **Enhanced Data Transformation and Enrichment**: Dataflow pipelines can perform complex transformations on incoming data, ensuring it is clean, standardized, and formatted correctly for the CDP. 23 | Additionally, Dataflow can enrich customer data with additional context or attributes from external sources, leading to more complete and valuable customer profiles. 24 | 25 | - **Scalability and Flexibility**: Dataflow solutions are designed to handle large volumes of data and can scale effortlessly to accommodate growing data needs.
They offer flexibility in terms of data sources, processing logic, and output destinations, making them adaptable to evolving business requirements. 26 | 27 | - **Automation and Efficiency**: Dataflow pipelines can automate data ingestion, transformation, and delivery processes, reducing manual effort and minimizing errors. This streamlines data management, freeing up resources for more strategic tasks. 28 | 29 | - **Improved Data Quality and Governance**: Dataflow enables data validation and cleansing during the ingestion process, ensuring data accuracy and consistency. Data lineage and audit capabilities within dataflow tools help track data transformations and maintain data governance standards. 30 | 31 | - **Actionable Insights and Personalization**: By feeding clean and enriched data into the CDP in real-time, dataflow enables the CDP to generate more accurate and timely insights. These insights can be used to trigger personalized marketing campaigns, recommendations, and customer interactions, leading to improved engagement and conversions. 32 | 33 | - **Omnichannel Customer Experiences**: Dataflow supports the seamless integration of customer data across various touchpoints and channels. This allows the CDP to orchestrate consistent and personalized customer experiences across the entire customer journey. 34 | -------------------------------------------------------------------------------- /use_cases/Clickstream_Analytics.md: -------------------------------------------------------------------------------- 1 | # Clickstream analytics 2 | 3 | In the fast-paced digital landscape, understanding user behavior is crucial for optimizing websites, apps, 4 | and marketing campaigns. Clickstream analytics provides a continuous stream of data on how users interact 5 | with digital platforms. But to truly capitalize on this information, businesses need insights in real time, 6 | not days later. 7 | 8 | For the full version of this solution guide, please refer to: 9 | 10 | ## Documentation 11 | 12 | - [One pager: Clickstream analytics in real-time with Dataflow (PDF)](./one_pagers/clickstream_dataflow_onepager.pdf) 13 | 14 | ## Assets included in this repository 15 | 16 | - [Terraform code to deploy a project for Clickstream Analytics](../terraform/clickstream_analytics/) 17 | - [Sample pipeline in Java for clickstream analytics with Dataflow](../pipelines/clickstream_analytics_java/) 18 | 19 | ## Technical benefits 20 | 21 | Dataflow provides a robust platform for building and scaling real-time clickstream analytics solutions. 22 | Key capabilities make it the ideal choice for extracting maximum value from user interaction data: 23 | 24 | - Streamlined Clickstream Processing: Dataflow's Apache Beam SDK simplifies the development of complex 25 | clickstream pipelines. Pre-built transforms, state management, and windowing functions make it easy to 26 | aggregate, filter, and enrich clickstream events in real time. 27 | - Clickstream Enrichment: Enrich raw clickstream data with external data sources (e.g., user demographics, 28 | product catalogs) to gain deeper insights into user behavior and preferences. Side inputs and joins in 29 | Dataflow enable seamless data enrichment within your pipelines. 30 | - Real-Time Dashboards and Alerts: Integrate Dataflow with real-time visualization tools and alerting systems 31 | to monitor clickstream metrics, detect anomalies, and trigger actions based on user interactions. 
Dataflow’s 32 | low-latency processing ensures that insights are delivered within seconds. 33 | - Scalability and Cost Efficiency: Dataflow automatically scales to handle fluctuating clickstream volumes. 34 | Pay only for the resources you use, avoiding overprovisioning and unnecessary costs. Right-fitting capabilities 35 | allow you to allocate resources optimally across different pipeline stages. 36 | - Flexible Deployment: Deploy clickstream pipelines on various infrastructure options, including VMs and serverless 37 | options like Cloud Run or Cloud Functions. This flexibility allows you to tailor your deployment to your specific 38 | needs and budget. 39 | - Open-Source Ecosystem: Leverage the power of the Apache Beam ecosystem, including a vast library of I/O 40 | connectors for various data sources and sinks. Dataflow’s compatibility with open-source tools ensures flexibility 41 | and avoids vendor lock-in. 42 | -------------------------------------------------------------------------------- /use_cases/ETL_integration.md: -------------------------------------------------------------------------------- 1 | # ETL / Integration 2 | 3 | Real-time extract-transform-load (ETL) & integration describes systems that process & write 4 | data as soon as it becomes available. This allows for near-instant analysis and decision-making 5 | based on the most up-to-date information. ETL patterns refer to the continuous processing of data, 6 | while integration broadly refers to writing the results of these pipelines to various systems (e.g. 7 | data warehouses, transactional databases, messaging queues). Adopting real-time ETL & integration 8 | architectures is generally regarded as an essential part of modernizing your data systems, and 9 | confers a number of competitive advantages to the company adopting them. 10 | 11 | For the full version of this solution guide, please refer to: 12 | * https://solutions.cloud.google.com/app/solutions/dataflow-real-time-etl-integration 13 | 14 | ## Documentation 15 | 16 | * [One pager: ETL & reverse ETL in real-time with Dataflow (PDF)](./one_pagers/etl_dataflow_onepager.pdf) 17 | * [ETL & reverse ETL Solution Guide and Architecture (PDF)](./guides/etl_dataflow_guide.pdf) 18 | 19 | ## Assets included in this repository 20 | 21 | * [Terraform code to deploy a project for ETL integration](../terraform/etl_integration/) 22 | * [Sample pipelines in Java for ETL / Integration](../pipelines/etl_integration_java/) 23 | 24 | ## Technical benefits 25 | 26 | Dataflow provides enormous advantages as a platform for your real-time ETL and integration use 27 | cases: 28 | 29 | * **Resource efficiency**: Increased resource efficiency with horizontal & vertical autoscaling 30 | * **Unified batch & streaming**: Dataflow’s underlying SDK, Apache Beam, allows developers to 31 | express 32 | batch & streaming pipelines with the same SDK, with minor modifications required to turn a batch 33 | pipeline into a streaming one. This simplifies the traditionally accepted practice of maintaining 34 | two separate systems for batch & stream processing. 35 | * **Limitless scalability**: Dataflow offers two service backends for batch and streaming called 36 | Shuffle 37 | and Streaming Engine, respectively.
These backends have scaled 38 | -------------------------------------------------------------------------------- /use_cases/GenAI_ML.md: -------------------------------------------------------------------------------- 1 | # GenAI & machine learning inference 2 | 3 | Machine learning (ML) and artificial intelligence (AI) empower businesses to respond to evolving 4 | market conditions and tailor their offerings to users and customers. However, decision cycles 5 | involving AI and ML can span days or even weeks, particularly when dealing with larger models 6 | (model retraining, large inference batch pipelines, etc.). This solution guide introduces an 7 | architecture designed for real-time predictions, guaranteeing low latency outcomes with both custom 8 | and third-party models. Leveraging the capabilities of graphics processing units (GPUs), 9 | the proposed architecture effectively reduces prediction times to seconds. 10 | 11 | ## Documentation 12 | 13 | - [One pager: GenAI & ML inference in real-time with Dataflow (PDF)](./one_pagers/genai_ml_dataflow_onepager.pdf) 14 | - [Gen AI & ML inference Solution Guide and Architecture (PDF)](./guides/genai_ml_dataflow_guide.pdf) 15 | 16 | For the full documentation of this solution guide, please refer to: 17 | 18 | - https://solutions.cloud.google.com/app/solutions/data-flow-real-time-ml-and-genai 19 | 20 | ## Assets included in this repository 21 | 22 | - [Terraform code to deploy a project for GenAI & ML inference](../terraform/ml_ai/) 23 | - [Sample pipeline in Python for leveraging the Gemma open LLM with Dataflow](../pipelines/ml_ai_python/) 24 | 25 | ## Technical benefits 26 | 27 | Dataflow is the best platform for building real-time ML & generative AI 28 | applications. Several unique capabilities make Dataflow the leading choice: 29 | 30 | - **Developer ease of use with turnkey transforms:** Author complex ML 31 | pipelines using utility transforms that can reduce lines of code by orders of magnitude 32 | - [MLTransform](https://cloud.google.com/dataflow/docs/machine-learning/ml-preprocess-data) 33 | helps you prepare your data for training machine learning models without 34 | writing complex code or managing underlying libraries. ML Transforms can 35 | generate embeddings that can push data into vector databases to run 36 | inference. 37 | - [RunInference](https://beam.apache.org/documentation/ml/about-ml/#use-runinference) 38 | lets you efficiently use ML models in your pipelines, and contains a 39 | number of different optimizations underneath the hood that make this an 40 | essential part of any streaming AI pipeline 41 | - **Advanced stream processing**: Customers can implement advanced streaming 42 | architectures using the open-source 43 | [Apache Beam SDK](https://beam.apache.org/get-started/), which provides a rich 44 | set of capabilities, including state & timer APIs, transformations, side 45 | inputs, enrichment, and a broad list of I/O connectors. 46 | - **Notebooks integration**: Develop your streaming AI pipeline in a 47 | notebook environment, which allows for interactive development and 48 | sampling unbounded data sources. 49 | - **Cost efficiency**: Run pipelines without wasting precious resources & 50 | incurring cost overruns.
51 | - [GPU support](https://cloud.google.com/dataflow/docs/gpu/gpu-support): 52 | Accelerate your processing with GPUs, which can return results faster 53 | for your most computationally demanding pipelines. 54 | - [Right-fitting](https://cloud.google.com/dataflow/docs/guides/right-fitting): 55 | Deploy pipelines on heterogeneous worker pools. Right-fitting allows you 56 | to allocate additional resources to individual stages in your pipeline, 57 | which prevents wasteful utilization for stages that don’t require the 58 | same compute. 59 | - **Open-source compatibility**: Dataflow has support for 60 | [running inference with Gemma](https://cloud.google.com/dataflow/docs/machine-learning/gemma) 61 | as well as a strong integration with 62 | [Tensorflow Extended](https://www.tensorflow.org/tfx). 63 | Customers should feel comfortable that these pipelines can be ported to 64 | any other execution engine with Apache Beam support. 65 | -------------------------------------------------------------------------------- /use_cases/IoT_Analytics.md: -------------------------------------------------------------------------------- 1 | # IoT Analytics 2 | 3 | Organizations employ Internet of Things (IoT) sensors to monitor their production lines in real-time. These sensors gather critical data on various metrics essential for the manufacturing processes, and that data can also be utilized for analytical purposes. However, this data is operational in nature and, currently, it is not utilized for analytics. With data warehouses, companies have leveraged low-granularity operational data for analytical purposes, enabling them to make more informed decisions utilizing large volumes of data. The use case described herein demonstrates how to replicate the same pattern (analytics on large volumes of low-granularity operational data) but with a crucial additional advantage: low latency. The value of data, and consequently of the decisions made based on that data, diminishes over time. Real-time analytics significantly enhance the value of such decisions. 4 | 5 | ## Documentation 6 | 7 | - [One pager: IoT analytics in real-time with Dataflow (PDF)](./one_pagers/iot_analytics_dataflowonepager.pdf) 8 | - [IoT Analytics Solution Guide & Architecture (PDF)](./guides/iot_analytics_dataflow_guide.pdf) 9 | 10 | ## Assets included in this repository 11 | - Terraform code to deploy a project for IoT Analytics (WORK IN PROGRESS) 12 | - Sample pipeline in Python for deploying IoT analytics (WORK IN PROGRESS) 13 | 14 | ## Technical benefits 15 | - **Serverless experience:** Data volume can vary widely from connected devices and IoT appliances, which introduces significant overhead when managing infrastructure. Dataflow obviates that need entirely. Dataflow’s service layer goes beyond auto-provisioning. Features like dynamic work rebalancing, autoscaling, and service backends like Streaming Engine are built to handle your workload at any scale without needing user intervention. 16 | - **Streaming AI & ML:** Dataflow’s suite of ML capabilities enables you to evolve your batch ML systems to streaming ML, enabling a world of real-time features and real-time predictions. Apache Beam and Dataflow include several capabilities that simplify the end-to-end machine learning lifecycle. We make data processing for AI easier with ML Transform. Implement RunInference to run predictions with your model of choice, whether it be scikit-learn, PyTorch, VertexAI, or Gemma.
Dataflow’s integration with Vertex AI alleviates the need to manage complex computing requirements for your machine learning use cases. 17 | - **Extensible connector framework:** Apache Beam provides more than 60 out-of-the-box connectors that support the majority of your I/O needs, including support for popular messaging platforms like Kafka and Pub/Sub and messaging brokers like JMS and MQTT. If your desired input is not supported, Beam also offers a flexible framework that allows you to build a connector for your own source systems. 18 | - **Open & portable:** For IoT use cases, it is a common requirement to process data in both on-device and multi-cloud environments. Beam gives you the flexibility to run your business logic in the environment of your choice. Execution engines include the Direct Runner (for local execution), Spark and Flink (for your own self-managed & multi-cloud computing environments), and Dataflow (the preferred execution engine for Google Cloud). 19 | -------------------------------------------------------------------------------- /use_cases/Log_replication.md: -------------------------------------------------------------------------------- 1 | # Log replication & analytics 2 | 3 | Google Cloud produces all kinds of logs that are automatically sent to Cloud 4 | Logging. However, in some situations, you may want to use a third party such 5 | as Splunk for log processing and analytics. This solution presents an 6 | architecture to replicate logs from Cloud Logging to a third-party service, 7 | using Dataflow. The solution ensures that all changes done in the upstream 8 | databases are promptly replicated in the destination analytics replica, 9 | with minimal delay (in the order of single digit seconds). 10 | 11 | ## Documentation 12 | 13 | - [One pager: Log replication and analytics in real-time with Dataflow (PDF)](./one_pagers/log_replication_dataflow_onepager.pdf) 14 | - [Log replication and analytics Solution Guide and Architecture (PDF)](./guides/log_replication_dataflow_guide.pdf) 15 | 16 | ## Assets included in this repository 17 | 18 | - [Terraform code to deploy a project for log replication into Splunk](../terraform/log_replication_splunk/) 19 | - [Use Google-provided templates to run a job to replicate to Splunk](../pipelines/log_replication_splunk/) 20 | 21 | ## Technical benefits 22 | 23 | - **Serverless experience**: Data volume can vary widely from logging 24 | applications or transactional databases. Dataflow obviates that need 25 | entirely. Dataflow’s service layer goes beyond auto-provisioning. Features 26 | like dynamic work rebalancing, autoscaling, and service backends like 27 | Streaming Engine are built to handle your workload at any scale without 28 | needing user intervention. 29 | - **Easy operations**: Dataflow offers several features that help 30 | organizations ensure the uptime of their pipelines. Snapshots preserve the 31 | state of your pipeline for high availability / disaster recovery scenarios, 32 | while in-place streaming updates can seamlessly migrate your pipeline to a 33 | new version without any data loss or downtime. 34 | - **Google-provided Templates**: Google provides Dataflow templates that make 35 | deployment as easy as filling out a web form. Send logs to Splunk, 36 | Elasticsearch, or Datadog with our partner-provided templates. 37 | - **Low latency**: Dataflow’s at-least-once delivery mode can help your 38 | pipeline achieve sub-second processing latencies, essential for your 39 | mission-critical logging applications.
40 | - **Monitoring tools**: In-line logging, job visualizers, monitoring charts, 41 | integrated error reporting and smart insights help you optimize the 42 | performance of your pipeline, and can catch any stuckness or slowness 43 | issues before they turn into outages. 44 | -------------------------------------------------------------------------------- /use_cases/Marketing_Intelligence.md: -------------------------------------------------------------------------------- 1 | # Market Intelligence inference 2 | 3 | Real-time marketing intelligence describes the practice of collecting and analyzing data about your market, customers, and competitors as it happens. This enables you to make informed, agile decisions and respond swiftly to emerging trends, customer behaviors, and competitive moves. The advent of data-driven marketing has transformed the way companies approach their marketing activities, and real-time marketing intelligence requires these companies to accelerate their response times to marketing moments. This reference architecture describes how you can combine data from your various marketing data sources, common patterns for analyzing that data, and how to integrate it with your data warehouse for faster analysis and with databases for faster responses. 4 | 5 | ## Documentation 6 | 7 | - [One pager: Marketing intelligence in real-time with Dataflow (PDF)](./one_pagers/market_intel_dataflow_onepager.pdf) 8 | - [Marketing Intelligence Solution Guide and Architecture (PDF)](./guides/market_intel_dataflow_guide.pdf) 9 | 10 | ## Assets included in this repository 11 | 12 | - [Terraform code to deploy a project for Market Intelligence inference](../terraform/marketing_intelligence/) 13 | - [Sample pipeline in Python for leveraging the Gemma open LLM with Dataflow](../pipelines/marketing_intelligence/) 14 | 15 | ## Technical benefits 16 | 17 | Dataflow is the best platform for building real-time ML & generative AI 18 | applications. Several unique capabilities make Dataflow the leading choice: 19 | 20 | - **Developer ease of use with turnkey transforms:** Author complex ML 21 | pipelines using utility transforms that can reduce lines of code by orders of magnitude 22 | - [MLTransform](https://cloud.google.com/dataflow/docs/machine-learning/ml-preprocess-data) 23 | helps you prepare your data for training machine learning models without 24 | writing complex code or managing underlying libraries. ML Transforms can 25 | generate embeddings that can push data into vector databases to run 26 | inference. 27 | - [RunInference](https://beam.apache.org/documentation/ml/about-ml/#use-runinference) 28 | lets you efficiently use ML models in your pipelines, and contains a 29 | number of different optimizations underneath the hood that make this an 30 | essential part of any streaming AI pipeline 31 | - **Advanced stream processing**: Customers can implement advanced streaming 32 | architectures using the open-source 33 | [Apache Beam SDK](https://beam.apache.org/get-started/), which provides a rich 34 | set of capabilities, including state & timer APIs, transformations, side 35 | inputs, enrichment, and a broad list of I/O connectors. 36 | - **Notebooks integration**: Develop your streaming AI pipeline in a 37 | notebook environment, which allows for interactive development and 38 | sampling unbounded data sources. 39 | - **Cost efficiency**: Run pipelines without wasting precious resources & 40 | incurring cost overruns.
41 | - [GPU support](https://cloud.google.com/dataflow/docs/gpu/gpu-support): 42 | Accelerate your processing with GPUs, which can return results faster 43 | for your most computationally demanding pipelines. 44 | - [Right-fitting](https://cloud.google.com/dataflow/docs/guides/right-fitting): 45 | Deploy pipelines on heterogeneous worker pools. Right-fitting allows you 46 | to allocate additional resources to individual stages in your pipeline, 47 | which prevents wasteful utilization for stages that don’t require the 48 | same compute. 49 | - **Open-source compatibility**: Dataflow has support for 50 | [running inference with Gemma](https://cloud.google.com/dataflow/docs/machine-learning/gemma) 51 | as well as a strong integration with 52 | [Tensorflow Extended](https://www.tensorflow.org/tfx). 53 | Customers can be confident that these pipelines can be ported to 54 | any other execution engine with Apache Beam support. 55 |
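As a concrete illustration of the RunInference pattern referenced above, the following is a minimal sketch of a streaming inference pipeline in Python. It is illustrative only: the Pub/Sub subscription, BigQuery table, model URI, parsing logic, and the scikit-learn model handler are placeholder assumptions, and the actual Gemma-based implementation lives in [`pipelines/marketing_intelligence`](../pipelines/marketing_intelligence/).

```python
import json

import apache_beam as beam
import numpy as np
from apache_beam.ml.inference.base import RunInference
from apache_beam.ml.inference.sklearn_inference import SklearnModelHandlerNumpy
from apache_beam.options.pipeline_options import PipelineOptions


def to_feature_vector(message: bytes) -> np.ndarray:
    # Placeholder parsing logic: expects JSON payloads like {"features": [...]}.
    event = json.loads(message.decode("utf-8"))
    return np.array(event["features"], dtype=np.float32)


def to_bq_row(result) -> dict:
    # RunInference emits PredictionResult elements with .example and .inference.
    return {
        "features": json.dumps(result.example.tolist()),
        "prediction": float(result.inference),
    }


def run():
    # Placeholder model: any pickled scikit-learn model stored in GCS.
    model_handler = SklearnModelHandlerNumpy(
        model_uri="gs://YOUR_BUCKET/models/model.pkl")

    options = PipelineOptions(streaming=True)  # add your Dataflow options here
    with beam.Pipeline(options=options) as p:
        (
            p
            | "ReadEvents" >> beam.io.ReadFromPubSub(
                subscription="projects/YOUR_PROJECT/subscriptions/YOUR_SUB")
            | "ParseFeatures" >> beam.Map(to_feature_vector)
            | "RunInference" >> RunInference(model_handler)
            | "ToRow" >> beam.Map(to_bq_row)
            | "WriteToBigQuery" >> beam.io.WriteToBigQuery(
                "YOUR_PROJECT:marketing.predictions",
                schema="features:STRING,prediction:FLOAT",
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
        )


if __name__ == "__main__":
    run()
```

Swapping the model is largely a matter of changing the model handler; recent Apache Beam releases also ship handlers for PyTorch, TensorFlow, Hugging Face, and Vertex AI endpoints.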
-------------------------------------------------------------------------------- /use_cases/guides/ads_analytics_dataflow_guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/ads_analytics_dataflow_guide.pdf -------------------------------------------------------------------------------- /use_cases/guides/anomaly_detection_dataflow_guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/anomaly_detection_dataflow_guide.pdf -------------------------------------------------------------------------------- /use_cases/guides/cdp_dataflow_guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/cdp_dataflow_guide.pdf -------------------------------------------------------------------------------- /use_cases/guides/clickstream_analytics_dataflow_guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/clickstream_analytics_dataflow_guide.pdf -------------------------------------------------------------------------------- /use_cases/guides/etl_dataflow_guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/etl_dataflow_guide.pdf -------------------------------------------------------------------------------- /use_cases/guides/gaming_analytics_dataflow_guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/gaming_analytics_dataflow_guide.pdf -------------------------------------------------------------------------------- /use_cases/guides/genai_ml_dataflow_guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/genai_ml_dataflow_guide.pdf -------------------------------------------------------------------------------- /use_cases/guides/iot_analytics_dataflow_guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/iot_analytics_dataflow_guide.pdf -------------------------------------------------------------------------------- /use_cases/guides/log_replication_dataflow_guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/log_replication_dataflow_guide.pdf -------------------------------------------------------------------------------- /use_cases/guides/market_intel_dataflow_guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/guides/market_intel_dataflow_guide.pdf -------------------------------------------------------------------------------- /use_cases/one_pagers/anomaly_detection_dataflow_onepager.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/one_pagers/anomaly_detection_dataflow_onepager.pdf -------------------------------------------------------------------------------- /use_cases/one_pagers/clickstream_dataflow_onepager.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/one_pagers/clickstream_dataflow_onepager.pdf -------------------------------------------------------------------------------- /use_cases/one_pagers/etl_dataflow_onepager.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/one_pagers/etl_dataflow_onepager.pdf -------------------------------------------------------------------------------- /use_cases/one_pagers/genai_ml_dataflow_onepager.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/one_pagers/genai_ml_dataflow_onepager.pdf -------------------------------------------------------------------------------- /use_cases/one_pagers/iot_analytics_dataflowonepager.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/one_pagers/iot_analytics_dataflowonepager.pdf -------------------------------------------------------------------------------- /use_cases/one_pagers/log_replication_dataflow_onepager.pdf: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/one_pagers/log_replication_dataflow_onepager.pdf -------------------------------------------------------------------------------- /use_cases/one_pagers/market_intel_dataflow_onepager.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/dataflow-solution-guides/71b52852fadaaec1952f2b2f1d3a6c8fdf1908ce/use_cases/one_pagers/market_intel_dataflow_onepager.pdf --------------------------------------------------------------------------------