├── README.md ├── example └── 1.terraform-automation │ ├── output.tf │ ├── terraform.tfvars │ ├── pubsub.tf │ ├── network.tf │ ├── gcs.tf │ ├── iam.tf │ ├── main.tf │ ├── variables.tf │ ├── composer.tf │ ├── csr-cloudbuildtrigger.tf │ └── README.md ├── env-setup ├── composer_variables.template ├── set_composer_variables.sh ├── set_env.sh └── create_buckets.sh ├── source-code ├── build-pipeline │ ├── wait_for_dag_deployed.sh │ ├── deploy_prod.yaml │ └── build_deploy_test.yaml ├── workflow-dag │ ├── support-files │ │ ├── input.txt │ │ └── ref.txt │ ├── data-pipeline-prod.py │ ├── compare_xcom_maps.py │ ├── test_compare_xcom_maps.py │ └── data-pipeline-test.py └── data-processing-code │ ├── src │ ├── test │ │ └── java │ │ │ └── org │ │ │ └── apache │ │ │ └── beam │ │ │ └── examples │ │ │ └── WordCountTest.java │ └── main │ │ └── java │ │ └── org │ │ └── apache │ │ └── beam │ │ └── examples │ │ └── WordCount.java │ └── pom.xml ├── CONTRIBUTING.md └── LICENSE /README.md: -------------------------------------------------------------------------------- 1 | # CI/CD for data processing workflow 2 | This repository contains source code for the guide on how to use Cloud Build and Cloud Composer to create a CI/CD pipeline for building, deployment and testing of a data processing workflow. 3 | 4 | Please refer to the solution guide for the steps to run the code: [solution 5 | tutorial](https://cloud.google.com/solutions/cicd-pipeline-for-data-processing) 6 | -------------------------------------------------------------------------------- /example/1.terraform-automation/output.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | output "composer" { 16 | value = module.composer 17 | description = "Information about the cloud composer resource which is created" 18 | } 19 | 20 | -------------------------------------------------------------------------------- /example/1.terraform-automation/terraform.tfvars: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | //Project must already exists 16 | project_id = "" 17 | 18 | //Network and Subnetwork will be created 19 | network = "composer-network" 20 | subnetwork = "development" 21 | -------------------------------------------------------------------------------- /env-setup/composer_variables.template: -------------------------------------------------------------------------------- 1 | { 2 | "gcp_project": "${GCP_PROJECT_ID}", 3 | "gcp_region": "${COMPOSER_REGION}", 4 | "gcp_zone": "${COMPOSER_ZONE_ID}", 5 | "dataflow_jar_location_test": "${DATAFLOW_JAR_BUCKET_TEST}", 6 | "dataflow_jar_file_test": "to_be_overriden", 7 | "gcs_input_bucket_test": "${INPUT_BUCKET_TEST}", 8 | "gcs_ref_bucket_test": "${REF_BUCKET_TEST}", 9 | "gcs_output_bucket_test": "${RESULT_BUCKET_TEST}", 10 | "dataflow_staging_bucket_test": "${DATAFLOW_STAGING_BUCKET_TEST}", 11 | "pubsub_topic": "${PUBSUB_TOPIC}", 12 | "dataflow_jar_location_prod": "${DATAFLOW_JAR_BUCKET_PROD}", 13 | "dataflow_jar_file_prod": "to_be_overriden", 14 | "gcs_input_bucket_prod": "${INPUT_BUCKET_PROD}", 15 | "gcs_output_bucket_prod": "${RESULT_BUCKET_PROD}", 16 | "dataflow_staging_bucket_prod": "${DATAFLOW_STAGING_BUCKET_PROD}" 17 | } 18 | -------------------------------------------------------------------------------- /example/1.terraform-automation/pubsub.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | module "pubsub" { 16 | source = "terraform-google-modules/pubsub/google" 17 | version = "~> 1.8" 18 | topic = var.pubsub_topic 19 | project_id = var.project_id 20 | depends_on = [ 21 | module.project-services, 22 | ] 23 | } 24 | -------------------------------------------------------------------------------- /source-code/build-pipeline/wait_for_dag_deployed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Script that waits for the specified Cloud Composer DAG to deploy. 4 | # 5 | # Copyright 2019 Google Inc. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # https://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | n=0 20 | until [[ $n -ge $4 ]] 21 | do 22 | status=0 23 | gcloud composer environments run "${1}" --location "${2}" dags list \ 24 | 2>&1 | grep "${3}" && break 25 | status=$? 26 | n=$(($n+1)) 27 | sleep "${5}" 28 | done 29 | exit $status 30 | -------------------------------------------------------------------------------- /example/1.terraform-automation/network.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | module "vpc" { 16 | source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/net-vpc?ref=v18.0.0/" 17 | project_id = var.project_id 18 | name = var.network 19 | subnets = [ 20 | { 21 | ip_cidr_range = "10.0.0.0/24" 22 | name = var.subnetwork 23 | region = var.region 24 | secondary_ip_range = { 25 | pods = "10.57.0.0/17", 26 | services = "10.57.128.0/22", 27 | } 28 | }, 29 | ] 30 | } 31 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. 
Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows [Google's Open Source Community 28 | Guidelines](https://opensource.google.com/conduct/). 29 | -------------------------------------------------------------------------------- /env-setup/set_composer_variables.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script sets the variables in Composer. The variables are needed for the 4 | # data processing DAGs to properly execute, such as project-id, GCP region and 5 | #zone. It also sets Cloud Storage buckets where test files are stored. 6 | # 7 | # Copyright 2019 Google Inc. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # https://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | 21 | COMPOSER_VAR_FILE=composer_variables.json 22 | if [ ! -f "${COMPOSER_VAR_FILE}" ]; then 23 | echo "Generate composer variable file ${COMPOSER_VAR_FILE}." 
24 | envsubst < composer_variables.template > ${COMPOSER_VAR_FILE} 25 | fi 26 | 27 | gcloud composer environments storage data import \ 28 | --environment ${COMPOSER_ENV_NAME} \ 29 | --location ${COMPOSER_REGION} \ 30 | --source ${COMPOSER_VAR_FILE} 31 | 32 | gcloud composer environments run \ 33 | ${COMPOSER_ENV_NAME} \ 34 | --location ${COMPOSER_REGION} \ 35 | variables import -- /home/airflow/gcs/data/${COMPOSER_VAR_FILE} 36 | -------------------------------------------------------------------------------- /example/1.terraform-automation/gcs.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | module "gcs_buckets_test" { 16 | source = "terraform-google-modules/cloud-storage/google" 17 | project_id = var.project_id 18 | names = [local.dataflow_jar_bucket_test, 19 | local.input_bucket_test, 20 | local.result_bucket_test, 21 | local.ref_bucket_test, 22 | local.dataflow_staging_bucket_test 23 | ] 24 | prefix = "" 25 | set_admin_roles = true 26 | admins = ["${local.composer_service_account}"] 27 | versioning = { 28 | first = true 29 | } 30 | } 31 | 32 | module "gcs_buckets_prod" { 33 | source = "terraform-google-modules/cloud-storage/google" 34 | project_id = var.project_id 35 | names = [local.dataflow_jar_bucket_prod, 36 | local.input_bucket_prod, 37 | local.result_bucket_prod, 38 | local.dataflow_staging_bucket_prod 39 | ] 40 | prefix = "" 41 | set_admin_roles = true 42 | admins = ["${local.composer_service_account}"] 43 | versioning = { 44 | first = true 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /source-code/workflow-dag/support-files/input.txt: -------------------------------------------------------------------------------- 1 | To be, or not to be, that is the question: 2 | Whether 'tis nobler in the mind to suffer 3 | The slings and arrows of outrageous fortune, 4 | Or to take Arms against a Sea of troubles, 5 | And by opposing end them: to die, to sleep 6 | No more; and by a sleep, to say we end 7 | The heart-ache, and the thousand natural shocks 8 | That Flesh is heir to? 'Tis a consummation 9 | Devoutly to be wished. To die, to sleep, 10 | To sleep, perchance to Dream; aye, there's the rub, 11 | For in that sleep of death, what dreams may come, 12 | When we have shuffled off this mortal coil, 13 | Must give us pause. 
There's the respect 14 | That makes Calamity of so long life: 15 | For who would bear the Whips and Scorns of time, 16 | The Oppressor's wrong, the proud man's Contumely, 17 | The pangs of despised Love, the Law’s delay, 18 | The insolence of Office, and the spurns 19 | That patient merit of the unworthy takes, 20 | When he himself might his Quietus make 21 | With a bare Bodkin? Who would Fardels bear, 22 | To grunt and sweat under a weary life, 23 | But that the dread of something after death, 24 | The undiscovered country, from whose bourn 25 | No traveller returns, puzzles the will, 26 | And makes us rather bear those ills we have, 27 | Than fly to others that we know not of. 28 | Thus conscience does make cowards of us all, 29 | And thus the native hue of Resolution 30 | Is sicklied o'er, with the pale cast of Thought, 31 | And enterprises of great pitch and moment, 32 | With this regard their Currents turn awry, 33 | And lose the name of Action. Soft you now, 34 | The fair Ophelia? Nymph, in thy Orisons 35 | Be all my sins remember'd. 36 | -------------------------------------------------------------------------------- /example/1.terraform-automation/iam.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | module "composer-service-accounts" { 16 | source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/iam-service-account?ref=v18.0.0/" 17 | project_id = var.project_id 18 | name = "composer-default" 19 | generate_key = false 20 | # authoritative roles granted *on* the service accounts to other identities 21 | iam = { 22 | } 23 | # non-authoritative roles granted *to* the service accounts on other resources 24 | iam_project_roles = { 25 | "${var.project_id}" = [ 26 | "roles/logging.logWriter", 27 | "roles/monitoring.metricWriter", 28 | "roles/composer.ServiceAgentV2Ext", 29 | "roles/composer.worker", 30 | "roles/composer.admin", 31 | "roles/dataflow.admin", 32 | "roles/iam.serviceAccountUser", 33 | "roles/compute.networkUser", 34 | ] 35 | } 36 | } 37 | 38 | resource "google_project_iam_member" "cloudbuild_sa" { 39 | for_each = toset(["roles/composer.admin", "roles/composer.worker"]) 40 | project = var.project_id 41 | role = each.key 42 | member = "serviceAccount:${local.project_number}@cloudbuild.gserviceaccount.com" 43 | } 44 | -------------------------------------------------------------------------------- /source-code/build-pipeline/deploy_prod.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | steps: 15 | - name: gcr.io/cloud-builders/gsutil 16 | args: ['cp', 'gs://${_DATAFLOW_JAR_BUCKET_TEST}/${_DATAFLOW_JAR_FILE_LATEST}', 'gs://${_DATAFLOW_JAR_BUCKET_PROD}/dataflow_deployment_$BUILD_ID.jar'] 17 | id: 'deploy-jar-to-prod' 18 | - name: gcr.io/cloud-builders/git 19 | args: ['clone', 'https://source.developers.google.com/p/$PROJECT_ID/r/$REPO_NAME'] 20 | id: 'check-out-source-code' 21 | - name: gcr.io/cloud-builders/gsutil 22 | args: ['cp', 'support-files/input.txt', 'gs://${_COMPOSER_INPUT_BUCKET}'] 23 | dir: '$REPO_NAME/workflow-dag' 24 | id: 'deploy-input-file' 25 | - name: gcr.io/cloud-builders/gcloud 26 | args: ['composer', 'environments', 'run', '${_COMPOSER_ENV_NAME}', '--location', '${_COMPOSER_REGION}', 'variables', 'set', '--', 'dataflow_jar_file_prod', 'dataflow_deployment_$BUILD_ID.jar'] 27 | id: 'set-composer-jar-ref' 28 | - name: gcr.io/cloud-builders/gsutil 29 | args: ['cp', 'data-pipeline-prod.py', '${_COMPOSER_DAG_BUCKET}'] 30 | dir: '$REPO_NAME/workflow-dag' 31 | id: 'deploy-processing-pipeline' 32 | - name: gcr.io/cloud-builders/gcloud 33 | entrypoint: 'bash' 34 | args: ['wait_for_dag_deployed.sh', '${_COMPOSER_ENV_NAME}', '${_COMPOSER_REGION}', '${_COMPOSER_DAG_NAME_PROD}', '6', '20'] 35 | dir: '$REPO_NAME/build-pipeline' 36 | id: 'wait-for-dag-deployed-on-composer' 37 | -------------------------------------------------------------------------------- /env-setup/set_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script sets the environment variables for project environment specific 4 | # information such as project_id, region and zone choice. And also name of 5 | # buckets that are used by the build pipeline and the data processing workflow. 6 | # 7 | # Copyright 2019 Google Inc. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 
11 | # You may obtain a copy of the License at 12 | # 13 | # https://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | export TEST='test' 21 | export GCP_PROJECT_ID=$(gcloud config list --format 'value(core.project)') 22 | export PROJECT_NUMBER=$(gcloud projects describe "${GCP_PROJECT_ID}" --format='get(projectNumber)') 23 | export DATAFLOW_JAR_BUCKET_TEST="${GCP_PROJECT_ID}-composer-dataflow-source-${TEST}" 24 | export INPUT_BUCKET_TEST="${GCP_PROJECT_ID}-composer-input-${TEST}" 25 | export RESULT_BUCKET_TEST="${GCP_PROJECT_ID}-composer-result-${TEST}" 26 | export REF_BUCKET_TEST="${GCP_PROJECT_ID}-composer-ref-${TEST}" 27 | export DATAFLOW_STAGING_BUCKET_TEST="${GCP_PROJECT_ID}-dataflow-staging-${TEST}" 28 | export PUBSUB_TOPIC='integration-test-complete-topic' 29 | export PROD='prod' 30 | export DATAFLOW_JAR_BUCKET_PROD="${GCP_PROJECT_ID}-composer-dataflow-source-${PROD}" 31 | export INPUT_BUCKET_PROD="${GCP_PROJECT_ID}-composer-input-${PROD}" 32 | export RESULT_BUCKET_PROD="${GCP_PROJECT_ID}-composer-result-${PROD}" 33 | export DATAFLOW_STAGING_BUCKET_PROD="${GCP_PROJECT_ID}-dataflow-staging-${PROD}" 34 | export COMPOSER_REGION='us-central1' 35 | export RESULT_BUCKET_REGION="${COMPOSER_REGION}" 36 | export COMPOSER_ZONE_ID='us-central1-a' 37 | 38 | export COMPOSER_ENV_NAME='data-pipeline-composer' 39 | export SOURCE_CODE_REPO='data-pipeline-source' 40 | export COMPOSER_DAG_NAME_TEST='test_word_count' 41 | export COMPOSER_DAG_NAME_PROD='prod_word_count' 42 | -------------------------------------------------------------------------------- /source-code/workflow-dag/data-pipeline-prod.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2019 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Data processing production workflow definition. 15 | """ 16 | import datetime 17 | from airflow import models 18 | from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator 19 | 20 | dataflow_staging_bucket = 'gs://%s/staging' % ( 21 | models.Variable.get('dataflow_staging_bucket_prod')) 22 | 23 | dataflow_jar_location = 'gs://%s/%s' % ( 24 | models.Variable.get('dataflow_jar_location_prod'), 25 | models.Variable.get('dataflow_jar_file_prod')) 26 | 27 | project = models.Variable.get('gcp_project') 28 | region = models.Variable.get('gcp_region') 29 | zone = models.Variable.get('gcp_zone') 30 | input_bucket = 'gs://' + models.Variable.get('gcs_input_bucket_prod') 31 | output_bucket_name = models.Variable.get('gcs_output_bucket_prod') 32 | output_bucket = 'gs://' + output_bucket_name 33 | output_prefix = 'output' 34 | download_task_prefix = 'download_result' 35 | 36 | yesterday = datetime.datetime.combine( 37 | datetime.datetime.today() - datetime.timedelta(1), 38 | datetime.datetime.min.time()) 39 | 40 | default_args = { 41 | 'dataflow_default_options': { 42 | 'project': project, 43 | 'zone': zone, 44 | 'region': region, 45 | 'stagingLocation': dataflow_staging_bucket 46 | } 47 | } 48 | 49 | with models.DAG( 50 | 'prod_word_count', 51 | 
schedule_interval=None, 52 | default_args=default_args) as dag: 53 | dataflow_execution = DataFlowJavaOperator( 54 | task_id='wordcount-run', 55 | jar=dataflow_jar_location, 56 | start_date=yesterday, 57 | options={ 58 | 'autoscalingAlgorithm': 'THROUGHPUT_BASED', 59 | 'maxNumWorkers': '3', 60 | 'inputFile': input_bucket+'/input.txt', 61 | 'output': output_bucket+'/'+output_prefix 62 | } 63 | ) 64 | -------------------------------------------------------------------------------- /example/1.terraform-automation/main.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | locals { 16 | composer_dag_bucket = module.composer.gcs_bucket 17 | composer_service_account = module.composer-service-accounts.iam_email 18 | composer_service_account_email = module.composer-service-accounts.email 19 | #test buckets 20 | dataflow_jar_bucket_test = "${var.project_id}-composer-dataflow-source-test-tf" 21 | input_bucket_test = "${var.project_id}-composer-input-test-tf" 22 | ref_bucket_test = "${var.project_id}-composer-ref-test-tf" 23 | result_bucket_test = "${var.project_id}-composer-result-test-tf" 24 | dataflow_staging_bucket_test = "${var.project_id}-dataflow-staging-test-tf" 25 | #prod buckets 26 | dataflow_jar_bucket_prod = "${var.project_id}-composer-dataflow-source-prod-tf" 27 | input_bucket_prod = "${var.project_id}-composer-input-prod-tf" 28 | result_bucket_prod = "${var.project_id}-composer-result-prod-tf" 29 | dataflow_staging_bucket_prod = "${var.project_id}-dataflow-staging-prod-tf" 30 | project_number = data.google_project.project.number 31 | } 32 | 33 | data "google_project" "project" { 34 | project_id = var.project_id 35 | } 36 | 37 | module "project-services" { 38 | source = "terraform-google-modules/project-factory/google//modules/project_services" 39 | project_id = var.project_id 40 | enable_apis = true 41 | disable_services_on_destroy = true 42 | activate_apis = [ 43 | "sourcerepo.googleapis.com", 44 | "compute.googleapis.com", 45 | "iam.googleapis.com", 46 | "pubsub.googleapis.com", 47 | "composer.googleapis.com", 48 | "cloudbuild.googleapis.com", 49 | "compute.googleapis.com", 50 | "servicenetworking.googleapis.com", 51 | "bigquery.googleapis.com", 52 | "monitoring.googleapis.com", 53 | "logging.googleapis.com", 54 | ] 55 | } 56 | 57 | -------------------------------------------------------------------------------- /env-setup/create_buckets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script creates the buckets used by the build pipelines and the 
data 4 | # processing workflow. It also gives the Cloud Composer service account the 5 | # access level it need to execute the data processing workflow 6 | # 7 | # Copyright 2019 Google Inc. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # https://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | 21 | gsutil ls -L "gs://${DATAFLOW_JAR_BUCKET_TEST}" 2>/dev/null \ 22 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${DATAFLOW_JAR_BUCKET_TEST}" 23 | gsutil ls -L "gs://${INPUT_BUCKET_TEST}" 2>/dev/null \ 24 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${INPUT_BUCKET_TEST}" 25 | gsutil ls -L "gs://${REF_BUCKET_TEST}" 2>/dev/null \ 26 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${REF_BUCKET_TEST}" 27 | gsutil ls -L "gs://${RESULT_BUCKET_TEST}" 2>/dev/null \ 28 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${RESULT_BUCKET_TEST}" 29 | gsutil ls -L "gs://${DATAFLOW_STAGING_BUCKET_TEST}" 2>/dev/null \ 30 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${DATAFLOW_STAGING_BUCKET_TEST}" 31 | gsutil ls -L "gs://${DATAFLOW_JAR_BUCKET_PROD}" 2>/dev/null \ 32 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${DATAFLOW_JAR_BUCKET_PROD}" 33 | gsutil ls -L "gs://${INPUT_BUCKET_PROD}" 2>/dev/null \ 34 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${INPUT_BUCKET_PROD}" 35 | gsutil ls -L "gs://${RESULT_BUCKET_PROD}" 2>/dev/null \ 36 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${RESULT_BUCKET_PROD}" 37 | gsutil ls 
-L "gs://${DATAFLOW_STAGING_BUCKET_PROD}" 2>/dev/null \ 38 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${DATAFLOW_STAGING_BUCKET_PROD}" 39 | 40 | gsutil acl ch -u "${COMPOSER_SERVICE_ACCOUNT}:R" \ 41 | "gs://${DATAFLOW_JAR_BUCKET_TEST}" \ 42 | "gs://${INPUT_BUCKET_TEST}" \ 43 | "gs://${REF_BUCKET_TEST}" \ 44 | "gs://${DATAFLOW_JAR_BUCKET_PROD}" "gs://${INPUT_BUCKET_PROD}" 45 | gsutil acl ch -u "${COMPOSER_SERVICE_ACCOUNT}:W" \ 46 | "gs://${RESULT_BUCKET_TEST}" \ 47 | "gs://${DATAFLOW_STAGING_BUCKET_TEST}" \ 48 | "gs://${RESULT_BUCKET_PROD}" "gs://${DATAFLOW_STAGING_BUCKET_PROD}" 49 | -------------------------------------------------------------------------------- /source-code/workflow-dag/support-files/ref.txt: -------------------------------------------------------------------------------- 1 | Devoutly: 1 2 | dread: 1 3 | from: 1 4 | Be: 1 5 | Flesh: 1 6 | The: 7 7 | turn: 1 8 | thy: 1 9 | off: 1 10 | slings: 1 11 | bourn: 1 12 | does: 1 13 | weary: 1 14 | rather: 1 15 | in: 3 16 | Soft: 1 17 | tis: 1 18 | say: 1 19 | With: 2 20 | pale: 1 21 | Resolution: 1 22 | arrows: 1 23 | Contumely: 1 24 | undiscovered: 1 25 | pitch: 1 26 | lose: 1 27 | all: 2 28 | pangs: 1 29 | Bodkin: 1 30 | thousand: 1 31 | great: 1 32 | their: 1 33 | Love: 1 34 | bear: 3 35 | and: 7 36 | dreams: 1 37 | those: 1 38 | opposing: 1 39 | mind: 1 40 | whose: 1 41 | sicklied: 1 42 | question: 1 43 | There: 1 44 | more: 1 45 | a: 5 46 | puzzles: 1 47 | know: 1 48 | native: 1 49 | will: 1 50 | you: 1 51 | No: 2 52 | have: 2 53 | Calamity: 1 54 | there: 1 55 | Is: 1 56 | insolence: 1 57 | Quietus: 1 58 | conscience: 1 59 | Action: 1 60 | heart: 1 61 | under: 1 62 | end: 2 63 | something: 1 64 | er: 1 65 | us: 3 66 | he: 1 67 | give: 1 68 | Thought: 1 69 | name: 1 70 | with: 1 71 | who: 1 72 | fortune: 1 73 | That: 3 74 | consummation: 1 75 | may: 1 76 | life: 2 77 | or: 1 78 | patient: 1 79 | remember: 1 80 | takes: 1 81 | Tis: 1 82 | o: 1 83 | shocks: 1 84 | my: 1 85 | cowards: 1 86 | 
so: 1 87 | Whether: 1 88 | we: 4 89 | enterprises: 1 90 | man: 1 91 | heir: 1 92 | by: 2 93 | would: 2 94 | rub: 1 95 | And: 5 96 | unworthy: 1 97 | aye: 1 98 | Whips: 1 99 | Thus: 1 100 | country: 1 101 | what: 1 102 | For: 2 103 | nobler: 1 104 | proud: 1 105 | makes: 2 106 | of: 15 107 | sins: 1 108 | the: 15 109 | To: 4 110 | moment: 1 111 | respect: 1 112 | his: 1 113 | fair: 1 114 | come: 1 115 | traveller: 1 116 | Fardels: 1 117 | Who: 1 118 | Law: 1 119 | Must: 1 120 | take: 1 121 | coil: 1 122 | wrong: 1 123 | Nymph: 1 124 | Sea: 1 125 | now: 1 126 | Than: 1 127 | Or: 1 128 | awry: 1 129 | s: 5 130 | Currents: 1 131 | outrageous: 1 132 | make: 2 133 | is: 2 134 | long: 1 135 | spurns: 1 136 | Oppressor: 1 137 | cast: 1 138 | be: 3 139 | merit: 1 140 | might: 1 141 | time: 1 142 | Scorns: 1 143 | that: 4 144 | delay: 1 145 | grunt: 1 146 | against: 1 147 | Arms: 1 148 | himself: 1 149 | Orisons: 1 150 | troubles: 1 151 | after: 1 152 | them: 1 153 | thus: 1 154 | natural: 1 155 | die: 2 156 | d: 1 157 | ills: 1 158 | Ophelia: 1 159 | wished: 1 160 | to: 11 161 | When: 2 162 | regard: 1 163 | pause: 1 164 | But: 1 165 | Office: 1 166 | this: 2 167 | bare: 1 168 | death: 2 169 | perchance: 1 170 | mortal: 1 171 | fly: 1 172 | hue: 1 173 | suffer: 1 174 | not: 2 175 | others: 1 176 | Dream: 1 177 | sweat: 1 178 | ache: 1 179 | returns: 1 180 | sleep: 5 181 | shuffled: 1 182 | despised: 1 183 | -------------------------------------------------------------------------------- /example/1.terraform-automation/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "project_id" { 16 | description = "Project ID where Cloud Composer Environment is created." 17 | type = string 18 | } 19 | 20 | variable "region" { 21 | description = "Region where the Cloud Composer Environment is created." 22 | default = "us-central1" 23 | type = string 24 | } 25 | 26 | variable "composer_env_name" { 27 | description = "Name of Cloud Composer Environment" 28 | default = "composer-dev-env" 29 | type = string 30 | } 31 | 32 | variable "composer_zone_id" { 33 | description = "Zone value which is passed to the Airflow envt" 34 | default = "us-central1-a" 35 | type = string 36 | } 37 | 38 | variable "network" { 39 | type = string 40 | default = "default" 41 | description = "The VPC network to host the composer cluster." 42 | } 43 | 44 | variable "subnetwork" { 45 | type = string 46 | default = "default" 47 | description = "The subnetwork to host the composer cluster." 48 | } 49 | 50 | variable "pubsub_topic" { 51 | type = string 52 | default = "integration-test-complete-topic" 53 | description = "Name of the pub sub topic." 54 | } 55 | 56 | variable "datapipeline_csr_name" { 57 | type = string 58 | default = "data-pipeline-source" 59 | description = "The CSR repo name to be used for storing the datapipeline source code." 60 | 61 | } 62 | 63 | variable "terraform_deployment_csr_name" { 64 | type = string 65 | default = "terraform-automation-source" 66 | description = "The CSR repo name to be used for storing the terraform code." 
67 | 68 | } 69 | 70 | variable "composer_dag_name_prod" { 71 | type = string 72 | default = "prod_word_count" 73 | description = "The Composer DAG name(for prod) to be passed as environment variable." 74 | } 75 | 76 | 77 | variable "composer_dag_name_test" { 78 | type = string 79 | default = "test_word_count" 80 | description = "The Composer DAG name(for test) to be passed as environment variable." 81 | 82 | } 83 | 84 | variable "image_version" { 85 | type = string 86 | description = "The version of the airflow running in the cloud composer environment." 87 | default = "composer-2.0.32-airflow-2.3.4" 88 | } 89 | -------------------------------------------------------------------------------- /source-code/workflow-dag/compare_xcom_maps.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Custom operator that compares dictionaries in xcom. 15 | """ 16 | 17 | from airflow.models import BaseOperator 18 | from airflow.utils.decorators import apply_defaults 19 | 20 | 21 | class CompareXComMapsOperator(BaseOperator): 22 | """Compare dictionary stored in xcom. 
23 | 24 | Args: 25 | ref_task_ids: list of task ids from where the reference dictionary 26 | is fetched 27 | res_task_ids: list of task ids from where the comparing dictionary 28 | is fetched 29 | """ 30 | 31 | @apply_defaults 32 | def __init__( 33 | self, 34 | ref_task_ids, 35 | res_task_ids, 36 | *args, **kwargs): 37 | super(CompareXComMapsOperator, self).__init__(*args, **kwargs) 38 | self.ref_task_ids = ref_task_ids 39 | self.res_task_ids = res_task_ids 40 | 41 | def execute(self, context): 42 | ref_obj = self.read_value_as_obj(self.ref_task_ids, context) 43 | res_obj = self.read_value_as_obj(self.res_task_ids, context) 44 | self.compare_obj(ref_obj, res_obj) 45 | return 'result contains the expected values' 46 | 47 | def read_value_as_obj(self, task_ids, context): 48 | ret_obj = {} 49 | for task_id in task_ids: 50 | value_str = context['ti'].xcom_pull( 51 | key=None, 52 | task_ids=task_id) 53 | self.parse_str_obj(value_str, ret_obj) 54 | return ret_obj 55 | 56 | def parse_str_obj(self, str_rep, obj): 57 | entries = str_rep.split('\n') 58 | for entry in entries: 59 | if entry: 60 | key, value = entry.split(': ') 61 | obj[key] = value 62 | 63 | def compare_obj(self, ref_obj, res_obj): 64 | if ref_obj != res_obj: 65 | raise ValueError(self.create_diff_str(ref_obj, res_obj)) 66 | 67 | def create_diff_str(self, ref_obj, res_obj): 68 | msg = 'The result differs from the expected in the following ways:' 69 | for k in ref_obj: 70 | if k not in res_obj: 71 | msg = msg + ('\nmissing key: %s in result' % k) 72 | elif ref_obj[k] != res_obj[k]: 73 | msg = msg + ('\nexpected %s: %s but got %s: %s' % ( 74 | k, ref_obj[k], k, res_obj[k])) 75 | for k in res_obj: 76 | if k not in ref_obj: 77 | msg = msg + ('\nunexpected key: %s in result' % k) 78 | return msg 79 | -------------------------------------------------------------------------------- /example/1.terraform-automation/composer.tf: -------------------------------------------------------------------------------- 1 | # 
Copyright 2022 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | module "composer" { 16 | source = "terraform-google-modules/composer/google//modules/create_environment_v2" 17 | project_id = var.project_id 18 | region = var.region 19 | composer_env_name = var.composer_env_name 20 | network = module.vpc.name 21 | subnetwork = var.subnetwork 22 | enable_private_endpoint = false 23 | composer_service_account = local.composer_service_account_email 24 | image_version = var.image_version 25 | pod_ip_allocation_range_name = "pods" 26 | service_ip_allocation_range_name = "services" 27 | env_variables = { 28 | "AIRFLOW_VAR_GCP_PROJECT" = "${var.project_id}", 29 | "AIRFLOW_VAR_GCP_REGION" = "${var.region}", 30 | "AIRFLOW_VAR_GCP_ZONE" = "${var.composer_zone_id}", 31 | "AIRFLOW_VAR_GCP_NETWORK" = "${var.network}", 32 | "AIRFLOW_VAR_GCP_SUBNETWORK" = "regions/${var.region}/subnetworks/${var.subnetwork}", 33 | "AIRFLOW_VAR_DATAFLOW_JAR_LOCATION_TEST" = "${local.dataflow_jar_bucket_test}", 34 | "DATAFLOW_JAR_FILE_TEST" = "to_be_overriden", 35 | "AIRFLOW_VAR_GCS_INPUT_BUCKET_TEST" = "${local.input_bucket_test}", 36 | "AIRFLOW_VAR_GCS_REF_BUCKET_TEST" = "${local.ref_bucket_test}", 37 | "AIRFLOW_VAR_GCS_OUTPUT_BUCKET_TEST" = "${local.result_bucket_test}", 38 | "AIRFLOW_VAR_DATAFLOW_STAGING_BUCKET_TEST" = "${local.dataflow_staging_bucket_test}", 39 | "AIRFLOW_VAR_PUBSUB_TOPIC" = "${var.pubsub_topic}", 40 | 
"AIRFLOW_VAR_DATAFLOW_JAR_LOCATION_PROD" = "${local.dataflow_jar_bucket_prod}", 41 | "DATAFLOW_JAR_FILE_PROD" = "to_be_overriden", 42 | "AIRFLOW_VAR_GCS_INPUT_BUCKET_PROD" = "${local.input_bucket_prod}", 43 | "AIRFLOW_VAR_GCS_OUTPUT_BUCKET_PROD" = "${local.result_bucket_prod}", 44 | "AIRFLOW_VAR_DATAFLOW_STAGING_BUCKET_PROD" = "${local.dataflow_staging_bucket_prod}", 45 | } 46 | airflow_config_overrides = { 47 | } 48 | 49 | depends_on = [ 50 | module.vpc, 51 | module.project-services, 52 | ] 53 | } 54 | -------------------------------------------------------------------------------- /source-code/build-pipeline/build_deploy_test.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | steps: 15 | - name: gcr.io/cloud-builders/git 16 | args: ['clone', 'https://source.developers.google.com/p/$PROJECT_ID/r/$REPO_NAME'] 17 | id: 'check-out-source-code' 18 | - name: gcr.io/cloud-builders/mvn:3.5.0-jdk-8 19 | args: ['package', '-q'] 20 | dir: '$REPO_NAME/data-processing-code' 21 | id: 'build-jar' 22 | - name: gcr.io/cloud-builders/gsutil 23 | args: ['cp', '*bundled*.jar', 'gs://${_DATAFLOW_JAR_BUCKET}/dataflow_deployment_$BUILD_ID.jar'] 24 | dir: '$REPO_NAME/data-processing-code/target' 25 | id: 'deploy-jar' 26 | - name: 'apache/airflow:slim-2.3.1-python3.7' 27 | entrypoint: 'python' 28 | args: ['test_compare_xcom_maps.py'] 29 | dir: '$REPO_NAME/workflow-dag' 30 | env: ['PYTHONPATH=/home/airflow/.local/lib/python3.7/site-packages'] 31 | id: 'unit-test-on-operator-code' 32 | - name: gcr.io/cloud-builders/gsutil 33 | args: ['cp', 'support-files/input.txt', 'gs://${_COMPOSER_INPUT_BUCKET}'] 34 | dir: '$REPO_NAME/workflow-dag' 35 | id: 'deploy-test-input-file' 36 | - name: gcr.io/cloud-builders/gsutil 37 | args: ['cp', 'support-files/ref.txt', 'gs://${_COMPOSER_REF_BUCKET}'] 38 | dir: '$REPO_NAME/workflow-dag' 39 | id: 'deploy-test-ref-file' 40 | - name: gcr.io/cloud-builders/gcloud 41 | args: ['composer', 'environments', 'run', '${_COMPOSER_ENV_NAME}', '--location', '${_COMPOSER_REGION}', 'variables', 'set', '--', 'dataflow_jar_file_test', 'dataflow_deployment_$BUILD_ID.jar'] 42 | id: 'set-composer-jar-ref' 43 | - name: gcr.io/cloud-builders/gsutil 44 | args: ['cp', 'compare_xcom_maps.py', '${_COMPOSER_DAG_BUCKET}'] 45 | dir: '$REPO_NAME/workflow-dag' 46 | id: 'deploy-custom-operator' 47 | - name: gcr.io/cloud-builders/gsutil 48 | args: ['cp', 'data-pipeline-test.py', '${_COMPOSER_DAG_BUCKET}'] 49 | dir: '$REPO_NAME/workflow-dag' 50 | id: 'deploy-processing-pipeline' 51 | - name: gcr.io/cloud-builders/gcloud 52 | entrypoint: 'bash' 53 | args: ['wait_for_dag_deployed.sh', '${_COMPOSER_ENV_NAME}', '${_COMPOSER_REGION}', '${_COMPOSER_DAG_NAME_TEST}', 
'6', '20'] 54 | dir: '$REPO_NAME/build-pipeline' 55 | id: 'wait-for-dag-deployed-on-composer' 56 | - name: gcr.io/cloud-builders/gcloud 57 | args: ['composer', 'environments', 'run', '${_COMPOSER_ENV_NAME}', '--location', '${_COMPOSER_REGION}', 'dags', 'trigger', '--', '${_COMPOSER_DAG_NAME_TEST}', '--run-id=$BUILD_ID'] 58 | id: 'trigger-pipeline-execution' 59 | -------------------------------------------------------------------------------- /example/1.terraform-automation/csr-cloudbuildtrigger.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | resource "google_cloudbuild_trigger" "trigger-build-in-test-environment" { 16 | location = "global" 17 | project = var.project_id 18 | name = "datapipeline-trigger-build-test-environment" 19 | trigger_template { 20 | branch_name = "master" 21 | project_id = var.project_id 22 | repo_name = google_sourcerepo_repository.my-repo.name 23 | } 24 | 25 | substitutions = { 26 | REPO_NAME = google_sourcerepo_repository.my-repo.name 27 | _COMPOSER_DAG_BUCKET = local.composer_dag_bucket 28 | _COMPOSER_DAG_NAME_TEST = var.composer_dag_name_test 29 | _COMPOSER_ENV_NAME = var.composer_env_name 30 | _COMPOSER_INPUT_BUCKET = local.input_bucket_test 31 | _COMPOSER_REF_BUCKET = local.ref_bucket_test 32 | _COMPOSER_REGION = var.region 33 | _DATAFLOW_JAR_BUCKET = local.dataflow_jar_bucket_test 34 | } 35 | 36 | filename = "build-pipeline/build_deploy_test.yaml" 37 | depends_on = [ 38 | google_sourcerepo_repository.my-repo 39 | ] 40 | } 41 | 42 | resource "google_cloudbuild_trigger" "trigger-build-in-prod-environment" { 43 | location = "global" 44 | project = var.project_id 45 | name = "datapipeline-trigger-build-prod-environment" 46 | 47 | source_to_build { 48 | uri = google_sourcerepo_repository.my-repo.url 49 | ref = "refs/heads/master" 50 | repo_type = "CLOUD_SOURCE_REPOSITORIES" 51 | } 52 | 53 | pubsub_config { 54 | topic = module.pubsub.id 55 | } 56 | 57 | substitutions = { 58 | REPO_NAME = google_sourcerepo_repository.my-repo.name 59 | _COMPOSER_DAG_BUCKET = local.composer_dag_bucket 60 | _COMPOSER_DAG_NAME_PROD = var.composer_dag_name_prod 61 | _COMPOSER_ENV_NAME = var.composer_env_name 62 | _COMPOSER_INPUT_BUCKET = local.input_bucket_prod 63 | _COMPOSER_REF_BUCKET = local.ref_bucket_test 64 | _COMPOSER_REGION = var.region 65 | _DATAFLOW_JAR_BUCKET_PROD = local.dataflow_jar_bucket_prod 66 | _DATAFLOW_JAR_FILE_LATEST = "$(body.message.data)" 67 | _DATAFLOW_JAR_BUCKET_TEST = local.dataflow_jar_bucket_test 68 | } 69 | approval_config { 70 | approval_required = true 71 | } 
72 | 73 | filename = "build-pipeline/deploy_prod.yaml" 74 | } 75 | 76 | resource "google_sourcerepo_repository" "my-repo" { 77 | name = var.datapipeline_csr_name 78 | project = var.project_id 79 | } 80 | 81 | resource "google_sourcerepo_repository" "tf-source-repo" { 82 | name = var.terraform_deployment_csr_name 83 | project = var.project_id 84 | } 85 | -------------------------------------------------------------------------------- /source-code/data-processing-code/src/test/java/org/apache/beam/examples/WordCountTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Google Inc. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | */ 20 | package org.apache.beam.examples; 21 | 22 | import java.util.Arrays; 23 | import java.util.List; 24 | import org.apache.beam.examples.WordCount.CountWords; 25 | import org.apache.beam.examples.WordCount.ExtractWordsFn; 26 | import org.apache.beam.examples.WordCount.FormatAsTextFn; 27 | import org.apache.beam.sdk.coders.StringUtf8Coder; 28 | import org.apache.beam.sdk.testing.PAssert; 29 | import org.apache.beam.sdk.testing.TestPipeline; 30 | import org.apache.beam.sdk.testing.ValidatesRunner; 31 | import org.apache.beam.sdk.transforms.Create; 32 | import org.apache.beam.sdk.transforms.DoFn; 33 | import org.apache.beam.sdk.transforms.DoFnTester; 34 | import org.apache.beam.sdk.transforms.MapElements; 35 | import org.apache.beam.sdk.values.PCollection; 36 | import org.hamcrest.CoreMatchers; 37 | import org.junit.Assert; 38 | import org.junit.Rule; 39 | import org.junit.Test; 40 | import org.junit.experimental.categories.Category; 41 | import org.junit.runner.RunWith; 42 | import org.junit.runners.JUnit4; 43 | 44 | /** Tests of WordCount. */ 45 | @RunWith(JUnit4.class) 46 | public class WordCountTest { 47 | 48 | /** Example test that tests a specific {@link DoFn}. 
*/ 49 | @Test 50 | public void testExtractWordsFn() throws Exception { 51 | DoFnTester < String, String > extractWordsFn = DoFnTester.of(new ExtractWordsFn()); 52 | 53 | Assert.assertThat( 54 | extractWordsFn.processBundle(" some input words "), 55 | CoreMatchers.hasItems("some", "input", "words")); 56 | Assert.assertThat(extractWordsFn.processBundle(" "), CoreMatchers.hasItems()); 57 | Assert.assertThat( 58 | extractWordsFn.processBundle(" some ", " input", " words"), 59 | CoreMatchers.hasItems("some", "input", "words")); 60 | } 61 | 62 | static final String[] WORDS_ARRAY = 63 | new String[] { 64 | "five", 65 | "five four", 66 | "five four three", 67 | "five four three two", 68 | "", 69 | "five four three two one" 70 | }; 71 | 72 | static final List < String > WORDS = Arrays.asList(WORDS_ARRAY); 73 | 74 | static final String[] COUNTS_ARRAY = new String[] { 75 | "five: 5", 76 | "four: 4", 77 | "three: 3", 78 | "two: 2", 79 | "one: 1" 80 | }; 81 | 82 | @Rule public TestPipeline p = TestPipeline.create(); 83 | 84 | /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */ 85 | @Test 86 | @Category(ValidatesRunner.class) 87 | public void testCountWords() throws Exception { 88 | PCollection < String > input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of())); 89 | 90 | PCollection < String > output = 91 | input.apply(new CountWords()).apply(MapElements.via(new FormatAsTextFn())); 92 | 93 | PAssert.that(output).containsInAnyOrder(COUNTS_ARRAY); 94 | p.run().waitUntilFinish(); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /source-code/workflow-dag/test_compare_xcom_maps.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Unit test of the CompareXComMapsOperator. 15 | """ 16 | import unittest 17 | from compare_xcom_maps import CompareXComMapsOperator 18 | from unittest import mock 19 | 20 | TASK_ID = 'test_compare_task_id' 21 | REF_TASK_ID = 'download_ref_string' 22 | DOWNLOAD_TASK_PREFIX = 'download_result' 23 | CONTEXT_CLASS_NAME = 'airflow.ti_deps.dep_context' 24 | ERROR_LINE_ONE = 'The result differs from the expected in the following ways:\n' 25 | 26 | 27 | def generate_mock_function(first_value, second_value, third_value): 28 | def mock_function(**kwargs): 29 | return { 30 | REF_TASK_ID: 'a: 1\nb: 2\nc: 3', 31 | DOWNLOAD_TASK_PREFIX+'_1': first_value, 32 | DOWNLOAD_TASK_PREFIX+'_2': second_value, 33 | DOWNLOAD_TASK_PREFIX+'_3': third_value 34 | }[kwargs['task_ids']] 35 | return mock_function 36 | 37 | 38 | def equal_mock(): 39 | return generate_mock_function('c: 3', 'b: 2', 'a: 1') 40 | 41 | 42 | def missing_value_mock(): 43 | return generate_mock_function('b: 2', 'a: 1', 'b: 2') 44 | 45 | 46 | def wrong_value_mock(): 47 | return generate_mock_function('a: 1', 'b: 4', 'c: 3') 48 | 49 | 50 | def unexpected_value_mock(): 51 | return generate_mock_function('a: 1', 'c: 3\nd: 4', 'b: 2') 52 | 53 | 54 | class CompareXComMapsOperatorTest(unittest.TestCase): 55 | 56 | def setUp(self): 57 | super(CompareXComMapsOperatorTest, self).setUp() 58 | self.xcom_compare = CompareXComMapsOperator( 59 | task_id=TASK_ID, 60 | ref_task_ids=[REF_TASK_ID], 61 | res_task_ids=[DOWNLOAD_TASK_PREFIX+'_1', 62 | DOWNLOAD_TASK_PREFIX+'_2', 63 | 
DOWNLOAD_TASK_PREFIX+'_3']) 64 | 65 | def test_init(self): 66 | self.assertEqual(self.xcom_compare.task_id, TASK_ID) 67 | self.assertListEqual(self.xcom_compare.ref_task_ids, [REF_TASK_ID]) 68 | self.assertListEqual(self.xcom_compare.res_task_ids, 69 | [DOWNLOAD_TASK_PREFIX+'_1', 70 | DOWNLOAD_TASK_PREFIX+'_2', 71 | DOWNLOAD_TASK_PREFIX+'_3']) 72 | 73 | def assertRaisesWithMessage(self, error_type, msg, func, *args, **kwargs): 74 | with self.assertRaises(error_type) as context: 75 | func(*args, **kwargs) 76 | self.assertEqual(msg, str(context.exception)) 77 | 78 | def execute_value_error(self, mock_func, error_expect_tr): 79 | with mock.patch(CONTEXT_CLASS_NAME) as context_mock: 80 | context_mock['ti'].xcom_pull = mock_func 81 | self.assertRaisesWithMessage( 82 | ValueError, 83 | error_expect_tr, 84 | self.xcom_compare.execute, context_mock) 85 | 86 | def test_equal(self): 87 | with mock.patch(CONTEXT_CLASS_NAME) as context_mock: 88 | context_mock['ti'].xcom_pull = equal_mock() 89 | self.xcom_compare.execute(context_mock) 90 | 91 | def test_missing_value(self): 92 | self.execute_value_error( 93 | missing_value_mock(), 94 | '{}{}'.format(ERROR_LINE_ONE, 'missing key: c in result')) 95 | 96 | def test_wrong_value(self): 97 | self.execute_value_error( 98 | wrong_value_mock(), 99 | '{}{}'.format(ERROR_LINE_ONE, 'expected b: 2 but got b: 4')) 100 | 101 | def test_unexpected_value(self): 102 | self.execute_value_error( 103 | unexpected_value_mock(), 104 | '{}{}'.format(ERROR_LINE_ONE, 'unexpected key: d in result')) 105 | 106 | suite = unittest.TestLoader().loadTestsFromTestCase(CompareXComMapsOperatorTest) 107 | unittest.TextTestRunner(verbosity=2).run(suite) 108 | -------------------------------------------------------------------------------- /source-code/workflow-dag/data-pipeline-test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google Inc. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Data processing test workflow definition. 15 | """ 16 | import datetime 17 | from airflow import models 18 | from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator 19 | from airflow.providers.google.cloud.transfers.gcs_to_local import GCSToLocalFilesystemOperator 20 | from airflow.providers.google.cloud.operators.pubsub import PubSubPublishMessageOperator 21 | from compare_xcom_maps import CompareXComMapsOperator 22 | 23 | dataflow_jar_file_test = models.Variable.get('dataflow_jar_file_test') 24 | 25 | dataflow_staging_bucket = 'gs://%s/staging' % ( 26 | models.Variable.get('dataflow_staging_bucket_test')) 27 | 28 | dataflow_jar_location = 'gs://%s/%s' % ( 29 | models.Variable.get('dataflow_jar_location_test'), 30 | dataflow_jar_file_test) 31 | 32 | project = models.Variable.get('gcp_project') 33 | region = models.Variable.get('gcp_region') 34 | zone = models.Variable.get('gcp_zone') 35 | input_bucket = 'gs://' + models.Variable.get('gcs_input_bucket_test') 36 | output_bucket_name = models.Variable.get('gcs_output_bucket_test') 37 | output_bucket = 'gs://' + output_bucket_name 38 | ref_bucket = models.Variable.get('gcs_ref_bucket_test') 39 | pubsub_topic = models.Variable.get('pubsub_topic') 40 | output_prefix = 'output' 41 | download_task_prefix = 'download_result' 42 | 43 | yesterday = datetime.datetime.combine( 44 | datetime.datetime.today() - 
datetime.timedelta(1), 45 | datetime.datetime.min.time()) 46 | 47 | default_args = { 48 | 'dataflow_default_options': { 49 | 'project': project, 50 | 'zone': zone, 51 | 'region': region, 52 | 'stagingLocation': dataflow_staging_bucket 53 | } 54 | } 55 | 56 | with models.DAG( 57 | 'test_word_count', 58 | schedule_interval=None, 59 | default_args=default_args) as dag: 60 | dataflow_execution = DataFlowJavaOperator( 61 | task_id='wordcount-run', 62 | jar=dataflow_jar_location, 63 | start_date=yesterday, 64 | options={ 65 | 'autoscalingAlgorithm': 'THROUGHPUT_BASED', 66 | 'maxNumWorkers': '3', 67 | 'inputFile': input_bucket+'/input.txt', 68 | 'output': output_bucket+'/'+output_prefix 69 | } 70 | ) 71 | download_expected = GCSToLocalFilesystemOperator( 72 | task_id='download_ref_string', 73 | bucket=ref_bucket, 74 | object_name='ref.txt', 75 | store_to_xcom_key='ref_str', 76 | start_date=yesterday 77 | ) 78 | download_result_one = GCSToLocalFilesystemOperator( 79 | task_id=download_task_prefix+'_1', 80 | bucket=output_bucket_name, 81 | object_name=output_prefix+'-00000-of-00003', 82 | store_to_xcom_key='res_str_1', 83 | start_date=yesterday 84 | ) 85 | download_result_two = GCSToLocalFilesystemOperator( 86 | task_id=download_task_prefix+'_2', 87 | bucket=output_bucket_name, 88 | object_name=output_prefix+'-00001-of-00003', 89 | store_to_xcom_key='res_str_2', 90 | start_date=yesterday 91 | ) 92 | download_result_three = GCSToLocalFilesystemOperator( 93 | task_id=download_task_prefix+'_3', 94 | bucket=output_bucket_name, 95 | object_name=output_prefix+'-00002-of-00003', 96 | store_to_xcom_key='res_str_3', 97 | start_date=yesterday 98 | ) 99 | compare_result = CompareXComMapsOperator( 100 | task_id='do_comparison', 101 | ref_task_ids=['download_ref_string'], 102 | res_task_ids=[download_task_prefix+'_1', 103 | download_task_prefix+'_2', 104 | download_task_prefix+'_3'], 105 | start_date=yesterday 106 | ) 107 | 108 | publish_task = PubSubPublishMessageOperator( 109 | 
task_id='publish_test_complete', 110 | project=project, 111 | topic=pubsub_topic, 112 | messages=[{'data': dataflow_jar_file_test.encode('utf-8')}], 113 | start_date=yesterday 114 | ) 115 | 116 | dataflow_execution >> download_result_one 117 | dataflow_execution >> download_result_two 118 | dataflow_execution >> download_result_three 119 | 120 | download_expected >> compare_result 121 | download_result_one >> compare_result 122 | download_result_two >> compare_result 123 | download_result_three >> compare_result 124 | 125 | compare_result >> publish_task 126 | -------------------------------------------------------------------------------- /example/1.terraform-automation/README.md: -------------------------------------------------------------------------------- 1 | # Terraform Automation Source 2 | 3 | 4 | ## Introduction 5 | 6 | This repository contains the terraform modules which help in automating the GCP resource creation. These terraform modules will create 7 | - Cloud Composer 8 | - GCS bucket 9 | - Cloud Build Trigger 10 | - Cloud Source Repository 11 | - Service Account for Cloud Composer 12 | - Pub-Sub resource 13 | - Network resources like VPC and Subnet 14 | - Enable Google APIs 15 | 16 | ## Prerequisites 17 | 18 | 1. You must have a GCP project created with `project owner` permissions. 19 | 2. Recommendation is to use cloud-shell [link](https://cloud.google.com/shell). If using another shell, ensure that Terraform and the gcloud CLI are installed on your machine. 20 | 21 | ## Execution 22 | 23 | 1. git clone this repository and `cd ci-cd-for-data-processing-workflow/example/1.terraform-automation`. 24 | 25 | `e.g.: git clone https://github.com/GoogleCloudPlatform/ci-cd-for-data-processing-workflow.git && cd ci-cd-for-data-processing-workflow/example/1.terraform-automation` 26 | 2. Perform a gcloud login using `gcloud auth application-default login` 27 | 3. Update `terraform.tfvars` with the values.
Variable `project_id` is the mandatory variable that needs to be updated and other variables are optional. 28 | 4. Execute `terraform init` 29 | 5. Execute `terraform plan` and validate the resources displayed in the output. 30 | 6. Execute `terraform apply` and confirm with `yes` when asked to create resources in your google project. 31 | 7. Once the above steps complete, you would have created the GCP resources listed in the `Introduction` section. You should now also be able to see the two source code repositories created in your project, one for the `terraform-automation-source` and another for the `data-pipeline-source`. 32 | 8. You can now push the three folders(build-pipeline, data-processing-code, workflow-dag) present inside the `source-code` folder ([link](https://github.com/GoogleCloudPlatform/ci-cd-for-data-processing-workflow/tree/master/source-code)) in the Code Source repository created with the name `data-pipeline-source`. 33 | 9. Code push will trigger the Cloudbuild trigger which would create the jobs inside the cloud composer created via the terraform. 34 | 35 | 36 | ## Disclaimer 37 | 38 | Copyright 2022 Google. This software is provided as-is, without warranty or representation for any use or purpose. 39 | Your use of it is subject to your agreement with Google. 40 | 41 | 42 | ## Requirements 43 | 44 | No requirements. 
45 | 46 | ## Providers 47 | 48 | | Name | Version | 49 | |------|---------| 50 | | [google](#provider\_google) | 4.44.1 | 51 | 52 | ## Modules 53 | 54 | | Name | Source | Version | 55 | |------|--------|---------| 56 | | [composer](#module\_composer) | terraform-google-modules/composer/google//modules/create_environment_v2 | n/a | 57 | | [composer-service-accounts](#module\_composer-service-accounts) | github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/iam-service-account | v18.0.0/ | 58 | | [gcs\_buckets\_prod](#module\_gcs\_buckets\_prod) | terraform-google-modules/cloud-storage/google | n/a | 59 | | [gcs\_buckets\_test](#module\_gcs\_buckets\_test) | terraform-google-modules/cloud-storage/google | n/a | 60 | | [pubsub](#module\_pubsub) | terraform-google-modules/pubsub/google | ~> 1.8 | 61 | | [vpc](#module\_vpc) | github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/net-vpc | v18.0.0/ | 62 | 63 | ## Resources 64 | 65 | | Name | Type | 66 | |------|------| 67 | | [google_cloudbuild_trigger.trigger-build-in-prod-environment](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | 68 | | [google_cloudbuild_trigger.trigger-build-in-test-environment](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | 69 | | [google_project_iam_member.cloudbuild_sa](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | 70 | | [google_project_service.project](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_service) | resource | 71 | | [google_sourcerepo_repository.my-repo](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/sourcerepo_repository) | resource | 72 | | [google_sourcerepo_repository.tf-source-repo](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/sourcerepo_repository) | resource | 
73 | | [google_project.project](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/project) | data source | 74 | 75 | ## Inputs 76 | 77 | | Name | Description | Type | Default | Required | 78 | |------|-------------|------|---------|:--------:| 79 | | [composer\_dag\_name\_prod](#input\_composer\_dag\_name\_prod) | The Composer DAG name (for prod) to be passed as environment variable. | `string` | `"prod_word_count"` | no | 80 | | [composer\_dag\_name\_test](#input\_composer\_dag\_name\_test) | The Composer DAG name (for test) to be passed as environment variable. | `string` | `"test_word_count"` | no | 81 | | [composer\_env\_name](#input\_composer\_env\_name) | Name of Cloud Composer Environment | `string` | `"composer-dev-env"` | no | 82 | | [composer\_zone\_id](#input\_composer\_zone\_id) | Zone value which is passed to the Airflow environment | `string` | `"us-central1-a"` | no | 83 | | [datapipeline\_csr\_name](#input\_datapipeline\_csr\_name) | The CSR repo name to be used for storing the datapipeline source code. | `string` | `"data-pipeline-source"` | no | 84 | | [image\_version](#input\_image\_version) | The version of the airflow running in the cloud composer environment. | `string` | `"composer-2.0.32-airflow-2.3.4"` | no | 85 | | [network](#input\_network) | The VPC network to host the composer cluster. | `string` | `"default"` | no | 86 | | [project\_id](#input\_project\_id) | Project ID where Cloud Composer Environment is created. | `string` | n/a | yes | 87 | | [pubsub\_topic](#input\_pubsub\_topic) | Name of the pub sub topic. | `string` | `"integration-test-complete-topic"` | no | 88 | | [region](#input\_region) | Region where the Cloud Composer Environment is created. | `string` | `"us-central1"` | no | 89 | | [subnetwork](#input\_subnetwork) | The subnetwork to host the composer cluster. 
| `string` | `"default"` | no | 90 | | [terraform\_deployment\_csr\_name](#input\_terraform\_deployment\_csr\_name) | The CSR repo name to be used for storing the terraform code. | `string` | `"terraform-automation-source"` | no | 91 | 92 | ## Outputs 93 | 94 | | Name | Description | 95 | |------|-------------| 96 | | [composer](#output\_composer) | Information about the cloud composer resource which is created | 97 | 98 | -------------------------------------------------------------------------------- /source-code/data-processing-code/src/main/java/org/apache/beam/examples/WordCount.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Google Inc. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | */ 20 | package org.apache.beam.examples; 21 | 22 | import org.apache.beam.sdk.Pipeline; 23 | import org.apache.beam.sdk.io.TextIO; 24 | import org.apache.beam.sdk.metrics.Counter; 25 | import org.apache.beam.sdk.metrics.Distribution; 26 | import org.apache.beam.sdk.metrics.Metrics; 27 | import org.apache.beam.sdk.options.Default; 28 | import org.apache.beam.sdk.options.Description; 29 | import org.apache.beam.sdk.options.PipelineOptions; 30 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 31 | import org.apache.beam.sdk.options.Validation.Required; 32 | import org.apache.beam.sdk.transforms.Count; 33 | import org.apache.beam.sdk.transforms.DoFn; 34 | import org.apache.beam.sdk.transforms.MapElements; 35 | import org.apache.beam.sdk.transforms.PTransform; 36 | import org.apache.beam.sdk.transforms.ParDo; 37 | import org.apache.beam.sdk.transforms.SimpleFunction; 38 | import org.apache.beam.sdk.values.KV; 39 | import org.apache.beam.sdk.values.PCollection; 40 | 41 | /** 42 | * An example that counts words in Shakespeare and includes Beam best practices. 43 | * 44 | *

This class, {@link WordCount}, is the second in a series of four successively more detailed 45 | * 'word count' examples. You may first want to take a look at {@link MinimalWordCount}. After 46 | * you've looked at this example, then see the {@link DebuggingWordCount} pipeline, for introduction 47 | * of additional concepts. 48 | * 49 | *

For a detailed walkthrough of this example, see 51 | * https://beam.apache.org/get-started/wordcount-example/ 52 | * 53 | *

Basic concepts, also in the MinimalWordCount example: Reading text files; counting a 54 | * PCollection; writing to text files 55 | * 56 | *

New Concepts: 57 | * 58 | *

 59 |  *   1. Executing a Pipeline both locally and using the selected runner
 60 |  *   2. Using ParDo with static DoFns defined out-of-line
 61 |  *   3. Building a composite transform
 62 |  *   4. Defining your own pipeline options
 63 |  * 
64 | * 65 | *

Concept #1: you can execute this pipeline either locally or using by selecting another runner. 66 | * These are now command-line options and not hard-coded as they were in the MinimalWordCount 67 | * example. 68 | * 69 | *

To change the runner, specify: 70 | * 71 | *

{@code
 72 |  * --runner=YOUR_SELECTED_RUNNER
 73 |  * }
74 | * 75 | *

To execute this pipeline, specify a local output file (if using the {@code DirectRunner}) or 76 | * output prefix on a supported distributed file system. 77 | * 78 | *

{@code
 79 |  * --output=[YOUR_LOCAL_FILE | YOUR_OUTPUT_PREFIX]
 80 |  * }
81 | * 82 | *

The input file defaults to a public data set containing the text of of King Lear, by William 83 | * Shakespeare. You can override it and choose your own input with {@code --inputFile}. 84 | */ 85 | public class WordCount { 86 | 87 | /** 88 | * Concept #2: You can make your pipeline assembly code less verbose by defining your DoFns 89 | * statically out-of-line. This DoFn tokenizes lines of text into individual words; we pass it to 90 | * a ParDo in the pipeline. 91 | */ 92 | static class ExtractWordsFn extends DoFn { 93 | private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, "emptyLines"); 94 | private final Distribution lineLenDist = 95 | Metrics.distribution(ExtractWordsFn.class, "lineLenDistro"); 96 | private static final String TOKENIZER_PATTERN = "[^\\p{L}]+"; 97 | 98 | @ProcessElement 99 | public void processElement(@Element String element, OutputReceiver receiver) { 100 | lineLenDist.update(element.length()); 101 | if (element.trim().isEmpty()) { 102 | emptyLines.inc(); 103 | } 104 | 105 | // Split the line into words. 106 | String[] words = element.split(TOKENIZER_PATTERN, -1); 107 | 108 | // Output each word encountered into the output PCollection. 109 | for (String word: words) { 110 | if (!word.isEmpty()) { 111 | receiver.output(word); 112 | } 113 | } 114 | } 115 | } 116 | 117 | /** A SimpleFunction that converts a Word and Count into a printable string. */ 118 | public static class FormatAsTextFn extends SimpleFunction, String> { 119 | @Override 120 | public String apply(KV input) { 121 | return input.getKey() + ": " + input.getValue(); 122 | } 123 | } 124 | 125 | /** 126 | * A PTransform that converts a PCollection containing lines of text into a PCollection of 127 | * formatted word counts. 128 | * 129 | *

Concept #3: This is a custom composite transform that bundles two transforms (ParDo and 130 | * Count) as a reusable PTransform subclass. Using composite transforms allows for easy reuse, 131 | * modular testing, and an improved monitoring experience. 132 | */ 133 | public static class CountWords 134 | extends PTransform, PCollection>> { 135 | @Override 136 | public PCollection>expand(PCollection lines) { 137 | 138 | // Convert lines of text into individual words. 139 | PCollection words = lines.apply(ParDo.of(new ExtractWordsFn())); 140 | 141 | // Count the number of times each word occurs. 142 | PCollection> wordCounts = words.apply(Count.perElement()); 143 | 144 | return wordCounts; 145 | } 146 | } 147 | 148 | /** 149 | * Options supported by {@link WordCount}. 150 | * 151 | *

Concept #4: Defining your own configuration options. Here, you can add your own arguments to 152 | * be processed by the command-line parser, and specify default values for them. You can then 153 | * access the options values in your pipeline code. 154 | * 155 | *

Inherits standard configuration options. 156 | */ 157 | public interface WordCountOptions extends PipelineOptions { 158 | 159 | /** 160 | * By default, this example reads from a public dataset containing the text of King Lear. Set 161 | * this option to choose a different input file or glob. 162 | */ 163 | @Description("Path of the file to read from") 164 | @Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt") 165 | String getInputFile(); 166 | 167 | void setInputFile(String value); 168 | 169 | /** Set this required option to specify where to write the output. */ 170 | @Description("Path of the file to write to") 171 | @Required 172 | String getOutput(); 173 | 174 | void setOutput(String value); 175 | } 176 | 177 | static void runWordCount(WordCountOptions options) { 178 | Pipeline p = Pipeline.create(options); 179 | 180 | // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the 181 | // static FormatAsTextFn() to the ParDo transform. 182 | p.apply("ReadLines", TextIO.read().from(options.getInputFile())) 183 | .apply(new CountWords()) 184 | .apply(MapElements.via(new FormatAsTextFn())) 185 | .apply("WriteCounts", TextIO.write().to(options.getOutput()).withNumShards(3)); 186 | 187 | p.run().waitUntilFinish(); 188 | } 189 | 190 | public static void main(String[] args) { 191 | WordCountOptions options = 192 | PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class); 193 | runWordCount(options); 194 | } 195 | } 196 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 
9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 
176 | 177 | END OF TERMS AND CONDITIONS 178 | -------------------------------------------------------------------------------- /source-code/data-processing-code/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 22 | 4.0.0 23 | org.example 24 | word-count-beam 25 | 0.1 26 | jar 27 | 28 | 2.19.0 29 | 1.28.0 30 | [30.0-jre,) 31 | 2.1 32 | [2.9.10.8,) 33 | 2.10.3 34 | [4.13.1,) 35 | 3.7.0 36 | 1.6.0 37 | 3.0.2 38 | 3.1.0 39 | 3.0.0 40 | 1.7.25 41 | 2.21.0 42 | 43 | 44 | 45 | apache.snapshots 46 | Apache Development Snapshot Repository 47 | https://repository.apache.org/content/repositories/snapshots/ 48 | 49 | false 50 | 51 | 52 | true 53 | 54 | 55 | 56 | 57 | 58 | 59 | org.apache.maven.plugins 60 | maven-compiler-plugin 61 | ${maven-compiler-plugin.version} 62 | 63 | 1.8 64 | 1.8 65 | 66 | 67 | 69 | 70 | org.apache.maven.plugins 71 | maven-surefire-plugin 72 | ${maven-surefire-plugin.version} 73 | 74 | all 75 | 4 76 | true 77 | 78 | 79 | 80 | org.apache.maven.surefire 81 | surefire-junit47 82 | ${maven-surefire-plugin.version} 83 | 84 | 85 | 86 | 87 | org.apache.maven.plugins 88 | maven-jar-plugin 89 | ${maven-jar-plugin.version} 90 | 91 | 92 | 93 | true 94 | lib/ 95 | org.apache.beam.examples.WordCount 96 | 97 | 98 | 99 | 100 | 104 | 105 | org.apache.maven.plugins 106 | maven-shade-plugin 107 | ${maven-shade-plugin.version} 108 | 109 | 110 | package 111 | 112 | shade 113 | 114 | 115 | ${project.artifactId}-bundled-${project.version} 116 | 117 | 118 | *:* 119 | 120 | META-INF/LICENSE 121 | META-INF/*.SF 122 | META-INF/*.DSA 123 | META-INF/*.RSA 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | org.codehaus.mojo 139 | exec-maven-plugin 140 | ${maven-exec-plugin.version} 141 | 142 | false 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | direct-runner 151 | 152 | true 153 | 154 | 155 | 156 | 157 | org.apache.beam 158 | beam-runners-direct-java 159 | ${beam.version} 160 | runtime 
161 | 162 | 163 | 164 | 165 | dataflow-runner 166 | 167 | 168 | 169 | org.apache.beam 170 | beam-runners-google-cloud-dataflow-java 171 | ${beam.version} 172 | runtime 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | org.apache.beam 181 | beam-sdks-java-core 182 | ${beam.version} 183 | 184 | 185 | 186 | org.apache.beam 187 | beam-sdks-java-io-google-cloud-platform 188 | ${beam.version} 189 | 190 | 191 | 192 | com.google.api-client 193 | google-api-client 194 | ${google-clients.version} 195 | 196 | 198 | 199 | com.google.guava 200 | guava-jdk5 201 | 202 | 203 | 204 | 205 | com.google.http-client 206 | google-http-client 207 | ${google-clients.version} 208 | 209 | 211 | 212 | com.google.guava 213 | guava-jdk5 214 | 215 | 216 | 217 | 218 | joda-time 219 | joda-time 220 | ${joda.version} 221 | 222 | 223 | com.google.guava 224 | guava 225 | ${guava.version} 226 | 227 | 228 | 229 | org.slf4j 230 | slf4j-api 231 | ${slf4j.version} 232 | 233 | 234 | org.slf4j 235 | slf4j-jdk14 236 | ${slf4j.version} 237 | 238 | runtime 239 | 240 | 241 | 242 | org.apache.beam 243 | beam-runners-direct-java 244 | ${beam.version} 245 | test 246 | 247 | 248 | org.hamcrest 249 | hamcrest-core 250 | ${hamcrest.version} 251 | test 252 | 253 | 254 | org.mockito 255 | mockito-core 256 | ${mockito.version} 257 | test 258 | 259 | 260 | junit 261 | junit 262 | ${junit.version} 263 | 264 | 265 | org.apache.beam 266 | beam-runners-google-cloud-dataflow-java 267 | ${beam.version} 268 | runtime 269 | 270 | 271 | 272 | --------------------------------------------------------------------------------