├── README.md
├── example
│   └── 1.terraform-automation
│       ├── output.tf
│       ├── terraform.tfvars
│       ├── pubsub.tf
│       ├── network.tf
│       ├── gcs.tf
│       ├── iam.tf
│       ├── main.tf
│       ├── variables.tf
│       ├── composer.tf
│       ├── csr-cloudbuildtrigger.tf
│       └── README.md
├── env-setup
│   ├── composer_variables.template
│   ├── set_composer_variables.sh
│   ├── set_env.sh
│   └── create_buckets.sh
├── source-code
│   ├── build-pipeline
│   │   ├── wait_for_dag_deployed.sh
│   │   ├── deploy_prod.yaml
│   │   └── build_deploy_test.yaml
│   ├── workflow-dag
│   │   ├── support-files
│   │   │   ├── input.txt
│   │   │   └── ref.txt
│   │   ├── data-pipeline-prod.py
│   │   ├── compare_xcom_maps.py
│   │   ├── test_compare_xcom_maps.py
│   │   └── data-pipeline-test.py
│   └── data-processing-code
│       ├── src
│       │   ├── test
│       │   │   └── java
│       │   │       └── org
│       │   │           └── apache
│       │   │               └── beam
│       │   │                   └── examples
│       │   │                       └── WordCountTest.java
│       │   └── main
│       │       └── java
│       │           └── org
│       │               └── apache
│       │                   └── beam
│       │                       └── examples
│       │                           └── WordCount.java
│       └── pom.xml
├── CONTRIBUTING.md
└── LICENSE
/README.md:
--------------------------------------------------------------------------------
1 | # CI/CD for data processing workflow
2 | This repository contains the source code for the guide on using Cloud Build and Cloud Composer to create a CI/CD pipeline for building, deploying, and testing a data processing workflow.
3 |
4 | Please refer to the solution guide for the steps to run the code: [solution
5 | tutorial](https://cloud.google.com/solutions/cicd-pipeline-for-data-processing)
6 |
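7 | A rough sketch of the environment setup (the tutorial above is the authoritative reference; `COMPOSER_SERVICE_ACCOUNT` is assumed to be exported separately before running `create_buckets.sh`):
8 |
9 | ```bash
10 | cd env-setup
11 | # Export the project id, region/zone, bucket names, and Composer settings.
12 | source set_env.sh
13 | # Create the test and prod buckets and grant the Composer service account access.
14 | bash create_buckets.sh
15 | # Render composer_variables.template and import the Airflow variables that the DAGs read.
16 | bash set_composer_variables.sh
17 | ```
18 |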
--------------------------------------------------------------------------------
/example/1.terraform-automation/output.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | output "composer" {
16 | value = module.composer
17 | description = "Information about the Cloud Composer environment that is created"
18 | }
19 |
20 |
--------------------------------------------------------------------------------
/example/1.terraform-automation/terraform.tfvars:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | //Project must already exist
16 | project_id = ""
17 |
18 | //Network and Subnetwork will be created
19 | network = "composer-network"
20 | subnetwork = "development"
21 |
--------------------------------------------------------------------------------
/env-setup/composer_variables.template:
--------------------------------------------------------------------------------
1 | {
2 | "gcp_project": "${GCP_PROJECT_ID}",
3 | "gcp_region": "${COMPOSER_REGION}",
4 | "gcp_zone": "${COMPOSER_ZONE_ID}",
5 | "dataflow_jar_location_test": "${DATAFLOW_JAR_BUCKET_TEST}",
6 | "dataflow_jar_file_test": "to_be_overriden",
7 | "gcs_input_bucket_test": "${INPUT_BUCKET_TEST}",
8 | "gcs_ref_bucket_test": "${REF_BUCKET_TEST}",
9 | "gcs_output_bucket_test": "${RESULT_BUCKET_TEST}",
10 | "dataflow_staging_bucket_test": "${DATAFLOW_STAGING_BUCKET_TEST}",
11 | "pubsub_topic": "${PUBSUB_TOPIC}",
12 | "dataflow_jar_location_prod": "${DATAFLOW_JAR_BUCKET_PROD}",
13 | "dataflow_jar_file_prod": "to_be_overriden",
14 | "gcs_input_bucket_prod": "${INPUT_BUCKET_PROD}",
15 | "gcs_output_bucket_prod": "${RESULT_BUCKET_PROD}",
16 | "dataflow_staging_bucket_prod": "${DATAFLOW_STAGING_BUCKET_PROD}"
17 | }
18 |
--------------------------------------------------------------------------------
/example/1.terraform-automation/pubsub.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | module "pubsub" {
16 | source = "terraform-google-modules/pubsub/google"
17 | version = "~> 1.8"
18 | topic = var.pubsub_topic
19 | project_id = var.project_id
20 | depends_on = [
21 | module.project-services,
22 | ]
23 | }
24 |
--------------------------------------------------------------------------------
/source-code/build-pipeline/wait_for_dag_deployed.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Script that waits for the specified Cloud Composer DAG to deploy.
4 | # Usage: wait_for_dag_deployed.sh <env-name> <location> <dag-name> <max-retries> <sleep-seconds>
5 | # Copyright 2019 Google Inc.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # https://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | n=0
20 | until [[ $n -ge $4 ]]
21 | do
22 | status=0
23 | gcloud composer environments run "${1}" --location "${2}" dags list \
24 | 2>&1 | grep "${3}" && break
25 | status=$?
26 | n=$(($n+1))
27 | sleep "${5}"
28 | done
29 | exit $status
30 |
--------------------------------------------------------------------------------
/example/1.terraform-automation/network.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | module "vpc" {
16 | source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/net-vpc?ref=v18.0.0"
17 | project_id = var.project_id
18 | name = var.network
19 | subnets = [
20 | {
21 | ip_cidr_range = "10.0.0.0/24"
22 | name = var.subnetwork
23 | region = var.region
24 | secondary_ip_range = {
25 | pods = "10.57.0.0/17",
26 | services = "10.57.128.0/22",
27 | }
28 | },
29 | ]
30 | }
31 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement. You (or your employer) retain the copyright to your contribution;
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to to see
12 | your current agreements on file or to sign a new one.
13 |
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 |
18 | ## Code reviews
19 |
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Consult
22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
23 | information on using pull requests.
24 |
25 | ## Community Guidelines
26 |
27 | This project follows [Google's Open Source Community
28 | Guidelines](https://opensource.google.com/conduct/).
29 |
--------------------------------------------------------------------------------
/env-setup/set_composer_variables.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # This script sets the variables in Composer. The variables are needed for the
4 | # data processing DAGs to execute properly, such as the project id, GCP region,
5 | # and zone. It also sets the Cloud Storage buckets where test files are stored.
6 | #
7 | # Copyright 2019 Google Inc.
8 | #
9 | # Licensed under the Apache License, Version 2.0 (the "License");
10 | # you may not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at
12 | #
13 | # https://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 |
21 | COMPOSER_VAR_FILE=composer_variables.json
22 | if [ ! -f "${COMPOSER_VAR_FILE}" ]; then
23 | echo "Generate composer variable file ${COMPOSER_VAR_FILE}."
24 | envsubst < composer_variables.template > ${COMPOSER_VAR_FILE}
25 | fi
26 |
27 | gcloud composer environments storage data import \
28 | --environment ${COMPOSER_ENV_NAME} \
29 | --location ${COMPOSER_REGION} \
30 | --source ${COMPOSER_VAR_FILE}
31 |
32 | gcloud composer environments run \
33 | ${COMPOSER_ENV_NAME} \
34 | --location ${COMPOSER_REGION} \
35 | variables import -- /home/airflow/gcs/data/${COMPOSER_VAR_FILE}
36 |
--------------------------------------------------------------------------------
/example/1.terraform-automation/gcs.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | module "gcs_buckets_test" {
16 | source = "terraform-google-modules/cloud-storage/google"
17 | project_id = var.project_id
18 | names = [local.dataflow_jar_bucket_test,
19 | local.input_bucket_test,
20 | local.result_bucket_test,
21 | local.ref_bucket_test,
22 | local.dataflow_staging_bucket_test
23 | ]
24 | prefix = ""
25 | set_admin_roles = true
26 | admins = [local.composer_service_account]
27 | versioning = {
28 | first = true
29 | }
30 | }
31 |
32 | module "gcs_buckets_prod" {
33 | source = "terraform-google-modules/cloud-storage/google"
34 | project_id = var.project_id
35 | names = [local.dataflow_jar_bucket_prod,
36 | local.input_bucket_prod,
37 | local.result_bucket_prod,
38 | local.dataflow_staging_bucket_prod
39 | ]
40 | prefix = ""
41 | set_admin_roles = true
42 | admins = [local.composer_service_account]
43 | versioning = {
44 | first = true
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/source-code/workflow-dag/support-files/input.txt:
--------------------------------------------------------------------------------
1 | To be, or not to be, that is the question:
2 | Whether 'tis nobler in the mind to suffer
3 | The slings and arrows of outrageous fortune,
4 | Or to take Arms against a Sea of troubles,
5 | And by opposing end them: to die, to sleep
6 | No more; and by a sleep, to say we end
7 | The heart-ache, and the thousand natural shocks
8 | That Flesh is heir to? 'Tis a consummation
9 | Devoutly to be wished. To die, to sleep,
10 | To sleep, perchance to Dream; aye, there's the rub,
11 | For in that sleep of death, what dreams may come,
12 | When we have shuffled off this mortal coil,
13 | Must give us pause. There's the respect
14 | That makes Calamity of so long life:
15 | For who would bear the Whips and Scorns of time,
16 | The Oppressor's wrong, the proud man's Contumely,
17 | The pangs of despised Love, the Law’s delay,
18 | The insolence of Office, and the spurns
19 | That patient merit of the unworthy takes,
20 | When he himself might his Quietus make
21 | With a bare Bodkin? Who would Fardels bear,
22 | To grunt and sweat under a weary life,
23 | But that the dread of something after death,
24 | The undiscovered country, from whose bourn
25 | No traveller returns, puzzles the will,
26 | And makes us rather bear those ills we have,
27 | Than fly to others that we know not of.
28 | Thus conscience does make cowards of us all,
29 | And thus the native hue of Resolution
30 | Is sicklied o'er, with the pale cast of Thought,
31 | And enterprises of great pitch and moment,
32 | With this regard their Currents turn awry,
33 | And lose the name of Action. Soft you now,
34 | The fair Ophelia? Nymph, in thy Orisons
35 | Be all my sins remember'd.
36 |
--------------------------------------------------------------------------------
/example/1.terraform-automation/iam.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | module "composer-service-accounts" {
16 | source = "github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/iam-service-account?ref=v18.0.0"
17 | project_id = var.project_id
18 | name = "composer-default"
19 | generate_key = false
20 | # authoritative roles granted *on* the service accounts to other identities
21 | iam = {
22 | }
23 | # non-authoritative roles granted *to* the service accounts on other resources
24 | iam_project_roles = {
25 | "${var.project_id}" = [
26 | "roles/logging.logWriter",
27 | "roles/monitoring.metricWriter",
28 | "roles/composer.ServiceAgentV2Ext",
29 | "roles/composer.worker",
30 | "roles/composer.admin",
31 | "roles/dataflow.admin",
32 | "roles/iam.serviceAccountUser",
33 | "roles/compute.networkUser",
34 | ]
35 | }
36 | }
37 |
38 | resource "google_project_iam_member" "cloudbuild_sa" {
39 | for_each = toset(["roles/composer.admin", "roles/composer.worker"])
40 | project = var.project_id
41 | role = each.key
42 | member = "serviceAccount:${local.project_number}@cloudbuild.gserviceaccount.com"
43 | }
44 |
--------------------------------------------------------------------------------
/source-code/build-pipeline/deploy_prod.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Google Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | steps:
15 | - name: gcr.io/cloud-builders/gsutil
16 | args: ['cp', 'gs://${_DATAFLOW_JAR_BUCKET_TEST}/${_DATAFLOW_JAR_FILE_LATEST}', 'gs://${_DATAFLOW_JAR_BUCKET_PROD}/dataflow_deployment_$BUILD_ID.jar']
17 | id: 'deploy-jar-to-prod'
18 | - name: gcr.io/cloud-builders/git
19 | args: ['clone', 'https://source.developers.google.com/p/$PROJECT_ID/r/$REPO_NAME']
20 | id: 'check-out-source-code'
21 | - name: gcr.io/cloud-builders/gsutil
22 | args: ['cp', 'support-files/input.txt', 'gs://${_COMPOSER_INPUT_BUCKET}']
23 | dir: '$REPO_NAME/workflow-dag'
24 | id: 'deploy-input-file'
25 | - name: gcr.io/cloud-builders/gcloud
26 | args: ['composer', 'environments', 'run', '${_COMPOSER_ENV_NAME}', '--location', '${_COMPOSER_REGION}', 'variables', 'set', '--', 'dataflow_jar_file_prod', 'dataflow_deployment_$BUILD_ID.jar']
27 | id: 'set-composer-jar-ref'
28 | - name: gcr.io/cloud-builders/gsutil
29 | args: ['cp', 'data-pipeline-prod.py', '${_COMPOSER_DAG_BUCKET}']
30 | dir: '$REPO_NAME/workflow-dag'
31 | id: 'deploy-processing-pipeline'
32 | - name: gcr.io/cloud-builders/gcloud
33 | entrypoint: 'bash'
34 | args: ['wait_for_dag_deployed.sh', '${_COMPOSER_ENV_NAME}', '${_COMPOSER_REGION}', '${_COMPOSER_DAG_NAME_PROD}', '6', '20']
35 | dir: '$REPO_NAME/build-pipeline'
36 | id: 'wait-for-dag-deployed-on-composer'
37 |
--------------------------------------------------------------------------------
/env-setup/set_env.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # This script sets the environment variables for project-specific information
4 | # such as the project id, region, and zone, as well as the names of the
5 | # buckets used by the build pipeline and the data processing workflow.
6 | #
7 | # Copyright 2019 Google Inc.
8 | #
9 | # Licensed under the Apache License, Version 2.0 (the "License");
10 | # you may not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at
12 | #
13 | # https://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 | export TEST='test'
21 | export GCP_PROJECT_ID=$(gcloud config list --format 'value(core.project)')
22 | export PROJECT_NUMBER=$(gcloud projects describe "${GCP_PROJECT_ID}" --format='get(projectNumber)')
23 | export DATAFLOW_JAR_BUCKET_TEST="${GCP_PROJECT_ID}-composer-dataflow-source-${TEST}"
24 | export INPUT_BUCKET_TEST="${GCP_PROJECT_ID}-composer-input-${TEST}"
25 | export RESULT_BUCKET_TEST="${GCP_PROJECT_ID}-composer-result-${TEST}"
26 | export REF_BUCKET_TEST="${GCP_PROJECT_ID}-composer-ref-${TEST}"
27 | export DATAFLOW_STAGING_BUCKET_TEST="${GCP_PROJECT_ID}-dataflow-staging-${TEST}"
28 | export PUBSUB_TOPIC='integration-test-complete-topic'
29 | export PROD='prod'
30 | export DATAFLOW_JAR_BUCKET_PROD="${GCP_PROJECT_ID}-composer-dataflow-source-${PROD}"
31 | export INPUT_BUCKET_PROD="${GCP_PROJECT_ID}-composer-input-${PROD}"
32 | export RESULT_BUCKET_PROD="${GCP_PROJECT_ID}-composer-result-${PROD}"
33 | export DATAFLOW_STAGING_BUCKET_PROD="${GCP_PROJECT_ID}-dataflow-staging-${PROD}"
34 | export COMPOSER_REGION='us-central1'
35 | export RESULT_BUCKET_REGION="${COMPOSER_REGION}"
36 | export COMPOSER_ZONE_ID='us-central1-a'
37 |
38 | export COMPOSER_ENV_NAME='data-pipeline-composer'
39 | export SOURCE_CODE_REPO='data-pipeline-source'
40 | export COMPOSER_DAG_NAME_TEST='test_word_count'
41 | export COMPOSER_DAG_NAME_PROD='prod_word_count'
42 |
--------------------------------------------------------------------------------
/source-code/workflow-dag/data-pipeline-prod.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Google Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Data processing production workflow definition.
15 | """
16 | import datetime
17 | from airflow import models
18 | from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator
19 |
20 | dataflow_staging_bucket = 'gs://%s/staging' % (
21 | models.Variable.get('dataflow_staging_bucket_prod'))
22 |
23 | dataflow_jar_location = 'gs://%s/%s' % (
24 | models.Variable.get('dataflow_jar_location_prod'),
25 | models.Variable.get('dataflow_jar_file_prod'))
26 |
27 | project = models.Variable.get('gcp_project')
28 | region = models.Variable.get('gcp_region')
29 | zone = models.Variable.get('gcp_zone')
30 | input_bucket = 'gs://' + models.Variable.get('gcs_input_bucket_prod')
31 | output_bucket_name = models.Variable.get('gcs_output_bucket_prod')
32 | output_bucket = 'gs://' + output_bucket_name
33 | output_prefix = 'output'
34 | download_task_prefix = 'download_result'
35 |
36 | yesterday = datetime.datetime.combine(
37 | datetime.datetime.today() - datetime.timedelta(1),
38 | datetime.datetime.min.time())
39 |
40 | default_args = {
41 | 'dataflow_default_options': {
42 | 'project': project,
43 | 'zone': zone,
44 | 'region': region,
45 | 'stagingLocation': dataflow_staging_bucket
46 | }
47 | }
48 |
49 | with models.DAG(
50 | 'prod_word_count',
51 | schedule_interval=None,
52 | default_args=default_args) as dag:
53 | dataflow_execution = DataFlowJavaOperator(
54 | task_id='wordcount-run',
55 | jar=dataflow_jar_location,
56 | start_date=yesterday,
57 | options={
58 | 'autoscalingAlgorithm': 'THROUGHPUT_BASED',
59 | 'maxNumWorkers': '3',
60 | 'inputFile': input_bucket+'/input.txt',
61 | 'output': output_bucket+'/'+output_prefix
62 | }
63 | )
64 |
--------------------------------------------------------------------------------
/example/1.terraform-automation/main.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | locals {
16 | composer_dag_bucket = module.composer.gcs_bucket
17 | composer_service_account = module.composer-service-accounts.iam_email
18 | composer_service_account_email = module.composer-service-accounts.email
19 | #test buckets
20 | dataflow_jar_bucket_test = "${var.project_id}-composer-dataflow-source-test-tf"
21 | input_bucket_test = "${var.project_id}-composer-input-test-tf"
22 | ref_bucket_test = "${var.project_id}-composer-ref-test-tf"
23 | result_bucket_test = "${var.project_id}-composer-result-test-tf"
24 | dataflow_staging_bucket_test = "${var.project_id}-dataflow-staging-test-tf"
25 | #prod buckets
26 | dataflow_jar_bucket_prod = "${var.project_id}-composer-dataflow-source-prod-tf"
27 | input_bucket_prod = "${var.project_id}-composer-input-prod-tf"
28 | result_bucket_prod = "${var.project_id}-composer-result-prod-tf"
29 | dataflow_staging_bucket_prod = "${var.project_id}-dataflow-staging-prod-tf"
30 | project_number = data.google_project.project.number
31 | }
32 |
33 | data "google_project" "project" {
34 | project_id = var.project_id
35 | }
36 |
37 | module "project-services" {
38 | source = "terraform-google-modules/project-factory/google//modules/project_services"
39 | project_id = var.project_id
40 | enable_apis = true
41 | disable_services_on_destroy = true
42 | activate_apis = [
43 | "sourcerepo.googleapis.com",
44 | "compute.googleapis.com",
45 | "iam.googleapis.com",
46 | "pubsub.googleapis.com",
47 | "composer.googleapis.com",
48 | "cloudbuild.googleapis.com",
49 | "compute.googleapis.com",
50 | "servicenetworking.googleapis.com",
51 | "bigquery.googleapis.com",
52 | "monitoring.googleapis.com",
53 | "logging.googleapis.com",
54 | ]
55 | }
56 |
57 |
--------------------------------------------------------------------------------
/env-setup/create_buckets.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # This script creates the buckets used by the build pipelines and the data
4 | # processing workflow. It also gives the Cloud Composer service account the
5 | # access level it needs to execute the data processing workflow.
6 | #
7 | # Copyright 2019 Google Inc.
8 | #
9 | # Licensed under the Apache License, Version 2.0 (the "License");
10 | # you may not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at
12 | #
13 | # https://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 |
21 | gsutil ls -L "gs://${DATAFLOW_JAR_BUCKET_TEST}" 2>/dev/null \
22 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${DATAFLOW_JAR_BUCKET_TEST}"
23 | gsutil ls -L "gs://${INPUT_BUCKET_TEST}" 2>/dev/null \
24 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${INPUT_BUCKET_TEST}"
25 | gsutil ls -L "gs://${REF_BUCKET_TEST}" 2>/dev/null \
26 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${REF_BUCKET_TEST}"
27 | gsutil ls -L "gs://${RESULT_BUCKET_TEST}" 2>/dev/null \
28 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${RESULT_BUCKET_TEST}"
29 | gsutil ls -L "gs://${DATAFLOW_STAGING_BUCKET_TEST}" 2>/dev/null \
30 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${DATAFLOW_STAGING_BUCKET_TEST}"
31 | gsutil ls -L "gs://${DATAFLOW_JAR_BUCKET_PROD}" 2>/dev/null \
32 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${DATAFLOW_JAR_BUCKET_PROD}"
33 | gsutil ls -L "gs://${INPUT_BUCKET_PROD}" 2>/dev/null \
34 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${INPUT_BUCKET_PROD}"
35 | gsutil ls -L "gs://${RESULT_BUCKET_PROD}" 2>/dev/null \
36 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${RESULT_BUCKET_PROD}"
37 | gsutil ls -L "gs://${DATAFLOW_STAGING_BUCKET_PROD}" 2>/dev/null \
38 | || gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${DATAFLOW_STAGING_BUCKET_PROD}"
39 |
40 | gsutil acl ch -u "${COMPOSER_SERVICE_ACCOUNT}:R" \
41 | "gs://${DATAFLOW_JAR_BUCKET_TEST}" \
42 | "gs://${INPUT_BUCKET_TEST}" \
43 | "gs://${REF_BUCKET_TEST}" \
44 | "gs://${DATAFLOW_JAR_BUCKET_PROD}" "gs://${INPUT_BUCKET_PROD}"
45 | gsutil acl ch -u "${COMPOSER_SERVICE_ACCOUNT}:W" \
46 | "gs://${RESULT_BUCKET_TEST}" \
47 | "gs://${DATAFLOW_STAGING_BUCKET_TEST}" \
48 | "gs://${RESULT_BUCKET_PROD}" "gs://${DATAFLOW_STAGING_BUCKET_PROD}"
49 |
--------------------------------------------------------------------------------
/source-code/workflow-dag/support-files/ref.txt:
--------------------------------------------------------------------------------
1 | Devoutly: 1
2 | dread: 1
3 | from: 1
4 | Be: 1
5 | Flesh: 1
6 | The: 7
7 | turn: 1
8 | thy: 1
9 | off: 1
10 | slings: 1
11 | bourn: 1
12 | does: 1
13 | weary: 1
14 | rather: 1
15 | in: 3
16 | Soft: 1
17 | tis: 1
18 | say: 1
19 | With: 2
20 | pale: 1
21 | Resolution: 1
22 | arrows: 1
23 | Contumely: 1
24 | undiscovered: 1
25 | pitch: 1
26 | lose: 1
27 | all: 2
28 | pangs: 1
29 | Bodkin: 1
30 | thousand: 1
31 | great: 1
32 | their: 1
33 | Love: 1
34 | bear: 3
35 | and: 7
36 | dreams: 1
37 | those: 1
38 | opposing: 1
39 | mind: 1
40 | whose: 1
41 | sicklied: 1
42 | question: 1
43 | There: 1
44 | more: 1
45 | a: 5
46 | puzzles: 1
47 | know: 1
48 | native: 1
49 | will: 1
50 | you: 1
51 | No: 2
52 | have: 2
53 | Calamity: 1
54 | there: 1
55 | Is: 1
56 | insolence: 1
57 | Quietus: 1
58 | conscience: 1
59 | Action: 1
60 | heart: 1
61 | under: 1
62 | end: 2
63 | something: 1
64 | er: 1
65 | us: 3
66 | he: 1
67 | give: 1
68 | Thought: 1
69 | name: 1
70 | with: 1
71 | who: 1
72 | fortune: 1
73 | That: 3
74 | consummation: 1
75 | may: 1
76 | life: 2
77 | or: 1
78 | patient: 1
79 | remember: 1
80 | takes: 1
81 | Tis: 1
82 | o: 1
83 | shocks: 1
84 | my: 1
85 | cowards: 1
86 | so: 1
87 | Whether: 1
88 | we: 4
89 | enterprises: 1
90 | man: 1
91 | heir: 1
92 | by: 2
93 | would: 2
94 | rub: 1
95 | And: 5
96 | unworthy: 1
97 | aye: 1
98 | Whips: 1
99 | Thus: 1
100 | country: 1
101 | what: 1
102 | For: 2
103 | nobler: 1
104 | proud: 1
105 | makes: 2
106 | of: 15
107 | sins: 1
108 | the: 15
109 | To: 4
110 | moment: 1
111 | respect: 1
112 | his: 1
113 | fair: 1
114 | come: 1
115 | traveller: 1
116 | Fardels: 1
117 | Who: 1
118 | Law: 1
119 | Must: 1
120 | take: 1
121 | coil: 1
122 | wrong: 1
123 | Nymph: 1
124 | Sea: 1
125 | now: 1
126 | Than: 1
127 | Or: 1
128 | awry: 1
129 | s: 5
130 | Currents: 1
131 | outrageous: 1
132 | make: 2
133 | is: 2
134 | long: 1
135 | spurns: 1
136 | Oppressor: 1
137 | cast: 1
138 | be: 3
139 | merit: 1
140 | might: 1
141 | time: 1
142 | Scorns: 1
143 | that: 4
144 | delay: 1
145 | grunt: 1
146 | against: 1
147 | Arms: 1
148 | himself: 1
149 | Orisons: 1
150 | troubles: 1
151 | after: 1
152 | them: 1
153 | thus: 1
154 | natural: 1
155 | die: 2
156 | d: 1
157 | ills: 1
158 | Ophelia: 1
159 | wished: 1
160 | to: 11
161 | When: 2
162 | regard: 1
163 | pause: 1
164 | But: 1
165 | Office: 1
166 | this: 2
167 | bare: 1
168 | death: 2
169 | perchance: 1
170 | mortal: 1
171 | fly: 1
172 | hue: 1
173 | suffer: 1
174 | not: 2
175 | others: 1
176 | Dream: 1
177 | sweat: 1
178 | ache: 1
179 | returns: 1
180 | sleep: 5
181 | shuffled: 1
182 | despised: 1
183 |
--------------------------------------------------------------------------------
/example/1.terraform-automation/variables.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | variable "project_id" {
16 | description = "Project ID where Cloud Composer Environment is created."
17 | type = string
18 | }
19 |
20 | variable "region" {
21 | description = "Region where the Cloud Composer Environment is created."
22 | default = "us-central1"
23 | type = string
24 | }
25 |
26 | variable "composer_env_name" {
27 | description = "Name of Cloud Composer Environment"
28 | default = "composer-dev-env"
29 | type = string
30 | }
31 |
32 | variable "composer_zone_id" {
33 | description = "Zone value which is passed to the Airflow environment."
34 | default = "us-central1-a"
35 | type = string
36 | }
37 |
38 | variable "network" {
39 | type = string
40 | default = "default"
41 | description = "The VPC network to host the composer cluster."
42 | }
43 |
44 | variable "subnetwork" {
45 | type = string
46 | default = "default"
47 | description = "The subnetwork to host the composer cluster."
48 | }
49 |
50 | variable "pubsub_topic" {
51 | type = string
52 | default = "integration-test-complete-topic"
53 | description = "Name of the pub sub topic."
54 | }
55 |
56 | variable "datapipeline_csr_name" {
57 | type = string
58 | default = "data-pipeline-source"
59 | description = "The CSR repo name to be used for storing the datapipeline source code."
60 |
61 | }
62 |
63 | variable "terraform_deployment_csr_name" {
64 | type = string
65 | default = "terraform-automation-source"
66 | description = "The CSR repo name to be used for storing the terraform code."
67 |
68 | }
69 |
70 | variable "composer_dag_name_prod" {
71 | type = string
72 | default = "prod_word_count"
73 | description = "The Composer DAG name (for prod) to be passed as an environment variable."
74 | }
75 |
76 |
77 | variable "composer_dag_name_test" {
78 | type = string
79 | default = "test_word_count"
80 | description = "The Composer DAG name (for test) to be passed as an environment variable."
81 |
82 | }
83 |
84 | variable "image_version" {
85 | type = string
86 | description = "The version of Airflow running in the Cloud Composer environment."
87 | default = "composer-2.0.32-airflow-2.3.4"
88 | }
89 |
--------------------------------------------------------------------------------
/source-code/workflow-dag/compare_xcom_maps.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Google Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Custom operator that compares dictionaries in xcom.
15 | """
16 |
17 | from airflow.models import BaseOperator
18 | from airflow.utils.decorators import apply_defaults
19 |
20 |
21 | class CompareXComMapsOperator(BaseOperator):
22 | """Compare dictionary stored in xcom.
23 |
24 | Args:
25 | ref_task_ids: list of task ids from where the reference dictionary
26 | is fetched
27 | res_task_ids: list of task ids from where the comparing dictionary
28 | is fetched
29 | """
30 |
31 | @apply_defaults
32 | def __init__(
33 | self,
34 | ref_task_ids,
35 | res_task_ids,
36 | *args, **kwargs):
37 | super(CompareXComMapsOperator, self).__init__(*args, **kwargs)
38 | self.ref_task_ids = ref_task_ids
39 | self.res_task_ids = res_task_ids
40 |
41 | def execute(self, context):
42 | ref_obj = self.read_value_as_obj(self.ref_task_ids, context)
43 | res_obj = self.read_value_as_obj(self.res_task_ids, context)
44 | self.compare_obj(ref_obj, res_obj)
45 | return 'result contains the expected values'
46 |
47 | def read_value_as_obj(self, task_ids, context):
48 | ret_obj = {}
49 | for task_id in task_ids:
50 | value_str = context['ti'].xcom_pull(
51 | key=None,
52 | task_ids=task_id)
53 | self.parse_str_obj(value_str, ret_obj)
54 | return ret_obj
55 |
56 | def parse_str_obj(self, str_rep, obj):
57 | entries = str_rep.split('\n')
58 | for entry in entries:
59 | if entry:
60 | key, value = entry.split(': ')
61 | obj[key] = value
62 |
63 | def compare_obj(self, ref_obj, res_obj):
64 | if ref_obj != res_obj:
65 | raise ValueError(self.create_diff_str(ref_obj, res_obj))
66 |
67 | def create_diff_str(self, ref_obj, res_obj):
68 | msg = 'The result differs from the expected in the following ways:'
69 | for k in ref_obj:
70 | if k not in res_obj:
71 | msg = msg + ('\nmissing key: %s in result' % k)
72 | elif ref_obj[k] != res_obj[k]:
73 | msg = msg + ('\nexpected %s: %s but got %s: %s' % (
74 | k, ref_obj[k], k, res_obj[k]))
75 | for k in res_obj:
76 | if k not in ref_obj:
77 | msg = msg + ('\nunexpected key: %s in result' % k)
78 | return msg
79 |
--------------------------------------------------------------------------------
/example/1.terraform-automation/composer.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | module "composer" {
16 | source = "terraform-google-modules/composer/google//modules/create_environment_v2"
17 | project_id = var.project_id
18 | region = var.region
19 | composer_env_name = var.composer_env_name
20 | network = module.vpc.name
21 | subnetwork = var.subnetwork
22 | enable_private_endpoint = false
23 | composer_service_account = local.composer_service_account_email
24 | image_version = var.image_version
25 | pod_ip_allocation_range_name = "pods"
26 | service_ip_allocation_range_name = "services"
27 | env_variables = {
28 | "AIRFLOW_VAR_GCP_PROJECT" = var.project_id,
29 | "AIRFLOW_VAR_GCP_REGION" = var.region,
30 | "AIRFLOW_VAR_GCP_ZONE" = var.composer_zone_id,
31 | "AIRFLOW_VAR_GCP_NETWORK" = var.network,
32 | "AIRFLOW_VAR_GCP_SUBNETWORK" = "regions/${var.region}/subnetworks/${var.subnetwork}",
33 | "AIRFLOW_VAR_DATAFLOW_JAR_LOCATION_TEST" = local.dataflow_jar_bucket_test,
34 | "DATAFLOW_JAR_FILE_TEST" = "to_be_overriden",
35 | "AIRFLOW_VAR_GCS_INPUT_BUCKET_TEST" = local.input_bucket_test,
36 | "AIRFLOW_VAR_GCS_REF_BUCKET_TEST" = local.ref_bucket_test,
37 | "AIRFLOW_VAR_GCS_OUTPUT_BUCKET_TEST" = local.result_bucket_test,
38 | "AIRFLOW_VAR_DATAFLOW_STAGING_BUCKET_TEST" = local.dataflow_staging_bucket_test,
39 | "AIRFLOW_VAR_PUBSUB_TOPIC" = var.pubsub_topic,
40 | "AIRFLOW_VAR_DATAFLOW_JAR_LOCATION_PROD" = local.dataflow_jar_bucket_prod,
41 | "DATAFLOW_JAR_FILE_PROD" = "to_be_overriden",
42 | "AIRFLOW_VAR_GCS_INPUT_BUCKET_PROD" = local.input_bucket_prod,
43 | "AIRFLOW_VAR_GCS_OUTPUT_BUCKET_PROD" = local.result_bucket_prod,
44 | "AIRFLOW_VAR_DATAFLOW_STAGING_BUCKET_PROD" = local.dataflow_staging_bucket_prod,
45 | }
46 | airflow_config_overrides = {
47 | }
48 |
49 | depends_on = [
50 | module.vpc,
51 | module.project-services,
52 | ]
53 | }
54 |
--------------------------------------------------------------------------------
/source-code/build-pipeline/build_deploy_test.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Google Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | steps:
15 | - name: gcr.io/cloud-builders/git
16 | args: ['clone', 'https://source.developers.google.com/p/$PROJECT_ID/r/$REPO_NAME']
17 | id: 'check-out-source-code'
18 | - name: gcr.io/cloud-builders/mvn:3.5.0-jdk-8
19 | args: ['package', '-q']
20 | dir: '$REPO_NAME/data-processing-code'
21 | id: 'build-jar'
22 | - name: gcr.io/cloud-builders/gsutil
23 | args: ['cp', '*bundled*.jar', 'gs://${_DATAFLOW_JAR_BUCKET}/dataflow_deployment_$BUILD_ID.jar']
24 | dir: '$REPO_NAME/data-processing-code/target'
25 | id: 'deploy-jar'
26 | - name: 'apache/airflow:slim-2.3.1-python3.7'
27 | entrypoint: 'python'
28 | args: ['test_compare_xcom_maps.py']
29 | dir: '$REPO_NAME/workflow-dag'
30 | env: ['PYTHONPATH=/home/airflow/.local/lib/python3.7/site-packages']
31 | id: 'unit-test-on-operator-code'
32 | - name: gcr.io/cloud-builders/gsutil
33 | args: ['cp', 'support-files/input.txt', 'gs://${_COMPOSER_INPUT_BUCKET}']
34 | dir: '$REPO_NAME/workflow-dag'
35 | id: 'deploy-test-input-file'
36 | - name: gcr.io/cloud-builders/gsutil
37 | args: ['cp', 'support-files/ref.txt', 'gs://${_COMPOSER_REF_BUCKET}']
38 | dir: '$REPO_NAME/workflow-dag'
39 | id: 'deploy-test-ref-file'
40 | - name: gcr.io/cloud-builders/gcloud
41 | args: ['composer', 'environments', 'run', '${_COMPOSER_ENV_NAME}', '--location', '${_COMPOSER_REGION}', 'variables', 'set', '--', 'dataflow_jar_file_test', 'dataflow_deployment_$BUILD_ID.jar']
42 | id: 'set-composer-jar-ref'
43 | - name: gcr.io/cloud-builders/gsutil
44 | args: ['cp', 'compare_xcom_maps.py', '${_COMPOSER_DAG_BUCKET}']
45 | dir: '$REPO_NAME/workflow-dag'
46 | id: 'deploy-custom-operator'
47 | - name: gcr.io/cloud-builders/gsutil
48 | args: ['cp', 'data-pipeline-test.py', '${_COMPOSER_DAG_BUCKET}']
49 | dir: '$REPO_NAME/workflow-dag'
50 | id: 'deploy-processing-pipeline'
51 | - name: gcr.io/cloud-builders/gcloud
52 | entrypoint: 'bash'
53 | args: ['wait_for_dag_deployed.sh', '${_COMPOSER_ENV_NAME}', '${_COMPOSER_REGION}', '${_COMPOSER_DAG_NAME_TEST}', '6', '20']
54 | dir: '$REPO_NAME/build-pipeline'
55 | id: 'wait-for-dag-deployed-on-composer'
56 | - name: gcr.io/cloud-builders/gcloud
57 | args: ['composer', 'environments', 'run', '${_COMPOSER_ENV_NAME}', '--location', '${_COMPOSER_REGION}', 'dags', 'trigger', '--', '${_COMPOSER_DAG_NAME_TEST}', '--run-id=$BUILD_ID']
58 | id: 'trigger-pipeline-execution'
59 |
--------------------------------------------------------------------------------
/example/1.terraform-automation/csr-cloudbuildtrigger.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | resource "google_cloudbuild_trigger" "trigger-build-in-test-environment" {
16 | location = "global"
17 | project = var.project_id
18 | name = "datapipeline-trigger-build-test-environment"
19 | trigger_template {
20 | branch_name = "master"
21 | project_id = var.project_id
22 | repo_name = google_sourcerepo_repository.my-repo.name
23 | }
24 |
25 | substitutions = {
26 | REPO_NAME = google_sourcerepo_repository.my-repo.name
27 | _COMPOSER_DAG_BUCKET = local.composer_dag_bucket
28 | _COMPOSER_DAG_NAME_TEST = var.composer_dag_name_test
29 | _COMPOSER_ENV_NAME = var.composer_env_name
30 | _COMPOSER_INPUT_BUCKET = local.input_bucket_test
31 | _COMPOSER_REF_BUCKET = local.ref_bucket_test
32 | _COMPOSER_REGION = var.region
33 | _DATAFLOW_JAR_BUCKET = local.dataflow_jar_bucket_test
34 | }
35 |
36 | filename = "build-pipeline/build_deploy_test.yaml"
37 | depends_on = [
38 | google_sourcerepo_repository.my-repo
39 | ]
40 | }
41 |
42 | resource "google_cloudbuild_trigger" "trigger-build-in-prod-environment" {
43 | location = "global"
44 | project = var.project_id
45 | name = "datapipeline-trigger-build-prod-environment"
46 |
47 | source_to_build {
48 | uri = google_sourcerepo_repository.my-repo.url
49 | ref = "refs/heads/master"
50 | repo_type = "CLOUD_SOURCE_REPOSITORIES"
51 | }
52 |
53 | pubsub_config {
54 | topic = module.pubsub.id
55 | }
56 |
57 | substitutions = {
58 | REPO_NAME = google_sourcerepo_repository.my-repo.name
59 | _COMPOSER_DAG_BUCKET = local.composer_dag_bucket
60 | _COMPOSER_DAG_NAME_PROD = var.composer_dag_name_prod
61 | _COMPOSER_ENV_NAME = var.composer_env_name
62 | _COMPOSER_INPUT_BUCKET = local.input_bucket_prod
63 | _COMPOSER_REF_BUCKET = local.ref_bucket_test
64 | _COMPOSER_REGION = var.region
65 | _DATAFLOW_JAR_BUCKET_PROD = local.dataflow_jar_bucket_prod
66 | _DATAFLOW_JAR_FILE_LATEST = "$(body.message.data)"
67 | _DATAFLOW_JAR_BUCKET_TEST = local.dataflow_jar_bucket_test
68 | }
69 | approval_config {
70 | approval_required = true
71 | }
72 |
73 | filename = "build-pipeline/deploy_prod.yaml"
74 | }
75 |
76 | resource "google_sourcerepo_repository" "my-repo" {
77 | name = var.datapipeline_csr_name
78 | project = var.project_id
79 | }
80 |
81 | resource "google_sourcerepo_repository" "tf-source-repo" {
82 | name = var.terraform_deployment_csr_name
83 | project = var.project_id
84 | }
85 |
--------------------------------------------------------------------------------
/source-code/data-processing-code/src/test/java/org/apache/beam/examples/WordCountTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Google Inc.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one
5 | * or more contributor license agreements. See the NOTICE file
6 | * distributed with this work for additional information
7 | * regarding copyright ownership. The ASF licenses this file
8 | * to you under the Apache License, Version 2.0 (the
9 | * "License"); you may not use this file except in compliance
10 | * with the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | */
20 | package org.apache.beam.examples;
21 |
22 | import java.util.Arrays;
23 | import java.util.List;
24 | import org.apache.beam.examples.WordCount.CountWords;
25 | import org.apache.beam.examples.WordCount.ExtractWordsFn;
26 | import org.apache.beam.examples.WordCount.FormatAsTextFn;
27 | import org.apache.beam.sdk.coders.StringUtf8Coder;
28 | import org.apache.beam.sdk.testing.PAssert;
29 | import org.apache.beam.sdk.testing.TestPipeline;
30 | import org.apache.beam.sdk.testing.ValidatesRunner;
31 | import org.apache.beam.sdk.transforms.Create;
32 | import org.apache.beam.sdk.transforms.DoFn;
33 | import org.apache.beam.sdk.transforms.DoFnTester;
34 | import org.apache.beam.sdk.transforms.MapElements;
35 | import org.apache.beam.sdk.values.PCollection;
36 | import org.hamcrest.CoreMatchers;
37 | import org.junit.Assert;
38 | import org.junit.Rule;
39 | import org.junit.Test;
40 | import org.junit.experimental.categories.Category;
41 | import org.junit.runner.RunWith;
42 | import org.junit.runners.JUnit4;
43 |
44 | /** Tests of WordCount. */
45 | @RunWith(JUnit4.class)
46 | public class WordCountTest {
47 |
48 | /** Example test that tests a specific {@link DoFn}. */
49 | @Test
50 | public void testExtractWordsFn() throws Exception {
51 | DoFnTester<String, String> extractWordsFn = DoFnTester.of(new ExtractWordsFn());
52 |
53 | Assert.assertThat(
54 | extractWordsFn.processBundle(" some input words "),
55 | CoreMatchers.hasItems("some", "input", "words"));
56 | Assert.assertThat(extractWordsFn.processBundle(" "), CoreMatchers.hasItems());
57 | Assert.assertThat(
58 | extractWordsFn.processBundle(" some ", " input", " words"),
59 | CoreMatchers.hasItems("some", "input", "words"));
60 | }
61 |
62 | static final String[] WORDS_ARRAY =
63 | new String[] {
64 | "five",
65 | "five four",
66 | "five four three",
67 | "five four three two",
68 | "",
69 | "five four three two one"
70 | };
71 |
72 | static final List<String> WORDS = Arrays.asList(WORDS_ARRAY);
73 |
74 | static final String[] COUNTS_ARRAY = new String[] {
75 | "five: 5",
76 | "four: 4",
77 | "three: 3",
78 | "two: 2",
79 | "one: 1"
80 | };
81 |
82 | @Rule public TestPipeline p = TestPipeline.create();
83 |
84 | /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */
85 | @Test
86 | @Category(ValidatesRunner.class)
87 | public void testCountWords() throws Exception {
88 | PCollection<String> input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));
89 |
90 | PCollection<String> output =
91 | input.apply(new CountWords()).apply(MapElements.via(new FormatAsTextFn()));
92 |
93 | PAssert.that(output).containsInAnyOrder(COUNTS_ARRAY);
94 | p.run().waitUntilFinish();
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
/source-code/workflow-dag/test_compare_xcom_maps.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Google Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Unit test of the CompareXComMapsOperator.
15 | """
16 | import unittest
17 | from compare_xcom_maps import CompareXComMapsOperator
18 | from unittest import mock
19 |
20 | TASK_ID = 'test_compare_task_id'
21 | REF_TASK_ID = 'download_ref_string'
22 | DOWNLOAD_TASK_PREFIX = 'download_result'
23 | CONTEXT_CLASS_NAME = 'airflow.ti_deps.dep_context'
24 | ERROR_LINE_ONE = 'The result differs from the expected in the following ways:\n'
25 |
26 |
27 | def generate_mock_function(first_value, second_value, third_value):
28 | def mock_function(**kwargs):
29 | return {
30 | REF_TASK_ID: 'a: 1\nb: 2\nc: 3',
31 | DOWNLOAD_TASK_PREFIX+'_1': first_value,
32 | DOWNLOAD_TASK_PREFIX+'_2': second_value,
33 | DOWNLOAD_TASK_PREFIX+'_3': third_value
34 | }[kwargs['task_ids']]
35 | return mock_function
36 |
37 |
38 | def equal_mock():
39 | return generate_mock_function('c: 3', 'b: 2', 'a: 1')
40 |
41 |
42 | def missing_value_mock():
43 | return generate_mock_function('b: 2', 'a: 1', 'b: 2')
44 |
45 |
46 | def wrong_value_mock():
47 | return generate_mock_function('a: 1', 'b: 4', 'c: 3')
48 |
49 |
50 | def unexpected_value_mock():
51 | return generate_mock_function('a: 1', 'c: 3\nd: 4', 'b: 2')
52 |
53 |
54 | class CompareXComMapsOperatorTest(unittest.TestCase):
55 |
56 | def setUp(self):
57 | super(CompareXComMapsOperatorTest, self).setUp()
58 | self.xcom_compare = CompareXComMapsOperator(
59 | task_id=TASK_ID,
60 | ref_task_ids=[REF_TASK_ID],
61 | res_task_ids=[DOWNLOAD_TASK_PREFIX+'_1',
62 | DOWNLOAD_TASK_PREFIX+'_2',
63 | DOWNLOAD_TASK_PREFIX+'_3'])
64 |
65 | def test_init(self):
66 | self.assertEqual(self.xcom_compare.task_id, TASK_ID)
67 | self.assertListEqual(self.xcom_compare.ref_task_ids, [REF_TASK_ID])
68 | self.assertListEqual(self.xcom_compare.res_task_ids,
69 | [DOWNLOAD_TASK_PREFIX+'_1',
70 | DOWNLOAD_TASK_PREFIX+'_2',
71 | DOWNLOAD_TASK_PREFIX+'_3'])
72 |
73 | def assertRaisesWithMessage(self, error_type, msg, func, *args, **kwargs):
74 | with self.assertRaises(error_type) as context:
75 | func(*args, **kwargs)
76 | self.assertEqual(msg, str(context.exception))
77 |
78 | def execute_value_error(self, mock_func, error_expect_tr):
79 | with mock.patch(CONTEXT_CLASS_NAME) as context_mock:
80 | context_mock['ti'].xcom_pull = mock_func
81 | self.assertRaisesWithMessage(
82 | ValueError,
83 | error_expect_tr,
84 | self.xcom_compare.execute, context_mock)
85 |
86 | def test_equal(self):
87 | with mock.patch(CONTEXT_CLASS_NAME) as context_mock:
88 | context_mock['ti'].xcom_pull = equal_mock()
89 | self.xcom_compare.execute(context_mock)
90 |
91 | def test_missing_value(self):
92 | self.execute_value_error(
93 | missing_value_mock(),
94 | '{}{}'.format(ERROR_LINE_ONE, 'missing key: c in result'))
95 |
96 | def test_wrong_value(self):
97 | self.execute_value_error(
98 | wrong_value_mock(),
99 | '{}{}'.format(ERROR_LINE_ONE, 'expected b: 2 but got b: 4'))
100 |
101 | def test_unexpected_value(self):
102 | self.execute_value_error(
103 | unexpected_value_mock(),
104 | '{}{}'.format(ERROR_LINE_ONE, 'unexpected key: d in result'))
105 |
106 | if __name__ == '__main__':
107 |     unittest.main(verbosity=2)
108 |
--------------------------------------------------------------------------------
/source-code/workflow-dag/data-pipeline-test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Google Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Data processing test workflow definition.
15 | """
16 | import datetime
17 | from airflow import models
18 | from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator
19 | from airflow.providers.google.cloud.transfers.gcs_to_local import GCSToLocalFilesystemOperator
20 | from airflow.providers.google.cloud.operators.pubsub import PubSubPublishMessageOperator
21 | from compare_xcom_maps import CompareXComMapsOperator
22 |
23 | dataflow_jar_file_test = models.Variable.get('dataflow_jar_file_test')
24 |
25 | dataflow_staging_bucket = 'gs://%s/staging' % (
26 | models.Variable.get('dataflow_staging_bucket_test'))
27 |
28 | dataflow_jar_location = 'gs://%s/%s' % (
29 | models.Variable.get('dataflow_jar_location_test'),
30 | dataflow_jar_file_test)
31 |
32 | project = models.Variable.get('gcp_project')
33 | region = models.Variable.get('gcp_region')
34 | zone = models.Variable.get('gcp_zone')
35 | input_bucket = 'gs://' + models.Variable.get('gcs_input_bucket_test')
36 | output_bucket_name = models.Variable.get('gcs_output_bucket_test')
37 | output_bucket = 'gs://' + output_bucket_name
38 | ref_bucket = models.Variable.get('gcs_ref_bucket_test')
39 | pubsub_topic = models.Variable.get('pubsub_topic')
40 | output_prefix = 'output'
41 | download_task_prefix = 'download_result'
42 |
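# A fixed start_date in the past lets a manually triggered run start
# immediately instead of waiting for a schedule interval to elapse.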
43 | yesterday = datetime.datetime.combine(
44 | datetime.datetime.today() - datetime.timedelta(1),
45 | datetime.datetime.min.time())
46 |
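# dataflow_default_options is consumed by the Dataflow operator and applied
# to every Dataflow job launched from this DAG.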
47 | default_args = {
48 | 'dataflow_default_options': {
49 | 'project': project,
50 | 'zone': zone,
51 | 'region': region,
52 | 'stagingLocation': dataflow_staging_bucket
53 | }
54 | }
55 |
56 | with models.DAG(
57 | 'test_word_count',
58 | schedule_interval=None,
59 | default_args=default_args) as dag:
60 |     dataflow_execution = DataflowCreateJavaJobOperator(
61 | task_id='wordcount-run',
62 | jar=dataflow_jar_location,
63 | start_date=yesterday,
64 | options={
65 | 'autoscalingAlgorithm': 'THROUGHPUT_BASED',
66 | 'maxNumWorkers': '3',
67 | 'inputFile': input_bucket+'/input.txt',
68 | 'output': output_bucket+'/'+output_prefix
69 | }
70 | )
71 | download_expected = GCSToLocalFilesystemOperator(
72 | task_id='download_ref_string',
73 | bucket=ref_bucket,
74 | object_name='ref.txt',
75 | store_to_xcom_key='ref_str',
76 | start_date=yesterday
77 | )
78 | download_result_one = GCSToLocalFilesystemOperator(
79 | task_id=download_task_prefix+'_1',
80 | bucket=output_bucket_name,
81 | object_name=output_prefix+'-00000-of-00003',
82 | store_to_xcom_key='res_str_1',
83 | start_date=yesterday
84 | )
85 | download_result_two = GCSToLocalFilesystemOperator(
86 | task_id=download_task_prefix+'_2',
87 | bucket=output_bucket_name,
88 | object_name=output_prefix+'-00001-of-00003',
89 | store_to_xcom_key='res_str_2',
90 | start_date=yesterday
91 | )
92 | download_result_three = GCSToLocalFilesystemOperator(
93 | task_id=download_task_prefix+'_3',
94 | bucket=output_bucket_name,
95 | object_name=output_prefix+'-00002-of-00003',
96 | store_to_xcom_key='res_str_3',
97 | start_date=yesterday
98 | )
99 | compare_result = CompareXComMapsOperator(
100 | task_id='do_comparison',
101 | ref_task_ids=['download_ref_string'],
102 | res_task_ids=[download_task_prefix+'_1',
103 | download_task_prefix+'_2',
104 | download_task_prefix+'_3'],
105 | start_date=yesterday
106 | )
107 |
108 | publish_task = PubSubPublishMessageOperator(
109 | task_id='publish_test_complete',
110 | project=project,
111 | topic=pubsub_topic,
112 | messages=[{'data': dataflow_jar_file_test.encode('utf-8')}],
113 | start_date=yesterday
114 | )
115 |
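    # Ordering: the Dataflow job must finish before its three output shards
    # are downloaded; the comparison waits on all four downloads and, on
    # success, publishes a completion message to Pub/Sub.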
116 | dataflow_execution >> download_result_one
117 | dataflow_execution >> download_result_two
118 | dataflow_execution >> download_result_three
119 |
120 | download_expected >> compare_result
121 | download_result_one >> compare_result
122 | download_result_two >> compare_result
123 | download_result_three >> compare_result
124 |
125 | compare_result >> publish_task
126 |
--------------------------------------------------------------------------------
/example/1.terraform-automation/README.md:
--------------------------------------------------------------------------------
1 | # Terraform Automation Source
2 |
3 |
4 | ## Introduction
5 |
6 | This repository contains the Terraform modules that automate GCP resource creation. These modules create:
7 | - Cloud Composer
8 | - GCS buckets
9 | - Cloud Build trigger
10 | - Cloud Source Repositories
11 | - Service account for Cloud Composer
12 | - Pub/Sub resources
13 | - Network resources such as a VPC and subnet
14 | - Enablement of the required Google APIs
15 |
16 | ## Prerequisites
17 |
18 | 1. You must have an existing GCP project, with `project owner` permissions on it.
19 | 2. We recommend using [Cloud Shell](https://cloud.google.com/shell). If you use another shell, ensure that Terraform and the gcloud CLI are installed on your machine; see the check below.
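
If you are not using Cloud Shell, you can quickly confirm that both tools are available:

```sh
terraform -version
gcloud --version
```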
20 |
21 | ## Execution
22 |
23 | 1. Clone this repository and `cd ci-cd-for-data-processing-workflow/example/1.terraform-automation`.
24 |
25 |    `e.g.: git clone https://github.com/GoogleCloudPlatform/ci-cd-for-data-processing-workflow.git && cd ci-cd-for-data-processing-workflow/example/1.terraform-automation`
26 | 2. Authenticate with `gcloud auth application-default login`.
27 | 3. Update `terraform.tfvars` with your values. `project_id` is the only mandatory variable; all others are optional.
28 | 4. Run `terraform init`.
29 | 5. Run `terraform plan` and validate the resources shown in the output.
30 | 6. Run `terraform apply` and confirm with `yes` when asked to create the resources in your Google project.
31 | 7. Once these steps complete, the GCP resources listed in the `Introduction` section exist. You should also see the two source code repositories created in your project, one for the `terraform-automation-source` and another for the `data-pipeline-source`.
32 | 8. Push the three folders (build-pipeline, data-processing-code, workflow-dag) inside the `source-code` folder ([link](https://github.com/GoogleCloudPlatform/ci-cd-for-data-processing-workflow/tree/master/source-code)) to the Cloud Source Repository named `data-pipeline-source`; see the sketch after this list.
33 | 9. The code push fires the Cloud Build trigger, which deploys the jobs to the Cloud Composer environment created by Terraform.
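
For reference, steps 2-9 might look like the following end to end. This is a sketch: the local paths and `YOUR_PROJECT_ID` are placeholders, and the repository name assumes the default variable values.

```sh
gcloud auth application-default login
terraform init
terraform plan
terraform apply
# Push the pipeline source into the repository created by Terraform:
gcloud source repos clone data-pipeline-source --project=YOUR_PROJECT_ID
cp -r ci-cd-for-data-processing-workflow/source-code/{build-pipeline,data-processing-code,workflow-dag} data-pipeline-source/
cd data-pipeline-source
git add . && git commit -m "Add data pipeline source" && git push origin master
```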
34 |
35 |
36 | ## Disclaimer
37 |
38 | Copyright 2022 Google. This software is provided as-is, without warranty or representation for any use or purpose.
39 | Your use of it is subject to your agreement with Google.
40 |
41 |
42 | ## Requirements
43 |
44 | No requirements.
45 |
46 | ## Providers
47 |
48 | | Name | Version |
49 | |------|---------|
50 | | [google](#provider\_google) | 4.44.1 |
51 |
52 | ## Modules
53 |
54 | | Name | Source | Version |
55 | |------|--------|---------|
56 | | [composer](#module\_composer) | terraform-google-modules/composer/google//modules/create_environment_v2 | n/a |
57 | | [composer-service-accounts](#module\_composer-service-accounts) | github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/iam-service-account | v18.0.0 |
58 | | [gcs\_buckets\_prod](#module\_gcs\_buckets\_prod) | terraform-google-modules/cloud-storage/google | n/a |
59 | | [gcs\_buckets\_test](#module\_gcs\_buckets\_test) | terraform-google-modules/cloud-storage/google | n/a |
60 | | [pubsub](#module\_pubsub) | terraform-google-modules/pubsub/google | ~> 1.8 |
61 | | [vpc](#module\_vpc) | github.com/GoogleCloudPlatform/cloud-foundation-fabric//modules/net-vpc | v18.0.0 |
62 |
63 | ## Resources
64 |
65 | | Name | Type |
66 | |------|------|
67 | | [google_cloudbuild_trigger.trigger-build-in-prod-environment](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource |
68 | | [google_cloudbuild_trigger.trigger-build-in-test-environment](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource |
69 | | [google_project_iam_member.cloudbuild_sa](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource |
70 | | [google_project_service.project](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_service) | resource |
71 | | [google_sourcerepo_repository.my-repo](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/sourcerepo_repository) | resource |
72 | | [google_sourcerepo_repository.tf-source-repo](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/sourcerepo_repository) | resource |
73 | | [google_project.project](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/project) | data source |
74 |
75 | ## Inputs
76 |
77 | | Name | Description | Type | Default | Required |
78 | |------|-------------|------|---------|:--------:|
79 | | [composer\_dag\_name\_prod](#input\_composer\_dag\_name\_prod) | The Composer DAG name (for prod) to be passed as an environment variable. | `string` | `"prod_word_count"` | no |
80 | | [composer\_dag\_name\_test](#input\_composer\_dag\_name\_test) | The Composer DAG name (for test) to be passed as an environment variable. | `string` | `"test_word_count"` | no |
81 | | [composer\_env\_name](#input\_composer\_env\_name) | Name of the Cloud Composer environment. | `string` | `"composer-dev-env"` | no |
82 | | [composer\_zone\_id](#input\_composer\_zone\_id) | Zone that is passed to the Airflow environment. | `string` | `"us-central1-a"` | no |
83 | | [datapipeline\_csr\_name](#input\_datapipeline\_csr\_name) | The CSR repo name to be used for storing the data pipeline source code. | `string` | `"data-pipeline-source"` | no |
84 | | [image\_version](#input\_image\_version) | The version of Airflow running in the Cloud Composer environment. | `string` | `"composer-2.0.32-airflow-2.3.4"` | no |
85 | | [network](#input\_network) | The VPC network to host the composer cluster. | `string` | `"default"` | no |
86 | | [project\_id](#input\_project\_id) | Project ID where Cloud Composer Environment is created. | `string` | n/a | yes |
87 | | [pubsub\_topic](#input\_pubsub\_topic) | Name of the pub sub topic. | `string` | `"integration-test-complete-topic"` | no |
88 | | [region](#input\_region) | Region where the Cloud Composer Environment is created. | `string` | `"us-central1"` | no |
89 | | [subnetwork](#input\_subnetwork) | The subnetwork to host the composer cluster. | `string` | `"default"` | no |
90 | | [terraform\_deployment\_csr\_name](#input\_terraform\_deployment\_csr\_name) | The CSR repo name to be used for storing the terraform code. | `string` | `"terraform-automation-source"` | no |
91 |
92 | ## Outputs
93 |
94 | | Name | Description |
95 | |------|-------------|
96 | | [composer](#output\_composer) | Information about the cloud composer resource which is created |
97 |
98 |
--------------------------------------------------------------------------------
/source-code/data-processing-code/src/main/java/org/apache/beam/examples/WordCount.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Google Inc.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one
5 | * or more contributor license agreements. See the NOTICE file
6 | * distributed with this work for additional information
7 | * regarding copyright ownership. The ASF licenses this file
8 | * to you under the Apache License, Version 2.0 (the
9 | * "License"); you may not use this file except in compliance
10 | * with the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | */
20 | package org.apache.beam.examples;
21 |
22 | import org.apache.beam.sdk.Pipeline;
23 | import org.apache.beam.sdk.io.TextIO;
24 | import org.apache.beam.sdk.metrics.Counter;
25 | import org.apache.beam.sdk.metrics.Distribution;
26 | import org.apache.beam.sdk.metrics.Metrics;
27 | import org.apache.beam.sdk.options.Default;
28 | import org.apache.beam.sdk.options.Description;
29 | import org.apache.beam.sdk.options.PipelineOptions;
30 | import org.apache.beam.sdk.options.PipelineOptionsFactory;
31 | import org.apache.beam.sdk.options.Validation.Required;
32 | import org.apache.beam.sdk.transforms.Count;
33 | import org.apache.beam.sdk.transforms.DoFn;
34 | import org.apache.beam.sdk.transforms.MapElements;
35 | import org.apache.beam.sdk.transforms.PTransform;
36 | import org.apache.beam.sdk.transforms.ParDo;
37 | import org.apache.beam.sdk.transforms.SimpleFunction;
38 | import org.apache.beam.sdk.values.KV;
39 | import org.apache.beam.sdk.values.PCollection;
40 |
41 | /**
42 | * An example that counts words in Shakespeare and includes Beam best practices.
43 | *
44 |  * <p>This class, {@link WordCount}, is the second in a series of four successively more detailed
45 | * 'word count' examples. You may first want to take a look at {@link MinimalWordCount}. After
46 |  * you've looked at this example, see the {@link DebuggingWordCount} pipeline for an introduction
47 |  * of additional concepts.
48 | *
49 |  *
53 |  * <p>Basic concepts, also in the MinimalWordCount example: Reading text files; counting a
54 | * PCollection; writing to text files
55 | *
56 |  * <p>New Concepts:
57 | *
58 |  * <pre>
59 | * 1. Executing a Pipeline both locally and using the selected runner
60 | * 2. Using ParDo with static DoFns defined out-of-line
61 | * 3. Building a composite transform
62 | * 4. Defining your own pipeline options
63 |  * </pre>
64 | *
65 |  * <p>Concept #1: You can execute this pipeline either locally or by selecting another runner.
66 | * These are now command-line options and not hard-coded as they were in the MinimalWordCount
67 | * example.
68 | *
69 |  *
70 |  * <p>To change the runner, specify:
71 |  *
72 |  * <pre>{@code
73 |  * --runner=YOUR_SELECTED_RUNNER
74 |  * }</pre>
75 |  * <p>To execute this pipeline, specify a local output file (if using the {@code DirectRunner}) or
76 |  * output prefix on a supported distributed file system.
77 |  *
78 |  * <pre>{@code
79 |  * --output=[YOUR_LOCAL_FILE] | [YOUR_OUTPUT_PREFIX]
80 |  * }</pre>
81 |  *
82 |  * <p>The input file defaults to a public data set containing the text of King Lear, by William
83 |  * Shakespeare. You can override it and choose your own input with {@code --inputFile}.
84 | */
85 | public class WordCount {
86 |
87 | /**
88 | * Concept #2: You can make your pipeline assembly code less verbose by defining your DoFns
89 | * statically out-of-line. This DoFn tokenizes lines of text into individual words; we pass it to
90 | * a ParDo in the pipeline.
91 | */
92 |   static class ExtractWordsFn extends DoFn<String, String> {
93 | private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, "emptyLines");
94 | private final Distribution lineLenDist =
95 | Metrics.distribution(ExtractWordsFn.class, "lineLenDistro");
96 | private static final String TOKENIZER_PATTERN = "[^\\p{L}]+";
97 |
98 | @ProcessElement
99 |     public void processElement(@Element String element, OutputReceiver<String> receiver) {
100 | lineLenDist.update(element.length());
101 | if (element.trim().isEmpty()) {
102 | emptyLines.inc();
103 | }
104 |
105 | // Split the line into words.
106 | String[] words = element.split(TOKENIZER_PATTERN, -1);
107 |
108 | // Output each word encountered into the output PCollection.
109 | for (String word: words) {
110 | if (!word.isEmpty()) {
111 | receiver.output(word);
112 | }
113 | }
114 | }
115 | }
116 |
117 | /** A SimpleFunction that converts a Word and Count into a printable string. */
118 |   public static class FormatAsTextFn extends SimpleFunction<KV<String, Long>, String> {
119 | @Override
120 |     public String apply(KV<String, Long> input) {
121 | return input.getKey() + ": " + input.getValue();
122 | }
123 | }
124 |
125 | /**
126 | * A PTransform that converts a PCollection containing lines of text into a PCollection of
127 | * formatted word counts.
128 | *
129 |  * <p>Concept #3: This is a custom composite transform that bundles two transforms (ParDo and
130 | * Count) as a reusable PTransform subclass. Using composite transforms allows for easy reuse,
131 | * modular testing, and an improved monitoring experience.
132 | */
133 | public static class CountWords
134 |       extends PTransform<PCollection<String>, PCollection<KV<String, Long>>> {
135 | @Override
136 |     public PCollection<KV<String, Long>> expand(PCollection<String> lines) {
137 |
138 | // Convert lines of text into individual words.
139 |       PCollection<String> words = lines.apply(ParDo.of(new ExtractWordsFn()));
140 |
141 | // Count the number of times each word occurs.
142 |       PCollection<KV<String, Long>> wordCounts = words.apply(Count.perElement());
143 |
144 | return wordCounts;
145 | }
146 | }
147 |
148 | /**
149 | * Options supported by {@link WordCount}.
150 | *
151 |  * <p>Concept #4: Defining your own configuration options. Here, you can add your own arguments to
152 | * be processed by the command-line parser, and specify default values for them. You can then
153 | * access the options values in your pipeline code.
154 | *
155 |  * <p>Inherits standard configuration options.
156 | */
157 | public interface WordCountOptions extends PipelineOptions {
158 |
159 | /**
160 | * By default, this example reads from a public dataset containing the text of King Lear. Set
161 | * this option to choose a different input file or glob.
162 | */
163 | @Description("Path of the file to read from")
164 | @Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt")
165 | String getInputFile();
166 |
167 | void setInputFile(String value);
168 |
169 | /** Set this required option to specify where to write the output. */
170 | @Description("Path of the file to write to")
171 | @Required
172 | String getOutput();
173 |
174 | void setOutput(String value);
175 | }
176 |
177 | static void runWordCount(WordCountOptions options) {
178 | Pipeline p = Pipeline.create(options);
179 |
180 | // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
181 | // static FormatAsTextFn() to the ParDo transform.
182 | p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
183 | .apply(new CountWords())
184 | .apply(MapElements.via(new FormatAsTextFn()))
185 | .apply("WriteCounts", TextIO.write().to(options.getOutput()).withNumShards(3));
186 |
187 | p.run().waitUntilFinish();
188 | }
189 |
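  // Example invocation (a sketch; the project, region, and bucket are
  // placeholders, and the dataflow-runner profile comes from this
  // project's pom.xml):
  //
  //   mvn compile exec:java -Pdataflow-runner \
  //     -Dexec.mainClass=org.apache.beam.examples.WordCount \
  //     -Dexec.args="--runner=DataflowRunner --project=YOUR_PROJECT \
  //       --region=us-central1 --output=gs://YOUR_BUCKET/counts"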
190 | public static void main(String[] args) {
191 | WordCountOptions options =
192 | PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class);
193 | runWordCount(options);
194 | }
195 | }
196 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
--------------------------------------------------------------------------------
/source-code/data-processing-code/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!--
3 |     Licensed to the Apache Software Foundation (ASF) under one or more
4 |     contributor license agreements. See the NOTICE file distributed with
5 |     this work for additional information regarding copyright ownership.
6 |     The ASF licenses this file to You under the Apache License, Version 2.0
7 |     (the "License"); you may not use this file except in compliance with
8 |     the License. You may obtain a copy of the License at
9 |
10 |        http://www.apache.org/licenses/LICENSE-2.0
11 |
12 |     Unless required by applicable law or agreed to in writing, software
13 |     distributed under the License is distributed on an "AS IS" BASIS,
14 |     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |     See the License for the specific language governing permissions and
16 |     limitations under the License.
17 | -->
18 | <project xmlns="http://maven.apache.org/POM/4.0.0"
19 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
20 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
21 |                              http://maven.apache.org/xsd/maven-4.0.0.xsd">
22 |   <modelVersion>4.0.0</modelVersion>
23 |   <groupId>org.example</groupId>
24 |   <artifactId>word-count-beam</artifactId>
25 |   <version>0.1</version>
26 |   <packaging>jar</packaging>
27 |   <properties>
28 |     <beam.version>2.19.0</beam.version>
29 |     <google-clients.version>1.28.0</google-clients.version>
30 |     <guava.version>[30.0-jre,)</guava.version>
31 |     <hamcrest.version>2.1</hamcrest.version>
32 |     <jackson.version>[2.9.10.8,)</jackson.version>
33 |     <joda.version>2.10.3</joda.version>
34 |     <junit.version>[4.13.1,)</junit.version>
35 |     <maven-compiler-plugin.version>3.7.0</maven-compiler-plugin.version>
36 |     <maven-exec-plugin.version>1.6.0</maven-exec-plugin.version>
37 |     <maven-jar-plugin.version>3.0.2</maven-jar-plugin.version>
38 |     <maven-shade-plugin.version>3.1.0</maven-shade-plugin.version>
39 |     <mockito.version>3.0.0</mockito.version>
40 |     <slf4j.version>1.7.25</slf4j.version>
41 |     <maven-surefire-plugin.version>2.21.0</maven-surefire-plugin.version>
42 |   </properties>
43 |   <repositories>
44 |     <repository>
45 |       <id>apache.snapshots</id>
46 |       <name>Apache Development Snapshot Repository</name>
47 |       <url>https://repository.apache.org/content/repositories/snapshots/</url>
48 |       <releases>
49 |         <enabled>false</enabled>
50 |       </releases>
51 |       <snapshots>
52 |         <enabled>true</enabled>
53 |       </snapshots>
54 |     </repository>
55 |   </repositories>
56 |   <build>
57 |     <plugins>
58 |       <plugin>
59 |         <groupId>org.apache.maven.plugins</groupId>
60 |         <artifactId>maven-compiler-plugin</artifactId>
61 |         <version>${maven-compiler-plugin.version}</version>
62 |         <configuration>
63 |           <source>1.8</source>
64 |           <target>1.8</target>
65 |         </configuration>
66 |       </plugin>
67 |       <plugin>
68 |         <groupId>org.apache.maven.plugins</groupId>
69 |         <artifactId>maven-surefire-plugin</artifactId>
70 |         <version>${maven-surefire-plugin.version}</version>
71 |         <configuration>
72 |           <parallel>all</parallel>
73 |           <threadCount>4</threadCount>
74 |           <redirectTestOutputToFile>true</redirectTestOutputToFile>
75 |         </configuration>
76 |         <dependencies>
77 |           <dependency>
78 |             <groupId>org.apache.maven.surefire</groupId>
79 |             <artifactId>surefire-junit47</artifactId>
80 |             <version>${maven-surefire-plugin.version}</version>
81 |           </dependency>
82 |         </dependencies>
83 |       </plugin>
84 |       <plugin>
85 |         <groupId>org.apache.maven.plugins</groupId>
86 |         <artifactId>maven-jar-plugin</artifactId>
87 |         <version>${maven-jar-plugin.version}</version>
88 |         <configuration>
89 |           <archive>
90 |             <manifest>
91 |               <addClasspath>true</addClasspath>
92 |               <classpathPrefix>lib/</classpathPrefix>
93 |               <mainClass>org.apache.beam.examples.WordCount</mainClass>
94 |             </manifest>
95 |           </archive>
96 |         </configuration>
97 |       </plugin>
98 |       <plugin>
99 |         <groupId>org.apache.maven.plugins</groupId>
100 |         <artifactId>maven-shade-plugin</artifactId>
101 |         <version>${maven-shade-plugin.version}</version>
102 |         <executions>
103 |           <execution>
104 |             <phase>package</phase>
105 |             <goals>
106 |               <goal>shade</goal>
107 |             </goals>
108 |             <configuration>
109 |               <finalName>${project.artifactId}-bundled-${project.version}</finalName>
110 |               <filters>
111 |                 <filter>
112 |                   <artifact>*:*</artifact>
113 |                   <excludes>
114 |                     <exclude>META-INF/LICENSE</exclude>
115 |                     <exclude>META-INF/*.SF</exclude>
116 |                     <exclude>META-INF/*.DSA</exclude>
117 |                     <exclude>META-INF/*.RSA</exclude>
118 |                   </excludes>
119 |                 </filter>
120 |               </filters>
121 |             </configuration>
122 |           </execution>
123 |         </executions>
124 |       </plugin>
125 |     </plugins>
126 |     <pluginManagement>
127 |       <plugins>
128 |         <plugin>
129 |           <groupId>org.codehaus.mojo</groupId>
130 |           <artifactId>exec-maven-plugin</artifactId>
131 |           <version>${maven-exec-plugin.version}</version>
132 |           <configuration>
133 |             <cleanupDaemonThreads>false</cleanupDaemonThreads>
134 |           </configuration>
135 |         </plugin>
136 |       </plugins>
137 |     </pluginManagement>
138 |   </build>
139 |   <profiles>
140 |     <profile>
141 |       <id>direct-runner</id>
142 |       <activation>
143 |         <activeByDefault>true</activeByDefault>
144 |       </activation>
145 |       <dependencies>
146 |         <dependency>
147 |           <groupId>org.apache.beam</groupId>
148 |           <artifactId>beam-runners-direct-java</artifactId>
149 |           <version>${beam.version}</version>
150 |           <scope>runtime</scope>
151 |         </dependency>
152 |       </dependencies>
153 |     </profile>
154 |     <profile>
155 |       <id>dataflow-runner</id>
156 |       <dependencies>
157 |         <dependency>
158 |           <groupId>org.apache.beam</groupId>
159 |           <artifactId>beam-runners-google-cloud-dataflow-java</artifactId>
160 |           <version>${beam.version}</version>
161 |           <scope>runtime</scope>
162 |         </dependency>
163 |       </dependencies>
164 |     </profile>
165 |   </profiles>
166 |   <dependencies>
167 |     <dependency>
168 |       <groupId>org.apache.beam</groupId>
169 |       <artifactId>beam-sdks-java-core</artifactId>
170 |       <version>${beam.version}</version>
171 |     </dependency>
172 |     <dependency>
173 |       <groupId>org.apache.beam</groupId>
174 |       <artifactId>beam-sdks-java-io-google-cloud-platform</artifactId>
175 |       <version>${beam.version}</version>
176 |     </dependency>
177 |     <dependency>
178 |       <groupId>com.google.api-client</groupId>
179 |       <artifactId>google-api-client</artifactId>
180 |       <version>${google-clients.version}</version>
181 |       <exclusions>
182 |         <exclusion>
183 |           <groupId>com.google.guava</groupId>
184 |           <artifactId>guava-jdk5</artifactId>
185 |         </exclusion>
186 |       </exclusions>
187 |     </dependency>
188 |     <dependency>
189 |       <groupId>com.google.http-client</groupId>
190 |       <artifactId>google-http-client</artifactId>
191 |       <version>${google-clients.version}</version>
192 |       <exclusions>
193 |         <exclusion>
194 |           <groupId>com.google.guava</groupId>
195 |           <artifactId>guava-jdk5</artifactId>
196 |         </exclusion>
197 |       </exclusions>
198 |     </dependency>
199 |     <dependency>
200 |       <groupId>joda-time</groupId>
201 |       <artifactId>joda-time</artifactId>
202 |       <version>${joda.version}</version>
203 |     </dependency>
204 |     <dependency>
205 |       <groupId>com.google.guava</groupId>
206 |       <artifactId>guava</artifactId>
207 |       <version>${guava.version}</version>
208 |     </dependency>
209 |     <dependency>
210 |       <groupId>org.slf4j</groupId>
211 |       <artifactId>slf4j-api</artifactId>
212 |       <version>${slf4j.version}</version>
213 |     </dependency>
214 |     <dependency>
215 |       <groupId>org.slf4j</groupId>
216 |       <artifactId>slf4j-jdk14</artifactId>
217 |       <version>${slf4j.version}</version>
218 |       <scope>runtime</scope>
219 |     </dependency>
220 |     <dependency>
221 |       <groupId>org.apache.beam</groupId>
222 |       <artifactId>beam-runners-direct-java</artifactId>
223 |       <version>${beam.version}</version>
224 |       <scope>test</scope>
225 |     </dependency>
226 |     <dependency>
227 |       <groupId>org.hamcrest</groupId>
228 |       <artifactId>hamcrest-core</artifactId>
229 |       <version>${hamcrest.version}</version>
230 |       <scope>test</scope>
231 |     </dependency>
232 |     <dependency>
233 |       <groupId>org.mockito</groupId>
234 |       <artifactId>mockito-core</artifactId>
235 |       <version>${mockito.version}</version>
236 |       <scope>test</scope>
237 |     </dependency>
238 |     <dependency>
239 |       <groupId>junit</groupId>
240 |       <artifactId>junit</artifactId>
241 |       <version>${junit.version}</version>
242 |     </dependency>
243 |     <dependency>
244 |       <groupId>org.apache.beam</groupId>
245 |       <artifactId>beam-runners-google-cloud-dataflow-java</artifactId>
246 |       <version>${beam.version}</version>
247 |       <scope>runtime</scope>
248 |     </dependency>
249 |   </dependencies>
250 | </project>
--------------------------------------------------------------------------------