├── helpers ├── bq-list-locations │ ├── requirements.txt │ └── main.py └── bq-remote-functions │ └── get-policy-tags │ └── requirements.txt ├── diagrams ├── summary.png ├── security model example.jpg ├── standard mode-design.jpg ├── discovery-service-design.jpg ├── discovery-service-usage-cron.jpg ├── standard mode-usage-immediate scan.jpg ├── standard mode-usage-inventory scan.jpg ├── standard mode-usage-event based scan.jpg └── discovery-service-usage-tagger-notification.jpg ├── docs ├── common-iam-example.md ├── common-quotas.md ├── common-terraform-3-apply.md ├── common-limits.md ├── common-terraform-1-prepare.md └── release-notes │ └── v2.0.0.md ├── terraform ├── modules │ ├── gcs │ │ ├── output.tf │ │ ├── variables.tf │ │ ├── providers.tf │ │ └── main.tf │ ├── cloud-run │ │ ├── output.tf │ │ ├── providers.tf │ │ ├── variables.tf │ │ └── main.tf │ ├── cloud-logging │ │ ├── output.tf │ │ ├── variables.tf │ │ ├── main.tf │ │ └── providers.tf │ ├── pubsub │ │ ├── output.tf │ │ ├── providers.tf │ │ ├── variables.tf │ │ └── main.tf │ ├── bq-remote-function │ │ ├── procedures │ │ │ └── deploy_get_policy_tags_remote_func.tpl │ │ ├── providers.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── dlp │ │ ├── output.tf │ │ ├── providers.tf │ │ └── variables.tf │ ├── bigquery │ │ ├── views │ │ │ ├── v_tagging_actions.tpl │ │ │ ├── v_steps.tpl │ │ │ ├── v_tracking_id_to_table_map.tpl │ │ │ ├── v_errors_retryable.tpl │ │ │ ├── v_errors_non_retryable.tpl │ │ │ ├── v_service_calls.tpl │ │ │ ├── v_log_label_history.tpl │ │ │ ├── v_log_tag_history.tpl │ │ │ ├── v_run_summary.tpl │ │ │ ├── v_run_summary_counts.tpl │ │ │ └── v_broken_steps.tpl │ │ ├── providers.tf │ │ ├── output.tf │ │ └── variables.tf │ ├── iam │ │ ├── providers.tf │ │ ├── variables.tf │ │ └── output.tf │ ├── data-catalog │ │ ├── providers.tf │ │ ├── output.tf │ │ ├── variables.tf │ │ └── main.tf │ ├── cloud-scheduler │ │ ├── providers.tf │ │ ├── variables.tf │ │ └── main.tf │ └── 
data_project_permissions_in_standard_mode │ │ ├── providers.tf │ │ ├── variables.tf │ │ └── main.tf ├── outputs.tf ├── stacks │ ├── inspection │ │ ├── output.tf │ │ ├── providers.tf │ │ ├── iam.tf │ │ └── variables.tf │ └── common │ │ ├── providers.tf │ │ └── output.tf └── terraform.tf ├── services ├── library │ └── src │ │ ├── main │ │ ├── java │ │ │ └── com │ │ │ │ └── google │ │ │ │ └── cloud │ │ │ │ └── pso │ │ │ │ └── bq_pii_classifier │ │ │ │ ├── entities │ │ │ │ ├── JsonMessage.java │ │ │ │ ├── SolutionMode.java │ │ │ │ ├── DispatcherType.java │ │ │ │ ├── dlp │ │ │ │ │ ├── DataProfileActionOrBuilder.java │ │ │ │ │ ├── TableDataProfileOrBuilder.java │ │ │ │ │ ├── DataProfilePubSubMessageOrBuilder.java │ │ │ │ │ └── dlp.proto │ │ │ │ ├── ResourceLabel.java │ │ │ │ ├── FunctionLifeCycleEvent.java │ │ │ │ ├── TableScanLimitsType.java │ │ │ │ ├── TablePolicyTags.java │ │ │ │ ├── NonRetryableApplicationException.java │ │ │ │ ├── PolicyTagInfo.java │ │ │ │ ├── InfoTypeInfo.java │ │ │ │ ├── ApplicationLog.java │ │ │ │ ├── Operation.java │ │ │ │ ├── TagHistoryLogEntry.java │ │ │ │ ├── TableScanLimitsConfig.java │ │ │ │ ├── PubSubEvent.java │ │ │ │ └── TableSpec.java │ │ │ │ ├── services │ │ │ │ ├── set │ │ │ │ │ ├── PersistentSet.java │ │ │ │ │ └── GCSPersistentSetImpl.java │ │ │ │ ├── findings │ │ │ │ │ ├── FindingsReaderType.java │ │ │ │ │ ├── FindingsReader.java │ │ │ │ │ └── FindingsReaderFactory.java │ │ │ │ ├── pubsub │ │ │ │ │ ├── PubSubService.java │ │ │ │ │ ├── SuccessPubSubMessage.java │ │ │ │ │ ├── FailedPubSubMessage.java │ │ │ │ │ ├── PubSubPublishResults.java │ │ │ │ │ └── PubSubServiceImpl.java │ │ │ │ ├── dlp │ │ │ │ │ ├── DlpService.java │ │ │ │ │ └── DlpServiceImpl.java │ │ │ │ ├── scan │ │ │ │ │ ├── Scanner.java │ │ │ │ │ └── BigQueryScannerImpl.java │ │ │ │ └── bq │ │ │ │ │ └── BigQueryService.java │ │ │ │ ├── functions │ │ │ │ ├── tagger │ │ │ │ │ ├── TaggerDlpJobRequest.java │ │ │ │ │ ├── TaggerTableSpecRequest.java │ │ │ │ │ └── 
ColumnTaggingAction.java │ │ │ │ ├── inspector │ │ │ │ │ ├── InspectorRequest.java │ │ │ │ │ └── InspectorConfig.java │ │ │ │ └── dispatcher │ │ │ │ │ ├── BigQueryScope.java │ │ │ │ │ └── DispatcherConfig.java │ │ │ │ └── helpers │ │ │ │ ├── ThrowableInfo.java │ │ │ │ └── TrackingHelper.java │ │ └── resources │ │ │ ├── logback.xml │ │ │ └── sql │ │ │ └── v_dlp_fields_findings_auto_dlp.tpl │ │ └── test │ │ └── java │ │ └── com │ │ └── google │ │ └── cloud │ │ └── pso │ │ └── bq_pii_classifier │ │ └── functions │ │ ├── helpers │ │ ├── TrackingHelperTest.java │ │ └── UtilsTest.java │ │ ├── entities │ │ ├── TableSpecTest.java │ │ └── InfoTypeInfoTest.java │ │ └── dispatcher │ │ └── BigQueryScopeTest.java ├── cloudbuild_deploy_common_services.yaml ├── cloudbuild_deploy_all_services.yaml ├── dispatcher-inspection-app │ ├── src │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── google │ │ │ └── cloud │ │ │ └── pso │ │ │ └── bq_pii_classifier │ │ │ └── dispatcher │ │ │ └── Environment.java │ └── pom.xml ├── tagger-app │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── google │ │ └── cloud │ │ └── pso │ │ └── bq_pii_classifier │ │ └── tagger │ │ └── Environment.java ├── inspector-app │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── google │ │ └── cloud │ │ └── pso │ │ └── bq_pii_classifier │ │ └── inspector │ │ └── Environment.java ├── dispatcher-tagging-app │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── google │ │ └── cloud │ │ └── pso │ │ └── bq_pii_classifier │ │ └── dispatcher │ │ └── Environment.java └── pom.xml ├── scripts ├── deploy_all.sh ├── deploy_all_services.sh ├── deploy_all_cloudbuild.sh ├── schedulers_action.sh ├── cancel_running_bq_jobs.py ├── deploy_common_services.sh ├── deploy_inspection_services.sh ├── prepare_auto_dlp_results_dataset.sh ├── prepare_stress_test.sh ├── deploy_common_services_cloudbuild.sh ├── prepare_terraform_service_account_on_data_projects.sh ├── deploy_terraform.sh ├── 
deploy_all_services_cloudbuild.sh ├── prepare_host_project_for_auto_dlp_apis.sh ├── prepare_terraform_service_account_on_host_project.sh ├── prepare_data_projects_for_auto_dlp_mode.sh ├── prepare_end_user_permissions.sh └── prepare_data_projects_for_standard_mode.sh ├── .github ├── dependabot.yaml └── workflows │ └── main.yml ├── CONTRIBUTING.md └── .gitignore /helpers/bq-list-locations/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery -------------------------------------------------------------------------------- /diagrams/summary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/bq-pii-classifier/HEAD/diagrams/summary.png -------------------------------------------------------------------------------- /docs/common-iam-example.md: -------------------------------------------------------------------------------- 1 | ## Data Access Model Example 2 | 3 | ![alt text](../diagrams/security%20model%20example.jpg) -------------------------------------------------------------------------------- /terraform/modules/gcs/output.tf: -------------------------------------------------------------------------------- 1 | output "create_gcs_flags_bucket_name" { 2 | value = google_storage_bucket.gcs_flags_bucket.name 3 | } -------------------------------------------------------------------------------- /diagrams/security model example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/bq-pii-classifier/HEAD/diagrams/security model example.jpg -------------------------------------------------------------------------------- /diagrams/standard mode-design.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/bq-pii-classifier/HEAD/diagrams/standard 
mode-design.jpg -------------------------------------------------------------------------------- /terraform/modules/cloud-run/output.tf: -------------------------------------------------------------------------------- 1 | output "service_endpoint" { 2 | value = google_cloud_run_service.service.status[0].url 3 | } 4 | 5 | -------------------------------------------------------------------------------- /diagrams/discovery-service-design.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/bq-pii-classifier/HEAD/diagrams/discovery-service-design.jpg -------------------------------------------------------------------------------- /terraform/modules/cloud-logging/output.tf: -------------------------------------------------------------------------------- 1 | output "service_account" { 2 | value = google_logging_project_sink.bigquery-logging-sink.writer_identity 3 | } -------------------------------------------------------------------------------- /diagrams/discovery-service-usage-cron.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/bq-pii-classifier/HEAD/diagrams/discovery-service-usage-cron.jpg -------------------------------------------------------------------------------- /diagrams/standard mode-usage-immediate scan.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/bq-pii-classifier/HEAD/diagrams/standard mode-usage-immediate scan.jpg -------------------------------------------------------------------------------- /diagrams/standard mode-usage-inventory scan.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/bq-pii-classifier/HEAD/diagrams/standard mode-usage-inventory scan.jpg 
-------------------------------------------------------------------------------- /terraform/outputs.tf: -------------------------------------------------------------------------------- 1 | output "bq_get_policy_tags_remote_function_deployment_status" { 2 | value = module.bq-remote-func-get-table-policy-tags.deploy_job_status 3 | } -------------------------------------------------------------------------------- /diagrams/standard mode-usage-event based scan.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/bq-pii-classifier/HEAD/diagrams/standard mode-usage-event based scan.jpg -------------------------------------------------------------------------------- /diagrams/discovery-service-usage-tagger-notification.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/bq-pii-classifier/HEAD/diagrams/discovery-service-usage-tagger-notification.jpg -------------------------------------------------------------------------------- /terraform/modules/pubsub/output.tf: -------------------------------------------------------------------------------- 1 | 2 | output "topic-id" { 3 | value = google_pubsub_topic.topic.id 4 | } 5 | 6 | output "topic-name" { 7 | value = google_pubsub_topic.topic.name 8 | } -------------------------------------------------------------------------------- /terraform/modules/cloud-logging/variables.tf: -------------------------------------------------------------------------------- 1 | variable "project" { 2 | type = string 3 | } 4 | variable "dataset" { 5 | type = string 6 | } 7 | variable "log_sink_name" { 8 | type = string 9 | } -------------------------------------------------------------------------------- /helpers/bq-remote-functions/get-policy-tags/requirements.txt: -------------------------------------------------------------------------------- 1 | functions-framework==3.* 2 
| google-cloud-bigquery 3 | google-cloud-datacatalog 4 | google-cloud-logging 5 | google-cloud-datastore 6 | pytz -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/entities/JsonMessage.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.pso.bq_pii_classifier.entities; 2 | 3 | public interface JsonMessage { 4 | 5 | String toJsonString(); 6 | } 7 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/entities/SolutionMode.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.pso.bq_pii_classifier.entities; 2 | 3 | public enum SolutionMode { 4 | STANDARD_DLP, 5 | AUTO_DLP 6 | } 7 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/entities/DispatcherType.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.pso.bq_pii_classifier.entities; 2 | 3 | public enum DispatcherType { 4 | INSPECTION, 5 | TAGGING 6 | } 7 | -------------------------------------------------------------------------------- /terraform/stacks/inspection/output.tf: -------------------------------------------------------------------------------- 1 | output "sa_inspection_dispatcher_email" { 2 | value = google_service_account.sa_inspection_dispatcher.email 3 | } 4 | 5 | output "sa_inspector_email" { 6 | value = google_service_account.sa_inspector.email 7 | } -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/services/set/PersistentSet.java: -------------------------------------------------------------------------------- 1 | 
package com.google.cloud.pso.bq_pii_classifier.services.set; 2 | 3 | public interface PersistentSet { 4 | 5 | void add(String key); 6 | boolean contains(String key); 7 | } 8 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/services/findings/FindingsReaderType.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.pso.bq_pii_classifier.services.findings; 2 | 3 | public enum FindingsReaderType { 4 | AUTO_DLP, 5 | STANDARD_DLP_WITH_MIXED_INFO_TYPES_PROMOTION, 6 | STANDARD_DLP_WITHOUT_MIXED_INFO_TYPES_PROMOTION 7 | } 8 | -------------------------------------------------------------------------------- /terraform/modules/bq-remote-function/procedures/deploy_get_policy_tags_remote_func.tpl: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE FUNCTION `${project}.${dataset}`.${function_name}(table_spec STRING) RETURNS JSON 2 | 3 | REMOTE WITH CONNECTION `${project}.${connection_region}.${connection_name}` 4 | OPTIONS ( 5 | endpoint = "${cloud_function_url}", 6 | max_batching_rows = 100 7 | ); -------------------------------------------------------------------------------- /terraform/modules/dlp/output.tf: -------------------------------------------------------------------------------- 1 | output "created_inspection_templates" { 2 | value = { 3 | region = var.region 4 | ids = google_data_loss_prevention_inspect_template.inspection_template[*].id 5 | } 6 | } 7 | 8 | output "inspection_templates" { 9 | value = google_data_loss_prevention_inspect_template.inspection_template[*] 10 | } 11 | 12 | -------------------------------------------------------------------------------- /docs/common-quotas.md: -------------------------------------------------------------------------------- 1 | ## GCP Quotas 2 | 3 | Inspector: 4 | * DLP: 600 requests per min 5 | * DLP: 1000 
running jobs 6 | 7 | Tagger: 8 | * Maximum rate of dataset metadata update operations (including patch) 9 | * 5 operations every 10 seconds per dataset 10 | 11 | Rate limiting for each service/step could be configured in the corresponding 12 | PubSub push subscription via Terraform. -------------------------------------------------------------------------------- /terraform/modules/gcs/variables.tf: -------------------------------------------------------------------------------- 1 | variable "project" { 2 | type = string 3 | } 4 | 5 | variable "region" { 6 | type = string 7 | } 8 | 9 | variable "gcs_flags_bucket_name" { 10 | type = string 11 | } 12 | 13 | variable "gcs_flags_bucket_admins" { 14 | type = list(string) 15 | } 16 | 17 | variable "terraform_data_deletion_protection" { 18 | type = bool 19 | } 20 | 21 | variable "default_labels" { 22 | type = map(string) 23 | } -------------------------------------------------------------------------------- /terraform/modules/bigquery/views/v_tagging_actions.tpl: -------------------------------------------------------------------------------- 1 | SELECT 2 | th.start_time, 3 | th.run_id, 4 | th.tracker, 5 | th.project_id, 6 | th.dataset_id, 7 | th.table_id, 8 | th.field_id, 9 | m.info_type, 10 | th.existing_policy_tag, 11 | th.new_policy_tag, 12 | th.operation, 13 | th.details 14 | FROM `${project}.${dataset}.${v_log_tag_history}` th 15 | INNER JOIN `${project}.${dataset}.${v_config_infotypes_policytags_map}` m 16 | ON th.new_policy_tag = m.policy_tag 17 | ORDER BY tracker -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/entities/dlp/DataProfileActionOrBuilder.java: -------------------------------------------------------------------------------- 1 | // Generated by the protocol buffer compiler. DO NOT EDIT! 
2 | // source: dlp.proto 3 | 4 | package com.google.cloud.pso.bq_pii_classifier.entities.dlp; 5 | 6 | public interface DataProfileActionOrBuilder extends 7 | // @@protoc_insertion_point(interface_extends:com.google.cloud.pso.bq_pii_classifier.entities.dlp.DataProfileAction) 8 | com.google.protobuf.MessageOrBuilder { 9 | } 10 | -------------------------------------------------------------------------------- /terraform/modules/bigquery/views/v_steps.tpl: -------------------------------------------------------------------------------- 1 | SELECT 2 | TIMESTAMP_MILLIS(CAST(SUBSTR(jsonPayload.global_run_id, 0, 13) AS INT64)) AS start_time, 3 | jsonPayload.global_run_id AS run_id, 4 | jsonPayload.global_tracker AS tracker, 5 | jsonPayload.global_logger_name AS function_name, 6 | jsonPayload.function_lifecycle_functionnumber AS function_number, 7 | jsonPayload.function_lifecycle_event AS step 8 | FROM `${project}.${dataset}.${logging_table}` 9 | WHERE jsonPayload.global_app_log = 'TRACKER_LOG' 10 | 11 | -------------------------------------------------------------------------------- /terraform/modules/cloud-logging/main.tf: -------------------------------------------------------------------------------- 1 | 2 | resource "google_logging_project_sink" "bigquery-logging-sink" { 3 | name = var.log_sink_name 4 | destination = "bigquery.googleapis.com/projects/${var.project}/datasets/${var.dataset}" 5 | filter = "resource.type=cloud_run_revision jsonPayload.global_app=bq-pii-classifier" 6 | # Use a unique writer (creates a unique service account used for writing) 7 | unique_writer_identity = true 8 | bigquery_options { 9 | use_partitioned_tables = true 10 | } 11 | } -------------------------------------------------------------------------------- /terraform/modules/dlp/providers.tf: -------------------------------------------------------------------------------- 1 | # adding provider block in all modules for tflint 2 | # 
https://github.com/terraform-linters/tflint-ruleset-terraform/blob/v0.2.2/docs/rules/terraform_required_providers.md 3 | terraform { 4 | required_version = ">= 1.12.1" 5 | 6 | required_providers { 7 | google = { 8 | source = "hashicorp/google" 9 | version = "= 5.20.0" 10 | } 11 | 12 | google-beta = { 13 | source = "hashicorp/google-beta" # beta provider has its own registry address 14 | version = "= 5.20.0" 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /terraform/modules/gcs/providers.tf: -------------------------------------------------------------------------------- 1 | # adding provider block in all modules for tflint 2 | # https://github.com/terraform-linters/tflint-ruleset-terraform/blob/v0.2.2/docs/rules/terraform_required_providers.md 3 | terraform { 4 | required_version = ">= 1.12.1" 5 | 6 | required_providers { 7 | google = { 8 | source = "hashicorp/google" 9 | version = "= 5.20.0" 10 | } 11 | 12 | google-beta = { 13 | source = "hashicorp/google-beta" # beta provider has its own registry address 14 | version = "= 5.20.0" 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /terraform/modules/iam/providers.tf: -------------------------------------------------------------------------------- 1 | # adding provider block in all modules for tflint 2 | # https://github.com/terraform-linters/tflint-ruleset-terraform/blob/v0.2.2/docs/rules/terraform_required_providers.md 3 | terraform { 4 | required_version = ">= 1.12.1" 5 | 6 | required_providers { 7 | google = { 8 | source = "hashicorp/google" 9 | version = "= 5.20.0" 10 | } 11 | 12 | google-beta = { 13 | source = "hashicorp/google-beta" # beta provider has its own registry address 14 | version = "= 5.20.0" 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /terraform/modules/bigquery/providers.tf: -------------------------------------------------------------------------------- 1 | # adding provider block in all modules for tflint 2 | #
https://github.com/terraform-linters/tflint-ruleset-terraform/blob/v0.2.2/docs/rules/terraform_required_providers.md 3 | terraform { 4 | required_version = ">= 1.12.1" 5 | 6 | required_providers { 7 | google = { 8 | source = "hashicorp/google" 9 | version = "= 5.20.0" 10 | } 11 | 12 | google-beta = { 13 | source = "hashicorp/google-beta" # beta provider has its own registry address 14 | version = "= 5.20.0" 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /terraform/modules/pubsub/providers.tf: -------------------------------------------------------------------------------- 1 | # adding provider block in all modules for tflint 2 | # https://github.com/terraform-linters/tflint-ruleset-terraform/blob/v0.2.2/docs/rules/terraform_required_providers.md 3 | terraform { 4 | required_version = ">= 1.12.1" 5 | 6 | required_providers { 7 | google = { 8 | source = "hashicorp/google" 9 | version = "= 5.20.0" 10 | } 11 | 12 | google-beta = { 13 | source = "hashicorp/google-beta" # beta provider has its own registry address 14 | version = "= 5.20.0" 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /terraform/stacks/common/providers.tf: -------------------------------------------------------------------------------- 1 | # adding provider block in all modules for tflint 2 | # https://github.com/terraform-linters/tflint-ruleset-terraform/blob/v0.2.2/docs/rules/terraform_required_providers.md 3 | terraform { 4 | required_version = ">= 1.12.1" 5 | 6 | required_providers { 7 | google = { 8 | source = "hashicorp/google" 9 | version = "= 5.20.0" 10 | } 11 | 12 | google-beta = { 13 | source = "hashicorp/google-beta" # beta provider has its own registry address 14 | version = "= 5.20.0" 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /terraform/modules/bigquery/views/v_tracking_id_to_table_map.tpl: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT 2 | jsonPayload.global_run_id AS run_id, 3 | jsonPayload.dispatched_tracking_id AS
tracking_id, 4 | jsonPayload.dispatched_tablespec AS tablespec, 5 | jsonPayload.dispatched_tablespec_project AS project_id, 6 | jsonPayload.dispatched_tablespec_dataset AS dataset_id, 7 | jsonPayload.dispatched_tablespec_table AS table_id 8 | FROM 9 | `${project}.${dataset}.${logging_table}` 10 | WHERE 11 | jsonPayload.global_app_log = 'DISPATCHED_REQUESTS_LOG' -------------------------------------------------------------------------------- /terraform/modules/cloud-logging/providers.tf: -------------------------------------------------------------------------------- 1 | # adding provider block in all modules for tflint 2 | # https://github.com/terraform-linters/tflint-ruleset-terraform/blob/v0.2.2/docs/rules/terraform_required_providers.md 3 | terraform { 4 | required_version = ">= 1.12.1" 5 | 6 | required_providers { 7 | google = { 8 | source = "hashicorp/google" 9 | version = "= 5.20.0" 10 | } 11 | 12 | google-beta = { 13 | source = "hashicorp/google-beta" # beta provider has its own registry address 14 | version = "= 5.20.0" 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /terraform/modules/cloud-run/providers.tf: -------------------------------------------------------------------------------- 1 | # adding provider block in all modules for tflint 2 | # https://github.com/terraform-linters/tflint-ruleset-terraform/blob/v0.2.2/docs/rules/terraform_required_providers.md 3 | terraform { 4 | required_version = ">= 1.12.1" 5 | 6 | required_providers { 7 | google = { 8 | source = "hashicorp/google" 9 | version = "= 5.20.0" 10 | } 11 | 12 | google-beta = { 13 | source = "hashicorp/google-beta" # beta provider has its own registry address 14 | version = "= 5.20.0" 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /terraform/modules/data-catalog/providers.tf: -------------------------------------------------------------------------------- 1 | # adding provider block in all modules for tflint 2 | #
https://github.com/terraform-linters/tflint-ruleset-terraform/blob/v0.2.2/docs/rules/terraform_required_providers.md 3 | terraform { 4 | required_version = ">= 1.12.1" 5 | 6 | required_providers { 7 | google = { 8 | source = "hashicorp/google" 9 | version = "= 5.20.0" 10 | } 11 | 12 | google-beta = { 13 | source = "hashicorp/google-beta" # beta provider has its own registry address 14 | version = "= 5.20.0" 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /terraform/stacks/inspection/providers.tf: -------------------------------------------------------------------------------- 1 | # adding provider block in all modules for tflint 2 | # https://github.com/terraform-linters/tflint-ruleset-terraform/blob/v0.2.2/docs/rules/terraform_required_providers.md 3 | terraform { 4 | required_version = ">= 1.12.1" 5 | 6 | required_providers { 7 | google = { 8 | source = "hashicorp/google" 9 | version = "= 5.20.0" 10 | } 11 | 12 | google-beta = { 13 | source = "hashicorp/google-beta" # beta provider has its own registry address 14 | version = "= 5.20.0" 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /terraform/modules/cloud-scheduler/providers.tf: -------------------------------------------------------------------------------- 1 | # adding provider block in all modules for tflint 2 | # https://github.com/terraform-linters/tflint-ruleset-terraform/blob/v0.2.2/docs/rules/terraform_required_providers.md 3 | terraform { 4 | required_version = ">= 1.12.1" 5 | 6 | required_providers { 7 | google = { 8 | source = "hashicorp/google" 9 | version = "= 5.20.0" 10 | } 11 | 12 | google-beta = { 13 | source = "hashicorp/google-beta" # beta provider has its own registry address 14 | version = "= 5.20.0" 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /terraform/modules/bigquery/views/v_errors_retryable.tpl: -------------------------------------------------------------------------------- 1 | WITH retryable AS 2 | ( 3 | SELECT DISTINCT 4 | jsonPayload.global_run_id AS run_id, 5 |
jsonPayload.retryable_ex_tracking_id AS tracking_id, 6 | resource.labels.service_name AS service_name, 7 | jsonPayload.retryable_ex_name AS exception_name, 8 | jsonPayload.retryable_ex_msg AS exception_message, 9 | FROM 10 | `${project}.${dataset}.${logging_table}` 11 | WHERE 12 | jsonPayload.global_app_log = 'RETRYABLE_EXCEPTIONS_LOG' 13 | ORDER BY 2 14 | ) 15 | 16 | SELECT * FROM retryable -------------------------------------------------------------------------------- /terraform/modules/data_project_permissions_in_standard_mode/providers.tf: -------------------------------------------------------------------------------- 1 | # adding provider block in all modules for tflint 2 | # https://github.com/terraform-linters/tflint-ruleset-terraform/blob/v0.2.2/docs/rules/terraform_required_providers.md 3 | terraform { 4 | required_version = ">= 1.12.1" 5 | 6 | required_providers { 7 | google = { 8 | source = "hashicorp/google" 9 | version = "= 5.20.0" 10 | } 11 | 12 | google-beta = { 13 | source = "hashicorp/google-beta" # beta provider has its own registry address 14 | version = "= 5.20.0" 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /terraform/modules/bigquery/views/v_errors_non_retryable.tpl: -------------------------------------------------------------------------------- 1 | WITH nonretryable AS 2 | ( 3 | SELECT DISTINCT 4 | jsonPayload.global_run_id AS run_id, 5 | jsonPayload.non_retryable_ex_tracking_id AS tracking_id, 6 | resource.labels.service_name AS service_name, 7 | jsonPayload.non_retryable_ex_name AS exception_name, 8 | jsonPayload.non_retryable_ex_msg AS exception_message, 9 | FROM 10 | `${project}.${dataset}.${logging_table}` 11 | WHERE 12 | jsonPayload.global_app_log = 'NON_RETRYABLE_EXCEPTIONS_LOG' 13 | ORDER BY 2 14 | ) 15 | 16 | SELECT * FROM nonretryable -------------------------------------------------------------------------------- /terraform/modules/bigquery/views/v_service_calls.tpl:
-------------------------------------------------------------------------------- 1 | SELECT 2 | run_id, 3 | tracker, 4 | SUM(CASE WHEN step = 'START' AND function_number = 2 THEN 1 ELSE 0 END) AS inspector_starts, 5 | SUM(CASE WHEN step = 'END' AND function_number = 2 THEN 1 ELSE 0 END) AS inspector_ends, 6 | SUM(CASE WHEN step = 'START' AND function_number = 3 THEN 1 ELSE 0 END) AS tagger_starts, 7 | SUM(CASE WHEN step = 'END' AND function_number = 3 THEN 1 ELSE 0 END) AS tagger_ends, 8 | FROM 9 | `${project}.${dataset}.${logging_view_steps}` 10 | WHERE function_number > 1 11 | GROUP BY 1,2 -------------------------------------------------------------------------------- /terraform/modules/data_project_permissions_in_standard_mode/variables.tf: -------------------------------------------------------------------------------- 1 | variable "target_project" { 2 | type = string 3 | } 4 | 5 | variable "sa_inspection_dispatcher_email" { 6 | type = string 7 | } 8 | 9 | variable "sa_tagging_dispatcher_email" { 10 | type = string 11 | } 12 | 13 | variable "sa_inspector_email" { 14 | type = string 15 | } 16 | 17 | variable "sa_tagger_email" { 18 | type = string 19 | } 20 | 21 | variable "sa_dlp_email" { 22 | type = string 23 | } 24 | 25 | variable "sa_bq_remote_func_get_policy_tags_email" { 26 | type = string 27 | } -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/services/findings/FindingsReader.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.pso.bq_pii_classifier.services.findings; 2 | 3 | import com.google.cloud.pso.bq_pii_classifier.entities.NonRetryableApplicationException; 4 | import com.google.cloud.pso.bq_pii_classifier.entities.TablePolicyTags; 5 | 6 | import java.io.IOException; 7 | 8 | public interface FindingsReader { 9 | 10 | TablePolicyTags getFieldsToPolicyTagsMap(String lookupKey) throws 
InterruptedException, NonRetryableApplicationException, IOException; 11 | 12 | } 13 | -------------------------------------------------------------------------------- /terraform/modules/cloud-scheduler/variables.tf: -------------------------------------------------------------------------------- 1 | variable "project" { 2 | type = string 3 | } 4 | variable "region" { 5 | type = string 6 | } 7 | variable "scheduler_name" { 8 | type = string 9 | } 10 | variable "target_uri" { 11 | type = string 12 | } 13 | variable "cron_expression" { 14 | type = string 15 | } 16 | 17 | # DLP scanning scope 18 | variable "tables_exclude_list" { 19 | type = list(string) 20 | } 21 | variable "datasets_include_list" { 22 | type = list(string) 23 | } 24 | variable "datasets_exclude_list" { 25 | type = list(string) 26 | } 27 | variable "projects_include_list" { 28 | type = list(string) 29 | } 30 | -------------------------------------------------------------------------------- /terraform/modules/pubsub/variables.tf: -------------------------------------------------------------------------------- 1 | variable "project" { 2 | type = string 3 | } 4 | 5 | variable "topic" { 6 | type = string 7 | } 8 | variable "subscription_name" { 9 | type = string 10 | } 11 | variable "subscription_endpoint" { 12 | type = string 13 | } 14 | variable "subscription_service_account" { 15 | type = string 16 | } 17 | variable "topic_publishers_sa_emails" { 18 | type = list(string) 19 | } 20 | variable "subscription_message_retention_duration" { 21 | type = string 22 | } 23 | variable "subscription_ack_deadline_seconds" { 24 | type = number 25 | } 26 | variable "default_labels" { 27 | type = map(string) 28 | } 29 | -------------------------------------------------------------------------------- /services/cloudbuild_deploy_common_services.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | - name: maven:3.8.6-openjdk-18 3 | id: deploy_services 4 | env: 5 | - 
# adding provider block in all modules for tflint
# https://github.com/terraform-linters/tflint-ruleset-terraform/blob/v0.2.2/docs/rules/terraform_required_providers.md
terraform {
  required_version = ">= 1.12.1"

  required_providers {
    google = {
      source  = "hashicorp/google"
      version = "= 5.20.0"
    }

    # google-beta is a separate provider with its own source address.
    # Pointing it at "hashicorp/google" requires the same provider twice
    # under two local names instead of installing the beta provider.
    google-beta = {
      source  = "hashicorp/google-beta"
      version = "= 5.20.0"
    }
  }

  # DONT REMOVE
  # NOTE(review): the module_name below contains an en dash ("–") between
  # "classifier" and "deploy", not an ASCII hyphen; left unchanged, confirm
  # whether that is intended.
  provider_meta "google" {
    module_name = "cloud-solutions/bq-pii-classifier–deploy-v2.0"
  }

  backend "gcs" {}
}
Note: If you're deploying to a new project where DLP has never run before, the DLP service account will not exist yet and Terraform will fail.
In that case, run a sample DLP job to force DLP to create the service account, then run `terraform apply` again.
# adding provider block in all modules for tflint
# https://github.com/terraform-linters/tflint-ruleset-terraform/blob/v0.2.2/docs/rules/terraform_required_providers.md
terraform {
  required_version = ">= 1.12.1"

  required_providers {
    google = {
      source  = "hashicorp/google"
      version = "= 5.20.0"
    }

    # google-beta is a separate provider with its own source address.
    # Pointing it at "hashicorp/google" requires the same provider twice
    # under two local names instead of installing the beta provider.
    google-beta = {
      source  = "hashicorp/google-beta"
      version = "= 5.20.0"
    }

    archive = {
      source  = "hashicorp/archive"
      version = "2.7.0"
    }

    random = {
      source  = "hashicorp/random"
      version = "3.6.2"
    }
  }
}
import sys

from google.cloud import bigquery


def get_dataset_locations(project_id):
    """Fetches all unique BigQuery dataset locations for a project.

    Args:
        project_id: ID of the project whose datasets are listed.

    Returns:
        A set with the ``location`` of every dataset returned by
        ``list_datasets()`` for that project.
    """
    client = bigquery.Client(project=project_id)

    # Each list item is resolved to the full dataset object, and the
    # location is read from that object.
    return {
        client.get_dataset(dataset_list_item.reference).location
        for dataset_list_item in client.list_datasets()
    }


def main():
    """Example usage: prints the dataset locations of one project.

    The project ID is taken from the first command-line argument when given;
    otherwise the placeholder below is used.
    """
    # Replace the placeholder with your actual project ID, or pass it as argv[1].
    project_id = sys.argv[1] if len(sys.argv) > 1 else "example-project"
    locations = get_dataset_locations(project_id)
    print("Dataset locations:", locations)


# Guarded so that importing this module does not trigger BigQuery API calls.
if __name__ == "__main__":
    main()
# Outputs of the bigquery module. Every output exposes the short identifier
# (table_id or dataset_id) of a resource declared in this module, not the
# resource object itself.

# Table ID of the info type -> policy tag mapping config view.
output "config_view_infotype_policytag_map" {
  value = google_bigquery_table.config_view_infotypes_policytags_map.table_id
}

# Dataset ID of the results dataset.
output "results_dataset" {
  value = google_bigquery_dataset.results_dataset.dataset_id
}

# Table ID of the standard DLP results table.
output "results_table_standard_dlp" {
  value = google_bigquery_table.standard_dlp_results_table.table_id
}

# Table ID of the dataset -> domain mapping config view.
output "config_view_dataset_domain_map" {
  value = google_bigquery_table.config_view_dataset_domain_map.table_id
}

# Table ID of the project -> domain mapping config view.
output "config_view_project_domain_map" {
  value = google_bigquery_table.config_view_project_domain_map.table_id
}

# Table ID of the logging table.
output "logging_table" {
  value = google_bigquery_table.logging_table.table_id
}
# The taxonomy resource(s) created by this module, exposed as-is.
output "created_taxonomy" {
  value = google_data_catalog_taxonomy.domain_taxonomy
}

# One object per parent policy tag. The domain is recovered from the tag's
# description: the text before the first "|" with surrounding spaces trimmed.
output "created_parent_tags" {
  value = [for entry in google_data_catalog_policy_tag.parent_tags: {
    id = entry.id
    display_name = entry.display_name
    domain = trim(element(split("|", entry.description), 0), " ")
  }]
}

# One object per child policy tag. The description is split on "|" and read
# positionally: part 0 = domain, part 1 = classification, part 2 = info type.
# NOTE(review): element() wraps around when the index exceeds the list length,
# so a description with fewer than three parts yields a repeated value rather
# than an error.
output "created_children_tags" {
  value = [for entry in google_data_catalog_policy_tag.children_tags: {
    policy_tag_id = entry.id
    domain = trim(element(split("|", entry.description), 0), " ")
    classification = trim(element(split("|", entry.description), 1), " ")
    info_type = trim(element(split("|", entry.description), 2), " ")
    region = var.region
  }]
}
-- Flattens TAG_HISTORY_LOG entries of the logging table into one row per
-- logged column-tagging record.
SELECT
-- start_time: the leading 13 characters of the run id parsed as epoch milliseconds.
-- NOTE(review): SUBSTR is called with position 0; confirm it behaves like position 1 here.
TIMESTAMP_MILLIS(CAST(SUBSTR(jsonPayload.global_run_id, 0, 13) AS INT64)) AS start_time,
jsonPayload.global_run_id AS run_id,
jsonPayload.global_tracker AS tracker,
jsonPayload.tag_history_log_project_id AS project_id,
jsonPayload.tag_history_log_dataset_id AS dataset_id,
jsonPayload.tag_history_log_table_id AS table_id,
jsonPayload.tag_history_log_field_name AS field_id,
jsonPayload.tag_history_log_existing_policy_tag_id AS existing_policy_tag,
jsonPayload.tag_history_log_new_policy_tag_id AS new_policy_tag,
jsonPayload.tag_history_log_column_tagging_action AS operation,
jsonPayload.tag_history_log_description AS details
FROM `${project}.${dataset}.${logging_table}`
-- Only application log entries of type TAG_HISTORY_LOG are kept.
WHERE jsonPayload.global_app_log = 'TAG_HISTORY_LOG'
* 1 policy tag per column --> Each column can be classified as only one DLP InfoType.
17 | # 18 | 19 | # exit script when errors occur 20 | set -e 21 | 22 | # set the working dir as the scripts directory 23 | cd "$(dirname "$0")" 24 | 25 | 26 | ./deploy_all_services.sh 27 | 28 | ./deploy_terraform.sh 29 | -------------------------------------------------------------------------------- /services/library/src/test/java/com/google/cloud/pso/bq_pii_classifier/functions/entities/TableSpecTest.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.pso.bq_pii_classifier.functions.entities; 2 | 3 | import com.google.cloud.pso.bq_pii_classifier.entities.TableSpec; 4 | import com.google.cloud.pso.bq_pii_classifier.helpers.Utils; 5 | import com.google.privacy.dlp.v2.Table; 6 | import org.junit.Test; 7 | 8 | import static org.junit.Assert.assertEquals; 9 | 10 | public class TableSpecTest { 11 | 12 | @Test 13 | public void fromFullResource() { 14 | 15 | String input = "//bigquery.googleapis.com/projects/test_project/datasets/test_dataset/tables/test_table"; 16 | TableSpec expected = new TableSpec("test_project", "test_dataset", "test_table"); 17 | TableSpec actual = TableSpec.fromFullResource(input); 18 | 19 | assertEquals(expected, actual); 20 | } 21 | 22 | 23 | } 24 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/entities/FunctionLifeCycleEvent.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
/** Life-cycle markers; two constants declared in the order START, END. */
public enum FunctionLifeCycleEvent { START, END }
/** Kinds of table scan limit; two constants declared in the order NUMBER_OF_ROWS, PERCENTAGE_OF_ROWS. */
public enum TableScanLimitsType { NUMBER_OF_ROWS, PERCENTAGE_OF_ROWS }
-- One status row per (run_id, tracking_id): FAILED rows come from the
-- non-retryable errors view, SUCCESS rows from the service-calls view.
WITH failed AS (
  -- Any tracker present in the non-retryable errors view, de-duplicated
  SELECT DISTINCT
    run_id,
    tracking_id,
    'FAILED' AS status,
    'Tracker has non retryable exception(s)' AS details
  FROM `${project}.${dataset}.${v_errors_non_retryable}`
),
success AS (
  -- Check for Tagger call completion
  SELECT DISTINCT
    run_id,
    tracker AS tracking_id,
    'SUCCESS' AS status,
    'Tagger completed the expected ${inspection_templates_count} call(s) successfully' AS details
  FROM `${project}.${dataset}.${v_service_calls}`
  WHERE tagger_ends = ${inspection_templates_count}
),
outcomes AS (
  SELECT * FROM failed
  UNION ALL
  SELECT * FROM success
)

SELECT
  -- The first 13 characters of the run id are parsed as epoch milliseconds
  TIMESTAMP_MILLIS(CAST(SUBSTR(outcomes.run_id, 0, 13) AS INT64)) AS timestamp,
  outcomes.*,
FROM outcomes
ORDER BY run_id DESC, status, tracking_id
9 | # * You may obtain a copy of the License at 10 | # * 11 | # * https://www.apache.org/licenses/LICENSE-2.0 12 | # * 13 | # * Unless required by applicable law or agreed to in writing, software 14 | # * distributed under the License is distributed on an "AS IS" BASIS, 15 | # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # * See the License for the specific language governing permissions and 17 | # * limitations under the License. 18 | # */ 19 | # 20 | 21 | # exit script when errors occur 22 | set -e 23 | 24 | # set the working dir as the scripts directory 25 | cd "$(dirname "$0")" 26 | 27 | ./deploy_all_services_cloudbuild.sh 28 | 29 | ./deploy_terraform.sh 30 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/functions/tagger/TaggerTableSpecRequest.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.pso.bq_pii_classifier.functions.tagger; 2 | 3 | import com.google.cloud.bigquery.Table; 4 | import com.google.cloud.pso.bq_pii_classifier.entities.Operation; 5 | import com.google.cloud.pso.bq_pii_classifier.entities.TableSpec; 6 | 7 | public class TaggerTableSpecRequest extends Operation { 8 | 9 | private TableSpec targetTable; 10 | 11 | public TaggerTableSpecRequest(String runId, String trackingId, TableSpec targetTable) { 12 | super(runId, trackingId); 13 | this.targetTable = targetTable; 14 | } 15 | 16 | public TableSpec getTargetTable() { 17 | return targetTable; 18 | } 19 | 20 | @Override 21 | public String toString() { 22 | return "TaggerTableSpecRequest{" + 23 | "targetTable=" + targetTable.toSqlString() + 24 | "} " + super.toString(); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /terraform/modules/gcs/main.tf: -------------------------------------------------------------------------------- 1 | # 
https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket 2 | resource "google_storage_bucket" "gcs_flags_bucket" { 3 | project = var.project 4 | name = var.gcs_flags_bucket_name 5 | # This bucket is used by the services so let's create in the same compute region 6 | location = var.region 7 | 8 | force_destroy = !var.terraform_data_deletion_protection 9 | 10 | lifecycle_rule { 11 | condition { 12 | # Clean up old flags to save storage and GCS operations overhead 13 | age = 3 # days 14 | } 15 | action { 16 | type = "Delete" 17 | } 18 | } 19 | 20 | uniform_bucket_level_access = true 21 | labels = var.default_labels 22 | } 23 | 24 | resource "google_storage_bucket_iam_binding" "gcs_flags_bucket_iam_bindings" { 25 | bucket = google_storage_bucket.gcs_flags_bucket.name 26 | role = "roles/storage.objectAdmin" 27 | members = var.gcs_flags_bucket_admins 28 | } 29 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/entities/TablePolicyTags.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.pso.bq_pii_classifier.entities; 2 | 3 | import java.util.Map; 4 | 5 | public class TablePolicyTags { 6 | 7 | private TableSpec tableSpec; 8 | private Map fieldsPolicyTags; 9 | 10 | 11 | public TablePolicyTags(TableSpec tableSpec, Map fieldsPolicyTags) { 12 | this.tableSpec = tableSpec; 13 | this.fieldsPolicyTags = fieldsPolicyTags; 14 | } 15 | 16 | public TableSpec getTableSpec() { 17 | return tableSpec; 18 | } 19 | 20 | public Map getFieldsPolicyTags() { 21 | return fieldsPolicyTags; 22 | } 23 | 24 | @Override 25 | public String toString() { 26 | return "TablePolicyTags{" + 27 | "tableSpec=" + tableSpec + 28 | ", fieldsPolicyTags=" + fieldsPolicyTags + 29 | '}'; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- 
/**
 * Checked exception type named for application errors that should not be retried.
 */
public class NonRetryableApplicationException extends Exception {

    /**
     * @param msg detail message, available through {@link #getMessage()}
     */
    public NonRetryableApplicationException(String msg){
        super(msg);
    }

    /**
     * Variant that keeps the underlying failure, so the original stack trace
     * is not lost when wrapping another exception.
     *
     * @param msg   detail message, available through {@link #getMessage()}
     * @param cause the wrapped failure, available through {@link #getCause()}
     */
    public NonRetryableApplicationException(String msg, Throwable cause){
        super(msg, cause);
    }
}
41 | } 42 | 43 | variable "max_requests_per_container" { 44 | type = number 45 | default = 80 46 | } 47 | 48 | variable "timeout_seconds" { 49 | type = number 50 | } 51 | 52 | variable "default_labels" { 53 | type = map(string) 54 | } -------------------------------------------------------------------------------- /.github/dependabot.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
/**
 * Holder for a throwable together with a retryable flag and free-text notes.
 */
public class ThrowableInfo {

    private Throwable throwable;
    private boolean isRetryable;
    private String notes;

    /**
     * @param throwable   the throwable being described
     * @param isRetryable flag returned by {@link #isRetryable()}
     * @param notes       text returned by {@link #getNotes()}
     */
    public ThrowableInfo(Throwable throwable, boolean isRetryable, String notes) {
        this.notes = notes;
        this.isRetryable = isRetryable;
        this.throwable = throwable;
    }

    /** @return the throwable passed to the constructor */
    public Throwable getThrowable() {
        return this.throwable;
    }

    /** @return the retryable flag passed to the constructor */
    public boolean isRetryable() {
        return this.isRetryable;
    }

    /** @return the notes passed to the constructor */
    public String getNotes() {
        return this.notes;
    }

    @Override
    public String toString() {
        return String.format(
                "ThrowableInfo{exception=%s, isRetryable=%s, notes='%s'}",
                String.valueOf(throwable),
                String.valueOf(isRetryable),
                String.valueOf(notes));
    }
}
# exit script when errors occur
set -e

# Usage: PROJECT_ID=<project> ./schedulers_action.sh <action>
# <action> is passed verbatim to `gcloud scheduler jobs <action> <job>`
# and is applied to every scheduler job listed for PROJECT_ID.
ACTION="${1:?Usage: $0 <action to pass to 'gcloud scheduler jobs'>}"

SCHEDULERS=$(gcloud scheduler jobs list --format="value(ID)" --project="${PROJECT_ID}")
echo $SCHEDULERS

## Splitting the string into array
# The job IDs are read line by line. A single `read -a` on the here-string
# stops at the first newline, so on shells that do not word-split here-strings
# only the first job would be processed.
ARRAY=()
while IFS= read -r job_id
do
  if [[ -n "${job_id}" ]]; then
    ARRAY+=("${job_id}")
  fi
done <<< "${SCHEDULERS}"

for job_id in "${ARRAY[@]}"
do
  echo "${ACTION} ${job_id}.."
  # Target the same project the jobs were listed from, not the gcloud default.
  gcloud scheduler jobs "${ACTION}" "${job_id}" --project="${PROJECT_ID}"
done
19 | print("Running Jobs:") 20 | for job in client.list_jobs(max_results=1000, all_users=True, state_filter="running"): 21 | print("Will cancel job_id {} | user: {} | state : {}".format(job.job_id, job.user_email, job.state)) 22 | client.cancel_job(job.job_id) 23 | 24 | print("Pending Jobs:") 25 | for job in client.list_jobs(max_results=1000, all_users=True, state_filter="pending"): 26 | print("Will cancel job_id {} | user: {} | state : {}".format(job.job_id, job.user_email, job.state)) 27 | client.cancel_job(job.job_id) 28 | -------------------------------------------------------------------------------- /services/cloudbuild_deploy_all_services.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | - name: maven:3.8.6-openjdk-18 3 | id: deploy_services 4 | env: 5 | - 'TAGGING_DISPATCHER_IMAGE=${_TAGGING_DISPATCHER_IMAGE}' 6 | - 'TAGGER_IMAGE=${_TAGGER_IMAGE}' 7 | - 'INSPECTION_DISPATCHER_IMAGE=${_INSPECTION_DISPATCHER_IMAGE}' 8 | - 'INSPECTOR_IMAGE=${_INSPECTOR_IMAGE}' 9 | script: 10 | 11 | set -e 12 | 13 | echo "TAGGING_DISPATCHER_IMAGE = ${TAGGING_DISPATCHER_IMAGE}" 14 | echo "TAGGER_IMAGE = ${TAGGER_IMAGE}" 15 | echo "INSPECTION_DISPATCHER_IMAGE = ${INSPECTION_DISPATCHER_IMAGE}" 16 | echo "INSPECTOR_IMAGE = ${INSPECTOR_IMAGE}" 17 | 18 | mvn install 19 | 20 | mvn compile jib:build -f=dispatcher-tagging-app/pom.xml -Dimage="${TAGGING_DISPATCHER_IMAGE}" 21 | 22 | mvn compile jib:build -f=tagger-app/pom.xml -Dimage="${TAGGER_IMAGE}" 23 | 24 | mvn compile jib:build -f=dispatcher-inspection-app/pom.xml -Dimage="${INSPECTION_DISPATCHER_IMAGE}" 25 | 26 | mvn compile jib:build -f=inspector-app/pom.xml -Dimage="${INSPECTOR_IMAGE}" 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /terraform/modules/bigquery/variables.tf: -------------------------------------------------------------------------------- 1 | variable "project" { 2 | type = string 3 | } 4 | 5 | variable "region" 
{ 6 | type = string 7 | } 8 | 9 | variable "dataset" { 10 | type = string 11 | 12 | } 13 | 14 | variable "standard_dlp_results_table_name" { 15 | type = string 16 | } 17 | 18 | variable "logging_sink_sa" { 19 | type = string 20 | } 21 | 22 | 23 | variable "created_policy_tags" { 24 | type = list(object({ 25 | domain = string, 26 | classification = string, 27 | info_type = string, 28 | policy_tag_id = string 29 | region = string 30 | })) 31 | } 32 | 33 | variable "projects_domains_mapping" { 34 | type = list(object({ 35 | project = string, 36 | domain = string 37 | })) 38 | } 39 | 40 | variable "dataset_domains_mapping" { 41 | type = list(object({ 42 | project = string, 43 | dataset = string, 44 | domain = string 45 | })) 46 | } 47 | 48 | variable "inspection_templates_count" {type = number} 49 | 50 | variable "terraform_data_deletion_protection" { 51 | type = bool 52 | } 53 | 54 | variable "default_labels" { 55 | type = map(string) 56 | } -------------------------------------------------------------------------------- /scripts/deploy_common_services.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | # exit script when errors occur 20 | set -e 21 | 22 | # set the working dir as the scripts directory 23 | cd "$(dirname "$0")" 24 | 25 | gcloud auth configure-docker "${COMPUTE_REGION}-docker.pkg.dev" 26 | 27 | cd ../services 28 | mvn install 29 | 30 | cd dispatcher-tagging-app 31 | mvn compile jib:build -Dimage="${TAGGING_DISPATCHER_IMAGE}" 32 | 33 | cd ../tagger-app 34 | mvn compile jib:build -Dimage="${TAGGER_IMAGE}" 35 | 36 | -------------------------------------------------------------------------------- /scripts/deploy_inspection_services.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | # exit script when errors occur 20 | set -e 21 | 22 | # set the working dir as the scripts directory 23 | cd "$(dirname "$0")" 24 | 25 | gcloud auth configure-docker "${COMPUTE_REGION}-docker.pkg.dev" 26 | 27 | cd ../services 28 | mvn install 29 | 30 | cd dispatcher-inspection-app 31 | mvn compile jib:build -Dimage="${INSPECTION_DISPATCHER_IMAGE}" 32 | 33 | cd ../inspector-app 34 | mvn compile jib:build -Dimage="${INSPECTOR_IMAGE}" 35 | 36 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/services/set/GCSPersistentSetImpl.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.pso.bq_pii_classifier.services.set; 2 | 3 | import com.google.cloud.storage.*; 4 | import com.google.cloud.storage.Storage; 5 | import com.google.cloud.storage.StorageOptions; 6 | 7 | public class GCSPersistentSetImpl implements PersistentSet { 8 | 9 | private Storage storage; 10 | private String bucketName; 11 | 12 | public GCSPersistentSetImpl(String bucketName) { 13 | // Instantiates a client 14 | this.storage = StorageOptions.getDefaultInstance().getService(); 15 | this.bucketName = bucketName; 16 | } 17 | 18 | @Override 19 | public void add(String key) { 20 | BlobId blobId = BlobId.of(bucketName, key); 21 | BlobInfo blobInfo = BlobInfo.newBuilder(blobId).build(); 22 | storage.create(blobInfo); 23 | } 24 | 25 | @Override 26 | public boolean contains(String key) { 27 | BlobId blobId = BlobId.of(bucketName, key); 28 | Blob blob = storage.get(blobId); 29 | return blob != null; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/entities/PolicyTagInfo.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.pso.bq_pii_classifier.entities; 2 
| 3 | public class PolicyTagInfo { 4 | 5 | private final String infoType; 6 | private final String policyTagId; 7 | private final String classification; 8 | 9 | public PolicyTagInfo(String infoType, String policyTagId, String classification) { 10 | this.infoType = infoType; 11 | this.policyTagId = policyTagId; 12 | this.classification = classification; 13 | } 14 | 15 | public String getInfoType() { 16 | return infoType; 17 | } 18 | 19 | public String getPolicyTagId() { 20 | return policyTagId; 21 | } 22 | 23 | public String getClassification() { 24 | return classification; 25 | } 26 | 27 | @Override 28 | public String toString() { 29 | return "PolicyTagInfo{" + 30 | "infoType='" + infoType + '\'' + 31 | ", policyTagId='" + policyTagId + '\'' + 32 | ", classification='" + classification + '\'' + 33 | '}'; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /terraform/modules/data-catalog/variables.tf: -------------------------------------------------------------------------------- 1 | variable "project" { 2 | type = string 3 | } 4 | variable "region" { 5 | type = string 6 | } 7 | 8 | variable "domain" { 9 | type = string 10 | description = "the domain name for the taxonomy" 11 | } 12 | 13 | variable "classification_taxonomy" { 14 | type = list(object({ 15 | info_type = string 16 | info_type_category = string 17 | # (standard | custom) 18 | policy_tag = string 19 | classification = string 20 | inspection_template_number = number 21 | taxonomy_number = number 22 | })) 23 | description = "A lis of Maps defining children nodes" 24 | } 25 | 26 | // Use ["FINE_GRAINED_ACCESS_CONTROL"] to restrict IAM access on tagged columns. 27 | // Use [] NOT to restrict IAM access. 
28 | variable "data_catalog_taxonomy_activated_policy_types" { 29 | type = list(string) 30 | description = "A lis of policy types for the created taxonomy(s)" 31 | } 32 | 33 | variable "taxonomy_number" {type = number} 34 | 35 | variable "taxonomy_name_suffix" { 36 | type = string 37 | default = "" 38 | description = "Suffix added to taxonomy display name to make it unique within an org" 39 | } 40 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/services/pubsub/PubSubService.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.services.pubsub; 18 | 19 | import com.google.cloud.pso.bq_pii_classifier.entities.JsonMessage; 20 | 21 | import java.io.IOException; 22 | import java.util.List; 23 | 24 | public interface PubSubService { 25 | 26 | PubSubPublishResults publishTableOperationRequests(String projectId, String topicId, List messages) throws IOException, InterruptedException; 27 | } 28 | -------------------------------------------------------------------------------- /terraform/modules/bq-remote-function/output.tf: -------------------------------------------------------------------------------- 1 | # /* 2 | # * Copyright 2023 Google LLC 3 | # * 4 | # * Licensed under the Apache License, Version 2.0 (the "License"); 5 | # * you may not use this file except in compliance with the License. 6 | # * You may obtain a copy of the License at 7 | # * 8 | # * https://www.apache.org/licenses/LICENSE-2.0 9 | # * 10 | # * Unless required by applicable law or agreed to in writing, software 11 | # * distributed under the License is distributed on an "AS IS" BASIS, 12 | # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # * See the License for the specific language governing permissions and 14 | # * limitations under the License. 
15 | # */ 16 | 17 | output "bq_connection_sa_email" { 18 | value = google_bigquery_connection.connection.cloud_resource[0].service_account_id 19 | } 20 | 21 | output "deployment_procedure" { 22 | value = google_bigquery_routine.routine_deploy_functions.routine_id 23 | } 24 | 25 | output "cloud_function_sa_email" { 26 | value = google_service_account.sa_function.email 27 | } 28 | 29 | output "deploy_job_status" { 30 | value = google_bigquery_job.deploy_remote_functions_job.status 31 | } 32 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to <https://cla.developers.google.com/> to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code Reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows [Google's Open Source Community 28 | Guidelines](https://opensource.google/conduct/).
-------------------------------------------------------------------------------- /scripts/prepare_auto_dlp_results_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | echo "Setting access for TAGGING_DISPATCHER service account.. " 20 | 21 | bq query --location "${DATA_REGION}" --nouse_legacy_sql \ 22 | "GRANT \`roles/bigquery.dataViewer\` ON SCHEMA \`${AUTO_DLP_DATASET}\` TO 'serviceAccount:${SA_TAGGING_DISPATCHER_EMAIL}'" 23 | 24 | echo "Setting access for TAGGER service account.. " 25 | 26 | bq query --location "${DATA_REGION}" --nouse_legacy_sql \ 27 | "GRANT \`roles/bigquery.dataViewer\` ON SCHEMA \`${AUTO_DLP_DATASET}\` TO 'serviceAccount:${SA_TAGGER_EMAIL}'" 28 | 29 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/services/dlp/DlpService.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.services.dlp; 18 | 19 | import com.google.privacy.dlp.v2.BigQueryTable; 20 | import com.google.privacy.dlp.v2.CreateDlpJobRequest; 21 | import com.google.privacy.dlp.v2.DlpJob; 22 | 23 | public interface DlpService { 24 | 25 | DlpJob submitJob(CreateDlpJobRequest createDlpJobRequest); 26 | 27 | DlpJob.JobState getJobState(String jobId); 28 | 29 | BigQueryTable getInspectedTable(String jobId); 30 | 31 | void shutDown(); 32 | } 33 | -------------------------------------------------------------------------------- /scripts/prepare_stress_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # 5 | # Copyright 2022 Google LLC 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # https://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # use FIFOs as semaphores and use them to ensure that new processes are spawned as soon as possible and that no more than N processes runs at the same time. 
But it requires more code. 21 | 22 | task(){ 23 | 24 | [[ ${#1} -lt 10 ]] && SUFFIX="0${1}" || SUFFIX="$1" 25 | echo "${DESTINATION_TABLE_SPEC_PREFIX}_${SUFFIX}" 26 | bq cp --force "${ORIGIN_TABLE_SPEC}" "${DESTINATION_TABLE_SPEC_PREFIX}_${SUFFIX}"; 27 | } 28 | 29 | N=50 30 | ( 31 | for table in {1..1000}; do 32 | ((i=i%N)); ((i++==0)) && wait 33 | task "${table}" & 34 | done 35 | ) 36 | 37 | -------------------------------------------------------------------------------- /services/library/src/test/java/com/google/cloud/pso/bq_pii_classifier/functions/entities/InfoTypeInfoTest.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.pso.bq_pii_classifier.functions.entities; 2 | 3 | import com.google.cloud.pso.bq_pii_classifier.entities.InfoTypeInfo; 4 | import com.google.cloud.pso.bq_pii_classifier.entities.TableSpec; 5 | import org.junit.Test; 6 | 7 | import java.util.Map; 8 | 9 | import static org.junit.Assert.assertEquals; 10 | 11 | public class InfoTypeInfoTest { 12 | 13 | @Test 14 | public void fromFullResource() { 15 | 16 | String input = "{\"BLOOD_TYPE\":{\"classification\":\"Health_PII\",\"labels\":[{\"key\":\"dg_data_category_health\",\"value\":\"yes\"}]}," + 17 | " \"STREET_ADDRESS\":{\"classification\":\"Location_PII\",\"labels\":[{\"key\":\"dg_data_category_location\",\"value\":\"yes\"}]}}"; 18 | Map map = InfoTypeInfo.fromJsonMap(input); 19 | 20 | assertEquals(2, map.size()); 21 | assertEquals("Health_PII", map.get("BLOOD_TYPE").getClassification()); 22 | assertEquals("dg_data_category_health", map.get("BLOOD_TYPE").getLabels().get(0).getKey()); 23 | assertEquals("yes", map.get("BLOOD_TYPE").getLabels().get(0).getValue()); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /scripts/deploy_common_services_cloudbuild.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # /* 5 | # * 
Copyright 2023 Google LLC 6 | # * 7 | # * Licensed under the Apache License, Version 2.0 (the "License"); 8 | # * you may not use this file except in compliance with the License. 9 | # * You may obtain a copy of the License at 10 | # * 11 | # * https://www.apache.org/licenses/LICENSE-2.0 12 | # * 13 | # * Unless required by applicable law or agreed to in writing, software 14 | # * distributed under the License is distributed on an "AS IS" BASIS, 15 | # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # * See the License for the specific language governing permissions and 17 | # * limitations under the License. 18 | # */ 19 | # 20 | 21 | # exit script when errors occur 22 | set -e 23 | 24 | # set the working dir as the scripts directory 25 | cd "$(dirname "$0")" 26 | 27 | cd ../services 28 | 29 | # make sure that the project is valid before submitting a build job 30 | mvn install 31 | 32 | gcloud builds submit \ 33 | --project $PROJECT_ID \ 34 | --region $COMPUTE_REGION \ 35 | --config cloudbuild_deploy_common_services.yaml \ 36 | --substitutions _TAGGING_DISPATCHER_IMAGE=${TAGGING_DISPATCHER_IMAGE},_TAGGER_IMAGE=${TAGGER_IMAGE} 37 | 38 | -------------------------------------------------------------------------------- /scripts/prepare_terraform_service_account_on_data_projects.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | for project in "$@" 20 | do 21 | 22 | echo "Preparing data project ${project} .." 23 | 24 | # This allow terraform to grant required iam roles to service accounts used by the solution (e.g. read bq data) 25 | # Granting the IAM roles happen in Terraform or the scripts/prepare_data_projects_for_standard_mode.sh and scripts/prepare_data_projects_for_auto_dlp_mode.sh 26 | gcloud projects add-iam-policy-binding "${project}" \ 27 | --member="serviceAccount:${TF_SA}@${PROJECT_ID}.iam.gserviceaccount.com" \ 28 | --role="roles/iam.securityAdmin" 29 | 30 | done -------------------------------------------------------------------------------- /scripts/deploy_terraform.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | 19 | # exit script when errors occur 20 | set -e 21 | 22 | # set the working dir as the scripts directory 23 | cd "$(dirname "$0")" 24 | 25 | # resume the created schedule in order for terraform to update them 26 | ./schedulers_action.sh "resume" 27 | 28 | cd ../terraform 29 | 30 | terraform init \ 31 | -backend-config="bucket=${BUCKET_NAME}" \ 32 | -backend-config="prefix=terraform-state" 33 | 34 | terraform apply -lock=false -var-file="${VARS}" -auto-approve 35 | 36 | # set the working dir as the scripts directory 37 | cd ../scripts 38 | # pause the created schedulers to fake on-demand schedulers 39 | ./schedulers_action.sh "pause" -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/entities/InfoTypeInfo.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.pso.bq_pii_classifier.entities; 2 | 3 | import com.google.gson.Gson; 4 | import java.util.List; 5 | import java.util.Map; 6 | import com.google.gson.reflect.TypeToken; 7 | import java.lang.reflect.Type; 8 | 9 | public class InfoTypeInfo { 10 | 11 | private String classification; 12 | private List labels; 13 | 14 | public InfoTypeInfo(String classification, List labels) { 15 | this.classification = classification; 16 | this.labels = labels; 17 | } 18 | 19 | public String getClassification() { 20 | return classification; 21 | } 22 | 23 | public List getLabels() { 24 | return labels; 25 | } 26 | 27 | @Override 28 | public String toString() { 29 | return "InfoTypeInfo{" + 30 | "classification='" + classification + '\'' + 31 | ", labels=" + labels + 32 | '}'; 33 | } 34 | 35 | public static Map fromJsonMap(String jsonStr){ 36 | Gson gson = new Gson(); 37 | Type mapType = new TypeToken>(){}.getType(); 38 | return gson.fromJson(jsonStr, mapType); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- 
/scripts/deploy_all_services_cloudbuild.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # /* 5 | # * Copyright 2023 Google LLC 6 | # * 7 | # * Licensed under the Apache License, Version 2.0 (the "License"); 8 | # * you may not use this file except in compliance with the License. 9 | # * You may obtain a copy of the License at 10 | # * 11 | # * https://www.apache.org/licenses/LICENSE-2.0 12 | # * 13 | # * Unless required by applicable law or agreed to in writing, software 14 | # * distributed under the License is distributed on an "AS IS" BASIS, 15 | # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # * See the License for the specific language governing permissions and 17 | # * limitations under the License. 18 | # */ 19 | # 20 | 21 | # exit script when errors occur 22 | set -e 23 | 24 | # set the working dir as the scripts directory 25 | cd "$(dirname "$0")" 26 | 27 | cd ../services 28 | 29 | # make sure that the project is valid before submitting a build job 30 | mvn install 31 | 32 | gcloud builds submit \ 33 | --project $PROJECT_ID \ 34 | --region $COMPUTE_REGION \ 35 | --config cloudbuild_deploy_all_services.yaml \ 36 | --substitutions _TAGGING_DISPATCHER_IMAGE=${TAGGING_DISPATCHER_IMAGE},_TAGGER_IMAGE=${TAGGER_IMAGE},_INSPECTION_DISPATCHER_IMAGE=${INSPECTION_DISPATCHER_IMAGE},_INSPECTOR_IMAGE=${INSPECTOR_IMAGE} 37 | 38 | -------------------------------------------------------------------------------- /terraform/modules/bigquery/views/v_run_summary_counts.tpl: -------------------------------------------------------------------------------- 1 | WITH dispatched AS ( 2 | SELECT 3 | jsonPayload.global_run_id AS run_id, 4 | COUNT(jsonPayload.dispatched_tracking_id) AS dispatched_tracking_id_count 5 | FROM `${project}.${dataset}.${logging_table}` 6 | WHERE jsonPayload.global_app_log = 'DISPATCHED_REQUESTS_LOG' 7 | GROUP BY 1 8 | ) 9 | , failed_dispatched AS ( 10 | 
SELECT 11 | jsonPayload.global_run_id AS run_id, 12 | COUNT(jsonPayload.failed_dispatcher_entity_id) AS failed_dispatched_entity_count, 13 | FROM `${project}.${dataset}.${logging_table}` 14 | WHERE jsonPayload.global_app_log = 'FAILED_DISPATCHED_REQUESTS_LOG' 15 | GROUP BY 1 16 | ) 17 | , final AS ( 18 | SELECT 19 | s.run_id, 20 | s.timestamp, 21 | d.dispatched_tracking_id_count, 22 | fd.failed_dispatched_entity_count, 23 | SUM(CASE WHEN s.status = 'SUCCESS' THEN 1 ELSE 0 END) AS success_trackers_count, 24 | SUM(CASE WHEN s.status = 'FAILED' THEN 1 ELSE 0 END) AS failed_trackers_count, 25 | FROM `${project}.${dataset}.${v_run_summary}` s 26 | LEFT JOIN dispatched d ON s.run_id = d.run_id 27 | LEFT JOIN failed_dispatched fd ON s.run_id = fd.run_id 28 | GROUP BY 1,2,3,4 29 | 30 | ) 31 | 32 | SELECT 33 | f.*, 34 | f.dispatched_tracking_id_count - (f.success_trackers_count + f.failed_trackers_count) AS in_progress_trackers_count 35 | FROM final f 36 | ORDER BY run_id DESC -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/services/scan/Scanner.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.services.scan; 18 | 19 | import com.google.cloud.pso.bq_pii_classifier.entities.NonRetryableApplicationException; 20 | 21 | import java.util.List; 22 | 23 | public interface Scanner { 24 | 25 | 26 | // list datasets under a project in the format "project.dataset" 27 | List listParents(String project) throws NonRetryableApplicationException, InterruptedException; 28 | 29 | // list tables or dlpJobNames under a project/dataset in the format "project.dataset.table" 30 | List listChildren(String project, String dataset) throws InterruptedException, NonRetryableApplicationException; 31 | } 32 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/functions/tagger/ColumnTaggingAction.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.functions.tagger; 18 | 19 | public enum ColumnTaggingAction { 20 | 21 | // keep existing policy tag 22 | // e.g. keep existing manual tagging from an external taxonomy 23 | KEEP_EXISTING, 24 | // Overwrite the existing policy tag 25 | // e.g. 
previous run detected as STREET_ADDRESS and now as PERSON_NAME (across solution-managed taxonomies) 26 | OVERWRITE, 27 | // No change detected in policy tags 28 | NO_CHANGE, 29 | // Apply a policy tag to a column without existing tags 30 | CREATE, 31 | 32 | // Same action logic but without applying the tags to columns (only for logging) 33 | DRY_RUN_KEEP_EXISTING, 34 | DRY_RUN_OVERWRITE, 35 | DRY_RUN_NO_CHANGE, 36 | DRY_RUN_CREATE 37 | } 38 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/services/pubsub/SuccessPubSubMessage.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.services.pubsub; 18 | 19 | import com.google.cloud.pso.bq_pii_classifier.entities.JsonMessage; 20 | 21 | public class SuccessPubSubMessage { 22 | 23 | private JsonMessage msg; 24 | private String msgId; 25 | 26 | public SuccessPubSubMessage(JsonMessage msg, String msgId) { 27 | this.msg = msg; 28 | this.msgId = msgId; 29 | } 30 | 31 | public JsonMessage getMsg() { 32 | return msg; 33 | } 34 | 35 | public String getMsgId() { 36 | return msgId; 37 | } 38 | 39 | @Override 40 | public String toString() { 41 | return "PubSubSuccessMessage{" + 42 | "msg='" + msg.toJsonString() + '\'' + 43 | ", msgId='" + msgId + '\'' + 44 | '}'; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /services/library/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | [ignore] 24 | [ignore] 25 | [ignore] 26 | [ignore] 27 | [ignore] 28 | [ignore] 29 | 30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/services/pubsub/FailedPubSubMessage.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.services.pubsub; 18 | 19 | import com.google.cloud.pso.bq_pii_classifier.entities.JsonMessage; 20 | 21 | public class FailedPubSubMessage { 22 | 23 | private JsonMessage msg; 24 | private Exception exception; 25 | 26 | 27 | public FailedPubSubMessage(JsonMessage msg, Exception exception) { 28 | this.msg = msg; 29 | this.exception = exception; 30 | } 31 | 32 | public JsonMessage getMsg() { 33 | return msg; 34 | } 35 | 36 | public Exception getException() { 37 | return exception; 38 | } 39 | 40 | @Override 41 | public String toString() { 42 | return "PubSubFailedMessage{" + 43 | "msg='" + msg + '\'' + 44 | ", exception=" + exception + 45 | '}'; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/entities/dlp/TableDataProfileOrBuilder.java: -------------------------------------------------------------------------------- 1 | // Generated by the protocol buffer compiler. DO NOT EDIT! 2 | // source: dlp.proto 3 | 4 | package com.google.cloud.pso.bq_pii_classifier.entities.dlp; 5 | 6 | public interface TableDataProfileOrBuilder extends 7 | // @@protoc_insertion_point(interface_extends:com.google.cloud.pso.bq_pii_classifier.entities.dlp.TableDataProfile) 8 | com.google.protobuf.MessageOrBuilder { 9 | 10 | /** 11 | *
12 |    * The name of the profile.
13 |    * 
14 | * 15 | * string name = 1; 16 | * @return The name. 17 | */ 18 | java.lang.String getName(); 19 | /** 20 | *
21 |    * The name of the profile.
22 |    * 
23 | * 24 | * string name = 1; 25 | * @return The bytes for name. 26 | */ 27 | com.google.protobuf.ByteString 28 | getNameBytes(); 29 | 30 | /** 31 | *
32 |    * The resource name of the table.
33 |    * https://cloud.google.com/apis/design/resource_names#full_resource_name
34 |    * 
35 | * 36 | * string full_resource = 3; 37 | * @return The fullResource. 38 | */ 39 | java.lang.String getFullResource(); 40 | /** 41 | *
42 |    * The resource name of the table.
43 |    * https://cloud.google.com/apis/design/resource_names#full_resource_name
44 |    * 
45 | * 46 | * string full_resource = 3; 47 | * @return The bytes for fullResource. 48 | */ 49 | com.google.protobuf.ByteString 50 | getFullResourceBytes(); 51 | } 52 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/functions/inspector/InspectorRequest.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.pso.bq_pii_classifier.functions.inspector; 2 | 3 | import com.google.cloud.pso.bq_pii_classifier.entities.Operation; 4 | import com.google.cloud.pso.bq_pii_classifier.entities.TableSpec; 5 | 6 | public class InspectorRequest extends Operation { 7 | 8 | private final TableSpec targetTable; 9 | private final String inspectionTemplate; 10 | 11 | private final String jobRegion; 12 | 13 | public InspectorRequest(String runId, 14 | String trackingId, 15 | TableSpec targetTable, 16 | String inspectionTemplate, 17 | String jobRegion) { 18 | super(runId, trackingId); 19 | this.targetTable = targetTable; 20 | this.inspectionTemplate = inspectionTemplate; 21 | this.jobRegion = jobRegion; 22 | } 23 | 24 | public TableSpec getTargetTable() { 25 | return targetTable; 26 | } 27 | 28 | public String getInspectionTemplate() { 29 | return inspectionTemplate; 30 | } 31 | 32 | public String getJobRegion() { 33 | return jobRegion; 34 | } 35 | 36 | @Override 37 | public String toString() { 38 | return "InspectorRequest{" + 39 | "targetTable=" + targetTable.toSqlString() + 40 | ", inspectionTemplate='" + inspectionTemplate + '\'' + 41 | ", jobRegion='" + jobRegion + '\'' + 42 | "} " + super.toString(); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/entities/ApplicationLog.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 
4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.entities; 18 | 19 | public enum ApplicationLog { 20 | // Used for generic logging event 21 | DEFAULT_LOG, 22 | // Used to log function start/stop 23 | TRACKER_LOG, 24 | // Used to log column-level tagging actions 25 | TAG_HISTORY_LOG, 26 | // Used to log table-level resource labels actions 27 | LABEL_HISTORY_LOG, 28 | // Used to log success dispatched requests per run 29 | DISPATCHED_REQUESTS_LOG, 30 | // Used to log failed dispatched requests per run 31 | FAILED_DISPATCHED_REQUESTS_LOG, 32 | // To capture trackers with non retryable exceptions during processing 33 | NON_RETRYABLE_EXCEPTIONS_LOG, 34 | // To capture trackers with retryable exceptions during processing 35 | RETRYABLE_EXCEPTIONS_LOG, 36 | // To capture table schema after applying policy tags 37 | TABLE_SCHEMA_LOG 38 | } 39 | -------------------------------------------------------------------------------- /scripts/prepare_host_project_for_auto_dlp_apis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | for project_number in "$@" 20 | do 21 | 22 | echo "Preparing access to host project resources for DLP service account in project number ${project_number} .." 23 | 24 | echo "Granting Editor role for serviceAccount:service-${project_number}@dlp-api.iam.gserviceaccount.com on Auto DLP dataset ${AUTO_DLP_DATASET}" 25 | 26 | bq query --location "${DATA_REGION}" --nouse_legacy_sql \ 27 | "GRANT \`roles/bigquery.dataEditor\` ON SCHEMA \`${AUTO_DLP_DATASET}\` TO 'serviceAccount:service-${project_number}@dlp-api.iam.gserviceaccount.com'" 28 | 29 | echo "Granting Publisher role for serviceAccount:service-${project_number}@dlp-api.iam.gserviceaccount.com on the Tagger Pub/Sub topic" 30 | 31 | gcloud pubsub topics add-iam-policy-binding tagger_topic \ 32 | --project="${PROJECT_ID}" \ 33 | --member="serviceAccount:service-${project_number}@dlp-api.iam.gserviceaccount.com" \ 34 | --role="roles/pubsub.publisher" 35 | 36 | 37 | done -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/entities/Operation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.entities; 18 | 19 | import com.google.gson.Gson; 20 | 21 | public class Operation implements JsonMessage { 22 | 23 | private String runId; 24 | private String trackingId; 25 | 26 | public Operation() { 27 | } 28 | 29 | public Operation(String runId, String trackingId) { 30 | this.runId = runId; 31 | this.trackingId = trackingId; 32 | } 33 | 34 | public String getRunId() { 35 | return runId; 36 | } 37 | 38 | public String getTrackingId() { 39 | return trackingId; 40 | } 41 | 42 | @Override 43 | public String toString() { 44 | return "Operation{" + 45 | " runId='" + runId + '\'' + 46 | ", trackingId='" + trackingId + '\'' + 47 | '}'; 48 | } 49 | 50 | @Override 51 | public String toJsonString (){ 52 | return new Gson().toJson(this); 53 | 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/services/pubsub/PubSubPublishResults.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.services.pubsub; 18 | 19 | import java.util.List; 20 | 21 | public class PubSubPublishResults { 22 | 23 | private List successMessages; 24 | private List failedMessages; 25 | 26 | public PubSubPublishResults(List successMessages, List failedMessages) { 27 | this.successMessages = successMessages; 28 | this.failedMessages = failedMessages; 29 | } 30 | 31 | public List getSuccessMessages() { 32 | return successMessages; 33 | } 34 | 35 | public List getFailedMessages() { 36 | return failedMessages; 37 | } 38 | 39 | @Override 40 | public String toString() { 41 | return "PubSubPublishResults{" + 42 | "successMessages=" + successMessages + 43 | ", failedMessages=" + failedMessages + 44 | '}'; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /scripts/prepare_terraform_service_account_on_host_project.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | gcloud iam service-accounts create "${TF_SA}" \ 20 | --description="Used by Terraform to deploy GCP resources" \ 21 | --display-name="Terraform Service Account" 22 | 23 | roles=( 24 | "roles/iam.roleAdmin" 25 | "roles/resourcemanager.projectIamAdmin" 26 | "roles/serviceusage.serviceUsageAdmin" 27 | "roles/iam.serviceAccountAdmin" 28 | "roles/iam.serviceAccountUser" 29 | "roles/iam.serviceAccountTokenCreator" 30 | "roles/bigquery.dataEditor" 31 | "roles/bigquery.user" 32 | "roles/bigquery.connectionAdmin" 33 | "roles/run.admin" 34 | "roles/pubsub.admin" 35 | "roles/logging.configWriter" 36 | "roles/datastore.owner" 37 | "roles/cloudfunctions.developer" 38 | "roles/dlp.admin" 39 | "roles/datacatalog.admin" 40 | "roles/cloudscheduler.admin" 41 | "roles/storage.admin" 42 | ) 43 | 44 | for role in "${roles[@]}"; do 45 | echo "Granting ${role} .." 46 | gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ 47 | --member="serviceAccount:${TF_SA}@${PROJECT_ID}.iam.gserviceaccount.com" \ 48 | --role="$role" 49 | done -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | name: CI 15 | 16 | on: [pull_request] 17 | 18 | jobs: 19 | lint: 20 | runs-on: ubuntu-latest 21 | steps: 22 | 23 | ########################## 24 | # Checkout the code base # 25 | ########################## 26 | - name: Checkout Code 27 | uses: actions/checkout@v4 28 | with: 29 | # super-linter needs the full git history to get the 30 | # list of files that changed across commits 31 | fetch-depth: 0 32 | 33 | ################################ 34 | # Run Linter against code base # 35 | ################################ 36 | - name: Super Lint Code Base 37 | uses: super-linter/super-linter@v7.2.1 38 | env: 39 | VALIDATE_ALL_CODEBASE: true 40 | VALIDATE_GOOGLE_JAVA_FORMAT: true 41 | VALIDATE_TERRAFORM_FMT: true 42 | FIX_GOOGLE_JAVA_FORMAT: true 43 | FIX_TERRAFORM_FMT: true 44 | VALIDATE_TERRAFORM_TFLINT: true 45 | #VALIDATE_TERRAFORM_TERRASCAN: false 46 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 47 | FORMAT: checkstyle 48 | SAVE_SUPER_LINTER_OUTPUT: true -------------------------------------------------------------------------------- /scripts/prepare_data_projects_for_auto_dlp_mode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | for project in "$@" 20 | do 21 | 22 | echo "Preparing data project ${project} .." 
23 | 24 | # Tagging Dispatcher needs to know the location of datasets 25 | gcloud projects add-iam-policy-binding "${project}" \ 26 | --member="serviceAccount:${SA_TAGGING_DISPATCHER_EMAIL}" \ 27 | --role="roles/bigquery.metadataViewer" 28 | 29 | # Tagger needs to read table schema and update tables policy tags 30 | gcloud projects add-iam-policy-binding "${project}" \ 31 | --member="serviceAccount:${SA_TAGGER_EMAIL}" \ 32 | --role="roles/bigquery.dataOwner" 33 | 34 | # Cloud Function remote_get_table_policy_tags needs to read tables policy tags (metadata) 35 | gcloud projects add-iam-policy-binding "${project}" \ 36 | --member="serviceAccount:${SA_BQ_REMOTE_FUNC_GET_POLICY_TAGS}" \ 37 | --role="roles/bigquery.metadataViewer" 38 | 39 | # Cloud Function remote_get_table_policy_tags needs to read taxonomy data (metadata) 40 | gcloud projects add-iam-policy-binding "${project}" \ 41 | --member="serviceAccount:${SA_BQ_REMOTE_FUNC_GET_POLICY_TAGS}" \ 42 | --role="roles/datacatalog.viewer" 43 | 44 | done 45 | -------------------------------------------------------------------------------- /services/library/src/test/java/com/google/cloud/pso/bq_pii_classifier/functions/dispatcher/BigQueryScopeTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.functions.dispatcher; 18 | 19 | import com.google.gson.Gson; 20 | import org.junit.Test; 21 | 22 | import java.util.ArrayList; 23 | import java.util.Arrays; 24 | 25 | import static org.junit.Assert.assertEquals; 26 | 27 | public class BigQueryScopeTest { 28 | 29 | @Test 30 | public void fromJson() { 31 | 32 | String input = "{\n" + 33 | "\"datasetExcludeList\":[],\n" + 34 | "\"datasetIncludeList\":[],\n" + 35 | "\"projectIncludeList\":[\"project1\", \"project2\"],\n" + 36 | "\"tableExcludeList\":[]\n" + 37 | "}"; 38 | 39 | BigQueryScope expected = new BigQueryScope( 40 | new ArrayList<>(Arrays.asList("project1", "project2")), 41 | new ArrayList<>(), 42 | new ArrayList<>(), 43 | new ArrayList<>() 44 | ); 45 | 46 | Gson gson = new Gson(); 47 | BigQueryScope actual = gson.fromJson(input, BigQueryScope.class); 48 | 49 | assertEquals(expected, actual); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /docs/common-terraform-1-prepare.md: -------------------------------------------------------------------------------- 1 | ### Env setup 2 | 3 | In a terminal shell, set and export the following variables. 
4 | 5 | ``` 6 | export PROJECT_ID=< host project id > 7 | export PROJECT_NUMBER=< get from GCP project home page > 8 | export TF_SA=bq-pii-classifier-terraform 9 | export COMPUTE_REGION=< region to deploy infra resources > 10 | export DATA_REGION=< region where the target data resides > 11 | export BUCKET_NAME=${PROJECT_ID}-bq-pii-classifier 12 | export BUCKET=gs://${BUCKET_NAME} 13 | export DOCKER_REPO_NAME=bq-pii-classifier 14 | export ACCOUNT=< personal account email > 15 | 16 | gcloud config set project $PROJECT_ID 17 | gcloud config set account $ACCOUNT 18 | gcloud config set compute/region $COMPUTE_REGION 19 | 20 | gcloud auth login 21 | gcloud auth application-default login 22 | ``` 23 | 24 | ### GCP Set up 25 | 26 | * Enable APIs 27 | * Enable [Cloud Resource Manager API](https://console.cloud.google.com/apis/library/cloudresourcemanager.googleapis.com) 28 | * Enable [IAM API](https://console.developers.google.com/apis/api/iam.googleapis.com/overview) 29 | * Enable [Data Catalog API](https://console.developers.google.com/apis/api/datacatalog.googleapis.com/overview) 30 | * Enable [Artifact Registry](https://console.developers.google.com/apis/api/artifactregistry.googleapis.com/overview) 31 | 32 | 33 | ### Prepare Terraform State Bucket 34 | 35 | ``` 36 | gsutil mb -p $PROJECT_ID -l $COMPUTE_REGION -b on $BUCKET 37 | ``` 38 | 39 | ### Prepare Terraform Service Account 40 | 41 | Terraform needs to run with a service account to deploy DLP resources. User accounts are not enough. 
42 | 43 | ``` 44 | ./scripts/prepare_terraform_service_account_on_host_project.sh 45 | ``` 46 | 47 | ### Prepare a Docker Repo 48 | 49 | We need a Docker Repository to publish images that are used by this solution. 50 | 51 | ``` 52 | gcloud artifacts repositories create $DOCKER_REPO_NAME --repository-format=docker \ 53 | --project=$PROJECT_ID --location=$COMPUTE_REGION --description="Docker repository" 54 | ``` -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/services/bq/BigQueryService.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.services.bq; 18 | 19 | import com.google.api.services.bigquery.model.TableFieldSchema; 20 | import com.google.cloud.bigquery.Job; 21 | import com.google.cloud.bigquery.TableResult; 22 | import com.google.cloud.pso.bq_pii_classifier.entities.TableSpec; 23 | 24 | import java.io.IOException; 25 | import java.math.BigInteger; 26 | import java.util.List; 27 | import java.util.Map; 28 | 29 | public interface BigQueryService { 30 | String getDatasetLocation(String projectId, String datasetId) throws IOException; 31 | 32 | Job submitJob(String query); 33 | 34 | TableResult waitAndGetJobResults(Job queryJob) throws InterruptedException, RuntimeException; 35 | 36 | List getTableSchemaFields(TableSpec tableSpec) throws IOException; 37 | 38 | void patchTableSchema(TableSpec tableSpec, List updatedFields) throws IOException; 39 | void patchTableLabels(TableSpec tableSpec, Map tableLabels) throws IOException; 40 | 41 | void patchTable(TableSpec tableSpec, List updatedFields, Map tableLabels) throws IOException; 42 | 43 | BigInteger getTableNumRows(TableSpec tableSpec) throws IOException; 44 | 45 | boolean tableExists(TableSpec tableSpec); 46 | } 47 | -------------------------------------------------------------------------------- /scripts/prepare_end_user_permissions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Copyright 2022 Google LLC 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | # This script will grant required permissions to access data and metadata for end-users per group 20 | 21 | # input format: 22 | # prepare_end_user_permissions.sh group@domain.com host-project "project1 project2 project3 etc" 23 | # This will grant group@domain.com permissions on the list of projects 1,2,3 24 | # and on host-project 25 | 26 | set -e 27 | 28 | n=1 29 | group=${!n} 30 | 31 | n=2 32 | host_project=${!n} 33 | 34 | n=3 35 | data_projects=${!n} 36 | 37 | echo "Group is ${group}" 38 | echo "Host project is [${host_project}]" 39 | echo "Projects are [${data_projects}]" 40 | 41 | 42 | ##### Data Catalog Viewer on the host project (to view policy tags in BigQuery UI) 43 | gcloud projects add-iam-policy-binding "${host_project}" \ 44 | --member="group:${group}" \ 45 | --role="roles/datacatalog.viewer" 46 | 47 | ##### For each project with marketing datasets 48 | for project in $data_projects; do 49 | echo "Preparing permissions for group '${group}' on project '${project}' .." 50 | 51 | ##### BigQuery Reader (to read data) 52 | gcloud projects add-iam-policy-binding "${project}" \ 53 | --member="group:${group}" \ 54 | --role="roles/bigquery.dataViewer" 55 | 56 | ##### BigQuery Job User (to submit query jobs) 57 | gcloud projects add-iam-policy-binding "${project}" \ 58 | --member="group:${group}" \ 59 | --role="roles/bigquery.jobUser" 60 | 61 | 62 | done 63 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/services/dlp/DlpServiceImpl.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.services.dlp; 18 | 19 | import com.google.cloud.dlp.v2.DlpServiceClient; 20 | import com.google.privacy.dlp.v2.BigQueryTable; 21 | import com.google.privacy.dlp.v2.CreateDlpJobRequest; 22 | import com.google.privacy.dlp.v2.DlpJob; 23 | 24 | import java.io.IOException; 25 | 26 | public class DlpServiceImpl implements DlpService { 27 | 28 | DlpServiceClient dlpServiceClient; 29 | 30 | public DlpServiceImpl () throws IOException { 31 | dlpServiceClient = DlpServiceClient.create(); 32 | } 33 | 34 | @Override 35 | public void shutDown(){ 36 | dlpServiceClient.shutdown(); 37 | } 38 | 39 | @Override 40 | public DlpJob submitJob(CreateDlpJobRequest createDlpJobRequest){ 41 | return dlpServiceClient.createDlpJob(createDlpJobRequest); 42 | } 43 | 44 | @Override 45 | public DlpJob.JobState getJobState(String jobId){ 46 | return dlpServiceClient.getDlpJob(jobId).getState(); 47 | } 48 | @Override 49 | public BigQueryTable getInspectedTable(String jobId){ 50 | return dlpServiceClient.getDlpJob(jobId) 51 | .getInspectDetails() 52 | .getRequestedOptions() 53 | .getJobConfig() 54 | .getStorageConfig() 55 | .getBigQueryOptions() 56 | .getTableReference(); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /terraform/modules/bq-remote-function/variables.tf: -------------------------------------------------------------------------------- 1 | # /* 2 | # * Copyright 2023 Google LLC 3 | # * 4 | # * Licensed under the Apache License, 
Version 2.0 (the "License"); 5 | # * you may not use this file except in compliance with the License. 6 | # * You may obtain a copy of the License at 7 | # * 8 | # * https://www.apache.org/licenses/LICENSE-2.0 9 | # * 10 | # * Unless required by applicable law or agreed to in writing, software 11 | # * distributed under the License is distributed on an "AS IS" BASIS, 12 | # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # * See the License for the specific language governing permissions and 14 | # * limitations under the License. 15 | # */ 16 | 17 | variable "service_account_name" { 18 | type = string 19 | } 20 | 21 | variable "project" { 22 | type = string 23 | } 24 | 25 | variable "compute_region" { 26 | type = string 27 | } 28 | 29 | variable "data_region" { 30 | type = string 31 | } 32 | 33 | variable "function_name" { 34 | type = string 35 | } 36 | 37 | variable "cloud_function_src_dir" { 38 | type = string 39 | } 40 | 41 | variable "cloud_function_temp_dir" { 42 | type = string 43 | } 44 | 45 | variable "function_entry_point" { 46 | type = string 47 | } 48 | 49 | variable "env_variables" { 50 | type = map(string) 51 | } 52 | 53 | variable "bigquery_dataset_name" { 54 | type = string 55 | } 56 | 57 | variable "deployment_procedure_path" { 58 | type = string 59 | } 60 | 61 | variable "cloud_functions_sa_extra_roles" { type = list(string) } 62 | 63 | variable "cf_max_instance_count" { 64 | type = number 65 | default = 3 66 | } 67 | 68 | variable "cf_min_instance_count" { 69 | type = number 70 | default = 1 71 | } 72 | 73 | variable "cf_available_memory" { 74 | type = string 75 | default = "1Gi" 76 | } 77 | 78 | variable "cf_timeout_seconds" { 79 | type = number 80 | default = 3600 81 | } 82 | 83 | variable "cf_max_instance_request_concurrency" { 84 | type = number 85 | default = 80 86 | } 87 | 88 | variable "cf_available_cpu" { 89 | type = string 90 | default = "2" 91 | } 92 | 93 | variable "datastore_database_name" { 94 | type = 
string 95 | default = "(default)" 96 | } -------------------------------------------------------------------------------- /terraform/modules/cloud-run/main.tf: -------------------------------------------------------------------------------- 1 | #https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloud_run_service 2 | 3 | locals { 4 | timestamp = formatdate("YYYY-MM-DD-hh:mm:ss", timestamp()) 5 | } 6 | 7 | resource "google_cloud_run_service" "service" { 8 | project = var.project 9 | name = var.service_name 10 | location = var.region 11 | 12 | template { 13 | spec { 14 | 15 | timeout_seconds = var.timeout_seconds 16 | service_account_name = var.service_account_email 17 | 18 | container_concurrency = var.max_requests_per_container 19 | 20 | containers { 21 | image = var.service_image 22 | 23 | resources { 24 | limits = { 25 | "memory": var.max_memory 26 | "cpu": var.max_cpu 27 | } 28 | } 29 | 30 | dynamic env { 31 | for_each = var.environment_variables 32 | content { 33 | name = env.value["name"] 34 | value = env.value["value"] 35 | } 36 | } 37 | 38 | 39 | # Hack to force terraform to re-deploy this service (e.g. update latest image) 40 | env { 41 | name = "TERRAFORM_UPDATED_AT" 42 | value = local.timestamp 43 | } 44 | } 45 | } 46 | 47 | metadata { 48 | annotations = { 49 | "autoscaling.knative.dev/maxScale" = var.max_containers 50 | } 51 | } 52 | } 53 | 54 | metadata { 55 | annotations = { 56 | "run.googleapis.com/ingress" : "internal" 57 | } 58 | 59 | // cloud run labels must not include '-'. 
If so, it will not appear in the YAML definition 60 | labels = var.default_labels 61 | } 62 | 63 | traffic { 64 | percent = 100 65 | latest_revision = true 66 | } 67 | } 68 | 69 | ### Dispatcher Tasks SA must be able to invoke Dispatcher service #### 70 | resource "google_cloud_run_service_iam_member" "sa_invoker" { 71 | 72 | project = google_cloud_run_service.service.project 73 | location = google_cloud_run_service.service.location 74 | service = google_cloud_run_service.service.name 75 | role = "roles/run.invoker" 76 | member = "serviceAccount:${var.invoker_service_account_email}" 77 | } -------------------------------------------------------------------------------- /terraform/modules/data_project_permissions_in_standard_mode/main.tf: -------------------------------------------------------------------------------- 1 | 2 | # Inspection Dispatcher needs to list datasets and tables in a project and know the location of datasets 3 | resource "google_project_iam_member" "data_project_iam_inspection_dispatcher_bq_metadata_viewer" { 4 | project = var.target_project 5 | role = "roles/bigquery.metadataViewer" 6 | member = "serviceAccount:${var.sa_inspection_dispatcher_email}" 7 | } 8 | 9 | # Tagging Dispatcher needs to know the location of datasets 10 | resource "google_project_iam_member" "data_project_iam_tagging_dispatcher_bq_metadata_viewer" { 11 | project = var.target_project 12 | role = "roles/bigquery.metadataViewer" 13 | member = "serviceAccount:${var.sa_tagging_dispatcher_email}" 14 | } 15 | 16 | # Inspector needs to view table's metadata (row count) 17 | resource "google_project_iam_member" "data_project_iam_inspector_bq_metadata_viewer" { 18 | project = var.target_project 19 | role = "roles/bigquery.metadataViewer" 20 | member = "serviceAccount:${var.sa_inspector_email}" 21 | } 22 | 23 | # Tagger needs to read table schema and update tables policy tags 24 | resource "google_project_iam_member" "data_project_tagger_bq_data_owner" { 25 | project = 
# https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/data_catalog_policy_tag


### Create one taxonomy and its hierarchy (parent tags = classifications, child tags = policy tags)

# The taxonomy itself. Display name and description are derived from var.domain,
# var.taxonomy_number, var.taxonomy_name_suffix and var.region.
resource "google_data_catalog_taxonomy" "domain_taxonomy" {
  provider = google-beta
  project = var.project
  region = var.region
  display_name = title("${var.domain} Taxonomy ${var.taxonomy_number}${var.taxonomy_name_suffix}")
  description = "Policy tags assigned by BQ PII Classifier for domain '${var.domain}' - ${var.taxonomy_number} in region '${var.region}'"
  activated_policy_types = var.data_catalog_taxonomy_activated_policy_types
}

locals {
  // Distinct "classification" values found in var.classification_taxonomy.
  // Sorted so that each parent gets a stable position; children_tags looks parents up by that position.
  parent_nodes = sort(distinct([
  for entry in var.classification_taxonomy : entry["classification"]
  ]))
}

# One parent policy tag per distinct classification, in the order of local.parent_nodes.
resource "google_data_catalog_policy_tag" "parent_tags" {
  count = length(local.parent_nodes)
  provider = google-beta
  taxonomy = google_data_catalog_taxonomy.domain_taxonomy.id
  display_name = local.parent_nodes[count.index]
  # FIXME: this is a hack to propagate the domain to the output variable "created_parent_tags". Find an alternative
  description = "${var.domain} | ${local.parent_nodes[count.index]}"
}

# One child policy tag per entry of var.classification_taxonomy, attached to its classification's parent tag.
# NOTE(review): resources are addressed by count index, so reordering var.classification_taxonomy
# changes which index maps to which tag — confirm this is acceptable for existing state.
resource "google_data_catalog_policy_tag" "children_tags" {
  count = length(var.classification_taxonomy)
  provider = google-beta
  taxonomy = google_data_catalog_taxonomy.domain_taxonomy.id

  # How to decide the parent policy tag resource:
  # get the list element from var.classification_taxonomy based on the loop index
  # get the "classification" field from the element
  # get the index of that classification value in local.parent_nodes
  # use that index to pick the matching parent_tags instance
  parent_policy_tag = google_data_catalog_policy_tag.parent_tags[index
  (local.parent_nodes, lookup(var.classification_taxonomy[count.index], "classification", "NA"))].id

  display_name = var.classification_taxonomy[count.index]["policy_tag"]

  # FIXME: this is a hack to propagate the domain, info type and classification to the output variable "created_children_tags". Find an alternative
  # Format: "<domain> | <classification> | <info_type>"; a missing key is rendered as "NA".
  description = "${var.domain} | ${lookup(var.classification_taxonomy[count.index],"classification", "NA")} | ${lookup(var.classification_taxonomy[count.index],"info_type", "NA")}"
}
-- v_broken_steps
-- Reports, per run:
--   1) dispatched tracking IDs for which the Tagger never ran or never logged a successful end, and
--   2) projects, datasets or tables that failed at the dispatcher step.
WITH dispatched_requests AS
(
  -- One row per (run, tracking ID) that a dispatcher logged as dispatched
  SELECT DISTINCT
    jsonPayload.global_run_id AS run_id,
    jsonPayload.dispatched_tracking_id AS dispatched_tracking_id
  FROM
    `${project}.${dataset}.${logging_table}`
  WHERE
    jsonPayload.global_app_log = 'DISPATCHED_REQUESTS_LOG'

  -- -- Unit testing
  -- -- r1 tagger starts and doesn't finish successfully for t3
  -- SELECT 'r1' AS run_id, 'r1_t1_tagged' AS dispatched_tracking_id UNION ALL
  -- SELECT 'r1' AS run_id, 'r1_t2_tagged' AS dispatched_tracking_id UNION ALL
  -- SELECT 'r1' AS run_id, 'r1_t3_nottagged' AS dispatched_tracking_id UNION ALL
  -- -- r2 tagger doesn't start for t2
  -- SELECT 'r2' AS run_id, 'r2_t1_tagged' AS dispatched_tracking_id UNION ALL
  -- SELECT 'r2' AS run_id, 'r2_t2_tagged' AS dispatched_tracking_id
)

, tagger_calls AS
(
  -- Tagger start/end markers per tracker, as exposed by the service-calls view
  SELECT
    run_id,
    tracker,
    tagger_starts,
    tagger_ends
  FROM
    `${project}.${dataset}.${v_service_calls}`

  -- -- Unit tests
  -- -- r1 tagger starts and doesn't finish successfully for t3
  -- SELECT 'r1' AS run_id, 'r1_t1_tagged' AS tracker, 1 AS tagger_starts, 1 tagger_ends UNION ALL
  -- SELECT 'r1' AS run_id, 'r1_t2_tagged' AS tracker, 1 AS tagger_starts, 1 tagger_ends UNION ALL
  -- SELECT 'r1' AS run_id, 'r1_t3_nottagged' AS tracker, 1 AS tagger_starts, 0 tagger_ends UNION ALL
  -- -- r2 tagger doesn't start for t2
  -- SELECT 'r2' AS run_id, 'r2_t1_tagged' AS tracker, 1 AS tagger_starts, 1 tagger_ends
)

-- select the dispatched trackers that have no corresponding tagger call finish marker:
--   * no service-call row at all (t.tracker IS NULL), or
--   * a row exists but the tagger never logged an end (tagger_ends is NULL or 0),
--     e.g. 'r1_t3_nottagged' in the unit-test data above.
SELECT
  d.run_id,
  d.dispatched_tracking_id,
  t.tagger_starts,
  t.tagger_ends,
  'Tagger did not run or complete successfully.' AS msg
FROM
  dispatched_requests d
  LEFT JOIN tagger_calls t ON d.dispatched_tracking_id = t.tracker
WHERE
  t.tracker IS NULL
  OR IFNULL(t.tagger_ends, 0) = 0

UNION ALL

-- select the projects, datasets or tables that failed at the dispatcher step
SELECT DISTINCT
  jsonPayload.global_run_id AS run_id,
  jsonPayload.failed_dispatcher_entity_id AS entity_id,
  null AS tagger_starts,
  null AS tagger_ends,
  jsonPayload.global_msg AS msg
FROM
  `${project}.${dataset}.${logging_table}`
WHERE
  jsonPayload.global_app_log = 'FAILED_DISPATCHED_REQUESTS_LOG'
Tagger issuing unnecessary BQ queries) 23 | # It also sets how long should we keep trying to process one run 24 | message_retention_duration = var.subscription_message_retention_duration 25 | retain_acked_messages = false 26 | 27 | enable_message_ordering = false 28 | 29 | # The message sent to a subscriber is guaranteed not to be resent before the message's acknowledgement deadline expires 30 | enable_exactly_once_delivery = false 31 | 32 | # Policy to delete the subscription when in-active 33 | expiration_policy { 34 | # Never Expires. Empty to avoid the 31 days expiration. 35 | ttl = "" 36 | } 37 | 38 | retry_policy { 39 | # The minimum delay between consecutive deliveries of a given message 40 | minimum_backoff = "60s" # 41 | # The maximum delay between consecutive deliveries of a given message 42 | maximum_backoff = "600s" # 10 mins 43 | } 44 | 45 | push_config { 46 | push_endpoint = var.subscription_endpoint 47 | 48 | oidc_token { 49 | service_account_email = var.subscription_service_account 50 | } 51 | } 52 | 53 | labels = var.default_labels 54 | } 55 | 56 | # Allow an SA to publish to this topic 57 | resource "google_pubsub_topic_iam_member" "sa_topic_publisher" { 58 | count = length(var.topic_publishers_sa_emails) 59 | project = var.project 60 | topic = google_pubsub_topic.topic.id 61 | role = "roles/pubsub.publisher" 62 | member = "serviceAccount:${var.topic_publishers_sa_emails[count.index]}" 63 | } 64 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/entities/dlp/DataProfilePubSubMessageOrBuilder.java: -------------------------------------------------------------------------------- 1 | // Generated by the protocol buffer compiler. DO NOT EDIT! 
2 | // source: dlp.proto 3 | 4 | package com.google.cloud.pso.bq_pii_classifier.entities.dlp; 5 | 6 | public interface DataProfilePubSubMessageOrBuilder extends 7 | // @@protoc_insertion_point(interface_extends:com.google.cloud.pso.bq_pii_classifier.entities.dlp.DataProfilePubSubMessage) 8 | com.google.protobuf.MessageOrBuilder { 9 | 10 | /** 11 | *
12 |    * If `DetailLevel` is `TABLE_PROFILE` this will be fully populated.
13 |    * Otherwise, if `DetailLevel` is `RESOURCE_NAME`, then only `name` and
14 |    * `full_resource` will be populated.
15 |    * 
16 | * 17 | * .com.google.cloud.pso.bq_pii_classifier.entities.dlp.TableDataProfile profile = 1; 18 | * @return Whether the profile field is set. 19 | */ 20 | boolean hasProfile(); 21 | /** 22 | *
23 |    * If `DetailLevel` is `TABLE_PROFILE` this will be fully populated.
24 |    * Otherwise, if `DetailLevel` is `RESOURCE_NAME`, then only `name` and
25 |    * `full_resource` will be populated.
26 |    * 
27 | * 28 | * .com.google.cloud.pso.bq_pii_classifier.entities.dlp.TableDataProfile profile = 1; 29 | * @return The profile. 30 | */ 31 | com.google.cloud.pso.bq_pii_classifier.entities.dlp.TableDataProfile getProfile(); 32 | /** 33 | *
34 |    * If `DetailLevel` is `TABLE_PROFILE` this will be fully populated.
35 |    * Otherwise, if `DetailLevel` is `RESOURCE_NAME`, then only `name` and
36 |    * `full_resource` will be populated.
37 |    * 
38 | * 39 | * .com.google.cloud.pso.bq_pii_classifier.entities.dlp.TableDataProfile profile = 1; 40 | */ 41 | com.google.cloud.pso.bq_pii_classifier.entities.dlp.TableDataProfileOrBuilder getProfileOrBuilder(); 42 | 43 | /** 44 | *
45 |    * The event that caused the Pub/Sub message to be sent.
46 |    * 
47 | * 48 | * .com.google.cloud.pso.bq_pii_classifier.entities.dlp.DataProfileAction.EventType event = 2; 49 | * @return The enum numeric value on the wire for event. 50 | */ 51 | int getEventValue(); 52 | /** 53 | *
54 |    * The event that caused the Pub/Sub message to be sent.
55 |    * 
#!/bin/bash

#
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Grants the solution's service accounts the IAM roles they need on every data
# project passed as an argument (standard mode).
#
# Usage:
#   prepare_data_projects_for_standard_mode.sh <data-project> [<data-project> ...]
#
# Required environment variables (service account emails):
#   SA_INSPECTION_DISPATCHER_EMAIL, SA_TAGGING_DISPATCHER_EMAIL, SA_INSPECTOR_EMAIL,
#   SA_TAGGER_EMAIL, SA_DLP_EMAIL, SA_BQ_REMOTE_FUNC_GET_POLICY_TAGS
#
# Exit status: 0 when every binding was added, 1 when a variable is missing or
# at least one gcloud call failed.

# Fail fast instead of calling gcloud with an empty "serviceAccount:" member.
for required_var in \
  SA_INSPECTION_DISPATCHER_EMAIL \
  SA_TAGGING_DISPATCHER_EMAIL \
  SA_INSPECTOR_EMAIL \
  SA_TAGGER_EMAIL \
  SA_DLP_EMAIL \
  SA_BQ_REMOTE_FUNC_GET_POLICY_TAGS
do
  if [ -z "${!required_var:-}" ]; then
    echo "Environment variable ${required_var} must be set to a service account email" >&2
    exit 1
  fi
done

if [ "$#" -eq 0 ]; then
  echo "No data projects given; nothing to do." >&2
fi

# "<service account email>|<role>" pairs, applied to every data project in this order.
bindings=(
  # Inspection Dispatcher needs to list datasets and tables in a project and know the location of datasets
  "${SA_INSPECTION_DISPATCHER_EMAIL}|roles/bigquery.metadataViewer"
  # Tagging Dispatcher needs to know the location of datasets
  "${SA_TAGGING_DISPATCHER_EMAIL}|roles/bigquery.metadataViewer"
  # Inspector needs to view table's metadata (row count)
  "${SA_INSPECTOR_EMAIL}|roles/bigquery.metadataViewer"
  # Tagger needs to read table schema and update tables policy tags
  "${SA_TAGGER_EMAIL}|roles/bigquery.dataOwner"
  # DLP service account needs to read and inspect bigquery data
  "${SA_DLP_EMAIL}|roles/bigquery.dataViewer"
  # Cloud Function remote_get_table_policy_tags needs to read tables policy tags (metadata)
  "${SA_BQ_REMOTE_FUNC_GET_POLICY_TAGS}|roles/bigquery.metadataViewer"
  # Cloud Function remote_get_table_policy_tags needs to read taxonomy data (metadata)
  "${SA_BQ_REMOTE_FUNC_GET_POLICY_TAGS}|roles/datacatalog.viewer"
)

failures=0

for project in "$@"
do

  echo "Preparing data project ${project} .."

  for binding in "${bindings[@]}"
  do
    member="${binding%%|*}"
    role="${binding#*|}"

    # Keep going after a failure so one bad binding does not hide the others,
    # but remember it so the script's exit status reflects it.
    if ! gcloud projects add-iam-policy-binding "${project}" \
         --member="serviceAccount:${member}" \
         --role="${role}"
    then
      echo "Failed to grant ${role} to ${member} on project ${project}" >&2
      failures=$((failures + 1))
    fi
  done

done

if [ "${failures}" -gt 0 ]; then
  echo "${failures} IAM binding(s) could not be added" >&2
  exit 1
fi
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.dispatcher; 18 | 19 | import com.google.cloud.pso.bq_pii_classifier.entities.DispatcherType; 20 | import com.google.cloud.pso.bq_pii_classifier.entities.SolutionMode; 21 | import com.google.cloud.pso.bq_pii_classifier.functions.dispatcher.DispatcherConfig; 22 | import com.google.cloud.pso.bq_pii_classifier.helpers.Utils; 23 | import com.google.gson.Gson; 24 | 25 | import java.util.ArrayList; 26 | 27 | public class Environment { 28 | 29 | public DispatcherConfig toConfig(){ 30 | return new DispatcherConfig( 31 | getProjectId(), 32 | getComputeRegionId(), 33 | getDataRegionId(), 34 | new Gson().fromJson(getSourceDataRegions().toLowerCase(), ArrayList.class), 35 | getInspectionTopic(), 36 | DispatcherType.INSPECTION, 37 | SolutionMode.STANDARD_DLP, 38 | Utils.parseJsonToMap(getDlpInspectionTemplatesIds(), "region", "ids") 39 | ); 40 | } 41 | 42 | public String getProjectId(){ 43 | return Utils.getConfigFromEnv("PROJECT_ID", true); 44 | } 45 | 46 | public String getComputeRegionId(){ 47 | return Utils.getConfigFromEnv("COMPUTE_REGION_ID", true); 48 | } 49 | 50 | public String getDataRegionId(){ 51 | return Utils.getConfigFromEnv("DATA_REGION_ID", true); 52 | } 53 | 54 | public String 
getSourceDataRegions(){ 55 | return Utils.getConfigFromEnv("SOURCE_DATA_REGIONS", true); 56 | } 57 | 58 | public String getInspectionTopic() { return Utils.getConfigFromEnv("INSPECTION_TOPIC", true); } 59 | 60 | public String getGcsFlagsBucket(){ 61 | return Utils.getConfigFromEnv("GCS_FLAGS_BUCKET", true); 62 | } 63 | 64 | public String getDlpInspectionTemplatesIds(){ 65 | return Utils.getConfigFromEnv("DLP_INSPECTION_TEMPLATES_IDS", true); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/* 2 | 3 | no-git 4 | no-git/* 5 | 6 | TODO.md 7 | 8 | terraform/tfvars/* 9 | *.tfvars 10 | tfvars_* 11 | 12 | *.DS_Store 13 | 14 | .idea/ 15 | .gcloudignore 16 | *.iml 17 | 18 | .terraform/ 19 | .terraform* 20 | 21 | # Byte-compiled / optimized / DLL files 22 | __pycache__/ 23 | *.py[cod] 24 | *$py.class 25 | 26 | # C extensions 27 | *.so 28 | 29 | # Distribution / packaging 30 | .Python 31 | build/ 32 | develop-eggs/ 33 | dist/ 34 | downloads/ 35 | eggs/ 36 | .eggs/ 37 | lib/ 38 | lib64/ 39 | parts/ 40 | sdist/ 41 | var/ 42 | wheels/ 43 | pip-wheel-metadata/ 44 | share/python-wheels/ 45 | *.egg-info/ 46 | .installed.cfg 47 | *.egg 48 | MANIFEST 49 | 50 | # PyInstaller 51 | # Usually these files are written by a python script from a template 52 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
53 | *.manifest 54 | *.spec 55 | 56 | # Installer logs 57 | pip-log.txt 58 | pip-delete-this-directory.txt 59 | 60 | # Unit test / coverage reports 61 | htmlcov/ 62 | .tox/ 63 | .nox/ 64 | .coverage 65 | .coverage.* 66 | .cache 67 | nosetests.xml 68 | coverage.xml 69 | *.cover 70 | *.py,cover 71 | .hypothesis/ 72 | .pytest_cache/ 73 | 74 | # Translations 75 | *.mo 76 | *.pot 77 | 78 | # Django stuff: 79 | *.log 80 | local_settings.py 81 | db.sqlite3 82 | db.sqlite3-journal 83 | 84 | # Flask stuff: 85 | instance/ 86 | .webassets-cache 87 | 88 | # Scrapy stuff: 89 | .scrapy 90 | 91 | # Sphinx documentation 92 | docs/_build/ 93 | 94 | # PyBuilder 95 | target/ 96 | 97 | # Jupyter Notebook 98 | .ipynb_checkpoints 99 | 100 | # IPython 101 | profile_default/ 102 | ipython_config.py 103 | 104 | # pyenv 105 | .python-version 106 | 107 | # pipenv 108 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 109 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 110 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 111 | # install all needed dependencies. 112 | #Pipfile.lock 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/helpers/TrackingHelper.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
/**
 * Static helpers for creating and parsing run IDs and tracking IDs.
 *
 * <p>A run ID is {@code System.currentTimeMillis()} followed by a two-character
 * suffix identifying the kind of run ("-T", "-I" or "-A"). A tracking ID is
 * {@code <runId>-<random UUID>}.
 */
public class TrackingHelper {

    private static final String TAGGING_RUN_SUFFIX = "-T";
    private static final String INSPECTION_RUN_SUFFIX = "-I";
    private static final String ONE_TIME_TAGGING_SUFFIX = "-A";
    private static final int SUFFIX_LENGTH = 2;

    // currentTimeMillis() will always be 13 chars between Sep 9 2001 at 01:46:40.000 UTC
    // and Nov 20 2286 at 17:46:39.999 UTC
    private static final int EPOCH_MILLIS_LENGTH = 13;
    private static final int RUN_ID_LENGTH = EPOCH_MILLIS_LENGTH + SUFFIX_LENGTH;

    /** @return a new run ID ending in "-T" */
    public static String generateTaggingRunId(){
        return generateRunId(TAGGING_RUN_SUFFIX);
    }

    /** @return a new run ID ending in "-I" */
    public static String generateInspectionRunId(){
        return generateRunId(INSPECTION_RUN_SUFFIX);
    }

    /**
     * Despite its name this returns a complete run ID (timestamp + "-A"), not just the suffix.
     *
     * @return a new run ID ending in "-A"
     */
    public static String generateOneTimeTaggingSuffix(){
        return generateRunId(ONE_TIME_TAGGING_SUFFIX);
    }

    private static String generateRunId(String suffix){
        return String.format("%s%s", System.currentTimeMillis(), suffix);
    }

    /**
     * Extracts the run ID from a string that starts with one (e.g. a tracking ID).
     *
     * @param str a string whose first 15 characters are a run ID
     * @return the first 15 characters of {@code str} (13-digit timestamp + 2-char suffix)
     * @throws StringIndexOutOfBoundsException if {@code str} is shorter than 15 characters
     */
    public static String parseRunIdAsPrefix(String str){
        return str.substring(0, RUN_ID_LENGTH);
    }

    /**
     * Creates a tracking ID in the form {@code <runId>-<random UUID>}.
     *
     * @param runId run ID used as the prefix
     * @param table currently NOT used: an earlier note here described adding a table-name hash
     *              to the ID, but the ID is built from the run ID and a random UUID only.
     *              The parameter is kept so existing callers keep compiling.
     * @return the new tracking ID
     */
    public static String generateTrackingId (String runId, String table){
        return String.format("%s-%s", runId, UUID.randomUUID());
    }

    /**
     * Returns the last "/"-separated segment of a DLP job name without its first two characters.
     *
     * @param jobName DLP job name; per the original note its last segment starts with "i-"
     *                followed by the tracking ID (NOTE(review): and possibly a template-number
     *                suffix, which this method does not strip — confirm with callers)
     * @return the last path segment with its two-character prefix removed
     * @throws StringIndexOutOfBoundsException if the last segment is shorter than two characters
     */
    public static String extractTrackingIdFromJobName(String jobName){
        String[] segments = jobName.split("/");
        String lastSegment = segments[segments.length - 1];
        // drop the two-character "i-" prefix
        return lastSegment.substring(2);
    }

}
15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.entities; 18 | 19 | import com.google.cloud.pso.bq_pii_classifier.functions.tagger.ColumnTaggingAction; 20 | import org.slf4j.event.Level; 21 | 22 | public class TagHistoryLogEntry { 23 | 24 | private TableSpec tableSpec; 25 | private String fieldName; 26 | private String existingPolicyTagId; 27 | private String newPolicyTagId; 28 | private ColumnTaggingAction columnTaggingAction; 29 | private String description; 30 | private Level logLevel; 31 | 32 | public TagHistoryLogEntry(TableSpec tableSpec, String fieldName, String existingPolicyTagId, String newPolicyTagId, ColumnTaggingAction columnTaggingAction, String description, Level logLevel) { 33 | this.tableSpec = tableSpec; 34 | this.fieldName = fieldName; 35 | this.existingPolicyTagId = existingPolicyTagId; 36 | this.newPolicyTagId = newPolicyTagId; 37 | this.columnTaggingAction = columnTaggingAction; 38 | this.description = description; 39 | this.logLevel = logLevel; 40 | } 41 | 42 | public String getFieldName() { 43 | return fieldName; 44 | } 45 | 46 | public String getExistingPolicyTagId() { 47 | return existingPolicyTagId; 48 | } 49 | 50 | public String getNewPolicyTagId() { 51 | return newPolicyTagId; 52 | } 53 | 54 | public ColumnTaggingAction getColumnTaggingAction() { 55 | return columnTaggingAction; 56 | } 57 | 58 | public String getDescription() { 59 | return description; 60 | } 61 | 62 | public Level getLogLevel() { 63 | return logLevel; 64 | } 65 | 66 | public TableSpec getTableSpec() { 67 | return tableSpec; 68 | } 69 | 70 | public String toLogString() { 71 | 72 | return String.format("%s | %s | %s | %s | %s | %s | %s | %s", 73 | tableSpec.getProject(), 74 | tableSpec.getDataset(), 75 | tableSpec.getTable(), 76 | fieldName, 77 | existingPolicyTagId, 78 | newPolicyTagId, 79 | columnTaggingAction, 80 | description 81 | ); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- 
/services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/services/findings/FindingsReaderFactory.java: -------------------------------------------------------------------------------- 1 | package com.google.cloud.pso.bq_pii_classifier.services.findings; 2 | 3 | import com.google.cloud.pso.bq_pii_classifier.services.bq.BigQueryService; 4 | 5 | public class FindingsReaderFactory { 6 | 7 | public static FindingsReaderType findReader( 8 | boolean isAutoDlp, 9 | boolean promoteMixedPiiTypes 10 | ){ 11 | if (isAutoDlp){ 12 | return FindingsReaderType.AUTO_DLP; 13 | }else{ 14 | if(promoteMixedPiiTypes){ 15 | return FindingsReaderType.STANDARD_DLP_WITH_MIXED_INFO_TYPES_PROMOTION; 16 | }else{ 17 | return FindingsReaderType.STANDARD_DLP_WITHOUT_MIXED_INFO_TYPES_PROMOTION; 18 | } 19 | } 20 | } 21 | 22 | public static FindingsReader getNewReader( 23 | FindingsReaderType readerType, 24 | BigQueryService bqService, 25 | String dlpProject, 26 | String dlpDataset, 27 | String dlpTable, 28 | String datasetDomainMapView, 29 | String projectDomainMapView, 30 | String infoTypesPolicyTagsMapView 31 | ) { 32 | 33 | switch (readerType){ 34 | case AUTO_DLP: 35 | return new FindingsReaderAutoDlp( 36 | bqService, 37 | dlpProject, 38 | dlpDataset, 39 | dlpTable, 40 | datasetDomainMapView, 41 | projectDomainMapView, 42 | infoTypesPolicyTagsMapView 43 | ); 44 | case STANDARD_DLP_WITH_MIXED_INFO_TYPES_PROMOTION: 45 | return new FindingsReaderStandardDlp( 46 | bqService, 47 | dlpProject, 48 | dlpDataset, 49 | dlpTable, 50 | datasetDomainMapView, 51 | projectDomainMapView, 52 | infoTypesPolicyTagsMapView, 53 | "sql/v_dlp_fields_findings_with_promotion.tpl" 54 | ); 55 | case STANDARD_DLP_WITHOUT_MIXED_INFO_TYPES_PROMOTION: 56 | return new FindingsReaderStandardDlp( 57 | bqService, 58 | dlpProject, 59 | dlpDataset, 60 | dlpTable, 61 | datasetDomainMapView, 62 | projectDomainMapView, 63 | infoTypesPolicyTagsMapView, 64 | "sql/v_dlp_fields_findings_without_promotion.tpl" 65 | ); 66 | 
default: throw new java.lang.UnsupportedOperationException( 67 | String.format("FindingsReader %s is not supported", readerType) 68 | ); 69 | } 70 | 71 | 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /terraform/stacks/inspection/iam.tf: -------------------------------------------------------------------------------- 1 | ############## Service Accounts ###################################### 2 | 3 | resource "google_service_account" "sa_inspection_dispatcher" { 4 | project = var.project 5 | account_id = var.sa_inspection_dispatcher 6 | display_name = "Runtime SA for Inspection Dispatcher service" 7 | } 8 | 9 | resource "google_service_account" "sa_inspector" { 10 | project = var.project 11 | account_id = var.sa_inspector 12 | display_name = "Runtime SA for Inspector service" 13 | } 14 | 15 | resource "google_service_account" "sa_inspection_dispatcher_tasks" { 16 | project = var.project 17 | account_id = var.sa_inspection_dispatcher_tasks 18 | display_name = "To authorize PubSub Push requests to Inspection Dispatcher Service" 19 | } 20 | 21 | resource "google_service_account" "sa_inspector_tasks" { 22 | project = var.project 23 | account_id = var.sa_inspector_tasks 24 | display_name = "To authorize PubSub Push requests to Inspector Service" 25 | } 26 | 27 | ############## Service Accounts Access ################################ 28 | 29 | # Use google_project_iam_member because it's Non-authoritative. 30 | # It Updates the IAM policy to grant a role to a new member. 31 | # Other members for the role for the project are preserved. 
#### Dispatcher Tasks Permissions ###

# Allow the Dispatcher Tasks SA to act as the Inspection Dispatcher runtime SA.
# The binding must be set ON the runtime SA (sa_inspection_dispatcher) FOR the tasks SA;
# granting the tasks SA the role on itself has no effect for this purpose. This mirrors
# the Inspector / Inspector Tasks binding below.
resource "google_service_account_iam_member" "sa_inspection_dispatcher_account_user_sa_dispatcher_tasks" {
  service_account_id = google_service_account.sa_inspection_dispatcher.name
  role               = "roles/iam.serviceAccountUser"
  member             = "serviceAccount:${google_service_account.sa_inspection_dispatcher_tasks.email}"
}

#### Dispatcher SA Permissions ###

# Grant sa_inspection_dispatcher access to submit query jobs
resource "google_project_iam_member" "sa_inspection_dispatcher_bq_job_user" {
  project = var.project
  role    = "roles/bigquery.jobUser"
  member  = "serviceAccount:${google_service_account.sa_inspection_dispatcher.email}"
}


#### Inspector Tasks SA Permissions ###

# Allow the Inspector Tasks SA to act as the Inspector runtime SA
resource "google_service_account_iam_member" "sa_inspector_account_user_sa_inspector_tasks" {
  service_account_id = google_service_account.sa_inspector.name
  role               = "roles/iam.serviceAccountUser"
  member             = "serviceAccount:${google_service_account.sa_inspector_tasks.email}"
}

#### Inspector SA Permissions ###

# Grant sa_inspector the DLP Jobs Editor role (create and manage DLP jobs, not only list them)
resource "google_project_iam_member" "sa_inspector_dlp_jobs_editor" {
  project = var.project
  role    = "roles/dlp.jobsEditor"
  member  = "serviceAccount:${google_service_account.sa_inspector.email}"
}

# Grant sa_inspector access to read dlp templates
resource "google_project_iam_member" "sa_inspector_dlp_template_reader" {
  project = var.project
  role    = "roles/dlp.inspectTemplatesReader"
  member  = "serviceAccount:${google_service_account.sa_inspector.email}"
}
Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Copy of https://github.com/googleapis/googleapis/blob/10c88bb5c489c8ad1edb0e7f6a17cdd07147966e/google/privacy/dlp/v2/dlp.proto#L4818 16 | 17 | syntax = "proto3"; 18 | 19 | package com.google.cloud.pso.bq_pii_classifier.entities.dlp; 20 | 21 | // enables generating a separate .java file for each generated class 22 | option java_multiple_files = true; 23 | // specifies in what Java package name your generated classes should live 24 | option java_package = "com.google.cloud.pso.bq_pii_classifier.entities.dlp"; 25 | 26 | // A task to execute when a data profile has been generated. 27 | message DataProfileAction { 28 | 29 | // Types of event that can trigger an action. 30 | enum EventType { 31 | // Unused. 32 | EVENT_TYPE_UNSPECIFIED = 0; 33 | 34 | // New profile (not a re-profile). 35 | NEW_PROFILE = 1; 36 | 37 | // Changed one of the following profile metrics: 38 | // * Table data risk score 39 | // * Table sensitivity score 40 | // * Table resource visibility 41 | // * Table encryption type 42 | // * Table predicted infoTypes 43 | // * Table other infoTypes 44 | CHANGED_PROFILE = 2; 45 | 46 | // Table data risk score or sensitivity score increased. 47 | SCORE_INCREASED = 3; 48 | 49 | // A user (non-internal) error occurred. 50 | ERROR_CHANGED = 4; 51 | } 52 | } 53 | 54 | // The profile for a scanned table. 
55 | message TableDataProfile { 56 | 57 | // The name of the profile. 58 | string name = 1; 59 | 60 | // The resource name of the table. 61 | // https://cloud.google.com/apis/design/resource_names#full_resource_name 62 | string full_resource = 3; 63 | } 64 | 65 | // The message that will be published to a Pub/Sub topic. 66 | // To receive a message of protocol buffer schema type, convert the message data 67 | // to an object of this proto class. 68 | // https://cloud.google.com/pubsub/docs/samples/pubsub-subscribe-proto-messages 69 | message DataProfilePubSubMessage { 70 | // If `DetailLevel` is `TABLE_PROFILE` this will be fully populated. 71 | // Otherwise, if `DetailLevel` is `RESOURCE_NAME`, then only `name` and 72 | // `full_resource` will be populated. 73 | TableDataProfile profile = 1; 74 | 75 | // The event that caused the Pub/Sub message to be sent. 76 | DataProfileAction.EventType event = 2; 77 | } -------------------------------------------------------------------------------- /services/library/src/test/java/com/google/cloud/pso/bq_pii_classifier/functions/helpers/UtilsTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.functions.helpers; 18 | 19 | 20 | import com.google.cloud.pso.bq_pii_classifier.helpers.Utils; 21 | import org.junit.Test; 22 | import java.util.HashMap; 23 | import java.util.List; 24 | import static org.junit.Assert.assertEquals; 25 | 26 | public class UtilsTest { 27 | 28 | @Test 29 | public void extractTaxonomyIdFromPolicyTagId() { 30 | 31 | String input = "projects//locations//taxonomies//policyTags/> actual = Utils.parseJsonToMap(jsonString, "region", "ids"); 54 | 55 | HashMap> expected = new HashMap<>(); 56 | expected.put("eu", List.of("projects/p/locations/europe/inspectTemplates/1", "projects/p/locations/europe/inspectTemplates/2")); 57 | expected.put("europe-west3", List.of("projects/p/locations/europe-west3/inspectTemplates/1")); 58 | 59 | assertEquals(expected, actual); 60 | } 61 | 62 | @Test 63 | public void testExtractDLPRegionFromJobNameToBQRegion() { 64 | 65 | assertEquals("eu", Utils.extractDLPRegionFromJobNameToBQRegion("projects/p/locations/europe/dlpJobs/job")); 66 | assertEquals("europe-west3", Utils.extractDLPRegionFromJobNameToBQRegion("projects/p/locations/europe-west3/dlpJobs/job")); 67 | } 68 | } -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/entities/TableScanLimitsConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.entities; 18 | 19 | import com.google.common.reflect.TypeToken; 20 | import com.google.gson.Gson; 21 | import com.google.gson.JsonElement; 22 | import com.google.gson.JsonParser; 23 | 24 | import java.lang.reflect.Type; 25 | import java.util.SortedMap; 26 | 27 | public class TableScanLimitsConfig { 28 | 29 | private TableScanLimitsType scanLimitsType; 30 | private SortedMap limitsIntervals; 31 | 32 | /** 33 | * Expects a String in the format of "{\"limitType\": \"NUMBER_OF_ROWS\", \"limits\": {\"5000\": \"500\",\"1000\": \"100\", \"2000\": \"200\"}}" 34 | * Where limitType = NUMBER_OF_ROWS | PERCENTAGE_OF_ROWS 35 | * And limits = "max table size": "number of rows" 36 | * @param jsonString 37 | */ 38 | public TableScanLimitsConfig(String jsonString){ 39 | 40 | JsonElement root = JsonParser.parseString(jsonString).getAsJsonObject(); 41 | String limitType = root.getAsJsonObject().get("limitType").getAsString(); 42 | JsonElement limits = root.getAsJsonObject().get("limits").getAsJsonObject(); 43 | Gson gson = new Gson(); 44 | Type mapType = new TypeToken>() {}.getType(); 45 | 46 | this.scanLimitsType = TableScanLimitsType.valueOf(limitType); 47 | this.limitsIntervals = gson.fromJson(limits, mapType); 48 | } 49 | 50 | public Integer getTableScanLimitBasedOnNumRows (Integer numRows){ 51 | 52 | // loop on the sorted intervals and return the value for the right interval bracket 53 | for(Integer IntervalEnd: limitsIntervals.keySet()){ 54 | if (numRows <= IntervalEnd){ 55 | return limitsIntervals.get(IntervalEnd); 56 | } 57 | } 58 | // if no interval found return the value of the highest bracket 59 | return limitsIntervals.get(limitsIntervals.lastKey()); 60 | } 61 | 62 | public TableScanLimitsType getScanLimitsType() { 63 | return scanLimitsType; 64 | } 65 | 66 | @Override 67 | public String 
toString() { 68 | return "TableScanLimitsConfig{" + 69 | "scanLimitsType=" + scanLimitsType + 70 | ", limitsIntervals=" + limitsIntervals + 71 | '}'; 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /services/tagger-app/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 20 | 4.0.0 21 | 22 | com.google.cloud.pso 23 | bq-pii-classifier-tagger-app 24 | 2.0.0 25 | bq-pii-classifier-tagger-app 26 | Tagger Service of the BQ PII Classifier 27 | 28 | 29 | 30 | com.google.cloud.pso 31 | bq-pii-classifier 32 | 2.0.0 33 | 34 | 35 | 36 | 37 | org.springframework.boot 38 | spring-boot-starter-actuator 39 | 40 | 41 | org.springframework.boot 42 | spring-boot-starter-web 43 | 44 | 45 | com.google.cloud.pso 46 | bq-pii-classifier-library 47 | ${project.version} 48 | 49 | 50 | 51 | org.springframework.boot 52 | spring-boot-starter-test 53 | test 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | org.springframework.boot 62 | spring-boot-maven-plugin 63 | 64 | 65 | 66 | com.google.cloud.tools 67 | jib-maven-plugin 68 | 3.4.0 69 | 70 | 71 | ${maven.jib.base_image} 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /services/inspector-app/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 20 | 4.0.0 21 | 22 | com.google.cloud.pso 23 | bq-pii-classifier-inspector-app 24 | 2.0.0 25 | bq-pii-classifier-inspector-app 26 | Inspector Service of the BQ PII Classifier 27 | 28 | 29 | 30 | com.google.cloud.pso 31 | bq-pii-classifier 32 | 2.0.0 33 | 34 | 35 | 36 | 37 | org.springframework.boot 38 | spring-boot-starter-actuator 39 | 40 | 41 | org.springframework.boot 42 | spring-boot-starter-web 43 | 44 | 45 | com.google.cloud.pso 46 | bq-pii-classifier-library 47 | ${project.version} 48 | 49 | 50 | 51 | org.springframework.boot 52 | spring-boot-starter-test 53 | test 54 | 55 
| 56 | 57 | 58 | 59 | 60 | 61 | org.springframework.boot 62 | spring-boot-maven-plugin 63 | 64 | 65 | 66 | com.google.cloud.tools 67 | jib-maven-plugin 68 | 3.4.0 69 | 70 | 71 | ${maven.jib.base_image} 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /services/inspector-app/src/main/java/com/google/cloud/pso/bq_pii_classifier/inspector/Environment.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.google.cloud.pso.bq_pii_classifier.inspector; 17 | 18 | import com.google.cloud.pso.bq_pii_classifier.functions.inspector.InspectorConfig; 19 | import com.google.cloud.pso.bq_pii_classifier.functions.tagger.TaggerConfig; 20 | import com.google.cloud.pso.bq_pii_classifier.helpers.Utils; 21 | import com.google.gson.Gson; 22 | 23 | import java.util.ArrayList; 24 | import java.util.HashSet; 25 | 26 | public class Environment { 27 | 28 | 29 | 30 | public InspectorConfig toConfig (){ 31 | 32 | return new InspectorConfig( 33 | getProjectId(), 34 | getBqResultsDataset(), 35 | getBqResultsTable(), 36 | getDlpNotificationTopic(), 37 | getMinLikelihood(), 38 | Integer.parseInt(getMaxFindings()), 39 | Integer.parseInt(getSamplingMethod()), 40 | new Gson().fromJson(getDlpInspectionTemplatesIds(), ArrayList.class), 41 | getTableScanLimitsJsonConfig() 42 | ); 43 | } 44 | 45 | 46 | public String getProjectId(){ 47 | return Utils.getConfigFromEnv("PROJECT_ID", true); 48 | } 49 | 50 | public String getBqResultsDataset(){ 51 | return Utils.getConfigFromEnv("BQ_RESULTS_DATASET", true); 52 | } 53 | 54 | public String getBqResultsTable(){ 55 | return Utils.getConfigFromEnv("BQ_RESULTS_TABLE", true); 56 | } 57 | 58 | public String getDlpNotificationTopic(){ 59 | return Utils.getConfigFromEnv("DLP_NOTIFICATION_TOPIC", true); 60 | } 61 | 62 | public String getMinLikelihood(){ 63 | return Utils.getConfigFromEnv("MIN_LIKELIHOOD", true); 64 | } 65 | 66 | public String getMaxFindings(){ 67 | return Utils.getConfigFromEnv("MAX_FINDINGS_PER_ITEM", true); 68 | } 69 | 70 | public String getSamplingMethod(){ 71 | return Utils.getConfigFromEnv("SAMPLING_METHOD", true); 72 | } 73 | 74 | public String getDlpInspectionTemplatesIds(){ 75 | return Utils.getConfigFromEnv("DLP_INSPECTION_TEMPLATES_IDS", true); 76 | } 77 | 78 | public String getTableScanLimitsJsonConfig(){ 79 | return Utils.getConfigFromEnv("TABLE_SCAN_LIMITS_JSON_CONFIG", true); 80 | } 81 | 82 | public String 
getGcsFlagsBucket(){ 83 | return Utils.getConfigFromEnv("GCS_FLAGS_BUCKET", true); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /docs/release-notes/v2.0.0.md: -------------------------------------------------------------------------------- 1 | 2 | # Version 2.0.0 3 | 4 | ## New Features 5 | 6 | ### Multiple BigQuery Regions Support 7 | 8 | In previous releases, the solution was able to inspect data in only one BigQuery region, requiring N regional-deployments 9 | to support more than one region. Starting with version 2.0.0, one deployment of the solution is able to inspect data 10 | in multiple BigQuery regions. 11 | 12 | To inspect data in multiple BigQuery regions, the `source_data_regions` variable must be configured in Terraform. 13 | 14 | ### Classification Taxonomy Scalability 15 | 16 | In previous releases, the solution was using only one DLP inspection template (to define the info types to be inspected) 17 | and one policy tag taxonomy (to include a policy tag per defined info type). Accordingly, the solution inherited the product 18 | limitations on the number of info types per inspection template and number of tags per taxonomy. 19 | 20 | Starting with version 2.0.0, info types and policy tags can be split over multiple templates and taxonomies using the 21 | `inspection_template_number` and `taxonomy_number` fields in the `classification_taxonomy` terraform variable. 22 | 23 | ### BigQuery Table Labels 24 | 25 | Users can map DLP info types to resource labels, via the `labels` field in the `classification_taxonomy` terraform variable, 26 | to be later attached to inspected tables where this info type was found. This is useful as a way to annotate tables with desired 27 | metadata about data sensitivity. 28 | 29 | ### Custom Info Types 30 | 31 | Regex and dictionary custom info types are now configured in the main terraform variables file instead of the dlp module 32 | as before. 
33 | 34 | 35 | ## Migration from v1.x.x 36 | 37 | The following changes must be made to your Terraform variables file: 38 | 39 | * Add `source_data_regions` to configure the BigQuery regions to be inspected in your projects. This will deploy one DLP 40 | inspection template and one policy tag taxonomy per configured region. Datasets that reside in regions that are not 41 | configured will be skipped. 42 | * Custom info type definitions, if used, must be moved from the Terraform [dlp module](../../terraform/modules/dlp/main.tf) 43 | to your variables file in either the `custom_info_types_dictionaries` or `custom_info_types_regex` variables. Corresponding 44 | entries per custom info type must be added to the `classification_taxonomy` as well. 45 | * Rename `is_dry_run` to `is_dry_run_tags`: this controls whether policy tags are actually attached to BigQuery columns based on DLP findings. 46 | Set to `True` to not attach policy tags and `False` to attach them. 47 | * Add a new variable `is_dry_run_labels` to control whether labels should be attached to BigQuery tables based on DLP findings 48 | (as optionally configured in `classification_taxonomy.labels`). Set to `True` to not attach labels and `False` to attach them. 49 | * Remove the `tables_include_list` variable. BigQuery scan scope ends at dataset level now. 50 | 51 | Please refer to the [standard-mode](../guide-standard-dlp.md) and [discovery-service-mode](../guide-discovery-service.md) 52 | deployment guides for more details about the Terraform variables. 
53 | 54 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/services/pubsub/PubSubServiceImpl.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.services.pubsub; 18 | 19 | 20 | import com.google.api.core.ApiFuture; 21 | import com.google.cloud.pso.bq_pii_classifier.entities.JsonMessage; 22 | import com.google.cloud.pubsub.v1.Publisher; 23 | import com.google.protobuf.ByteString; 24 | import com.google.pubsub.v1.PubsubMessage; 25 | import com.google.pubsub.v1.TopicName; 26 | 27 | import java.io.IOException; 28 | import java.util.ArrayList; 29 | import java.util.List; 30 | import java.util.concurrent.TimeUnit; 31 | 32 | public class PubSubServiceImpl implements PubSubService { 33 | 34 | 35 | @Override 36 | public PubSubPublishResults publishTableOperationRequests(String projectId, String topicId, List messages) 37 | throws IOException, InterruptedException { 38 | 39 | List successMessages = new ArrayList<>(); 40 | List failedMessages = new ArrayList<>(); 41 | 42 | Publisher publisher = null; 43 | try { 44 | TopicName topicName = TopicName.of(projectId, topicId); 45 | // Create a publisher instance with default settings bound to the topic 46 | 
publisher = Publisher.newBuilder(topicName).build(); 47 | for (final JsonMessage msg : messages) { 48 | ByteString data = ByteString.copyFromUtf8(msg.toJsonString()); 49 | PubsubMessage pubsubMessage = PubsubMessage.newBuilder().setData(data).build(); 50 | 51 | // Once published, returns a server-assigned message id (unique within the topic) 52 | ApiFuture future = publisher.publish(pubsubMessage); 53 | try{ 54 | // wait and retrieves results 55 | String messageId = future.get(); 56 | successMessages.add(new SuccessPubSubMessage(msg, messageId)); 57 | }catch (Exception ex){ 58 | failedMessages.add(new FailedPubSubMessage(msg, ex)); 59 | } 60 | } 61 | 62 | return new PubSubPublishResults(successMessages, failedMessages); 63 | 64 | } finally { 65 | if (publisher != null) { 66 | // When finished with the publisher, shutdown to free up resources. 67 | publisher.shutdown(); 68 | publisher.awaitTermination(1, TimeUnit.MINUTES); 69 | } 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /services/dispatcher-tagging-app/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 20 | 4.0.0 21 | 22 | com.google.cloud.pso 23 | bq-pii-classifier-tagging-dispatcher-app 24 | 2.0.0 25 | bq-pii-classifier-tagging-dispatcher-app 26 | Dispatcher Service of the BQ PII Classifier in Tagging-only mode 27 | 28 | 29 | 30 | com.google.cloud.pso 31 | bq-pii-classifier 32 | 2.0.0 33 | 34 | 35 | 36 | 37 | org.springframework.boot 38 | spring-boot-starter-actuator 39 | 40 | 41 | org.springframework.boot 42 | spring-boot-starter-web 43 | 44 | 45 | com.google.cloud.pso 46 | bq-pii-classifier-library 47 | ${project.version} 48 | 49 | 50 | 51 | com.google.code.gson 52 | gson 53 | 2.11.0 54 | 55 | 56 | 57 | org.springframework.boot 58 | spring-boot-starter-test 59 | test 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | org.springframework.boot 68 | spring-boot-maven-plugin 69 | 70 | 71 | 72 | 
com.google.cloud.tools 73 | jib-maven-plugin 74 | 3.4.0 75 | 76 | 77 | ${maven.jib.base_image} 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /services/dispatcher-inspection-app/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 20 | 4.0.0 21 | 22 | com.google.cloud.pso 23 | bq-pii-classifier-dispatcher-app 24 | 2.0.0 25 | bq-pii-classifier-inspection-dispatcher-app 26 | Dispatcher Service of the BQ PII Classifier in Inspection mode 27 | 28 | 29 | 30 | com.google.cloud.pso 31 | bq-pii-classifier 32 | 2.0.0 33 | 34 | 35 | 36 | 37 | org.springframework.boot 38 | spring-boot-starter-actuator 39 | 40 | 41 | org.springframework.boot 42 | spring-boot-starter-web 43 | 44 | 45 | com.google.cloud.pso 46 | bq-pii-classifier-library 47 | ${project.version} 48 | 49 | 50 | 51 | com.google.code.gson 52 | gson 53 | 2.11.0 54 | 55 | 56 | 57 | org.springframework.boot 58 | spring-boot-starter-test 59 | test 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | org.springframework.boot 68 | spring-boot-maven-plugin 69 | 70 | 71 | 72 | com.google.cloud.tools 73 | jib-maven-plugin 74 | 3.4.0 75 | 76 | 77 | ${maven.jib.base_image} 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /services/dispatcher-tagging-app/src/main/java/com/google/cloud/pso/bq_pii_classifier/dispatcher/Environment.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.pso.bq_pii_classifier.dispatcher; 17 | 18 | import com.google.cloud.pso.bq_pii_classifier.entities.DispatcherType; 19 | import com.google.cloud.pso.bq_pii_classifier.entities.SolutionMode; 20 | import com.google.cloud.pso.bq_pii_classifier.functions.dispatcher.DispatcherConfig; 21 | import com.google.cloud.pso.bq_pii_classifier.helpers.Utils; 22 | import com.google.gson.Gson; 23 | 24 | import java.util.ArrayList; 25 | 26 | public class Environment { 27 | 28 | public DispatcherConfig toConfig(){ 29 | return new DispatcherConfig( 30 | getProjectId(), 31 | getComputeRegionId(), 32 | getDataRegionId(), 33 | new Gson().fromJson(getSourceDataRegions().toLowerCase(), ArrayList.class), 34 | getTaggerTopic(), 35 | DispatcherType.TAGGING, 36 | getIsAutoDlpMode() ? 
SolutionMode.AUTO_DLP : SolutionMode.STANDARD_DLP, 37 | Utils.parseJsonToMap(getDlpInspectionTemplatesIds(), "region", "ids") 38 | ); 39 | } 40 | 41 | public String getProjectId(){ 42 | return Utils.getConfigFromEnv("PROJECT_ID", true); 43 | } 44 | 45 | public String getComputeRegionId(){ 46 | return Utils.getConfigFromEnv("COMPUTE_REGION_ID", true); 47 | } 48 | 49 | public String getDataRegionId(){ 50 | return Utils.getConfigFromEnv("DATA_REGION_ID", true); 51 | } 52 | 53 | public String getSourceDataRegions(){ 54 | return Utils.getConfigFromEnv("SOURCE_DATA_REGIONS", true); 55 | } 56 | 57 | public String getTaggerTopic() { return Utils.getConfigFromEnv("TAGGER_TOPIC", true); } 58 | 59 | public String getGcsFlagsBucket(){ 60 | return Utils.getConfigFromEnv("GCS_FLAGS_BUCKET", true); 61 | } 62 | 63 | public String getSolutionDataset(){ 64 | return Utils.getConfigFromEnv("SOLUTION_DATASET", true); 65 | } 66 | 67 | public Boolean getIsAutoDlpMode(){ 68 | return Boolean.valueOf(Utils.getConfigFromEnv("IS_AUTO_DLP_MODE", true)); 69 | } 70 | 71 | public String getDlpTableStandard(){ 72 | return Utils.getConfigFromEnv("DLP_TABLE_STANDARD", true); 73 | } 74 | 75 | public String getDlpTableAuto(){ 76 | return Utils.getConfigFromEnv("DLP_TABLE_AUTO", true); 77 | } 78 | 79 | public String getLoggingTable(){ 80 | return Utils.getConfigFromEnv("LOGGING_TABLE", true); 81 | } 82 | 83 | public String getDlpInspectionTemplatesIds(){ 84 | return Utils.getConfigFromEnv("DLP_INSPECTION_TEMPLATES_IDS", true); 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/entities/PubSubEvent.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.entities; 18 | 19 | import java.nio.charset.StandardCharsets; 20 | import java.util.Map; 21 | 22 | // Body.Message is the payload of a Pub/Sub event. Please refer to the docs for 23 | // additional information regarding Pub/Sub events. 24 | public class PubSubEvent { 25 | 26 | private Message message; 27 | 28 | public PubSubEvent() {} 29 | 30 | public Message getMessage() { 31 | return message; 32 | } 33 | 34 | public void setMessage(Message message) { 35 | this.message = message; 36 | } 37 | 38 | public class Message { 39 | 40 | private String messageId; 41 | private String publishTime; 42 | private byte[] data; 43 | private Map attributes; 44 | 45 | public Message() {} 46 | 47 | public Message(String messageId, String publishTime, byte[] data, Map attributes) { 48 | this.messageId = messageId; 49 | this.publishTime = publishTime; 50 | this.data = data; 51 | this.attributes = attributes; 52 | } 53 | 54 | public String getMessageId() { 55 | return messageId; 56 | } 57 | 58 | public void setMessageId(String messageId) { 59 | this.messageId = messageId; 60 | } 61 | 62 | public String getPublishTime() { 63 | return publishTime; 64 | } 65 | 66 | public void setPublishTime(String publishTime) { 67 | this.publishTime = publishTime; 68 | } 69 | 70 | public byte[] getData() { 71 | return data; 72 | } 73 | 74 | public void setData(byte[] data) { 75 | this.data = data; 76 | } 77 | 78 | public Map getAttributes() { 79 | return attributes; 80 | } 81 | 82 | public void 
setAttributes(Map attributes) { 83 | this.attributes = attributes; 84 | } 85 | 86 | public String dataToUtf8String (){ 87 | return new String(data, StandardCharsets.UTF_8); 88 | } 89 | 90 | @Override 91 | public String toString() { 92 | return "Message{" + 93 | "messageId='" + messageId + '\'' + 94 | ", publishTime='" + publishTime + '\'' + 95 | ", data='" + data + '\'' + 96 | ", attributes=" + attributes + 97 | '}'; 98 | } 99 | } 100 | } -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/entities/TableSpec.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.entities; 18 | 19 | import com.google.cloud.bigquery.TableId; 20 | import com.google.cloud.pso.bq_pii_classifier.helpers.Utils; 21 | 22 | import java.util.List; 23 | import java.util.Objects; 24 | 25 | public class TableSpec { 26 | 27 | private String project; 28 | private String dataset; 29 | private String table; 30 | 31 | public TableSpec(String project, String dataset, String table) { 32 | this.project = project; 33 | this.dataset = dataset; 34 | this.table = table; 35 | } 36 | 37 | public String getProject() { 38 | return project; 39 | } 40 | 41 | public String getDataset() { 42 | return dataset; 43 | } 44 | 45 | public String getTable() { 46 | return table; 47 | } 48 | 49 | public String toSqlString(){ 50 | return String.format("%s.%s.%s", project, dataset, table); 51 | } 52 | 53 | public TableId toTableId(){ return TableId.of(project, dataset, table); } 54 | 55 | // parse from "project.dataset.table" format 56 | public static TableSpec fromSqlString(String sqlTableId){ 57 | List targetTableSpecs = Utils.tokenize(sqlTableId, ".", true); 58 | return new TableSpec( 59 | targetTableSpecs.get(0), 60 | targetTableSpecs.get(1), 61 | targetTableSpecs.get(2) 62 | ); 63 | } 64 | 65 | // parse from "//bigquery.googleapis.com/projects/#project_name/datasets/#dataset_name/tables/#table_name>" 66 | public static TableSpec fromFullResource(String fullResource){ 67 | List tokens = Utils.tokenize(fullResource, "/", true); 68 | return new TableSpec( 69 | tokens.get(2), 70 | tokens.get(4), 71 | tokens.get(6) 72 | ); 73 | } 74 | 75 | @Override 76 | public boolean equals(Object o) { 77 | if (this == o) return true; 78 | if (o == null || getClass() != o.getClass()) return false; 79 | TableSpec tableSpec = (TableSpec) o; 80 | return Objects.equals(project, tableSpec.project) && 81 | Objects.equals(dataset, tableSpec.dataset) && 82 | Objects.equals(table, tableSpec.table); 83 | } 84 | 85 | @Override 86 | public int 
hashCode() { 87 | return Objects.hash(project, dataset, table); 88 | } 89 | 90 | @Override 91 | public String toString() { 92 | return "TableSpec{" + 93 | "project='" + project + '\'' + 94 | ", dataset='" + dataset + '\'' + 95 | ", table='" + table + '\'' + 96 | '}'; 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /services/library/src/main/resources/sql/v_dlp_fields_findings_auto_dlp.tpl: -------------------------------------------------------------------------------- 1 | WITH config AS 2 | ( 3 | SELECT * FROM `${project}.${dataset}.${config_view_infotypes_policytags_map}` 4 | ) 5 | , datasets_domains AS 6 | ( 7 | SELECT * FROM `${project}.${dataset}.${config_view_dataset_domain_map}` 8 | ) 9 | , projects_domains AS 10 | ( 11 | SELECT * FROM `${project}.${dataset}.${config_view_project_domain_map}` 12 | ) 13 | , ranked_profiles AS ( 14 | 15 | SELECT 16 | CASE WHEN SPLIT(column_profile.name, "/")[OFFSET(3)] = "europe" THEN "eu" ELSE SPLIT(column_profile.name, "/")[OFFSET(3)] END AS table_region, 17 | column_profile.dataset_project_id, 18 | column_profile.dataset_id, 19 | column_profile.table_id, 20 | column_profile.column AS column_name, 21 | column_profile.column_info_type.info_type.name AS dlp_info_type, 22 | column_profile.other_matches AS dlp_other_matches, 23 | RANK() OVER (PARTITION BY CONCAT(column_profile.dataset_project_id, '.', column_profile.dataset_id, '.', column_profile.table_id) ORDER BY column_profile.profile_last_generated.timestamp DESC) AS column_profile_rank 24 | FROM `${project}.${dataset}.${results_table}` 25 | WHERE (column_profile.column_info_type.info_type.name IS NOT NULL OR column_profile.other_matches IS NOT NULL) 26 | AND CONCAT(column_profile.dataset_project_id, '.', column_profile.dataset_id, '.', column_profile.table_id) = '${param_lookup_key}' 27 | 28 | ), latest_profiles AS ( 29 | 30 | SELECT 31 | table_region, 32 | dataset_project_id, 33 | dataset_id, 34 | table_id, 35 | 
-- Resolves, for the single table selected by the lookup-key parameter, the info type of each
-- profiled column and the policy tag / classification configured for that info type.
WITH config AS
(
  SELECT * FROM `${project}.${dataset}.${config_view_infotypes_policytags_map}`
)
, datasets_domains AS
(
  SELECT * FROM `${project}.${dataset}.${config_view_dataset_domain_map}`
)
, projects_domains AS
(
  SELECT * FROM `${project}.${dataset}.${config_view_project_domain_map}`
)
, ranked_profiles AS (

  SELECT
    -- The 4th path segment of the profile name is used as the region; "europe" is mapped to "eu"
    CASE WHEN SPLIT(column_profile.name, "/")[OFFSET(3)] = "europe" THEN "eu" ELSE SPLIT(column_profile.name, "/")[OFFSET(3)] END AS table_region,
    column_profile.dataset_project_id,
    column_profile.dataset_id,
    column_profile.table_id,
    column_profile.column AS column_name,
    column_profile.column_info_type.info_type.name AS dlp_info_type,
    column_profile.other_matches AS dlp_other_matches,
    -- Rank 1 = the rows carrying the most recent profile timestamp of the table
    RANK() OVER (PARTITION BY CONCAT(column_profile.dataset_project_id, '.', column_profile.dataset_id, '.', column_profile.table_id) ORDER BY column_profile.profile_last_generated.timestamp DESC) AS column_profile_rank
  FROM `${project}.${dataset}.${results_table}`
  WHERE (column_profile.column_info_type.info_type.name IS NOT NULL OR column_profile.other_matches IS NOT NULL)
  AND CONCAT(column_profile.dataset_project_id, '.', column_profile.dataset_id, '.', column_profile.table_id) = '${param_lookup_key}'

), latest_profiles AS (

  SELECT
    table_region,
    dataset_project_id,
    dataset_id,
    table_id,
    column_name,
    CASE
      -- If Auto DLP promotes only one PII type, use this PII
      WHEN dlp_info_type IS NOT NULL THEN dlp_info_type
      -- If Auto DLP doesn't promote a PII type but finds only one "Other PII" type, use that one other PII type
      WHEN dlp_info_type IS NULL AND ARRAY_LENGTH(dlp_other_matches) = 1 THEN dlp_other_matches[ORDINAL (1)].info_type.name
      -- If Auto DLP doesn't promote a PII type but finds more than one "Other PII" type, use MIXED
      WHEN dlp_info_type IS NULL AND ARRAY_LENGTH(dlp_other_matches) > 1 THEN "MIXED" END AS final_info_type,
  FROM ranked_profiles
  WHERE column_profile_rank = 1
)


-- DISTINCT collapses the identical rows that appear once the array index is removed below.
SELECT DISTINCT
  -- DLP reports column names for nested repeated records with the array index of the finding.
  -- Normalize the column names for nested repeated records by removing the '[index]' part,
  -- e.g. hits[0].referer, hits[1].referer, etc. become hits.referer
  REGEXP_REPLACE(l.column_name, r"(\[\d+\]\.)", '.') AS field_name,
  l.final_info_type AS info_type,
  c.policy_tag,
  c.classification
FROM latest_profiles l
LEFT JOIN datasets_domains dd ON dd.project = l.dataset_project_id AND dd.dataset = l.dataset_id
LEFT JOIN projects_domains pd ON pd.project = l.dataset_project_id
-- Get tag ids that belong to a certain domain. Use the dataset-level domain if found, else the project-level domain
LEFT JOIN config c ON c.domain = COALESCE(dd.domain , pd.domain ) AND c.info_type = l.final_info_type AND c.region = l.table_region
WHERE l.final_info_type IS NOT NULL
ORDER BY 1,2
/**
 * The set of include/exclude lists that bound a BigQuery scan.
 *
 * <p>Entries supplied through the four-argument constructor are stored lower-cased, so lookups
 * against this scope should use lower-cased identifiers as well.
 */
public class BigQueryScope {

    private List<String> projectIncludeList;
    private List<String> datasetIncludeList;
    private List<String> datasetExcludeList;
    private List<String> tableExcludeList;


    /** Creates a scope whose four lists are all empty. */
    public BigQueryScope() {
        this.projectIncludeList = new ArrayList<>();
        this.datasetIncludeList = new ArrayList<>();
        this.datasetExcludeList = new ArrayList<>();
        this.tableExcludeList = new ArrayList<>();
    }

    /**
     * Creates a scope from the given lists; every entry is copied and lower-cased.
     *
     * @param projectIncludeList projects to include
     * @param datasetIncludeList datasets to include
     * @param datasetExcludeList datasets to exclude
     * @param tableExcludeList   tables to exclude
     * @throws NullPointerException if any list, or any entry of a list, is null
     */
    public BigQueryScope(List<String> projectIncludeList,
                         List<String> datasetIncludeList,
                         List<String> datasetExcludeList,
                         List<String> tableExcludeList) {
        this.projectIncludeList = lowerCased(projectIncludeList);
        this.datasetIncludeList = lowerCased(datasetIncludeList);
        this.datasetExcludeList = lowerCased(datasetExcludeList);
        this.tableExcludeList = lowerCased(tableExcludeList);
    }

    /**
     * Returns a new list holding the lower-cased entries of {@code values}.
     * Locale.ROOT keeps the result independent of the JVM default locale
     * (e.g. "I" must become "i", not the Turkish dotless "ı").
     */
    private static List<String> lowerCased(List<String> values) {
        return values.stream()
                .map(value -> value.toLowerCase(java.util.Locale.ROOT))
                .collect(Collectors.toList());
    }

    public List<String> getProjectIncludeList() {
        return projectIncludeList;
    }

    public List<String> getDatasetIncludeList() {
        return datasetIncludeList;
    }

    public List<String> getDatasetExcludeList() {
        return datasetExcludeList;
    }


    public List<String> getTableExcludeList() {
        return tableExcludeList;
    }


    @Override
    public String toString() {
        return "BigQueryScope{" +
                "projectIncludeList=" + projectIncludeList +
                ", datasetIncludeList=" + datasetIncludeList +
                ", datasetExcludeList=" + datasetExcludeList +
                ", tableExcludeList=" + tableExcludeList +
                '}';
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        BigQueryScope that = (BigQueryScope) o;
        return Objects.equals(projectIncludeList, that.projectIncludeList) &&
                Objects.equals(datasetIncludeList, that.datasetIncludeList) &&
                Objects.equals(datasetExcludeList, that.datasetExcludeList) &&
                Objects.equals(tableExcludeList, that.tableExcludeList);
    }

    @Override
    public int hashCode() {
        return Objects.hash(projectIncludeList, datasetIncludeList, datasetExcludeList, tableExcludeList);
    }
}
/**
 * Immutable holder for the inspector's settings.
 *
 * <p>The class performs no validation; every value is stored and returned exactly as passed to
 * the constructor.
 */
public class InspectorConfig {

    private final String projectId;
    private final String bqResultsDataset;
    private final String bqResultsTable;
    private final String dlpNotificationTopic;
    private final String minLikelihood;
    private final Integer maxFindings;
    private final Integer samplingMethod;
    private final List<String> dlpInspectionTemplatesIds;
    private final String tableScanLimitsJsonConfig;

    /**
     * @param projectId                 project id
     * @param bqResultsDataset          name of the BigQuery results dataset
     * @param bqResultsTable            name of the BigQuery results table
     * @param dlpNotificationTopic      DLP notification topic
     * @param minLikelihood             minimum likelihood, kept as text
     * @param maxFindings               maximum number of findings
     * @param samplingMethod            numeric sampling method
     * @param dlpInspectionTemplatesIds ids of the DLP inspection templates
     * @param tableScanLimitsJsonConfig table scan limits as a JSON string
     */
    public InspectorConfig(String projectId,
                           String bqResultsDataset,
                           String bqResultsTable,
                           String dlpNotificationTopic,
                           String minLikelihood,
                           Integer maxFindings,
                           Integer samplingMethod,
                           List<String> dlpInspectionTemplatesIds,
                           String tableScanLimitsJsonConfig) {
        this.projectId = projectId;
        this.bqResultsDataset = bqResultsDataset;
        this.bqResultsTable = bqResultsTable;
        this.dlpNotificationTopic = dlpNotificationTopic;
        this.minLikelihood = minLikelihood;
        this.maxFindings = maxFindings;
        this.samplingMethod = samplingMethod;
        this.dlpInspectionTemplatesIds = dlpInspectionTemplatesIds;
        this.tableScanLimitsJsonConfig = tableScanLimitsJsonConfig;
    }

    public String getProjectId() {
        return projectId;
    }

    public String getBqResultsDataset() {
        return bqResultsDataset;
    }

    public String getBqResultsTable() {
        return bqResultsTable;
    }

    public String getDlpNotificationTopic() {
        return dlpNotificationTopic;
    }

    public String getMinLikelihood() {
        return minLikelihood;
    }

    public Integer getMaxFindings() {
        return maxFindings;
    }

    public Integer getSamplingMethod() {
        return samplingMethod;
    }

    public List<String> getDlpInspectionTemplatesIds() {
        return dlpInspectionTemplatesIds;
    }

    public String getTableScanLimitsJsonConfig() {
        return tableScanLimitsJsonConfig;
    }

    @Override
    public String toString() {
        // Every value, numeric ones included, is wrapped in single quotes.
        return "InspectorConfig{" +
                "projectId='" + projectId + '\'' +
                ", bqResultsDataset='" + bqResultsDataset + '\'' +
                ", bqResultsTable='" + bqResultsTable + '\'' +
                ", dlpNotificationTopic='" + dlpNotificationTopic + '\'' +
                ", minLikelihood='" + minLikelihood + '\'' +
                ", maxFindings='" + maxFindings + '\'' +
                ", samplingMethod='" + samplingMethod + '\'' +
                ", dlpInspectionTemplatesIds='" + dlpInspectionTemplatesIds + '\'' +
                ", tableScanLimitsJsonConfig='" + tableScanLimitsJsonConfig + '\'' +
                '}';
    }
}
15 | */ 16 | package com.google.cloud.pso.bq_pii_classifier.tagger; 17 | 18 | import com.google.cloud.pso.bq_pii_classifier.entities.InfoTypeInfo; 19 | import com.google.cloud.pso.bq_pii_classifier.functions.tagger.TaggerConfig; 20 | import com.google.cloud.pso.bq_pii_classifier.helpers.Utils; 21 | 22 | import java.util.HashSet; 23 | import java.util.Map; 24 | import java.util.Set; 25 | 26 | public class Environment { 27 | 28 | public TaggerConfig toConfig (){ 29 | return new TaggerConfig( 30 | getProjectId(), 31 | new HashSet<>(Utils.tokenize(getTaxonomies(), ",", true)), 32 | getDlpDataset(), 33 | getDlpTableStandard(), 34 | getDlpTableAuto(), 35 | getConfigViewInfoTypePolicyTagsMap(), 36 | getConfigViewDatasetDomainMap(), 37 | getConfigViewProjectDomainMap(), 38 | getPromoteMixedTypes(), 39 | getIsAutoDlpMode(), 40 | getIsDryRunTags(), 41 | getIsDryRunLabels(), 42 | getInfoTypeMap() 43 | ); 44 | } 45 | 46 | public String getProjectId(){ 47 | return Utils.getConfigFromEnv("PROJECT_ID", true); 48 | } 49 | 50 | public String getTaxonomies(){ 51 | return Utils.getConfigFromEnv("TAXONOMIES", true); 52 | } 53 | 54 | public Boolean getIsDryRunTags(){ 55 | return Boolean.valueOf(Utils.getConfigFromEnv("IS_DRY_RUN_TAGS", true)); 56 | } 57 | 58 | public Boolean getIsDryRunLabels(){ 59 | return Boolean.valueOf(Utils.getConfigFromEnv("IS_DRY_RUN_LABELS", true)); 60 | } 61 | 62 | public String getGcsFlagsBucket(){ 63 | return Utils.getConfigFromEnv("GCS_FLAGS_BUCKET", true); 64 | } 65 | 66 | public String getDlpDataset(){ 67 | return Utils.getConfigFromEnv("DLP_DATASET", true); 68 | } 69 | 70 | public String getDlpTableStandard(){ 71 | return Utils.getConfigFromEnv("DLP_TABLE_STANDARD", true); 72 | } 73 | 74 | public String getDlpTableAuto(){ 75 | return Utils.getConfigFromEnv("DLP_TABLE_AUTO", true); 76 | } 77 | 78 | public String getConfigViewInfoTypePolicyTagsMap(){ 79 | return Utils.getConfigFromEnv("VIEW_INFOTYPE_POLICYTAGS_MAP", true); 80 | } 81 | 82 | public String 
getConfigViewDatasetDomainMap(){ 83 | return Utils.getConfigFromEnv("VIEW_DATASET_DOMAIN_MAP", true); 84 | } 85 | 86 | public String getConfigViewProjectDomainMap(){ 87 | return Utils.getConfigFromEnv("VIEW_PROJECT_DOMAIN_MAP", true); 88 | } 89 | 90 | public Boolean getPromoteMixedTypes(){ 91 | return Boolean.valueOf(Utils.getConfigFromEnv("PROMOTE_MIXED_TYPES", true)); 92 | } 93 | 94 | public Boolean getIsAutoDlpMode(){ 95 | return Boolean.valueOf(Utils.getConfigFromEnv("IS_AUTO_DLP_MODE", true)); 96 | } 97 | 98 | public Map getInfoTypeMap(){ 99 | return InfoTypeInfo.fromJsonMap(Utils.getConfigFromEnv("INFO_TYPE_MAP", true)); 100 | } 101 | 102 | 103 | } 104 | -------------------------------------------------------------------------------- /services/library/src/main/java/com/google/cloud/pso/bq_pii_classifier/functions/dispatcher/DispatcherConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.cloud.pso.bq_pii_classifier.functions.dispatcher; 18 | 19 | import com.google.cloud.pso.bq_pii_classifier.entities.DispatcherType; 20 | import com.google.cloud.pso.bq_pii_classifier.entities.SolutionMode; 21 | 22 | import java.util.List; 23 | import java.util.Map; 24 | import java.util.stream.Collectors; 25 | 26 | public class DispatcherConfig { 27 | 28 | private String projectId; 29 | private String computeRegionId; 30 | private String dataRegionId; 31 | 32 | private List sourceDataRegions; 33 | private String outputTopic; 34 | private DispatcherType dispatcherType; 35 | private SolutionMode solutionMode; 36 | private Map> dlpInspectionTemplatesIdsPerRegion; 37 | 38 | public DispatcherConfig(String projectId, 39 | String computeRegionId, 40 | String dataRegionId, 41 | List sourceDataRegions, 42 | String outputTopic, 43 | DispatcherType dispatcherType, 44 | SolutionMode solutionMode, 45 | Map> dlpInspectionTemplatesIdsPerRegion 46 | ) { 47 | this.projectId = projectId.toLowerCase(); 48 | this.computeRegionId = computeRegionId.toLowerCase(); 49 | this.dataRegionId = dataRegionId.toLowerCase(); 50 | this.sourceDataRegions = sourceDataRegions.stream().map(String::toLowerCase).collect(Collectors.toList()); 51 | this.outputTopic = outputTopic.toLowerCase(); 52 | this.dispatcherType = dispatcherType; 53 | this.solutionMode = solutionMode; 54 | this.dlpInspectionTemplatesIdsPerRegion = dlpInspectionTemplatesIdsPerRegion; 55 | } 56 | 57 | public DispatcherType getDispatcherType() { 58 | return dispatcherType; 59 | } 60 | 61 | public String getDataRegionId() { 62 | return dataRegionId; 63 | } 64 | 65 | public List getSourceDataRegions() { 66 | return sourceDataRegions; 67 | } 68 | 69 | public String getProjectId() { 70 | return projectId; 71 | } 72 | 73 | public String getComputeRegionId() { 74 | return computeRegionId; 75 | } 76 | 77 | public String getOutputTopic() { 78 | return outputTopic; 79 | } 80 | 81 | public SolutionMode 
getSolutionMode() { 82 | return solutionMode; 83 | } 84 | 85 | public Map> getDlpInspectionTemplatesIdsPerRegion() { 86 | return dlpInspectionTemplatesIdsPerRegion; 87 | } 88 | 89 | @Override 90 | public String toString() { 91 | return "DispatcherConfig{" + 92 | "projectId='" + projectId + '\'' + 93 | ", computeRegionId='" + computeRegionId + '\'' + 94 | ", dataRegionId='" + dataRegionId + '\'' + 95 | ", sourceDataRegions'" + sourceDataRegions + '\'' + 96 | ", outputTopic='" + outputTopic + '\'' + 97 | ", dispatcherType=" + dispatcherType + 98 | ", solutionMode=" + solutionMode + 99 | ", dlpInspectionTemplatesIds=" + dlpInspectionTemplatesIdsPerRegion + 100 | '}'; 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /terraform/stacks/inspection/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
# Input variables of the inspection stack.
# None of the variables declares a default, so the caller must set every one of them.

variable "project" {
  type = string
}

variable "compute_region" {
  type = string
}

variable "data_region" {
  type = string
}

variable "source_data_regions" {
  type = list(string)
}

variable "sa_inspection_dispatcher" {
  type = string
}

variable "sa_inspection_dispatcher_tasks" {
  type = string
}

variable "sa_inspector" {
  type = string
}

variable "sa_inspector_tasks" {
  type = string
}

variable "scheduler_name" {
  type = string
}

variable "dispatcher_service_name" {
  type = string
}

variable "inspector_service_name" {
  type = string
}

variable "dispatcher_pubsub_topic" {
  type = string
}

variable "dispatcher_pubsub_sub" {
  type = string
}

variable "inspector_pubsub_topic" {
  type = string
}

variable "inspector_pubsub_sub" {
  type = string
}

variable "dispatcher_service_image" {
  type = string
}

variable "inspector_service_image" {
  type = string
}

# BQ scanning scope.
# Individual lists may be empty, but at least one of the *_include_list variables should be
# populated.
# NOTE(review): entries are presumably fully qualified - "project" for projects,
# "project.dataset" for datasets and "project.dataset.table" for tables - confirm against the
# dispatcher before relying on it.
variable "datasets_include_list" {
  type = list(string)
}

variable "projects_include_list" {
  type = list(string)
}

variable "datasets_exclude_list" {
  type = list(string)
}

variable "tables_exclude_list" {
  type = list(string)
}

variable "cloud_scheduler_account" {
  type        = string
  description = "Service agent account for Cloud Scheduler. Format service-<project number>@gcp-sa-cloudscheduler.iam.gserviceaccount.com"
}

variable "bigquery_dataset_name" {
  type = string
}

variable "standard_dlp_results_table_name" {
  type = string
}

variable "dlp_inspection_templates_ids" {
  description = "A list of objects, each representing a deployment of inspection templates per region"
  type = list(object({
    ids    = list(string)
    region = string
  }))
}

variable "cron_expression" {
  type        = string
  description = "Cron expression used by the Cloud Scheduler to run a full scan"
}

variable "table_scan_limits_json_config" {
  type        = string
  description = "JSON config to specify table scan limits intervals"
}

variable "tagger_topic_id" {
  type = string
}

variable "dlp_min_likelihood" {
  type = string
}

variable "dlp_max_findings_per_item" {
  type = number
}

# How to sample rows if not all rows are scanned. Meaningful only when used in conjunction with
# either rows_limit or rows_limit_percent. If not specified, rows are scanned in the order
# BigQuery reads them.
#
# SAMPLE_METHOD_UNSPECIFIED = 0
# TOP = 1
# RANDOM_START = 2
variable "dlp_sampling_method" {
  type = number

  # Reject values outside the enum listed above at plan time.
  validation {
    condition     = contains([0, 1, 2], var.dlp_sampling_method)
    error_message = "The dlp_sampling_method must be 0 (SAMPLE_METHOD_UNSPECIFIED), 1 (TOP) or 2 (RANDOM_START)."
  }
}

variable "gcs_flags_bucket_name" {
  type = string
}

# Dispatcher settings.
variable "dispatcher_service_timeout_seconds" {
  type = number
}

variable "dispatcher_subscription_ack_deadline_seconds" {
  type = number
}

variable "dispatcher_subscription_message_retention_duration" {
  type = string
}

# Inspector settings.
variable "inspector_service_timeout_seconds" {
  type = number
}

variable "inspector_subscription_ack_deadline_seconds" {
  type = number
}

variable "inspector_subscription_message_retention_duration" {
  type = string
}

variable "default_labels" {
  type = map(string)
}