├── requirements.txt ├── images ├── dbt_datacatalog.png ├── DataCatalog_dbt-tag.png └── DataCatalog_dbt-tag-template.png ├── config.py ├── tag_template ├── dbt_metadata_tag_template.txt ├── gcloud_dbt_tag_template_create.sh └── dbt_metadata_tag_template.json ├── datacatalog_functions.py ├── main.py ├── README.md └── dbt_metadata.py /requirements.txt: -------------------------------------------------------------------------------- 1 | # Function dependencies, for example: 2 | # package>=version 3 | google-cloud-datacatalog 4 | datetime 5 | -------------------------------------------------------------------------------- /images/dbt_datacatalog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbt-content/google-datacatalog-dbt-tag/HEAD/images/dbt_datacatalog.png -------------------------------------------------------------------------------- /images/DataCatalog_dbt-tag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbt-content/google-datacatalog-dbt-tag/HEAD/images/DataCatalog_dbt-tag.png -------------------------------------------------------------------------------- /images/DataCatalog_dbt-tag-template.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbt-content/google-datacatalog-dbt-tag/HEAD/images/DataCatalog_dbt-tag-template.png -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | 2 | from google.cloud import datacatalog 3 | 4 | dbt_auth_token='xxxxxxxxxxxxxxxxxx' 5 | dbt_headers = {"Authorization": "Token "+dbt_auth_token} 6 | 7 | dbt_metadata_tag_template_id="dbt_metadata" 8 | dbt_tag_template_project="dbt-test-301310" 9 | 10 | datacatalog_client = datacatalog.DataCatalogClient() -------------------------------------------------------------------------------- /tag_template/dbt_metadata_tag_template.txt: -------------------------------------------------------------------------------- 1 | dbt_run_id dbt Run id STRING 2 | dbt_run_timestamp dbt Run timestamp DATETIME 3 | dbt_duration dbt Duration STRING 4 | dbt_run_duration dbt Run duration STRING 5 | dbt_run_url dbt Run url STRING 6 | dbt_project_name dbt Project name STRING 7 | dbt_model_name dbt Model name STRING 8 | dbt_sql_run_url dbt SQL run url STRING 9 | dbt_job_id dbt Job id STRING 10 | dbt_job_name dbt Job name STRING 11 | dbt_job_url dbt Job url STRING 12 | dbt_cloud_project_id dbt Cloud Project id STRING 13 | dbt_cloud_project_name dbt Cloud Project name STRING 14 | dbt_cloud_project_url dbt Cloud Project url STRING 15 | approximate_bytes_size Approximate bytes size STRING 16 | approximate_rows_count Approximate rows count STRING 17 | -------------------------------------------------------------------------------- /datacatalog_functions.py: -------------------------------------------------------------------------------- 1 | 2 | from google.cloud import datacatalog 3 | from config import datacatalog_client 4 | import config 5 | 6 | # --------------------------------------------------------------- 7 | # -------------GET DBT METADATA TAG TEMPLATE ------------------- 8 | # --------------------------------------------------------------- 9 | 10 | def get_dbt_tag_template(): 11 | 12 | scope = datacatalog.SearchCatalogRequest.Scope() 13 | scope.include_project_ids.append(config.dbt_tag_template_project) 14 | 15 | tag_templates = datacatalog_client.search_catalog(scope=scope, query='type=tag_template name:'+config.dbt_metadata_tag_template_id) 16 | 17 | for tag_template in tag_templates: 18 | dbt_metadata_tag_template_name=tag_template.relative_resource_name 19 | 20 | #print('dbt Metadata tag template name : {}'.format(dbt_metadata_tag_template_name)) 21 | 22 | return (dbt_metadata_tag_template_name) 23 | 24 | # ------------------------------------------------------------- 25 | # ------------- GET BIGQUERY ENTRY ID ------------------------- 26 | # ------------------------------------------------------------- 27 | 28 | def get_bq_entry_name(project,dataset,name): 29 | 30 | resource_name = '//bigquery.googleapis.com/projects/{}/datasets/{}/tables/{}'.format(project,dataset,name) 31 | bq_entry = datacatalog_client.lookup_entry(request={"linked_resource": resource_name}) 32 | 33 | #print('BigQuery entry name : {}'.format(bq_entry.name)) 34 | 35 | return (bq_entry.name) -------------------------------------------------------------------------------- /tag_template/gcloud_dbt_tag_template_create.sh: -------------------------------------------------------------------------------- 1 | gcloud --project data-catalog tag-templates create dbt_metadata \ 2 | --location=us-central1 \ 3 | --display-name="dbt Metadata" \ 4 | --field=id=dbt_run_id,display-name="dbt Run id",type=string,required=TRUE \ 5 | --field=id=dbt_run_timestamp,display-name="dbt Run timestamp",type=timestamp,required=TRUE \ 6 | --field=id=dbt_duration,display-name="dbt Duration",type=string\ 7 | --field=id=dbt_run_duration,display-name="dbt Run duration",type=string\ 8 | --field=id=dbt_run_url,display-name="dbt Run url",type=string\ 9 | --field=id=dbt_project_name,display-name="dbt Project name",type=string\ 10 | --field=id=dbt_model_name,display-name="dbt Model name",type=string\ 11 | --field=id=dbt_sql_run_url,display-name="dbt SQL run url",type=string\ 12 | --field=id=dbt_job_id,display-name="dbt Job id",type=string\ 13 | --field=id=dbt_job_name,display-name="dbt Job name",type=string\ 14 | --field=id=dbt_job_url,display-name="dbt Job url",type=string\ 15 | --field=id=dbt_cloud_project_id,display-name="dbt Cloud Project id",type=string\ 16 | --field=id=dbt_cloud_project_name,display-name="dbt Cloud Project name",type=string\ 17 | --field=id=dbt_cloud_project_url,display-name="dbt Cloud Project url",type=string\ 18 | --field=id=approximate_bytes_size,display-name="Approximate bytes size",type=string\ 19 | --field=id=approximate_rows_count,display-name="Approximate rows count",type=string -------------------------------------------------------------------------------- /tag_template/dbt_metadata_tag_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "displayName":"dbt Metadata", 3 | "fields":{ 4 | "dbt_run_id":{ 5 | "displayName":"dbt Run id", 6 | "isRequired": "true", 7 | "type":{ 8 | "primitiveType":"STRING" 9 | } 10 | }, 11 | "dbt_run_timestamp":{ 12 | "displayName":"dbt Run timestamp", 13 | "isRequired": "true", 14 | "type":{ 15 | "primitiveType":"DATETIME" 16 | } 17 | }, 18 | "dbt_duration":{ 19 | "displayName":"dbt Duration", 20 | "isRequired": "false", 21 | "type":{ 22 | "primitiveType":"STRING" 23 | } 24 | }, 25 | "dbt_run_duration":{ 26 | "displayName":"dbt Run duration", 27 | "isRequired": "false", 28 | "type":{ 29 | "primitiveType":"STRING" 30 | } 31 | }, 32 | "dbt_run_url":{ 33 | "displayName":"dbt Run url", 34 | "isRequired": "false", 35 | "type":{ 36 | "primitiveType":"STRING" 37 | } 38 | }, 39 | "dbt_project_name":{ 40 | "displayName":"dbt Project name", 41 | "isRequired": "false", 42 | "type":{ 43 | "primitiveType":"STRING" 44 | } 45 | }, 46 | "dbt_model_name":{ 47 | "displayName":"dbt Model name", 48 | "isRequired": "false", 49 | "type":{ 50 | "primitiveType":"STRING" 51 | } 52 | }, 53 | "dbt_sql_run_url":{ 54 | "displayName":"dbt SQL run url", 55 | "isRequired": "false", 56 | "type":{ 57 | "primitiveType":"STRING" 58 | } 59 | }, 60 | "dbt_job_id":{ 61 | "displayName":"dbt Job id", 62 | "isRequired": "false", 63 | "type":{ 64 | "primitiveType":"STRING" 65 | } 66 | }, 67 | "dbt_job_name":{ 68 | "displayName":"dbt Job name", 69 | "isRequired": "false", 70 | "type":{ 71 | "primitiveType":"STRING" 72 | } 73 | }, 74 | "dbt_job_url":{ 75 | "displayName":"dbt Job url", 76 | "isRequired": "false", 77 | "type":{ 78 | "primitiveType":"STRING" 79 | } 80 | }, 81 | "dbt_cloud_project_id":{ 82 | "displayName":"dbt Cloud Project id", 83 | "isRequired": "false", 84 | "type":{ 85 | "primitiveType":"STRING" 86 | } 87 | }, 88 | "dbt_cloud_project_name":{ 89 | "displayName":"dbt Cloud Project name", 90 | "isRequired": "false", 91 | "type":{ 92 | "primitiveType":"STRING" 93 | } 94 | }, 95 | "dbt_cloud_project_url":{ 96 | "displayName":"dbt Cloud Project url", 97 | "isRequired": "false", 98 | "type":{ 99 | "primitiveType":"STRING" 100 | } 101 | }, 102 | "approximate_bytes_size":{ 103 | "displayName":"Approximate bytes size", 104 | "isRequired": "false", 105 | "type":{ 106 | "primitiveType":"STRING" 107 | } 108 | }, 109 | "approximate_rows_count":{ 110 | "displayName":"Approximate rows count", 111 | "isRequired": "false", 112 | "type":{ 113 | "primitiveType":"STRING" 114 | } 115 | } 116 | } 117 | } -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from google.cloud import datacatalog 2 | from datetime import datetime 3 | from google.protobuf.timestamp_pb2 import Timestamp 4 | from config import datacatalog_client 5 | import dbt_metadata 6 | import datacatalog_functions 7 | import requests 8 | import json 9 | import config 10 | 11 | def dbt_update_datacatalog(request): 12 | 13 | request_json = request.get_json() 14 | if request_json and 'dbt_run_id' in request_json and 'dbt_account_id' in request_json: 15 | dbt_run_id = request_json['dbt_run_id'] 16 | dbt_account_id = request_json['dbt_account_id'] 17 | else: 18 | return 'No dbt Run ID or dbt Account ID for Data Catalog update' 19 | 20 | # --------------------------------------------------------------------------------------------------------------------- 21 | # ----------------------------------------------------------------------------------------------------------------- 22 | 23 | dbt_run_metadata = dbt_metadata.get_dbt_run(dbt_account_id,dbt_run_id) 24 | print('dbt Run Metadata : {}'.format(dbt_run_metadata)) 25 | 26 | dbt_catalog = dbt_metadata.get_dbt_catalog(dbt_account_id,dbt_run_id) 27 | print('dbt Catalog : {}'.format(dbt_catalog)) 28 | 29 | dbt_tag_template = datacatalog_functions.get_dbt_tag_template() 30 | 31 | # ----------- Loop on Model found in Catalog and update Data Catalog dbt tag for each table or view --------------------------- 32 | 33 | for model in dbt_catalog: 34 | 35 | # ---------------Get BigQuery entry id for the table or the view ------------------------------ 36 | 37 | bq_entry_name = datacatalog_functions.get_bq_entry_name(model["bq_projet"],model["bq_dataset"],model["bq_object_name"]) 38 | 39 | # ----------------------------------------------------------------------------------------------- 40 | # ------------- LIST TAGS OF BIGQUERY ENTRY AND SEARCH IF A DBT TAG EXIST ----------------------- 41 | # ----------------------------------------------------------------------------------------------- 42 | 43 | dbt_tag_template_found=False 44 | for entry_tag in datacatalog_client.list_tags(parent=bq_entry_name): 45 | 46 | if entry_tag.template==dbt_tag_template: 47 | dbt_tag_template_found=True 48 | dbt_entry_tag_name=entry_tag.name 49 | break 50 | 51 | # ------------------------------------------------------------------------------- 52 | # ------------- UPDATE OR CREATE A TAG ON ENTRY TABLE OR VIEW ------------------- 53 | # ------------------------------------------------------------------------------- 54 | 55 | tag = datacatalog.Tag() 56 | dbt_run_timestamp = Timestamp() 57 | 58 | # ------------- Tag creation for Run Metadata ---------------------- 59 | 60 | for key in dbt_run_metadata: 61 | 62 | value = dbt_run_metadata[key] 63 | tag_field = datacatalog.TagField() 64 | 65 | if key=="dbt_run_timestamp": 66 | dbt_run_timestamp.FromDatetime(datetime.strptime(value, "%Y-%m-%d %H:%M:%S.%f+00:00")) 67 | tag_field.timestamp_value = dbt_run_timestamp 68 | else: 69 | tag_field.string_value = value 70 | 71 | tag.fields[key] = tag_field 72 | 73 | # ------------- Tag creation for Model Metadata ---------------------- 74 | 75 | for key in model["tag"]: 76 | 77 | value = model["tag"][key] 78 | tag_field = datacatalog.TagField() 79 | tag_field.string_value = value 80 | tag.fields[key] = tag_field 81 | 82 | if dbt_tag_template_found: 83 | 84 | # ------------- UPDATE AN EXISTING DBT TAG ON THE BIGQUERY ENTRY---------------------- 85 | tag.name=dbt_entry_tag_name 86 | tag = datacatalog_client.update_tag(tag=tag) 87 | 88 | else: 89 | 90 | # ------------- CREATE A NEW DBTTAG ON THE BIGQUERY ENTRY ---------------------- 91 | tag.template = dbt_tag_template 92 | tag=datacatalog_client.create_tag(parent=bq_entry_name, tag=tag) 93 | 94 | return "{} Data Catalog dbt tags updated for run {}".format(len(dbt_catalog),dbt_run_id) 95 | 96 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Google Cloud Data Catalog and dbt 2 | 3 | 4 | 5 | Example to create or update a [Google Cloud Data Catalog](https://cloud.google.com/data-catalog/) tag on BigQuery tables or views with [dbt Cloud](https://cloud.getdbt.com/) metadata via a Python [Cloud Function](https://cloud.google.com/functions). 6 | 7 | Data Catalog tag : **[dbt Run Metadata](https://github.com/dbt-content/google-datacatalog-dbt-tag/blob/main/tag_template/dbt_metadata_tag_template.txt)** tag attached to the BigQuery table or view and containing information from the dbt Run used to create or update the BigQuery table or view : Run durations and date, dbt Project and Model, Cloud job, Cloud project and approximative size and rows count. 8 | 9 | To activate, learn and use Cloud Data Catalog, go to [https://cloud.google.com/data-catalog](https://cloud.google.com/data-catalog) and [https://console.cloud.google.com/datacatalog](https://console.cloud.google.com/datacatalog). 10 | 11 | This repository contains the Cloud Function Python code to create or update the Data Catalog tag. 12 | 13 | This Cloud Function uses: 14 | - [Python Client for Google Cloud Data Catalog API](https://googleapis.dev/python/datacatalog/latest/index.html#) 15 | - [dbt Cloud API](https://docs.getdbt.com/dbt-cloud/api/) 16 | 17 | In your Cloud Function, you need the 5 files: 18 | - [main.py](https://github.com/dbt-content/google-datacatalog-dbt-tag/blob/main/main.py) 19 | - [config.py](https://github.com/dbt-content/google-datacatalog-dbt-tag/blob/main/config.py) where you need to update your **GCP project name** (where the dbt Tag Template is created) and the **[dbt Auth Token](https://docs.getdbt.com/dbt-cloud/api/#section/Authentication/TokenAuth)** (to use dbt Cloud API). You can also update the tag template ID if needed. 20 | - [datacatalog_functions.py](https://github.com/dbt-content/google-datacatalog-dbt-tag/blob/main/datacatalog_functions.py) 21 | - [dbt_metadata.py](https://github.com/dbt-content/google-datacatalog-dbt-tag/blob/main/dbt_metadata.py) 22 | - [requirements.txt](https://github.com/dbt-content/google-datacatalog-dbt-tag/blob/main/requirements.txt) 23 | 24 | 25 | Before runing the Cloud Function (and create or update tags), you need to create the Data Catalog Tag Template for [dbt Run Metadata](https://github.com/dbt-content/google-datacatalog-dbt-tag/blob/main/tag_template/dbt_metadata_tag_template.txt). 26 | 27 | You can use: 28 | 29 | - **Cloud Console** where you can [manage your Tag Templates](https://console.cloud.google.com/datacatalog?q=type%3DTAG_TEMPLATE) 30 | 31 | - **gcloud** and the command `gcloud data-catalog tag-templates create`, full command lines in [gcloud_dbt_tag_template_create.sh](https://github.com/dbt-content/google-datacatalog-dbt-tag/blob/main/tag_template/gcloud_dbt_tag_template_create.sh), more details with and [example](https://cloud.google.com/data-catalog/docs/quickstart-tagging#data-catalog-quickstart-gcloud) and [reference](https://cloud.google.com/sdk/gcloud/reference/data-catalog/tag-templates/create). But be aware that with gcloud command line, you cannot manage template tag fields's order, fields will be in alphabetical order. 32 | 33 | - **REST API** with the tag template json file [dbt_metadata_tag_template.json](https://github.com/dbt-content/google-datacatalog-dbt-tag/blob/main/tag_template/dbt_metadata_tag_template.json), more details with an [example](https://cloud.google.com/data-catalog/docs/quickstart-tagging#data-catalog-quickstart-drest) and [reference](https://cloud.google.com/data-catalog/docs/reference/rest/v1/projects.locations.tagTemplates/create). 34 | 35 | To use the Cloud Function you just have to pass the dbt Cloud Run ID and the dbt Cloud Account ID in a JSON format like ```{"dbt_run_id":"13161733","dbt_account_id":"11442"}```. 36 | 37 | When the Data Catalog template tag is created and when a tag is created or updated on BigQuery tables or views, you can find all results from [https://console.cloud.google.com/datacatalog](https://console.cloud.google.com/datacatalog). 38 | 39 | 40 | Finally, you can also search BigQuery tables or views in Cloud Data Catalog with a dbt tag from your own application like [https://github.com/dbt-content/dbt-datacatalog-explorer](https://github.com/dbt-content/dbt-datacatalog-explorer) 41 | 42 |
43 | Happy tagging ! 44 |


45 | 46 | ![image](images/DataCatalog_dbt-tag.png) 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /dbt_metadata.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import config 4 | 5 | # ---------------------------------------------------------------------------------- 6 | # ----------- FUNCTION TO GET ARRAY OF TABLES AND VIEWS UPDATED BY THE MODEL 7 | #----------------------------------------------------------------------------------- 8 | 9 | def get_dbt_catalog(account_id,run_id): 10 | 11 | dbt_catalog_endpoint = "https://cloud.getdbt.com/api/v2/accounts/{}/runs/{}/artifacts/catalog.json".format(account_id,run_id) 12 | 13 | resp = requests.get( 14 | url=dbt_catalog_endpoint, 15 | headers=config.dbt_headers 16 | ) 17 | catalog=resp.json() 18 | 19 | #print('Status Code Get Catalog: {}'.format(resp.status_code)) 20 | #print('Result : {}'.format(catalog)) 21 | 22 | dbt_catalog=[] 23 | 24 | # ------- Loop on nodes of Catalog artifact -------------------------------------- 25 | 26 | for node_key in catalog["nodes"]: 27 | 28 | unique_id_array=catalog["nodes"][node_key]["unique_id"].split(".") 29 | dbt_project_name=unique_id_array[1] 30 | dbt_model_name=unique_id_array[2] 31 | 32 | bq_object_name=catalog["nodes"][node_key]["metadata"]["name"] 33 | bq_dataset=catalog["nodes"][node_key]["metadata"]["schema"] 34 | bq_projet=catalog["nodes"][node_key]["metadata"]["database"] 35 | 36 | artifact_sql_run="https://cloud.getdbt.com/api/v2/accounts/{}/runs/{}/artifacts/run/{}/models/{}.sql".format(account_id,run_id,dbt_project_name,bq_object_name) 37 | 38 | if catalog["nodes"][node_key]["stats"]["has_stats"]["value"]: 39 | approximate_bytes_size=catalog["nodes"][node_key]["stats"]["num_bytes"]["value"] 40 | approximate_rows_count=catalog["nodes"][node_key]["stats"]["num_rows"]["value"] 41 | else: 42 | approximate_bytes_size="na" 43 | approximate_rows_count="na" 44 | 45 | dbt_model = { 46 | "bq_object_name":bq_object_name, 47 | "bq_projet":bq_projet, 48 | "bq_dataset":bq_dataset, 49 | "tag":{ 50 | "approximate_bytes_size":str(approximate_bytes_size), 51 | "approximate_rows_count":str(approximate_rows_count), 52 | "dbt_project_name":dbt_project_name, 53 | "dbt_model_name":dbt_model_name, 54 | "dbt_sql_run_url":artifact_sql_run 55 | } 56 | } 57 | 58 | dbt_catalog.append(dbt_model) 59 | 60 | return dbt_catalog 61 | 62 | # ------------------------------------------------------------------------ 63 | # ----------- FUNCTION TO GET METADATA FROM THE RUN 64 | #------------------------------------------------------------------------- 65 | 66 | def get_dbt_run(account_id,run_id): 67 | 68 | # ------- Get dbt Cloud account name -------------------------------------- 69 | 70 | dbt_endpoint = "https://cloud.getdbt.com/api/v2/accounts/{}/".format(account_id) 71 | resp = requests.get( 72 | url=dbt_endpoint, 73 | headers=config.dbt_headers 74 | ) 75 | dbt_object=resp.json() 76 | account_name=dbt_object["data"]["name"] 77 | 78 | # ------- Get dbt Cloud run -------------------------------------- 79 | 80 | dbt_endpoint = "https://cloud.getdbt.com/api/v2/accounts/{}/runs/{}/".format(account_id,run_id) 81 | resp = requests.get( 82 | url=dbt_endpoint, 83 | headers=config.dbt_headers 84 | ) 85 | dbt_object=resp.json() 86 | 87 | #print('Status Code Get Run: {}'.format(resp.status_code)) 88 | #print('Result : {}'.format(dbt_object)) 89 | 90 | project_id=dbt_object["data"]["project_id"] 91 | job_id=dbt_object["data"]["job_id"] 92 | run_url=dbt_object["data"]["href"] 93 | duration=dbt_object["data"]["duration_humanized"] 94 | run_duration=dbt_object["data"]["run_duration_humanized"] 95 | run_finished_at=dbt_object["data"]["finished_at"] 96 | 97 | # ------- Get dbt Cloud project name -------------------------------------- 98 | 99 | dbt_endpoint = "https://cloud.getdbt.com/api/v2/accounts/{}/projects/{}/".format(account_id,project_id) 100 | resp = requests.get( 101 | url=dbt_endpoint, 102 | headers=config.dbt_headers 103 | ) 104 | dbt_object=resp.json() 105 | project_name=dbt_object["data"]["name"] 106 | 107 | # ------- Get dbt Cloud job name -------------------------------------- 108 | 109 | dbt_endpoint = "https://cloud.getdbt.com/api/v2/accounts/{}/jobs/{}/".format(account_id,job_id) 110 | resp = requests.get( 111 | url=dbt_endpoint, 112 | headers=config.dbt_headers 113 | ) 114 | dbt_object=resp.json() 115 | job_name=dbt_object["data"]["name"] 116 | 117 | project_url="https://cloud.getdbt.com/#/accounts/{}/projects/{}/dashboard/".format(account_id,project_id) 118 | job_url="https://cloud.getdbt.com/#/accounts/{}/projects/{}/jobs/{}/".format(account_id,project_id,job_id) 119 | 120 | dbt_metadata = { 121 | "dbt_run_id": str(run_id), 122 | "dbt_run_timestamp":run_finished_at, 123 | "dbt_duration": duration, 124 | "dbt_run_duration": run_duration, 125 | "dbt_run_url": run_url, 126 | "dbt_job_id": str(job_id), 127 | "dbt_job_name": job_name, 128 | "dbt_job_url": job_url, 129 | "dbt_cloud_project_id": str(project_id), 130 | "dbt_cloud_project_name": project_name, 131 | "dbt_cloud_project_url": project_url 132 | } 133 | 134 | return dbt_metadata --------------------------------------------------------------------------------