├── BigQuery
│   ├── BigQuery_partition_demo.sql
│   ├── bq_procedures.sql
│   ├── bq_table_clusters.sql
│   ├── data_lineage_demo.sql
│   ├── gcp_billingdata_analysis.sql
│   └── table_function.sql
├── CloudFunctions
│   ├── bq_events_to_gcs_cf.py
│   ├── gcs_events_cf.py
│   ├── pubsub_events_cf.py
│   └── sheets_to_bigquery_dataload_cf.py
├── CloudSQL
│   ├── cloudsql_python_connect.py
│   └── federated_query_demo.sql
├── CloudSpanner
│   ├── cloud_spanner_intro_demo.sql
│   └── spanner_python_client_demo.py
├── Common_Realtime_Usecases
│   ├── dataflow_spanner_demo
│   │   ├── beam_dataflow-to-spanner.py
│   │   ├── census-db-schema.sql
│   │   ├── census_100_testing.csv
│   │   ├── create-spanner-database-CLI.txt
│   │   └── installations_beam.sh
│   └── iam_snapshots.py
├── Composer
│   ├── airflow_dataproc_automate_dag.py
│   ├── dataflow_python_operator_dag.py
│   ├── gcs_to_bq_and_bq_operators.py
│   ├── https_operators_demo_dag.py
│   └── python_bash_operators_dag.py
├── Dataflow
│   ├── batch_etl_avro_data_cloudsql.py
│   ├── beam_stream_data_process.py
│   ├── dataflow_batch_demo.py
│   ├── dataflow_batch_log_process.py
│   └── process_nested_data_sql_demo.py
├── Dataproc
│   └── pyspark_bq_to_gcs_demo.py
├── GCP_Data_Eng_concept_files
│   ├── GCP_storage_db.png
│   ├── Initfile.txt
│   └── gcp_etl_services.png
├── GCS
│   ├── object_lifecycle_mngmnt_cli.txt
│   └── object_lifecylce_mngmnt.py
├── GoogleCloudStorage
│   ├── gcs_python_client_demo.py
│   └── object_versioning_cli.txt
├── README.md
└── Security
    └── secretmanager_python_connect.py
/BigQuery/BigQuery_partition_demo.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Author : @ Anjan GCP Data Engineering
3 | 
4 | SQLs created to demo BigQuery table partitioning:
5 | 1. TIME UNIT (MONTHLY)
6 | 2. INTEGER RANGE
7 | 3. INGESTION TIME UNIT
8 | 
9 | */
10 | 
11 | /************** Time Unit Partitioning *******************/
12 | 
13 | -- Query this table to understand the data distribution across different dates
14 | 
15 | SELECT min(start_time), max(start_time) FROM `gcp-data-eng-374308.bigquery_demos.bikeshare_trips`;
16 | 
17 | select DATE_TRUNC(start_time, DAY) as day, count(*) from `gcp-data-eng-374308.bigquery_demos.bikeshare_trips`
18 | group by 1 order by 1;
19 | 
20 | select DATE_TRUNC(start_time, MONTH) as month, count(*) from `gcp-data-eng-374308.bigquery_demos.bikeshare_trips`
21 | group by 1 order by 1;
22 | 
23 | select DATE_TRUNC(start_time, YEAR) as year, count(*) from `gcp-data-eng-374308.bigquery_demos.bikeshare_trips`
24 | group by 1 order by 1;
25 | 
26 | --Create MONTHLY partitioned table based on a TIME UNIT column
27 | create or replace table bigquery_demos.bikeshare_trips_p
28 | (
29 |   trip_id INT64,
30 |   subscriber_type STRING,
31 |   bikeid STRING,
32 |   start_time TIMESTAMP,
33 |   start_station_id INT64,
34 |   start_station_name STRING,
35 |   end_station_id STRING,
36 |   end_station_name STRING,
37 |   duration_minutes INT64
38 | )
39 | PARTITION BY
40 |   TIMESTAMP_TRUNC(start_time, MONTH);
41 | 
42 | --Create a DAILY partitioned table using a SQL query result
43 | 
44 | create or replace table bigquery_demos.bikeshare_trips_sql
45 | (
46 |   trip_id INT64,
47 |   subscriber_type STRING,
48 |   bikeid STRING,
49 |   start_time TIMESTAMP,
50 |   start_station_id INT64,
51 |   start_station_name STRING,
52 |   end_station_id STRING,
53 |   end_station_name STRING,
54 |   duration_minutes INT64
55 | )
56 | PARTITION BY
57 |   TIMESTAMP_TRUNC(start_time, DAY)
58 | AS (SELECT *
59 |     FROM `gcp-data-eng-374308.bigquery_demos.bikeshare_trips`);
60 | 
61 | --Insert data into the partitioned table
62 | insert into bigquery_demos.bikeshare_trips_p
63 | select * from bigquery_demos.bikeshare_trips;
64 | 
65 | -- Query the non-partitioned table
66 | select * from bigquery_demos.bikeshare_trips
67 | where start_time > '2020-12-01 00:00:00 UTC';
68 | 
69 | -- Query the partitioned table and see the difference
70 | select * from bigquery_demos.bikeshare_trips_p
71 | where start_time > '2020-12-01 00:00:00 UTC';
72 | 
73 | 
74 | /************** Integer Range Partitioning *******************/
75 | 
76 | -- Query this table to understand the data distribution across the INTEGER type column
77 | SELECT id,
78 |        text,
79 |        score,
80 |        creation_date
81 | FROM `bigquery-public-data.stackoverflow.comments`;
82 | 
83 | --Create partitioned table
84 | create or replace table bigquery_demos.stackoverflow_comments_p
85 | (
86 |   id INT64,
87 |   text STRING,
88 |   score INT64,
89 |   creation_date TIMESTAMP
90 | )
91 | partition by RANGE_BUCKET(id, GENERATE_ARRAY(0, 140390264, 100000));
92 | 
93 | --Insert data into the partitioned table
94 | insert into bigquery_demos.stackoverflow_comments_p
95 | SELECT id,
96 |        text,
97 |        score,
98 |        creation_date
99 | FROM `bigquery-public-data.stackoverflow.comments`;
100 | 
101 | 
102 | --Query the non-partitioned table
103 | SELECT id,
104 |        text,
105 |        score,
106 |        creation_date
107 | FROM `bigquery-public-data.stackoverflow.comments`
108 | where id between 1000 and 100000;
109 | 
110 | --Query the partitioned table
111 | SELECT id,
112 |        text,
113 |        score,
114 |        creation_date
115 | FROM `bigquery_demos.stackoverflow_comments_p`
116 | where id between 1000 and 100000;
117 | 
118 | /************** Data Ingestion Time Unit Partitioning *******************/
119 | 
120 | --See the data distribution across HOUR/DAY/MONTH/YEAR
121 | SELECT TIMESTAMP_TRUNC(trip_start_timestamp, HOUR), count(*)
122 | FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
123 | where trip_start_timestamp > '2021-10-01 18:15:00 UTC'
124 | group by 1
125 | order by 1 desc;
126 | 
127 | --Create a partitioned table based on ingestion time, with HOUR as the partitioning granularity
128 | create or replace table bigquery_demos.taxi_trips
129 | (
130 |   unique_key STRING,
131 |   taxi_id STRING,
132 |   trip_start_timestamp TIMESTAMP,
133 |   trip_end_timestamp TIMESTAMP,
134 |   trip_seconds INT64,
135 |   trip_miles FLOAT64,
136 |   pickup_census_tract INT64,
137 |   dropoff_census_tract INT64,
138 |   pickup_community_area INT64,
139 |   dropoff_community_area INT64,
140 |   fare FLOAT64,
141 |   tips FLOAT64,
142 |   tolls FLOAT64,
143 |   extras FLOAT64,
144 |   trip_total FLOAT64,
145 |   payment_type STRING,
146 |   company STRING,
147 |   pickup_latitude FLOAT64,
148 |   pickup_longitude FLOAT64,
149 |   pickup_location STRING,
150 |   dropoff_latitude FLOAT64,
151 |   dropoff_longitude FLOAT64,
152 |   dropoff_location STRING
153 | )
154 | PARTITION BY
155 |   TIMESTAMP_TRUNC(_PARTITIONTIME, HOUR)
156 | OPTIONS (
157 |   partition_expiration_days = 3,
158 |   require_partition_filter = TRUE);
159 | 
160 | -- Query the partitioned table
161 | SELECT
162 |   *
163 | FROM
164 |   bigquery_demos.taxi_trips
165 | WHERE
166 |   _PARTITIONTIME > TIMESTAMP_SUB(TIMESTAMP('2016-04-15'), INTERVAL 2 HOUR);
167 | 
168 | SELECT
169 |   *
170 | FROM
171 |   bigquery_demos.taxi_trips
172 | WHERE
173 |   _PARTITIONTIME BETWEEN TIMESTAMP('2016-04-14') AND TIMESTAMP('2016-04-15');
174 | 
175 | -- If you want to update the partition filter requirement or the expiration, use the DDLs below
176 | 
177 | ALTER TABLE bigquery_demos.taxi_trips
178 | SET OPTIONS (
179 |   -- Sets partition expiration to 5 days
180 |   partition_expiration_days = 5,
181 |   require_partition_filter = false);
182 | 
183 | 
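184 | 
185 | /*
186 | Optional add-on (not part of the original demo): a quick way to confirm that the
187 | partitioned tables created above actually contain partitions, and to see how the
188 | data is spread across them, is the INFORMATION_SCHEMA.PARTITIONS view. This is
189 | only a sketch; it assumes the bigquery_demos dataset and the table names used
190 | earlier in this script.
191 | */
192 | SELECT
193 |   table_name,
194 |   partition_id,
195 |   total_rows,
196 |   total_logical_bytes
197 | FROM
198 |   bigquery_demos.INFORMATION_SCHEMA.PARTITIONS
199 | WHERE
200 |   table_name IN ('bikeshare_trips_p', 'stackoverflow_comments_p', 'taxi_trips')
201 | ORDER BY
202 |   table_name, partition_id;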
--------------------------------------------------------------------------------
/BigQuery/bq_procedures.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Author : @ Anjan GCP Data Engineering
3 | SQLs created for the BigQuery Procedures and Anonymous Blocks demo
4 | */
5 | 
6 | -- Metadata view which gives table metadata details like row count, size, etc.
7 | 
8 | SELECT * FROM austin_crime.__TABLES__;
9 | 
10 | -- Query to get all table metadata details with formatted results
11 | SELECT
12 |   dataset_id AS dataset_name,
13 |   table_id AS table_name,
14 |   current_date AS stats_collect_date,
15 |   row_count AS record_count,
16 |   TIMESTAMP_MILLIS(last_modified_time) AS last_modified_time,
17 |   size_bytes/POW(10,9) AS size_in_gb
18 | FROM
19 |   `gcp-data-eng-374308`.austin_crime.__TABLES__
20 | WHERE
21 |   type=1;
22 | 
23 | -- Table to capture the stats such as table row count, size, etc.
24 | CREATE OR REPLACE TABLE
25 |   analysis.table_stats ( dataset_name STRING,
26 |     table_name STRING,
27 |     stats_collect_date DATE,
28 |     record_count INT64,
29 |     last_modified_time TIMESTAMP,
30 |     size_in_gb FLOAT64 );
31 | 
32 | select * from analysis.table_stats;
33 | 
34 | /*************************************************************************************/
35 | 
36 | /*
37 | Author : @ Anjan GCP Data Engineering
38 | 
39 | Anonymous block to capture table stats (row count, size, last modified time, etc.) for all the
40 | tables in a project (ALL datasets)
41 | 
42 | Steps:
43 | 1. Loop to iterate through all datasets in a BigQuery project
44 | 2. Delete the data if any already exists for the DATE on which this block is run
45 | 3. Construct dynamic SQL for each dataset to get the stats and insert them into the resultant table --> table_stats
46 | 4. Execute the dynamic SQL to capture the actual results
47 | 
48 | Created for the BigQuery Procedures and Anonymous Blocks demo
49 | 
50 | */
51 | 
52 | #standardSQL
53 | DECLARE DATASETS_TO_CHECK ARRAY<STRING>;
54 | DECLARE i INT64 DEFAULT 0;
55 | DECLARE Dataset STRING ;
56 | declare Qry string;
57 | 
58 | SET DATASETS_TO_CHECK = (
59 |   WITH req_datasets as
60 |   ( select schema_name
61 |     from `gcp-data-eng-374308`.INFORMATION_SCHEMA.SCHEMATA
62 |   )
63 |   SELECT ARRAY_AGG(schema_name) from req_datasets
64 | );
65 | 
66 | LOOP SET i = i + 1;
67 | BEGIN
68 | IF i > ARRAY_LENGTH(DATASETS_TO_CHECK) THEN
69 |   LEAVE;
70 | END IF;
71 | 
72 | delete from analysis.table_stats where dataset_name=DATASETS_TO_CHECK[ORDINAL(i)] and stats_collect_date = current_date;
73 | set Qry =CONCAT("insert analysis.table_stats select dataset_id as dataset_name,table_id as table_name,current_date as stats_collect_date, row_count as record_count,TIMESTAMP_MILLIS(last_modified_time) AS last_modified_time,size_bytes/pow(10,9) as size_in_gb FROM `gcp-data-eng-374308`.", DATASETS_TO_CHECK[ORDINAL(i)],".__TABLES__ where type=1");
74 | 
75 | execute immediate Qry;
76 | EXCEPTION
77 |   WHEN ERROR THEN CONTINUE ;
78 | END;
79 | END LOOP;
80 | 
81 | /************************************************************************************************************************/
82 | 
83 | /*
84 | Author : @ Anjan GCP Data Engineering
85 | 
86 | Procedure that takes an INPUT comma-separated dataset list
87 | Steps:
88 | 1. FOR loop to iterate through the given dataset list
89 | 2. Delete the data if any already exists for the DATE on which this procedure is run
90 | 3. Construct dynamic SQL for each dataset to get the stats and insert them into the resultant table --> table_stats
91 | 4. Execute the dynamic SQL to capture the actual results
92 | 5. Call/execute the procedure with the CALL keyword
93 | 
94 | Created for the BigQuery Procedures and Anonymous Blocks demo
95 | */
96 | CREATE OR REPLACE PROCEDURE
97 |   analysis.sp_collect_stats(dataset_list STRING, OUT status STRING)
98 | BEGIN
99 | DECLARE qry STRING;
100 | 
101 | FOR rec IN (
102 |   SELECT
103 |     schema_name as dataset_name
104 |   FROM
105 |     `gcp-data-eng-374308`.INFORMATION_SCHEMA.SCHEMATA
106 |   WHERE
107 |     schema_name IN (SELECT * FROM UNNEST(SPLIT(dataset_list))) )
108 | DO
109 | 
110 | DELETE FROM analysis.table_stats WHERE dataset_name=rec.dataset_name AND stats_collect_date = current_date;
111 | 
112 | SET qry =CONCAT("insert analysis.table_stats select dataset_id as dataset_name,table_id as table_name,current_date as stats_collect_date,row_count as record_count,TIMESTAMP_MILLIS(last_modified_time) AS last_modified_time,size_bytes/pow(10,9) as size_in_gb FROM `gcp-data-eng-374308`.", rec.dataset_name,".__TABLES__ where type=1");
113 | 
114 | EXECUTE IMMEDIATE qry;
115 | 
116 | set status = 'SUCCESS';
117 | 
118 | END FOR;
119 | END;
120 | 
121 | --Calling the procedure
122 | begin
123 | declare out_status string;
124 | CALL analysis.sp_collect_stats('analysis,austin_crime',out_status);
125 | select out_status;
126 | end;
127 | 
--------------------------------------------------------------------------------
/BigQuery/bq_table_clusters.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Author : @ Anjan GCP Data Engineering
3 | SQLs created to demo BigQuery table clustering
4 | */
5 | -- Create clustered table
6 | CREATE OR REPLACE TABLE bigquery_demos.pageviews_cluster
7 | (
8 |   datehour TIMESTAMP,
9 |   wiki STRING,
10 |   title STRING,
11 |   views INTEGER
12 | )
13 | CLUSTER BY
14 |   wiki
15 | OPTIONS (
16 |   description = 'a table clustered by wiki');
17 | 
18 | -- Create clustered table from a SQL query
19 | CREATE OR REPLACE TABLE bigquery_demos.pageviews_cluster
20 | (
21 |   datehour TIMESTAMP,
22 |   wiki STRING,
23 |   title STRING,
24 |   views INTEGER
25 | )
26 | CLUSTER BY
27 |   wiki
28 | AS (
29 |   SELECT * FROM bigquery_demos.pageviews
30 | );
31 | 
32 | --Insert data into the clustered table (DML)
33 | insert into bigquery_demos.pageviews_cluster
34 | select * from `gcp-data-eng-374308.bigquery_demos.pageviews`;
35 | 
36 | -- Create clustered table with partitioning
37 | CREATE OR REPLACE TABLE bigquery_demos.pageviews_cluster_partition
38 | (
39 |   datehour TIMESTAMP,
40 |   wiki STRING,
41 |   title STRING,
42 |   views INTEGER
43 | )
44 | PARTITION BY TIMESTAMP_TRUNC(datehour,DAY)
45 | CLUSTER BY
46 |   wiki
47 | OPTIONS (
48 |   description = 'a table clustered by wiki and partitioned by date');
49 | 
50 | -- Insert data (DML)
51 | insert into bigquery_demos.pageviews_cluster_partition
52 | select * from `gcp-data-eng-374308.bigquery_demos.pageviews`;
53 | 
54 | 
55 | 
56 | -- Query non-clustered table
57 | SELECT * FROM `gcp-data-eng-374308.bigquery_demos.pageviews`
58 | where DATE(datehour) > "2023-03-28"
59 | and wiki ='sr.m'
60 | limit 10;
61 | 
62 | -- Query clustered table
63 | SELECT * FROM `gcp-data-eng-374308.bigquery_demos.pageviews_cluster`
64 | where DATE(datehour) > "2023-03-28"
65 | and wiki ='sr.m'
66 | limit 10;
67 | 
68 | -- Query partitioned, clustered table
69 | SELECT * FROM `gcp-data-eng-374308.bigquery_demos.pageviews_cluster_partition`
70 | where DATE(datehour) > "2023-03-28"
71 |
and wiki ='sr.m' 72 | limit 10; 73 | 74 | -------------------------------------------------------------------------------- /BigQuery/data_lineage_demo.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Author : @ Anjan GCP Data Engineering 3 | Created SQLs to demo Biqguery Datalineage 4 | */ 5 | 6 | -- Create conslolidated sales table bi joining customer and items tables 7 | create or replace table data_eng_demos.cust_item_sales_dtls as 8 | SELECT 9 | customer.fname||''||customer.lname as customer_name, 10 | items.itm_name, 11 | sales.qty, 12 | sales.price, 13 | sales.ord_date 14 | FROM 15 | `gcp-dataeng-demo-431907.data_eng_demos.customer` AS customer 16 | INNER JOIN `gcp-dataeng-demo-431907.data_eng_demos.sales` AS sales ON customer.cust_id = sales.cust_id 17 | INNER JOIN `gcp-dataeng-demo-431907.data_eng_demos.items` AS items ON sales.item_id = items.item_id; 18 | 19 | -- Create Aggregate table based on customer name 20 | create or replace table data_eng_demos.customer_agg_sales as 21 | SELECT 22 | customer_name, 23 | SUM(qty) AS tot_qty, 24 | SUM(price) AS tot_price 25 | FROM 26 | data_eng_demos.cust_item_sales_dtls 27 | GROUP BY 28 | 1; 29 | 30 | -- Create Aggregate table based on item name 31 | create or replace table data_eng_demos.item_agg_sales as 32 | SELECT 33 | itm_name, 34 | SUM(qty) AS tot_qty, 35 | SUM(price) AS tot_price 36 | FROM 37 | data_eng_demos.cust_item_sales_dtls 38 | GROUP BY 39 | 1; 40 | 41 | -------------------------------------------------------------------------------- /BigQuery/gcp_billingdata_analysis.sql: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | 4 | Author : @ Anjan GCP Data Engineering 5 | 6 | SQLs to analyze Billing data based on different dimensions 7 | 8 | */ 9 | 10 | /************** Billing main columns and plain data *****************/ 11 | SELECT 12 | invoice.month, 13 | service.description as service, 14 | usage_start_time, 15 | usage_end_time, 16 | project.name, 17 | location.region, 18 | cost, 19 | currency, 20 | usage.amount, 21 | usage.unit, 22 | (select SUM(c.amount) 23 | from UNNEST(credits) c) as credits_amount, 24 | (select STRING_AGG(c.full_name) 25 | from UNNEST(credits) c) as crdit_full_name 26 | FROM 27 | `gcp-data-eng-374308.analysis.gcp_billing_export_v1_01CBD1_B38C45_E20EEB`; 28 | 29 | /************** Total uasge cost per month without Credits *****************/ 30 | 31 | SELECT 32 | invoice.month, 33 | SUM(cost) AS total, 34 | SUM(CAST(cost AS NUMERIC)) AS total_exact 35 | FROM `gcp-data-eng-374308.analysis.gcp_billing_export_v1_01CBD1_B38C45_E20EEB` 36 | GROUP BY 1 37 | ORDER BY 1 ASC; 38 | 39 | /************** Total uasge cost per month with Credits *****************/ 40 | 41 | SELECT 42 | invoice.month, 43 | SUM(cost) 44 | + SUM(IFNULL((SELECT SUM(c.amount) 45 | FROM UNNEST(credits) c), 0)) 46 | AS total, 47 | (SUM(CAST(cost AS NUMERIC)) 48 | + SUM(IFNULL((SELECT SUM(CAST(c.amount AS NUMERIC)) 49 | FROM UNNEST(credits) AS c), 0))) 50 | AS total_exact 51 | FROM `gcp-data-eng-374308.analysis.gcp_billing_export_v1_01CBD1_B38C45_E20EEB` 52 | GROUP BY 1 53 | ORDER BY 1 ASC; 54 | 55 | /************** Total uasge cost per month group by Service without Credits *****************/ 56 | 57 | SELECT 58 | invoice.month, 59 | service.description as service, 60 | SUM(cost) AS total, 61 | (SUM(CAST(cost AS NUMERIC))) AS total_exact 62 | FROM `gcp-data-eng-374308.analysis.gcp_billing_export_v1_01CBD1_B38C45_E20EEB` 63 | GROUP BY 1,2 64 | ORDER BY 1 
ASC; 65 | 66 | /************** Total uasge cost per month group by Service with Credits *****************/ 67 | 68 | SELECT 69 | invoice.month, 70 | service.description as service, 71 | SUM(cost) 72 | + SUM(IFNULL((SELECT SUM(c.amount) 73 | FROM UNNEST(credits) c), 0)) 74 | AS total, 75 | (SUM(CAST(cost AS NUMERIC)) 76 | + SUM(IFNULL((SELECT SUM(CAST(c.amount AS NUMERIC)) 77 | FROM UNNEST(credits) AS c), 0))) 78 | AS total_exact 79 | FROM `gcp-data-eng-374308.analysis.gcp_billing_export_v1_01CBD1_B38C45_E20EEB` 80 | GROUP BY 1,2 81 | ORDER BY 1 ASC; 82 | 83 | 84 | /************** Total uasge cost for a perticular service *****************/ 85 | 86 | SELECT 87 | SUM(cost) AS cost_before_credits, 88 | labels.value AS cluster_name 89 | FROM `gcp-data-eng-374308.analysis.gcp_billing_export_resource_v1_01CBD1_B38C45_E20EEB` 90 | LEFT JOIN UNNEST(labels) as labels 91 | ON labels.key = "goog-k8s-cluster-name" 92 | GROUP BY labels.value; 93 | 94 | -------------------------------------------------------------------------------- /BigQuery/table_function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Author : @ Anjan GCP Data Engineering 3 | */ 4 | # Creating Table function 5 | CREATE OR REPLACE TABLE FUNCTION gcp_dataeng_demos.TableFunctionDemo(filter_timestap timestamp) AS 6 | ( 7 | SELECT 8 | id, 9 | owner_display_name, 10 | score 11 | FROM 12 | `bigquery-public-data.stackoverflow.posts_answers` 13 | WHERE 14 | creation_date >= TIMESTAMP(filter_timestap) 15 | AND owner_display_name IS NOT NULL 16 | ); 17 | 18 | # Query Table function 19 | SELECT 20 | * 21 | FROM 22 | gcp_dataeng_demos.TableFunctionDemo(TIMESTAMP('2022-06-01 00:00:00.000000 UTC')); 23 | 24 | # Join Table function result with other table 25 | SELECT 26 | a.id, 27 | a.owner_display_name, 28 | a.score, 29 | b.view_count, 30 | b.title 31 | FROM 32 | `bigquery-public-data.stackoverflow.posts_questions` b 33 | JOIN 34 | gcp_dataeng_demos.TableFunctionDemo(TIMESTAMP('2022-01-01 00:00:00.000000 UTC')) a 35 | ON 36 | UPPER(a.owner_display_name) =UPPER( b.owner_display_name) 37 | -------------------------------------------------------------------------------- /CloudFunctions/bq_events_to_gcs_cf.py: -------------------------------------------------------------------------------- 1 | import functions_framework 2 | from google.cloud import bigquery 3 | from datetime import datetime 4 | 5 | ''' 6 | Dependencies to be installed 7 | 8 | db-dtypes 9 | fsspec 10 | gcsfs 11 | bigquery 12 | 13 | ''' 14 | 15 | # CloudEvent function to be triggered by an Eventarc Cloud Audit Logging trigger 16 | # Note: this is NOT designed for second-party (Cloud Audit Logs -> Pub/Sub) triggers! 
17 | @functions_framework.cloud_event 18 | def hello_auditlog(cloudevent): 19 | 20 | # Print out details from the `protoPayload` 21 | # This field encapsulates a Cloud Audit Logging entry 22 | # See https://cloud.google.com/logging/docs/audit#audit_log_entry_structure 23 | 24 | payload = cloudevent.data.get("protoPayload") 25 | if payload: 26 | 27 | # Timestamp in string format 28 | now = datetime.now() 29 | timpstamp = now.strftime("%m%d%Y%H%M%S") 30 | 31 | # Build Big Query client 32 | bucket_name = 'data_eng_demos' 33 | project = "gcp-dataeng-demos-365206" 34 | dataset_id = "gcp_dataeng_demos" 35 | table_id = "demo_cf" 36 | 37 | # Write data into GCS/csv file using dataframe 38 | client = bigquery.Client(project=project) 39 | destination_uri = "gs://{}/{}".format(bucket_name, "bq_to_gcs_extract" + timpstamp + ".csv") 40 | qry = "select * from " + project + "." + dataset_id + "." + table_id 41 | df_qry_result = client.query(qry).to_dataframe() 42 | df_qry_result.to_csv(destination_uri) 43 | 44 | print( 45 | "Exported {}:{}.{} to {}".format(project, dataset_id, table_id, destination_uri) 46 | ) 47 | -------------------------------------------------------------------------------- /CloudFunctions/gcs_events_cf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pandas.io import gbq 3 | from google.cloud import bigquery 4 | 5 | ''' 6 | Python Dependencies to be installed 7 | 8 | gcsfs 9 | fsspec 10 | pandas 11 | pandas-gbq 12 | 13 | ''' 14 | 15 | def hello_gcs(event, context): 16 | """Triggered by a change to a Cloud Storage bucket. 17 | Args: 18 | event (dict): Event payload. 19 | context (google.cloud.functions.Context): Metadata for the event. 20 | """ 21 | 22 | lst = [] 23 | file_name = event['name'] 24 | table_name = file_name.split('.')[0] 25 | 26 | # Event,File metadata details writing into Big Query 27 | dct={ 28 | 'Event_ID':context.event_id, 29 | 'Event_type':context.event_type, 30 | 'Bucket_name':event['bucket'], 31 | 'File_name':event['name'], 32 | 'Created':event['timeCreated'], 33 | 'Updated':event['updated'] 34 | } 35 | lst.append(dct) 36 | df_metadata = pd.DataFrame.from_records(lst) 37 | df_metadata.to_gbq('gcp_dataeng_demos.data_loading_metadata', 38 | project_id='gcp-dataeng-demos-365206', 39 | if_exists='append', 40 | location='us') 41 | 42 | # Actual file data , writing to Big Query 43 | df_data = pd.read_csv('gs://' + event['bucket'] + '/' + file_name) 44 | 45 | df_data.to_gbq('gcp_dataeng_demos.' + table_name, 46 | project_id='gcp-dataeng-demos-365206', 47 | if_exists='append', 48 | location='us') 49 | -------------------------------------------------------------------------------- /CloudFunctions/pubsub_events_cf.py: -------------------------------------------------------------------------------- 1 | 2 | def hello_pubsub(event, context): 3 | """ 4 | Background Cloud Function to be triggered by Pub/Sub. 
5 | Required dependencies : bigquery 6 | Configure environment variables while creating and deploying cloud function 7 | for bigquery dataset and table 8 | dataset:gcp_dataeng_demos 9 | table:cf_pubsub_demo 10 | 11 | """ 12 | import base64 13 | import json 14 | import os,sys 15 | from google.cloud import bigquery 16 | 17 | if 'data' in event: 18 | data_buffer = base64.b64decode(event['data']).decode('utf-8') 19 | 20 | message='{'+'"Actvity_Time": "{}"'.format(json.loads(data_buffer)['timestamp']) + ',' +'"Resource_Name": "{}"'.format(json.loads(data_buffer)['protoPayload']['resourceName']) + ',' +'"Actvity_Type": "{}"'.format(json.loads(data_buffer)['protoPayload']['methodName']) + ',' +'"Activity_done_by": "{}"'.format(json.loads(data_buffer)['protoPayload']['authenticationInfo']['principalEmail']) + ',' + '"Change_in_IAM_policies": "{}"'.format(json.loads(data_buffer)['protoPayload']['serviceData']['policyDelta']['bindingDeltas'])+'}' 21 | bq_data=json.loads(message) 22 | print(bq_data) 23 | 24 | def to_bigquery(dataset, table, document): 25 | bigquery_client = bigquery.Client() 26 | dataset_ref = bigquery_client.dataset(dataset) 27 | table_ref = dataset_ref.table(table) 28 | table = bigquery_client.get_table(table_ref) 29 | errors = bigquery_client.insert_rows(table, [document]) 30 | if errors != [] : 31 | print(errors, file=sys.stderr) 32 | to_bigquery(os.environ['dataset'], os.environ['table'], bq_data) 33 | 34 | else: 35 | print('Hello World') 36 | -------------------------------------------------------------------------------- /CloudFunctions/sheets_to_bigquery_dataload_cf.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author : @ Anjan GCP Data Engineering 3 | Cloud function code to: 4 | 1. Read data from publicly shared google sheets 5 | 2. Load data into Bigquery using pandas and bigquery APIs 6 | ''' 7 | 8 | import functions_framework 9 | import pandas as pd 10 | import pandas_gbq 11 | 12 | @functions_framework.http 13 | def hello_http(request): 14 | 15 | message = 'Function executed Successfully' 16 | # Read Google Sheet data into Pandas dataframe 17 | # and write that data into Bigquery Table 18 | sheet_id = '1r9b-CN86hCmwnnm_-6aosLwZpzKyLRjsW9fxr0ALVRE' 19 | sheet_name = "Sheet1" 20 | url_1 = "https://docs.google.com/spreadsheets/d/{}/gviz/tq?tqx=out:csv&sheet={}".format(sheet_id,sheet_name) 21 | #print(url_1) 22 | df = pd.read_csv(url_1) 23 | df.to_gbq('gcp_dataeng_demos.public_fruit_to_bq', 24 | 'gcp-dataeng-demos-383407', 25 | chunksize=10000, 26 | if_exists='append' 27 | ) 28 | print("Data loaded successfully") 29 | return message 30 | 31 | 32 | 33 | """ 34 | 35 | Author : @ Anjan GCP Data Engineering 36 | Cloud function code to: 37 | 1. Read data from private google sheets 38 | 2. grant edit access to compute engine SA on google sheet 39 | 3. upload SA creds to secret manager 40 | 4. authenticate SA by downloading SA creds into CF code using python API 41 | 2. 
Load data into Bigquery using pandas and bigquery APIs 42 | 43 | Roles required by SA - 44 | 45 | Secret Manager Secret Accessor 46 | 47 | Installations required - 48 | 49 | functions-framework==3.* 50 | google-cloud-secret-manager 51 | requests 52 | pandas 53 | gspread_pandas 54 | pandas_gbq 55 | 56 | """ 57 | 58 | # Imports 59 | import functions_framework 60 | import requests as req 61 | import pandas as pd 62 | from google.cloud import secretmanager 63 | import gspread_pandas 64 | import json 65 | import pandas_gbq 66 | 67 | # Cloud Function 68 | @functions_framework.http 69 | def hello_http(request): 70 | message = 'Function executed Successfully' 71 | # Create the Secret Manager client. 72 | client = secretmanager.SecretManagerServiceClient() 73 | # Build the resource name of the secret version. 74 | project_id = '414888653736' 75 | secret_id = 'sa_cred' 76 | version_id = '1' 77 | name = "projects/{}/secrets/{}/versions/{}".format(project_id,secret_id,version_id) 78 | # Access the secret version. 79 | response = client.access_secret_version(request={"name": name}) 80 | # Print the secret payload. 81 | # snippet is showing how to access the secret material. 82 | payload = response.payload.data.decode("UTF-8") 83 | # convert secret value into json format 84 | credentials = json.loads(payload) 85 | # Defining scopes for gsheet and gdrive APIs 86 | scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'] 87 | # Access gsheet into gspread_pandas varaible 88 | google_sheet_file_1 = gspread_pandas.Spread('1GmKAaZQS-sLaQRmMzNXaFt6lXOkQbvxGNx2_c3NnoVk', config=credentials) 89 | # Convert into pandas dataframe 90 | df = google_sheet_file_1.sheet_to_df(header_rows=1).astype(str) 91 | df.reset_index(inplace=True) 92 | # Write values into Bigquery Table with append mode 93 | df.to_gbq('gcp_dataeng_demos.sa_sheet_to_bq', 94 | 'gcp-dataeng-demos-383407', 95 | chunksize=10000, 96 | if_exists='append' 97 | ) 98 | print("Data loaded successfully") 99 | return message 100 | -------------------------------------------------------------------------------- /CloudSQL/cloudsql_python_connect.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Installations required - 3 | pip install cloud-sql-python-connector["pymysql"] SQLAlchemy 4 | pip install google-cloud-secret-manager 5 | ''' 6 | #Import required dependencies 7 | from google.cloud.sql.connector import Connector 8 | import sqlalchemy 9 | 10 | # Function to get CloudSQL instance password from Secret Manager 11 | def access_secret_version(project_id, secret_id, version_id): 12 | """ 13 | Access the payload for the given secret version if one exists. The version 14 | can be a version number as a string (e.g. "5") or an alias (e.g. "latest"). 15 | """ 16 | 17 | # Import the Secret Manager client library. 18 | from google.cloud import secretmanager 19 | 20 | # Create the Secret Manager client. 21 | client = secretmanager.SecretManagerServiceClient() 22 | 23 | # Build the resource name of the secret version. 24 | name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}" 25 | 26 | # Access the secret version. 27 | response = client.access_secret_version(request={"name": name}) 28 | # Print the secret payload. 29 | # snippet is showing how to access the secret material. 
30 | payload = response.payload.data.decode("UTF-8") 31 | return payload 32 | 33 | # Function call to get DB password ino a local varaiable 34 | db_password = access_secret_version('gcp-data-eng-374308', 'cloudsql_pwd','1') 35 | 36 | 37 | # initialize Connector object 38 | connector = Connector() 39 | 40 | # function to return the database connection 41 | def getconn(): 42 | conn= connector.connect( 43 | "gcp-data-eng-374308:asia-south1:sql-demo", 44 | "pymysql", 45 | user="root", 46 | password=db_password, 47 | db="gcp_demo" 48 | ) 49 | return conn 50 | # create connection pool 51 | pool = sqlalchemy.create_engine( 52 | "mysql+pymysql://", 53 | creator=getconn, 54 | ) 55 | 56 | # insert statement (DML statement for data load) 57 | insert_stmt = sqlalchemy.text( 58 | "INSERT INTO basic_dtls (idn, name) VALUES (:idn, :name)", 59 | ) 60 | 61 | # interact with Cloud SQL database using connection pool 62 | with pool.connect() as db_conn: 63 | 64 | # Create Table 65 | db_conn.execute("CREATE TABLE basic_dtls(idn INT, name VARCHAR(200))") 66 | 67 | # Insert data into Table 68 | 69 | db_conn.execute(insert_stmt, idn=1, name="AAA") 70 | db_conn.execute(insert_stmt, idn=2, name="BBB") 71 | db_conn.execute(insert_stmt, idn=3, name="CCC") 72 | 73 | 74 | # query database 75 | result = db_conn.execute("SELECT * from basic_dtls").fetchall() 76 | 77 | # Do something with the results 78 | for row in result: 79 | print(row) 80 | 81 | # Dropping Table 82 | #db_conn.execute("DROP TABLE basic_dtls") 83 | -------------------------------------------------------------------------------- /CloudSQL/federated_query_demo.sql: -------------------------------------------------------------------------------- 1 | -- List all tables in a database. 2 | SELECT * FROM EXTERNAL_QUERY("projects/gcp-data-eng-374308/locations/asia-south1/connections/cloudsql_connect", 3 | "select * from information_schema.tables;"); 4 | 5 | -- List all columns in a table. 6 | SELECT * FROM EXTERNAL_QUERY("projects/gcp-data-eng-374308/locations/asia-south1/connections/cloudsql_connect", 7 | "select * from information_schema.columns where table_name='product_master';"); 8 | 9 | 10 | -- Query data from CloudSQL table. 
11 | SELECT * FROM EXTERNAL_QUERY("projects/gcp-data-eng-374308/locations/asia-south1/connections/cloudsql_connect", 12 | "select * from product_master;"); 13 | 14 | --Join Bigquery table with Cloud SQL table 15 | 16 | SELECT pm.product_name, 17 | pm.product_desc, 18 | dtls.qty, 19 | dtls.price, 20 | dtls.date 21 | FROM `gcp-data-eng-374308.federated_demo.product_sales_dtls` dtls 22 | JOIN 23 | ( 24 | SELECT * 25 | FROM EXTERNAL_QUERY("projects/gcp-data-eng-374308/locations/asia-south1/connections/cloudsql_connect", 26 | "select * from product_master;") 27 | ) pm 28 | ON dtls.product_code = pm.product_code; 29 | 30 | -------------------------------------------------------------------------------- /CloudSpanner/cloud_spanner_intro_demo.sql: -------------------------------------------------------------------------------- 1 | -- Create Table GoogleSQL 2 | CREATE TABLE employee ( 3 | idn INT64, 4 | name STRING(MAX), 5 | salary FLOAT64, 6 | ) PRIMARY KEY(idn); 7 | 8 | --Insert data 9 | insert into employee(idn,name,salary) 10 | values(1,'aa',100.5),(2,'bb',1200.0); 11 | 12 | 13 | -- Create Table PostgresSQL 14 | CREATE TABLE employee ( 15 | idn bigint NOT NULL, 16 | name character varying(256), 17 | salary numeric, 18 | PRIMARY KEY(idn) 19 | ); 20 | 21 | --CLI create instance 22 | gcloud spanner instances create gcp-demo-instance --config=regional-us-central1 \ 23 | --description="Demo Instance" --nodes=1 24 | 25 | -- Set Instance 26 | gcloud config set spanner/instance gcp-demo-instance 27 | 28 | --Create database 29 | gcloud spanner databases create example-db 30 | 31 | -- Create table 32 | gcloud spanner databases ddl update example-db \ 33 | --ddl='CREATE TABLE Books ( BookId INT64 NOT NULL, 34 | BookName STRING(1024), 35 | BookCatgry STRING(1024)) PRIMARY KEY (BookId)' 36 | -- Insert data 37 | gcloud spanner rows insert --database=example-db \ 38 | --table=Books \ 39 | --data=BookId=1,BookName=abc,BookCatgry=Finance 40 | 41 | gcloud spanner rows insert --database=example-db \ 42 | --table=Books \ 43 | --data=BookId=2,BookName=aaa,BookCatgry=Comic 44 | 45 | gcloud spanner rows insert --database=example-db \ 46 | --table=Books \ 47 | --data=BookId=3,BookName=ccc,BookCatgry=History 48 | 49 | -- Query data 50 | gcloud spanner databases execute-sql example-db \ 51 | --sql='SELECT * FROM Books' 52 | 53 | -- Delete database 54 | gcloud spanner databases delete example-db 55 | 56 | -- Delete Instance 57 | gcloud spanner instances delete gcp-demo-instance 58 | 59 | -------------------------------------------------------------------------------- /CloudSpanner/spanner_python_client_demo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author @ Anjan GCP Data Engineering 3 | 4 | Install Spanner Client Libraries 5 | pip install google-cloud-spanner==3.31.0 6 | 7 | Note: This is only for Educational purpose 8 | 9 | These code samples will demo basic Cloud Spanner Operations 10 | 1. Create Spanner Instance 11 | 2. Create Spanner Database (Google Standard SQL), Table 12 | 3. Insert data using DML statements 13 | 4. 
Query Spanner data 14 | 15 | ''' 16 | 17 | from google.cloud import spanner 18 | 19 | # Function to create Spanner Instance 20 | def create_instance(instance_id,region): 21 | """Creates an instance.""" 22 | spanner_client = spanner.Client() 23 | 24 | config_name = "{}/instanceConfigs/regional-{}".format( 25 | spanner_client.project_name,region 26 | ) 27 | 28 | instance = spanner_client.instance( 29 | instance_id, 30 | configuration_name=config_name, 31 | display_name="Demo Instance.", 32 | node_count=1 33 | ) 34 | 35 | instance.create() 36 | 37 | print("Waiting for operation to complete...") 38 | print("Created instance {}".format(instance_id)) 39 | 40 | # Function to create Spanner Database and Tables 41 | def create_database(instance_id, database_id): 42 | """Creates a database and tables for demo data.""" 43 | spanner_client = spanner.Client() 44 | instance = spanner_client.instance(instance_id) 45 | 46 | database = instance.database( 47 | database_id, 48 | ddl_statements=[ 49 | """CREATE TABLE employee ( 50 | empid INT64 NOT NULL, 51 | empname STRING(1024), 52 | salary FLOAT64 53 | ) PRIMARY KEY (empid)""" 54 | ], 55 | ) 56 | 57 | database.create() 58 | 59 | print("Waiting for operation to complete...") 60 | print("Created database {} on instance {}".format(database_id, instance_id)) 61 | 62 | # Function to insert data into Spanner database Table 63 | def insert_data(instance_id, database_id): 64 | #Inserts sample data into the given database. 65 | 66 | spanner_client = spanner.Client() 67 | instance = spanner_client.instance(instance_id) 68 | database = instance.database(database_id) 69 | 70 | with database.batch() as batch: 71 | batch.insert( 72 | table="employee", 73 | columns=("empid", "empname", "salary"), 74 | values=[ 75 | (1, "Marc", 2032.5), 76 | (2, "Catalina", 1298.3), 77 | (3, "Alice", 3087.5), 78 | (4, "Lea", 1567.9), 79 | (5, "David", 2224.6), 80 | ], 81 | ) 82 | print("Inserted data.") 83 | 84 | 85 | # Function to query data from Spanner Database Table 86 | def query_data(instance_id, database_id): 87 | """Queries sample data from the database using SQL.""" 88 | spanner_client = spanner.Client() 89 | instance = spanner_client.instance(instance_id) 90 | database = instance.database(database_id) 91 | 92 | with database.snapshot() as snapshot: 93 | results = snapshot.execute_sql( 94 | "SELECT empid,empname,salary AlbumTitle FROM employee" 95 | ) 96 | 97 | for row in results: 98 | print("Emp ID: {}, Emp Name: {}, Salary: {}".format(*row)) 99 | 100 | # Create Spanner instance 101 | create_instance('gcp-dataeng-demo','asia-south1') 102 | 103 | #Create database and Table 104 | create_database('gcp-dataeng-demo','demo_db') 105 | 106 | # Insert data 107 | insert_data('gcp-dataeng-demo','demo_db') 108 | 109 | # Query data 110 | query_data('gcp-dataeng-demo','demo_db') 111 | 112 | # Delete Instance 113 | # gcloud spanner instances delete gcp-dataeng-demo 114 | -------------------------------------------------------------------------------- /Common_Realtime_Usecases/dataflow_spanner_demo/beam_dataflow-to-spanner.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author : @ Anjan GCP Data Engineering 3 | Created for education purpose only 4 | ''' 5 | 6 | import argparse 7 | import logging 8 | import re, os 9 | from typing import NamedTuple, List 10 | 11 | import apache_beam as beam 12 | from apache_beam.options.pipeline_options import PipelineOptions 13 | from apache_beam.options.pipeline_options import SetupOptions 14 | from 
apache_beam.io.gcp.spanner import SpannerInsert 15 | from apache_beam.dataframe.io import read_csv 16 | from apache_beam.dataframe import convert 17 | 18 | # Inferring schema using Named Tuple 19 | class SpannerRow(NamedTuple): 20 | trid: int 21 | age: int 22 | workclass: str 23 | education: str 24 | marital_status: str 25 | occupation: str 26 | relationship: str 27 | sex: str 28 | native_country: str 29 | income_bracket: str 30 | beam.coders.registry.register_coder(SpannerRow, beam.coders.RowCoder) 31 | 32 | # User defined tranformation to replace ? 33 | def ValueReplace(column): 34 | if column == '?': 35 | column = 'NA' 36 | return column 37 | 38 | # Pipeline entry point , passing user input arguments 39 | def main(argv=None, save_main_session=True): 40 | """Main entry point.""" 41 | projectid = os.environ.get('GOOGLE_CLOUD_PROJECT') 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument( 44 | '--input', 45 | dest='input', 46 | default='census_100_testing.csv', 47 | help='Input filename.') 48 | parser.add_argument( 49 | '--instance', 50 | dest='instance', 51 | default='test-spanner-instance', 52 | help='Spanner instance ID.') 53 | parser.add_argument( 54 | '--database', 55 | dest='database', 56 | default = 'census-db', 57 | help='Spanner database.') 58 | parser.add_argument( 59 | '--table', 60 | dest='table', 61 | default = 'census', 62 | help='Spanner table.') 63 | known_args, pipeline_args = parser.parse_known_args(argv) 64 | 65 | pipeline_options = PipelineOptions(pipeline_args) 66 | pipeline_options.view_as(SetupOptions).save_main_session = save_main_session 67 | 68 | # Beam pipeline , collection of Tranformations 69 | with beam.Pipeline(options=pipeline_options) as p: 70 | census = p | 'Read CSV to dataframe' >> read_csv('gs://gcp-dataeng-demos1993/census_100_testing.csv') 71 | census = ( convert.to_pcollection(census) 72 | | "Filter age is null rows" >> beam.Filter(lambda x: x.age ) 73 | | "Filter workclass value ? 
rows" >> beam.Filter(lambda x: x.workclass != '?') 74 | 75 | | 'Convert to Spanner Rows' >> beam.Map(lambda x : SpannerRow( x.trid, 76 | x.age, 77 | x.workclass, 78 | ValueReplace(x.education), 79 | x.marital_status, 80 | ValueReplace(x.occupation), 81 | x.relationship, 82 | x.sex, 83 | ValueReplace(x.native_country), 84 | x.income_bracket 85 | )) 86 | ) 87 | # Writing data to Spanner Database 88 | 89 | census | 'Write to Spanner' >> SpannerInsert( 90 | project_id= 'gcp-dataeng-demo-431907', 91 | instance_id= 'test-spanner-instance', 92 | database_id= 'census-db', 93 | table= 'census') 94 | 95 | census | beam.Map(print) 96 | 97 | if __name__ == '__main__': 98 | logging.getLogger().setLevel(logging.INFO) 99 | main() 100 | -------------------------------------------------------------------------------- /Common_Realtime_Usecases/dataflow_spanner_demo/census-db-schema.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE census ( 2 | trid INT64 NOT NULL, 3 | age INT64 NOT NULL, 4 | workclass STRING(MAX), 5 | education STRING(MAX), 6 | marital_status STRING(MAX), 7 | occupation STRING(MAX), 8 | relationship STRING(MAX), 9 | sex STRING(MAX), 10 | native_country STRING(MAX), 11 | income_bracket STRING(MAX) 12 | ) PRIMARY KEY (trid); 13 | -------------------------------------------------------------------------------- /Common_Realtime_Usecases/dataflow_spanner_demo/census_100_testing.csv: -------------------------------------------------------------------------------- 1 | trid,age,workclass,education,marital_status,occupation,relationship,sex,native_country,income_bracket 2 | 111,39,Private,9th,Married-civ-spouse,Other-service,Wife,Female,United-States,<=50K 3 | 112,77,Private,9th,Married-civ-spouse,Priv-house-serv,Wife,Female,United-States,<=50K 4 | 113,38,Private,9th,Married-civ-spouse,Other-service,Wife,Female,Haiti,<=50K 5 | 114,28,Private,9th,Married-civ-spouse,Protective-serv,Wife,Female,United-States,<=50K 6 | 115,37,Private,9th,Married-civ-spouse,Machine-op-inspct,Wife,Female,United-States,<=50K 7 | 116,35,?,9th,Married-civ-spouse,?,Wife,Female,United-States,<=50K 8 | 117,45,Private,9th,Married-civ-spouse,Machine-op-inspct,Wife,Female,United-States,>50K 9 | 118,55,Private,9th,Married-civ-spouse,Tech-support,Wife,Female,United-States,<=50K 10 | 119,27,Private,9th,Married-civ-spouse,Machine-op-inspct,Wife,Female,Portugal,<=50K 11 | 120,31,Private,9th,Married-civ-spouse,Exec-managerial,Wife,Female,United-States,<=50K 12 | 121,30,Private,9th,Married-civ-spouse,Machine-op-inspct,Wife,Female,Portugal,<=50K 13 | 122,28,Private,9th,Married-civ-spouse,Machine-op-inspct,Wife,Female,United-States,<=50K 14 | 123,,Private,10th,Married-civ-spouse,Machine-op-inspct,Wife,Female,United-States,<=50K 15 | 124,46,Private,9th,Married-civ-spouse,Machine-op-inspct,Wife,Female,United-States,<=50K 16 | 125,70,Private,9th,Married-civ-spouse,Machine-op-inspct,Wife,Female,United-States,<=50K 17 | 126,31,Private,9th,Married-civ-spouse,Farming-fishing,Wife,Female,United-States,<=50K 18 | 127,40,Local-gov,9th,Married-civ-spouse,Other-service,Wife,Female,Yugoslavia,>50K 19 | 128,52,Local-gov,9th,Married-civ-spouse,Other-service,Wife,Female,United-States,<=50K 20 | 129,46,Self-emp-inc,9th,Married-civ-spouse,Adm-clerical,Wife,Female,United-States,<=50K 21 | 130,41,Self-emp-inc,9th,Married-civ-spouse,Sales,Wife,Female,Dominican-Republic,<=50K 22 | 131,41,?,9th,Married-civ-spouse,?,Wife,Female,Hong,<=50K 23 | 
132,72,Private,9th,Married-civ-spouse,Exec-managerial,Wife,Female,United-States,>50K 24 | 133,75,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 25 | 134,77,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 26 | 135,66,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 27 | 136,45,Private,9th,Married-civ-spouse,Adm-clerical,Husband,Male,United-States,>50K 28 | 137,57,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,>50K 29 | 138,57,Private,9th,Married-civ-spouse,Sales,Husband,Male,United-States,>50K 30 | 139,47,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,?,<=50K 31 | 140,61,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,Trinadad&Tobago,<=50K 32 | 141,63,Private,9th,Married-civ-spouse,Farming-fishing,Husband,Male,United-States,<=50K 33 | 142,32,Private,9th,Married-civ-spouse,Farming-fishing,Husband,Male,United-States,<=50K 34 | 143,56,Private,9th,Married-civ-spouse,Transport-moving,Husband,Male,United-States,<=50K 35 | 144,38,Private,9th,Married-civ-spouse,Transport-moving,Husband,Male,United-States,<=50K 36 | 145,58,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,<=50K 37 | 146,44,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,<=50K 38 | 147,53,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,<=50K 39 | 148,44,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,<=50K 40 | 149,62,Private,9th,Married-civ-spouse,Other-service,Husband,Male,United-States,<=50K 41 | 150,68,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 42 | 151,31,Private,9th,Married-civ-spouse,Handlers-cleaners,Husband,Male,United-States,<=50K 43 | 152,58,Private,9th,Married-civ-spouse,Handlers-cleaners,Husband,Male,United-States,<=50K 44 | 153,28,Local-gov,9th,Married-civ-spouse,Craft-repair,Husband,Male,Trinadad&Tobago,>50K 45 | 154,,Local-gov,12th,Married-civ-spouse,Craft-repair,Husband,Male,Trinadad&Tobago,>50K 46 | 155,51,Local-gov,9th,Married-civ-spouse,Transport-moving,Husband,Male,United-States,<=50K 47 | 156,35,Federal-gov,9th,Married-civ-spouse,Farming-fishing,Husband,Male,United-States,<=50K 48 | 157,35,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,Mexico,<=50K 49 | 158,30,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,Mexico,<=50K 50 | 159,63,?,9th,Married-civ-spouse,?,Husband,Male,United-States,>50K 51 | 160,68,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 52 | 161,67,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 53 | 162,69,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 54 | 163,74,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 55 | 164,60,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 56 | 165,66,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 57 | 166,66,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 58 | 167,64,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 59 | 168,50,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 60 | 169,45,Private,9th,Married-civ-spouse,Other-service,Husband,Male,United-States,>50K 61 | 170,54,Private,9th,Married-civ-spouse,Exec-managerial,Husband,Male,United-States,>50K 62 | 171,51,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,>50K 63 | 172,58,Private,9th,Married-civ-spouse,Handlers-cleaners,Husband,Male,United-States,>50K 64 | 
173,37,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,>50K 65 | 174,59,Private,9th,Married-civ-spouse,Transport-moving,Husband,Male,United-States,>50K 66 | 175,31,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 67 | 176,33,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,Mexico,<=50K 68 | 177,30,Private,9th,Married-civ-spouse,Other-service,Husband,Male,United-States,<=50K 69 | 178,38,Private,9th,Married-civ-spouse,Farming-fishing,Husband,Male,Mexico,<=50K 70 | 179,76,Private,9th,Married-civ-spouse,Protective-serv,Husband,Male,United-States,<=50K 71 | 180,35,Private,9th,Married-civ-spouse,Handlers-cleaners,Husband,Male,United-States,<=50K 72 | 181,39,Private,9th,Married-civ-spouse,Handlers-cleaners,Husband,Male,Mexico,<=50K 73 | 182,31,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,<=50K 74 | 183,60,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,<=50K 75 | 184,46,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,<=50K 76 | 185,60,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,<=50K 77 | 186,63,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 78 | 187,26,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 79 | 188,39,Private,9th,Married-civ-spouse,Transport-moving,Husband,Male,United-States,<=50K 80 | 189,59,Private,9th,Married-civ-spouse,Other-service,Husband,Male,United-States,<=50K 81 | 190,27,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 82 | 191,26,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 83 | 192,36,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,Guatemala,<=50K 84 | 193,69,Private,9th,Married-civ-spouse,Other-service,Husband,Male,United-States,<=50K 85 | 194,62,Private,9th,Married-civ-spouse,Other-service,Husband,Male,United-States,<=50K 86 | 195,41,Private,9th,Married-civ-spouse,Transport-moving,Husband,Male,United-States,<=50K 87 | 196,60,Private,9th,Married-civ-spouse,Sales,Husband,Male,United-States,<=50K 88 | 197,28,Private,9th,Married-civ-spouse,Sales,Husband,Male,United-States,<=50K 89 | 198,51,Private,9th,Married-civ-spouse,Sales,Husband,Male,United-States,<=50K 90 | 199,56,Private,9th,Married-civ-spouse,Sales,Husband,Male,United-States,<=50K 91 | 200,38,Private,9th,Married-civ-spouse,Adm-clerical,Husband,Male,United-States,<=50K 92 | 201,61,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 93 | 202,38,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 94 | 203,30,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 95 | 204,34,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 96 | 205,37,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 97 | 206,42,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,Mexico,<=50K 98 | 207,32,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 99 | 208,29,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 100 | 209,27,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K -------------------------------------------------------------------------------- /Common_Realtime_Usecases/dataflow_spanner_demo/create-spanner-database-CLI.txt: -------------------------------------------------------------------------------- 1 | gcloud spanner 
instances create test-spanner-instance --config=regional-$1 --description="test-spanner-instance" --processing-units=100 2 | 3 | gcloud spanner databases create census-db --instance=test-spanner-instance --database-dialect=GOOGLE_STANDARD_SQL --ddl-file=./census-db-schema.sql 4 | -------------------------------------------------------------------------------- /Common_Realtime_Usecases/dataflow_spanner_demo/installations_beam.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | pip3 install apache-beam 3 | pip3 install apache-beam[gcp] 4 | pip3 install apache-beam[dataframe] 5 | -------------------------------------------------------------------------------- /Common_Realtime_Usecases/iam_snapshots.py: -------------------------------------------------------------------------------- 1 | import functions_framework 2 | from googleapiclient import discovery 3 | from google.cloud import bigquery 4 | from datetime import datetime 5 | import os 6 | 7 | ''' 8 | Dependencies to be installed 9 | 10 | bigquery 11 | google-api-python-client 12 | ''' 13 | 14 | @functions_framework.http 15 | def hello_http(request): 16 | """HTTP Cloud Function. 17 | Args: 18 | request (flask.Request): The request object. 19 | 20 | Returns: 21 | The response text, or any set of values that can be turned into a 22 | Response object using `make_response` 23 | . 24 | """ 25 | request_json = request.get_json(silent=True) 26 | request_args = request.args 27 | 28 | # Get project , Dataset , Table details from Function Env 29 | PROJECT_ID = os.environ.get("project") 30 | DATASET_ID = os.environ.get("dataset") 31 | TABLE_ID = os.environ.get("table") 32 | 33 | #from apiclient.discovery import build 34 | service = discovery.build('cloudresourcemanager', 'v1') 35 | 36 | # Get IAM roles from specified project 37 | policy_request = service.projects().getIamPolicy(resource=PROJECT_ID, body={}) 38 | policy_response = policy_request.execute() 39 | #print(policy_response['bindings']) 40 | 41 | # Deriving current timestamp for snapshot 42 | now = datetime.now() 43 | date_time = now.strftime("%m/%d/%Y %H:%M:%S") 44 | rows_to_insert = [] 45 | 46 | # Append snapshot time to each row 47 | for i in policy_response['bindings']: 48 | i['snapshot_time'] = date_time 49 | rows_to_insert.append(i) 50 | #print(rows_to_insert) 51 | # Construct a BigQuery client object. 52 | client = bigquery.Client() 53 | 54 | # load job confuguration 55 | load_job = client.insert_rows_json("{}.{}.{}".format(PROJECT_ID,DATASET_ID,TABLE_ID), rows_to_insert 56 | ) 57 | 58 | if load_job == []: 59 | print('Data has been loaded successfully') 60 | else: 61 | print("Somee issue with loading data") 62 | 63 | name = 'Succeeded' 64 | return 'Function Execution {}!'.format(name) 65 | -------------------------------------------------------------------------------- /Composer/airflow_dataproc_automate_dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author : @ Anjan GCP Data Engineering 3 | 4 | Example Airflow DAG to demo below Dataproc use cases 5 | Airflow operators for managing a dataproc cluster 6 | 1. Create Dataproc cluster 7 | 2. Submit PySpark jobs (in parallel) 8 | 3. 
Delete dataproc cluster 9 | """ 10 | import os 11 | from datetime import datetime 12 | from airflow import models 13 | from airflow.providers.google.cloud.operators.dataproc import ( 14 | ClusterGenerator, 15 | DataprocCreateClusterOperator, 16 | DataprocDeleteClusterOperator, 17 | DataprocSubmitJobOperator 18 | ) 19 | 20 | # Param initializations 21 | DAG_ID = "dataproc_cluster_jobs" 22 | PROJECT_ID = "gcp-dataeng-demos-383407" 23 | BUCKET_NAME = "dataprco-airflow-demos" 24 | CLUSTER_NAME = "dataeng-demos-airflow" 25 | REGION = "asia-south2" 26 | ZONE = "asia-south2-a" 27 | 28 | #PySPark scripts paths 29 | SCRIPT_BUCKET_PATH = "gcpdataeng-demos/scripts" 30 | # BQ -> AGGREGATE -> GCS 31 | SCRIPT_NAME_1 = "pyspark_bq_to_gcs.py" 32 | # GCS -> AGGREGATE -> BQ 33 | SCRIPT_NAME_2 = "pyspark_gcs_to_bq.py" 34 | 35 | # Cluster definition: Generating Cluster Config for DataprocCreateClusterOperator 36 | 37 | INIT_FILE = "goog-dataproc-initialization-actions-asia-south2/connectors/connectors.sh" 38 | 39 | # Generating cluster Configurations with this operator 40 | CLUSTER_GENERATOR_CONFIG = ClusterGenerator( 41 | project_id=PROJECT_ID, 42 | zone=ZONE, 43 | master_machine_type="n1-standard-2", 44 | worker_machine_type="n1-standard-2", 45 | num_workers=2, 46 | storage_bucket=BUCKET_NAME, 47 | init_actions_uris=[f"gs://{INIT_FILE}"], 48 | metadata={"bigquery-connector-version":"1.2.0","spark-bigquery-connector-version":"0.21.0"} 49 | ).make() 50 | 51 | # PySpark job configs for Job1 52 | PYSPARK_JOB_1 = { 53 | "reference": {"project_id": PROJECT_ID}, 54 | "placement": {"cluster_name": CLUSTER_NAME}, 55 | "pyspark_job": {"main_python_file_uri": f"gs://{SCRIPT_BUCKET_PATH}/{SCRIPT_NAME_1}"} 56 | } 57 | # PySpark job configs for Job2 58 | PYSPARK_JOB_2 = { 59 | "reference": {"project_id": PROJECT_ID}, 60 | "placement": {"cluster_name": CLUSTER_NAME}, 61 | "pyspark_job": {"main_python_file_uri": f"gs://{SCRIPT_BUCKET_PATH}/{SCRIPT_NAME_2}"} 62 | 63 | } 64 | 65 | # DAH definition is here 66 | with models.DAG( 67 | DAG_ID, 68 | schedule="@once", 69 | start_date=datetime(2023, 1, 1), 70 | catchup=False, 71 | tags=["example", "dataproc"], 72 | ) as dag: 73 | 74 | # Create cluster with generates cluster config operator 75 | create_dataproc_cluster = DataprocCreateClusterOperator( 76 | task_id="create_dataproc_cluster", 77 | cluster_name=CLUSTER_NAME, 78 | project_id=PROJECT_ID, 79 | region=REGION, 80 | cluster_config=CLUSTER_GENERATOR_CONFIG, 81 | ) 82 | 83 | # PySpark task to read data from Bigquery , perform agrregate on data and write data into GCS 84 | pyspark_task_bq_to_gcs = DataprocSubmitJobOperator( 85 | task_id="pyspark_task_bq_to_gcs", 86 | job=PYSPARK_JOB_1, 87 | region=REGION, 88 | project_id=PROJECT_ID 89 | ) 90 | 91 | # PySpark task to read data from GCS , perform agrregate on data and write data into Bigquery 92 | pyspark_task_gcs_to_bq = DataprocSubmitJobOperator( 93 | task_id="pyspark_task_gcs_to_bq", 94 | job=PYSPARK_JOB_2, 95 | region=REGION, 96 | project_id=PROJECT_ID 97 | ) 98 | 99 | # Delete Cluster once done with jobs 100 | delete_cluster = DataprocDeleteClusterOperator( 101 | task_id="delete_cluster", 102 | project_id=PROJECT_ID, 103 | cluster_name=CLUSTER_NAME, 104 | region=REGION 105 | ) 106 | 107 | # Set task dependencies 108 | create_dataproc_cluster >> [pyspark_task_bq_to_gcs,pyspark_task_gcs_to_bq] >> delete_cluster 109 | -------------------------------------------------------------------------------- /Composer/dataflow_python_operator_dag.py: 
-------------------------------------------------------------------------------- 1 | # Import statement 2 | import os 3 | from datetime import datetime, timedelta 4 | from airflow import DAG 5 | from airflow.operators.dummy import DummyOperator 6 | from airflow.contrib.operators.dataflow_operator import DataFlowPythonOperator 7 | 8 | # Define yesterday value for setting up start for DAG 9 | yesterday = datetime.combine(datetime.today() - timedelta(1), datetime.min.time()) 10 | 11 | # Default arguments 12 | default_args = { 13 | 'start_date': yesterday, 14 | 'email_on_failure': False, 15 | 'email_on_retry': False, 16 | 'retries': 1, 17 | 'retry_delay': timedelta(minutes=5) 18 | } 19 | 20 | # DAG main definition 21 | with DAG(dag_id='DataflowPythonOperator', 22 | catchup=False, 23 | schedule_interval=timedelta(days=1), 24 | default_args=default_args 25 | ) as dag: 26 | 27 | # Dummy Start task 28 | start = DummyOperator( 29 | task_id='start', 30 | dag=dag, 31 | ) 32 | 33 | # Dataflow batch job log process task 34 | dataflow_batch_process_logs = DataFlowPythonOperator( 35 | task_id='dataflow_batch_process_logs', 36 | py_file='gs://us-central1-composer-scd2-5607404f-bucket/dags/scripts/dataflow_batch_log_process.py', 37 | options={ 38 | 'output': 'gs://data_eng_demos/output' 39 | }, 40 | dataflow_default_options={ 41 | 'project': 'data-eng-demos19', 42 | "staging_location": "gs://data_eng_demos/staging", 43 | "temp_location": "gs://data_eng_demos/temp" 44 | }, 45 | dag=dag) 46 | 47 | # Dummy end task 48 | end = DummyOperator( 49 | task_id='end', 50 | dag=dag, 51 | ) 52 | 53 | # Setting up Task dependencies using Airflow standard notations 54 | start >> dataflow_batch_process_logs >> end 55 | -------------------------------------------------------------------------------- /Composer/gcs_to_bq_and_bq_operators.py: -------------------------------------------------------------------------------- 1 | # import statements 2 | import os 3 | from datetime import datetime, timedelta 4 | from airflow import DAG 5 | from airflow.operators.dummy import DummyOperator 6 | from airflow.contrib.operators.bigquery_operator import BigQueryOperator 7 | from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator 8 | 9 | # Custom Python logic for derriving data value 10 | yesterday = datetime.combine(datetime.today() - timedelta(1), datetime.min.time()) 11 | 12 | # Default arguments 13 | default_args = { 14 | 'start_date': yesterday, 15 | 'email_on_failure': False, 16 | 'email_on_retry': False, 17 | 'retries': 1, 18 | 'retry_delay': timedelta(minutes=5) 19 | } 20 | 21 | # DAG definitions 22 | with DAG(dag_id='GCS_to_BQ_and_AGG', 23 | catchup=False, 24 | schedule_interval=timedelta(days=1), 25 | default_args=default_args 26 | ) as dag: 27 | 28 | # Dummy strat task 29 | start = DummyOperator( 30 | task_id='start', 31 | dag=dag, 32 | ) 33 | 34 | # GCS to BigQuery data load Operator and task 35 | gcs_to_bq_load = GoogleCloudStorageToBigQueryOperator( 36 | task_id='gcs_to_bq_load', 37 | bucket='data_eng_demos', 38 | source_objects=['greenhouse_dtls.csv'], 39 | destination_project_dataset_table='data-eng-demos19.gcp_dataeng_demos.gcs_to_bq_table', 40 | schema_fields=[ 41 | {'name': 'year', 'type': 'STRING', 'mode': 'NULLABLE'}, 42 | {'name': 'anzsic', 'type': 'STRING', 'mode': 'NULLABLE'}, 43 | {'name': 'nzsioc', 'type': 'STRING', 'mode': 'NULLABLE'}, 44 | {'name': 'anzsic_descriptor', 'type': 'STRING', 'mode': 'NULLABLE'}, 45 | {'name': 'category', 'type': 'STRING', 'mode': 'NULLABLE'}, 46 | 
{'name': 'variable', 'type': 'STRING', 'mode': 'NULLABLE'}, 47 | {'name': 'units', 'type': 'STRING', 'mode': 'NULLABLE'}, 48 | {'name': 'magnitude', 'type': 'STRING', 'mode': 'NULLABLE'}, 49 | {'name': 'source', 'type': 'STRING', 'mode': 'NULLABLE'}, 50 | {'name': 'data_value', 'type': 'FLOAT', 'mode': 'NULLABLE'} 51 | ], 52 | skip_leading_rows=1, 53 | create_disposition='CREATE_IF_NEEDED', 54 | write_disposition='WRITE_TRUNCATE', 55 | dag=dag) 56 | 57 | # BigQuery operator task to aggregate data into a new table 58 | create_aggr_bq_table = BigQueryOperator( 59 | task_id='create_aggr_bq_table', 60 | use_legacy_sql=False, 61 | allow_large_results=True, 62 | sql="CREATE OR REPLACE TABLE gcp_dataeng_demos.bq_table_aggr AS \ 63 | SELECT \ 64 | year,\ 65 | anzsic_descriptor,\ 66 | variable,\ 67 | source,\ 68 | SUM(data_value) as sum_data_value\ 69 | FROM data-eng-demos19.gcp_dataeng_demos.gcs_to_bq_table \ 70 | GROUP BY \ 71 | year,\ 72 | anzsic_descriptor,\ 73 | variable,\ 74 | source", 75 | dag=dag) 76 | 77 | # Dummy end task 78 | end = DummyOperator( 79 | task_id='end', 80 | dag=dag, 81 | ) 82 | 83 | # Setting up task dependencies 84 | start >> gcs_to_bq_load >> create_aggr_bq_table >> end 85 | -------------------------------------------------------------------------------- /Composer/https_operators_demo_dag.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Author : @ Anjan GCP Data Engineering 4 | This Airflow DAG code is to demo HTTP operators 5 | 1. Extract data from HTTP API 6 | 2. Pull data from Xcom 7 | 3. Write data into GCS bucket in JSON format 8 | 9 | """ 10 | from __future__ import annotations 11 | 12 | import json 13 | import os 14 | from datetime import datetime 15 | 16 | from airflow import DAG 17 | from airflow.providers.http.operators.http import SimpleHttpOperator 18 | from airflow.providers.http.sensors.http import HttpSensor 19 | from airflow.operators.python import PythonOperator 20 | from google.cloud import storage 21 | 22 | # DAG name 23 | DAG_ID = "demo_http_operator_to_gcs" 24 | 25 | # This Python function writes data pulled from XCom to a GCS bucket as a JSON file 26 | def WriteToGcs(ti): 27 | data = ti.xcom_pull(task_ids=['get_http_data']) 28 | bucket_name = 'gcpdataeng-demos' 29 | destination_blob_name = 'stock_data.json' 30 | storage_client = storage.Client() 31 | bucket = storage_client.bucket(bucket_name) 32 | blob = bucket.blob(destination_blob_name) 33 | 34 | blob.upload_from_string(str(data)) 35 | 36 | print( 37 | f"{destination_blob_name} with contents uploaded to {bucket_name}."
38 | ) 39 | # DAG definitions with all required params 40 | dag = DAG( 41 | DAG_ID, 42 | default_args={"retries": 1}, 43 | tags=["example"], 44 | start_date=datetime(2023, 4, 26), 45 | catchup=False, 46 | ) 47 | 48 | # Task to get data from the given HTTP endpoint 49 | get_http_data = SimpleHttpOperator( 50 | task_id="get_http_data", 51 | http_conn_id="http_conn_id_demo", 52 | method="GET", 53 | endpoint="/query?function=TOURNAMENT_PORTFOLIO&season=2021-09&apikey=demo", 54 | response_filter = lambda response : json.loads(response.text), 55 | dag=dag 56 | ) 57 | # Task to write data from XCom to GCS bucket 58 | write_data_to_gcs = PythonOperator( 59 | task_id = 'write_data_to_gcs', 60 | python_callable = WriteToGcs 61 | ) 62 | # Task dependency set 63 | get_http_data >> write_data_to_gcs 64 | -------------------------------------------------------------------------------- /Composer/python_bash_operators_dag.py: -------------------------------------------------------------------------------- 1 | # Import dependencies 2 | import os 3 | from datetime import datetime, timedelta 4 | from airflow import DAG 5 | from airflow.operators.dummy import DummyOperator 6 | from airflow.operators.python import PythonOperator 7 | from airflow.operators.bash import BashOperator 8 | 9 | # Python logic to derive yesterday's date 10 | yesterday = datetime.combine(datetime.today() - timedelta(1), datetime.min.time()) 11 | 12 | # Default arguments 13 | default_args = { 14 | 'start_date': yesterday, 15 | 'email_on_failure': False, 16 | 'email_on_retry': False, 17 | 'retries': 1, 18 | 'retry_delay': timedelta(minutes=5) 19 | } 20 | 21 | # Python custom logic/function for python callables 22 | def print_hello(): 23 | print('Hey I am Python operator') 24 | 25 | # DAG definitions 26 | with DAG(dag_id='bash_python_operator_demo', 27 | catchup=False, 28 | schedule_interval=timedelta(days=1), 29 | default_args=default_args 30 | ) as dag: 31 | 32 | # Tasks start here 33 | 34 | # Dummy Start task 35 | start = DummyOperator( 36 | task_id='start', 37 | dag=dag, 38 | ) 39 | 40 | # Bash operator task 41 | bash_task = BashOperator( 42 | task_id='bash_task', 43 | bash_command="date;echo 'Hey I am bash operator'", 44 | ) 45 | # Python operator task 46 | python_task = PythonOperator( 47 | task_id='python_task', 48 | python_callable=print_hello, 49 | dag=dag) 50 | 51 | # Dummy end task 52 | end = DummyOperator( 53 | task_id='end', 54 | dag=dag, 55 | ) 56 | 57 | # Setting up Task dependencies using Airflow standard notations 58 | start >> bash_task >> python_task >> end 59 | -------------------------------------------------------------------------------- /Dataflow/batch_etl_avro_data_cloudsql.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author : Anjan GCP Data Engineering 3 | This code should be used only for Educational purposes 4 | ''' 5 | # Import Dependencies 6 | import apache_beam as beam 7 | from apache_beam.runners.interactive.interactive_runner import InteractiveRunner 8 | import apache_beam.runners.interactive.interactive_beam as ib 9 | from apache_beam.io.avroio import ReadFromAvro 10 | from apache_beam.io import WriteToAvro 11 | from google.cloud.sql.connector import Connector 12 | import sqlalchemy 13 | from apache_beam.io import ReadFromText 14 | from apache_beam.io import WriteToText 15 | from apache_beam.options.pipeline_options import PipelineOptions 16 | import logging 17 | 18 | # Setting up the Apache Beam pipeline options.
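# Note: for quick local testing you can switch to the commented-out DirectRunner below
# and point temp_location at a bucket you own. The Cloud SQL password hard-coded in
# WriteToCloudSQL is for demo simplicity only; it could instead be fetched at runtime
# from Secret Manager (see Security/secretmanager_python_connect.py in this repo).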
19 | beam_options = PipelineOptions( 20 | #save_main_session=True, 21 | #runner='DirectRunner', 22 | setup_file = '/home/jupyter/setup.py', 23 | runner='DataflowRunner', 24 | project='gcp-dataeng-demos-395910', 25 | temp_location='gs://dataflow_demos2/tmp', 26 | region='asia-south2') 27 | 28 | class WriteToCloudSQL(beam.DoFn): 29 | def process(self,element): 30 | 31 | from google.cloud.sql.connector import Connector 32 | import sqlalchemy 33 | # function to return the database connection 34 | def getconn(): 35 | connector = Connector() 36 | conn= connector.connect( 37 | "gcp-dataeng-demos-395910:asia-south1:sql-demo", 38 | "pymysql", 39 | user="root", 40 | password='$qlDem0', 41 | db="gcp_demos" 42 | ) 43 | return conn 44 | # create connection pool 45 | 46 | pool = sqlalchemy.create_engine( 47 | "mysql+pymysql://", 48 | creator=getconn, 49 | ) 50 | # insert statement (DML statement for data load) 51 | insert_stmt = sqlalchemy.text("INSERT INTO git_downloads_agg (os_name, os_version,no_f_downloads) VALUES (:os_name, :os_version,:no_f_downloads)",) 52 | 53 | # interact with Cloud SQL database using connection pool 54 | with pool.connect() as db_conn: 55 | 56 | # Create Table 57 | create_table = sqlalchemy.text("CREATE TABLE IF NOT EXISTS git_downloads_agg(os_name VARCHAR(20), os_version VARCHAR(20), no_f_downloads INT)") 58 | db_conn.execute(create_table) 59 | 60 | # Insert data into Table 61 | db_conn.execute(insert_stmt, parameters={'os_name':element['os_name'], 'os_version':element['os_version'],'no_f_downloads':int(element['no_f_downloads'])}) 62 | db_conn.commit() 63 | def run(): 64 | # Beam Pipeline starts here 65 | with beam.Pipeline(options=beam_options) as pipeline: 66 | 67 | # Read AVRO files from GCS location 68 | read_raw = pipeline | 'Read' >> beam.io.ReadFromAvro('gs://dataflow_demos2/input_data/pypy_filedownloads.avro') 69 | 70 | # Filter , Clean and Aggregate data ,Number of PYPY downloads by country,project, version 71 | agg_cntry = (read_raw | 'Filter Data' >> beam.Filter(lambda x: x['details']['python'].startswith('3.10')) 72 | | 'Get required data' >> beam.Map(lambda x:(x['country_code']+','+x['project']+','+x['details']['python'],x['timestamp'])) 73 | | 'Combine per key' >> beam.combiners.Count.PerKey() 74 | | 'Make it to dict again' >> beam.Map(lambda x: {'country_code':x[0].split(',')[0],'project':x[0].split(',')[1], 75 | 'python_version':x[0].split(',')[2],'no_of_downloads':x[1]}) 76 | #| 'Print' >> beam.Map(print) 77 | ) 78 | 79 | # Write Transformed data into GCS in AVRO format 80 | agg_cntry | 'WriteToAvro' >> WriteToAvro('gs://dataflow_demos2/output_data/agg_output_data.avro', 81 | schema={ 82 | "type": "record", "name": "agg_downloads", 83 | "fields": [ 84 | {"name": "country_code", "type": "string"}, 85 | {"name": "project", "type": "string"}, 86 | {"name": "python_version", "type": "string"}, 87 | {"name": "no_of_downloads", "type": "int"} 88 | ] 89 | } 90 | ) 91 | 92 | 93 | # Filter , Clean and Aggregate data ,Number of PYPY downloads by os name and version 94 | aggr_os_version = (read_raw | 'Filter Data with OS' >> beam.Filter(lambda x: x['details']['system']['name'] == 'Windows' and x['details']['rustc_version'] != None) 95 | | 'Get os data woth others' >> beam.Map(lambda x: (x['details']['system']['name']+','+x['details']['rustc_version'],x['timestamp'])) 96 | | 'Combine per key os' >> beam.combiners.Count.PerKey() 97 | | 'Make it dict of agg results' >> beam.Map(lambda x: {'os_name':x[0].split(',')[0],'os_version':x[0].split(',')[1],'no_f_downloads':x[1]}) 
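                   # At this point each element is a dict such as
                   # {'os_name': 'Windows', 'os_version': '1.62.0', 'no_f_downloads': 42}  (illustrative values only),
                   # matching the INSERT parameters used by WriteToCloudSQL above.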
98 | #| 'Print' >> beam.Map(print) 99 | ) 100 | # Write Results into CloudSQL(MySQL) Table 101 | aggr_os_version| 'Write results to CloudSQL Table' >> beam.ParDo(WriteToCloudSQL()) 102 | 103 | # Run Pipeline here 104 | if __name__ == "__main__": 105 | logging.getLogger().setLevel(logging.INFO) 106 | run() 107 | -------------------------------------------------------------------------------- /Dataflow/beam_stream_data_process.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 3 | Author : @ Anjan GCP Data Engineering 4 | 5 | Created this Apache Beam code to demo Dataflow Stream ETL pipeline 6 | 7 | Steps Explaining this code; 8 | 9 | 1. A Python APP acting as a stream data source and publishing continous data into PubSub Topic 10 | 2. Read data into Apache Beam Pipeline -> into Unbounded PCollection from PubSub Topic 11 | 3. Applly Unboubded data processing concepts like event timestamps, fixed windows 12 | 4. Aggregate the data 13 | 5. Write aggrgated into Big Query Table 14 | 6. Run this Pipeline on Dataflow Runner 15 | 16 | 17 | CLI command to run this pipeline on dataflow 18 | 19 | python3 -m \ 20 | --input_topic projects//topics/ \ 21 | --output_path gs:///output \ 22 | --project \ 23 | --region us-west1 \ 24 | --temp_location gs:///temp \ 25 | --runner DataflowRunner 26 | ''' 27 | 28 | import argparse 29 | import apache_beam as beam 30 | import logging 31 | from apache_beam.options.pipeline_options import PipelineOptions 32 | from apache_beam import window 33 | from datetime import datetime as dt 34 | 35 | class AddWindowdtlsFn(beam.DoFn): 36 | 37 | def process(self, element, window=beam.DoFn.WindowParam): 38 | window_start = window.start.to_utc_datetime() 39 | window_end = window.end.to_utc_datetime() 40 | pc = str(element) + ' [ ' + str(window_start) + ' - ' + str(window_end) + ' ]' 41 | pc = pc.split('\n') 42 | return pc 43 | 44 | def run(input_topic, output_path, pipeline_args=None): 45 | # Set `save_main_session` to True so DoFns can access globally imported modules. 
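    # streaming=True marks this as an unbounded (streaming) pipeline, which is required
    # when reading from Pub/Sub with ReadFromPubSub.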
46 | pipeline_options = PipelineOptions( 47 | pipeline_args, streaming=True, save_main_session=True 48 | ) 49 | with beam.Pipeline(options=pipeline_options) as p: 50 | #p = beam.Pipeline(options=options) 51 | ( 52 | p | "Read Events stream data from Topic" >> beam.io.ReadFromPubSub(topic=input_topic) 53 | | "Convert from Bytes to String" >> beam.Map(lambda s: s.decode("utf-8")) 54 | | 'Events Data' >> beam.Map(lambda x: {'event_nbr':x.split(',')[0],'event_time':dt.strptime(x.split(',')[1],'%Y-%m-%d %H:%M:%S.%f')}) 55 | | 'Events with Timestamps' >> beam.Map(lambda events: beam.window.TimestampedValue(events['event_nbr'], events['event_time'].timestamp())) 56 | | 'Events fixed Window' >> beam.WindowInto(window.FixedWindows(5)) 57 | | 'No of events per Window' >> beam.combiners.Count.Globally().without_defaults() 58 | | 'Final results with Window Info' >> beam.ParDo(AddWindowdtlsFn()) 59 | | 'String To BigQuery Row' >> beam.Map(lambda s: {'window_count': s}) 60 | #| 'Write Windowed results to GCS' >> beam.io.WriteToText(output_gcs_location + '/events_per_window_output.txt') 61 | #| 'Write to PubSub' >> beam.io.WriteToPubSub(topic=topic_sink) 62 | | 'Write to BigQuery' >> beam.io.Write(beam.io.WriteToBigQuery( 63 | '', 64 | dataset='', 65 | project='', 66 | schema ='window_count:STRING', 67 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, 68 | write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND 69 | ) 70 | ) 71 | 72 | ) #| beam.Map(print) 73 | 74 | if __name__ == "__main__": 75 | logging.getLogger().setLevel(logging.INFO) 76 | parser = argparse.ArgumentParser() 77 | parser.add_argument( 78 | "--input_topic", 79 | help="The Cloud Pub/Sub topic to read from." 80 | '"projects//topics/".', 81 | ) 82 | parser.add_argument( 83 | "--output_path", 84 | help="Path of the output GCS file including the prefix.", 85 | ) 86 | known_args, pipeline_args = parser.parse_known_args() 87 | run( 88 | known_args.input_topic, 89 | known_args.output_path, 90 | pipeline_args 91 | ) 92 | -------------------------------------------------------------------------------- /Dataflow/dataflow_batch_demo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author : Anjan GCP Data Engineering 3 | This code should be used only for Educational purposes 4 | This code is to perform 5 | 1. Read data from CSV file 6 | 2. Transform the data using Beam ParDo (User defined logic) 7 | 3. Write Transformed data into specified Big Query Table.
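Example invocation (illustrative only; bucket, project and region below are placeholders, not values from this repo):

python3 dataflow_batch_demo.py \
    --input gs://<your-bucket>/input_batch_data.csv \
    --output gs://<your-bucket>/output \
    --runner DataflowRunner \
    --project <your-project> \
    --region us-central1 \
    --temp_location gs://<your-bucket>/temp

Note: --input and --output are parsed by this script; the remaining flags are passed through to PipelineOptions.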
8 | ''' 9 | # Import required modules and methods 10 | import argparse 11 | import logging 12 | import apache_beam as beam 13 | import re 14 | from apache_beam.io import ReadFromText 15 | from apache_beam.io import WriteToText 16 | from apache_beam.options.pipeline_options import PipelineOptions 17 | 18 | # ParDo Class for parallel processing by applying user defined tranformations 19 | class scrip_val(beam.DoFn): 20 | def process(self, element): 21 | try: 22 | line = element.split('"') 23 | if line[9] == 'BUY': 24 | tp=line[3]+','+line[11].replace(',','') 25 | else: 26 | tp=line[3]+',-'+line[11].replace(',','') 27 | tp=tp.split() 28 | return tp 29 | except: 30 | logging.info('Some Error occured') 31 | 32 | # Entry run method for triggering pipline 33 | def run(): 34 | #Input arguments , reading from commandline 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('--input', 37 | dest='input', 38 | default='gs://dataflow_demo19', 39 | help='Input file to process.') 40 | parser.add_argument('--output', 41 | dest='output', 42 | required=True, 43 | help='Output file to write results to.') 44 | known_args, pipeline_args = parser.parse_known_args() 45 | 46 | # Function to SUM grouped elements 47 | def sum_groups(word_ones): 48 | (word, ones) = word_ones 49 | return word + ',' + str(sum(ones)) 50 | ''' 51 | def format_result(bulk_deal): 52 | (bulk, deal) = bulk_deal 53 | return '%s: %d' % (bulk, deal) 54 | ''' 55 | # Function to parse and format given input to Big Query readable JSON format 56 | def parse_method(string_input): 57 | 58 | values = re.split(",",re.sub('\r\n', '', re.sub(u'"', '', string_input))) 59 | row = dict( 60 | zip(('SYMBOL', 'BUY_SELL_QTY'), 61 | values)) 62 | return row 63 | 64 | # Main Pipeline 65 | with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p: 66 | lines = p | 'read' >> ReadFromText(known_args.input,skip_header_lines=1) 67 | counts = ( 68 | lines 69 | | 'Get required tuple' >> beam.ParDo(scrip_val()) 70 | | 'PairWithValue' >> beam.Map(lambda x: (x.split(',')[0],int(x.split(',')[1]))) 71 | | 'Group by Key' >> beam.GroupByKey() 72 | | 'Sum Group' >> beam.Map(sum_groups) 73 | | 'To String' >> beam.Map(lambda s: str(s)) 74 | | 'String To BigQuery Row' >> beam.Map(lambda s: parse_method(s)) 75 | #| 'format' >> beam.Map(format_result) 76 | #| 'Print' >> beam.Map(print) 77 | #| 'write' >> WriteToText(known_args.output) 78 | ) 79 | # Write to Big Query Sink 80 | counts| 'Write to BigQuery' >> beam.io.Write( 81 | beam.io.WriteToBigQuery( 82 | 'batach_data', 83 | dataset='dataflow_demo', 84 | project='gcp-dataeng-demos', 85 | schema ='SYMBOL:STRING,BUY_SELL_QTY:INTEGER', 86 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, 87 | write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE 88 | ) 89 | ) 90 | 91 | # Trigger entry function here 92 | if __name__ == '__main__': 93 | logging.getLogger().setLevel(logging.INFO) 94 | run() 95 | -------------------------------------------------------------------------------- /Dataflow/dataflow_batch_log_process.py: -------------------------------------------------------------------------------- 1 | # Import required modules and methods 2 | import argparse 3 | import logging 4 | import apache_beam as beam 5 | import re 6 | from apache_beam.io import ReadFromText 7 | from apache_beam.io import WriteToText 8 | from apache_beam.transforms.sql import SqlTransform 9 | from apache_beam.options.pipeline_options import PipelineOptions 10 | import json 11 | import ast 12 | 13 | # Setting up the Apache 
Beam pipeline options. 14 | beam_options = PipelineOptions( 15 | save_main_session=True, 16 | #runner='DirectRunner', 17 | runner='DataflowRunner', 18 | project='data-eng-demos19', 19 | temp_location='gs://data_eng_demos/temp', 20 | region='us-central1') 21 | 22 | # ParDo Class for parallel processing by applying user defined tranformations 23 | class ParseJSON(beam.DoFn): 24 | def process(self, element): 25 | try: 26 | dict_line = json.loads(element) 27 | sub_str = dict_line['protoPayload']['methodName'] 28 | if 'google.cloud' in sub_str: 29 | sub_str = sub_str.split('.')[4] + '.' + sub_str.split('.')[5] 30 | st = '{' + "'user':'" + dict_line['protoPayload']['authenticationInfo']['principalEmail'] + "','job_type':'" + sub_str.lower().rstrip('job') + "','info_type':'" + dict_line['severity'] + "','timestamp':'" + dict_line['timestamp'] + "'}" 31 | st = st.replace("'",'"') 32 | return st.split('\n') 33 | except: 34 | logging.info('Some Error occured') 35 | 36 | # Entry Function to run Pipeline 37 | def run(): 38 | # Set `save_main_session` to True so DoFns can access globally imported modules. 39 | with beam.Pipeline(options=beam_options) as p: 40 | 41 | result = ( 42 | p | 'Read from GCS' >> ReadFromText('gs://logs_bucket19/cloudaudit.googleapis.com/data_access/2022/12/28/*.json') 43 | | 'Parse logs to string representation of dict' >> beam.ParDo(ParseJSON()) 44 | | 'Convert String to Dict' >> beam.Map(lambda x: json.loads(x)) 45 | #| beam.Map(print) 46 | ) 47 | 48 | write_to_gcs = (result | 'get job type tuple' >> beam.Map(lambda x : ( x['job_type']+',' + x['info_type'],1)) 49 | | 'combine per key and sum' >> beam.CombinePerKey(sum) 50 | | 'format to JSON' >> beam.Map(lambda x : "{'job_type':'"+ x[0].split(',')[0] + 51 | "','info_type':'" + x[0].split(',')[1] + "','count':" + str(x[1]) +"}" ) 52 | #| beam.Map(print) 53 | | 'write final results into GCS bucket' >> beam.io.WriteToText('gs://data_eng_demos/output/bq_job_stats.txt') 54 | ) 55 | 56 | write_to_bq = result | 'Write parsed results to BigQuery' >> beam.io.Write(beam.io.WriteToBigQuery( 57 | 'bq_auditlog_parsed_data', 58 | dataset='gcp_dataeng_demos', 59 | project='data-eng-demos19', 60 | schema ='SCHEMA_AUTODETECT', 61 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, 62 | write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE 63 | ) 64 | ) 65 | 66 | if __name__ == "__main__": 67 | logging.getLogger().setLevel(logging.INFO) 68 | run() -------------------------------------------------------------------------------- /Dataflow/process_nested_data_sql_demo.py: -------------------------------------------------------------------------------- 1 | # Import required modules and methods 2 | import argparse 3 | import logging 4 | import apache_beam as beam 5 | import re 6 | from apache_beam.io import ReadFromText 7 | from apache_beam.io import WriteToText 8 | from apache_beam.transforms.sql import SqlTransform 9 | from apache_beam.options.pipeline_options import PipelineOptions 10 | import json 11 | import ast 12 | 13 | # Setting up the Apache Beam pipeline options. 
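# Input assumption (inferred from the transforms below): each line of ipl_player_stats.json
# is a JSON object roughly of the form
#   {"PlayerName": "...", "Age": "...", "Team": "...", "Previous3IPLBattingAvg": [...],
#    "Performance": [{"MatchNo": "...", "RunsScored": ..., "Wickets": ...}, ...]}
# and ParseJSON below flattens it into one record per entry of "Performance".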
14 | beam_options = PipelineOptions( 15 | save_main_session=True, 16 | #runner='DirectRunner', 17 | runner='DataflowRunner', 18 | project='gcp-dataeng-demos-355417', 19 | temp_location='gs://dataflow_demo19/temp', 20 | region='us-central1') 21 | 22 | # ParDo Class for parallel processing by applying user defined tranformations 23 | class ParseJSON(beam.DoFn): 24 | def process(self, element): 25 | try: 26 | dict_line = json.loads(element) 27 | lst = [] 28 | st = str(dict_line) 29 | st = st.split("'Performance':")[0] + "'Previous3IPLBattingAvg':" + str(dict_line['Previous3IPLBattingAvg']) + "," 30 | for l in dict_line['Performance']: 31 | result = (st + str(l).lstrip('{')) 32 | result = result.replace("'",'"') 33 | lst.append(result) 34 | return lst 35 | except: 36 | logging.info('Some Error occured') 37 | 38 | # Beam SQL Transformation query applied on Pcollection 39 | qry = '''SELECT 40 | PlayerName, 41 | Age, 42 | team, 43 | Previous3IPLBattingAvg, 44 | SUM(RunsScored) as total_RunsScored, 45 | SUM(Wickets) AS total_Wickets, 46 | FROM 47 | PCOLLECTION 48 | GROUP BY 49 | 1,2,3,4''' 50 | 51 | # Mapper function to update dict Previous3IPLBattingAvg values from String to List 52 | def StrLstUpdate(dct): 53 | dct.update({'Previous3IPLBattingAvg' : ast.literal_eval(dct['Previous3IPLBattingAvg'])}) 54 | return dct 55 | 56 | # Entry Function to run Pipeline 57 | def run(): 58 | # Set `save_main_session` to True so DoFns can access globally imported modules. 59 | with beam.Pipeline(options=beam_options) as p: 60 | 61 | result = ( 62 | p | 'Read from GCS' >> ReadFromText('gs://dataflow_demo19/input/ipl_player_stats.json') 63 | | 'Parse JSON and flatten' >> beam.ParDo(ParseJSON()) 64 | | 'Filter required data' >> beam.Filter(lambda x : ('"NotBowled"' not in x)) | beam.Filter(lambda x : ('"NotBatted"' not in x)) 65 | | 'Parse List to Dict' >> beam.Map(lambda x: json.loads(x)) 66 | | 'Convert as Beam Rows' >> beam.Map(lambda x: beam.Row( 67 | PlayerName = str(x['PlayerName']), 68 | Age = str(x['Age']), 69 | Team = str(x['Team']), 70 | MatchNo = str(x['MatchNo']), 71 | RunsScored = int(x['RunsScored']), 72 | Wickets = int(x['Wickets']), 73 | Previous3IPLBattingAvg = str(x['Previous3IPLBattingAvg']) 74 | ) 75 | ) 76 | 77 | | 'Get Palyer Stats by Appying Beam SQL Transform' >> SqlTransform(qry, dialect='zetasql') 78 | | 'Convert to Bigquery readable Dict' >> beam.Map(lambda row : row._asdict()) 79 | | 'Convert String representation of Previous3IPLBattingAvg to Nested' >> beam.Map(lambda x : StrLstUpdate(x)) 80 | #| beam.Map(print)) 81 | #Write to Big Query 82 | | 'Write Final results to BigQuery' >> beam.io.Write(beam.io.WriteToBigQuery( 83 | 'batach_data', 84 | dataset='dataflow_demos', 85 | project='gcp-dataeng-demos-355417', 86 | schema ='SCHEMA_AUTODETECT', 87 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, 88 | write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE 89 | ) 90 | )) 91 | 92 | if __name__ == "__main__": 93 | logging.getLogger().setLevel(logging.INFO) 94 | run() 95 | -------------------------------------------------------------------------------- /Dataproc/pyspark_bq_to_gcs_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Author : @ Anjan GCP Data Engineering 4 | This script is created to demo below concepts 5 | 1. Create Spark session on Dataproc cluster 6 | 2. Read input data from Big Query table 7 | 3. Apply Transformations to group and aggregate data by using Spark SQL 8 | 4. 
Write resultant data to GCS buacket --> File 9 | 10 | BigQuery I/O PySpark Demo - BigQuery --> Aggregate data --> write results to GCS 11 | 12 | """ 13 | 14 | from pyspark.sql import SparkSession 15 | 16 | # Spark session 17 | spark = SparkSession \ 18 | .builder \ 19 | .master('yarn') \ 20 | .appName('spark-bigquery-gcs-demo') \ 21 | .getOrCreate() 22 | 23 | # Use the Cloud Storage bucket for temporary BigQuery export data used 24 | # by the connector. 25 | bucket = "gcp-dataeng-demos" 26 | spark.conf.set('temporaryGcsBucket', bucket) 27 | 28 | # Load data from BigQuery Covid19 public dataset. 29 | covid19 = spark.read.format('bigquery') \ 30 | .option('table', 'bigquery-public-data:covid19_open_data.covid19_open_data') \ 31 | .load() 32 | covid19.createOrReplaceTempView('covid19') 33 | 34 | # Perform data aggregation. 35 | covid19 = spark.sql( 36 | 'SELECT \ 37 | country_name,\ 38 | EXTRACT(year FROM date) AS year,\ 39 | SUM(new_confirmed) AS new_confirmed,\ 40 | SUM(new_deceased) AS new_deceased,\ 41 | SUM(cumulative_confirmed) AS cumulative_confirmed,\ 42 | SUM(cumulative_deceased) AS cumulative_deceased\ 43 | FROM \ 44 | covid19 \ 45 | GROUP BY \ 46 | 1,\ 47 | 2 \ 48 | ORDER BY \ 49 | 1,\ 50 | 2') 51 | 52 | # Write results to GCS bucket 53 | covid19.write.csv('gs://gcp-dataeng-demos/coutrywise_cases') 54 | -------------------------------------------------------------------------------- /GCP_Data_Eng_concept_files/GCP_storage_db.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anjangcp/GCP-Data-Engineering-Demo-Codes/f92f4ce688459c9913d2ddef838c411eaeb969a0/GCP_Data_Eng_concept_files/GCP_storage_db.png -------------------------------------------------------------------------------- /GCP_Data_Eng_concept_files/Initfile.txt: -------------------------------------------------------------------------------- 1 | Welcome!! 
2 | -------------------------------------------------------------------------------- /GCP_Data_Eng_concept_files/gcp_etl_services.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anjangcp/GCP-Data-Engineering-Demo-Codes/f92f4ce688459c9913d2ddef838c411eaeb969a0/GCP_Data_Eng_concept_files/gcp_etl_services.png -------------------------------------------------------------------------------- /GCS/object_lifecycle_mngmnt_cli.txt: -------------------------------------------------------------------------------- 1 | 2 | /****** Object Lifecycle Management Config JSON File Example ****************/ 3 | 4 | { 5 | "lifecycle": { 6 | "rule": [ 7 | { 8 | "action": { 9 | "type": "SetStorageClass", 10 | "storageClass": "NEARLINE" 11 | }, 12 | "condition": { 13 | "age": 365, 14 | "matchesStorageClass": ["STANDARD"] 15 | } 16 | }, 17 | { 18 | "action": { 19 | "type": "SetStorageClass", 20 | "storageClass": "COLDLINE" 21 | }, 22 | "condition": { 23 | "age": 730, 24 | "matchesStorageClass": ["NEARLINE"] 25 | } 26 | } 27 | ] 28 | } 29 | } 30 | 31 | /****************** CLI Commands **********************/ 32 | 33 | 34 | -- List Rules 35 | 36 | gcloud storage buckets describe gs://gcp-data-eng-demos --format="default(lifecycle)" 37 | 38 | -- Create Rules 39 | 40 | gcloud storage buckets update gs://gcp-data-eng-demos \ 41 | --lifecycle-file=/home/gcpdataeng1982/gcs_lifecycle_config_file.json 42 | 43 | -- Clear Rules 44 | 45 | gcloud storage buckets update gs://gcp-data-eng-demos --clear-lifecycle 46 | -------------------------------------------------------------------------------- /GCS/object_lifecylce_mngmnt.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author : @ Anjan GCP Data Engineering 3 | Created for educational purpose only 4 | 5 | This code is to demo how to define and manage GCS Object Lifecycle Management rules using the Python client library 6 | ''' 7 | 8 | from google.cloud import storage 9 | 10 | def enable_bucket_lifecycle_management(bucket_name): 11 | """Manage lifecycle rules for a bucket""" 12 | 13 | storage_client = storage.Client() 14 | 15 | bucket = storage_client.get_bucket(bucket_name) 16 | rules = bucket.lifecycle_rules 17 | 18 | # Print current Rules 19 | print(f"Lifecycle management rules for bucket {bucket_name} are {list(rules)}") 20 | 21 | # Add new Rules 22 | #bucket.add_lifecycle_delete_rule(age=2) 23 | 24 | # Clearing Rules 25 | bucket.clear_lifecycle_rules() 26 | bucket.patch() 27 | 28 | # Print Rules again after modifications 29 | rules = bucket.lifecycle_rules 30 | print(f"Lifecycle management is enabled for bucket {bucket_name} and the rules are {list(rules)}") 31 | 32 | return bucket 33 | 34 | enable_bucket_lifecycle_management('gcp-data-eng-demos') 35 | -------------------------------------------------------------------------------- /GoogleCloudStorage/gcs_python_client_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Author : @ Anjan GCP Data Engineering 4 | 5 | Created for educational purpose only ... 6 | 7 | This Python file has different Python functions to explain 8 | 1. Create GCS bucket 9 | 2. Manage GCS bucket 10 | 3. Upload Storage objects 11 | 4. Download Storage objects 12 | 5. Bulk uploads and downloads 13 | 6. 
Delete bucket 14 | using Python GCS clinet libraries 15 | 16 | Insallations 17 | pip3 install google-cloud-storage 18 | 19 | """ 20 | 21 | from google.cloud import storage 22 | 23 | """ Create Bucket """ 24 | 25 | bucket_name = "demo_gcp_dataeng123" 26 | 27 | def create_bucket_class_location(bucket_name): 28 | """ 29 | Create a new bucket in the US region with the coldline storage 30 | class 31 | """ 32 | # bucket_name = "your-new-bucket-name" 33 | 34 | storage_client = storage.Client() 35 | 36 | bucket = storage_client.bucket(bucket_name) 37 | bucket.storage_class = "STANDARD" 38 | new_bucket = storage_client.create_bucket(bucket, location="us") 39 | 40 | print( 41 | "Created bucket {} in {} with storage class {}".format( 42 | new_bucket.name, new_bucket.location, new_bucket.storage_class 43 | ) 44 | ) 45 | return new_bucket 46 | 47 | create_bucket_class_location(bucket_name) 48 | 49 | """ List Buckets """ 50 | 51 | def list_buckets(): 52 | 53 | storage_client = storage.Client() 54 | buckets = storage_client.list_buckets() 55 | 56 | for bucket in buckets: 57 | print(bucket.name) 58 | 59 | #list_buckets() 60 | 61 | """ Get Bucket metadata info """ 62 | 63 | bucket_name = "demo_gcp_dataeng123" 64 | 65 | def bucket_metadata(bucket_name): 66 | """Prints out a bucket's metadata.""" 67 | # bucket_name = 'your-bucket-name' 68 | 69 | storage_client = storage.Client() 70 | bucket = storage_client.get_bucket(bucket_name) 71 | 72 | print(f"ID: {bucket.id}") 73 | print(f"Name: {bucket.name}") 74 | print(f"Storage Class: {bucket.storage_class}") 75 | print(f"Location: {bucket.location}") 76 | print(f"Location Type: {bucket.location_type}") 77 | print(f"Cors: {bucket.cors}") 78 | print(f"Default Event Based Hold: {bucket.default_event_based_hold}") 79 | print(f"Default KMS Key Name: {bucket.default_kms_key_name}") 80 | print(f"Metageneration: {bucket.metageneration}") 81 | print( 82 | f"Public Access Prevention: {bucket.iam_configuration.public_access_prevention}" 83 | ) 84 | print(f"Retention Effective Time: {bucket.retention_policy_effective_time}") 85 | print(f"Retention Period: {bucket.retention_period}") 86 | print(f"Retention Policy Locked: {bucket.retention_policy_locked}") 87 | print(f"Object Retention Mode: {bucket.object_retention_mode}") 88 | print(f"Requester Pays: {bucket.requester_pays}") 89 | print(f"Self Link: {bucket.self_link}") 90 | print(f"Time Created: {bucket.time_created}") 91 | print(f"Versioning Enabled: {bucket.versioning_enabled}") 92 | print(f"Labels: {bucket.labels}") 93 | 94 | #bucket_metadata(bucket_name) 95 | 96 | bucket_name = "demo_gcp_dataeng123" 97 | source_file_name = "/home/gcpdataeng36/input_batch_data.csv" 98 | destination_blob_name = "input_batch_data.csv" 99 | 100 | 101 | def upload_blob(bucket_name, source_file_name, destination_blob_name): 102 | """Uploads a file to the bucket.""" 103 | 104 | storage_client = storage.Client() 105 | bucket = storage_client.bucket(bucket_name) 106 | blob = bucket.blob(destination_blob_name) 107 | 108 | blob.upload_from_filename(source_file_name) 109 | 110 | print( 111 | f"File {source_file_name} uploaded to {destination_blob_name}." 
112 | ) 113 | #upload_blob(bucket_name, source_file_name, destination_blob_name) 114 | 115 | 116 | """ Get object's ACLs """ 117 | 118 | bucket_name = "demo_gcp_dataeng123" 119 | blob_name = 'input_batch_data.csv' 120 | def print_blob_acl(bucket_name, blob_name): 121 | """Prints out a blob's access control list.""" 122 | 123 | storage_client = storage.Client() 124 | bucket = storage_client.bucket(bucket_name) 125 | blob = bucket.blob(blob_name) 126 | 127 | for entry in blob.acl: 128 | print(f"{entry['role']}: {entry['entity']}") 129 | 130 | #print_blob_acl(bucket_name, blob_name) 131 | 132 | """ List all objcet in a bucket """ 133 | 134 | bucket_name = "demo_gcp_dataeng123" 135 | 136 | def list_blobs(bucket_name): 137 | """Lists all the blobs in the bucket.""" 138 | # bucket_name = "your-bucket-name" 139 | 140 | storage_client = storage.Client() 141 | 142 | # Note: Client.list_blobs requires at least package version 1.17.0. 143 | blobs = storage_client.list_blobs(bucket_name) 144 | 145 | # Note: The call returns a response only when the iterator is consumed. 146 | for blob in blobs: 147 | print(blob.name) 148 | 149 | #list_blobs(bucket_name) 150 | 151 | """ Upload multiple objects with transfer manager in parallel """ 152 | 153 | bucket_name = "demo_gcp_dataeng123" 154 | filenames = ["demo1.mov","input_batch_data.csv","ind_niftyrealtylist.csv"] 155 | source_directory="/home/gcpdataeng36" 156 | workers=8 157 | 158 | def upload_many_blobs_with_transfer_manager( 159 | bucket_name, filenames, source_directory, workers 160 | ): 161 | """Upload every file in a list to a bucket, concurrently in a process pool. 162 | 163 | Each blob name is derived from the filename, not including the 164 | `source_directory` parameter. For complete control of the blob name for each 165 | file (and other aspects of individual blob metadata), use 166 | transfer_manager.upload_many() instead. 167 | """ 168 | 169 | from google.cloud.storage import Client, transfer_manager 170 | import datetime 171 | 172 | storage_client = Client() 173 | bucket = storage_client.bucket(bucket_name) 174 | 175 | print("start time:",datetime.datetime.now()) 176 | 177 | results = transfer_manager.upload_many_from_filenames( 178 | bucket, filenames, source_directory=source_directory, max_workers=workers 179 | ) 180 | 181 | for name, result in zip(filenames, results): 182 | # The results list is either `None` or an exception for each filename in 183 | # the input list, in order. 
184 | 185 | if isinstance(result, Exception): 186 | print("Failed to upload {} due to exception: {}".format(name, result)) 187 | else: 188 | print("Uploaded {} to {}.".format(name, bucket.name)) 189 | print("end time:",datetime.datetime.now()) 190 | 191 | #upload_many_blobs_with_transfer_manager(bucket_name, filenames, source_directory, workers) 192 | 193 | """ Upload large files in chunks """ 194 | 195 | bucket_name = "demo_gcp_dataeng123" 196 | source_filename = "/home/gcpdataeng36/demo1.mov" 197 | destination_blob_name = "demo1.mov" 198 | workers=8 199 | 200 | def upload_chunks_concurrently( 201 | bucket_name, 202 | source_filename, 203 | destination_blob_name, 204 | chunk_size=32 * 1024 * 1024, 205 | workers=8, 206 | ): 207 | """Upload a single file, in chunks, concurrently in a process pool.""" 208 | 209 | from google.cloud.storage import Client, transfer_manager 210 | import datetime 211 | 212 | print("start time:",datetime.datetime.now()) 213 | storage_client = Client() 214 | bucket = storage_client.bucket(bucket_name) 215 | blob = bucket.blob(destination_blob_name) 216 | 217 | transfer_manager.upload_chunks_concurrently( 218 | source_filename, blob, chunk_size=chunk_size, max_workers=workers 219 | ) 220 | 221 | print(f"File {source_filename} uploaded to {destination_blob_name}.") 222 | print("end time:",datetime.datetime.now()) 223 | 224 | #upload_chunks_concurrently(bucket_name,source_filename,destination_blob_name,chunk_size=32 * 1024 * 1024,workers=8) 225 | 226 | """ Download multiple files """ 227 | 228 | bucket_name = "demo_gcp_dataeng123" 229 | blob_names = ["demo1.mov","input_batch_data.csv","ind_niftyrealtylist.csv"] 230 | destination_directory = "/home/gcpdataeng36/downloads" 231 | workers=8 232 | 233 | def download_many_blobs_with_transfer_manager( 234 | bucket_name, blob_names, destination_directory, workers=8 235 | ): 236 | """Download blobs in a list by name, concurrently in a process pool. 237 | 238 | The filename of each blob once downloaded is derived from the blob name and 239 | the `destination_directory `parameter. For complete control of the filename 240 | of each blob, use transfer_manager.download_many() instead. 241 | 242 | Directories will be created automatically as needed to accommodate blob 243 | names that include slashes. 244 | """ 245 | from google.cloud.storage import Client, transfer_manager 246 | 247 | storage_client = Client() 248 | bucket = storage_client.bucket(bucket_name) 249 | 250 | results = transfer_manager.download_many_to_path( 251 | bucket, blob_names, destination_directory=destination_directory, max_workers=workers 252 | ) 253 | 254 | for name, result in zip(blob_names, results): 255 | # The results list is either `None` or an exception for each blob in 256 | # the input list, in order. 257 | 258 | if isinstance(result, Exception): 259 | print("Failed to download {} due to exception: {}".format(name, result)) 260 | else: 261 | print("Downloaded {} to {}.".format(name, destination_directory + name)) 262 | 263 | 264 | #download_many_blobs_with_transfer_manager(bucket_name, blob_names, destination_directory, workers=8) 265 | 266 | """ delete bucket """ 267 | 268 | bucket_name = "demo_gcp_dataeng123" 269 | 270 | def delete_bucket(bucket_name): 271 | """Deletes a bucket. 
The bucket must be empty.""" 272 | # bucket_name = "your-bucket-name" 273 | 274 | storage_client = storage.Client() 275 | 276 | bucket = storage_client.get_bucket(bucket_name) 277 | bucket.delete() 278 | 279 | print(f"Bucket {bucket.name} deleted") 280 | 281 | #delete_bucket(bucket_name) 282 | -------------------------------------------------------------------------------- /GoogleCloudStorage/object_versioning_cli.txt: -------------------------------------------------------------------------------- 1 | gcloud storage buckets update gs://version-demo --versioning 2 | 3 | gcloud storage buckets describe gs://version-demo --format="default(versioning)" 4 | 5 | gcloud storage ls --all-versions gs://version-demo 6 | 7 | gcloud storage cp gs://version-demo/lables.key#1678617129615474 gs://version-demo/lables.key 8 | 9 | gcloud storage rm gs://version-demo/lables.key#1678617129615474 10 | 11 | gcloud storage buckets update gs://version-demo --no-versioning 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GCP-Data-Engineering-Demo-Codes 2 | Demo codes will be shared here, strictly for educational purposes. 3 | 4 | 1. Codes are categorized based on GCP services (Ex: BigQuery, Dataflow, Dataproc, etc.) 5 | 2. Codes include inline developer comments at each section of the code. 6 | 7 | Note for users: If you want to try these codes, please change the GCP project, BigQuery dataset/tables, GCS bucket/folder, etc. as per your needs. 8 | -------------------------------------------------------------------------------- /Security/secretmanager_python_connect.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Secret Manager Python client library to be installed with the below command 3 | ----------------------------------------- 4 | pip install google-cloud-secret-manager 5 | ''' 6 | def access_secret_version(project_id, secret_id, version_id): 7 | """ 8 | Access the payload for the given secret version if one exists. The version 9 | can be a version number as a string (e.g. "5") or an alias (e.g. "latest"). 10 | """ 11 | 12 | # Import the Secret Manager client library. 13 | from google.cloud import secretmanager 14 | 15 | # Create the Secret Manager client. 16 | client = secretmanager.SecretManagerServiceClient() 17 | 18 | # Build the resource name of the secret version. 19 | name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}" 20 | 21 | # Access the secret version. 22 | response = client.access_secret_version(request={"name": name}) 23 | # Print the secret payload. 24 | # This snippet shows how to access the secret material. 25 | payload = response.payload.data.decode("UTF-8") 26 | print("Plaintext: {}".format(payload)) 27 | 28 | # Function call to show output 29 | access_secret_version('gcp-data-eng-374308', 'cloudsql_pwd','1') 30 | --------------------------------------------------------------------------------
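A natural follow-on to the snippet above and to Dataflow/batch_etl_avro_data_cloudsql.py (which hard-codes its Cloud SQL password) is to return the secret payload instead of printing it and feed it to the Cloud SQL connector. A minimal sketch is shown below; the project ID, instance connection name, user and database names are placeholders, not values taken from this repo.

from google.cloud import secretmanager
from google.cloud.sql.connector import Connector
import sqlalchemy

def get_secret(project_id, secret_id, version_id="latest"):
    # Same API call as above, but returning the payload so it can be used as a password
    client = secretmanager.SecretManagerServiceClient()
    name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}"
    response = client.access_secret_version(request={"name": name})
    return response.payload.data.decode("UTF-8")

def getconn():
    # Placeholder instance connection name, user and database -- replace with your own
    connector = Connector()
    return connector.connect(
        "<your-project>:<region>:<instance>",
        "pymysql",
        user="root",
        password=get_secret("<your-project>", "cloudsql_pwd"),
        db="gcp_demos",
    )

# Connection pool that pulls the password from Secret Manager at connect time
pool = sqlalchemy.create_engine("mysql+pymysql://", creator=getconn)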