├── BigQuery
│   ├── BigQuery_partition_demo.sql
│   ├── bq_procedures.sql
│   ├── bq_table_clusters.sql
│   ├── data_lineage_demo.sql
│   ├── gcp_billingdata_analysis.sql
│   └── table_function.sql
├── CloudFunctions
│   ├── bq_events_to_gcs_cf.py
│   ├── gcs_events_cf.py
│   ├── pubsub_events_cf.py
│   └── sheets_to_bigquery_dataload_cf.py
├── CloudSQL
│   ├── cloudsql_python_connect.py
│   └── federated_query_demo.sql
├── CloudSpanner
│   ├── cloud_spanner_intro_demo.sql
│   └── spanner_python_client_demo.py
├── Common_Realtime_Usecases
│   ├── dataflow_spanner_demo
│   │   ├── beam_dataflow-to-spanner.py
│   │   ├── census-db-schema.sql
│   │   ├── census_100_testing.csv
│   │   ├── create-spanner-database-CLI.txt
│   │   └── installations_beam.sh
│   └── iam_snapshots.py
├── Composer
│   ├── airflow_dataproc_automate_dag.py
│   ├── dataflow_python_operator_dag.py
│   ├── gcs_to_bq_and_bq_operators.py
│   ├── https_operators_demo_dag.py
│   └── python_bash_operators_dag.py
├── Dataflow
│   ├── batch_etl_avro_data_cloudsql.py
│   ├── beam_stream_data_process.py
│   ├── dataflow_batch_demo.py
│   ├── dataflow_batch_log_process.py
│   └── process_nested_data_sql_demo.py
├── Dataproc
│   └── pyspark_bq_to_gcs_demo.py
├── GCP_Data_Eng_concept_files
│   ├── GCP_storage_db.png
│   ├── Initfile.txt
│   └── gcp_etl_services.png
├── GCS
│   ├── object_lifecycle_mngmnt_cli.txt
│   └── object_lifecylce_mngmnt.py
├── GoogleCloudStorage
│   ├── gcs_python_client_demo.py
│   └── object_versioning_cli.txt
├── README.md
└── Security
    └── secretmanager_python_connect.py
/BigQuery/BigQuery_partition_demo.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Author : @ Anjan GCP Data Engineering
3 | 
4 | SQLs created to demo BigQuery table partitioning:
5 | 1. TIME UNIT (MONTHLY)
6 | 2. INTEGER RANGE
7 | 3. INGESTION TIME UNIT
8 | 
9 | */
10 | 
11 | /************** Time Unit Partitioning *******************/
12 | 
13 | -- Query this table to understand the data distribution across different dates
14 | 
15 | SELECT min(start_time), max(start_time) FROM `gcp-data-eng-374308.bigquery_demos.bikeshare_trips`;
16 | 
17 | select DATE_TRUNC(start_time, DAY) as day, count(*) from `gcp-data-eng-374308.bigquery_demos.bikeshare_trips`
18 | group by 1 order by 1;
19 | 
20 | select DATE_TRUNC(start_time, MONTH) as month, count(*) from `gcp-data-eng-374308.bigquery_demos.bikeshare_trips`
21 | group by 1 order by 1;
22 | 
23 | select DATE_TRUNC(start_time, YEAR) as year, count(*) from `gcp-data-eng-374308.bigquery_demos.bikeshare_trips`
24 | group by 1 order by 1;
25 | 
26 | --Create MONTHLY partitioned table based on a TIME UNIT column
27 | create or replace table bigquery_demos.bikeshare_trips_p
28 | (
29 |   trip_id INT64,
30 |   subscriber_type STRING,
31 |   bikeid STRING,
32 |   start_time TIMESTAMP,
33 |   start_station_id INT64,
34 |   start_station_name STRING,
35 |   end_station_id STRING,
36 |   end_station_name STRING,
37 |   duration_minutes INT64
38 | )
39 | PARTITION BY
40 |   TIMESTAMP_TRUNC(start_time, MONTH);
41 | 
42 | --Create a DAILY partitioned table using a SQL query result
43 | 
44 | create or replace table bigquery_demos.bikeshare_trips_sql
45 | (
46 |   trip_id INT64,
47 |   subscriber_type STRING,
48 |   bikeid STRING,
49 |   start_time TIMESTAMP,
50 |   start_station_id INT64,
51 |   start_station_name STRING,
52 |   end_station_id STRING,
53 |   end_station_name STRING,
54 |   duration_minutes INT64
55 | )
56 | PARTITION BY
57 |   TIMESTAMP_TRUNC(start_time, DAY)
58 | AS (SELECT *
59 |     FROM `gcp-data-eng-374308.bigquery_demos.bikeshare_trips`);
60 | 
61 | --Insert data into the partitioned table
62 | insert into bigquery_demos.bikeshare_trips_p
63 | select * from bigquery_demos.bikeshare_trips;
64 | 
65 | -- Query the non-partitioned table
66 | select * from bigquery_demos.bikeshare_trips
67 | where start_time > '2020-12-01 00:00:00 UTC';
68 | 
69 | -- Query the partitioned table and see the difference
70 | select * from bigquery_demos.bikeshare_trips_p
71 | where start_time > '2020-12-01 00:00:00 UTC';
72 | 
73 | 
74 | /************** Integer Range Partitioning *******************/
75 | 
76 | -- Query this table to understand the data distribution across the INTEGER type column
77 | SELECT id,
78 |        text,
79 |        score,
80 |        creation_date
81 | FROM `bigquery-public-data.stackoverflow.comments`;
82 | 
83 | --Create partitioned table
84 | create or replace table bigquery_demos.stackoverflow_comments_p
85 | (
86 |   id INT64,
87 |   text STRING,
88 |   score INT64,
89 |   creation_date TIMESTAMP
90 | )
91 | partition by RANGE_BUCKET(id, GENERATE_ARRAY(0, 140390264, 100000));
92 | 
93 | --Insert data into the partitioned table
94 | insert into bigquery_demos.stackoverflow_comments_p
95 | SELECT id,
96 |        text,
97 |        score,
98 |        creation_date
99 | FROM `bigquery-public-data.stackoverflow.comments`;
100 | 
101 | 
102 | --Query the non-partitioned table
103 | SELECT id,
104 |        text,
105 |        score,
106 |        creation_date
107 | FROM `bigquery-public-data.stackoverflow.comments`
108 | where id between 1000 and 100000;
109 | 
110 | --Query the partitioned table
111 | SELECT id,
112 |        text,
113 |        score,
114 |        creation_date
115 | FROM `bigquery_demos.stackoverflow_comments_p`
116 | where id between 1000 and 100000;
117 | 
118 | /************** Data Ingestion Time Unit Partitioning *******************/
119 | 
120 | --See the data distribution across HOUR/DAY/MONTH/YEAR
121 | SELECT TIMESTAMP_TRUNC(trip_start_timestamp, HOUR), count(*)
122 | FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
123 | where trip_start_timestamp > '2021-10-01 18:15:00 UTC'
124 | group by 1
125 | order by 1 desc;
126 | 
127 | --Create a partitioned table based on ingestion time, with HOUR as the partitioning granularity
128 | create or replace table bigquery_demos.taxi_trips
129 | (
130 |   unique_key STRING,
131 |   taxi_id STRING,
132 |   trip_start_timestamp TIMESTAMP,
133 |   trip_end_timestamp TIMESTAMP,
134 |   trip_seconds INT64,
135 |   trip_miles FLOAT64,
136 |   pickup_census_tract INT64,
137 |   dropoff_census_tract INT64,
138 |   pickup_community_area INT64,
139 |   dropoff_community_area INT64,
140 |   fare FLOAT64,
141 |   tips FLOAT64,
142 |   tolls FLOAT64,
143 |   extras FLOAT64,
144 |   trip_total FLOAT64,
145 |   payment_type STRING,
146 |   company STRING,
147 |   pickup_latitude FLOAT64,
148 |   pickup_longitude FLOAT64,
149 |   pickup_location STRING,
150 |   dropoff_latitude FLOAT64,
151 |   dropoff_longitude FLOAT64,
152 |   dropoff_location STRING
153 | )
154 | PARTITION BY
155 |   TIMESTAMP_TRUNC(_PARTITIONTIME, HOUR)
156 | OPTIONS (
157 |   partition_expiration_days = 3,
158 |   require_partition_filter = TRUE);
159 | 
160 | -- Query the partitioned table
161 | SELECT
162 |   *
163 | FROM
164 |   bigquery_demos.taxi_trips
165 | WHERE
166 |   _PARTITIONTIME > TIMESTAMP_SUB(TIMESTAMP('2016-04-15'), INTERVAL 2 HOUR);
167 | 
168 | SELECT
169 |   *
170 | FROM
171 |   bigquery_demos.taxi_trips
172 | WHERE
173 |   _PARTITIONTIME BETWEEN TIMESTAMP('2016-04-14') AND TIMESTAMP('2016-04-15');
174 | 
175 | -- If you want to update the partition filter requirement or the expiration, use the DDLs below
176 | 
177 | ALTER TABLE bigquery_demos.taxi_trips
178 | SET OPTIONS (
179 |   -- Sets partition expiration to 5 days
180 |   partition_expiration_days = 5,
181 |   require_partition_filter = false);
182 | 
183 | 
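184 | 
185 | /*
186 | Optional add-on (not part of the original demo): a quick way to confirm that the
187 | partitioned tables created above actually contain partitions, and to see how the
188 | data is spread across them, is the INFORMATION_SCHEMA.PARTITIONS view. This is
189 | only a sketch; it assumes the bigquery_demos dataset and the table names used
190 | earlier in this script.
191 | */
192 | SELECT
193 |   table_name,
194 |   partition_id,
195 |   total_rows,
196 |   total_logical_bytes
197 | FROM
198 |   bigquery_demos.INFORMATION_SCHEMA.PARTITIONS
199 | WHERE
200 |   table_name IN ('bikeshare_trips_p', 'stackoverflow_comments_p', 'taxi_trips')
201 | ORDER BY
202 |   table_name, partition_id;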
--------------------------------------------------------------------------------
/BigQuery/bq_procedures.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Author : @ Anjan GCP Data Engineering
3 | SQLs created for the BigQuery Procedures and Anonymous Blocks demo
4 | */
5 | 
6 | -- Metadata view which gives table metadata details like row count, size, etc.
7 | 
8 | SELECT * FROM austin_crime.__TABLES__;
9 | 
10 | -- Query to get all table metadata details with formatted results
11 | SELECT
12 |   dataset_id AS dataset_name,
13 |   table_id AS table_name,
14 |   current_date AS stats_collect_date,
15 |   row_count AS record_count,
16 |   TIMESTAMP_MILLIS(last_modified_time) AS last_modified_time,
17 |   size_bytes/POW(10,9) AS size_in_gb
18 | FROM
19 |   `gcp-data-eng-374308`.austin_crime.__TABLES__
20 | WHERE
21 |   type=1;
22 | 
23 | -- Table to capture the stats such as table row count, size, etc.
24 | CREATE OR REPLACE TABLE
25 |   analysis.table_stats ( dataset_name STRING,
26 |     table_name STRING,
27 |     stats_collect_date DATE,
28 |     record_count INT64,
29 |     last_modified_time TIMESTAMP,
30 |     size_in_gb FLOAT64 );
31 | 
32 | select * from analysis.table_stats;
33 | 
34 | /*************************************************************************************/
35 | 
36 | /*
37 | Author : @ Anjan GCP Data Engineering
38 | 
39 | Anonymous block to capture table stats (row count, size, last modified time, etc.) for all the
40 | tables in a project (ALL datasets)
41 | 
42 | Steps:
43 | 1. Loop to iterate through all datasets in a BigQuery project
44 | 2. Delete the data if any already exists for the DATE on which this block is run
45 | 3. Construct dynamic SQL for each dataset to get the stats and insert them into the resultant table --> table_stats
46 | 4. Execute the dynamic SQL to capture the actual results
47 | 
48 | Created for the BigQuery Procedures and Anonymous Blocks demo
49 | 
50 | */
51 | 
52 | #standardSQL
53 | DECLARE DATASETS_TO_CHECK ARRAY<STRING>;
54 | DECLARE i INT64 DEFAULT 0;
55 | DECLARE Dataset STRING ;
56 | declare Qry string;
57 | 
58 | SET DATASETS_TO_CHECK = (
59 |   WITH req_datasets as
60 |   ( select schema_name
61 |     from `gcp-data-eng-374308`.INFORMATION_SCHEMA.SCHEMATA
62 |   )
63 |   SELECT ARRAY_AGG(schema_name) from req_datasets
64 | );
65 | 
66 | LOOP SET i = i + 1;
67 | BEGIN
68 | IF i > ARRAY_LENGTH(DATASETS_TO_CHECK) THEN
69 |   LEAVE;
70 | END IF;
71 | 
72 | delete from analysis.table_stats where dataset_name=DATASETS_TO_CHECK[ORDINAL(i)] and stats_collect_date = current_date;
73 | set Qry =CONCAT("insert analysis.table_stats select dataset_id as dataset_name,table_id as table_name,current_date as stats_collect_date, row_count as record_count,TIMESTAMP_MILLIS(last_modified_time) AS last_modified_time,size_bytes/pow(10,9) as size_in_gb FROM `gcp-data-eng-374308`.", DATASETS_TO_CHECK[ORDINAL(i)],".__TABLES__ where type=1");
74 | 
75 | execute immediate Qry;
76 | EXCEPTION
77 |   WHEN ERROR THEN CONTINUE ;
78 | END;
79 | END LOOP;
80 | 
81 | /************************************************************************************************************************/
82 | 
83 | /*
84 | Author : @ Anjan GCP Data Engineering
85 | 
86 | Procedure that takes an INPUT comma-separated dataset list
87 | Steps:
88 | 1. FOR loop to iterate through the given dataset list
89 | 2. Delete the data if any already exists for the DATE on which this procedure is run
90 | 3. Construct dynamic SQL for each dataset to get the stats and insert them into the resultant table --> table_stats
91 | 4. Execute the dynamic SQL to capture the actual results
92 | 5. Call/execute the procedure with the CALL keyword
93 | 
94 | Created for the BigQuery Procedures and Anonymous Blocks demo
95 | */
96 | CREATE OR REPLACE PROCEDURE
97 |   analysis.sp_collect_stats(dataset_list STRING, OUT status STRING)
98 | BEGIN
99 | DECLARE qry STRING;
100 | 
101 | FOR rec IN (
102 |   SELECT
103 |     schema_name as dataset_name
104 |   FROM
105 |     `gcp-data-eng-374308`.INFORMATION_SCHEMA.SCHEMATA
106 |   WHERE
107 |     schema_name IN (SELECT * FROM UNNEST(SPLIT(dataset_list))) )
108 | DO
109 | 
110 | DELETE FROM analysis.table_stats WHERE dataset_name=rec.dataset_name AND stats_collect_date = current_date;
111 | 
112 | SET qry =CONCAT("insert analysis.table_stats select dataset_id as dataset_name,table_id as table_name,current_date as stats_collect_date,row_count as record_count,TIMESTAMP_MILLIS(last_modified_time) AS last_modified_time,size_bytes/pow(10,9) as size_in_gb FROM `gcp-data-eng-374308`.", rec.dataset_name,".__TABLES__ where type=1");
113 | 
114 | EXECUTE IMMEDIATE qry;
115 | 
116 | set status = 'SUCCESS';
117 | 
118 | END FOR;
119 | END;
120 | 
121 | --Calling the procedure
122 | begin
123 | declare out_status string;
124 | CALL analysis.sp_collect_stats('analysis,austin_crime',out_status);
125 | select out_status;
126 | end;
127 | 
--------------------------------------------------------------------------------
/BigQuery/bq_table_clusters.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Author : @ Anjan GCP Data Engineering
3 | SQLs created to demo BigQuery table clustering
4 | */
5 | -- Create clustered table
6 | CREATE OR REPLACE TABLE bigquery_demos.pageviews_cluster
7 | (
8 |   datehour TIMESTAMP,
9 |   wiki STRING,
10 |   title STRING,
11 |   views INTEGER
12 | )
13 | CLUSTER BY
14 |   wiki
15 | OPTIONS (
16 |   description = 'a table clustered by wiki');
17 | 
18 | -- Create clustered table from a SQL query
19 | CREATE OR REPLACE TABLE bigquery_demos.pageviews_cluster
20 | (
21 |   datehour TIMESTAMP,
22 |   wiki STRING,
23 |   title STRING,
24 |   views INTEGER
25 | )
26 | CLUSTER BY
27 |   wiki
28 | AS (
29 |   SELECT * FROM bigquery_demos.pageviews
30 | );
31 | 
32 | --Insert data into the clustered table (DML)
33 | insert into bigquery_demos.pageviews_cluster
34 | select * from `gcp-data-eng-374308.bigquery_demos.pageviews`;
35 | 
36 | -- Create clustered table with partitioning
37 | CREATE OR REPLACE TABLE bigquery_demos.pageviews_cluster_partition
38 | (
39 |   datehour TIMESTAMP,
40 |   wiki STRING,
41 |   title STRING,
42 |   views INTEGER
43 | )
44 | PARTITION BY TIMESTAMP_TRUNC(datehour,DAY)
45 | CLUSTER BY
46 |   wiki
47 | OPTIONS (
48 |   description = 'a table clustered by wiki and partitioned by date');
49 | 
50 | -- Insert data (DML)
51 | insert into bigquery_demos.pageviews_cluster_partition
52 | select * from `gcp-data-eng-374308.bigquery_demos.pageviews`;
53 | 
54 | 
55 | 
56 | -- Query non-clustered table
57 | SELECT * FROM `gcp-data-eng-374308.bigquery_demos.pageviews`
58 | where DATE(datehour) > "2023-03-28"
59 | and wiki ='sr.m'
60 | limit 10;
61 | 
62 | -- Query clustered table
63 | SELECT * FROM `gcp-data-eng-374308.bigquery_demos.pageviews_cluster`
64 | where DATE(datehour) > "2023-03-28"
65 | and wiki ='sr.m'
66 | limit 10;
67 | 
68 | -- Query partitioned, clustered table
69 | SELECT * FROM `gcp-data-eng-374308.bigquery_demos.pageviews_cluster_partition`
70 | where DATE(datehour) > "2023-03-28"
71 |
and wiki ='sr.m' 72 | limit 10; 73 | 74 | -------------------------------------------------------------------------------- /BigQuery/data_lineage_demo.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Author : @ Anjan GCP Data Engineering 3 | Created SQLs to demo Biqguery Datalineage 4 | */ 5 | 6 | -- Create conslolidated sales table bi joining customer and items tables 7 | create or replace table data_eng_demos.cust_item_sales_dtls as 8 | SELECT 9 | customer.fname||''||customer.lname as customer_name, 10 | items.itm_name, 11 | sales.qty, 12 | sales.price, 13 | sales.ord_date 14 | FROM 15 | `gcp-dataeng-demo-431907.data_eng_demos.customer` AS customer 16 | INNER JOIN `gcp-dataeng-demo-431907.data_eng_demos.sales` AS sales ON customer.cust_id = sales.cust_id 17 | INNER JOIN `gcp-dataeng-demo-431907.data_eng_demos.items` AS items ON sales.item_id = items.item_id; 18 | 19 | -- Create Aggregate table based on customer name 20 | create or replace table data_eng_demos.customer_agg_sales as 21 | SELECT 22 | customer_name, 23 | SUM(qty) AS tot_qty, 24 | SUM(price) AS tot_price 25 | FROM 26 | data_eng_demos.cust_item_sales_dtls 27 | GROUP BY 28 | 1; 29 | 30 | -- Create Aggregate table based on item name 31 | create or replace table data_eng_demos.item_agg_sales as 32 | SELECT 33 | itm_name, 34 | SUM(qty) AS tot_qty, 35 | SUM(price) AS tot_price 36 | FROM 37 | data_eng_demos.cust_item_sales_dtls 38 | GROUP BY 39 | 1; 40 | 41 | -------------------------------------------------------------------------------- /BigQuery/gcp_billingdata_analysis.sql: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | 4 | Author : @ Anjan GCP Data Engineering 5 | 6 | SQLs to analyze Billing data based on different dimensions 7 | 8 | */ 9 | 10 | /************** Billing main columns and plain data *****************/ 11 | SELECT 12 | invoice.month, 13 | service.description as service, 14 | usage_start_time, 15 | usage_end_time, 16 | project.name, 17 | location.region, 18 | cost, 19 | currency, 20 | usage.amount, 21 | usage.unit, 22 | (select SUM(c.amount) 23 | from UNNEST(credits) c) as credits_amount, 24 | (select STRING_AGG(c.full_name) 25 | from UNNEST(credits) c) as crdit_full_name 26 | FROM 27 | `gcp-data-eng-374308.analysis.gcp_billing_export_v1_01CBD1_B38C45_E20EEB`; 28 | 29 | /************** Total uasge cost per month without Credits *****************/ 30 | 31 | SELECT 32 | invoice.month, 33 | SUM(cost) AS total, 34 | SUM(CAST(cost AS NUMERIC)) AS total_exact 35 | FROM `gcp-data-eng-374308.analysis.gcp_billing_export_v1_01CBD1_B38C45_E20EEB` 36 | GROUP BY 1 37 | ORDER BY 1 ASC; 38 | 39 | /************** Total uasge cost per month with Credits *****************/ 40 | 41 | SELECT 42 | invoice.month, 43 | SUM(cost) 44 | + SUM(IFNULL((SELECT SUM(c.amount) 45 | FROM UNNEST(credits) c), 0)) 46 | AS total, 47 | (SUM(CAST(cost AS NUMERIC)) 48 | + SUM(IFNULL((SELECT SUM(CAST(c.amount AS NUMERIC)) 49 | FROM UNNEST(credits) AS c), 0))) 50 | AS total_exact 51 | FROM `gcp-data-eng-374308.analysis.gcp_billing_export_v1_01CBD1_B38C45_E20EEB` 52 | GROUP BY 1 53 | ORDER BY 1 ASC; 54 | 55 | /************** Total uasge cost per month group by Service without Credits *****************/ 56 | 57 | SELECT 58 | invoice.month, 59 | service.description as service, 60 | SUM(cost) AS total, 61 | (SUM(CAST(cost AS NUMERIC))) AS total_exact 62 | FROM `gcp-data-eng-374308.analysis.gcp_billing_export_v1_01CBD1_B38C45_E20EEB` 63 | GROUP BY 1,2 64 | ORDER BY 1 
ASC; 65 | 66 | /************** Total uasge cost per month group by Service with Credits *****************/ 67 | 68 | SELECT 69 | invoice.month, 70 | service.description as service, 71 | SUM(cost) 72 | + SUM(IFNULL((SELECT SUM(c.amount) 73 | FROM UNNEST(credits) c), 0)) 74 | AS total, 75 | (SUM(CAST(cost AS NUMERIC)) 76 | + SUM(IFNULL((SELECT SUM(CAST(c.amount AS NUMERIC)) 77 | FROM UNNEST(credits) AS c), 0))) 78 | AS total_exact 79 | FROM `gcp-data-eng-374308.analysis.gcp_billing_export_v1_01CBD1_B38C45_E20EEB` 80 | GROUP BY 1,2 81 | ORDER BY 1 ASC; 82 | 83 | 84 | /************** Total uasge cost for a perticular service *****************/ 85 | 86 | SELECT 87 | SUM(cost) AS cost_before_credits, 88 | labels.value AS cluster_name 89 | FROM `gcp-data-eng-374308.analysis.gcp_billing_export_resource_v1_01CBD1_B38C45_E20EEB` 90 | LEFT JOIN UNNEST(labels) as labels 91 | ON labels.key = "goog-k8s-cluster-name" 92 | GROUP BY labels.value; 93 | 94 | -------------------------------------------------------------------------------- /BigQuery/table_function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Author : @ Anjan GCP Data Engineering 3 | */ 4 | # Creating Table function 5 | CREATE OR REPLACE TABLE FUNCTION gcp_dataeng_demos.TableFunctionDemo(filter_timestap timestamp) AS 6 | ( 7 | SELECT 8 | id, 9 | owner_display_name, 10 | score 11 | FROM 12 | `bigquery-public-data.stackoverflow.posts_answers` 13 | WHERE 14 | creation_date >= TIMESTAMP(filter_timestap) 15 | AND owner_display_name IS NOT NULL 16 | ); 17 | 18 | # Query Table function 19 | SELECT 20 | * 21 | FROM 22 | gcp_dataeng_demos.TableFunctionDemo(TIMESTAMP('2022-06-01 00:00:00.000000 UTC')); 23 | 24 | # Join Table function result with other table 25 | SELECT 26 | a.id, 27 | a.owner_display_name, 28 | a.score, 29 | b.view_count, 30 | b.title 31 | FROM 32 | `bigquery-public-data.stackoverflow.posts_questions` b 33 | JOIN 34 | gcp_dataeng_demos.TableFunctionDemo(TIMESTAMP('2022-01-01 00:00:00.000000 UTC')) a 35 | ON 36 | UPPER(a.owner_display_name) =UPPER( b.owner_display_name) 37 | -------------------------------------------------------------------------------- /CloudFunctions/bq_events_to_gcs_cf.py: -------------------------------------------------------------------------------- 1 | import functions_framework 2 | from google.cloud import bigquery 3 | from datetime import datetime 4 | 5 | ''' 6 | Dependencies to be installed 7 | 8 | db-dtypes 9 | fsspec 10 | gcsfs 11 | bigquery 12 | 13 | ''' 14 | 15 | # CloudEvent function to be triggered by an Eventarc Cloud Audit Logging trigger 16 | # Note: this is NOT designed for second-party (Cloud Audit Logs -> Pub/Sub) triggers! 
17 | @functions_framework.cloud_event 18 | def hello_auditlog(cloudevent): 19 | 20 | # Print out details from the `protoPayload` 21 | # This field encapsulates a Cloud Audit Logging entry 22 | # See https://cloud.google.com/logging/docs/audit#audit_log_entry_structure 23 | 24 | payload = cloudevent.data.get("protoPayload") 25 | if payload: 26 | 27 | # Timestamp in string format 28 | now = datetime.now() 29 | timpstamp = now.strftime("%m%d%Y%H%M%S") 30 | 31 | # Build Big Query client 32 | bucket_name = 'data_eng_demos' 33 | project = "gcp-dataeng-demos-365206" 34 | dataset_id = "gcp_dataeng_demos" 35 | table_id = "demo_cf" 36 | 37 | # Write data into GCS/csv file using dataframe 38 | client = bigquery.Client(project=project) 39 | destination_uri = "gs://{}/{}".format(bucket_name, "bq_to_gcs_extract" + timpstamp + ".csv") 40 | qry = "select * from " + project + "." + dataset_id + "." + table_id 41 | df_qry_result = client.query(qry).to_dataframe() 42 | df_qry_result.to_csv(destination_uri) 43 | 44 | print( 45 | "Exported {}:{}.{} to {}".format(project, dataset_id, table_id, destination_uri) 46 | ) 47 | -------------------------------------------------------------------------------- /CloudFunctions/gcs_events_cf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pandas.io import gbq 3 | from google.cloud import bigquery 4 | 5 | ''' 6 | Python Dependencies to be installed 7 | 8 | gcsfs 9 | fsspec 10 | pandas 11 | pandas-gbq 12 | 13 | ''' 14 | 15 | def hello_gcs(event, context): 16 | """Triggered by a change to a Cloud Storage bucket. 17 | Args: 18 | event (dict): Event payload. 19 | context (google.cloud.functions.Context): Metadata for the event. 20 | """ 21 | 22 | lst = [] 23 | file_name = event['name'] 24 | table_name = file_name.split('.')[0] 25 | 26 | # Event,File metadata details writing into Big Query 27 | dct={ 28 | 'Event_ID':context.event_id, 29 | 'Event_type':context.event_type, 30 | 'Bucket_name':event['bucket'], 31 | 'File_name':event['name'], 32 | 'Created':event['timeCreated'], 33 | 'Updated':event['updated'] 34 | } 35 | lst.append(dct) 36 | df_metadata = pd.DataFrame.from_records(lst) 37 | df_metadata.to_gbq('gcp_dataeng_demos.data_loading_metadata', 38 | project_id='gcp-dataeng-demos-365206', 39 | if_exists='append', 40 | location='us') 41 | 42 | # Actual file data , writing to Big Query 43 | df_data = pd.read_csv('gs://' + event['bucket'] + '/' + file_name) 44 | 45 | df_data.to_gbq('gcp_dataeng_demos.' + table_name, 46 | project_id='gcp-dataeng-demos-365206', 47 | if_exists='append', 48 | location='us') 49 | -------------------------------------------------------------------------------- /CloudFunctions/pubsub_events_cf.py: -------------------------------------------------------------------------------- 1 | 2 | def hello_pubsub(event, context): 3 | """ 4 | Background Cloud Function to be triggered by Pub/Sub. 
5 | Required dependencies : bigquery 6 | Configure environment variables while creating and deploying cloud function 7 | for bigquery dataset and table 8 | dataset:gcp_dataeng_demos 9 | table:cf_pubsub_demo 10 | 11 | """ 12 | import base64 13 | import json 14 | import os,sys 15 | from google.cloud import bigquery 16 | 17 | if 'data' in event: 18 | data_buffer = base64.b64decode(event['data']).decode('utf-8') 19 | 20 | message='{'+'"Actvity_Time": "{}"'.format(json.loads(data_buffer)['timestamp']) + ',' +'"Resource_Name": "{}"'.format(json.loads(data_buffer)['protoPayload']['resourceName']) + ',' +'"Actvity_Type": "{}"'.format(json.loads(data_buffer)['protoPayload']['methodName']) + ',' +'"Activity_done_by": "{}"'.format(json.loads(data_buffer)['protoPayload']['authenticationInfo']['principalEmail']) + ',' + '"Change_in_IAM_policies": "{}"'.format(json.loads(data_buffer)['protoPayload']['serviceData']['policyDelta']['bindingDeltas'])+'}' 21 | bq_data=json.loads(message) 22 | print(bq_data) 23 | 24 | def to_bigquery(dataset, table, document): 25 | bigquery_client = bigquery.Client() 26 | dataset_ref = bigquery_client.dataset(dataset) 27 | table_ref = dataset_ref.table(table) 28 | table = bigquery_client.get_table(table_ref) 29 | errors = bigquery_client.insert_rows(table, [document]) 30 | if errors != [] : 31 | print(errors, file=sys.stderr) 32 | to_bigquery(os.environ['dataset'], os.environ['table'], bq_data) 33 | 34 | else: 35 | print('Hello World') 36 | -------------------------------------------------------------------------------- /CloudFunctions/sheets_to_bigquery_dataload_cf.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author : @ Anjan GCP Data Engineering 3 | Cloud function code to: 4 | 1. Read data from publicly shared google sheets 5 | 2. Load data into Bigquery using pandas and bigquery APIs 6 | ''' 7 | 8 | import functions_framework 9 | import pandas as pd 10 | import pandas_gbq 11 | 12 | @functions_framework.http 13 | def hello_http(request): 14 | 15 | message = 'Function executed Successfully' 16 | # Read Google Sheet data into Pandas dataframe 17 | # and write that data into Bigquery Table 18 | sheet_id = '1r9b-CN86hCmwnnm_-6aosLwZpzKyLRjsW9fxr0ALVRE' 19 | sheet_name = "Sheet1" 20 | url_1 = "https://docs.google.com/spreadsheets/d/{}/gviz/tq?tqx=out:csv&sheet={}".format(sheet_id,sheet_name) 21 | #print(url_1) 22 | df = pd.read_csv(url_1) 23 | df.to_gbq('gcp_dataeng_demos.public_fruit_to_bq', 24 | 'gcp-dataeng-demos-383407', 25 | chunksize=10000, 26 | if_exists='append' 27 | ) 28 | print("Data loaded successfully") 29 | return message 30 | 31 | 32 | 33 | """ 34 | 35 | Author : @ Anjan GCP Data Engineering 36 | Cloud function code to: 37 | 1. Read data from private google sheets 38 | 2. grant edit access to compute engine SA on google sheet 39 | 3. upload SA creds to secret manager 40 | 4. authenticate SA by downloading SA creds into CF code using python API 41 | 2. 
Load data into Bigquery using pandas and bigquery APIs 42 | 43 | Roles required by SA - 44 | 45 | Secret Manager Secret Accessor 46 | 47 | Installations required - 48 | 49 | functions-framework==3.* 50 | google-cloud-secret-manager 51 | requests 52 | pandas 53 | gspread_pandas 54 | pandas_gbq 55 | 56 | """ 57 | 58 | # Imports 59 | import functions_framework 60 | import requests as req 61 | import pandas as pd 62 | from google.cloud import secretmanager 63 | import gspread_pandas 64 | import json 65 | import pandas_gbq 66 | 67 | # Cloud Function 68 | @functions_framework.http 69 | def hello_http(request): 70 | message = 'Function executed Successfully' 71 | # Create the Secret Manager client. 72 | client = secretmanager.SecretManagerServiceClient() 73 | # Build the resource name of the secret version. 74 | project_id = '414888653736' 75 | secret_id = 'sa_cred' 76 | version_id = '1' 77 | name = "projects/{}/secrets/{}/versions/{}".format(project_id,secret_id,version_id) 78 | # Access the secret version. 79 | response = client.access_secret_version(request={"name": name}) 80 | # Print the secret payload. 81 | # snippet is showing how to access the secret material. 82 | payload = response.payload.data.decode("UTF-8") 83 | # convert secret value into json format 84 | credentials = json.loads(payload) 85 | # Defining scopes for gsheet and gdrive APIs 86 | scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'] 87 | # Access gsheet into gspread_pandas varaible 88 | google_sheet_file_1 = gspread_pandas.Spread('1GmKAaZQS-sLaQRmMzNXaFt6lXOkQbvxGNx2_c3NnoVk', config=credentials) 89 | # Convert into pandas dataframe 90 | df = google_sheet_file_1.sheet_to_df(header_rows=1).astype(str) 91 | df.reset_index(inplace=True) 92 | # Write values into Bigquery Table with append mode 93 | df.to_gbq('gcp_dataeng_demos.sa_sheet_to_bq', 94 | 'gcp-dataeng-demos-383407', 95 | chunksize=10000, 96 | if_exists='append' 97 | ) 98 | print("Data loaded successfully") 99 | return message 100 | -------------------------------------------------------------------------------- /CloudSQL/cloudsql_python_connect.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Installations required - 3 | pip install cloud-sql-python-connector["pymysql"] SQLAlchemy 4 | pip install google-cloud-secret-manager 5 | ''' 6 | #Import required dependencies 7 | from google.cloud.sql.connector import Connector 8 | import sqlalchemy 9 | 10 | # Function to get CloudSQL instance password from Secret Manager 11 | def access_secret_version(project_id, secret_id, version_id): 12 | """ 13 | Access the payload for the given secret version if one exists. The version 14 | can be a version number as a string (e.g. "5") or an alias (e.g. "latest"). 15 | """ 16 | 17 | # Import the Secret Manager client library. 18 | from google.cloud import secretmanager 19 | 20 | # Create the Secret Manager client. 21 | client = secretmanager.SecretManagerServiceClient() 22 | 23 | # Build the resource name of the secret version. 24 | name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}" 25 | 26 | # Access the secret version. 27 | response = client.access_secret_version(request={"name": name}) 28 | # Print the secret payload. 29 | # snippet is showing how to access the secret material. 
30 | payload = response.payload.data.decode("UTF-8") 31 | return payload 32 | 33 | # Function call to get DB password ino a local varaiable 34 | db_password = access_secret_version('gcp-data-eng-374308', 'cloudsql_pwd','1') 35 | 36 | 37 | # initialize Connector object 38 | connector = Connector() 39 | 40 | # function to return the database connection 41 | def getconn(): 42 | conn= connector.connect( 43 | "gcp-data-eng-374308:asia-south1:sql-demo", 44 | "pymysql", 45 | user="root", 46 | password=db_password, 47 | db="gcp_demo" 48 | ) 49 | return conn 50 | # create connection pool 51 | pool = sqlalchemy.create_engine( 52 | "mysql+pymysql://", 53 | creator=getconn, 54 | ) 55 | 56 | # insert statement (DML statement for data load) 57 | insert_stmt = sqlalchemy.text( 58 | "INSERT INTO basic_dtls (idn, name) VALUES (:idn, :name)", 59 | ) 60 | 61 | # interact with Cloud SQL database using connection pool 62 | with pool.connect() as db_conn: 63 | 64 | # Create Table 65 | db_conn.execute("CREATE TABLE basic_dtls(idn INT, name VARCHAR(200))") 66 | 67 | # Insert data into Table 68 | 69 | db_conn.execute(insert_stmt, idn=1, name="AAA") 70 | db_conn.execute(insert_stmt, idn=2, name="BBB") 71 | db_conn.execute(insert_stmt, idn=3, name="CCC") 72 | 73 | 74 | # query database 75 | result = db_conn.execute("SELECT * from basic_dtls").fetchall() 76 | 77 | # Do something with the results 78 | for row in result: 79 | print(row) 80 | 81 | # Dropping Table 82 | #db_conn.execute("DROP TABLE basic_dtls") 83 | -------------------------------------------------------------------------------- /CloudSQL/federated_query_demo.sql: -------------------------------------------------------------------------------- 1 | -- List all tables in a database. 2 | SELECT * FROM EXTERNAL_QUERY("projects/gcp-data-eng-374308/locations/asia-south1/connections/cloudsql_connect", 3 | "select * from information_schema.tables;"); 4 | 5 | -- List all columns in a table. 6 | SELECT * FROM EXTERNAL_QUERY("projects/gcp-data-eng-374308/locations/asia-south1/connections/cloudsql_connect", 7 | "select * from information_schema.columns where table_name='product_master';"); 8 | 9 | 10 | -- Query data from CloudSQL table. 
11 | SELECT * FROM EXTERNAL_QUERY("projects/gcp-data-eng-374308/locations/asia-south1/connections/cloudsql_connect", 12 | "select * from product_master;"); 13 | 14 | --Join Bigquery table with Cloud SQL table 15 | 16 | SELECT pm.product_name, 17 | pm.product_desc, 18 | dtls.qty, 19 | dtls.price, 20 | dtls.date 21 | FROM `gcp-data-eng-374308.federated_demo.product_sales_dtls` dtls 22 | JOIN 23 | ( 24 | SELECT * 25 | FROM EXTERNAL_QUERY("projects/gcp-data-eng-374308/locations/asia-south1/connections/cloudsql_connect", 26 | "select * from product_master;") 27 | ) pm 28 | ON dtls.product_code = pm.product_code; 29 | 30 | -------------------------------------------------------------------------------- /CloudSpanner/cloud_spanner_intro_demo.sql: -------------------------------------------------------------------------------- 1 | -- Create Table GoogleSQL 2 | CREATE TABLE employee ( 3 | idn INT64, 4 | name STRING(MAX), 5 | salary FLOAT64, 6 | ) PRIMARY KEY(idn); 7 | 8 | --Insert data 9 | insert into employee(idn,name,salary) 10 | values(1,'aa',100.5),(2,'bb',1200.0); 11 | 12 | 13 | -- Create Table PostgresSQL 14 | CREATE TABLE employee ( 15 | idn bigint NOT NULL, 16 | name character varying(256), 17 | salary numeric, 18 | PRIMARY KEY(idn) 19 | ); 20 | 21 | --CLI create instance 22 | gcloud spanner instances create gcp-demo-instance --config=regional-us-central1 \ 23 | --description="Demo Instance" --nodes=1 24 | 25 | -- Set Instance 26 | gcloud config set spanner/instance gcp-demo-instance 27 | 28 | --Create database 29 | gcloud spanner databases create example-db 30 | 31 | -- Create table 32 | gcloud spanner databases ddl update example-db \ 33 | --ddl='CREATE TABLE Books ( BookId INT64 NOT NULL, 34 | BookName STRING(1024), 35 | BookCatgry STRING(1024)) PRIMARY KEY (BookId)' 36 | -- Insert data 37 | gcloud spanner rows insert --database=example-db \ 38 | --table=Books \ 39 | --data=BookId=1,BookName=abc,BookCatgry=Finance 40 | 41 | gcloud spanner rows insert --database=example-db \ 42 | --table=Books \ 43 | --data=BookId=2,BookName=aaa,BookCatgry=Comic 44 | 45 | gcloud spanner rows insert --database=example-db \ 46 | --table=Books \ 47 | --data=BookId=3,BookName=ccc,BookCatgry=History 48 | 49 | -- Query data 50 | gcloud spanner databases execute-sql example-db \ 51 | --sql='SELECT * FROM Books' 52 | 53 | -- Delete database 54 | gcloud spanner databases delete example-db 55 | 56 | -- Delete Instance 57 | gcloud spanner instances delete gcp-demo-instance 58 | 59 | -------------------------------------------------------------------------------- /CloudSpanner/spanner_python_client_demo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author @ Anjan GCP Data Engineering 3 | 4 | Install Spanner Client Libraries 5 | pip install google-cloud-spanner==3.31.0 6 | 7 | Note: This is only for Educational purpose 8 | 9 | These code samples will demo basic Cloud Spanner Operations 10 | 1. Create Spanner Instance 11 | 2. Create Spanner Database (Google Standard SQL), Table 12 | 3. Insert data using DML statements 13 | 4. 
Query Spanner data 14 | 15 | ''' 16 | 17 | from google.cloud import spanner 18 | 19 | # Function to create Spanner Instance 20 | def create_instance(instance_id,region): 21 | """Creates an instance.""" 22 | spanner_client = spanner.Client() 23 | 24 | config_name = "{}/instanceConfigs/regional-{}".format( 25 | spanner_client.project_name,region 26 | ) 27 | 28 | instance = spanner_client.instance( 29 | instance_id, 30 | configuration_name=config_name, 31 | display_name="Demo Instance.", 32 | node_count=1 33 | ) 34 | 35 | instance.create() 36 | 37 | print("Waiting for operation to complete...") 38 | print("Created instance {}".format(instance_id)) 39 | 40 | # Function to create Spanner Database and Tables 41 | def create_database(instance_id, database_id): 42 | """Creates a database and tables for demo data.""" 43 | spanner_client = spanner.Client() 44 | instance = spanner_client.instance(instance_id) 45 | 46 | database = instance.database( 47 | database_id, 48 | ddl_statements=[ 49 | """CREATE TABLE employee ( 50 | empid INT64 NOT NULL, 51 | empname STRING(1024), 52 | salary FLOAT64 53 | ) PRIMARY KEY (empid)""" 54 | ], 55 | ) 56 | 57 | database.create() 58 | 59 | print("Waiting for operation to complete...") 60 | print("Created database {} on instance {}".format(database_id, instance_id)) 61 | 62 | # Function to insert data into Spanner database Table 63 | def insert_data(instance_id, database_id): 64 | #Inserts sample data into the given database. 65 | 66 | spanner_client = spanner.Client() 67 | instance = spanner_client.instance(instance_id) 68 | database = instance.database(database_id) 69 | 70 | with database.batch() as batch: 71 | batch.insert( 72 | table="employee", 73 | columns=("empid", "empname", "salary"), 74 | values=[ 75 | (1, "Marc", 2032.5), 76 | (2, "Catalina", 1298.3), 77 | (3, "Alice", 3087.5), 78 | (4, "Lea", 1567.9), 79 | (5, "David", 2224.6), 80 | ], 81 | ) 82 | print("Inserted data.") 83 | 84 | 85 | # Function to query data from Spanner Database Table 86 | def query_data(instance_id, database_id): 87 | """Queries sample data from the database using SQL.""" 88 | spanner_client = spanner.Client() 89 | instance = spanner_client.instance(instance_id) 90 | database = instance.database(database_id) 91 | 92 | with database.snapshot() as snapshot: 93 | results = snapshot.execute_sql( 94 | "SELECT empid,empname,salary AlbumTitle FROM employee" 95 | ) 96 | 97 | for row in results: 98 | print("Emp ID: {}, Emp Name: {}, Salary: {}".format(*row)) 99 | 100 | # Create Spanner instance 101 | create_instance('gcp-dataeng-demo','asia-south1') 102 | 103 | #Create database and Table 104 | create_database('gcp-dataeng-demo','demo_db') 105 | 106 | # Insert data 107 | insert_data('gcp-dataeng-demo','demo_db') 108 | 109 | # Query data 110 | query_data('gcp-dataeng-demo','demo_db') 111 | 112 | # Delete Instance 113 | # gcloud spanner instances delete gcp-dataeng-demo 114 | -------------------------------------------------------------------------------- /Common_Realtime_Usecases/dataflow_spanner_demo/beam_dataflow-to-spanner.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author : @ Anjan GCP Data Engineering 3 | Created for education purpose only 4 | ''' 5 | 6 | import argparse 7 | import logging 8 | import re, os 9 | from typing import NamedTuple, List 10 | 11 | import apache_beam as beam 12 | from apache_beam.options.pipeline_options import PipelineOptions 13 | from apache_beam.options.pipeline_options import SetupOptions 14 | from 
apache_beam.io.gcp.spanner import SpannerInsert 15 | from apache_beam.dataframe.io import read_csv 16 | from apache_beam.dataframe import convert 17 | 18 | # Inferring schema using Named Tuple 19 | class SpannerRow(NamedTuple): 20 | trid: int 21 | age: int 22 | workclass: str 23 | education: str 24 | marital_status: str 25 | occupation: str 26 | relationship: str 27 | sex: str 28 | native_country: str 29 | income_bracket: str 30 | beam.coders.registry.register_coder(SpannerRow, beam.coders.RowCoder) 31 | 32 | # User defined tranformation to replace ? 33 | def ValueReplace(column): 34 | if column == '?': 35 | column = 'NA' 36 | return column 37 | 38 | # Pipeline entry point , passing user input arguments 39 | def main(argv=None, save_main_session=True): 40 | """Main entry point.""" 41 | projectid = os.environ.get('GOOGLE_CLOUD_PROJECT') 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument( 44 | '--input', 45 | dest='input', 46 | default='census_100_testing.csv', 47 | help='Input filename.') 48 | parser.add_argument( 49 | '--instance', 50 | dest='instance', 51 | default='test-spanner-instance', 52 | help='Spanner instance ID.') 53 | parser.add_argument( 54 | '--database', 55 | dest='database', 56 | default = 'census-db', 57 | help='Spanner database.') 58 | parser.add_argument( 59 | '--table', 60 | dest='table', 61 | default = 'census', 62 | help='Spanner table.') 63 | known_args, pipeline_args = parser.parse_known_args(argv) 64 | 65 | pipeline_options = PipelineOptions(pipeline_args) 66 | pipeline_options.view_as(SetupOptions).save_main_session = save_main_session 67 | 68 | # Beam pipeline , collection of Tranformations 69 | with beam.Pipeline(options=pipeline_options) as p: 70 | census = p | 'Read CSV to dataframe' >> read_csv('gs://gcp-dataeng-demos1993/census_100_testing.csv') 71 | census = ( convert.to_pcollection(census) 72 | | "Filter age is null rows" >> beam.Filter(lambda x: x.age ) 73 | | "Filter workclass value ? 
rows" >> beam.Filter(lambda x: x.workclass != '?') 74 | 75 | | 'Convert to Spanner Rows' >> beam.Map(lambda x : SpannerRow( x.trid, 76 | x.age, 77 | x.workclass, 78 | ValueReplace(x.education), 79 | x.marital_status, 80 | ValueReplace(x.occupation), 81 | x.relationship, 82 | x.sex, 83 | ValueReplace(x.native_country), 84 | x.income_bracket 85 | )) 86 | ) 87 | # Writing data to Spanner Database 88 | 89 | census | 'Write to Spanner' >> SpannerInsert( 90 | project_id= 'gcp-dataeng-demo-431907', 91 | instance_id= 'test-spanner-instance', 92 | database_id= 'census-db', 93 | table= 'census') 94 | 95 | census | beam.Map(print) 96 | 97 | if __name__ == '__main__': 98 | logging.getLogger().setLevel(logging.INFO) 99 | main() 100 | -------------------------------------------------------------------------------- /Common_Realtime_Usecases/dataflow_spanner_demo/census-db-schema.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE census ( 2 | trid INT64 NOT NULL, 3 | age INT64 NOT NULL, 4 | workclass STRING(MAX), 5 | education STRING(MAX), 6 | marital_status STRING(MAX), 7 | occupation STRING(MAX), 8 | relationship STRING(MAX), 9 | sex STRING(MAX), 10 | native_country STRING(MAX), 11 | income_bracket STRING(MAX) 12 | ) PRIMARY KEY (trid); 13 | -------------------------------------------------------------------------------- /Common_Realtime_Usecases/dataflow_spanner_demo/census_100_testing.csv: -------------------------------------------------------------------------------- 1 | trid,age,workclass,education,marital_status,occupation,relationship,sex,native_country,income_bracket 2 | 111,39,Private,9th,Married-civ-spouse,Other-service,Wife,Female,United-States,<=50K 3 | 112,77,Private,9th,Married-civ-spouse,Priv-house-serv,Wife,Female,United-States,<=50K 4 | 113,38,Private,9th,Married-civ-spouse,Other-service,Wife,Female,Haiti,<=50K 5 | 114,28,Private,9th,Married-civ-spouse,Protective-serv,Wife,Female,United-States,<=50K 6 | 115,37,Private,9th,Married-civ-spouse,Machine-op-inspct,Wife,Female,United-States,<=50K 7 | 116,35,?,9th,Married-civ-spouse,?,Wife,Female,United-States,<=50K 8 | 117,45,Private,9th,Married-civ-spouse,Machine-op-inspct,Wife,Female,United-States,>50K 9 | 118,55,Private,9th,Married-civ-spouse,Tech-support,Wife,Female,United-States,<=50K 10 | 119,27,Private,9th,Married-civ-spouse,Machine-op-inspct,Wife,Female,Portugal,<=50K 11 | 120,31,Private,9th,Married-civ-spouse,Exec-managerial,Wife,Female,United-States,<=50K 12 | 121,30,Private,9th,Married-civ-spouse,Machine-op-inspct,Wife,Female,Portugal,<=50K 13 | 122,28,Private,9th,Married-civ-spouse,Machine-op-inspct,Wife,Female,United-States,<=50K 14 | 123,,Private,10th,Married-civ-spouse,Machine-op-inspct,Wife,Female,United-States,<=50K 15 | 124,46,Private,9th,Married-civ-spouse,Machine-op-inspct,Wife,Female,United-States,<=50K 16 | 125,70,Private,9th,Married-civ-spouse,Machine-op-inspct,Wife,Female,United-States,<=50K 17 | 126,31,Private,9th,Married-civ-spouse,Farming-fishing,Wife,Female,United-States,<=50K 18 | 127,40,Local-gov,9th,Married-civ-spouse,Other-service,Wife,Female,Yugoslavia,>50K 19 | 128,52,Local-gov,9th,Married-civ-spouse,Other-service,Wife,Female,United-States,<=50K 20 | 129,46,Self-emp-inc,9th,Married-civ-spouse,Adm-clerical,Wife,Female,United-States,<=50K 21 | 130,41,Self-emp-inc,9th,Married-civ-spouse,Sales,Wife,Female,Dominican-Republic,<=50K 22 | 131,41,?,9th,Married-civ-spouse,?,Wife,Female,Hong,<=50K 23 | 
132,72,Private,9th,Married-civ-spouse,Exec-managerial,Wife,Female,United-States,>50K 24 | 133,75,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 25 | 134,77,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 26 | 135,66,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 27 | 136,45,Private,9th,Married-civ-spouse,Adm-clerical,Husband,Male,United-States,>50K 28 | 137,57,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,>50K 29 | 138,57,Private,9th,Married-civ-spouse,Sales,Husband,Male,United-States,>50K 30 | 139,47,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,?,<=50K 31 | 140,61,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,Trinadad&Tobago,<=50K 32 | 141,63,Private,9th,Married-civ-spouse,Farming-fishing,Husband,Male,United-States,<=50K 33 | 142,32,Private,9th,Married-civ-spouse,Farming-fishing,Husband,Male,United-States,<=50K 34 | 143,56,Private,9th,Married-civ-spouse,Transport-moving,Husband,Male,United-States,<=50K 35 | 144,38,Private,9th,Married-civ-spouse,Transport-moving,Husband,Male,United-States,<=50K 36 | 145,58,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,<=50K 37 | 146,44,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,<=50K 38 | 147,53,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,<=50K 39 | 148,44,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,<=50K 40 | 149,62,Private,9th,Married-civ-spouse,Other-service,Husband,Male,United-States,<=50K 41 | 150,68,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 42 | 151,31,Private,9th,Married-civ-spouse,Handlers-cleaners,Husband,Male,United-States,<=50K 43 | 152,58,Private,9th,Married-civ-spouse,Handlers-cleaners,Husband,Male,United-States,<=50K 44 | 153,28,Local-gov,9th,Married-civ-spouse,Craft-repair,Husband,Male,Trinadad&Tobago,>50K 45 | 154,,Local-gov,12th,Married-civ-spouse,Craft-repair,Husband,Male,Trinadad&Tobago,>50K 46 | 155,51,Local-gov,9th,Married-civ-spouse,Transport-moving,Husband,Male,United-States,<=50K 47 | 156,35,Federal-gov,9th,Married-civ-spouse,Farming-fishing,Husband,Male,United-States,<=50K 48 | 157,35,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,Mexico,<=50K 49 | 158,30,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,Mexico,<=50K 50 | 159,63,?,9th,Married-civ-spouse,?,Husband,Male,United-States,>50K 51 | 160,68,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 52 | 161,67,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 53 | 162,69,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 54 | 163,74,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 55 | 164,60,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 56 | 165,66,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 57 | 166,66,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 58 | 167,64,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 59 | 168,50,?,9th,Married-civ-spouse,?,Husband,Male,United-States,<=50K 60 | 169,45,Private,9th,Married-civ-spouse,Other-service,Husband,Male,United-States,>50K 61 | 170,54,Private,9th,Married-civ-spouse,Exec-managerial,Husband,Male,United-States,>50K 62 | 171,51,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,>50K 63 | 172,58,Private,9th,Married-civ-spouse,Handlers-cleaners,Husband,Male,United-States,>50K 64 | 
173,37,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,>50K 65 | 174,59,Private,9th,Married-civ-spouse,Transport-moving,Husband,Male,United-States,>50K 66 | 175,31,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 67 | 176,33,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,Mexico,<=50K 68 | 177,30,Private,9th,Married-civ-spouse,Other-service,Husband,Male,United-States,<=50K 69 | 178,38,Private,9th,Married-civ-spouse,Farming-fishing,Husband,Male,Mexico,<=50K 70 | 179,76,Private,9th,Married-civ-spouse,Protective-serv,Husband,Male,United-States,<=50K 71 | 180,35,Private,9th,Married-civ-spouse,Handlers-cleaners,Husband,Male,United-States,<=50K 72 | 181,39,Private,9th,Married-civ-spouse,Handlers-cleaners,Husband,Male,Mexico,<=50K 73 | 182,31,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,<=50K 74 | 183,60,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,<=50K 75 | 184,46,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,<=50K 76 | 185,60,Private,9th,Married-civ-spouse,Machine-op-inspct,Husband,Male,United-States,<=50K 77 | 186,63,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 78 | 187,26,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 79 | 188,39,Private,9th,Married-civ-spouse,Transport-moving,Husband,Male,United-States,<=50K 80 | 189,59,Private,9th,Married-civ-spouse,Other-service,Husband,Male,United-States,<=50K 81 | 190,27,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 82 | 191,26,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 83 | 192,36,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,Guatemala,<=50K 84 | 193,69,Private,9th,Married-civ-spouse,Other-service,Husband,Male,United-States,<=50K 85 | 194,62,Private,9th,Married-civ-spouse,Other-service,Husband,Male,United-States,<=50K 86 | 195,41,Private,9th,Married-civ-spouse,Transport-moving,Husband,Male,United-States,<=50K 87 | 196,60,Private,9th,Married-civ-spouse,Sales,Husband,Male,United-States,<=50K 88 | 197,28,Private,9th,Married-civ-spouse,Sales,Husband,Male,United-States,<=50K 89 | 198,51,Private,9th,Married-civ-spouse,Sales,Husband,Male,United-States,<=50K 90 | 199,56,Private,9th,Married-civ-spouse,Sales,Husband,Male,United-States,<=50K 91 | 200,38,Private,9th,Married-civ-spouse,Adm-clerical,Husband,Male,United-States,<=50K 92 | 201,61,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 93 | 202,38,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 94 | 203,30,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 95 | 204,34,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 96 | 205,37,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 97 | 206,42,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,Mexico,<=50K 98 | 207,32,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 99 | 208,29,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K 100 | 209,27,Private,9th,Married-civ-spouse,Craft-repair,Husband,Male,United-States,<=50K -------------------------------------------------------------------------------- /Common_Realtime_Usecases/dataflow_spanner_demo/create-spanner-database-CLI.txt: -------------------------------------------------------------------------------- 1 | gcloud spanner 
instances create test-spanner-instance --config=regional-$1 --description="test-spanner-instance" --processing-units=100 2 | 3 | gcloud spanner databases create census-db --instance=test-spanner-instance --database-dialect=GOOGLE_STANDARD_SQL --ddl-file=./census-db-schema.sql 4 | -------------------------------------------------------------------------------- /Common_Realtime_Usecases/dataflow_spanner_demo/installations_beam.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | pip3 install apache-beam 3 | pip3 install apache-beam[gcp] 4 | pip3 install apache-beam[dataframe] 5 | -------------------------------------------------------------------------------- /Common_Realtime_Usecases/iam_snapshots.py: -------------------------------------------------------------------------------- 1 | import functions_framework 2 | from googleapiclient import discovery 3 | from google.cloud import bigquery 4 | from datetime import datetime 5 | import os 6 | 7 | ''' 8 | Dependencies to be installed 9 | 10 | bigquery 11 | google-api-python-client 12 | ''' 13 | 14 | @functions_framework.http 15 | def hello_http(request): 16 | """HTTP Cloud Function. 17 | Args: 18 | request (flask.Request): The request object. 19 | 20 | Returns: 21 | The response text, or any set of values that can be turned into a 22 | Response object using `make_response` 23 | . 24 | """ 25 | request_json = request.get_json(silent=True) 26 | request_args = request.args 27 | 28 | # Get project , Dataset , Table details from Function Env 29 | PROJECT_ID = os.environ.get("project") 30 | DATASET_ID = os.environ.get("dataset") 31 | TABLE_ID = os.environ.get("table") 32 | 33 | #from apiclient.discovery import build 34 | service = discovery.build('cloudresourcemanager', 'v1') 35 | 36 | # Get IAM roles from specified project 37 | policy_request = service.projects().getIamPolicy(resource=PROJECT_ID, body={}) 38 | policy_response = policy_request.execute() 39 | #print(policy_response['bindings']) 40 | 41 | # Deriving current timestamp for snapshot 42 | now = datetime.now() 43 | date_time = now.strftime("%m/%d/%Y %H:%M:%S") 44 | rows_to_insert = [] 45 | 46 | # Append snapshot time to each row 47 | for i in policy_response['bindings']: 48 | i['snapshot_time'] = date_time 49 | rows_to_insert.append(i) 50 | #print(rows_to_insert) 51 | # Construct a BigQuery client object. 52 | client = bigquery.Client() 53 | 54 | # load job confuguration 55 | load_job = client.insert_rows_json("{}.{}.{}".format(PROJECT_ID,DATASET_ID,TABLE_ID), rows_to_insert 56 | ) 57 | 58 | if load_job == []: 59 | print('Data has been loaded successfully') 60 | else: 61 | print("Somee issue with loading data") 62 | 63 | name = 'Succeeded' 64 | return 'Function Execution {}!'.format(name) 65 | -------------------------------------------------------------------------------- /Composer/airflow_dataproc_automate_dag.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author : @ Anjan GCP Data Engineering 3 | 4 | Example Airflow DAG to demo below Dataproc use cases 5 | Airflow operators for managing a dataproc cluster 6 | 1. Create Dataproc cluster 7 | 2. Submit PySpark jobs (in parallel) 8 | 3. 
Delete dataproc cluster 9 | """ 10 | import os 11 | from datetime import datetime 12 | from airflow import models 13 | from airflow.providers.google.cloud.operators.dataproc import ( 14 | ClusterGenerator, 15 | DataprocCreateClusterOperator, 16 | DataprocDeleteClusterOperator, 17 | DataprocSubmitJobOperator 18 | ) 19 | 20 | # Param initializations 21 | DAG_ID = "dataproc_cluster_jobs" 22 | PROJECT_ID = "gcp-dataeng-demos-383407" 23 | BUCKET_NAME = "dataprco-airflow-demos" 24 | CLUSTER_NAME = "dataeng-demos-airflow" 25 | REGION = "asia-south2" 26 | ZONE = "asia-south2-a" 27 | 28 | #PySPark scripts paths 29 | SCRIPT_BUCKET_PATH = "gcpdataeng-demos/scripts" 30 | # BQ -> AGGREGATE -> GCS 31 | SCRIPT_NAME_1 = "pyspark_bq_to_gcs.py" 32 | # GCS -> AGGREGATE -> BQ 33 | SCRIPT_NAME_2 = "pyspark_gcs_to_bq.py" 34 | 35 | # Cluster definition: Generating Cluster Config for DataprocCreateClusterOperator 36 | 37 | INIT_FILE = "goog-dataproc-initialization-actions-asia-south2/connectors/connectors.sh" 38 | 39 | # Generating cluster Configurations with this operator 40 | CLUSTER_GENERATOR_CONFIG = ClusterGenerator( 41 | project_id=PROJECT_ID, 42 | zone=ZONE, 43 | master_machine_type="n1-standard-2", 44 | worker_machine_type="n1-standard-2", 45 | num_workers=2, 46 | storage_bucket=BUCKET_NAME, 47 | init_actions_uris=[f"gs://{INIT_FILE}"], 48 | metadata={"bigquery-connector-version":"1.2.0","spark-bigquery-connector-version":"0.21.0"} 49 | ).make() 50 | 51 | # PySpark job configs for Job1 52 | PYSPARK_JOB_1 = { 53 | "reference": {"project_id": PROJECT_ID}, 54 | "placement": {"cluster_name": CLUSTER_NAME}, 55 | "pyspark_job": {"main_python_file_uri": f"gs://{SCRIPT_BUCKET_PATH}/{SCRIPT_NAME_1}"} 56 | } 57 | # PySpark job configs for Job2 58 | PYSPARK_JOB_2 = { 59 | "reference": {"project_id": PROJECT_ID}, 60 | "placement": {"cluster_name": CLUSTER_NAME}, 61 | "pyspark_job": {"main_python_file_uri": f"gs://{SCRIPT_BUCKET_PATH}/{SCRIPT_NAME_2}"} 62 | 63 | } 64 | 65 | # DAH definition is here 66 | with models.DAG( 67 | DAG_ID, 68 | schedule="@once", 69 | start_date=datetime(2023, 1, 1), 70 | catchup=False, 71 | tags=["example", "dataproc"], 72 | ) as dag: 73 | 74 | # Create cluster with generates cluster config operator 75 | create_dataproc_cluster = DataprocCreateClusterOperator( 76 | task_id="create_dataproc_cluster", 77 | cluster_name=CLUSTER_NAME, 78 | project_id=PROJECT_ID, 79 | region=REGION, 80 | cluster_config=CLUSTER_GENERATOR_CONFIG, 81 | ) 82 | 83 | # PySpark task to read data from Bigquery , perform agrregate on data and write data into GCS 84 | pyspark_task_bq_to_gcs = DataprocSubmitJobOperator( 85 | task_id="pyspark_task_bq_to_gcs", 86 | job=PYSPARK_JOB_1, 87 | region=REGION, 88 | project_id=PROJECT_ID 89 | ) 90 | 91 | # PySpark task to read data from GCS , perform agrregate on data and write data into Bigquery 92 | pyspark_task_gcs_to_bq = DataprocSubmitJobOperator( 93 | task_id="pyspark_task_gcs_to_bq", 94 | job=PYSPARK_JOB_2, 95 | region=REGION, 96 | project_id=PROJECT_ID 97 | ) 98 | 99 | # Delete Cluster once done with jobs 100 | delete_cluster = DataprocDeleteClusterOperator( 101 | task_id="delete_cluster", 102 | project_id=PROJECT_ID, 103 | cluster_name=CLUSTER_NAME, 104 | region=REGION 105 | ) 106 | 107 | # Set task dependencies 108 | create_dataproc_cluster >> [pyspark_task_bq_to_gcs,pyspark_task_gcs_to_bq] >> delete_cluster 109 | -------------------------------------------------------------------------------- /Composer/dataflow_python_operator_dag.py: 
-------------------------------------------------------------------------------- 1 | # Import statement 2 | import os 3 | from datetime import datetime, timedelta 4 | from airflow import DAG 5 | from airflow.operators.dummy import DummyOperator 6 | from airflow.contrib.operators.dataflow_operator import DataFlowPythonOperator 7 | 8 | # Define yesterday value for setting up start for DAG 9 | yesterday = datetime.combine(datetime.today() - timedelta(1), datetime.min.time()) 10 | 11 | # Default arguments 12 | default_args = { 13 | 'start_date': yesterday, 14 | 'email_on_failure': False, 15 | 'email_on_retry': False, 16 | 'retries': 1, 17 | 'retry_delay': timedelta(minutes=5) 18 | } 19 | 20 | # DAG main definition 21 | with DAG(dag_id='DataflowPythonOperator', 22 | catchup=False, 23 | schedule_interval=timedelta(days=1), 24 | default_args=default_args 25 | ) as dag: 26 | 27 | # Dummy Start task 28 | start = DummyOperator( 29 | task_id='start', 30 | dag=dag, 31 | ) 32 | 33 | # Dataflow batch job log process task 34 | dataflow_batch_process_logs = DataFlowPythonOperator( 35 | task_id='dataflow_batch_process_logs', 36 | py_file='gs://us-central1-composer-scd2-5607404f-bucket/dags/scripts/dataflow_batch_log_process.py', 37 | options={ 38 | 'output': 'gs://data_eng_demos/output' 39 | }, 40 | dataflow_default_options={ 41 | 'project': 'data-eng-demos19', 42 | "staging_location": "gs://data_eng_demos/staging", 43 | "temp_location": "gs://data_eng_demos/temp" 44 | }, 45 | dag=dag) 46 | 47 | # Dummy end task 48 | end = DummyOperator( 49 | task_id='end', 50 | dag=dag, 51 | ) 52 | 53 | # Setting up Task dependencies using Airflow standard notations 54 | start >> dataflow_batch_process_logs >> end 55 | -------------------------------------------------------------------------------- /Composer/gcs_to_bq_and_bq_operators.py: -------------------------------------------------------------------------------- 1 | # import statements 2 | import os 3 | from datetime import datetime, timedelta 4 | from airflow import DAG 5 | from airflow.operators.dummy import DummyOperator 6 | from airflow.contrib.operators.bigquery_operator import BigQueryOperator 7 | from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator 8 | 9 | # Custom Python logic for derriving data value 10 | yesterday = datetime.combine(datetime.today() - timedelta(1), datetime.min.time()) 11 | 12 | # Default arguments 13 | default_args = { 14 | 'start_date': yesterday, 15 | 'email_on_failure': False, 16 | 'email_on_retry': False, 17 | 'retries': 1, 18 | 'retry_delay': timedelta(minutes=5) 19 | } 20 | 21 | # DAG definitions 22 | with DAG(dag_id='GCS_to_BQ_and_AGG', 23 | catchup=False, 24 | schedule_interval=timedelta(days=1), 25 | default_args=default_args 26 | ) as dag: 27 | 28 | # Dummy strat task 29 | start = DummyOperator( 30 | task_id='start', 31 | dag=dag, 32 | ) 33 | 34 | # GCS to BigQuery data load Operator and task 35 | gcs_to_bq_load = GoogleCloudStorageToBigQueryOperator( 36 | task_id='gcs_to_bq_load', 37 | bucket='data_eng_demos', 38 | source_objects=['greenhouse_dtls.csv'], 39 | destination_project_dataset_table='data-eng-demos19.gcp_dataeng_demos.gcs_to_bq_table', 40 | schema_fields=[ 41 | {'name': 'year', 'type': 'STRING', 'mode': 'NULLABLE'}, 42 | {'name': 'anzsic', 'type': 'STRING', 'mode': 'NULLABLE'}, 43 | {'name': 'nzsioc', 'type': 'STRING', 'mode': 'NULLABLE'}, 44 | {'name': 'anzsic_descriptor', 'type': 'STRING', 'mode': 'NULLABLE'}, 45 | {'name': 'category', 'type': 'STRING', 'mode': 'NULLABLE'}, 46 | 
{'name': 'variable', 'type': 'STRING', 'mode': 'NULLABLE'}, 47 | {'name': 'units', 'type': 'STRING', 'mode': 'NULLABLE'}, 48 | {'name': 'magnitude', 'type': 'STRING', 'mode': 'NULLABLE'}, 49 | {'name': 'source', 'type': 'STRING', 'mode': 'NULLABLE'}, 50 | {'name': 'data_value', 'type': 'FLOAT', 'mode': 'NULLABLE'} 51 | ], 52 | skip_leading_rows=1, 53 | create_disposition='CREATE_IF_NEEDED', 54 | write_disposition='WRITE_TRUNCATE', 55 | dag=dag) 56 | 57 | # BigQuery operator task to aggregate data into a new table 58 | create_aggr_bq_table = BigQueryOperator( 59 | task_id='create_aggr_bq_table', 60 | use_legacy_sql=False, 61 | allow_large_results=True, 62 | sql="CREATE OR REPLACE TABLE gcp_dataeng_demos.bq_table_aggr AS \ 63 | SELECT \ 64 | year,\ 65 | anzsic_descriptor,\ 66 | variable,\ 67 | source,\ 68 | SUM(data_value) as sum_data_value\ 69 | FROM data-eng-demos19.gcp_dataeng_demos.gcs_to_bq_table \ 70 | GROUP BY \ 71 | year,\ 72 | anzsic_descriptor,\ 73 | variable,\ 74 | source", 75 | dag=dag) 76 | 77 | # Dummy end task 78 | end = DummyOperator( 79 | task_id='end', 80 | dag=dag, 81 | ) 82 | 83 | # Setting up task dependencies 84 | start >> gcs_to_bq_load >> create_aggr_bq_table >> end 85 | -------------------------------------------------------------------------------- /Composer/https_operators_demo_dag.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Author : @ Anjan GCP Data Engineering 4 | This Airflow DAG code is to demo HTTP operators 5 | 1. Extract data from HTTP API 6 | 2. Pull data from Xcom 7 | 3. Write data into GCS bucket in JSON format 8 | 9 | """ 10 | from __future__ import annotations 11 | 12 | import json 13 | import os 14 | from datetime import datetime 15 | 16 | from airflow import DAG 17 | from airflow.providers.http.operators.http import SimpleHttpOperator 18 | from airflow.providers.http.sensors.http import HttpSensor 19 | from airflow.operators.python import PythonOperator 20 | from google.cloud import storage 21 | 22 | # DAG name 23 | DAG_ID = "demo_http_operator_to_gcs" 24 | 25 | # This Python function writes data pulled from XCom to a GCS bucket as a JSON file 26 | def WriteToGcs(ti): 27 | data = ti.xcom_pull(task_ids=['get_http_data']) 28 | bucket_name = 'gcpdataeng-demos' 29 | destination_blob_name = 'stock_data.json' 30 | storage_client = storage.Client() 31 | bucket = storage_client.bucket(bucket_name) 32 | blob = bucket.blob(destination_blob_name) 33 | 34 | blob.upload_from_string(str(data)) 35 | 36 | print( 37 | f"{destination_blob_name} with contents uploaded to {bucket_name}."
38 | ) 39 | # DAG definitions with all required params 40 | dag = DAG( 41 | DAG_ID, 42 | default_args={"retries": 1}, 43 | tags=["example"], 44 | start_date=datetime(2023, 4, 26), 45 | catchup=False, 46 | ) 47 | 48 | # Task to get data from the given HTTP endpoint 49 | get_http_data = SimpleHttpOperator( 50 | task_id="get_http_data", 51 | http_conn_id="http_conn_id_demo", 52 | method="GET", 53 | endpoint="/query?function=TOURNAMENT_PORTFOLIO&season=2021-09&apikey=demo", 54 | response_filter = lambda response : json.loads(response.text), 55 | dag=dag 56 | ) 57 | # Task to write data from XCom to GCS bucket 58 | write_data_to_gcs = PythonOperator( 59 | task_id = 'write_data_to_gcs', 60 | python_callable = WriteToGcs 61 | ) 62 | # Task dependency set 63 | get_http_data >> write_data_to_gcs 64 | -------------------------------------------------------------------------------- /Composer/python_bash_operators_dag.py: -------------------------------------------------------------------------------- 1 | # Import dependencies 2 | import os 3 | from datetime import datetime, timedelta 4 | from airflow import DAG 5 | from airflow.operators.dummy import DummyOperator 6 | from airflow.operators.python import PythonOperator 7 | from airflow.operators.bash import BashOperator 8 | 9 | # Python logic to derive yesterday's date 10 | yesterday = datetime.combine(datetime.today() - timedelta(1), datetime.min.time()) 11 | 12 | # Default arguments 13 | default_args = { 14 | 'start_date': yesterday, 15 | 'email_on_failure': False, 16 | 'email_on_retry': False, 17 | 'retries': 1, 18 | 'retry_delay': timedelta(minutes=5) 19 | } 20 | 21 | # Python custom logic/function for python callables 22 | def print_hello(): 23 | print('Hey I am Python operator') 24 | 25 | # DAG definitions 26 | with DAG(dag_id='bash_python_operator_demo', 27 | catchup=False, 28 | schedule_interval=timedelta(days=1), 29 | default_args=default_args 30 | ) as dag: 31 | 32 | # Tasks start here 33 | 34 | # Dummy Start task 35 | start = DummyOperator( 36 | task_id='start', 37 | dag=dag, 38 | ) 39 | 40 | # Bash operator task 41 | bash_task = BashOperator( 42 | task_id='bash_task', 43 | bash_command="date;echo 'Hey I am bash operator'", 44 | ) 45 | # Python operator task 46 | python_task = PythonOperator( 47 | task_id='python_task', 48 | python_callable=print_hello, 49 | dag=dag) 50 | 51 | # Dummy end task 52 | end = DummyOperator( 53 | task_id='end', 54 | dag=dag, 55 | ) 56 | 57 | # Setting up Task dependencies using Airflow standard notations 58 | start >> bash_task >> python_task >> end 59 | -------------------------------------------------------------------------------- /Dataflow/batch_etl_avro_data_cloudsql.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author : Anjan GCP Data Engineering 3 | This code should be used only for Educational purposes 4 | ''' 5 | # Import Dependencies 6 | import apache_beam as beam 7 | from apache_beam.runners.interactive.interactive_runner import InteractiveRunner 8 | import apache_beam.runners.interactive.interactive_beam as ib 9 | from apache_beam.io.avroio import ReadFromAvro 10 | from apache_beam.io import WriteToAvro 11 | from google.cloud.sql.connector import Connector 12 | import sqlalchemy 13 | from apache_beam.io import ReadFromText 14 | from apache_beam.io import WriteToText 15 | from apache_beam.options.pipeline_options import PipelineOptions 16 | import logging 17 | 18 | # Setting up the Apache Beam pipeline options.
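# Note: for quick local testing you can switch to the commented-out DirectRunner below
# and point temp_location at a bucket you own. The Cloud SQL password hard-coded in
# WriteToCloudSQL is for demo simplicity only; it could instead be fetched at runtime
# from Secret Manager (see Security/secretmanager_python_connect.py in this repo).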
19 | beam_options = PipelineOptions( 20 | #save_main_session=True, 21 | #runner='DirectRunner', 22 | setup_file = '/home/jupyter/setup.py', 23 | runner='DataflowRunner', 24 | project='gcp-dataeng-demos-395910', 25 | temp_location='gs://dataflow_demos2/tmp', 26 | region='asia-south2') 27 | 28 | class WriteToCloudSQL(beam.DoFn): 29 | def process(self,element): 30 | 31 | from google.cloud.sql.connector import Connector 32 | import sqlalchemy 33 | # function to return the database connection 34 | def getconn(): 35 | connector = Connector() 36 | conn= connector.connect( 37 | "gcp-dataeng-demos-395910:asia-south1:sql-demo", 38 | "pymysql", 39 | user="root", 40 | password='$qlDem0', 41 | db="gcp_demos" 42 | ) 43 | return conn 44 | # create connection pool 45 | 46 | pool = sqlalchemy.create_engine( 47 | "mysql+pymysql://", 48 | creator=getconn, 49 | ) 50 | # insert statement (DML statement for data load) 51 | insert_stmt = sqlalchemy.text("INSERT INTO git_downloads_agg (os_name, os_version,no_f_downloads) VALUES (:os_name, :os_version,:no_f_downloads)",) 52 | 53 | # interact with Cloud SQL database using connection pool 54 | with pool.connect() as db_conn: 55 | 56 | # Create Table 57 | create_table = sqlalchemy.text("CREATE TABLE IF NOT EXISTS git_downloads_agg(os_name VARCHAR(20), os_version VARCHAR(20), no_f_downloads INT)") 58 | db_conn.execute(create_table) 59 | 60 | # Insert data into Table 61 | db_conn.execute(insert_stmt, parameters={'os_name':element['os_name'], 'os_version':element['os_version'],'no_f_downloads':int(element['no_f_downloads'])}) 62 | db_conn.commit() 63 | def run(): 64 | # Beam Pipeline starts here 65 | with beam.Pipeline(options=beam_options) as pipeline: 66 | 67 | # Read AVRO files from GCS location 68 | read_raw = pipeline | 'Read' >> beam.io.ReadFromAvro('gs://dataflow_demos2/input_data/pypy_filedownloads.avro') 69 | 70 | # Filter , Clean and Aggregate data ,Number of PYPY downloads by country,project, version 71 | agg_cntry = (read_raw | 'Filter Data' >> beam.Filter(lambda x: x['details']['python'].startswith('3.10')) 72 | | 'Get required data' >> beam.Map(lambda x:(x['country_code']+','+x['project']+','+x['details']['python'],x['timestamp'])) 73 | | 'Combine per key' >> beam.combiners.Count.PerKey() 74 | | 'Make it to dict again' >> beam.Map(lambda x: {'country_code':x[0].split(',')[0],'project':x[0].split(',')[1], 75 | 'python_version':x[0].split(',')[2],'no_of_downloads':x[1]}) 76 | #| 'Print' >> beam.Map(print) 77 | ) 78 | 79 | # Write Transformed data into GCS in AVRO format 80 | agg_cntry | 'WriteToAvro' >> WriteToAvro('gs://dataflow_demos2/output_data/agg_output_data.avro', 81 | schema={ 82 | "type": "record", "name": "agg_downloads", 83 | "fields": [ 84 | {"name": "country_code", "type": "string"}, 85 | {"name": "project", "type": "string"}, 86 | {"name": "python_version", "type": "string"}, 87 | {"name": "no_of_downloads", "type": "int"} 88 | ] 89 | } 90 | ) 91 | 92 | 93 | # Filter , Clean and Aggregate data ,Number of PYPY downloads by os name and version 94 | aggr_os_version = (read_raw | 'Filter Data with OS' >> beam.Filter(lambda x: x['details']['system']['name'] == 'Windows' and x['details']['rustc_version'] != None) 95 | | 'Get os data woth others' >> beam.Map(lambda x: (x['details']['system']['name']+','+x['details']['rustc_version'],x['timestamp'])) 96 | | 'Combine per key os' >> beam.combiners.Count.PerKey() 97 | | 'Make it dict of agg results' >> beam.Map(lambda x: {'os_name':x[0].split(',')[0],'os_version':x[0].split(',')[1],'no_f_downloads':x[1]}) 
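                   # At this point each element is a dict such as
                   # {'os_name': 'Windows', 'os_version': '1.62.0', 'no_f_downloads': 42}  (illustrative values only),
                   # matching the INSERT parameters used by WriteToCloudSQL above.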
98 | #| 'Print' >> beam.Map(print) 99 | ) 100 | # Write Results into CloudSQL(MySQL) Table 101 | aggr_os_version| 'Write results to CloudSQL Table' >> beam.ParDo(WriteToCloudSQL()) 102 | 103 | # Run Pipeline here 104 | if __name__ == "__main__": 105 | logging.getLogger().setLevel(logging.INFO) 106 | run() 107 | -------------------------------------------------------------------------------- /Dataflow/beam_stream_data_process.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 3 | Author : @ Anjan GCP Data Engineering 4 | 5 | Created this Apache Beam code to demo Dataflow Stream ETL pipeline 6 | 7 | Steps Explaining this code; 8 | 9 | 1. A Python APP acting as a stream data source and publishing continous data into PubSub Topic 10 | 2. Read data into Apache Beam Pipeline -> into Unbounded PCollection from PubSub Topic 11 | 3. Applly Unboubded data processing concepts like event timestamps, fixed windows 12 | 4. Aggregate the data 13 | 5. Write aggrgated into Big Query Table 14 | 6. Run this Pipeline on Dataflow Runner 15 | 16 | 17 | CLI command to run this pipeline on dataflow 18 | 19 | python3 -m \ 20 | --input_topic projects//topics/ \ 21 | --output_path gs:///output \ 22 | --project \ 23 | --region us-west1 \ 24 | --temp_location gs:///temp \ 25 | --runner DataflowRunner 26 | ''' 27 | 28 | import argparse 29 | import apache_beam as beam 30 | import logging 31 | from apache_beam.options.pipeline_options import PipelineOptions 32 | from apache_beam import window 33 | from datetime import datetime as dt 34 | 35 | class AddWindowdtlsFn(beam.DoFn): 36 | 37 | def process(self, element, window=beam.DoFn.WindowParam): 38 | window_start = window.start.to_utc_datetime() 39 | window_end = window.end.to_utc_datetime() 40 | pc = str(element) + ' [ ' + str(window_start) + ' - ' + str(window_end) + ' ]' 41 | pc = pc.split('\n') 42 | return pc 43 | 44 | def run(input_topic, output_path, pipeline_args=None): 45 | # Set `save_main_session` to True so DoFns can access globally imported modules. 
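    # streaming=True marks this as an unbounded (streaming) pipeline, which is required
    # when reading from Pub/Sub with ReadFromPubSub.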
46 | pipeline_options = PipelineOptions( 47 | pipeline_args, streaming=True, save_main_session=True 48 | ) 49 | with beam.Pipeline(options=pipeline_options) as p: 50 | #p = beam.Pipeline(options=options) 51 | ( 52 | p | "Read Events stream data from Topic" >> beam.io.ReadFromPubSub(topic=input_topic) 53 | | "Convert from Bytes to String" >> beam.Map(lambda s: s.decode("utf-8")) 54 | | 'Events Data' >> beam.Map(lambda x: {'event_nbr':x.split(',')[0],'event_time':dt.strptime(x.split(',')[1],'%Y-%m-%d %H:%M:%S.%f')}) 55 | | 'Events with Timestamps' >> beam.Map(lambda events: beam.window.TimestampedValue(events['event_nbr'], events['event_time'].timestamp())) 56 | | 'Events fixed Window' >> beam.WindowInto(window.FixedWindows(5)) 57 | | 'No of events per Window' >> beam.combiners.Count.Globally().without_defaults() 58 | | 'Final results with Window Info' >> beam.ParDo(AddWindowdtlsFn()) 59 | | 'String To BigQuery Row' >> beam.Map(lambda s: {'window_count': s}) 60 | #| 'Write Windowed results to GCS' >> beam.io.WriteToText(output_gcs_location + '/events_per_window_output.txt') 61 | #| 'Write to PubSub' >> beam.io.WriteToPubSub(topic=topic_sink) 62 | | 'Write to BigQuery' >> beam.io.Write(beam.io.WriteToBigQuery( 63 | '', 64 | dataset='', 65 | project='', 66 | schema ='window_count:STRING', 67 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, 68 | write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND 69 | ) 70 | ) 71 | 72 | ) #| beam.Map(print) 73 | 74 | if __name__ == "__main__": 75 | logging.getLogger().setLevel(logging.INFO) 76 | parser = argparse.ArgumentParser() 77 | parser.add_argument( 78 | "--input_topic", 79 | help="The Cloud Pub/Sub topic to read from." 80 | '"projects//topics/".', 81 | ) 82 | parser.add_argument( 83 | "--output_path", 84 | help="Path of the output GCS file including the prefix.", 85 | ) 86 | known_args, pipeline_args = parser.parse_known_args() 87 | run( 88 | known_args.input_topic, 89 | known_args.output_path, 90 | pipeline_args 91 | ) 92 | -------------------------------------------------------------------------------- /Dataflow/dataflow_batch_demo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author : Anjan GCP Data Engineering 3 | This code should be used only for Educational purposes 4 | This code is to perform 5 | 1. Read data from CSV file 6 | 2. Transform the data using Beam ParDo (User defined logic) 7 | 3. Write Transformed data into specified Big Query Table.
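Example invocation (illustrative only; bucket, project and region below are placeholders, not values from this repo):

python3 dataflow_batch_demo.py \
    --input gs://<your-bucket>/input_batch_data.csv \
    --output gs://<your-bucket>/output \
    --runner DataflowRunner \
    --project <your-project> \
    --region us-central1 \
    --temp_location gs://<your-bucket>/temp

Note: --input and --output are parsed by this script; the remaining flags are passed through to PipelineOptions.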
8 | ''' 9 | # Import required modules and methods 10 | import argparse 11 | import logging 12 | import apache_beam as beam 13 | import re 14 | from apache_beam.io import ReadFromText 15 | from apache_beam.io import WriteToText 16 | from apache_beam.options.pipeline_options import PipelineOptions 17 | 18 | # ParDo Class for parallel processing by applying user defined tranformations 19 | class scrip_val(beam.DoFn): 20 | def process(self, element): 21 | try: 22 | line = element.split('"') 23 | if line[9] == 'BUY': 24 | tp=line[3]+','+line[11].replace(',','') 25 | else: 26 | tp=line[3]+',-'+line[11].replace(',','') 27 | tp=tp.split() 28 | return tp 29 | except: 30 | logging.info('Some Error occured') 31 | 32 | # Entry run method for triggering pipline 33 | def run(): 34 | #Input arguments , reading from commandline 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('--input', 37 | dest='input', 38 | default='gs://dataflow_demo19', 39 | help='Input file to process.') 40 | parser.add_argument('--output', 41 | dest='output', 42 | required=True, 43 | help='Output file to write results to.') 44 | known_args, pipeline_args = parser.parse_known_args() 45 | 46 | # Function to SUM grouped elements 47 | def sum_groups(word_ones): 48 | (word, ones) = word_ones 49 | return word + ',' + str(sum(ones)) 50 | ''' 51 | def format_result(bulk_deal): 52 | (bulk, deal) = bulk_deal 53 | return '%s: %d' % (bulk, deal) 54 | ''' 55 | # Function to parse and format given input to Big Query readable JSON format 56 | def parse_method(string_input): 57 | 58 | values = re.split(",",re.sub('\r\n', '', re.sub(u'"', '', string_input))) 59 | row = dict( 60 | zip(('SYMBOL', 'BUY_SELL_QTY'), 61 | values)) 62 | return row 63 | 64 | # Main Pipeline 65 | with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p: 66 | lines = p | 'read' >> ReadFromText(known_args.input,skip_header_lines=1) 67 | counts = ( 68 | lines 69 | | 'Get required tuple' >> beam.ParDo(scrip_val()) 70 | | 'PairWithValue' >> beam.Map(lambda x: (x.split(',')[0],int(x.split(',')[1]))) 71 | | 'Group by Key' >> beam.GroupByKey() 72 | | 'Sum Group' >> beam.Map(sum_groups) 73 | | 'To String' >> beam.Map(lambda s: str(s)) 74 | | 'String To BigQuery Row' >> beam.Map(lambda s: parse_method(s)) 75 | #| 'format' >> beam.Map(format_result) 76 | #| 'Print' >> beam.Map(print) 77 | #| 'write' >> WriteToText(known_args.output) 78 | ) 79 | # Write to Big Query Sink 80 | counts| 'Write to BigQuery' >> beam.io.Write( 81 | beam.io.WriteToBigQuery( 82 | 'batach_data', 83 | dataset='dataflow_demo', 84 | project='gcp-dataeng-demos', 85 | schema ='SYMBOL:STRING,BUY_SELL_QTY:INTEGER', 86 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, 87 | write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE 88 | ) 89 | ) 90 | 91 | # Trigger entry function here 92 | if __name__ == '__main__': 93 | logging.getLogger().setLevel(logging.INFO) 94 | run() 95 | -------------------------------------------------------------------------------- /Dataflow/dataflow_batch_log_process.py: -------------------------------------------------------------------------------- 1 | # Import required modules and methods 2 | import argparse 3 | import logging 4 | import apache_beam as beam 5 | import re 6 | from apache_beam.io import ReadFromText 7 | from apache_beam.io import WriteToText 8 | from apache_beam.transforms.sql import SqlTransform 9 | from apache_beam.options.pipeline_options import PipelineOptions 10 | import json 11 | import ast 12 | 13 | # Setting up the Apache 
Beam pipeline options. 14 | beam_options = PipelineOptions( 15 | save_main_session=True, 16 | #runner='DirectRunner', 17 | runner='DataflowRunner', 18 | project='data-eng-demos19', 19 | temp_location='gs://data_eng_demos/temp', 20 | region='us-central1') 21 | 22 | # ParDo Class for parallel processing by applying user defined tranformations 23 | class ParseJSON(beam.DoFn): 24 | def process(self, element): 25 | try: 26 | dict_line = json.loads(element) 27 | sub_str = dict_line['protoPayload']['methodName'] 28 | if 'google.cloud' in sub_str: 29 | sub_str = sub_str.split('.')[4] + '.' + sub_str.split('.')[5] 30 | st = '{' + "'user':'" + dict_line['protoPayload']['authenticationInfo']['principalEmail'] + "','job_type':'" + sub_str.lower().rstrip('job') + "','info_type':'" + dict_line['severity'] + "','timestamp':'" + dict_line['timestamp'] + "'}" 31 | st = st.replace("'",'"') 32 | return st.split('\n') 33 | except: 34 | logging.info('Some Error occured') 35 | 36 | # Entry Function to run Pipeline 37 | def run(): 38 | # Set `save_main_session` to True so DoFns can access globally imported modules. 39 | with beam.Pipeline(options=beam_options) as p: 40 | 41 | result = ( 42 | p | 'Read from GCS' >> ReadFromText('gs://logs_bucket19/cloudaudit.googleapis.com/data_access/2022/12/28/*.json') 43 | | 'Parse logs to string representation of dict' >> beam.ParDo(ParseJSON()) 44 | | 'Convert String to Dict' >> beam.Map(lambda x: json.loads(x)) 45 | #| beam.Map(print) 46 | ) 47 | 48 | write_to_gcs = (result | 'get job type tuple' >> beam.Map(lambda x : ( x['job_type']+',' + x['info_type'],1)) 49 | | 'combine per key and sum' >> beam.CombinePerKey(sum) 50 | | 'format to JSON' >> beam.Map(lambda x : "{'job_type':'"+ x[0].split(',')[0] + 51 | "','info_type':'" + x[0].split(',')[1] + "','count':" + str(x[1]) +"}" ) 52 | #| beam.Map(print) 53 | | 'write final results into GCS bucket' >> beam.io.WriteToText('gs://data_eng_demos/output/bq_job_stats.txt') 54 | ) 55 | 56 | write_to_bq = result | 'Write parsed results to BigQuery' >> beam.io.Write(beam.io.WriteToBigQuery( 57 | 'bq_auditlog_parsed_data', 58 | dataset='gcp_dataeng_demos', 59 | project='data-eng-demos19', 60 | schema ='SCHEMA_AUTODETECT', 61 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, 62 | write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE 63 | ) 64 | ) 65 | 66 | if __name__ == "__main__": 67 | logging.getLogger().setLevel(logging.INFO) 68 | run() -------------------------------------------------------------------------------- /Dataflow/process_nested_data_sql_demo.py: -------------------------------------------------------------------------------- 1 | # Import required modules and methods 2 | import argparse 3 | import logging 4 | import apache_beam as beam 5 | import re 6 | from apache_beam.io import ReadFromText 7 | from apache_beam.io import WriteToText 8 | from apache_beam.transforms.sql import SqlTransform 9 | from apache_beam.options.pipeline_options import PipelineOptions 10 | import json 11 | import ast 12 | 13 | # Setting up the Apache Beam pipeline options. 
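# Input assumption (inferred from the transforms below): each line of ipl_player_stats.json
# is a JSON object roughly of the form
#   {"PlayerName": "...", "Age": "...", "Team": "...", "Previous3IPLBattingAvg": [...],
#    "Performance": [{"MatchNo": "...", "RunsScored": ..., "Wickets": ...}, ...]}
# and ParseJSON below flattens it into one record per entry of "Performance".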
14 | beam_options = PipelineOptions( 15 | save_main_session=True, 16 | #runner='DirectRunner', 17 | runner='DataflowRunner', 18 | project='gcp-dataeng-demos-355417', 19 | temp_location='gs://dataflow_demo19/temp', 20 | region='us-central1') 21 | 22 | # ParDo Class for parallel processing by applying user defined tranformations 23 | class ParseJSON(beam.DoFn): 24 | def process(self, element): 25 | try: 26 | dict_line = json.loads(element) 27 | lst = [] 28 | st = str(dict_line) 29 | st = st.split("'Performance':")[0] + "'Previous3IPLBattingAvg':" + str(dict_line['Previous3IPLBattingAvg']) + "," 30 | for l in dict_line['Performance']: 31 | result = (st + str(l).lstrip('{')) 32 | result = result.replace("'",'"') 33 | lst.append(result) 34 | return lst 35 | except: 36 | logging.info('Some Error occured') 37 | 38 | # Beam SQL Transformation query applied on Pcollection 39 | qry = '''SELECT 40 | PlayerName, 41 | Age, 42 | team, 43 | Previous3IPLBattingAvg, 44 | SUM(RunsScored) as total_RunsScored, 45 | SUM(Wickets) AS total_Wickets, 46 | FROM 47 | PCOLLECTION 48 | GROUP BY 49 | 1,2,3,4''' 50 | 51 | # Mapper function to update dict Previous3IPLBattingAvg values from String to List 52 | def StrLstUpdate(dct): 53 | dct.update({'Previous3IPLBattingAvg' : ast.literal_eval(dct['Previous3IPLBattingAvg'])}) 54 | return dct 55 | 56 | # Entry Function to run Pipeline 57 | def run(): 58 | # Set `save_main_session` to True so DoFns can access globally imported modules. 59 | with beam.Pipeline(options=beam_options) as p: 60 | 61 | result = ( 62 | p | 'Read from GCS' >> ReadFromText('gs://dataflow_demo19/input/ipl_player_stats.json') 63 | | 'Parse JSON and flatten' >> beam.ParDo(ParseJSON()) 64 | | 'Filter required data' >> beam.Filter(lambda x : ('"NotBowled"' not in x)) | beam.Filter(lambda x : ('"NotBatted"' not in x)) 65 | | 'Parse List to Dict' >> beam.Map(lambda x: json.loads(x)) 66 | | 'Convert as Beam Rows' >> beam.Map(lambda x: beam.Row( 67 | PlayerName = str(x['PlayerName']), 68 | Age = str(x['Age']), 69 | Team = str(x['Team']), 70 | MatchNo = str(x['MatchNo']), 71 | RunsScored = int(x['RunsScored']), 72 | Wickets = int(x['Wickets']), 73 | Previous3IPLBattingAvg = str(x['Previous3IPLBattingAvg']) 74 | ) 75 | ) 76 | 77 | | 'Get Palyer Stats by Appying Beam SQL Transform' >> SqlTransform(qry, dialect='zetasql') 78 | | 'Convert to Bigquery readable Dict' >> beam.Map(lambda row : row._asdict()) 79 | | 'Convert String representation of Previous3IPLBattingAvg to Nested' >> beam.Map(lambda x : StrLstUpdate(x)) 80 | #| beam.Map(print)) 81 | #Write to Big Query 82 | | 'Write Final results to BigQuery' >> beam.io.Write(beam.io.WriteToBigQuery( 83 | 'batach_data', 84 | dataset='dataflow_demos', 85 | project='gcp-dataeng-demos-355417', 86 | schema ='SCHEMA_AUTODETECT', 87 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, 88 | write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE 89 | ) 90 | )) 91 | 92 | if __name__ == "__main__": 93 | logging.getLogger().setLevel(logging.INFO) 94 | run() 95 | -------------------------------------------------------------------------------- /Dataproc/pyspark_bq_to_gcs_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Author : @ Anjan GCP Data Engineering 4 | This script is created to demo below concepts 5 | 1. Create Spark session on Dataproc cluster 6 | 2. Read input data from Big Query table 7 | 3. Apply Transformations to group and aggregate data by using Spark SQL 8 | 4. 
Write resultant data to GCS buacket --> File 9 | 10 | BigQuery I/O PySpark Demo - BigQuery --> Aggregate data --> write results to GCS 11 | 12 | """ 13 | 14 | from pyspark.sql import SparkSession 15 | 16 | # Spark session 17 | spark = SparkSession \ 18 | .builder \ 19 | .master('yarn') \ 20 | .appName('spark-bigquery-gcs-demo') \ 21 | .getOrCreate() 22 | 23 | # Use the Cloud Storage bucket for temporary BigQuery export data used 24 | # by the connector. 25 | bucket = "gcp-dataeng-demos" 26 | spark.conf.set('temporaryGcsBucket', bucket) 27 | 28 | # Load data from BigQuery Covid19 public dataset. 29 | covid19 = spark.read.format('bigquery') \ 30 | .option('table', 'bigquery-public-data:covid19_open_data.covid19_open_data') \ 31 | .load() 32 | covid19.createOrReplaceTempView('covid19') 33 | 34 | # Perform data aggregation. 35 | covid19 = spark.sql( 36 | 'SELECT \ 37 | country_name,\ 38 | EXTRACT(year FROM date) AS year,\ 39 | SUM(new_confirmed) AS new_confirmed,\ 40 | SUM(new_deceased) AS new_deceased,\ 41 | SUM(cumulative_confirmed) AS cumulative_confirmed,\ 42 | SUM(cumulative_deceased) AS cumulative_deceased\ 43 | FROM \ 44 | covid19 \ 45 | GROUP BY \ 46 | 1,\ 47 | 2 \ 48 | ORDER BY \ 49 | 1,\ 50 | 2') 51 | 52 | # Write results to GCS bucket 53 | covid19.write.csv('gs://gcp-dataeng-demos/coutrywise_cases') 54 | -------------------------------------------------------------------------------- /GCP_Data_Eng_concept_files/GCP_storage_db.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anjangcp/GCP-Data-Engineering-Demo-Codes/f92f4ce688459c9913d2ddef838c411eaeb969a0/GCP_Data_Eng_concept_files/GCP_storage_db.png -------------------------------------------------------------------------------- /GCP_Data_Eng_concept_files/Initfile.txt: -------------------------------------------------------------------------------- 1 | Welcome!! 
2 | -------------------------------------------------------------------------------- /GCP_Data_Eng_concept_files/gcp_etl_services.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anjangcp/GCP-Data-Engineering-Demo-Codes/f92f4ce688459c9913d2ddef838c411eaeb969a0/GCP_Data_Eng_concept_files/gcp_etl_services.png -------------------------------------------------------------------------------- /GCS/object_lifecycle_mngmnt_cli.txt: -------------------------------------------------------------------------------- 1 | 2 | /****** Object Lifecycle Management Config JSON File Example ****************/ 3 | 4 | { 5 | "lifecycle": { 6 | "rule": [ 7 | { 8 | "action": { 9 | "type": "SetStorageClass", 10 | "storageClass": "NEARLINE" 11 | }, 12 | "condition": { 13 | "age": 365, 14 | "matchesStorageClass": ["STANDARD"] 15 | } 16 | }, 17 | { 18 | "action": { 19 | "type": "SetStorageClass", 20 | "storageClass": "COLDLINE" 21 | }, 22 | "condition": { 23 | "age": 730, 24 | "matchesStorageClass": ["NEARLINE"] 25 | } 26 | } 27 | ] 28 | } 29 | } 30 | 31 | /****************** CLI Commands **********************/ 32 | 33 | 34 | -- List Rules 35 | 36 | gcloud storage buckets describe gs://gcp-data-eng-demos --format="default(lifecycle)" 37 | 38 | -- Create Rules 39 | 40 | gcloud storage buckets update gs://gcp-data-eng-demos \ 41 | --lifecycle-file=/home/gcpdataeng1982/gcs_lifecycle_config_file.json 42 | 43 | -- Clear Rules 44 | 45 | gcloud storage buckets update gs://gcp-data-eng-demos --clear-lifecycle 46 | -------------------------------------------------------------------------------- /GCS/object_lifecylce_mngmnt.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author : @ Anjan GCP Data Engineering 3 | Created for educational purpose only 4 | 5 | This code is to demo how to define and manage GCS Object Lifecycle Management rules using the Python client library 6 | ''' 7 | 8 | from google.cloud import storage 9 | 10 | def enable_bucket_lifecycle_management(bucket_name): 11 | """Manage lifecycle rules for a bucket""" 12 | 13 | storage_client = storage.Client() 14 | 15 | bucket = storage_client.get_bucket(bucket_name) 16 | rules = bucket.lifecycle_rules 17 | 18 | # Print current Rules 19 | print(f"Lifecycle management rules for bucket {bucket_name} are {list(rules)}") 20 | 21 | # Add new Rules 22 | #bucket.add_lifecycle_delete_rule(age=2) 23 | 24 | # Clearing Rules 25 | bucket.clear_lifecycle_rules() 26 | bucket.patch() 27 | 28 | # Print Rules again after modifications 29 | rules = bucket.lifecycle_rules 30 | print(f"Lifecycle management is enabled for bucket {bucket_name} and the rules are {list(rules)}") 31 | 32 | return bucket 33 | 34 | enable_bucket_lifecycle_management('gcp-data-eng-demos') 35 | -------------------------------------------------------------------------------- /GoogleCloudStorage/gcs_python_client_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Author : @ Anjan GCP Data Engineering 4 | 5 | Created for educational purpose only ... 6 | 7 | This Python file has different Python functions to explain 8 | 1. Create GCS bucket 9 | 2. Manage GCS bucket 10 | 3. Upload Storage objects 11 | 4. Download Storage objects 12 | 5. Bulk uploads and downloads 13 | 6. 
Delete bucket 14 | using Python GCS clinet libraries 15 | 16 | Insallations 17 | pip3 install google-cloud-storage 18 | 19 | """ 20 | 21 | from google.cloud import storage 22 | 23 | """ Create Bucket """ 24 | 25 | bucket_name = "demo_gcp_dataeng123" 26 | 27 | def create_bucket_class_location(bucket_name): 28 | """ 29 | Create a new bucket in the US region with the coldline storage 30 | class 31 | """ 32 | # bucket_name = "your-new-bucket-name" 33 | 34 | storage_client = storage.Client() 35 | 36 | bucket = storage_client.bucket(bucket_name) 37 | bucket.storage_class = "STANDARD" 38 | new_bucket = storage_client.create_bucket(bucket, location="us") 39 | 40 | print( 41 | "Created bucket {} in {} with storage class {}".format( 42 | new_bucket.name, new_bucket.location, new_bucket.storage_class 43 | ) 44 | ) 45 | return new_bucket 46 | 47 | create_bucket_class_location(bucket_name) 48 | 49 | """ List Buckets """ 50 | 51 | def list_buckets(): 52 | 53 | storage_client = storage.Client() 54 | buckets = storage_client.list_buckets() 55 | 56 | for bucket in buckets: 57 | print(bucket.name) 58 | 59 | #list_buckets() 60 | 61 | """ Get Bucket metadata info """ 62 | 63 | bucket_name = "demo_gcp_dataeng123" 64 | 65 | def bucket_metadata(bucket_name): 66 | """Prints out a bucket's metadata.""" 67 | # bucket_name = 'your-bucket-name' 68 | 69 | storage_client = storage.Client() 70 | bucket = storage_client.get_bucket(bucket_name) 71 | 72 | print(f"ID: {bucket.id}") 73 | print(f"Name: {bucket.name}") 74 | print(f"Storage Class: {bucket.storage_class}") 75 | print(f"Location: {bucket.location}") 76 | print(f"Location Type: {bucket.location_type}") 77 | print(f"Cors: {bucket.cors}") 78 | print(f"Default Event Based Hold: {bucket.default_event_based_hold}") 79 | print(f"Default KMS Key Name: {bucket.default_kms_key_name}") 80 | print(f"Metageneration: {bucket.metageneration}") 81 | print( 82 | f"Public Access Prevention: {bucket.iam_configuration.public_access_prevention}" 83 | ) 84 | print(f"Retention Effective Time: {bucket.retention_policy_effective_time}") 85 | print(f"Retention Period: {bucket.retention_period}") 86 | print(f"Retention Policy Locked: {bucket.retention_policy_locked}") 87 | print(f"Object Retention Mode: {bucket.object_retention_mode}") 88 | print(f"Requester Pays: {bucket.requester_pays}") 89 | print(f"Self Link: {bucket.self_link}") 90 | print(f"Time Created: {bucket.time_created}") 91 | print(f"Versioning Enabled: {bucket.versioning_enabled}") 92 | print(f"Labels: {bucket.labels}") 93 | 94 | #bucket_metadata(bucket_name) 95 | 96 | bucket_name = "demo_gcp_dataeng123" 97 | source_file_name = "/home/gcpdataeng36/input_batch_data.csv" 98 | destination_blob_name = "input_batch_data.csv" 99 | 100 | 101 | def upload_blob(bucket_name, source_file_name, destination_blob_name): 102 | """Uploads a file to the bucket.""" 103 | 104 | storage_client = storage.Client() 105 | bucket = storage_client.bucket(bucket_name) 106 | blob = bucket.blob(destination_blob_name) 107 | 108 | blob.upload_from_filename(source_file_name) 109 | 110 | print( 111 | f"File {source_file_name} uploaded to {destination_blob_name}." 
112 | ) 113 | #upload_blob(bucket_name, source_file_name, destination_blob_name) 114 | 115 | 116 | """ Get object's ACLs """ 117 | 118 | bucket_name = "demo_gcp_dataeng123" 119 | blob_name = 'input_batch_data.csv' 120 | def print_blob_acl(bucket_name, blob_name): 121 | """Prints out a blob's access control list.""" 122 | 123 | storage_client = storage.Client() 124 | bucket = storage_client.bucket(bucket_name) 125 | blob = bucket.blob(blob_name) 126 | 127 | for entry in blob.acl: 128 | print(f"{entry['role']}: {entry['entity']}") 129 | 130 | #print_blob_acl(bucket_name, blob_name) 131 | 132 | """ List all objcet in a bucket """ 133 | 134 | bucket_name = "demo_gcp_dataeng123" 135 | 136 | def list_blobs(bucket_name): 137 | """Lists all the blobs in the bucket.""" 138 | # bucket_name = "your-bucket-name" 139 | 140 | storage_client = storage.Client() 141 | 142 | # Note: Client.list_blobs requires at least package version 1.17.0. 143 | blobs = storage_client.list_blobs(bucket_name) 144 | 145 | # Note: The call returns a response only when the iterator is consumed. 146 | for blob in blobs: 147 | print(blob.name) 148 | 149 | #list_blobs(bucket_name) 150 | 151 | """ Upload multiple objects with transfer manager in parallel """ 152 | 153 | bucket_name = "demo_gcp_dataeng123" 154 | filenames = ["demo1.mov","input_batch_data.csv","ind_niftyrealtylist.csv"] 155 | source_directory="/home/gcpdataeng36" 156 | workers=8 157 | 158 | def upload_many_blobs_with_transfer_manager( 159 | bucket_name, filenames, source_directory, workers 160 | ): 161 | """Upload every file in a list to a bucket, concurrently in a process pool. 162 | 163 | Each blob name is derived from the filename, not including the 164 | `source_directory` parameter. For complete control of the blob name for each 165 | file (and other aspects of individual blob metadata), use 166 | transfer_manager.upload_many() instead. 167 | """ 168 | 169 | from google.cloud.storage import Client, transfer_manager 170 | import datetime 171 | 172 | storage_client = Client() 173 | bucket = storage_client.bucket(bucket_name) 174 | 175 | print("start time:",datetime.datetime.now()) 176 | 177 | results = transfer_manager.upload_many_from_filenames( 178 | bucket, filenames, source_directory=source_directory, max_workers=workers 179 | ) 180 | 181 | for name, result in zip(filenames, results): 182 | # The results list is either `None` or an exception for each filename in 183 | # the input list, in order. 
184 | 185 | if isinstance(result, Exception): 186 | print("Failed to upload {} due to exception: {}".format(name, result)) 187 | else: 188 | print("Uploaded {} to {}.".format(name, bucket.name)) 189 | print("end time:",datetime.datetime.now()) 190 | 191 | #upload_many_blobs_with_transfer_manager(bucket_name, filenames, source_directory, workers) 192 | 193 | """ Upload large files in chunks """ 194 | 195 | bucket_name = "demo_gcp_dataeng123" 196 | source_filename = "/home/gcpdataeng36/demo1.mov" 197 | destination_blob_name = "demo1.mov" 198 | workers=8 199 | 200 | def upload_chunks_concurrently( 201 | bucket_name, 202 | source_filename, 203 | destination_blob_name, 204 | chunk_size=32 * 1024 * 1024, 205 | workers=8, 206 | ): 207 | """Upload a single file, in chunks, concurrently in a process pool.""" 208 | 209 | from google.cloud.storage import Client, transfer_manager 210 | import datetime 211 | 212 | print("start time:",datetime.datetime.now()) 213 | storage_client = Client() 214 | bucket = storage_client.bucket(bucket_name) 215 | blob = bucket.blob(destination_blob_name) 216 | 217 | transfer_manager.upload_chunks_concurrently( 218 | source_filename, blob, chunk_size=chunk_size, max_workers=workers 219 | ) 220 | 221 | print(f"File {source_filename} uploaded to {destination_blob_name}.") 222 | print("end time:",datetime.datetime.now()) 223 | 224 | #upload_chunks_concurrently(bucket_name,source_filename,destination_blob_name,chunk_size=32 * 1024 * 1024,workers=8) 225 | 226 | """ Download multiple files """ 227 | 228 | bucket_name = "demo_gcp_dataeng123" 229 | blob_names = ["demo1.mov","input_batch_data.csv","ind_niftyrealtylist.csv"] 230 | destination_directory = "/home/gcpdataeng36/downloads" 231 | workers=8 232 | 233 | def download_many_blobs_with_transfer_manager( 234 | bucket_name, blob_names, destination_directory, workers=8 235 | ): 236 | """Download blobs in a list by name, concurrently in a process pool. 237 | 238 | The filename of each blob once downloaded is derived from the blob name and 239 | the `destination_directory `parameter. For complete control of the filename 240 | of each blob, use transfer_manager.download_many() instead. 241 | 242 | Directories will be created automatically as needed to accommodate blob 243 | names that include slashes. 244 | """ 245 | from google.cloud.storage import Client, transfer_manager 246 | 247 | storage_client = Client() 248 | bucket = storage_client.bucket(bucket_name) 249 | 250 | results = transfer_manager.download_many_to_path( 251 | bucket, blob_names, destination_directory=destination_directory, max_workers=workers 252 | ) 253 | 254 | for name, result in zip(blob_names, results): 255 | # The results list is either `None` or an exception for each blob in 256 | # the input list, in order. 257 | 258 | if isinstance(result, Exception): 259 | print("Failed to download {} due to exception: {}".format(name, result)) 260 | else: 261 | print("Downloaded {} to {}.".format(name, destination_directory + name)) 262 | 263 | 264 | #download_many_blobs_with_transfer_manager(bucket_name, blob_names, destination_directory, workers=8) 265 | 266 | """ delete bucket """ 267 | 268 | bucket_name = "demo_gcp_dataeng123" 269 | 270 | def delete_bucket(bucket_name): 271 | """Deletes a bucket. 
The bucket must be empty.""" 272 | # bucket_name = "your-bucket-name" 273 | 274 | storage_client = storage.Client() 275 | 276 | bucket = storage_client.get_bucket(bucket_name) 277 | bucket.delete() 278 | 279 | print(f"Bucket {bucket.name} deleted") 280 | 281 | #delete_bucket(bucket_name) 282 | -------------------------------------------------------------------------------- /GoogleCloudStorage/object_versioning_cli.txt: -------------------------------------------------------------------------------- 1 | gcloud storage buckets update gs://version-demo --versioning 2 | 3 | gcloud storage buckets describe gs://version-demo --format="default(versioning)" 4 | 5 | gcloud storage ls --all-versions gs://version-demo 6 | 7 | gcloud storage cp gs://version-demo/lables.key#1678617129615474 gs://version-demo/lables.key 8 | 9 | gcloud storage rm gs://version-demo/lables.key#1678617129615474 10 | 11 | gcloud storage buckets update gs://version-demo --no-versioning 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GCP-Data-Engineering-Demo-Codes 2 | Demo codes will be shared here, strictly for educational purposes. 3 | 4 | 1. Codes are categorized based on GCP services (Ex: BigQuery, Dataflow, Dataproc, etc.) 5 | 2. Codes include inline developer comments at each section of the code. 6 | 7 | Note for users: If you want to try these codes, please change the GCP project, BigQuery dataset/tables, GCS bucket/folder, etc. as per your needs. 8 | -------------------------------------------------------------------------------- /Security/secretmanager_python_connect.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Secret Manager Python client library to be installed with the below command 3 | ----------------------------------------- 4 | pip install google-cloud-secret-manager 5 | ''' 6 | def access_secret_version(project_id, secret_id, version_id): 7 | """ 8 | Access the payload for the given secret version if one exists. The version 9 | can be a version number as a string (e.g. "5") or an alias (e.g. "latest"). 10 | """ 11 | 12 | # Import the Secret Manager client library. 13 | from google.cloud import secretmanager 14 | 15 | # Create the Secret Manager client. 16 | client = secretmanager.SecretManagerServiceClient() 17 | 18 | # Build the resource name of the secret version. 19 | name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}" 20 | 21 | # Access the secret version. 22 | response = client.access_secret_version(request={"name": name}) 23 | # Print the secret payload. 24 | # This snippet shows how to access the secret material. 25 | payload = response.payload.data.decode("UTF-8") 26 | print("Plaintext: {}".format(payload)) 27 | 28 | # Function call to show output 29 | access_secret_version('gcp-data-eng-374308', 'cloudsql_pwd','1') 30 | --------------------------------------------------------------------------------
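A natural follow-on to the snippet above and to Dataflow/batch_etl_avro_data_cloudsql.py (which hard-codes its Cloud SQL password) is to return the secret payload instead of printing it and feed it to the Cloud SQL connector. A minimal sketch is shown below; the project ID, instance connection name, user and database names are placeholders, not values taken from this repo.

from google.cloud import secretmanager
from google.cloud.sql.connector import Connector
import sqlalchemy

def get_secret(project_id, secret_id, version_id="latest"):
    # Same API call as above, but returning the payload so it can be used as a password
    client = secretmanager.SecretManagerServiceClient()
    name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}"
    response = client.access_secret_version(request={"name": name})
    return response.payload.data.decode("UTF-8")

def getconn():
    # Placeholder instance connection name, user and database -- replace with your own
    connector = Connector()
    return connector.connect(
        "<your-project>:<region>:<instance>",
        "pymysql",
        user="root",
        password=get_secret("<your-project>", "cloudsql_pwd"),
        db="gcp_demos",
    )

# Connection pool that pulls the password from Secret Manager at connect time
pool = sqlalchemy.create_engine("mysql+pymysql://", creator=getconn)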