├── .gitattributes ├── .gitignore ├── 03-ingestion-gcs-to-bq ├── README.md ├── cloudbuild.yaml ├── config.yaml ├── crm_permissions.csv ├── crm_permissions_20210708.csv ├── crm_users_20210708.csv ├── main.py └── requirements.txt ├── 04-storage ├── airflow-gar-agg.py ├── dataflow-udf-ga4.js ├── diagrams.R ├── ga4-agg.sql └── ga4-bigquery-buildtrigger.yml ├── 06-activation ├── bigquery-clientid.R ├── gtm-ss-http-to-pubsub.js ├── http-to-pubsub.py ├── send_email.R └── user-activity-ga4.sql ├── 08-segmentation ├── bq-to-firestore │ ├── bigquery-parsing-workflow.yaml │ ├── bq-gcs-workflow.yaml │ ├── bq-pagination-workflows.yaml │ ├── bq-row-to-fs.py │ ├── bq-to-firestore.yaml │ ├── bq-to-fs-main-workflow.yaml │ └── connector_write_firestore_workflow.yaml └── crm_imports │ ├── crm_fake_data.R │ └── fake_crm.csv ├── 09-realtime-forecasting └── importing-tidy-ga4-data.R ├── admin └── image-figs.R ├── code-examples.Rproj ├── figure-log.csv └── gar_email └── Dockerfile /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | 08-segmentation/crm_imports/fake_crm.csv 6 | 08-segmentation/crm_imports/ga4-demo-cids.csv 7 | 08-segmentation/crm_imports/fake_crm.csv 8 | -------------------------------------------------------------------------------- /03-ingestion-gcs-to-bq/README.md: -------------------------------------------------------------------------------- 1 | # Cloud Function - Google Cloud Storage to BigQuery 2 | 3 | This is an example of a cloud function that can be used to trigger BigQuery imports when specified files hit Cloud Storage. 4 | 5 | It uses Cloud Build to deploy to Cloud Functions upon each git commit. 
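Once deployed, a quick manual test is to copy one of the sample CSVs into the trigger bucket and then tail the function's logs. The bucket, region, and function name below are the ones used in `cloudbuild.yaml`; substitute your own if you changed them:

```sh
# Upload a sample file - the google.storage.object.finalize event fires the function
gsutil cp crm_permissions_20210708.csv gs://marks-crm-imports-2021/

# Watch the function logs to confirm the BigQuery load job was created
gcloud functions logs read gcs_to_bq --region=europe-west1 --limit=20
```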
6 | 7 | -------------------------------------------------------------------------------- /03-ingestion-gcs-to-bq/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | steps: 2 | - name: gcr.io/cloud-builders/gcloud 3 | args: ['functions', 4 | 'deploy', 5 | 'gcs_to_bq', 6 | '--source=03-ingestion-gcs-to-bq', 7 | '--runtime=python39', 8 | '--region=europe-west1', 9 | '--trigger-resource=marks-crm-imports-2021', 10 | '--trigger-event=google.storage.object.finalize'] 11 | -------------------------------------------------------------------------------- /03-ingestion-gcs-to-bq/config.yaml: -------------------------------------------------------------------------------- 1 | project: learning-ga4 2 | datasetid: crm_imports 3 | schema: 4 | crm_bookings: 5 | fields: 6 | - name: BOOK_ID 7 | type: STRING 8 | - name: BOOKING_ACTIVE 9 | type: STRING 10 | - name: BOOKING_DEPOSIT 11 | type: STRING 12 | - name: DATE 13 | type: STRING 14 | - name: DEPARTURE_DATE 15 | type: STRING 16 | crm_permissions: 17 | fields: 18 | - name: USER_ID 19 | type: STRING 20 | - name: PERMISSION 21 | type: STRING 22 | - name: STATUS 23 | type: STRING 24 | - name: SOURCE 25 | type: STRING 26 | - name: PERMISSION_DATE 27 | type: STRING 28 | crm_sales: 29 | fields: 30 | - name: SALES_ID 31 | type: STRING 32 | - name: SALES_EMAIL 33 | type: STRING 34 | - name: SALES_FIRST_NAME 35 | type: STRING 36 | - name: SALES_LAST_NAME 37 | type: STRING -------------------------------------------------------------------------------- /03-ingestion-gcs-to-bq/crm_permissions.csv: -------------------------------------------------------------------------------- 1 | USER_ID,PERMISSION,STATUS,SOURCE,PERMISSION_DATE 2 | AB12345,Marketing1,True,Email,2021-01-21 3 | AB34252,Marketing3,True,Website,2020-12-02 4 | RF45343,-,False,-,- -------------------------------------------------------------------------------- /03-ingestion-gcs-to-bq/crm_permissions_20210708.csv: -------------------------------------------------------------------------------- 1 | USER_ID,PERMISSION,STATUS,SOURCE,PERMISSION_DATE 2 | AB12345,Marketing1,True,Email,2021-01-21 3 | AB34252,Marketing3,True,Website,2020-12-02 4 | RF45343,-,False,-,- -------------------------------------------------------------------------------- /03-ingestion-gcs-to-bq/crm_users_20210708.csv: -------------------------------------------------------------------------------- 1 | USER_ID,EMAIL,TOTAL_LIFETIME_REVENUE 2 | AB12345,david@email.com,56789 3 | AB34252,sanne@freeemail.com,34234 4 | RF45343,rose@medson.com,23123 -------------------------------------------------------------------------------- /03-ingestion-gcs-to-bq/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import logging 4 | import re 5 | import datetime 6 | from google.cloud import bigquery 7 | from google.cloud.bigquery import LoadJobConfig 8 | from google.cloud.bigquery import SchemaField 9 | import google.cloud.logging 10 | 11 | # set up logging https://cloud.google.com/logging/docs/setup/python 12 | client = google.cloud.logging.Client() 13 | client.get_default_handler() 14 | client.setup_logging() 15 | 16 | # load config.yaml into config 17 | config_file = "config.yaml" 18 | 19 | if os.path.isfile(config_file): 20 | with open("config.yaml", "r") as stream: 21 | try: 22 | config = yaml.safe_load(stream) 23 | except yaml.YAMLError as exc: 24 | logging.error(exc) 25 | else: 26 | logging.error("config.yaml needs to be added") 
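# For reference, the config.yaml shipped alongside this function (see
# 03-ingestion-gcs-to-bq/config.yaml) has this shape - a project, a target
# dataset, and one optional schema entry per table name:
#
#   project: learning-ga4
#   datasetid: crm_imports
#   schema:
#     crm_permissions:
#       fields:
#         - name: USER_ID
#           type: STRING
#
# Files whose base name has no matching entry under `schema:` fall back to
# BigQuery schema auto-detection (see query_schema() below).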
27 | 28 | # create a list of SchemaField objects from a schema config.yaml file 29 | def create_schema(schema_config): 30 | 31 | SCHEMA = [] 32 | for scheme in schema_config: 33 | 34 | if 'description' in scheme: 35 | description = scheme['description'] 36 | else: 37 | description = '' 38 | 39 | if 'mode' in scheme: 40 | mode = scheme['mode'] 41 | else: 42 | mode = 'NULLABLE' 43 | 44 | try: 45 | assert isinstance(scheme['name'], str) 46 | assert isinstance(scheme['type'], str) 47 | assert isinstance(mode, str) 48 | assert isinstance(description, str) 49 | except AssertionError as e: 50 | logging.info( 51 | 'Error in schema: name {} - type {} - mode - {} description {}'.format(scheme['name'], scheme['type'], mode, description)) 52 | break 53 | 54 | entry = SchemaField(name=scheme['name'], 55 | field_type=scheme['type'], 56 | mode=mode, 57 | description=description) 58 | SCHEMA.append(entry) 59 | 60 | logging.debug('SCHEMA created {}'.format(SCHEMA)) 61 | 62 | return SCHEMA 63 | 64 | 65 | 66 | def make_tbl_name(table_id, schema=False): 67 | 68 | t_split = table_id.split('_20') 69 | 70 | name = t_split[0] 71 | 72 | if schema: return name 73 | 74 | suffix = ''.join(re.findall('\d\d', table_id)[0:4]) 75 | 76 | return name + '$' + suffix 77 | 78 | 79 | def query_schema(table_id, job_config): 80 | 81 | schema_name = make_tbl_name(table_id, schema=True) 82 | 83 | logging.info('Looking for schema_name: {} for import: {}'.format(schema_name, table_id)) 84 | # if we have no configuration attempt auto-detection 85 | # recommended only for development tables 86 | if schema_name not in config['schema']: 87 | logging.info('No config found. Using auto detection of schema') 88 | job_config.autodetect = True 89 | return job_config 90 | 91 | logging.info('Found schema for ' + schema_name) 92 | 93 | schema_config = config['schema'][schema_name]['fields'] 94 | 95 | job_config.schema = create_schema(schema_config) 96 | 97 | # standard csv load behaviour can be defined here 98 | job_config.quote_character = '"' 99 | job_config.skip_leading_rows = 1 100 | job_config.field_delimiter = ',' 101 | job_config.allow_quoted_newlines = True 102 | 103 | return job_config 104 | 105 | def load_gcs_bq(uri, table_id, project, dataset_id): 106 | 107 | client = bigquery.Client(project=project) 108 | dataset_ref = client.dataset(dataset_id) 109 | 110 | # Change the below configuration according to your import needs 111 | job_config = LoadJobConfig() 112 | job_config.source_format = bigquery.SourceFormat.CSV 113 | job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE 114 | job_config.encoding = bigquery.Encoding.UTF_8 115 | job_config.time_partitioning = bigquery.TimePartitioning() 116 | 117 | job_config = query_schema(table_id, job_config) 118 | 119 | table_name = make_tbl_name(table_id) 120 | table_ref = dataset_ref.table(table_name) 121 | 122 | job = client.load_table_from_uri( 123 | uri, 124 | table_ref, 125 | location='EU', 126 | job_config=job_config) # API request 127 | 128 | 129 | 130 | def gcs_to_bq(data, context): 131 | """Background Cloud Function to be triggered by Cloud Storage. 132 | This functions constructs the file URI and uploads it to BigQuery. 133 | 134 | Args: 135 | data (dict): The Cloud Functions event payload. 136 | context (google.cloud.functions.Context): Metadata of triggering event. 
137 | Returns: 138 | None; the output is written to Stackdriver Logging 139 | """ 140 | 141 | object_name = data['name'] 142 | project = config['project'] 143 | dataset_id = config['datasetid'] 144 | 145 | if object_name: 146 | # create a bigquery table related to the filename 147 | table_id = os.path.splitext(os.path.basename(object_name))[0].replace('.','_') 148 | uri = 'gs://{}/{}'.format(data['bucket'], object_name) 149 | 150 | load_gcs_bq(uri, table_id, project, dataset_id) 151 | 152 | else: 153 | logging.info('Nothing to load') 154 | 155 | return -------------------------------------------------------------------------------- /03-ingestion-gcs-to-bq/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery==2.20.0 2 | google-cloud-logging==2.5.0 3 | pyyaml==5.4.1 4 | -------------------------------------------------------------------------------- /04-storage/airflow-gar-agg.py: -------------------------------------------------------------------------------- 1 | from airflow.contrib.operators.bigquery_operator import BigQueryOperator 2 | from airflow.contrib.operators.bigquery_check_operator import BigQueryCheckOperator 3 | from airflow.operators.dummy_operator import DummyOperator 4 | from airflow import DAG 5 | from airflow.utils.dates import days_ago 6 | import datetime 7 | 8 | VERSION = '0.1.7' # increment this each version of the DAG 9 | 10 | DAG_NAME = 'ga4-transformation-' + VERSION 11 | 12 | default_args = { 13 | 'start_date': days_ago(1), # change this to a fixed date for backfilling 14 | 'email_on_failure': True, 15 | 'email': 'mark@example.com', 16 | 'email_on_retry': False, 17 | 'depends_on_past': False, 18 | 'retries': 3, 19 | 'retry_delay': datetime.timedelta(minutes=10), 20 | 'project_id': 'learning-ga4', 21 | 'execution_timeout': datetime.timedelta(minutes=60) 22 | } 23 | 24 | schedule_interval = '2 4 * * *' # min, hour, day of month, month, day of week 25 | 26 | dag = DAG(DAG_NAME, default_args=default_args, schedule_interval=schedule_interval) 27 | 28 | 29 | start = DummyOperator( 30 | task_id='start', 31 | dag=dag 32 | ) 33 | 34 | # uses the Airflow macro {{ ds_nodash }} to insert todays date in YYYYMMDD form 35 | check_table = BigQueryCheckOperator( 36 | task_id='check_table', 37 | dag=dag, 38 | sql=''' 39 | SELECT count(1) > 5000 40 | FROM `learning-ga4.analytics_250021309.events_{{ ds_nodash }}`" 41 | ''' 42 | ) 43 | 44 | checked = DummyOperator( 45 | task_id='checked', 46 | dag=dag 47 | ) 48 | 49 | # a function so you can loop over many tables, SQL files 50 | def make_bq(table_id): 51 | 52 | task = BigQueryOperator( 53 | task_id='make_bq_'+table_id, 54 | write_disposition='WRITE_TRUNCATE', 55 | create_disposition='CREATE_IF_NEEDED', 56 | destination_dataset_table='learning_ga4.ga4_aggregations.{}${{ ds_nodash}}'.format(table_id), 57 | sql='./ga4_sql/{}.sql'.format(table_id), 58 | use_legacy_sql=False, 59 | dag=dag 60 | ) 61 | 62 | return task 63 | 64 | ga_tables = [ 65 | 'pageview-aggs', 66 | 'ga4-join-crm', 67 | 'ecom-fields' 68 | ] 69 | 70 | ga_aggregations = [] # helpful if you are doing other downstream transformations 71 | for table in ga_tables: 72 | task = make_bq(table) 73 | checked >> task 74 | ga_aggregations.append(task) 75 | 76 | 77 | # create the DAG 78 | start >> check_table >> checked -------------------------------------------------------------------------------- /04-storage/dataflow-udf-ga4.js: -------------------------------------------------------------------------------- 1 | 
/** 2 | * A transform function which filters out fields starting with x-ga 3 | * @param {string} inJson 4 | * @return {string} outJson 5 | */ 6 | function transform(inJson) { 7 | var obj = JSON.parse(inJson); 8 | var keys = Object.keys(obj); 9 | var outJson = {}; 10 | 11 | // don't output keys that starts with x-ga 12 | var outJson = keys.filter(function(key) { 13 | return !key.startsWith('x-ga'); 14 | }).reduce(function(acc, key) { 15 | acc[key] = obj[key]; 16 | return acc; 17 | }, {}); 18 | 19 | return JSON.stringify(outJson); 20 | } -------------------------------------------------------------------------------- /04-storage/diagrams.R: -------------------------------------------------------------------------------- 1 | DiagrammeR::mermaid(" 2 | graph LR 3 | import_ga4-->tidy_ga4 4 | tidy_ga4-->operations_dashboard 5 | import_crm-->tidy_crm 6 | tidy_crm-->join_data 7 | tidy_ga4-->join_data 8 | join_data-->marketing_data 9 | join_data-->sales_data 10 | join_data-->retention_data 11 | marketing_data-->web_enrichment 12 | web_enrichment-->user_api 13 | marketing_data-->marketing_dashboard_data 14 | marketing_data-->csuite_dashboard 15 | sales_data-->csuite_dashboard 16 | retention_data-->csuite_dashboard 17 | ") 18 | 19 | -------------------------------------------------------------------------------- /04-storage/ga4-agg.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | -- event_date (the date on which the event was logged) 3 | parse_date('%Y%m%d',event_date) as event_date, 4 | -- event_timestamp (in microseconds, utc) 5 | timestamp_micros(event_timestamp) as event_timestamp, 6 | -- event_name (the name of the event) 7 | event_name, 8 | -- event_key (the event parameter's key) 9 | (SELECT key FROM UNNEST(event_params) WHERE key = 'page_location') as event_key, 10 | -- event_string_value (the string value of the event parameter) 11 | (SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'page_location') as event_string_value 12 | FROM 13 | -- your GA4 exports - change to your location 14 | `mark-edmondson-gde.analytics_206670707.events_*` 15 | WHERE 16 | -- limits query to use table from yesterday only 17 | _table_suffix = format_date('%Y%m%d',date_sub(current_date(), interval 1 day)) 18 | and event_name = 'page_view' -------------------------------------------------------------------------------- /04-storage/ga4-bigquery-buildtrigger.yml: -------------------------------------------------------------------------------- 1 | steps: 2 | - name: 'gcr.io/cloud-builders/gcloud' 3 | entrypoint: 'bash' 4 | dir: '04-storage' 5 | args: ['-c', 6 | "bq --location=eu --project_id=$PROJECT_ID query --use_legacy_sql=false --destination_table=tidydata.ga4_pageviews$$${(date '+%Y%m%d')} < ./ga4-agg.sql"] 7 | -------------------------------------------------------------------------------- /06-activation/bigquery-clientid.R: -------------------------------------------------------------------------------- 1 | library(bigQueryR) 2 | options(googleAuthR.scopes.selected = "https://www.googleapis.com/auth/cloud-platform") 3 | googleAuthR::gar_gce_auth() 4 | 5 | # the GA4 dataset 6 | bqr_global_project("mark-edmondson-gde") 7 | bqr_global_dataset("analytics_206670707") 8 | 9 | query_client_id <- function(client_id, sql_file){ 10 | 11 | # read in SQL file and interpolate client_id 12 | sql <- readChar(sql_file, file.size(sql_file)) 13 | sql_client_id <- sprintf(sql, client_id) 14 | 15 | results <- tryCatch(bqr_query( 16 | query = sql_client_id, 17 | 
useLegacySql=FALSE 18 | ), error = function(err){ 19 | message(sql_client_id) 20 | stop("Error in query:", results$error, results$message, call. = FALSE) 21 | }) 22 | 23 | str(results) 24 | 25 | message("Writing ", nrow(results), " rows to bigquery_results.csv") 26 | write.csv(results, file = "/workspace/bigquery_results.csv", row.names = FALSE) 27 | 28 | 29 | TRUE 30 | 31 | } 32 | 33 | client_id <- Sys.getenv("CLIENT_ID") 34 | if(nzchar(client_id)){ 35 | query_client_id(client_id, "/workspace/06-activation/user-activity-ga4.sql") 36 | } else { 37 | stop("Could not find client_id") 38 | } 39 | -------------------------------------------------------------------------------- /06-activation/gtm-ss-http-to-pubsub.js: -------------------------------------------------------------------------------- 1 | const getAllEventData = require('getAllEventData'); 2 | const log = require("logToConsole"); 3 | const JSON = require("JSON"); 4 | const sendHttpRequest = require('sendHttpRequest'); 5 | 6 | log(data); 7 | 8 | const postBody = JSON.stringify(getAllEventData()); 9 | 10 | log('postBody parsed to:', postBody); 11 | 12 | const url = data.endpoint + '/' + data.topic_path 13 | 14 | log('Sending event data to:' + url); 15 | 16 | const options = {method: 'POST', 17 | headers: {'Content-Type':'application/json'}}; 18 | 19 | // Sends a POST request 20 | sendHttpRequest(url, (statusCode) => { 21 | if (statusCode >= 200 && statusCode < 300) { 22 | data.gtmOnSuccess(); 23 | } else { 24 | data.gtmOnFailure(); 25 | } 26 | }, options, postBody); 27 | -------------------------------------------------------------------------------- /06-activation/http-to-pubsub.py: -------------------------------------------------------------------------------- 1 | import os, json 2 | from google.cloud import pubsub_v1 # google-cloud-pubsub==2.8.0 3 | 4 | def http_to_pubsub(request): 5 | request_json = request.get_json() 6 | request_args = request.args 7 | 8 | print('Request json: {}'.format(request_json)) 9 | 10 | if request_json: 11 | res = trigger(json.dumps(request_json).encode('utf-8'), request.path) 12 | return res 13 | else: 14 | return 'No data found', 204 15 | 16 | 17 | def trigger(data, topic_name): 18 | publisher = pubsub_v1.PublisherClient() 19 | 20 | topic_name = 'projects/{project_id}/topics{topic}'.format( 21 | project_id=os.getenv('GCP_PROJECT'), 22 | topic=topic_name, 23 | ) 24 | 25 | print ('Publishing message to topic {}'.format(topic_name)) 26 | 27 | # create topic if necessary 28 | try: 29 | future = publisher.publish(topic_name, data) 30 | future_return = future.result() 31 | print('Published message {}'.format(future_return)) 32 | 33 | return future_return 34 | 35 | except Exception as e: 36 | print('Topic {} does not exist? 
Attempting to create it'.format(topic_name)) 37 | print('Error: {}'.format(e)) 38 | 39 | publisher.create_topic(name=topic_name) 40 | print ('Topic created ' + topic_name) 41 | 42 | return 'Topic Created', 201 -------------------------------------------------------------------------------- /06-activation/send_email.R: -------------------------------------------------------------------------------- 1 | library(blastula) 2 | library(formattable) 3 | 4 | the_data <- read.csv("/workspace/bigquery_results.csv") 5 | 6 | if(nrow(the_data) < 1){ 7 | stop("Data only one row, stopping") 8 | } 9 | 10 | # Get a nicely formatted date/time string 11 | date_time <- add_readable_time() 12 | ga4_table <- format_table(the_data) 13 | 14 | email <- 15 | compose_email( 16 | body = md(glue::glue( 17 | "Hello, 18 | 19 | You requested your GA4 browsing history from Mark Edmondson's website. Here it is! 20 | {ga4_table} 21 | 22 | ")), 23 | footer = md(glue::glue("Email sent on {date_time}.")) 24 | ) 25 | 26 | the_email <- Sys.getenv("EMAIL") 27 | 28 | if(nzchar(the_email)){ 29 | email %>% 30 | smtp_send( 31 | to = the_email, 32 | from = "me@markedmondson.me", 33 | subject = "Your GA4 history for Mark Edmondson's blog", 34 | credentials = creds_file("/workspace/blastula_gmail_creds") 35 | ) 36 | } else { 37 | stop("Could not find email in EMAIL env var") 38 | } 39 | 40 | 41 | -------------------------------------------------------------------------------- /06-activation/user-activity-ga4.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | format_timestamp("%Y%m%d", timestamp_micros(event_timestamp)) as event_date, 3 | CAST(user_pseudo_id AS STRING) as cid, 4 | event_name, 5 | TIMESTAMP_MICROS(user_first_touch_timestamp) as first_touch, 6 | (SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'page_location') as page_location, 7 | (SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'page_referrer') as page_referrer 8 | FROM 9 | `mark-edmondson-gde.analytics_206670707.events_*` 10 | WHERE 11 | _table_suffix between format_date('%Y%m%d',date_sub(current_date(), interval 90 day)) 12 | and format_date('%Y%m%d',date_sub(current_date(), interval 0 day)) 13 | and event_name = 'page_view' 14 | and user_pseudo_id = '%s' 15 | GROUP BY 1,2,3,4,5,6 16 | ORDER BY event_date -------------------------------------------------------------------------------- /08-segmentation/bq-to-firestore/bigquery-parsing-workflow.yaml: -------------------------------------------------------------------------------- 1 | map_bq_result: 2 | params: [row, names] 3 | steps: 4 | - init_cell_map: 5 | assign: 6 | - cell_map: [] 7 | - processRow: 8 | for: 9 | value: cell 10 | in: ${row.f} 11 | index: i 12 | steps: 13 | - map_cell: 14 | assign: 15 | - name: ${names[i]} 16 | - cell: {"${name}" : "${cell.v}"} 17 | - cell_map: ${cell_values} + ${cell} 18 | - returnRowMap: 19 | return: cell_map 20 | -------------------------------------------------------------------------------- /08-segmentation/bq-to-firestore/bq-gcs-workflow.yaml: -------------------------------------------------------------------------------- 1 | - init: 2 | assign: 3 | - project_id: "learning-ga4" 4 | - bq_dataset_export: "firestore_export" 5 | - bq_table_export: "uid_" 6 | - bq_query: > 7 | select 8 | user_id, 9 | device_first, 10 | channel_grouping_first 11 | from 12 | `stacktonic-cloud.st_core.dim_customer` 13 | - gcs_bucket: "" 14 | - gcs_filepath: "firestore-export/firestore-export.json" 15 | - pubsub_topic: "" 
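      # NOTE: the empty strings above and inside pubsub_message below are deliberate
      # placeholders - fill in your own bucket, Pub/Sub topic, Firestore collection
      # and key before deploying this workflow.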
16 | - pubsub_message: { 17 | "projectId": "", 18 | "bucketName": "", 19 | "bucketPath": "firestore-export/firestore-export.json", 20 | "firestoreCollection": "", 21 | "firestoreKey": "" 22 | } 23 | - bigquery-create-export-table: 24 | call: googleapis.bigquery.v2.jobs.insert 25 | args: 26 | projectId: ${project_id} 27 | body: 28 | configuration: 29 | query: 30 | query: ${bq_query} 31 | destinationTable: 32 | projectId: ${project_id} 33 | datasetId: ${bq_dataset_export} 34 | tableId: ${bq_table_export} 35 | create_disposition: "CREATE_IF_NEEDED" 36 | write_disposition: "WRITE_TRUNCATE" 37 | allowLargeResults: true 38 | useLegacySql: false 39 | 40 | - bigquery-table-to-gcs: 41 | call: googleapis.bigquery.v2.jobs.insert 42 | args: 43 | projectId: ${project_id} 44 | body: 45 | configuration: 46 | extract: 47 | compression: NONE 48 | destinationFormat: "NEWLINE_DELIMITED_JSON" 49 | destinationUris: ['${"gs://" + gcs_bucket + "/" + gcs_filepath}'] 50 | sourceTable: 51 | projectId: ${project_id} 52 | datasetId: ${bq_dataset_export} 53 | tableId: ${bq_table_export} 54 | - publish_message_to_pubsub: 55 | call: googleapis.pubsub.v1.projects.topics.publish 56 | args: 57 | topic: ${"projects/" + project_id + "/topics/" + pubsub_topic} 58 | body: 59 | messages: 60 | - data: ${base64.encode(json.encode(pubsub_message))} -------------------------------------------------------------------------------- /08-segmentation/bq-to-firestore/bq-pagination-workflows.yaml: -------------------------------------------------------------------------------- 1 | main: 2 | params: [input] 3 | steps: 4 | - init: 5 | assign: 6 | - pageToken: null 7 | - startQuery: 8 | call: googleapis.bigquery.v2.jobs.query 9 | args: 10 | projectId: ${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")} 11 | body: 12 | useLegacySql: false 13 | # Remove LIMIT from the query to iterate through all results 14 | query: > 15 | SELECT name, SUM(number) AS total 16 | FROM `bigquery-public-data.usa_names.usa_1910_2013` 17 | GROUP BY name ORDER BY total DESC LIMIT 50 18 | result: query 19 | - getPage: 20 | call: googleapis.bigquery.v2.jobs.getQueryResults 21 | args: 22 | projectId: ${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")} 23 | jobId: ${query.jobReference.jobId} 24 | maxResults: 500 25 | pageToken: ${pageToken} 26 | result: page 27 | - processPage: 28 | for: 29 | value: row 30 | in: ${page.rows} 31 | steps: 32 | - processRow: 33 | call: sys.log 34 | args: 35 | data: ${row} 36 | - checkIfDone: 37 | switch: 38 | - condition: ${"pageToken" in page and page.pageToken != ""} 39 | assign: 40 | - pageToken: ${page.pageToken} 41 | next: getPage -------------------------------------------------------------------------------- /08-segmentation/bq-to-firestore/bq-row-to-fs.py: -------------------------------------------------------------------------------- 1 | import json 2 | import firebase_admin # 5.2.0 3 | from firebase_admin import credentials 4 | from firebase_admin import firestore 5 | import datetime 6 | 7 | _MAX_LOSSLESS=9007199254740992 8 | 9 | def export_fs(call, fs_collection_name, db, fs_fields): 10 | 11 | if len(fs_fields) != len(call): 12 | return 'error: number of fields does not match number of fields in call' 13 | 14 | try: 15 | id = call[0] 16 | data = dict(zip(fs_fields, call)) 17 | db.collection(fs_collection_name).document(id).set(data) 18 | return 'added {} to Firestore: {}' % id, datetime.datetime.now() 19 | 20 | except Exception as e: 21 | return 'error: failed to add to Firestore: {}', datetime.datetime.now() 22 | 23 | def fs_add(request): 24 | 
try: 25 | return_value = [] 26 | request_json = request.get_json() 27 | calls = request_json['calls'] 28 | print('request: {}'.format(request_json)) 29 | except Exception as inst: 30 | return json.dumps( { "errorMessage": 'something unexpected in input' } ), 400 31 | 32 | if request_json and 'userDefinedContext' in request_json: 33 | userDefinedContext = request_json['userDefinedContext'] 34 | if 'fs_collection_name' in userDefinedContext: 35 | fs_collection_name = userDefinedContext['fs_collection_name'] 36 | else: 37 | return json.dumps( { "errorMessage": 'no fs_collection_name specified' } ), 400 38 | 39 | if 'fs_fields' in userDefinedContext: 40 | fs_fields = userDefinedContext['fs_fields'].split(",") 41 | else: 42 | return json.dumps( { "errorMessage": 'no fs_fields specified' } ), 400 43 | 44 | if 'fs_project_id' in userDefinedContext: 45 | fs_project_id = userDefinedContext['fs_project_id'] 46 | else: 47 | return json.dumps( { "errorMessage": 'no fs_project_id specified' } ), 400 48 | else: 49 | return json.dumps( { "errorMessage": 'no userDefinedContext specified' } ), 400 50 | 51 | if len(fs_fields) == 0: 52 | return json.dumps( { "errorMessage": 'length of fs_fields=0' } ), 400 53 | 54 | 55 | # Use the application default credentials 56 | cred = credentials.ApplicationDefault() 57 | firebase_admin.initialize_app(cred, { 58 | 'projectId': fs_project_id, 59 | }) 60 | 61 | db = firestore.client() 62 | 63 | print('Authenticated') 64 | print('fs_collection_name: {}' % fs_collection_name) 65 | print('fs_fields: {}' % fs_fields) 66 | print('fs_project_id: {}' % fs_project_id) 67 | 68 | 69 | if len(calls) > 500: 70 | return json.dumps( { "errorMessage": "too many rows - 500 limit" } ), 400 71 | 72 | for call in calls: 73 | value = export_fs(call=call, fs_collection_name=fs_collection_name, db=db, fs_fields=fs_fields) 74 | return_value.append(value) 75 | 76 | replies = [str(x) if x > _MAX_LOSSLESS or x < -_MAX_LOSSLESS else x for x in return_value] 77 | return_json = json.dumps( { "replies" : replies} ) 78 | return return_json 79 | 80 | 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /08-segmentation/bq-to-firestore/bq-to-firestore.yaml: -------------------------------------------------------------------------------- 1 | # iam roles: 2 | # - bq job user 3 | # - bq data owner 4 | # - cloud datastore user 5 | # - logging admin 6 | # workflows-bq-firestore@learning-ga4.iam.gserviceaccount.com 7 | main: 8 | params: [args] 9 | steps: 10 | - init_fs_writes: 11 | assign: 12 | - fs_writes: [] 13 | - page: ${args.page} 14 | - schema_names: ${args.schema_names} 15 | - projectId: ${args.projectId} 16 | - collection: ${args.collection} 17 | - fs_key: ${args.fs_key} 18 | - processPage: 19 | for: 20 | value: row 21 | in: ${page.rows} 22 | steps: 23 | - parse_data: 24 | call: map_bq_result 25 | args: 26 | row: ${row} 27 | names: ${schema_names} 28 | result: bq_map 29 | - list_to_dict: 30 | call: list_to_dict 31 | args: 32 | a_list: ${bq_map} 33 | result: bq_dict 34 | - assign_row_values: 35 | assign: 36 | - fs_key_value: ${map.get(bq_dict, fs_key)} 37 | - document_id: ${fs_key_value.stringValue} 38 | - name: ${"projects/"+projectId+"/databases/(default)/documents/"+collection+"/"+document_id} 39 | - document: {update: {fields: '${bq_dict}', name: '${name}'}} 40 | - fs_writes: ${list.concat(fs_writes, document)} 41 | - log_fs_writes: 42 | call: sys.log 43 | args: 44 | data: ${fs_writes} 45 | - batch_write_to_firestore: 46 | call: 
googleapis.firestore.v1.projects.databases.documents.batchWrite 47 | args: 48 | database: ${"projects/"+projectId+"/databases/(default)"} 49 | body: 50 | writes: ${fs_writes} 51 | result: write_result 52 | 53 | map_bq_result: 54 | params: [row, names] 55 | steps: 56 | - init_cell_list: 57 | assign: 58 | - cell_list: [] 59 | - processRow: 60 | # TODO: map different types to BigQuery schema 61 | # https://cloud.google.com/firestore/docs/reference/rest/Shared.Types/ArrayValue#Value 62 | for: 63 | value: cell 64 | in: ${row.f} 65 | index: i 66 | steps: 67 | - map_cell: 68 | assign: 69 | - name: ${names[i]} 70 | - value: ${default(cell.v, "")} 71 | - cell: {"${name}" : { "stringValue": '${value}'}} 72 | - cell_list: ${list.concat(cell_list, cell)} 73 | - returnRowList: 74 | return: ${cell_list} 75 | 76 | list_to_dict: 77 | params: [a_list] 78 | steps: 79 | - init_dict: 80 | assign: 81 | - the_dict: {} 82 | - loop_list: 83 | for: 84 | value: entry 85 | in: ${a_list} 86 | steps: 87 | - map_entry: 88 | assign: 89 | - the_name: ${keys(entry)[0]} 90 | - the_value: ${entry[the_name]} 91 | - the_dict[the_name]: ${the_value} 92 | - return_dict: 93 | return: ${the_dict} 94 | -------------------------------------------------------------------------------- /08-segmentation/bq-to-firestore/bq-to-fs-main-workflow.yaml: -------------------------------------------------------------------------------- 1 | # https://medium.com/google-cloud/parallel-executions-with-google-workflows-3a16f8fee0eb 2 | # iam roles: 3 | # - bq job user 4 | # - bq data owner 5 | # - cloud datastore user 6 | # - logging admin 7 | # - workflows invoker 8 | # workflows-bq-firestore@learning-ga4.iam.gserviceaccount.com 9 | main: 10 | params: [args] 11 | steps: 12 | - init: 13 | assign: 14 | - pageToken: null 15 | - default_query: > 16 | SELECT A.name, A.crm_id, created, job, 17 | sum(A.transactions) as sum_crm_trans, 18 | sum(A.revenue) as sum_crm_rev, 19 | sum(sum_web_trans) as sum_web_trans, 20 | sum(sum_web_rev) as sum_web_rev, 21 | FROM ( 22 | (SELECT * FROM `learning-ga4.crm_imports.fake_crm_transactions`) AS A 23 | LEFT JOIN 24 | (SELECT user_pseudo_id, 25 | count(distinct ecommerce.transaction_id) as sum_web_trans, 26 | sum(ecommerce.purchase_revenue_in_usd) as sum_web_rev, 27 | FROM `learning-ga4.ga4_public_dataset.events_*` 28 | GROUP BY 1) as B 29 | ON B.user_pseudo_id = A.cid) 30 | GROUP BY 1,2,3,4 31 | LIMIT 2000 # for testing, in production remove LIMIT 32 | - bq_query: ${default(map.get(args, "bq_query"), default_query)} 33 | - projectId: ${default(map.get(args, "projectId"), sys.get_env("GOOGLE_CLOUD_PROJECT_ID"))} 34 | - collection: ${default(map.get(args, "collection"), "crm-import")} 35 | - fs_key: ${default(map.get(args, "fs_key"), "crm_id")} # change to column holding firestore key 36 | - maximumBytesBilled: ${default(map.get(args, "maximumBytesBilled"), 1000000000)} #10GB 37 | - bq_page_size: ${default(map.get(args, "bq_page_size"), 50)} # firestore only accepts batches up to 500. 
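    # Example invocation, assuming the workflow is deployed under the name
    # bq-to-fs-main-workflow (all arguments are optional and default to the
    # values assigned in init above):
    #   gcloud workflows run bq-to-fs-main-workflow \
    #     --data='{"collection": "crm-import", "fs_key": "crm_id", "bq_page_size": 50}'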
38 | - log_init: 39 | call: sys.log 40 | args: 41 | data: ${bq_query + projectId + collection + fs_key + string(maximumBytesBilled) + string(bq_page_size)} 42 | - startQuery: 43 | call: googleapis.bigquery.v2.jobs.insert 44 | args: 45 | projectId: ${projectId} 46 | body: 47 | configuration: 48 | query: 49 | useLegacySql: false 50 | maximumBytesBilled: ${maximumBytesBilled} 51 | query: ${bq_query} 52 | result: query 53 | - getPage: 54 | call: googleapis.bigquery.v2.jobs.getQueryResults 55 | args: 56 | projectId: ${projectId} 57 | jobId: ${query.jobReference.jobId} 58 | maxResults: ${bq_page_size} 59 | pageToken: ${pageToken} 60 | result: page 61 | - extract_schema: 62 | call: extract_schema 63 | args: 64 | page: ${page} 65 | result: schema_names 66 | - log_schema: 67 | call: sys.log 68 | args: 69 | data: ${schema_names} 70 | - parallel-executor: 71 | call: googleapis.workflowexecutions.v1.projects.locations.workflows.executions.run 72 | args: 73 | workflow_id: bq-to-firestore 74 | argument: 75 | page: ${page} 76 | schema_names: ${schema_names} 77 | projectId: ${projectId} 78 | collection: ${collection} 79 | fs_key: ${fs_key} # change to column holding firestore key 80 | - checkIfDone: 81 | switch: 82 | - condition: ${"pageToken" in page and page.pageToken != ""} 83 | assign: 84 | - pageToken: ${page.pageToken} 85 | next: getPage 86 | 87 | 88 | extract_schema: 89 | params: [page] 90 | steps: 91 | - init_schema_list: 92 | assign: 93 | - schema_list: [] 94 | - i: 0 95 | - array: ${page.schema.fields} 96 | - check_condition: 97 | switch: 98 | - condition: ${len(array) > i} 99 | next: iterate 100 | next: exit_loop 101 | - iterate: 102 | assign: 103 | - value: ${array[i].name} 104 | - schema_list: ${list.concat(schema_list, value)} 105 | - i: ${i+1} 106 | next: check_condition 107 | - exit_loop: 108 | return: ${schema_list} -------------------------------------------------------------------------------- /08-segmentation/bq-to-firestore/connector_write_firestore_workflow.yaml: -------------------------------------------------------------------------------- 1 | # [START workflows_connector_write_firestore] 2 | - init_variables: 3 | assign: 4 | - project: ${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")} 5 | - collection: "peopleDatabase" 6 | - document: "smith.j" 7 | - values_to_write: 8 | FirstName: 9 | stringValue: "John" 10 | LastName: 11 | stringValue: "Smith" 12 | Age: 13 | integerValue: 32 14 | - write_to_firestore: 15 | call: googleapis.firestore.v1.projects.databases.documents.patch 16 | args: 17 | name: ${"projects/"+project+"/databases/(default)/documents/"+collection+"/"+document} 18 | body: 19 | fields: ${values_to_write} 20 | result: write_result 21 | - last: 22 | return: ${write_result.name} 23 | # [END workflows_connector_write_firestore] -------------------------------------------------------------------------------- /08-segmentation/crm_imports/crm_fake_data.R: -------------------------------------------------------------------------------- 1 | library(charlatan) # generate fake names 2 | library(bigQueryR) 3 | 4 | bqr_global_project("learning-ga4") 5 | bqr_global_dataset("crm_imports_us") 6 | 7 | # downloaded from GA4 BigQuery demo dataset 8 | cids <- read.csv(file = "08-segmentation/crm_imports/ga4-demo-cids.csv", 9 | colClasses = "character") 10 | 11 | # random order, only 50% of cids 12 | fake_logins <- cids[sample(nrow(cids)), , drop = FALSE] 13 | fake_logins <- head(fake_logins, nrow(fake_logins) / 2) 14 | fake_logins$crm_id <- sprintf("CRM%06d", seq.int(nrow(fake_logins))) 15 | 16 | 
# distinct user_ids 17 | 18 | fake_people <- nrow(fake_logins) 19 | ga4_last_date <- as.Date("2021-01-31") 20 | 21 | fake <- ch_generate("name", "job", n = fake_people) 22 | 23 | z <- DateTimeProvider$new() 24 | 25 | fake$created_ts <- unlist(lapply(1:fake_people, 26 | function(x){ 27 | z$date_time_between(start_date = as.Date("2001-03-05"), 28 | end_date = ga4_last_date) 29 | })) 30 | fake$created <- as.POSIXct(fake$created_ts, origin = "1970-01-01") 31 | 32 | # make it more likely to transact if you have these jobs 33 | fake$bias <- grepl("teacher|researcher|academic|school|engine|doctor|prof|surgeon|phd|dr|science", 34 | fake$job, ignore.case = TRUE) 35 | fake$transactions <- as.numeric(difftime(ga4_last_date, fake$created)) %/% 36 | runif(fake_people, 10000,90000) 37 | fake$transactions <- abs(ifelse(fake$bias, 38 | round(fake$transactions*runif(fake_people, 1.1, 2)), 39 | fake$transactions)) 40 | 41 | fake$revenue <- round(fake$transactions * runif(fake_people, 1,150),2) 42 | fake$permission <- as.logical(round(runif(fake_people, min = 0.4, max = 1))) 43 | fake$crm_id <- fake_logins$crm_id 44 | fake$cid <- as.character(fake_logins$user_pseudo_id) 45 | fake$bias <- NULL 46 | fake$created_ts <- NULL 47 | 48 | filename <- "08-segmentation/crm_imports/fake_crm.csv" 49 | write.csv(fake, file = filename, row.names = FALSE) 50 | 51 | # fake <- read.csv(filename,stringsAsFactors = FALSE, colClasses = "character") 52 | 53 | bqr_auth(email = "email@example.com") 54 | bqr_global_project("learning-ga4") 55 | bqr_global_dataset("crm_imports_us") 56 | 57 | bqr_delete_table(tableId = "fake_crm_transactions") 58 | bqr_create_table(tableId = "fake_crm_transactions", 59 | timePartitioning = TRUE) 60 | bqr_upload_data(tableId = "fake_crm_transactions", 61 | upload_data = fake) 62 | -------------------------------------------------------------------------------- /09-realtime-forecasting/importing-tidy-ga4-data.R: -------------------------------------------------------------------------------- 1 | library(googleAnalyticsR) 2 | library(rtweet) 3 | 4 | ga4s <- ga_account_list("ga4") 5 | 6 | gaid <- 206670707 7 | meta <- ga_meta("data", propertyId = gaid) 8 | 9 | article_reads <- ga_data(gaid, 10 | metrics = "eventCount", 11 | date_range = c("2021-07-01",as.character(Sys.Date())), 12 | dimensions = c("date", "customEvent:category"), 13 | orderBys = ga_data_order(+date), 14 | dim_filters = ga_data_filter(!"customEvent:category" == c("(not set)","null")), 15 | limit = -1) 16 | 17 | library(tidyr) 18 | library(dplyr) 19 | 20 | clean_cats <- article_reads |> 21 | rename(category = "customEvent:category", 22 | reads = "eventCount") |> 23 | mutate(category = tolower(category)) |> 24 | separate(category, 25 | into = paste0("category_",1:6), 26 | sep = "[^[:alnum:]-]+", 27 | fill = "right", extra = "drop") 28 | 29 | long_cats <- clean_cats |> 30 | pivot_longer( 31 | cols = starts_with("category_"), 32 | values_to = "categories", 33 | values_drop_na = TRUE 34 | ) 35 | 36 | 37 | agg_cats <- long_cats |> 38 | group_by(date, categories) |> 39 | summarise(category_reads = sum(reads), .groups = "drop_last") |> 40 | arrange(date, desc(category_reads)) 41 | -------------------------------------------------------------------------------- /admin/image-figs.R: -------------------------------------------------------------------------------- 1 | # replace with the folder where your .asciidoc files are 2 | book_folder <- "../learning-google-analytics/" 3 | 4 | # list the .asciidoc files 5 | chapters <- 
list.files(book_folder,pattern = "asciidoc$", full.names = TRUE) 6 | 7 | # read in the chapters 8 | book_text <- lapply(chapters, readLines) 9 | names(book_text) <- basename(chapters) 10 | 11 | # parse out only the image:: lines 12 | book_images <- lapply(book_text, \(x) x[grepl("^image", x)]) 13 | 14 | # create a list of dfs with image info 15 | images_df <- lapply(names(book_images), \(x){ 16 | o <- book_images[[x]] 17 | if(length(o) == 0) return(NULL) 18 | reggy <- "^image::images/(.+?\\.(png|jpg))\\[(.*)\\]" 19 | filenames <- gsub(reggy, "\\1", o) 20 | captions <- gsub(reggy, "\\3", o) 21 | chapter <- substr(x, 1,2) 22 | the_df <- data.frame(filename = filenames, caption = captions) 23 | the_df$fig_num <- paste0(chapter, "-", gsub(" ", "0", sprintf("%2d", 1:nrow(the_df)))) 24 | 25 | the_df 26 | }) 27 | 28 | # turn it into one data.frame and write to csv 29 | all_images <- Reduce(rbind, images_df) 30 | write.csv(all_images, file = "figure-log.csv", row.names = FALSE) 31 | 32 | # I then imported the CSV into GoogleSheets for review -------------------------------------------------------------------------------- /code-examples.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /figure-log.csv: -------------------------------------------------------------------------------- 1 | "filename","caption","fig_num" 2 | "gcp-pyramid.png","Google Cloud Platform Pyramid Hierarchy on which service you should choose for your applications","01-01" 3 | "data-architecture-7.png","Data Architecture for the Predictive Audiences use case: website data is sent to Google Analytics 4 which creates the predictive audience that is then exported to Google Ads","01-02" 4 | "chapter8_predictive.drawio.png","Data Architecture for the User Segmentation use case","01-03" 5 | "data-architecture-chapter_09.png","Real-time data is taken from GA4 and a forecast is created to inform employee what content they should prioritise for social media content and on-site banners via Google Optimise","01-04" 6 | "storage-decision-tree.png","A flowchart decision tree on selecting the correct GCP storage option","02-01" 7 | "enhanced-measurement-03.png","","03-01" 8 | "ga4-custom-event-parameters.png","Custom events and their parameters. As well as the custom parameters you send in, GA4 also collects useful data points such as ga_session_id and page_location","03-02" 9 | "gtm-setup-ga4-article_read.png","A suggested GTM setting for sending a GA4 custom event: article_read","03-03" 10 | "gtm-dom-selector-webpage.png","The blog post's publish date is available in the page HTML. 
Using your web browser's console you can find the CSS selector to pick up that data for use within Google Tag Manager","03-04" 11 | "gtm-dom-selector.png","The CSS code for the data can be used within Google Tag Manager's DOM Element Variable for use with GA4 and other tags","03-05" 12 | "ga4-custom-dimension-config.png","Configuring a custom dimension from the new parameter created with the article_read event","03-06" 13 | "simo-base-example-gtag-get-api.jpg","Write Client ID And Other GTAG Fields Into dataLayer","03-07" 14 | "messy-event-category-03.png","The article_read event contains a category parameter that has messy data due to multiple tags being recorded","03-08" 15 | "creating-r-viewer-03.png","Creating a custom event based off data captured via other events - in this case r_viewer is derived from the article_read event.","03-09" 16 | "created-events-list-03.png","Several created events based off of the custom category parameter from article_read","03-10" 17 | "mark-as-npa.png","Marking a use property as NPA to avoid it being used in targeting audiences","03-11" 18 | "cookieinfo-gtm-permissions.png","Permissions for the template code","03-12" 19 | "gtm-variable-template-instance.png","Creating a variable from the template","03-13" 20 | "gtm-consent-user-setup.png","Using the consent variable in a GA4 Event tag","03-14" 21 | "user_consent_setup.png","Setting up a user consent parameter in GA4","03-15" 22 | "event_consent_setup.png","Setting up an event consent parameter in GA4","03-16" 23 | "mp_sequence_diagram.png","","03-17" 24 | "ga4-bigquery-link.png","An example of a completed linking to BigQuery from the GA4 configuration screen","03-18" 25 | "ga4-bigquery-sql-example-1-result.png","An example result for running the BigQuery SQL on your GA4 exports from","03-19" 26 | "gcs-lifecycle-rules.png","Google Cloud Storage Lifecycle rules","03-20" 27 | "cloud-function-gcs-trigger-1.png","Creating a Cloud Function to trigger from a newly uploaded file in Cloud Storage","03-21" 28 | "cloud-function-gcs-trigger-2.png","Creating a Cloud Function to trigger from a newly uploaded file in Cloud Storage","03-22" 29 | "cloud-function-gcs-code-1.png","Adding code for Python Cloud Functions","03-23" 30 | "gcs-to-bq-logs1.png","Inspecting the Cloud Function logs we can see one file was imported to a specified schema, one used auto-detection","03-24" 31 | "gcs-to-bq-logs2.png","Inspecting the BigQuery logs to see the schema has been specified as expected","03-25" 32 | "gcs-to-bq-loaded.png","The BigQuery tables are imported from the CSV on Cloud Storage with the specified schema","03-26" 33 | "cloud-build-setup-2.png","Linking Cloud Build to the GitHub repository holding your files","03-27" 34 | "cloud-build-setup-3.png","A Cloud Build trigger that will activate the contents of the cloudbuild.yaml upon each commit to the GitHub repository","03-28" 35 | "cloud-build-setup-4.png","Setting the Cloud Build permission to deploy Cloud Functions","03-29" 36 | "cloud-build-setup-1.png","Files in a folder enabled for git","03-30" 37 | "cloud-build-setup-5.png","A successful deployment of a Cloud Function via Cloud Build","03-31" 38 | "tidy-1.png","Following three rules makes a dataset tidy: variables are in columns, observations are in rows, and values are in cells. From *R for Data Science* by Hadley Wickham and Garret Grolemund","04-01" 39 | "bigquery-ga4-exports-logs.png","A Cloud Logging filter for seeing when your GA4 BigQuery exports are ready. 
We shall use this to create a PubSub topic.","04-02" 40 | "bigquery-ga4-export-pubsub.png","Setting up your GA4 BigQuery log so it sends the entries to PubSub topic named ga4-bigquery","04-03" 41 | "cloud-storage-example.png","Files sitting within Cloud Storage in its WebUI","04-04" 42 | "cloud-storage-metadata.png","Various metadata associated with a file upload to Google Cloud Storage","04-05" 43 | "bigquery-scheduler.png","Setting up the query into a scheduled query","04-06" 44 | "airflow_dags.png","An example of an Airflow DAG.","04-07" 45 | "airflow-example-dag.png","An example of the DAG created in Airflow","04-08" 46 | "cloud-scheduler-list.png","Some Cloud Schedulers I have enabled for some tasks within my own Google Cloud Project","04-09" 47 | "cloud-build-log-example.png","A Cloud Build that has successfully built within the Google Cloud Console.","04-10" 48 | "bigquery-ga4-export-build-trigger.png","Setting up a Build Trigger that will build once the BigQuery export for GA4 is complete","04-11" 49 | "bigquery-ga4-export-build-permissions.png","Adding to the Cloud Build service account the permissions to execute BigQuery jobs","04-12" 50 | "pubsub-setting-up-dataflow-config.png","Setting up a Dataflow from within the Google Cloud Console for a PubSub topic into BigQuery via the pre-defined template","04-13" 51 | "dataflow-running.png","Starting up a running job for importing PubSub messages into BigQuery in real-time","04-14" 52 | "pubsub-dataflow-bigquery-schema.png","The BigQuery data schema to receive the PubSub Json","04-15" 53 | "pubsub-dataflow-bigquery-errors.png","Any errors from the data flow will appear in its own BigQuery table so you can examine the payloads","04-16" 54 | "pubsub-dataflow-bigquery.png","A successful streaming import from GA4 into GTM-SS to PubSub to BigQuery","04-17" 55 | "pubsub-to-bq-env-args.png","Setting the environment arguments for use within the cloud function","04-18" 56 | "pubsub-to-bq-parse-json.png","The raw data table receiving the PubSub stream from GA4 via GTM-SS can have its JSON parsed out with BigQuery's functions such as JSON_VALUE()","04-19" 57 | "bigquery-dataset-table-expiration.png","You can configure the table expiration time when you create a dataset","04-20" 58 | "ga4-attribution.png","In GA4 you can set attribution settings on how your conversions are attributed to which channel","05-01" 59 | "ga4-reporting-id.png","Selecting how users can be identified within GA4 reports","05-02" 60 | "lowest-channel.png","Writing questions in the GA4 search bar will parse itself to try and find the most appropriate GA4 report for you","05-03" 61 | "insights-ga4.png","Insights looks to flag the most important findings of the day when you log in","05-04" 62 | "bq-model-cheatsheet.png","This cheat sheet shows what use cases and BigQuery ML model may be most appropriate","05-05" 63 | "nlp-pipeline.png","An event based pipeline for processing text files on Cloud Storage as they arrive and put the Natural Language API results into a BigQuery table","05-06" 64 | "vertex-ai-dataset.png","Creating a dataset from BigQuery in Vertex AI","05-07" 65 | "ga4-demo-audience-list.png","A list of GA4 Audiences taken from the GA4 demo account for Google Merchandise Store","06-01" 66 | "audience-session-start-2-pvs.png","A configuration for session_start events with 2 further page_view events.","06-02" 67 | "audience-got-not-open.png","A configuration for users who received a notification but did not open it","06-03" 68 | 
"suggested-predictive-audiences.png","If you fulfil the criteria you will see predictive audiences available in your GA4 configuration","06-04" 69 | "optimise-target-audience.png","Selecting a GA4 audience within Google Optimise","06-05" 70 | "optimise-banner.png","Setting up a banner for the website that will trigger when the GA4 Audience segment is fulfilled.","06-06" 71 | "ga4-reports-trend.png","GA4 Standard Reports show you real-time updates and trends for your GA4 event data","06-07" 72 | "image:images/ga4-user-acquisition.png[Showing how users first arrived to my blog]","image:images/ga4-user-acquisition.png[Showing how users first arrived to my blog]","06-08" 73 | "ga4-select-goal-event.png","Selecting which of your events or goal conversions that channel contributed to","06-09" 74 | "ga4-segment-comparisons.png","Comparison of All Users and users who arrived via Google for the count of google analytics category articles","06-10" 75 | "ga4-merchandise-anomaly.png","An anomaly spike found in the merchandise data within the Google Merchandise Store demo GA$ account","06-11" 76 | "ga4-customise-collection.png","A custom collection of reports for my Blog","06-12" 77 | "ga4-explorations.png","The start of your Exploration work flow involves selecting or creating one from the start screen","06-13" 78 | "ga4-explorations-select-variables.png","Select the variables you think you will need in your exploration","06-14" 79 | "ga4-explorations-segments.png","Using the Segment overlap technique to see which users are from the US and use mobile devices","06-15" 80 | "ga4-exploration-fields.png","Selecting the appropriate fields for your exploration report","06-16" 81 | "ga4-exploration-user-explorer.png","The User Explorer report can drill down on an individual cookie ID","06-17" 82 | "ga4-pathing.png","Path analysis of which pages were visited after the two_pageviews event was triggered","06-18" 83 | "ga4-funnels.png","Examining a funnel drop-out journey","06-19" 84 | "ga4-cohorts.png","How many users who triggered the googleanalytics_viewer event came back to the website the subsequent months","06-20" 85 | "datastudio-connectors.png","Connecting Data Studio to GA4","06-21" 86 | "gar-data-studio-connectors.png","Connecting Data Studio via the Google Analytics connector vs the BigQuery table","06-22" 87 | "ga4-datastudio-databloo.png","A GA4 Data Studio example for GA4 from Databloo","06-23" 88 | "ga4_looker.jpg","Looker connects to GA4's BigQuery dataset and uses its LookML language to create useful data points","06-24" 89 | "form_submit_to_pubsub.png","The tag within GTM-SS for forwarding on your events to a HTTP endpoint","06-25" 90 | "firestore-example.png","Example data within a Firestore instance","06-26" 91 | "firestore-lookup-gtmss.png","A Firestore Lookup Variable in Google Tag Manager Serverside","06-27" 92 | "data-architecture-7.png","Data Architecture for the Predictive Audiences use case: website data is sent to Google Analytics 4 which creates the predictive audience that is then exported to Google Ads","07-01" 93 | "audience-creation.png","Customising a predictive audience","07-02" 94 | "predicitve-audeince-config.png","Configuration of an Audience showing likely purchasers in the next seven days","07-03" 95 | "chapter8_predictive.drawio.png","Data Architecture for the User Segmentation use case","08-01" 96 | "user_profession_08.png","Configuration of a custom field to hold user profession in GA4","08-02" 97 | "fake-crm-data-08.png","Fake CRM data within BigQuery generated to 
overlap the Google Merchandise Store cookieIds","08-03" 98 | "sql-example-8-ga4-screenshot.png","The results of a transaction query upon the public GA4 data","08-04" 99 | "ga4-crm-join-result-08.png","An example showing the result of joining the demo datasets from GA4 and CRM","08-05" 100 | "firestore-crm-import.png","Your CRM data imported into Firestore from BigQuery","08-06" 101 | "ga4-gtm-ss-user-id.png","Setting up a custom event to extract the user_id we will use as the document name to fetch data from Firestore","08-07" 102 | "ga4-gtm-ss-job.png","Configuring a GTM-SS Firestore Lookup variable so call our Firestore collection containing the CRM data, using user_id as its document reference","08-08" 103 | "ga4-gtm-ss-tag.png","Configuring a GA4 GTM-SS event tag with user properties adding the Firestore value","08-09" 104 | "doctors-who-may-purchase.png","Adding a new custom dimension to our Audience definitions, that will combine with the existing Predictive Audience","08-10" 105 | "predicitve-audeince-config.png","Configuration of an Audience showing likely purchasers in the next seven days","08-11" 106 | "data-architecture-chapter_09.png","Real-time data is taken from GA4 and a forecast is created to inform employee what content they should prioritise for social media content and on-site banners via Google Optimise","09-01" 107 | "medical-users-medical-content-audiences.png","Creating an audience we can query in the Real-Time API matching /medical-books to users who are Doctors","09-02" 108 | "medical-users-medical-content-event-trigger.png","When users qualify for the audience, they can set off an event that can be seen within the Real-Time API","09-03" 109 | "cloud-run-container.png","The Container Image URL will be the one you have specified for your local Docker build","09-04" 110 | "r-forecasts.png","Some example output for forecasting GA4 real-time Audience data","09-05" 111 | "r-highcharter.png","Output from the script that turns R forecast objects into `www.highcharts.com` plots","09-06" 112 | "create-service-key-for-ga4.png","Create a service key for use within your app","09-07" 113 | "create-new-json-key.png","Once you have created the service account, download a JSON key for use within your application","09-08" 114 | "adding-service-account-ga4.png","Adding a service email as a user to the GA4 interface for use within your scripts","09-09" 115 | "ga4-realtime-shiny-app.png","A running Shiny app with real-time GA4 data and a forecast","09-10" 116 | -------------------------------------------------------------------------------- /gar_email/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/gcer-public/googleauthr-verse:latest 2 | RUN install2.r --error \ 3 | -r 'http://cran.rstudio.com' \ 4 | blastula formattable 5 | --------------------------------------------------------------------------------
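This image extends the public googleauthr-verse base with the blastula and formattable packages, which are the libraries loaded by 06-activation/send_email.R, so it is presumably the environment for that Cloud Build email step. A minimal sketch of building and pushing it for use in a build pipeline - the image name and project id are assumptions, substitute your own:

    docker build -t gcr.io/learning-ga4/gar-email ./gar_email
    docker push gcr.io/learning-ga4/gar-email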