├── 01_ingest_and_transform ├── 11_challenge │ ├── README.md │ ├── beam │ │ ├── Dockerfile │ │ ├── beam_processing.py │ │ ├── config.py │ │ └── requirements.txt │ ├── cloud-run-pubsub-proxy │ │ ├── .gitignore │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── app.js │ │ ├── index.js │ │ └── package.json │ ├── config_env.sh │ ├── datalayer │ │ ├── .gitignore │ │ ├── README.md │ │ ├── add_to_cart.json │ │ ├── ecommerce_events_bq_schema.json │ │ ├── purchase.json │ │ ├── purchase_anomaly.json │ │ ├── synth_data_stream.py │ │ └── view_item.json │ ├── main.tf │ ├── processing-service │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── config.py │ │ ├── main.py │ │ └── requirements.txt │ ├── terraform.tfvars │ └── variables.tf └── 12_solution │ ├── README.md │ ├── beam │ ├── Dockerfile │ ├── beam_processing.py │ ├── config.py │ └── requirements.txt │ ├── cloud-run-pubsub-proxy │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ ├── app.js │ ├── index.js │ └── package.json │ ├── config_env.sh │ ├── datalayer │ ├── .gitignore │ ├── README.md │ ├── add_to_cart.json │ ├── ecommerce_events_bq_schema.json │ ├── purchase.json │ ├── purchase_anomaly.json │ ├── synth_data_stream.py │ └── view_item.json │ ├── main.tf │ ├── processing-service │ ├── Dockerfile │ ├── README.md │ ├── config.py │ ├── main.py │ └── requirements.txt │ ├── terraform.tfvars │ └── variables.tf ├── 02_activate ├── 21_challenge │ ├── README.md │ ├── cloud-run-pubsub-proxy │ │ ├── .gitignore │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── app.js │ │ ├── index.js │ │ └── package.json │ ├── config_custom.py │ ├── config_env.sh │ ├── custom_train │ │ ├── prediction │ │ │ ├── Dockerfile │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── main.py │ │ │ └── requirements.txt │ │ └── trainer │ │ │ ├── Dockerfile │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── main.py │ │ │ ├── preprocess.py │ │ │ ├── requirements.txt │ │ │ └── train.py │ ├── datalayer │ │ ├── .gitignore │ │ ├── README.md │ │ ├── add_to_cart.json │ │ ├── ecommerce_events_bq_schema.json │ │ ├── purchase.json │ │ ├── purchase_anomaly.json │ │ ├── synth_data_stream.py │ │ └── view_item.json │ ├── inf_processing_service │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── config.py │ │ ├── main.py │ │ └── requirements.txt │ ├── inf_processing_service_custom │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── config.py │ │ ├── main.py │ │ └── requirements.txt │ ├── kf_pipe_custom.py │ ├── main.tf │ ├── processing-service │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── config.py │ │ ├── main.py │ │ └── requirements.txt │ ├── requirements.txt │ ├── terraform.tfvars │ └── variables.tf └── 22_solution │ ├── README.md │ ├── config.py │ ├── config_custom.py │ ├── config_env.sh │ ├── custom_train │ ├── prediction │ │ ├── Dockerfile │ │ ├── __init__.py │ │ ├── config.py │ │ ├── main.py │ │ └── requirements.txt │ └── trainer │ │ ├── Dockerfile │ │ ├── __init__.py │ │ ├── config.py │ │ ├── main.py │ │ ├── preprocess.py │ │ ├── requirements.txt │ │ └── train.py │ ├── inf_processing_service │ ├── Dockerfile │ ├── README.md │ ├── config.py │ ├── main.py │ └── requirements.txt │ ├── inf_processing_service_custom │ ├── Dockerfile │ ├── README.md │ ├── config.py │ ├── main.py │ └── requirements.txt │ ├── kf_pipe.py │ ├── kf_pipe_custom.py │ ├── main.tf │ ├── requirements.txt │ ├── terraform.tfvars │ └── variables.tf ├── CONTRIBUTING.md ├── LICENSE ├── README.md └── rsc ├── .DS_Store ├── cloudrun_processing.png ├── dataflow.png ├── efficient_pipelines.png ├── hyp_architecture.png ├── hyp_ml_architecture.png ├── ingestion.png 
├── pubsub_direct.png └── pubsub_metrics.png /01_ingest_and_transform/11_challenge/beam/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/dataflow-templates-base/python39-template-launcher-base:latest as template_launcher 2 | 3 | ARG WORKDIR=/dataflow/template 4 | RUN mkdir -p ${WORKDIR} 5 | WORKDIR ${WORKDIR} 6 | 7 | # Copy local code to the container image. 8 | COPY . ./ 9 | COPY ./requirements.txt ./ 10 | COPY ./beam_processing.py ./ 11 | 12 | # Flex Template ENV Vars 13 | ENV FLEX_TEMPLATE_PYTHON_PY_FILE="${WORKDIR}/beam_processing.py" 14 | ENV FLEX_TEMPLATE_PYTHON_REQUIREMENTS_FILE="${WORKDIR}/requirements.txt" 15 | 16 | # Install requirements 17 | RUN pip install --upgrade pip 18 | RUN pip install --no-cache-dir -r requirements.txt 19 | 20 | ENTRYPOINT ["/opt/google/dataflow/python_template_launcher"] -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/beam/beam_processing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import json 16 | import time 17 | 18 | import config 19 | 20 | import apache_beam as beam 21 | from apache_beam.options.pipeline_options import PipelineOptions 22 | from apache_beam.transforms import trigger 23 | from apache_beam.io.gcp.pubsub import ReadFromPubSub 24 | from apache_beam.io.gcp.bigquery import BigQueryDisposition, WriteToBigQuery 25 | from apache_beam.runners import DataflowRunner 26 | 27 | # Defining event filter functions. 28 | 29 | 30 | def is_item_view(event): 31 | return event['event'] == 'view_item' 32 | 33 | 34 | def is_add_to_cart(event): 35 | return event['event'] == 'add_to_cart' 36 | 37 | 38 | def is_purchase(event): 39 | return event['event'] == 'purchase' 40 | 41 | 42 | class ExtractValueFn(beam.DoFn): 43 | def process(self, element): 44 | print(f"ExtractValueFn: {element['ecommerce']['purchase']['value']}") 45 | return [element['ecommerce']['purchase']['value']] 46 | 47 | 48 | class ExtractAndSumValue(beam.PTransform): 49 | """A transform to extract key/score information and sum the scores. 50 | The constructor argument `field` determines whether 'team' or 'user' info is 51 | extracted. 52 | """ 53 | 54 | def expand(self, pcoll): 55 | sum_val = ( 56 | pcoll 57 | | beam.Map(lambda elem: (elem['user_id'], elem['ecommerce']['purchase']['value'])) 58 | | '' 59 | return(sum_val) 60 | 61 | 62 | class FormatByRow(beam.PTransform): 63 | """A transform to reformat the data to column name/value format. 
64 | """ 65 | 66 | def expand(self, pcoll): 67 | row_val = ( 68 | pcoll 69 | | beam.Map(lambda elem: {'user_id': elem[0], 70 | 'summed_value': elem[1] 71 | }) 72 | ) 73 | return(row_val) 74 | 75 | 76 | def streaming_pipeline(project, region): 77 | 78 | subscription = "projects/{}/subscriptions/hyp_subscription_dataflow".format( 79 | project) 80 | 81 | bucket = "gs://{}-ecommerce-events/tmp_dir".format(project) 82 | 83 | # Defining pipeline options. 84 | options = PipelineOptions( 85 | streaming=True, 86 | project=project, 87 | region=region, 88 | staging_location="%s/staging" % bucket, 89 | temp_location="%s/temp" % bucket, 90 | subnetwork='regions/europe-west1/subnetworks/terraform-network', 91 | service_account_email='retailpipeline-hyp@{}.iam.gserviceaccount.com'.format( 92 | project), 93 | max_num_workers=1 94 | ) 95 | 96 | # Defining pipeline. 97 | p = beam.Pipeline(DataflowRunner(), options=options) 98 | 99 | # Receiving message from Pub/Sub & parsing json from string. 100 | json_message = (p 101 | # Listining to Pub/Sub. 102 | | "Read Topic" >> '' 103 | # Parsing json from message string. 104 | | "Parse json" >> beam.Map(json.loads) 105 | ) 106 | 107 | # Extracting Item Views. 108 | item_views = (json_message 109 | | 'Filter for item views' >> beam.Filter(is_item_view) 110 | | "item view row" >> beam.Map(lambda input: {'event_datetime': input['event_datetime'], # Dropping and renaming columns. 111 | 'event': input['event'], 112 | 'user_id': input['user_id'], 113 | 'client_id': input['client_id'], 114 | 'page': input['page'], 115 | 'page_previous': input['page_previous'], 116 | "item_name": input['ecommerce']['items'][0]["item_name"], 117 | "item_id": input['ecommerce']['items'][0]["item_id"], 118 | "price": input['ecommerce']['items'][0]["price"], 119 | "item_brand": input['ecommerce']['items'][0]["item_brand"], 120 | "item_category": input['ecommerce']['items'][0]["item_category"], 121 | "item_category_2": input['ecommerce']['items'][0]["item_category_2"], 122 | "item_category_3": input['ecommerce']['items'][0]["item_category_3"], 123 | "item_category_4": input['ecommerce']['items'][0]["item_category_4"], 124 | "item_variant": input['ecommerce']['items'][0]["item_variant"], 125 | "item_list_name": input['ecommerce']['items'][0]["item_list_name"], 126 | "item_list_id": input['ecommerce']['items'][0]["item_list_id"], 127 | "quantity": input['ecommerce']['items'][0]["quantity"] 128 | }) 129 | ) 130 | 131 | fixed_windowed_items = (json_message 132 | | 'Filter for purchase' >> beam.Filter(is_purchase) 133 | | 'Global Window' >> '' 134 | | 'ExtractAndSumValue' >> ExtractAndSumValue() 135 | | 'FormatByRow' >> FormatByRow() 136 | ) 137 | 138 | # Writing summed values to BigQuery 139 | aggregated_schema = "user_id:STRING, summed_value:FLOAT" 140 | aggregated_table = "{}:ecommerce_sink.beam_aggregated".format(project) 141 | 142 | fixed_windowed_items | "Write Summed Values To BigQuery" >> WriteToBigQuery(table=aggregated_table, schema=aggregated_schema, 143 | create_disposition=BigQueryDisposition.CREATE_IF_NEEDED, 144 | write_disposition=BigQueryDisposition.WRITE_APPEND) 145 | 146 | # Writing the PCollections to two differnt BigQuery tables. 
147 | item_views_table = "{}:ecommerce_sink.beam_item_views".format(project) 148 | schema = "event_datetime:DATETIME, event:STRING, user_id:STRING, client_id:STRING, page:STRING, page_previous:STRING, " \ 149 | "item_name:STRING, item_id:STRING, price:STRING, item_brand:STRING, item_category:STRING, item_category_2:STRING, item_category_3:STRING, " \ 150 | "item_category_4:STRING, item_variant:STRING, item_list_name:STRING, item_list_id:STRING, quantity:STRING" 151 | 152 | item_views | "Write Items Views To BigQuery" >> WriteToBigQuery(table=item_views_table, schema=schema, 153 | create_disposition=BigQueryDisposition.CREATE_IF_NEEDED, 154 | write_disposition=BigQueryDisposition.WRITE_APPEND) 155 | 156 | return p.run() 157 | 158 | 159 | if __name__ == '__main__': 160 | GCP_PROJECT = config.project_id 161 | GCP_REGION = config.location 162 | 163 | streaming_pipeline(project=GCP_PROJECT, region=GCP_REGION) 164 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/beam/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = '' 16 | location = 'europe-west1' 17 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/beam/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam 2 | google-apitools -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/cloud-run-pubsub-proxy/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/cloud-run-pubsub-proxy/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official lightweight Node.js 12 image. 2 | # https://hub.docker.com/_/node 3 | FROM node:12-slim 4 | 5 | # Create and change to the app directory. 6 | WORKDIR /usr/src/app 7 | 8 | # Copy application dependency manifests to the container image. 9 | # A wildcard is used to ensure both package.json AND package-lock.json are copied. 10 | # Copying this separately prevents re-running npm install on every code change. 11 | COPY package*.json ./ 12 | 13 | # Install production dependencies. 14 | RUN npm install --only=production 15 | 16 | # Copy local code to the container image. 17 | COPY . ./ 18 | 19 | # Run the web service on container startup. 
20 | CMD [ "npm", "start" ] -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/cloud-run-pubsub-proxy/README.md: -------------------------------------------------------------------------------- 1 | Cloud Run Proxy is a express webserver that listenes to incoming requests and publishes them to a chosen Pub/Sub topic. -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/cloud-run-pubsub-proxy/app.js: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | const express = require('express'); 16 | const bodyParser = require('body-parser'); 17 | const app = express(); 18 | 19 | app.use(bodyParser.json()); 20 | 21 | app.get('/', (req, res) => { 22 | console.log('Hello world received a request.'); 23 | 24 | const target = process.env.TARGET || 'World'; 25 | res.send(`Hello ${target}!`); 26 | }); 27 | 28 | app.post('/json', (req, res) => { 29 | const dataLayer = JSON.stringify(req.body) 30 | console.log(`proxy POST request received dataLayer: ${dataLayer}`) 31 | 32 | const {PubSub} = require('@google-cloud/pubsub'); 33 | 34 | // Instantiates a client 35 | const pubsub = new PubSub(); 36 | 37 | const {Buffer} = require('safe-buffer'); 38 | 39 | // Set Pub/Sub topic name 40 | let topicName = 'hyp-pubsub-topic'; 41 | 42 | // References an existing topic 43 | const topic = pubsub.topic(topicName); 44 | 45 | // Publishes the message as a string, 46 | const dataBuffer = Buffer.from(dataLayer); 47 | 48 | // Add two custom attributes, origin and username, to the message 49 | const customAttributes = { 50 | origin: 'gtm-cloud-run', 51 | username: 'gcp-demo', 52 | }; 53 | 54 | // Publishes a message to Pub/Sub 55 | return topic 56 | .publishMessage({data: dataBuffer}) 57 | .then(() => res.status(200).send(`{"message": "pubsub message sent: ${dataBuffer}"}`)) 58 | .catch(err => { 59 | console.error(err); 60 | res.status(500).send(err); 61 | return Promise.reject(err); 62 | }); 63 | }) 64 | 65 | 66 | module.exports = app; 67 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/cloud-run-pubsub-proxy/index.js: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | const app = require('./app.js'); 16 | const PORT = process.env.PORT || 8080; 17 | 18 | app.listen(PORT, () => console.log(`pubsub proxy app listening on port ${PORT}`)); 19 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/cloud-run-pubsub-proxy/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pubsub-proxy", 3 | "version": "1.0.0", 4 | "description": "Cloud Run app to send messages to Pub/Sub topic using Node", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "node index.js" 8 | }, 9 | "author": "", 10 | "license": "Apache-2.0", 11 | "dependencies": { 12 | "express": "^4.17.1", 13 | "body-parser": "^1.19.0", 14 | "@google-cloud/pubsub": "^3.3.0", 15 | "safe-buffer": "5.1.2" 16 | } 17 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/config_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export GCP_PROJECT="" 4 | export ENDPOINT_URL="" # doesn't need to be defined in the very beginning 5 | export PUSH_ENDPOINT='' # doesn't need to be defined in the very beginning 6 | export GCP_REGION=europe-west1 7 | export RUN_PROXY_DIR=cloud-run-pubsub-proxy 8 | export RUN_PROCESSING_DIR=processing-service 9 | export DATAFLOW_TEMPLATE=beam 10 | export RUN_INFERENCE_PROCESSING_SERVICE=inf_processing_service 11 | 12 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/datalayer/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/datalayer/README.md: -------------------------------------------------------------------------------- 1 | Datalayer defines the json events that could occur to be fed into the pipeline. 
2 | 3 | Four types of events are included: 4 | * add to cart 5 | * made purchase 6 | * made purcase with anomaly (artifical mistake in data to be identified later) 7 | * view item -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/datalayer/add_to_cart.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "add_to_cart", 4 | "user_id": "UID00003", 5 | "client_id": "CID00003", 6 | "page":"/product-67890", 7 | "page_previous": "/category-tshirts", 8 | "ecommerce": { 9 | "items": [{ 10 | "item_name": "Donut Friday Scented T-Shirt", 11 | "item_id": "67890", 12 | "price": 33.75, 13 | "item_brand": "Google", 14 | "item_category": "Apparel", 15 | "item_category_2": "Mens", 16 | "item_category_3": "Shirts", 17 | "item_category_4": "Tshirts", 18 | "item_variant": "Black", 19 | "item_list_name": "Search Results", 20 | "item_list_id": "SR123", 21 | "index": 1, 22 | "quantity": 2 23 | }] 24 | } 25 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/datalayer/ecommerce_events_bq_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "event_datetime", 4 | "type": "TIMESTAMP", 5 | "mode": "NULLABLE" 6 | }, 7 | { 8 | "name": "event", 9 | "type": "STRING", 10 | "mode": "REQUIRED" 11 | }, 12 | { 13 | "name": "user_id", 14 | "type": "STRING", 15 | "mode": "REQUIRED" 16 | }, 17 | { 18 | "name": "client_id", 19 | "type": "STRING", 20 | "mode": "NULLABLE" 21 | }, 22 | { 23 | "name": "page", 24 | "type": "STRING", 25 | "mode": "NULLABLE" 26 | }, 27 | { 28 | "name": "page_previous", 29 | "type": "STRING", 30 | "mode": "NULLABLE" 31 | }, 32 | { 33 | "name": "ecommerce", 34 | "type": "RECORD", 35 | "mode": "NULLABLE", 36 | "fields": [ 37 | { 38 | "mode": "REPEATED", 39 | "name": "items", 40 | "type": "RECORD", 41 | "fields": [ 42 | { 43 | "mode": "NULLABLE", 44 | "name": "index", 45 | "type": "INTEGER" 46 | }, 47 | 48 | { 49 | "mode": "NULLABLE", 50 | "name": "item_id", 51 | "type": "INTEGER" 52 | }, 53 | { 54 | "mode": "NULLABLE", 55 | "name": "item_name", 56 | "type": "STRING" 57 | }, 58 | { 59 | "mode": "NULLABLE", 60 | "name": "item_list_name", 61 | "type": "STRING" 62 | }, 63 | { 64 | "mode": "NULLABLE", 65 | "name": "item_list_id", 66 | "type": "STRING" 67 | }, 68 | { 69 | "mode": "NULLABLE", 70 | "name": "price", 71 | "type": "FLOAT" 72 | }, 73 | { 74 | "mode": "NULLABLE", 75 | "name": "item_variant", 76 | "type": "STRING" 77 | }, 78 | { 79 | "mode": "NULLABLE", 80 | "name": "quantity", 81 | "type": "INTEGER" 82 | }, 83 | { 84 | "mode": "NULLABLE", 85 | "name": "item_brand", 86 | "type": "STRING" 87 | }, 88 | { 89 | "mode": "NULLABLE", 90 | "name": "item_category", 91 | "type": "STRING" 92 | }, 93 | { 94 | "mode": "NULLABLE", 95 | "name": "item_category_2", 96 | "type": "STRING" 97 | }, 98 | { 99 | "mode": "NULLABLE", 100 | "name": "item_category_3", 101 | "type": "STRING" 102 | }, 103 | { 104 | "mode": "NULLABLE", 105 | "name": "item_category_4", 106 | "type": "STRING" 107 | } 108 | ] 109 | }, 110 | { 111 | "mode": "NULLABLE", 112 | "name": "purchase", 113 | "type": "RECORD", 114 | "fields": [ 115 | { 116 | "fields": [ 117 | { 118 | "mode": "NULLABLE", 119 | "name": "item_coupon", 120 | "type": "STRING" 121 | }, 122 | { 123 | "mode": "NULLABLE", 124 | "name": "quantity", 125 | "type": "INTEGER" 126 | }, 127 | { 128 
| "mode": "NULLABLE", 129 | "name": "item_variant", 130 | "type": "STRING" 131 | }, 132 | { 133 | "mode": "NULLABLE", 134 | "name": "item_category", 135 | "type": "STRING" 136 | }, 137 | { 138 | "mode": "NULLABLE", 139 | "name": "item_name", 140 | "type": "STRING" 141 | }, 142 | { 143 | "mode": "NULLABLE", 144 | "name": "item_id", 145 | "type": "INTEGER" 146 | }, 147 | { 148 | "mode": "NULLABLE", 149 | "name": "item_brand", 150 | "type": "STRING" 151 | }, 152 | { 153 | "mode": "NULLABLE", 154 | "name": "item_price", 155 | "type": "FLOAT" 156 | } 157 | ], 158 | "mode": "REPEATED", 159 | "name": "items", 160 | "type": "RECORD" 161 | }, 162 | { 163 | "mode": "NULLABLE", 164 | "name": "coupon", 165 | "type": "STRING" 166 | }, 167 | { 168 | "mode": "NULLABLE", 169 | "name": "tax", 170 | "type": "FLOAT" 171 | }, 172 | { 173 | "mode": "NULLABLE", 174 | "name": "shipping", 175 | "type": "FLOAT" 176 | }, 177 | { 178 | "mode": "NULLABLE", 179 | "name": "value", 180 | "type": "FLOAT" 181 | }, 182 | { 183 | "mode": "NULLABLE", 184 | "name": "affiliation", 185 | "type": "STRING" 186 | }, 187 | { 188 | "mode": "NULLABLE", 189 | "name": "currency", 190 | "type": "STRING" 191 | }, 192 | { 193 | "mode": "NULLABLE", 194 | "name": "transaction_id", 195 | "type": "STRING" 196 | } 197 | ] 198 | } 199 | ] 200 | } 201 | ] 202 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/datalayer/purchase.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "purchase", 4 | "user_id": "UID00001", 5 | "client_id": "CID00003", 6 | "page":"/checkout", 7 | "page_previous": "/order-confirmation", 8 | "ecommerce": { 9 | "purchase": { 10 | "transaction_id": "T12345", 11 | "affiliation": "Online Store", 12 | "value": 35.43, 13 | "tax": 4.90, 14 | "shipping": 5.99, 15 | "currency": "EUR", 16 | "coupon": "SUMMER_SALE", 17 | "items": [{ 18 | "item_name": "Triblend Android T-Shirt", 19 | "item_id": "12345", 20 | "item_price": 15.25, 21 | "item_brand": "Google", 22 | "item_category": "Apparel", 23 | "item_variant": "Gray", 24 | "quantity": 1, 25 | "item_coupon": "" 26 | }, { 27 | "item_name": "Donut Friday Scented T-Shirt", 28 | "item_id": "67890", 29 | "item_price": 33.75, 30 | "item_brand": "Google", 31 | "item_category": "Apparel", 32 | "item_variant": "Black", 33 | "quantity": 1 34 | }] 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/datalayer/purchase_anomaly.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "purchase", 4 | "user_id": "UID00001", 5 | "client_id": "CID00003", 6 | "page":"/checkout", 7 | "page_previous": "/order-confirmation", 8 | "ecommerce": { 9 | "purchase": { 10 | "transaction_id": "T12345", 11 | "affiliation": "Online Store", 12 | "value": 1000000.10, 13 | "tax": 4.90, 14 | "shipping": 5.99, 15 | "currency": "EUR", 16 | "coupon": "SUMMER_SALE", 17 | "items": [{ 18 | "item_name": "Triblend Android T-Shirt", 19 | "item_id": "12345", 20 | "item_price": 15.25, 21 | "item_brand": "Google", 22 | "item_category": "Apparel", 23 | "item_variant": "Gray", 24 | "quantity": 1, 25 | "item_coupon": "" 26 | }, { 27 | "item_name": "Donut Friday Scented T-Shirt", 28 | "item_id": "67890", 29 | "item_price": 33.75, 30 | "item_brand": "Google", 31 | "item_category": "Apparel", 32 
| "item_variant": "Black", 33 | "quantity": 1 34 | }] 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/datalayer/synth_data_stream.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import random 16 | import requests 17 | import json 18 | import time 19 | import argparse 20 | 21 | 22 | def main(endpoint): 23 | draw = round(random.uniform(0, 1), 2) 24 | 25 | uid = f'UID0000{int(round(random.uniform(0, 5), 0))}' 26 | 27 | if 0 <= draw < 1 / 3: 28 | # get view payload 29 | view_item_f = open('./datalayer/view_item.json') 30 | view_item_payload = json.load(view_item_f) 31 | 32 | view_item_payload['user_id'] = uid 33 | 34 | # send view 35 | r = requests.post(endpoint, json=view_item_payload) 36 | 37 | elif 1 / 3 <= draw < 2 / 3: 38 | # get add to cart payload 39 | add_to_cart_f = open('./datalayer/add_to_cart.json') 40 | add_to_cart_payload = json.load(add_to_cart_f) 41 | 42 | add_to_cart_payload['user_id'] = uid 43 | 44 | # send add to cart 45 | r = requests.post(endpoint, json=add_to_cart_payload) 46 | 47 | else: 48 | # decide between anomaly or no anomaly 49 | if draw < 0.95: 50 | # get payload 51 | purchase_f = open('./datalayer/purchase.json') 52 | purchase_payload = json.load(purchase_f) 53 | 54 | purchase_payload['user_id'] = uid 55 | 56 | # send request 57 | r = requests.post(endpoint, json=purchase_payload) 58 | else: 59 | # get payload 60 | purchase_anomaly_f = open('./datalayer/purchase_anomaly.json') 61 | purchase_anomaly_payload = json.load(purchase_anomaly_f) 62 | 63 | purchase_anomaly_payload['user_id'] = uid 64 | 65 | # send request 66 | r = requests.post(endpoint, json=purchase_anomaly_payload) 67 | 68 | # print(r.text) 69 | print(f'{time.time()} -- {r.status_code}') 70 | 71 | 72 | if __name__ == "__main__": 73 | # Parse Arguments 74 | parser = argparse.ArgumentParser() 75 | parser.add_argument("--endpoint", help="Target Endpoint") 76 | 77 | args = parser.parse_args() 78 | 79 | endpoint = args.endpoint + '/json' 80 | 81 | while True: 82 | main(endpoint) 83 | time.sleep(2) 84 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/datalayer/view_item.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 22:59:59", 3 | "event": "view_item", 4 | "user_id": "UID00003", 5 | "client_id": "CID00003", 6 | "page":"/product-67890", 7 | "page_previous": "/category-tshirts", 8 | "ecommerce": { 9 | "items": [{ 10 | "item_name": "Donut Friday Scented T-Shirt", 11 | "item_id": "67890", 12 | "price": 33.75, 13 | "item_brand": "Google", 14 | "item_category": "Apparel", 15 | "item_category_2": "Mens", 16 | "item_category_3": "Shirts", 17 | "item_category_4": "Tshirts", 18 | "item_variant": "Black", 19 | "item_list_name": 
"Search Results", 20 | "item_list_id": "SR123", 21 | "index": 1, 22 | "quantity": 1 23 | }] 24 | } 25 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | terraform { 18 | required_providers { 19 | google = { 20 | source = "hashicorp/google" 21 | version = "4.32.0" 22 | } 23 | } 24 | } 25 | 26 | provider "google" { 27 | project = var.project_id 28 | region = var.gcp_region 29 | } 30 | 31 | data "google_project" "project" { 32 | } 33 | 34 | resource "google_compute_network" "vpc_network" { 35 | name = "terraform-network" 36 | } 37 | 38 | resource "google_compute_firewall" "vpc_network_firewall" { 39 | name = "firewall" 40 | 41 | network = google_compute_network.vpc_network.name 42 | 43 | source_service_accounts = ["${google_service_account.data_pipeline_access.email}"] 44 | 45 | allow { 46 | protocol = "tcp" 47 | ports = ["12345", "12346"] 48 | } 49 | } 50 | 51 | resource "google_service_account" "data_pipeline_access" { 52 | project = var.project_id 53 | account_id = "retailpipeline-hyp" 54 | display_name = "Retail app data pipeline access" 55 | } 56 | 57 | 58 | # Set permissions. 
59 | resource "google_project_iam_member" "dataflow_admin_role" { 60 | project = var.project_id 61 | role = "roles/dataflow.admin" 62 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 63 | } 64 | 65 | resource "google_project_iam_member" "dataflow_worker_role" { 66 | project = var.project_id 67 | role = "roles/dataflow.worker" 68 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 69 | } 70 | 71 | resource "google_project_iam_member" "dataflow_bigquery_role" { 72 | project = var.project_id 73 | role = "roles/bigquery.dataEditor" 74 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 75 | } 76 | 77 | resource "google_project_iam_member" "dataflow_pub_sub_subscriber" { 78 | project = var.project_id 79 | role = "roles/pubsub.subscriber" 80 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 81 | } 82 | 83 | resource "google_project_iam_member" "dataflow_pub_sub_viewer" { 84 | project = var.project_id 85 | role = "roles/pubsub.viewer" 86 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 87 | } 88 | 89 | resource "google_project_iam_member" "dataflow_storage_object_admin" { 90 | project = var.project_id 91 | role = "roles/storage.objectAdmin" 92 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 93 | } 94 | 95 | data "google_compute_default_service_account" "default" { 96 | } 97 | 98 | resource "google_project_iam_member" "gce_pub_sub_admin" { 99 | project = var.project_id 100 | role = "roles/pubsub.admin" 101 | member = "serviceAccount:${data.google_compute_default_service_account.default.email}" 102 | } 103 | 104 | 105 | # Enabling APIs 106 | resource "google_project_service" "compute" { 107 | service = "compute.googleapis.com" 108 | 109 | disable_on_destroy = false 110 | } 111 | 112 | resource "google_project_service" "run" { 113 | service = "run.googleapis.com" 114 | 115 | disable_on_destroy = false 116 | } 117 | 118 | resource "google_project_service" "dataflow" { 119 | service = "dataflow.googleapis.com" 120 | 121 | disable_on_destroy = false 122 | } 123 | 124 | resource "google_project_service" "pubsub" { 125 | service = "pubsub.googleapis.com" 126 | disable_on_destroy = false 127 | } 128 | 129 | resource "google_project_iam_member" "viewer" { 130 | project = var.project_id 131 | role = "roles/bigquery.metadataViewer" 132 | member = "serviceAccount:service-${data.google_project.project.number}@gcp-sa-pubsub.iam.gserviceaccount.com" 133 | } 134 | 135 | resource "google_project_iam_member" "editor" { 136 | project = var.project_id 137 | role = "roles/bigquery.dataEditor" 138 | member = "serviceAccount:service-${data.google_project.project.number}@gcp-sa-pubsub.iam.gserviceaccount.com" 139 | } 140 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/processing-service/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | ENV APP_HOME /app 8 | WORKDIR $APP_HOME 9 | COPY . ./ 10 | COPY ./requirements.txt ./ 11 | 12 | # Install production dependencies. 13 | RUN pip install --upgrade pip 14 | RUN pip install --no-cache-dir -r requirements.txt 15 | 16 | # Run the web service on container startup. 
Here we use the gunicorn 17 | # webserver, with one worker process and 8 threads. 18 | # For environments with multiple CPU cores, increase the number of workers 19 | # to be equal to the cores available. 20 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 21 | CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/processing-service/README.md: -------------------------------------------------------------------------------- 1 | The processing service defines a Cloud Run Service to process each incoming datapoint. 2 | 3 | 4 | main.py defines the public facing webserver to listen for requests. 5 | 6 | synth_data_stream.py creates a synthetic data stream of events randomly chosen from the datalayer. It also randomly includes anomalies in the data. 7 | 8 | 9 | Command to start data stream: 10 | python3 synth_data_stream.py --endpoint {Pub/Sub endpoint link}' -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/processing-service/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = '' 16 | location = 'europe-west1' 17 | bq_dataset = 'ecommerce_sink' 18 | bq_table = 'cloud_run' 19 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/processing-service/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import time 17 | import base64 18 | import json 19 | 20 | import config 21 | 22 | from flask import Flask, request 23 | 24 | from google.cloud import bigquery 25 | 26 | app = Flask(__name__) 27 | 28 | 29 | @app.route("/hw", methods=['GET', 'POST']) 30 | def hello_world(): 31 | world = request.args.get('world') 32 | return f"Hello {world}!" 
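# The "/" route below follows the standard Pub/Sub push-subscription pattern:
# Pub/Sub POSTs a JSON envelope whose "message" field wraps the event, with the
# original payload base64-encoded in "message.data". The handler decodes and parses
# that payload and is expected to stream it into the BigQuery table configured in
# config.py. The empty-string placeholders (client, errors) are the challenge blanks;
# the `errors == []` check hints at a streaming insert along the lines of
# bigquery.Client().insert_rows_json(table_id, rows_to_insert).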
33 | 34 | 35 | @app.route("/", methods=["POST"]) 36 | def index(): 37 | envelope = request.get_json() 38 | print(envelope) 39 | print(type(envelope)) 40 | 41 | if not envelope: 42 | msg = "no Pub/Sub message received" 43 | print(f"error: {msg}") 44 | return f"Bad Request: {msg}", 400 45 | 46 | ps_message = envelope['message'] 47 | print(ps_message) 48 | print(type(ps_message)) 49 | 50 | record = base64.b64decode(ps_message["data"]).decode("utf-8").strip() 51 | record = json.loads(record) 52 | 53 | print(record) 54 | print(type(record)) 55 | 56 | rows_to_insert = [record] 57 | 58 | client = '' 59 | 60 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table 61 | 62 | errors = '' # Make an API request. 63 | 64 | if errors == []: 65 | print(f"{time.time()} New rows have been added.") 66 | return ("", 204) 67 | else: 68 | print("Encountered errors while inserting rows: {}".format(errors)) 69 | return f"Bad Request: {envelope}", 400 70 | 71 | 72 | if __name__ == "__main__": 73 | app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) 74 | 75 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/processing-service/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/terraform.tfvars: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | project_id = "" 18 | delete_contents_on_destroy = true 19 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/variables.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | variable "project_id" { 18 | description = "Project where the dataset and table are created." 19 | } 20 | 21 | variable "delete_contents_on_destroy" { 22 | description = "(Optional) If set to true, delete all the tables in the dataset when destroying the resource; otherwise, destroying the resource will fail if tables are present." 
23 | type = bool 24 | default = null 25 | } 26 | 27 | variable "force_destroy" { 28 | description = "When deleting a bucket, this boolean option will delete all contained objects. If false, Terraform will fail to delete buckets which contain objects." 29 | type = bool 30 | default = false 31 | } 32 | 33 | variable "gcp_region" { 34 | description = "GCP region to deploy resources in." 35 | type = string 36 | default = "europe-west1" 37 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/README.md: -------------------------------------------------------------------------------- 1 | # Developing *efficient* Data Pipelines on GCP 2 | 3 | Google Cloud Platform offers numerous possibilities and sample architectures for designing data pipelines. 4 | As always, there is no **ONE** perfect data architecture. It always depends! 5 | 6 | The right architecture depends on the data type, data volume and more. 7 | Business and tech requirements, such as the number of data producers and consumers or the intended data activation, are also essential. 8 | 9 | This repo provides practical guidance and sample architectures for the most common pipeline requirements I have come across with my customers. 10 | 11 | ## Three unique solutions for ingesting click-stream data into BigQuery 12 | 13 | Each example is a unique solution for **ingesting click-stream data from a web store into BigQuery**. 14 | 15 | Imagine you are a Data Engineer tasked with monitoring specific click-stream data from your company's web store. 16 | You already set up a Cloud Run Proxy Service that can be configured as a custom tag in Google Tag Manager. 17 | You also defined a Pub/Sub topic as the central event hub. 18 | Triggered events flow from Google Tag Manager through your Cloud Run Proxy to your Pub/Sub topic. 19 | 20 | Once events have arrived in your central event hub, you need to decide how to process them and move them to BigQuery. 21 | 22 | 23 | ![Efficient GCP Data Pipelines Architecture Overview](../../rsc/efficient_pipelines.png) 24 | 25 | 26 | ### I. Lean ELT pipelines with raw data in BigQuery 27 | 28 | Think of a scenario in which we aim to build the cheapest, lowest-maintenance data pipeline. 29 | Our only requirement might be to transport the raw data into BigQuery. 30 | For example, to design a lakehouse structure. 31 | 32 | Introducing the direct Pub/Sub to BigQuery subscription: 33 | 34 | **Strengths:** 35 | - No data processing tool = major cost saving 36 | - No ETL maintenance 37 | - Raw data in the lakehouse allows view-based processing on a use-case basis 38 | - Ingestion scales down to 0 and up without limits 39 | 40 | **Weaknesses:** 41 | - No processing or aggregation of ingested data before it lands in BigQuery 42 | - Raw data volume in the lakehouse might grow quickly 43 | - Only limited sanity checks possible when ingesting data 44 | 45 | 46 | ### II. Elastic ELT pipeline with Cloud Run 47 | 48 | You might want to develop a pipeline that scales up and down easily, but still allows you to apply simple transformations. 49 | For example, you might want to run data sanity checks, apply default cleaning or run ML inference over your data.
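As an illustration of such a lightweight check, a per-event sanity filter based on the purchase payloads in `./datalayer` might look like the sketch below (the threshold and function name are hypothetical, not part of this repo):

```
MAX_PLAUSIBLE_VALUE = 10_000.0  # hypothetical cut-off; the anomaly sample uses 1000000.10

def is_plausible_purchase(event: dict) -> bool:
    """Flag purchase events whose total order value looks anomalous."""
    if event.get("event") != "purchase":
        return True  # only purchase events carry ecommerce.purchase.value
    value = event["ecommerce"]["purchase"]["value"]
    return 0 < value <= MAX_PLAUSIBLE_VALUE
```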
50 | 51 | Introducing Cloud Run as a data processing tool: 52 | 53 | **Strengths:** 54 | - Scales down to 0 and up with (almost) no limits 55 | - Easy integration of data transformations in any language and with any dependencies 56 | - Easy entry, no steep learning curve as with Kubernetes-like container orchestration 57 | 58 | **Weaknesses:** 59 | - No graphic interface to explore data transformation steps 60 | - Only one-at-a-time data point handling; aggregations over multiple data points are only possible once the data is in BigQuery 61 | 62 | 63 | ### III. High-Volume ETL pipelines with complex aggregations using Dataflow 64 | 65 | 66 | **Strengths:** 67 | - Apache Beam allows for on-the-fly aggregations and windowing 68 | - Dataflow offers a user interface, great for troubleshooting 69 | 70 | **Weaknesses:** 71 | - Dataflow never scales down to 0 72 | - Despite the serverless nature of Dataflow, managing machines is extra overhead compared to e.g. Cloud Run 73 | 74 | 75 | This repo provides an end-to-end example for streaming data from a web store to BigQuery. It contains the following components, which can be deployed all at once using Terraform or serve as individual examples. 76 | 77 | - Cloud Run service that can be set up as a custom tag in Google Tag Manager 78 | - Pub/Sub topic to consume the data 79 | - Pub/Sub subscription to pull the data from the topic 80 | - Dataflow streaming job using a Pub/Sub to BigQuery template 81 | - BigQuery events table to store the data 82 | - BigQuery SQL query to analyze the events 83 | 84 | The data structure is based on the [Data Layer E-Commerce](https://developers.google.com/tag-manager/ecommerce-ga4) format recommended for Google Tag Manager. 85 | 86 | ## Git clone repo 87 | 88 | ``` 89 | git clone https://github.com/NucleusEngineering/hack-your-pipe.git 90 | cd hack-your-pipe/01_ingest_and_transform/12_solution 91 | ``` 92 | 93 | ## Set-up Cloud Environment 94 | 95 | ### Initialize your account and project 96 | 97 | If you are using the Google Cloud Shell, you can skip this step. 98 | 99 | ``` 100 | gcloud init 101 | ``` 102 | 103 | ### Set Google Cloud Project 104 | 105 | Enter your GCP Project ID in `./config_env.sh` & set all necessary environment variables. 106 | ``` 107 | source config_env.sh 108 | ``` 109 | 110 | Set the default GCP project.
111 | ``` 112 | gcloud config set project $GCP_PROJECT 113 | ``` 114 | 115 | ### Enable Google Cloud APIs 116 | 117 | ``` 118 | gcloud services enable compute.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com dataflow.googleapis.com 119 | ``` 120 | 121 | ### Set compute region 122 | 123 | ``` 124 | gcloud config set compute/zone $GCP_REGION 125 | ``` 126 | 127 | 140 | 141 | # Build the Cloud Run Containers 142 | 143 | Update the default project ID in the following files to match your project ID: [beam/config.py](https://github.com/NucleusEngineering/hack-your-pipe/blob/main/01_ingest_and_transform/12_solution/beam/config.py), [processing_service/config.py](https://github.com/NucleusEngineering/hack-your-pipe/blob/main/01_ingest_and_transform/12_solution/processing-service/config.py) 144 | 145 | Check that the file has been saved with the updated project ID value 146 | 147 | ``` 148 | cat beam/config.py 149 | cat processing_service/config.py 150 | ``` 151 | 152 | ## Pub/Sub proxy service container 153 | 154 | ``` 155 | gcloud builds submit $RUN_PROXY_DIR --tag gcr.io/$GCP_PROJECT/pubsub-proxy 156 | ``` 157 | 158 | ## Data Processing service container 159 | 160 | ``` 161 | gcloud builds submit $RUN_PROCESSING_DIR --tag gcr.io/$GCP_PROJECT/data-processing-service 162 | ``` 163 | 164 | ## Dataflow Template container 165 | 166 | ``` 167 | gcloud builds submit $DATAFLOW_TEMPLATE --tag gcr.io/$GCP_PROJECT/beam-processing-flex-template 168 | ``` 169 | 170 | ``` 171 | gsutil mb -c standard -l $GCP_REGION gs://$GCP_PROJECT-ecommerce-events 172 | ``` 173 | 174 | ``` 175 | gcloud dataflow flex-template build gs://$GCP_PROJECT-ecommerce-events/df_templates/dataflow_template.json --image=gcr.io/$GCP_PROJECT/beam-processing-flex-template --sdk-language=PYTHON 176 | ``` 177 | 178 | ### List containers 179 | 180 | Check that the containers were successfully created. 181 | 182 | ``` 183 | gcloud container images list 184 | ``` 185 | 186 | You should see the following output: 187 | 188 | ``` 189 | NAME: gcr.io//beam-processing-flex-template 190 | NAME: gcr.io//data-processing-service 191 | NAME: gcr.io//pubsub-proxy 192 | Only listing images in gcr.io/. Use --repository to list images in other repositories. 193 | ``` 194 | 195 | 196 | ## Deploy using Terraform 197 | 198 | Use Terraform to deploy the following services defined in the `main.tf` file 199 | 200 | - Cloud Run 1: Pub/Sub Proxy 201 | - Cloud Run 2: Data Processing Service 202 | - Pub/Sub Topic 203 | - Pub/Sub Push Subscription 204 | - Pub/Sub Pull Subscription 205 | - Pub/Sub BigQuery Subscription 206 | - Google Cloud Storage 207 | - Dataflow Job 208 | - BigQuery Table per pipeline 209 | 210 | ### Install Terraform 211 | 212 | If you are using the Google Cloud Shell Terraform is already installed. 213 | 214 | Follow the instructions to [install the Terraform cli](https://learn.hashicorp.com/tutorials/terraform/install-cli?in=terraform/gcp-get-started). 215 | 216 | This repo has been tested on Terraform version `1.3.6` and the Google provider version `4.32.0` 217 | 218 | ### Update Project ID in terraform.tfvars 219 | 220 | Rename the `terraform.tfvars.example` file to `terraform.tfvars` and update the default project ID in the file to match your project ID. 
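For reference, a filled-in `terraform.tfvars` would look roughly like this (the project ID below is a placeholder; the variables themselves are declared in `variables.tf`):

```
project_id                 = "your-gcp-project-id"
delete_contents_on_destroy = true
```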
221 | 222 | Check that the file has been saved with the updated project ID value 223 | 224 | ``` 225 | cat terraform.tfvars 226 | ``` 227 | 228 | ### Initialize Terraform 229 | 230 | ``` 231 | terraform init 232 | ``` 233 | 234 | ### Create resources in Google Cloud 235 | 236 | Run the plan cmd to see what resources will be created in your project. 237 | 238 | **Important: Make sure you have updated the Project ID in terraform.tfvars before running this** 239 | 240 | ``` 241 | terraform plan 242 | ``` 243 | 244 | Run the apply cmd and point to your `.tfvars` file to deploy all the resources in your project. 245 | 246 | ``` 247 | terraform apply -var-file terraform.tfvars 248 | ``` 249 | 250 | This will show you a plan of everything that will be created and then the following notification where you should enter `yes` to proceed: 251 | 252 | ``` 253 | Plan: 20 to add, 0 to change, 0 to destroy. 254 | 255 | Do you want to perform these actions? 256 | Terraform will perform the actions described above. 257 | Only 'yes' will be accepted to approve. 258 | 259 | Enter a value: 260 | ``` 261 | 262 | ### Terraform output 263 | 264 | Once everything has successfully run you should see the following output: 265 | 266 | ``` 267 | google_compute_network.vpc_network: Creating... 268 | . 269 | . 270 | . 271 | Apply complete! Resources: 20 added, 0 changed, 0 destroyed. 272 | 273 | Outputs: 274 | 275 | cloud_run_proxy_url = https://pubsub-proxy-my-service--uc.a.run.app 276 | ``` 277 | 278 | ## Simulate sending e-commerce events to Cloud Run Pub/Sub proxy using curl 279 | 280 | Use the `cloud_run_proxy_url` value from the Terraform output to simulate sending e-commerce events to the Cloud Run Pub/Sub proxy. 281 | 282 | #### Set Cloud Run Proxy URL 283 | 284 | Enter your the proxy service URL as `ENDPOINT_URL` in `./config_env.sh` & reset the environment variables. 285 | ``` 286 | source config_env.sh 287 | ``` 288 | 289 | #### Create artificial event stream 290 | 291 | Run the script `./datalayer/synth_data_stream.py` to direct a synthetic stream of events to the created endpoint. 292 | 293 | ``` 294 | python3 ./datalayer/synth_data_stream.py --endpoint=$ENDPOINT_URL 295 | ``` 296 | 297 | The program will generate and send a random event based on the samples in `./datalayer` every two seconds. 298 | 299 | After a minute or two you should find the BigQuery event tables populated. 300 | 301 | ### Terraform Destroy 302 | 303 | Use Terraform to destroy all resources 304 | 305 | ``` 306 | terraform destroy 307 | ``` 308 | 309 | You might have to delete the BigQuery tables and rerun the command to destroy the resources. -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/beam/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/dataflow-templates-base/python39-template-launcher-base:latest as template_launcher 2 | 3 | ARG WORKDIR=/dataflow/template 4 | RUN mkdir -p ${WORKDIR} 5 | WORKDIR ${WORKDIR} 6 | 7 | # Copy local code to the container image. 8 | COPY . 
./ 9 | COPY ./requirements.txt ./ 10 | COPY ./beam_processing.py ./ 11 | 12 | # Flex Template ENV Vars 13 | ENV FLEX_TEMPLATE_PYTHON_PY_FILE="${WORKDIR}/beam_processing.py" 14 | ENV FLEX_TEMPLATE_PYTHON_REQUIREMENTS_FILE="${WORKDIR}/requirements.txt" 15 | 16 | # Install requirements 17 | RUN pip install --upgrade pip 18 | RUN pip install --no-cache-dir -r requirements.txt 19 | 20 | ENTRYPOINT ["/opt/google/dataflow/python_template_launcher"] -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/beam/beam_processing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import json 16 | import time 17 | 18 | import config 19 | 20 | import apache_beam as beam 21 | from apache_beam.options.pipeline_options import PipelineOptions 22 | from apache_beam.transforms import trigger 23 | from apache_beam.io.gcp.pubsub import ReadFromPubSub 24 | from apache_beam.io.gcp.bigquery import BigQueryDisposition, WriteToBigQuery 25 | from apache_beam.runners import DataflowRunner 26 | 27 | # Defining event filter functions. 28 | 29 | 30 | def is_item_view(event): 31 | return event['event'] == 'view_item' 32 | 33 | 34 | def is_add_to_cart(event): 35 | return event['event'] == 'add_to_cart' 36 | 37 | 38 | def is_purchase(event): 39 | return event['event'] == 'purchase' 40 | 41 | 42 | class ExtractValueFn(beam.DoFn): 43 | def process(self, element): 44 | print(f"ExtractValueFn: {element['ecommerce']['purchase']['value']}") 45 | return [element['ecommerce']['purchase']['value']] 46 | 47 | 48 | class ExtractAndSumValue(beam.PTransform): 49 | """A transform to extract key/score information and sum the scores. 50 | The constructor argument `field` determines whether 'team' or 'user' info is 51 | extracted. 52 | """ 53 | 54 | def expand(self, pcoll): 55 | sum_val = ( 56 | pcoll 57 | | beam.Map(lambda elem: (elem['user_id'], elem['ecommerce']['purchase']['value'])) 58 | | beam.CombinePerKey(sum)) 59 | return(sum_val) 60 | 61 | 62 | class FormatByRow(beam.PTransform): 63 | """A transform to reformat the data to column name/value format. 64 | """ 65 | 66 | def expand(self, pcoll): 67 | row_val = ( 68 | pcoll 69 | | beam.Map(lambda elem: {'user_id': elem[0], 70 | 'summed_value': elem[1] 71 | }) 72 | ) 73 | return(row_val) 74 | 75 | 76 | def streaming_pipeline(project, region): 77 | 78 | subscription = "projects/{}/subscriptions/hyp_subscription_dataflow".format( 79 | project) 80 | 81 | bucket = "gs://{}-ecommerce-events/tmp_dir".format(project) 82 | 83 | # Defining pipeline options. 
84 | options = PipelineOptions( 85 | streaming=True, 86 | project=project, 87 | region=region, 88 | staging_location="%s/staging" % bucket, 89 | temp_location="%s/temp" % bucket, 90 | subnetwork='regions/europe-west1/subnetworks/terraform-network', 91 | service_account_email='retailpipeline-hyp@{}.iam.gserviceaccount.com'.format( 92 | project), 93 | max_num_workers=1 94 | ) 95 | 96 | # Defining pipeline. 97 | p = beam.Pipeline(DataflowRunner(), options=options) 98 | 99 | # Receiving message from Pub/Sub & parsing json from string. 100 | json_message = (p 101 | # Listining to Pub/Sub. 102 | | "Read Topic" >> ReadFromPubSub(subscription=subscription) 103 | # Parsing json from message string. 104 | | "Parse json" >> beam.Map(json.loads) 105 | ) 106 | 107 | # Extracting Item Views. 108 | item_views = (json_message 109 | | 'Filter for item views' >> beam.Filter(is_item_view) 110 | | "item view row" >> beam.Map(lambda input: {'event_datetime': input['event_datetime'], # Dropping and renaming columns. 111 | 'event': input['event'], 112 | 'user_id': input['user_id'], 113 | 'client_id': input['client_id'], 114 | 'page': input['page'], 115 | 'page_previous': input['page_previous'], 116 | "item_name": input['ecommerce']['items'][0]["item_name"], 117 | "item_id": input['ecommerce']['items'][0]["item_id"], 118 | "price": input['ecommerce']['items'][0]["price"], 119 | "item_brand": input['ecommerce']['items'][0]["item_brand"], 120 | "item_category": input['ecommerce']['items'][0]["item_category"], 121 | "item_category_2": input['ecommerce']['items'][0]["item_category_2"], 122 | "item_category_3": input['ecommerce']['items'][0]["item_category_3"], 123 | "item_category_4": input['ecommerce']['items'][0]["item_category_4"], 124 | "item_variant": input['ecommerce']['items'][0]["item_variant"], 125 | "item_list_name": input['ecommerce']['items'][0]["item_list_name"], 126 | "item_list_id": input['ecommerce']['items'][0]["item_list_id"], 127 | "quantity": input['ecommerce']['items'][0]["quantity"] 128 | }) 129 | ) 130 | 131 | fixed_windowed_items = (json_message 132 | | 'Filter for purchase' >> beam.Filter(is_purchase) 133 | | 'Global Window' >> beam.WindowInto(beam.window.GlobalWindows(), 134 | trigger=trigger.Repeatedly( 135 | trigger.AfterCount(10)), 136 | accumulation_mode=trigger.AccumulationMode.ACCUMULATING) 137 | | 'ExtractAndSumValue' >> ExtractAndSumValue() 138 | | 'FormatByRow' >> FormatByRow() 139 | ) 140 | 141 | # Writing summed values to BigQuery 142 | aggregated_schema = "user_id:STRING, summed_value:FLOAT" 143 | aggregated_table = "{}:ecommerce_sink.beam_aggregated".format(project) 144 | 145 | fixed_windowed_items | "Write Summed Values To BigQuery" >> WriteToBigQuery(table=aggregated_table, schema=aggregated_schema, 146 | create_disposition=BigQueryDisposition.CREATE_IF_NEEDED, 147 | write_disposition=BigQueryDisposition.WRITE_APPEND) 148 | 149 | # Writing the PCollections to two differnt BigQuery tables. 
150 | item_views_table = "{}:ecommerce_sink.beam_item_views".format(project) 151 | schema = "event_datetime:DATETIME, event:STRING, user_id:STRING, client_id:STRING, page:STRING, page_previous:STRING, " \ 152 | "item_name:STRING, item_id:STRING, price:STRING, item_brand:STRING, item_category:STRING, item_category_2:STRING, item_category_3:STRING, " \ 153 | "item_category_4:STRING, item_variant:STRING, item_list_name:STRING, item_list_id:STRING, quantity:STRING" 154 | 155 | item_views | "Write Items Views To BigQuery" >> WriteToBigQuery(table=item_views_table, schema=schema, 156 | create_disposition=BigQueryDisposition.CREATE_IF_NEEDED, 157 | write_disposition=BigQueryDisposition.WRITE_APPEND) 158 | 159 | return p.run() 160 | 161 | 162 | if __name__ == '__main__': 163 | GCP_PROJECT = config.project_id 164 | GCP_REGION = config.location 165 | 166 | streaming_pipeline(project=GCP_PROJECT, region=GCP_REGION) 167 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/beam/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = 'poerschmann-hyp-test3' 16 | location = 'europe-west1' 17 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/beam/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam 2 | google-apitools -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/cloud-run-pubsub-proxy/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/cloud-run-pubsub-proxy/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official lightweight Node.js 12 image. 2 | # https://hub.docker.com/_/node 3 | FROM node:12-slim 4 | 5 | # Create and change to the app directory. 6 | WORKDIR /usr/src/app 7 | 8 | # Copy application dependency manifests to the container image. 9 | # A wildcard is used to ensure both package.json AND package-lock.json are copied. 10 | # Copying this separately prevents re-running npm install on every code change. 11 | COPY package*.json ./ 12 | 13 | # Install production dependencies. 14 | RUN npm install --only=production 15 | 16 | # Copy local code to the container image. 17 | COPY . ./ 18 | 19 | # Run the web service on container startup. 
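# "npm start" (see package.json) runs index.js, which serves the Express app
# from app.js on the port Cloud Run injects via $PORT (8080 by default).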
20 | CMD [ "npm", "start" ] -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/cloud-run-pubsub-proxy/README.md: -------------------------------------------------------------------------------- 1 | Cloud Run Proxy is a express webserver that listenes to incoming requests and publishes them to a chosen Pub/Sub topic. -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/cloud-run-pubsub-proxy/app.js: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | const express = require('express'); 16 | const bodyParser = require('body-parser'); 17 | const app = express(); 18 | 19 | app.use(bodyParser.json()); 20 | 21 | app.get('/', (req, res) => { 22 | console.log('Hello world received a request.'); 23 | 24 | const target = process.env.TARGET || 'World'; 25 | res.send(`Hello ${target}!`); 26 | }); 27 | 28 | app.post('/json', (req, res) => { 29 | const dataLayer = JSON.stringify(req.body) 30 | console.log(`proxy POST request received dataLayer: ${dataLayer}`) 31 | 32 | const {PubSub} = require('@google-cloud/pubsub'); 33 | 34 | // Instantiates a client 35 | const pubsub = new PubSub(); 36 | 37 | const {Buffer} = require('safe-buffer'); 38 | 39 | // Set Pub/Sub topic name 40 | let topicName = 'hyp-pubsub-topic'; 41 | 42 | // References an existing topic 43 | const topic = pubsub.topic(topicName); 44 | 45 | // Publishes the message as a string, 46 | const dataBuffer = Buffer.from(dataLayer); 47 | 48 | // Add two custom attributes, origin and username, to the message 49 | const customAttributes = { 50 | origin: 'gtm-cloud-run', 51 | username: 'gcp-demo', 52 | }; 53 | 54 | // Publishes a message to Pub/Sub 55 | return topic 56 | .publishMessage({data: dataBuffer}) 57 | .then(() => res.status(200).send(`{"message": "pubsub message sent: ${dataBuffer}"}`)) 58 | .catch(err => { 59 | console.error(err); 60 | res.status(500).send(err); 61 | return Promise.reject(err); 62 | }); 63 | }) 64 | 65 | 66 | module.exports = app; 67 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/cloud-run-pubsub-proxy/index.js: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | const app = require('./app.js'); 16 | const PORT = process.env.PORT || 8080; 17 | 18 | app.listen(PORT, () => console.log(`pubsub proxy app listening on port ${PORT}`)); 19 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/cloud-run-pubsub-proxy/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pubsub-proxy", 3 | "version": "1.0.0", 4 | "description": "Cloud Run app to send messages to Pub/Sub topic using Node", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "node index.js" 8 | }, 9 | "author": "", 10 | "license": "Apache-2.0", 11 | "dependencies": { 12 | "express": "^4.17.1", 13 | "body-parser": "^1.19.0", 14 | "@google-cloud/pubsub": "^3.3.0", 15 | "safe-buffer": "5.1.2" 16 | } 17 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/config_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export GCP_PROJECT="" 4 | export ENDPOINT_URL="" # doesn't need to be defined in the very beginning 5 | export PUSH_ENDPOINT='' # doesn't need to be defined in the very beginning 6 | export GCP_REGION=europe-west1 7 | export RUN_PROXY_DIR=cloud-run-pubsub-proxy 8 | export RUN_PROCESSING_DIR=processing-service 9 | export DATAFLOW_TEMPLATE=beam 10 | export RUN_INFERENCE_PROCESSING_SERVICE=inf_processing_service 11 | 12 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/datalayer/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/datalayer/README.md: -------------------------------------------------------------------------------- 1 | Datalayer defines the json events that could occur to be fed into the pipeline. 
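Each of these payloads enters the pipeline as the JSON body of a POST request to the Cloud Run proxy's /json route, which is exactly what synth_data_stream.py automates. A minimal sketch, assuming the placeholder URL is replaced with your deployed proxy endpoint:

    import json
    import requests

    # Load one of the sample events from this folder and post it to the proxy.
    with open("view_item.json") as f:
        payload = json.load(f)

    requests.post("https://<your-proxy-url>/json", json=payload)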
2 | 3 | Four types of events are included: 4 | * add to cart 5 | * made purchase 6 | * made purcase with anomaly (artifical mistake in data to be identified later) 7 | * view item -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/datalayer/add_to_cart.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "add_to_cart", 4 | "user_id": "UID00003", 5 | "client_id": "CID00003", 6 | "page":"/product-67890", 7 | "page_previous": "/category-tshirts", 8 | "ecommerce": { 9 | "items": [{ 10 | "item_name": "Donut Friday Scented T-Shirt", 11 | "item_id": "67890", 12 | "price": 33.75, 13 | "item_brand": "Google", 14 | "item_category": "Apparel", 15 | "item_category_2": "Mens", 16 | "item_category_3": "Shirts", 17 | "item_category_4": "Tshirts", 18 | "item_variant": "Black", 19 | "item_list_name": "Search Results", 20 | "item_list_id": "SR123", 21 | "index": 1, 22 | "quantity": 2 23 | }] 24 | } 25 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/datalayer/ecommerce_events_bq_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "event_datetime", 4 | "type": "TIMESTAMP", 5 | "mode": "NULLABLE" 6 | }, 7 | { 8 | "name": "event", 9 | "type": "STRING", 10 | "mode": "REQUIRED" 11 | }, 12 | { 13 | "name": "user_id", 14 | "type": "STRING", 15 | "mode": "REQUIRED" 16 | }, 17 | { 18 | "name": "client_id", 19 | "type": "STRING", 20 | "mode": "NULLABLE" 21 | }, 22 | { 23 | "name": "page", 24 | "type": "STRING", 25 | "mode": "NULLABLE" 26 | }, 27 | { 28 | "name": "page_previous", 29 | "type": "STRING", 30 | "mode": "NULLABLE" 31 | }, 32 | { 33 | "name": "weekday", 34 | "type": "STRING", 35 | "mode": "NULLABLE" 36 | }, 37 | { 38 | "name": "ecommerce", 39 | "type": "RECORD", 40 | "mode": "NULLABLE", 41 | "fields": [ 42 | { 43 | "mode": "REPEATED", 44 | "name": "items", 45 | "type": "RECORD", 46 | "fields": [ 47 | { 48 | "mode": "NULLABLE", 49 | "name": "index", 50 | "type": "INTEGER" 51 | }, 52 | 53 | { 54 | "mode": "NULLABLE", 55 | "name": "item_id", 56 | "type": "INTEGER" 57 | }, 58 | { 59 | "mode": "NULLABLE", 60 | "name": "item_name", 61 | "type": "STRING" 62 | }, 63 | { 64 | "mode": "NULLABLE", 65 | "name": "item_list_name", 66 | "type": "STRING" 67 | }, 68 | { 69 | "mode": "NULLABLE", 70 | "name": "item_list_id", 71 | "type": "STRING" 72 | }, 73 | { 74 | "mode": "NULLABLE", 75 | "name": "price", 76 | "type": "FLOAT" 77 | }, 78 | { 79 | "mode": "NULLABLE", 80 | "name": "item_variant", 81 | "type": "STRING" 82 | }, 83 | { 84 | "mode": "NULLABLE", 85 | "name": "quantity", 86 | "type": "INTEGER" 87 | }, 88 | { 89 | "mode": "NULLABLE", 90 | "name": "item_brand", 91 | "type": "STRING" 92 | }, 93 | { 94 | "mode": "NULLABLE", 95 | "name": "item_category", 96 | "type": "STRING" 97 | }, 98 | { 99 | "mode": "NULLABLE", 100 | "name": "item_category_2", 101 | "type": "STRING" 102 | }, 103 | { 104 | "mode": "NULLABLE", 105 | "name": "item_category_3", 106 | "type": "STRING" 107 | }, 108 | { 109 | "mode": "NULLABLE", 110 | "name": "item_category_4", 111 | "type": "STRING" 112 | } 113 | ] 114 | }, 115 | { 116 | "mode": "NULLABLE", 117 | "name": "purchase", 118 | "type": "RECORD", 119 | "fields": [ 120 | { 121 | "fields": [ 122 | { 123 | "mode": "NULLABLE", 124 | "name": "item_coupon", 125 | "type": "STRING" 126 | }, 127 | { 128 | 
"mode": "NULLABLE", 129 | "name": "quantity", 130 | "type": "INTEGER" 131 | }, 132 | { 133 | "mode": "NULLABLE", 134 | "name": "item_variant", 135 | "type": "STRING" 136 | }, 137 | { 138 | "mode": "NULLABLE", 139 | "name": "item_category", 140 | "type": "STRING" 141 | }, 142 | { 143 | "mode": "NULLABLE", 144 | "name": "item_name", 145 | "type": "STRING" 146 | }, 147 | { 148 | "mode": "NULLABLE", 149 | "name": "item_id", 150 | "type": "INTEGER" 151 | }, 152 | { 153 | "mode": "NULLABLE", 154 | "name": "item_brand", 155 | "type": "STRING" 156 | }, 157 | { 158 | "mode": "NULLABLE", 159 | "name": "item_price", 160 | "type": "FLOAT" 161 | } 162 | ], 163 | "mode": "REPEATED", 164 | "name": "items", 165 | "type": "RECORD" 166 | }, 167 | { 168 | "mode": "NULLABLE", 169 | "name": "coupon", 170 | "type": "STRING" 171 | }, 172 | { 173 | "mode": "NULLABLE", 174 | "name": "tax", 175 | "type": "FLOAT" 176 | }, 177 | { 178 | "mode": "NULLABLE", 179 | "name": "shipping", 180 | "type": "FLOAT" 181 | }, 182 | { 183 | "mode": "NULLABLE", 184 | "name": "value", 185 | "type": "FLOAT" 186 | }, 187 | { 188 | "mode": "NULLABLE", 189 | "name": "affiliation", 190 | "type": "STRING" 191 | }, 192 | { 193 | "mode": "NULLABLE", 194 | "name": "currency", 195 | "type": "STRING" 196 | }, 197 | { 198 | "mode": "NULLABLE", 199 | "name": "transaction_id", 200 | "type": "STRING" 201 | } 202 | ] 203 | } 204 | ] 205 | } 206 | ] 207 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/datalayer/purchase.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "purchase", 4 | "user_id": "UID00001", 5 | "client_id": "CID00003", 6 | "page":"/checkout", 7 | "page_previous": "/order-confirmation", 8 | "ecommerce": { 9 | "purchase": { 10 | "transaction_id": "T12345", 11 | "affiliation": "Online Store", 12 | "value": 35.43, 13 | "tax": 4.90, 14 | "shipping": 5.99, 15 | "currency": "EUR", 16 | "coupon": "SUMMER_SALE", 17 | "items": [{ 18 | "item_name": "Triblend Android T-Shirt", 19 | "item_id": "12345", 20 | "item_price": 15.25, 21 | "item_brand": "Google", 22 | "item_category": "Apparel", 23 | "item_variant": "Gray", 24 | "quantity": 1, 25 | "item_coupon": "" 26 | }, { 27 | "item_name": "Donut Friday Scented T-Shirt", 28 | "item_id": "67890", 29 | "item_price": 33.75, 30 | "item_brand": "Google", 31 | "item_category": "Apparel", 32 | "item_variant": "Black", 33 | "quantity": 1 34 | }] 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/datalayer/purchase_anomaly.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "purchase", 4 | "user_id": "UID00001", 5 | "client_id": "CID00003", 6 | "page":"/checkout", 7 | "page_previous": "/order-confirmation", 8 | "ecommerce": { 9 | "purchase": { 10 | "transaction_id": "T12345", 11 | "affiliation": "Online Store", 12 | "value": 1000000.10, 13 | "tax": 4.90, 14 | "shipping": 5.99, 15 | "currency": "EUR", 16 | "coupon": "SUMMER_SALE", 17 | "items": [{ 18 | "item_name": "Triblend Android T-Shirt", 19 | "item_id": "12345", 20 | "item_price": 15.25, 21 | "item_brand": "Google", 22 | "item_category": "Apparel", 23 | "item_variant": "Gray", 24 | "quantity": 1, 25 | "item_coupon": "" 26 | }, { 27 | "item_name": "Donut Friday Scented T-Shirt", 28 | "item_id": "67890", 
29 | "item_price": 33.75, 30 | "item_brand": "Google", 31 | "item_category": "Apparel", 32 | "item_variant": "Black", 33 | "quantity": 1 34 | }] 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/datalayer/synth_data_stream.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import random 16 | import requests 17 | import json 18 | import time 19 | import argparse 20 | 21 | 22 | def main(endpoint): 23 | draw = round(random.uniform(0, 1), 2) 24 | 25 | uid = f'UID0000{int(round(random.uniform(0, 5), 0))}' 26 | 27 | if 0 <= draw < 1 / 3: 28 | # get view payload 29 | view_item_f = open('./datalayer/view_item.json') 30 | view_item_payload = json.load(view_item_f) 31 | 32 | view_item_payload['user_id'] = uid 33 | 34 | # send view 35 | r = requests.post(endpoint, json=view_item_payload) 36 | 37 | elif 1 / 3 <= draw < 2 / 3: 38 | # get add to cart payload 39 | add_to_cart_f = open('./datalayer/add_to_cart.json') 40 | add_to_cart_payload = json.load(add_to_cart_f) 41 | 42 | add_to_cart_payload['user_id'] = uid 43 | 44 | # send add to cart 45 | r = requests.post(endpoint, json=add_to_cart_payload) 46 | 47 | else: 48 | # decide between anomaly or no anomaly 49 | if draw < 0.95: 50 | # get payload 51 | purchase_f = open('./datalayer/purchase.json') 52 | purchase_payload = json.load(purchase_f) 53 | 54 | purchase_payload['user_id'] = uid 55 | 56 | # send request 57 | r = requests.post(endpoint, json=purchase_payload) 58 | else: 59 | # get payload 60 | purchase_anomaly_f = open('./datalayer/purchase_anomaly.json') 61 | purchase_anomaly_payload = json.load(purchase_anomaly_f) 62 | 63 | purchase_anomaly_payload['user_id'] = uid 64 | 65 | # send request 66 | r = requests.post(endpoint, json=purchase_anomaly_payload) 67 | 68 | # print(r.text) 69 | print(f'{time.time()} -- {r.status_code}') 70 | 71 | 72 | if __name__ == "__main__": 73 | # Parse Arguments 74 | parser = argparse.ArgumentParser() 75 | parser.add_argument("--endpoint", help="Target Endpoint") 76 | 77 | args = parser.parse_args() 78 | 79 | endpoint = args.endpoint + '/json' 80 | 81 | while True: 82 | main(endpoint) 83 | time.sleep(2) 84 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/datalayer/view_item.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 22:59:59", 3 | "event": "view_item", 4 | "user_id": "UID00003", 5 | "client_id": "CID00003", 6 | "page":"/product-67890", 7 | "page_previous": "/category-tshirts", 8 | "ecommerce": { 9 | "items": [{ 10 | "item_name": "Donut Friday Scented T-Shirt", 11 | "item_id": "67890", 12 | "price": 33.75, 13 | "item_brand": "Google", 14 | "item_category": "Apparel", 15 | "item_category_2": "Mens", 16 | "item_category_3": "Shirts", 17 
| "item_category_4": "Tshirts", 18 | "item_variant": "Black", 19 | "item_list_name": "Search Results", 20 | "item_list_id": "SR123", 21 | "index": 1, 22 | "quantity": 1 23 | }] 24 | } 25 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | terraform { 18 | required_providers { 19 | google = { 20 | source = "hashicorp/google" 21 | version = "4.32.0" 22 | } 23 | } 24 | } 25 | 26 | provider "google" { 27 | project = var.project_id 28 | region = var.gcp_region 29 | } 30 | 31 | data "google_project" "project" { 32 | } 33 | 34 | resource "google_compute_network" "vpc_network" { 35 | name = "terraform-network" 36 | } 37 | 38 | resource "google_compute_firewall" "vpc_network_firewall" { 39 | name = "firewall" 40 | 41 | network = google_compute_network.vpc_network.name 42 | 43 | source_service_accounts = ["${google_service_account.data_pipeline_access.email}"] 44 | 45 | allow { 46 | protocol = "tcp" 47 | ports = ["12345", "12346"] 48 | } 49 | } 50 | 51 | resource "google_service_account" "data_pipeline_access" { 52 | project = var.project_id 53 | account_id = "retailpipeline-hyp" 54 | display_name = "Retail app data pipeline access" 55 | } 56 | 57 | 58 | # Set permissions. 
59 | resource "google_project_iam_member" "dataflow_admin_role" { 60 | project = var.project_id 61 | role = "roles/dataflow.admin" 62 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 63 | } 64 | 65 | resource "google_project_iam_member" "dataflow_worker_role" { 66 | project = var.project_id 67 | role = "roles/dataflow.worker" 68 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 69 | } 70 | 71 | resource "google_project_iam_member" "dataflow_bigquery_role" { 72 | project = var.project_id 73 | role = "roles/bigquery.dataEditor" 74 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 75 | } 76 | 77 | resource "google_project_iam_member" "dataflow_pub_sub_subscriber" { 78 | project = var.project_id 79 | role = "roles/pubsub.subscriber" 80 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 81 | } 82 | 83 | resource "google_project_iam_member" "dataflow_pub_sub_viewer" { 84 | project = var.project_id 85 | role = "roles/pubsub.viewer" 86 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 87 | } 88 | 89 | resource "google_project_iam_member" "dataflow_storage_object_admin" { 90 | project = var.project_id 91 | role = "roles/storage.objectAdmin" 92 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 93 | } 94 | 95 | data "google_compute_default_service_account" "default" { 96 | } 97 | 98 | resource "google_project_iam_member" "gce_pub_sub_admin" { 99 | project = var.project_id 100 | role = "roles/pubsub.admin" 101 | member = "serviceAccount:${data.google_compute_default_service_account.default.email}" 102 | } 103 | 104 | 105 | # Enabling APIs 106 | resource "google_project_service" "compute" { 107 | service = "compute.googleapis.com" 108 | 109 | disable_on_destroy = false 110 | } 111 | 112 | resource "google_project_service" "run" { 113 | service = "run.googleapis.com" 114 | 115 | disable_on_destroy = false 116 | } 117 | 118 | resource "google_project_service" "dataflow" { 119 | service = "dataflow.googleapis.com" 120 | 121 | disable_on_destroy = false 122 | } 123 | 124 | resource "google_project_service" "pubsub" { 125 | service = "pubsub.googleapis.com" 126 | disable_on_destroy = false 127 | } 128 | 129 | 130 | # Define common resources used by all pipeline options. 
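# Shared by every pipeline option: the Cloud Run Pub/Sub proxy that receives
# incoming events, the "ecommerce_sink" BigQuery dataset and the
# "hyp-pubsub-topic" Pub/Sub topic.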
131 | # Cloud Run Proxy 132 | resource "google_cloud_run_service" "pubsub_proxy_hyp" { 133 | name = "hyp-run-service-pubsub-proxy" 134 | location = var.gcp_region 135 | 136 | template { 137 | spec { 138 | containers { 139 | image = "gcr.io/${var.project_id}/pubsub-proxy" 140 | } 141 | } 142 | } 143 | 144 | traffic { 145 | percent = 100 146 | latest_revision = true 147 | } 148 | 149 | depends_on = [google_project_service.run] 150 | } 151 | 152 | data "google_iam_policy" "noauth" { 153 | binding { 154 | role = "roles/run.invoker" 155 | members = [ 156 | "allUsers", 157 | ] 158 | } 159 | } 160 | 161 | resource "google_cloud_run_service_iam_policy" "noauth" { 162 | location = google_cloud_run_service.pubsub_proxy_hyp.location 163 | project = google_cloud_run_service.pubsub_proxy_hyp.project 164 | service = google_cloud_run_service.pubsub_proxy_hyp.name 165 | policy_data = data.google_iam_policy.noauth.policy_data 166 | } 167 | 168 | output "cloud_run_proxy_url" { 169 | value = google_cloud_run_service.pubsub_proxy_hyp.status[0].url 170 | } 171 | 172 | # BigQuery Dataset 173 | resource "google_bigquery_dataset" "bq_dataset" { 174 | dataset_id = "ecommerce_sink" 175 | friendly_name = "ecommerce sink" 176 | description = "Destination dataset for all pipeline options" 177 | location = var.gcp_region 178 | 179 | delete_contents_on_destroy = true 180 | 181 | labels = { 182 | env = "default" 183 | } 184 | } 185 | 186 | # Pub/Sub Topic 187 | resource "google_pubsub_topic" "ps_topic" { 188 | name = "hyp-pubsub-topic" 189 | 190 | labels = { 191 | created = "terraform" 192 | } 193 | 194 | depends_on = [google_project_service.pubsub] 195 | } 196 | 197 | 198 | # Pipeline 1: Cloud Run Proxy -> Pub/Sub -> Dataflow -> BigQuery 199 | resource "google_pubsub_subscription" "hyp_sub_dataflow" { 200 | name = "hyp_subscription_dataflow" 201 | topic = google_pubsub_topic.ps_topic.name 202 | 203 | labels = { 204 | created = "terraform" 205 | } 206 | 207 | retain_acked_messages = false 208 | 209 | ack_deadline_seconds = 20 210 | 211 | 212 | retry_policy { 213 | minimum_backoff = "10s" 214 | } 215 | 216 | enable_message_ordering = false 217 | } 218 | 219 | resource "google_dataflow_flex_template_job" "dataflow_stream" { 220 | provider = google-beta 221 | name = "ecommerce-events-ps-to-bq-stream" 222 | container_spec_gcs_path = "gs://${var.project_id}-ecommerce-events/df_templates/dataflow_template.json" 223 | region = var.gcp_region 224 | project = var.project_id 225 | depends_on = [google_project_service.compute, google_project_service.dataflow] 226 | parameters = { 227 | "on_delete" = "cancel" 228 | "service_account_email" = "${google_service_account.data_pipeline_access.email}" 229 | "network" = "${google_compute_network.vpc_network.name}" 230 | "max_workers" = 1 231 | "temp_location" = "gs://${var.project_id}-ecommerce-events/df_tmp_dir" 232 | "runner" = "DataflowRunner" 233 | } 234 | } 235 | 236 | 237 | # Pipeline 2: Cloud Run Proxy -> Pub/Sub -> BigQuery 238 | resource "google_bigquery_table" "bq_table_bqdirect" { 239 | dataset_id = google_bigquery_dataset.bq_dataset.dataset_id 240 | table_id = "pubsub_direct" 241 | deletion_protection = false 242 | 243 | labels = { 244 | env = "default" 245 | } 246 | 247 | schema = < Pub/Sub -> Cloud Run Processing -> BigQuery 297 | resource "google_cloud_run_service" "hyp_run_service_data_processing" { 298 | name = "hyp-run-service-data-processing" 299 | location = var.gcp_region 300 | 301 | template { 302 | spec { 303 | containers { 304 | image = 
"gcr.io/${var.project_id}/data-processing-service" 305 | } 306 | service_account_name = "${google_service_account.data_pipeline_access.email}" 307 | } 308 | } 309 | 310 | traffic { 311 | percent = 100 312 | latest_revision = true 313 | } 314 | 315 | depends_on = [google_project_service.run] 316 | } 317 | 318 | resource "google_cloud_run_service_iam_policy" "noauth_dp" { 319 | location = google_cloud_run_service.hyp_run_service_data_processing.location 320 | project = google_cloud_run_service.hyp_run_service_data_processing.project 321 | service = google_cloud_run_service.hyp_run_service_data_processing.name 322 | policy_data = data.google_iam_policy.noauth.policy_data 323 | } 324 | 325 | resource "google_pubsub_subscription" "hyp_sub_cloud_run" { 326 | name = "hyp_subscription_cloud_run" 327 | topic = google_pubsub_topic.ps_topic.name 328 | 329 | labels = { 330 | created = "terraform" 331 | } 332 | 333 | push_config { 334 | push_endpoint = google_cloud_run_service.hyp_run_service_data_processing.status[0].url 335 | 336 | attributes = { 337 | x-goog-version = "v1" 338 | } 339 | } 340 | 341 | retain_acked_messages = false 342 | 343 | ack_deadline_seconds = 20 344 | 345 | 346 | retry_policy { 347 | minimum_backoff = "10s" 348 | } 349 | 350 | enable_message_ordering = false 351 | } 352 | 353 | resource "google_bigquery_table" "bq_table_cloud_run" { 354 | dataset_id = google_bigquery_dataset.bq_dataset.dataset_id 355 | table_id = "cloud_run" 356 | deletion_protection = false 357 | 358 | time_partitioning { 359 | type = "DAY" 360 | field = "event_datetime" 361 | } 362 | 363 | labels = { 364 | env = "default" 365 | } 366 | 367 | schema = file("./datalayer/ecommerce_events_bq_schema.json") 368 | 369 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/processing-service/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | ENV APP_HOME /app 8 | WORKDIR $APP_HOME 9 | COPY . ./ 10 | COPY ./requirements.txt ./ 11 | 12 | # Install production dependencies. 13 | RUN pip install --upgrade pip 14 | RUN pip install --no-cache-dir -r requirements.txt 15 | 16 | # Run the web service on container startup. Here we use the gunicorn 17 | # webserver, with one worker process and 8 threads. 18 | # For environments with multiple CPU cores, increase the number of workers 19 | # to be equal to the cores available. 20 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 21 | CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/processing-service/README.md: -------------------------------------------------------------------------------- 1 | The processing service defines a Cloud Run Service to process each incoming datapoint. 2 | 3 | 4 | main.py defines the public facing webserver to listen for requests. 5 | 6 | synth_data_stream.py creates a synthetic data stream of events randomly chosen from the datalayer. It also randomly includes anomalies in the data. 
7 | 8 | 9 | Command to start data stream: 10 | python3 synth_data_stream.py --endpoint {Pub/Sub endpoint link}' -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/processing-service/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = 'poerschmann-hyp-test3' 16 | location = 'europe-west1' 17 | bq_dataset = 'ecommerce_sink' 18 | bq_table = 'cloud_run' 19 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/processing-service/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import time 17 | import base64 18 | import json 19 | import datetime 20 | import config 21 | 22 | from flask import Flask, request 23 | 24 | from google.cloud import bigquery 25 | 26 | app = Flask(__name__) 27 | 28 | 29 | @app.route("/hw", methods=['GET', 'POST']) 30 | def hello_world(): 31 | world = request.args.get('world') 32 | return f"Hello {world}!" 33 | 34 | 35 | @app.route("/", methods=["POST"]) 36 | def index(): 37 | envelope = request.get_json() 38 | print(envelope) 39 | print(type(envelope)) 40 | 41 | if not envelope: 42 | msg = "no Pub/Sub message received" 43 | print(f"error: {msg}") 44 | return f"Bad Request: {msg}", 400 45 | 46 | ps_message = envelope['message'] 47 | 48 | record = base64.b64decode(ps_message["data"]).decode("utf-8").strip() 49 | record = json.loads(record) 50 | 51 | record["weekday"] = datetime.datetime.strptime(record["event_datetime"], "%Y-%m-%d %H:%M:%S").strftime('%A') 52 | 53 | rows_to_insert = [record] 54 | 55 | client = bigquery.Client(project=config.project_id, location=config.location) 56 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table 57 | 58 | errors = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 
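# insert_rows_json streams the rows through the BigQuery insertAll API and
# returns a list with one entry per failed row; an empty list means all rows
# were written successfully.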
59 | if errors == []: 60 | print(f"{time.time()} New rows have been added.") 61 | return ("", 204) 62 | else: 63 | print("Encountered errors while inserting rows: {}".format(errors)) 64 | return f"Bad Request: {envelope}", 400 65 | 66 | 67 | if __name__ == "__main__": 68 | app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) 69 | 70 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/processing-service/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/terraform.tfvars: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | project_id = "poerschmann-hyp-test3" 18 | delete_contents_on_destroy = true 19 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/variables.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | variable "project_id" { 18 | description = "Project where the dataset and table are created." 19 | } 20 | 21 | variable "delete_contents_on_destroy" { 22 | description = "(Optional) If set to true, delete all the tables in the dataset when destroying the resource; otherwise, destroying the resource will fail if tables are present." 23 | type = bool 24 | default = null 25 | } 26 | 27 | variable "force_destroy" { 28 | description = "When deleting a bucket, this boolean option will delete all contained objects. If false, Terraform will fail to delete buckets which contain objects." 29 | type = bool 30 | default = false 31 | } 32 | 33 | variable "gcp_region" { 34 | description = "GCP region to deploy resources in." 
35 | type = string 36 | default = "europe-west1" 37 | } -------------------------------------------------------------------------------- /02_activate/21_challenge/cloud-run-pubsub-proxy/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /02_activate/21_challenge/cloud-run-pubsub-proxy/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official lightweight Node.js 12 image. 2 | # https://hub.docker.com/_/node 3 | FROM node:12-slim 4 | 5 | # Create and change to the app directory. 6 | WORKDIR /usr/src/app 7 | 8 | # Copy application dependency manifests to the container image. 9 | # A wildcard is used to ensure both package.json AND package-lock.json are copied. 10 | # Copying this separately prevents re-running npm install on every code change. 11 | COPY package*.json ./ 12 | 13 | # Install production dependencies. 14 | RUN npm install --only=production 15 | 16 | # Copy local code to the container image. 17 | COPY . ./ 18 | 19 | # Run the web service on container startup. 20 | CMD [ "npm", "start" ] -------------------------------------------------------------------------------- /02_activate/21_challenge/cloud-run-pubsub-proxy/README.md: -------------------------------------------------------------------------------- 1 | Cloud Run Proxy is a express webserver that listenes to incoming requests and publishes them to a chosen Pub/Sub topic. -------------------------------------------------------------------------------- /02_activate/21_challenge/cloud-run-pubsub-proxy/app.js: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
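// Express proxy app: GET / is a hello-style health endpoint, POST /json takes
// a dataLayer event as its JSON body and republishes it to the
// "hyp-pubsub-topic" Pub/Sub topic.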
14 | 15 | const express = require('express'); 16 | const bodyParser = require('body-parser'); 17 | const app = express(); 18 | 19 | app.use(bodyParser.json()); 20 | 21 | app.get('/', (req, res) => { 22 | console.log('Hello world received a request.'); 23 | 24 | const target = process.env.TARGET || 'World'; 25 | res.send(`Hello ${target}!`); 26 | }); 27 | 28 | app.post('/json', (req, res) => { 29 | const dataLayer = JSON.stringify(req.body) 30 | console.log(`proxy POST request received dataLayer: ${dataLayer}`) 31 | 32 | const {PubSub} = require('@google-cloud/pubsub'); 33 | 34 | // Instantiates a client 35 | const pubsub = new PubSub(); 36 | 37 | const {Buffer} = require('safe-buffer'); 38 | 39 | // Set Pub/Sub topic name 40 | let topicName = 'hyp-pubsub-topic'; 41 | 42 | // References an existing topic 43 | const topic = pubsub.topic(topicName); 44 | 45 | // Publishes the message as a string, 46 | const dataBuffer = Buffer.from(dataLayer); 47 | 48 | // Add two custom attributes, origin and username, to the message 49 | const customAttributes = { 50 | origin: 'gtm-cloud-run', 51 | username: 'gcp-demo', 52 | }; 53 | 54 | // Publishes a message to Pub/Sub 55 | return topic 56 | .publishMessage({data: dataBuffer}) 57 | .then(() => res.status(200).send(`{"message": "pubsub message sent: ${dataBuffer}"}`)) 58 | .catch(err => { 59 | console.error(err); 60 | res.status(500).send(err); 61 | return Promise.reject(err); 62 | }); 63 | }) 64 | 65 | 66 | module.exports = app; 67 | -------------------------------------------------------------------------------- /02_activate/21_challenge/cloud-run-pubsub-proxy/index.js: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
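// Server entry point: starts the Express app from app.js on the port Cloud Run
// provides via PORT (8080 by default).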
14 | 15 | const app = require('./app.js'); 16 | const PORT = process.env.PORT || 8080; 17 | 18 | app.listen(PORT, () => console.log(`pubsub proxy app listening on port ${PORT}`)); 19 | -------------------------------------------------------------------------------- /02_activate/21_challenge/cloud-run-pubsub-proxy/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pubsub-proxy", 3 | "version": "1.0.0", 4 | "description": "Cloud Run app to send messages to Pub/Sub topic using Node", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "node index.js" 8 | }, 9 | "author": "", 10 | "license": "Apache-2.0", 11 | "dependencies": { 12 | "express": "^4.17.1", 13 | "body-parser": "^1.19.0", 14 | "@google-cloud/pubsub": "^3.3.0", 15 | "safe-buffer": "5.1.2" 16 | } 17 | } -------------------------------------------------------------------------------- /02_activate/21_challenge/config_custom.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import os 17 | 18 | PROJECT_ID = os.environ['GCP_PROJECT'] 19 | REGION = os.environ['GCP_REGION'] 20 | PIPELINE_ROOT_PATH=f'gs://{PROJECT_ID}-ai-bucket/pipeline_root_custom/' 21 | 22 | TRAIN_IMAGE_URI=os.environ['TRAIN_IMAGE_URI'] 23 | PREDICT_IMAGE_URI=os.environ['PREDICT_IMAGE_URI'] 24 | AIP_STORAGE_URI=f'gs://{PROJECT_ID}-ai-bucket/vtx-artifacts' 25 | 26 | SERVICE_ACCOUNT=f"retailpipeline-hyp@{PROJECT_ID}.iam.gserviceaccount.com" 27 | MACHINE_TYPE = "n1-standard-4" 28 | DATA_URI=f"{PROJECT_ID}.ecommerce_sink.anomaly_data" -------------------------------------------------------------------------------- /02_activate/21_challenge/config_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export GCP_PROJECT="" 4 | export ENDPOINT_URL="" # doesn't need to be defined in the very beginning 5 | export PUSH_ENDPOINT='' # doesn't need to be defined in the very beginning 6 | export GCP_REGION=europe-west1 7 | export RUN_PROXY_DIR=cloud-run-pubsub-proxy 8 | export RUN_PROCESSING_DIR=processing-service 9 | export DATAFLOW_TEMPLATE=beam 10 | export RUN_INFERENCE_PROCESSING_SERVICE=inf_processing_service 11 | export RUN_INFERENCE_PROCESSING_SERVICE_CUSTOM=inf_processing_service_custom 12 | 13 | export TRAIN_IMAGE_URI=gcr.io/$GCP_PROJECT/custom_train:v1 14 | export PREDICT_IMAGE_URI=gcr.io/$GCP_PROJECT/custom_predict:v1 15 | -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/prediction/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tiangolo/uvicorn-gunicorn-fastapi:python3.7 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | COPY / /app 8 | WORKDIR /app 9 | COPY . 
./ 10 | 11 | # Install production dependencies. 12 | RUN pip install --upgrade pip 13 | RUN pip install --no-cache-dir -r requirements.txt 14 | 15 | # Run the web service on container startup. Here we use the gunicorn 16 | # webserver, with one worker process and 8 threads. 17 | # For environments with multiple CPU cores, increase the number of workers 18 | # to be equal to the cores available. 19 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 20 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"] 21 | 22 | EXPOSE 8080 -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/prediction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/02_activate/21_challenge/custom_train/prediction/__init__.py -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/prediction/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | PROJECT_ID="" 17 | REGION="europe-west1" -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/prediction/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
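# FastAPI serving container: downloads model.joblib from ${AIP_STORAGE_URI}/model_dir
# at startup and exposes health and predict routes for Vertex AI; the predict
# handler below is left for the challenge to complete.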
17 | 18 | 19 | from fastapi import Request, FastAPI 20 | import json 21 | import os 22 | from joblib import load 23 | import sys 24 | import pandas as pd 25 | from google.cloud import storage 26 | from tempfile import TemporaryFile 27 | import os 28 | import config 29 | 30 | app = FastAPI() 31 | 32 | model_directory = f"{os.environ['AIP_STORAGE_URI']}/model_dir" 33 | storage_path = os.path.join(model_directory, "model.joblib") 34 | 35 | storage_client = storage.Client(project=config.PROJECT_ID) 36 | blob = storage.blob.Blob.from_string(storage_path, client=storage_client) 37 | 38 | blob.download_to_filename("model.joblib") 39 | model =load(open("model.joblib",'rb')) 40 | 41 | @app.get('/') 42 | def get_root(): 43 | return {'message': 'Welcome to custom anomaly detection'} 44 | 45 | @app.get('/health_check') 46 | def health(): 47 | return 200 48 | 49 | if os.environ.get('AIP_PREDICT_ROUTE') is not None: 50 | method = os.environ['AIP_PREDICT_ROUTE'] 51 | else: 52 | method = '/predict' 53 | 54 | @app.post(method) 55 | async def predict(request: Request): 56 | print("----------------- PREDICTING -----------------") 57 | body = await request.json() 58 | # prepare data 59 | instances = pd.DataFrame(body["instances"]) 60 | 61 | # retrieving predictions 62 | outputs = "<3. add the code that predicts anomalies, using the model, and the input from the app>" 63 | 64 | response = outputs.tolist() 65 | print("----------------- OUTPUTS -----------------") 66 | return {"predictions": response} -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/prediction/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery 6 | google-cloud-aiplatform 7 | google-cloud-storage 8 | scikit-learn 9 | joblib 10 | gcsfs -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/trainer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/deeplearning-platform-release/sklearn-cpu.0-23 2 | WORKDIR / 3 | 4 | # Allow statements and log messages to immediately appear in the Knative logs 5 | ENV PYTHONUNBUFFERED True 6 | 7 | # Copies the trainer code to the docker image. 8 | COPY / /trainer 9 | COPY . ./ 10 | 11 | # Install production dependencies. 12 | RUN pip install --upgrade pip 13 | RUN pip install --no-cache-dir -r requirements.txt 14 | 15 | # Sets up the entry point to invoke the trainer. 16 | CMD ["python", "trainer/main.py"] -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/02_activate/21_challenge/custom_train/trainer/__init__.py -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/trainer/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | PROJECT_ID="" 17 | REGION="europe-west1" 18 | AIP_STORAGE_URI=f'gs://{PROJECT_ID}-ai-bucket/vtx-artifacts' 19 | # training data: 20 | DATA_URI=f"{PROJECT_ID}.ecommerce_sink.anomaly_data" -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/trainer/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from sklearn.tree import DecisionTreeClassifier 16 | from sklearn.metrics import roc_curve 17 | from sklearn.model_selection import train_test_split 18 | from google.cloud import bigquery 19 | from google.cloud import storage 20 | from joblib import dump 21 | 22 | import preprocess 23 | import train 24 | import config 25 | 26 | import os 27 | import pandas as pd 28 | import sys 29 | 30 | # data uri 31 | data_uri = config.DATA_URI 32 | 33 | # bq client 34 | bqclient = bigquery.Client(project=config.PROJECT_ID) 35 | storage_client = storage.Client(project=config.PROJECT_ID) 36 | 37 | ## Download & prep data 38 | print('[INFO] ------ Preparing Data', file=sys.stderr) 39 | train_data, train_labels, test_data, test_labels = preprocess.prep_data(bqclient, storage_client, data_uri) 40 | 41 | ## Train model and save it in Google Cloud Storage 42 | print('[INFO] ------ Training & Saving Model', file=sys.stderr) 43 | '<2. train the model with train and test data and labels, calling the relevant client>' -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/trainer/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
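# Helpers used by trainer/main.py: download_table reads a BigQuery table into a
# pandas DataFrame and prep_data splits it into train/test sets, popping the
# "anomaly" column as the labels.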
14 | 15 | from sklearn.tree import DecisionTreeClassifier 16 | from sklearn.metrics import roc_curve 17 | from sklearn.model_selection import train_test_split 18 | from google.cloud import bigquery 19 | from google.cloud import storage 20 | from joblib import dump 21 | 22 | import os 23 | import pandas as pd 24 | 25 | def download_table(bqclient, storage_client, bq_table_uri: str): 26 | 27 | prefix = "bq://" 28 | if bq_table_uri.startswith(prefix): 29 | bq_table_uri = bq_table_uri[len(prefix):] 30 | 31 | table = bigquery.TableReference.from_string(bq_table_uri) 32 | rows = bqclient.list_rows( 33 | table, 34 | ) 35 | return rows.to_dataframe(create_bqstorage_client=False) 36 | 37 | def prep_data(bqclient, storage_client, data_uri: str): 38 | 39 | # Download data into Pandas DataFrames, split into train / test 40 | df, test_df = train_test_split(download_table(bqclient, storage_client, data_uri)) 41 | labels = df.pop("anomaly").tolist() 42 | data = df.values.tolist() 43 | test_labels = test_df.pop("anomaly").tolist() 44 | test_data = test_df.values.tolist() 45 | 46 | return data, labels, test_data, test_labels -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/trainer/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn 2 | google-cloud-bigquery 3 | joblib 4 | pandas 5 | google-cloud-storage -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/trainer/train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from sklearn.tree import DecisionTreeClassifier 16 | from sklearn.metrics import roc_curve 17 | from sklearn.model_selection import train_test_split 18 | from google.cloud import bigquery 19 | from google.cloud import storage 20 | from joblib import dump 21 | 22 | import os 23 | import pandas as pd 24 | 25 | def train_model(data, labels, test_data, test_labels, storage_client): 26 | 27 | # Define and train the Scikit model 28 | skmodel = '<1. initialize the model by calling the model operator>' 29 | '<1. 
fit the model with data and labels>' 30 | score = skmodel.score(test_data, test_labels) 31 | print('accuracy is:',score) 32 | 33 | # Storage location 34 | model_directory = f"{os.environ['AIP_STORAGE_URI']}/model_dir" 35 | storage_path = os.path.join(model_directory, "model.joblib") 36 | 37 | # Save the model to a local file 38 | dump(skmodel, 'model.joblib') 39 | 40 | blob = storage.blob.Blob.from_string(storage_path, client=storage_client) 41 | blob.upload_from_filename("model.joblib") 42 | 43 | return(skmodel) -------------------------------------------------------------------------------- /02_activate/21_challenge/datalayer/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /02_activate/21_challenge/datalayer/README.md: -------------------------------------------------------------------------------- 1 | Datalayer defines the json events that could occur to be fed into the pipeline. 2 | 3 | Four types of events are included: 4 | * add to cart 5 | * made purchase 6 | * made purcase with anomaly (artifical mistake in data to be identified later) 7 | * view item -------------------------------------------------------------------------------- /02_activate/21_challenge/datalayer/add_to_cart.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "add_to_cart", 4 | "user_id": "UID00003", 5 | "client_id": "CID00003", 6 | "page":"/product-67890", 7 | "page_previous": "/category-tshirts", 8 | "ecommerce": { 9 | "items": [{ 10 | "item_name": "Donut Friday Scented T-Shirt", 11 | "item_id": "67890", 12 | "price": 33.75, 13 | "item_brand": "Google", 14 | "item_category": "Apparel", 15 | "item_category_2": "Mens", 16 | "item_category_3": "Shirts", 17 | "item_category_4": "Tshirts", 18 | "item_variant": "Black", 19 | "item_list_name": "Search Results", 20 | "item_list_id": "SR123", 21 | "index": 1, 22 | "quantity": 2 23 | }] 24 | } 25 | } -------------------------------------------------------------------------------- /02_activate/21_challenge/datalayer/ecommerce_events_bq_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "event_datetime", 4 | "type": "TIMESTAMP", 5 | "mode": "NULLABLE" 6 | }, 7 | { 8 | "name": "event", 9 | "type": "STRING", 10 | "mode": "REQUIRED" 11 | }, 12 | { 13 | "name": "user_id", 14 | "type": "STRING", 15 | "mode": "REQUIRED" 16 | }, 17 | { 18 | "name": "client_id", 19 | "type": "STRING", 20 | "mode": "NULLABLE" 21 | }, 22 | { 23 | "name": "page", 24 | "type": "STRING", 25 | "mode": "NULLABLE" 26 | }, 27 | { 28 | "name": "page_previous", 29 | "type": "STRING", 30 | "mode": "NULLABLE" 31 | }, 32 | { 33 | "name": "weekday", 34 | "type": "STRING", 35 | "mode": "NULLABLE" 36 | }, 37 | { 38 | "name": "ecommerce", 39 | "type": "RECORD", 40 | "mode": "NULLABLE", 41 | "fields": [ 42 | { 43 | "mode": "REPEATED", 44 | "name": "items", 45 | "type": "RECORD", 46 | "fields": [ 47 | { 48 | "mode": "NULLABLE", 49 | "name": "index", 50 | "type": "INTEGER" 51 | }, 52 | 53 | { 54 | "mode": "NULLABLE", 55 | "name": "item_id", 56 | "type": "INTEGER" 57 | }, 58 | { 59 | "mode": "NULLABLE", 60 | "name": "item_name", 61 | "type": "STRING" 62 | }, 63 | { 64 | "mode": "NULLABLE", 65 | "name": "item_list_name", 66 | "type": "STRING" 67 | }, 68 | { 69 | "mode": "NULLABLE", 70 | "name": "item_list_id", 71 | "type": 
"STRING" 72 | }, 73 | { 74 | "mode": "NULLABLE", 75 | "name": "price", 76 | "type": "FLOAT" 77 | }, 78 | { 79 | "mode": "NULLABLE", 80 | "name": "item_variant", 81 | "type": "STRING" 82 | }, 83 | { 84 | "mode": "NULLABLE", 85 | "name": "quantity", 86 | "type": "INTEGER" 87 | }, 88 | { 89 | "mode": "NULLABLE", 90 | "name": "item_brand", 91 | "type": "STRING" 92 | }, 93 | { 94 | "mode": "NULLABLE", 95 | "name": "item_category", 96 | "type": "STRING" 97 | }, 98 | { 99 | "mode": "NULLABLE", 100 | "name": "item_category_2", 101 | "type": "STRING" 102 | }, 103 | { 104 | "mode": "NULLABLE", 105 | "name": "item_category_3", 106 | "type": "STRING" 107 | }, 108 | { 109 | "mode": "NULLABLE", 110 | "name": "item_category_4", 111 | "type": "STRING" 112 | } 113 | ] 114 | }, 115 | { 116 | "mode": "NULLABLE", 117 | "name": "purchase", 118 | "type": "RECORD", 119 | "fields": [ 120 | { 121 | "fields": [ 122 | { 123 | "mode": "NULLABLE", 124 | "name": "item_coupon", 125 | "type": "STRING" 126 | }, 127 | { 128 | "mode": "NULLABLE", 129 | "name": "quantity", 130 | "type": "INTEGER" 131 | }, 132 | { 133 | "mode": "NULLABLE", 134 | "name": "item_variant", 135 | "type": "STRING" 136 | }, 137 | { 138 | "mode": "NULLABLE", 139 | "name": "item_category", 140 | "type": "STRING" 141 | }, 142 | { 143 | "mode": "NULLABLE", 144 | "name": "item_name", 145 | "type": "STRING" 146 | }, 147 | { 148 | "mode": "NULLABLE", 149 | "name": "item_id", 150 | "type": "INTEGER" 151 | }, 152 | { 153 | "mode": "NULLABLE", 154 | "name": "item_brand", 155 | "type": "STRING" 156 | }, 157 | { 158 | "mode": "NULLABLE", 159 | "name": "item_price", 160 | "type": "FLOAT" 161 | } 162 | ], 163 | "mode": "REPEATED", 164 | "name": "items", 165 | "type": "RECORD" 166 | }, 167 | { 168 | "mode": "NULLABLE", 169 | "name": "coupon", 170 | "type": "STRING" 171 | }, 172 | { 173 | "mode": "NULLABLE", 174 | "name": "tax", 175 | "type": "FLOAT" 176 | }, 177 | { 178 | "mode": "NULLABLE", 179 | "name": "shipping", 180 | "type": "FLOAT" 181 | }, 182 | { 183 | "mode": "NULLABLE", 184 | "name": "value", 185 | "type": "FLOAT" 186 | }, 187 | { 188 | "mode": "NULLABLE", 189 | "name": "affiliation", 190 | "type": "STRING" 191 | }, 192 | { 193 | "mode": "NULLABLE", 194 | "name": "currency", 195 | "type": "STRING" 196 | }, 197 | { 198 | "mode": "NULLABLE", 199 | "name": "transaction_id", 200 | "type": "STRING" 201 | } 202 | ] 203 | } 204 | ] 205 | } 206 | ] 207 | -------------------------------------------------------------------------------- /02_activate/21_challenge/datalayer/purchase.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "purchase", 4 | "user_id": "UID00001", 5 | "client_id": "CID00003", 6 | "page":"/checkout", 7 | "page_previous": "/order-confirmation", 8 | "ecommerce": { 9 | "purchase": { 10 | "transaction_id": "T12345", 11 | "affiliation": "Online Store", 12 | "value": 35.43, 13 | "tax": 4.90, 14 | "shipping": 5.99, 15 | "currency": "EUR", 16 | "coupon": "SUMMER_SALE", 17 | "items": [{ 18 | "item_name": "Triblend Android T-Shirt", 19 | "item_id": "12345", 20 | "item_price": 15.25, 21 | "item_brand": "Google", 22 | "item_category": "Apparel", 23 | "item_variant": "Gray", 24 | "quantity": 1, 25 | "item_coupon": "" 26 | }, { 27 | "item_name": "Donut Friday Scented T-Shirt", 28 | "item_id": "67890", 29 | "item_price": 33.75, 30 | "item_brand": "Google", 31 | "item_category": "Apparel", 32 | "item_variant": "Black", 33 | "quantity": 1 34 | }] 35 | } 36 | } 37 
| } -------------------------------------------------------------------------------- /02_activate/21_challenge/datalayer/purchase_anomaly.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "purchase", 4 | "user_id": "UID00001", 5 | "client_id": "CID00003", 6 | "page":"/checkout", 7 | "page_previous": "/order-confirmation", 8 | "ecommerce": { 9 | "purchase": { 10 | "transaction_id": "T12345", 11 | "affiliation": "Online Store", 12 | "value": 1000000.10, 13 | "tax": 4.90, 14 | "shipping": 5.99, 15 | "currency": "EUR", 16 | "coupon": "SUMMER_SALE", 17 | "items": [{ 18 | "item_name": "Triblend Android T-Shirt", 19 | "item_id": "12345", 20 | "item_price": 15.25, 21 | "item_brand": "Google", 22 | "item_category": "Apparel", 23 | "item_variant": "Gray", 24 | "quantity": 1, 25 | "item_coupon": "" 26 | }, { 27 | "item_name": "Donut Friday Scented T-Shirt", 28 | "item_id": "67890", 29 | "item_price": 33.75, 30 | "item_brand": "Google", 31 | "item_category": "Apparel", 32 | "item_variant": "Black", 33 | "quantity": 1 34 | }] 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /02_activate/21_challenge/datalayer/synth_data_stream.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
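# Synthetic click-stream generator: each call to main() draws a uniform random number
# and posts one JSON event from the datalayer to the proxy endpoint -- roughly one third
# view_item, one third add_to_cart and one third purchase, with draws >= 0.95 (about 5%
# of all events) sent as anomalous purchases.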
14 | 15 | import random 16 | import requests 17 | import json 18 | import time 19 | import argparse 20 | 21 | 22 | def main(endpoint): 23 | draw = round(random.uniform(0, 1), 2) 24 | 25 | uid = f'UID0000{int(round(random.uniform(0, 5), 0))}' 26 | 27 | if 0 <= draw < 1 / 3: 28 | # get view payload 29 | view_item_f = open('./datalayer/view_item.json') 30 | view_item_payload = json.load(view_item_f) 31 | 32 | view_item_payload['user_id'] = uid 33 | 34 | # send view 35 | r = requests.post(endpoint, json=view_item_payload) 36 | 37 | elif 1 / 3 <= draw < 2 / 3: 38 | # get add to cart payload 39 | add_to_cart_f = open('./datalayer/add_to_cart.json') 40 | add_to_cart_payload = json.load(add_to_cart_f) 41 | 42 | add_to_cart_payload['user_id'] = uid 43 | 44 | # send add to cart 45 | r = requests.post(endpoint, json=add_to_cart_payload) 46 | 47 | else: 48 | # decide between anomaly or no anomaly 49 | if draw < 0.95: 50 | # get payload 51 | purchase_f = open('./datalayer/purchase.json') 52 | purchase_payload = json.load(purchase_f) 53 | 54 | purchase_payload['user_id'] = uid 55 | 56 | # send request 57 | r = requests.post(endpoint, json=purchase_payload) 58 | else: 59 | # get payload 60 | purchase_anomaly_f = open('./datalayer/purchase_anomaly.json') 61 | purchase_anomaly_payload = json.load(purchase_anomaly_f) 62 | 63 | purchase_anomaly_payload['user_id'] = uid 64 | 65 | # send request 66 | r = requests.post(endpoint, json=purchase_anomaly_payload) 67 | 68 | # print(r.text) 69 | print(f'{time.time()} -- {r.status_code}') 70 | 71 | 72 | if __name__ == "__main__": 73 | # Parse Arguments 74 | parser = argparse.ArgumentParser() 75 | parser.add_argument("--endpoint", help="Target Endpoint") 76 | 77 | args = parser.parse_args() 78 | 79 | endpoint = args.endpoint + '/json' 80 | 81 | while True: 82 | main(endpoint) 83 | time.sleep(.1) 84 | -------------------------------------------------------------------------------- /02_activate/21_challenge/datalayer/view_item.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 22:59:59", 3 | "event": "view_item", 4 | "user_id": "UID00003", 5 | "client_id": "CID00003", 6 | "page":"/product-67890", 7 | "page_previous": "/category-tshirts", 8 | "ecommerce": { 9 | "items": [{ 10 | "item_name": "Donut Friday Scented T-Shirt", 11 | "item_id": "67890", 12 | "price": 33.75, 13 | "item_brand": "Google", 14 | "item_category": "Apparel", 15 | "item_category_2": "Mens", 16 | "item_category_3": "Shirts", 17 | "item_category_4": "Tshirts", 18 | "item_variant": "Black", 19 | "item_list_name": "Search Results", 20 | "item_list_id": "SR123", 21 | "index": 1, 22 | "quantity": 1 23 | }] 24 | } 25 | } -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | ENV APP_HOME /app 8 | WORKDIR $APP_HOME 9 | COPY . ./ 10 | COPY ./requirements.txt ./ 11 | 12 | # Install production dependencies. 13 | RUN pip install --upgrade pip 14 | RUN pip install --no-cache-dir -r requirements.txt 15 | 16 | # Run the web service on container startup. Here we use the gunicorn 17 | # webserver, with one worker process and 8 threads. 
18 | # For environments with multiple CPU cores, increase the number of workers 19 | # to be equal to the cores available. 20 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 21 | CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service/README.md: -------------------------------------------------------------------------------- 1 | The processing service defines a Cloud Run Service to process each incoming datapoint. 2 | 3 | 4 | main.py defines the public facing webserver to listen for requests. 5 | 6 | synth_data_stream.py creates a synthetic data stream of events randomly chosen from the datalayer. It also randomly includes anomalies in the data. 7 | 8 | 9 | Command to start data stream: 10 | python3 synth_data_stream.py --endpoint {Pub/Sub endpoint link}' -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = '' 16 | location = 'europe-west1' 17 | bq_dataset = 'ecommerce_sink' 18 | bq_table = 'cloud_run' 19 | bq_table_anomaly = 'cloud_run_anomaly' 20 | endpoint_id = '' 21 | -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import time 17 | import base64 18 | import json 19 | import datetime 20 | import config 21 | 22 | from flask import Flask, request 23 | 24 | from google.cloud import bigquery, aiplatform 25 | 26 | 27 | app = Flask(__name__) 28 | 29 | 30 | @app.route("/hw", methods=['GET', 'POST']) 31 | def hello_world(): 32 | world = request.args.get('world') 33 | return f"Hello {world}!" 
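# A possible way to fill in the placeholders in index() below, sketched as an unused
# helper so the challenge code keeps working as-is. It mirrors the pattern used in
# inf_processing_service_custom/main.py and assumes config.endpoint_id points at the
# deployed anomaly-detection endpoint; treat it as an illustration, not the official
# solution.
def _predict_and_store_sketch(record, record_to_predict):
    # Initialize the Vertex AI SDK and reference the prediction endpoint.
    aiplatform.init(project=config.project_id, location=config.location)
    endpoint = aiplatform.Endpoint(
        endpoint_name=f"projects/{config.project_id}/locations/{config.location}/endpoints/{config.endpoint_id}",
        project=config.project_id,
        location=config.location,
    )

    # Online prediction; the k-means model returns the nearest centroid id per instance.
    endpoint_response = endpoint.predict(instances=record_to_predict)
    centroid = endpoint_response.predictions[0]["nearest_centroid_id"][0]
    anomaly = centroid == 1

    # Stream the enriched record into the anomaly table.
    anomaly_record = {
        "tax": record["ecommerce"]["purchase"]["tax"],
        "shipping": record["ecommerce"]["purchase"]["shipping"],
        "value": record["ecommerce"]["purchase"]["value"],
        "anomaly": anomaly,
    }
    client = bigquery.Client(project=config.project_id, location=config.location)
    table_id = f"{config.project_id}.{config.bq_dataset}.{config.bq_table_anomaly}"
    return client.insert_rows_json(table_id, [anomaly_record])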
34 | 35 | 36 | @app.route("/", methods=["POST"]) 37 | def index(): 38 | envelope = request.get_json() 39 | print(envelope) 40 | print(type(envelope)) 41 | 42 | if not envelope: 43 | msg = "no Pub/Sub message received" 44 | print(f"error: {msg}") 45 | return f"Bad Request: {msg}", 400 46 | 47 | ps_message = envelope['message'] 48 | 49 | record = base64.b64decode(ps_message["data"]).decode("utf-8").strip() 50 | record = json.loads(record) 51 | 52 | record["weekday"] = datetime.datetime.strptime(record["event_datetime"], "%Y-%m-%d %H:%M:%S").strftime('%A') 53 | 54 | rows_to_insert = [record] 55 | 56 | client = bigquery.Client(project=config.project_id, location=config.location) 57 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table 58 | 59 | errors = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 60 | 61 | 62 | # Create record that includes anomaly detection inference. 63 | if record["event"] == "purchase": 64 | record_to_predict = [ 65 | {"tax": record["ecommerce"]["purchase"]["tax"], 66 | "shipping": record["ecommerce"]["purchase"]["shipping"], 67 | "value":record["ecommerce"]["purchase"]["value"]} 68 | ] 69 | 70 | # 71 | 72 | # < vertex endpoint definition > 73 | 74 | # < calling prediction from endpoint > 75 | 76 | centroid = endpoint_response.predictions[0]["nearest_centroid_id"][0] 77 | 78 | if centroid == 1: 79 | anomaly = True 80 | if centroid == 2: 81 | anomaly = False 82 | 83 | print(anomaly) 84 | 85 | anomaly_record = {"tax": record["ecommerce"]["purchase"]["tax"], "shipping": record["ecommerce"]["purchase"]["shipping"], "value":record["ecommerce"]["purchase"]["value"], "anomaly": anomaly} 86 | 87 | rows_to_insert = [anomaly_record] 88 | 89 | # < defining Big Query client > 90 | # < setting table id > 91 | # < api request to insert rows in BigQuery destination table > 92 | 93 | if errors_an == []: 94 | print(f"{time.time()} New rows with prediction have been added.") 95 | return ("", 204) 96 | else: 97 | print("Encountered errors while inserting rows: {}".format(errors)) 98 | return f"Bad Request: {envelope}", 400 99 | 100 | if errors == []: 101 | print(f"{time.time()} New rows have been added.") 102 | return ("", 204) 103 | else: 104 | print("Encountered errors while inserting rows: {}".format(errors)) 105 | return f"Bad Request: {envelope}", 400 106 | 107 | 108 | if __name__ == "__main__": 109 | app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) 110 | 111 | -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery 6 | google-cloud-aiplatform -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service_custom/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | ENV APP_HOME /app 8 | WORKDIR $APP_HOME 9 | COPY . ./ 10 | COPY ./requirements.txt ./ 11 | 12 | # Install production dependencies. 13 | RUN pip install --upgrade pip 14 | RUN pip install --no-cache-dir -r requirements.txt 15 | 16 | # Run the web service on container startup. 
Here we use the gunicorn 17 | # webserver, with one worker process and 8 threads. 18 | # For environments with multiple CPU cores, increase the number of workers 19 | # to be equal to the cores available. 20 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 21 | CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service_custom/README.md: -------------------------------------------------------------------------------- 1 | The processing service defines a Cloud Run Service to process each incoming datapoint. 2 | 3 | 4 | main.py defines the public facing webserver to listen for requests. 5 | 6 | synth_data_stream.py creates a synthetic data stream of events randomly chosen from the datalayer. It also randomly includes anomalies in the data. 7 | 8 | 9 | Command to start data stream: 10 | python3 synth_data_stream.py --endpoint {Pub/Sub endpoint link}' -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service_custom/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = '' 16 | location = 'europe-west1' 17 | bq_dataset = 'ecommerce_sink' 18 | bq_table = 'cloud_run' 19 | bq_table_anomaly = 'cloud_run_anomaly_custom' 20 | endpoind_id = '' 21 | -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service_custom/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import time 17 | import base64 18 | import json 19 | import datetime 20 | import config 21 | 22 | from flask import Flask, request 23 | 24 | from google.cloud import bigquery, aiplatform 25 | 26 | 27 | app = Flask(__name__) 28 | 29 | 30 | @app.route("/hw", methods=['GET', 'POST']) 31 | def hello_world(): 32 | world = request.args.get('world') 33 | return f"Hello {world}!" 
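# index() below parses the Pub/Sub push envelope, streams the raw event into
# config.bq_table, and for purchase events calls the custom Vertex AI endpoint and
# writes the returned prediction to config.bq_table_anomaly.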
34 | 35 | 36 | @app.route("/", methods=["POST"]) 37 | def index(): 38 | envelope = request.get_json() 39 | print(envelope) 40 | print(type(envelope)) 41 | 42 | if not envelope: 43 | msg = "no Pub/Sub message received" 44 | print(f"error: {msg}") 45 | return f"Bad Request: {msg}", 400 46 | 47 | ps_message = envelope['message'] 48 | 49 | record = base64.b64decode(ps_message["data"]).decode("utf-8").strip() 50 | record = json.loads(record) 51 | 52 | record["weekday"] = datetime.datetime.strptime(record["event_datetime"], "%Y-%m-%d %H:%M:%S").strftime('%A') 53 | 54 | rows_to_insert = [record] 55 | 56 | client = bigquery.Client(project=config.project_id, location=config.location) 57 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table 58 | 59 | errors = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 60 | 61 | 62 | # Create record that includes anomaly detection inference. 63 | if record["event"] == "purchase": 64 | record_to_predict = [ 65 | {"tax": record["ecommerce"]["purchase"]["tax"], 66 | "shipping": record["ecommerce"]["purchase"]["shipping"], 67 | "value":record["ecommerce"]["purchase"]["value"]} 68 | ] 69 | 70 | aiplatform.init(project=config.project_id, location=config.location) 71 | 72 | endpoint = aiplatform.Endpoint( 73 | endpoint_name=f"projects/{config.project_id}/locations/{config.location}/endpoints/{config.endpoind_id}", 74 | project = config.project_id, 75 | location=config.location, 76 | ) 77 | 78 | endpoint_response = endpoint.predict( 79 | instances=record_to_predict 80 | ) 81 | 82 | anomaly = endpoint_response.predictions[0] 83 | 84 | anomaly_record = {"tax": record["ecommerce"]["purchase"]["tax"], "shipping": record["ecommerce"]["purchase"]["shipping"], "value":record["ecommerce"]["purchase"]["value"], "anomaly": anomaly} 85 | 86 | rows_to_insert = [anomaly_record] 87 | 88 | client = bigquery.Client(project=config.project_id, location=config.location) 89 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table_anomaly 90 | 91 | errors_an = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 92 | 93 | 94 | if errors_an == []: 95 | print(f"{time.time()} New rows with prediction have been added.") 96 | return ("", 204) 97 | else: 98 | print("Encountered errors while inserting rows: {}".format(errors)) 99 | return f"Bad Request: {envelope}", 400 100 | 101 | if errors == []: 102 | print(f"{time.time()} New rows have been added.") 103 | return ("", 204) 104 | else: 105 | print("Encountered errors while inserting rows: {}".format(errors)) 106 | return f"Bad Request: {envelope}", 400 107 | 108 | 109 | if __name__ == "__main__": 110 | app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) 111 | 112 | -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service_custom/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery 6 | google-cloud-aiplatform 7 | scikit-learn -------------------------------------------------------------------------------- /02_activate/21_challenge/kf_pipe_custom.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import time 16 | from typing import Iterable, Dict, NamedTuple 17 | 18 | import config_custom 19 | 20 | # import kfp 21 | from kfp.v2 import compiler, dsl 22 | from kfp.v2.dsl import component 23 | from kfp.v2.components import importer_node 24 | import google.cloud.aiplatform as aip 25 | from google_cloud_pipeline_components import aiplatform as gcc_aip 26 | 27 | from google_cloud_pipeline_components.types import artifact_types 28 | from google_cloud_pipeline_components.v1.custom_job import CustomTrainingJobOp 29 | 30 | ## Training Worker Specs 31 | WORKER_POOL_SPECS = [ 32 | { 33 | "machine_spec": { 34 | "machine_type": "n1-standard-4" 35 | }, 36 | "replica_count": "1", 37 | "container_spec": { 38 | "image_uri": config_custom.TRAIN_IMAGE_URI, 39 | "env": [ 40 | { 41 | "name": "AIP_STORAGE_URI", 42 | "value": config_custom.AIP_STORAGE_URI 43 | }, 44 | ] 45 | } 46 | } 47 | ] 48 | 49 | def compile_pipe(): 50 | # Define the workflow of the pipeline. 51 | @dsl.pipeline( 52 | name="anomaly-detection-custom-test", 53 | pipeline_root=config_custom.PIPELINE_ROOT_PATH) 54 | 55 | def pipeline( 56 | project_id: str, 57 | region: str, 58 | timestamp_id: str, 59 | artifact_staging_location:str, 60 | bq_source: str, 61 | aip_storage_uri: str, 62 | predict_image_uri: str 63 | ): 64 | 65 | # Model training 66 | train_job = '<1. Add the training job with a display name, project, location and worker pool defined.>' 67 | 68 | # Model evaluation 69 | # Ideally here you can evaluate the model and decide on deployment/or not for CI/CD purposes 70 | # example: https://www.cloudskillsboost.google/focuses/21234?parent=catalog 71 | 72 | # Import with the custom predict container 73 | import_unmanaged_model_op = importer_node.importer( 74 | artifact_uri=aip_storage_uri, 75 | artifact_class=artifact_types.UnmanagedContainerModel, 76 | metadata={ 77 | "containerSpec": { 78 | "imageUri": predict_image_uri, 79 | "env": [ 80 | { 81 | "name": "PROJECT_ID", 82 | "value": project_id}, 83 | ], 84 | "predictRoute": "/predict", 85 | "healthRoute": "/health_check", 86 | "ports": [ 87 | { 88 | "containerPort": 8080 89 | } 90 | ] 91 | }, 92 | }, 93 | ).after(train_job) 94 | 95 | # Upload the model into the registry 96 | custom_model_upload_job = gcc_aip.'<2. Find the correct operator>'( 97 | project=project_id, 98 | location=region, 99 | display_name=f"anomaly-detection-custom-model_{timestamp_id}", 100 | unmanaged_container_model=import_unmanaged_model_op.outputs["artifact"], 101 | ).after(import_unmanaged_model_op) 102 | 103 | # Create an endpoint where the model will be deployed 104 | endpoint_create_job = gcc_aip.'<3. Find the correct operator>'( 105 | project=project_id, 106 | display_name="anomaly-detection-custom-endpoint", 107 | location=region 108 | ) 109 | 110 | # Deploy the model on the endpoint 111 | _ = gcc_aip.'<4. 
Find the correct operator>'( 112 | model=custom_model_upload_job.outputs["model"], 113 | endpoint=endpoint_create_job.outputs["endpoint"], 114 | deployed_model_display_name="anomaly-detection-custom-deploy", 115 | dedicated_resources_min_replica_count=1, 116 | dedicated_resources_max_replica_count=1, 117 | dedicated_resources_machine_type="n1-standard-2", 118 | traffic_split={"0": 100} 119 | ) 120 | 121 | compiler.Compiler().compile(pipeline_func=pipeline, package_path="hyp-custom-anomaly-detection.json") 122 | 123 | if __name__ == "__main__": 124 | # Initialize aiplatform credentials. 125 | aip.init(project=config_custom.PROJECT_ID, location=config_custom.REGION) 126 | 127 | # Compile pipeline code. 128 | compile_pipe() 129 | 130 | # Unique ident for pipeline run 131 | timestamp_id = str(int(time.time())) 132 | 133 | # Prepare the pipeline job. 134 | job = aip.PipelineJob( 135 | display_name=f"{timestamp_id}-hyp-custom-anomaly-detection", 136 | template_path="hyp-custom-anomaly-detection.json", 137 | pipeline_root=config_custom.PIPELINE_ROOT_PATH, 138 | parameter_values={ 139 | 'project_id': config_custom.PROJECT_ID, 140 | 'region': config_custom.REGION, 141 | 'timestamp_id': timestamp_id, 142 | 'bq_source': config_custom.DATA_URI, 143 | 'aip_storage_uri' : config_custom.AIP_STORAGE_URI, 144 | 'predict_image_uri' : config_custom.PREDICT_IMAGE_URI, 145 | 'artifact_staging_location': config_custom.PIPELINE_ROOT_PATH 146 | } 147 | ) 148 | 149 | job.submit(service_account=config_custom.SERVICE_ACCOUNT) 150 | -------------------------------------------------------------------------------- /02_activate/21_challenge/processing-service/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | ENV APP_HOME /app 8 | WORKDIR $APP_HOME 9 | COPY . ./ 10 | COPY ./requirements.txt ./ 11 | 12 | # Install production dependencies. 13 | RUN pip install --upgrade pip 14 | RUN pip install --no-cache-dir -r requirements.txt 15 | 16 | # Run the web service on container startup. Here we use the gunicorn 17 | # webserver, with one worker process and 8 threads. 18 | # For environments with multiple CPU cores, increase the number of workers 19 | # to be equal to the cores available. 20 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 21 | CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app -------------------------------------------------------------------------------- /02_activate/21_challenge/processing-service/README.md: -------------------------------------------------------------------------------- 1 | The processing service defines a Cloud Run Service to process each incoming datapoint. 2 | 3 | 4 | main.py defines the public facing webserver to listen for requests. 5 | 6 | synth_data_stream.py creates a synthetic data stream of events randomly chosen from the datalayer. It also randomly includes anomalies in the data. 
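To send a single test event by hand instead of the full stream, you can mirror what synth_data_stream.py does with Python's requests library; the proxy URL below is a placeholder for your own endpoint:

```
import json
import requests

# Load one of the sample events from the datalayer and post it to the proxy's /json route.
with open('./datalayer/view_item.json') as f:
    payload = json.load(f)

r = requests.post('https://<pubsub-proxy-url>/json', json=payload)
print(r.status_code)
```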
7 | 8 | 9 | Command to start data stream: 10 | python3 synth_data_stream.py --endpoint {Pub/Sub endpoint link}' -------------------------------------------------------------------------------- /02_activate/21_challenge/processing-service/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = '' 16 | location = 'europe-west1' 17 | bq_dataset = 'ecommerce_sink' 18 | bq_table = 'cloud_run' 19 | -------------------------------------------------------------------------------- /02_activate/21_challenge/processing-service/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import time 17 | import base64 18 | import json 19 | import datetime 20 | import config 21 | 22 | from flask import Flask, request 23 | 24 | from google.cloud import bigquery 25 | 26 | app = Flask(__name__) 27 | 28 | 29 | @app.route("/hw", methods=['GET', 'POST']) 30 | def hello_world(): 31 | world = request.args.get('world') 32 | return f"Hello {world}!" 33 | 34 | 35 | @app.route("/", methods=["POST"]) 36 | def index(): 37 | envelope = request.get_json() 38 | print(envelope) 39 | print(type(envelope)) 40 | 41 | if not envelope: 42 | msg = "no Pub/Sub message received" 43 | print(f"error: {msg}") 44 | return f"Bad Request: {msg}", 400 45 | 46 | ps_message = envelope['message'] 47 | 48 | record = base64.b64decode(ps_message["data"]).decode("utf-8").strip() 49 | record = json.loads(record) 50 | 51 | record["weekday"] = datetime.datetime.strptime(record["event_datetime"], "%Y-%m-%d %H:%M:%S").strftime('%A') 52 | 53 | rows_to_insert = [record] 54 | 55 | client = bigquery.Client(project=config.project_id, location=config.location) 56 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table 57 | 58 | errors = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 
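    # insert_rows_json() returns a list of per-row errors; an empty list means the streaming insert succeeded.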
59 | if errors == []: 60 | print(f"{time.time()} New rows have been added.") 61 | return ("", 204) 62 | else: 63 | print("Encountered errors while inserting rows: {}".format(errors)) 64 | return f"Bad Request: {envelope}", 400 65 | 66 | 67 | if __name__ == "__main__": 68 | app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) 69 | 70 | -------------------------------------------------------------------------------- /02_activate/21_challenge/processing-service/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery -------------------------------------------------------------------------------- /02_activate/21_challenge/requirements.txt: -------------------------------------------------------------------------------- 1 | kfp 2 | google-cloud-aiplatform 3 | protobuf==3.20.3 4 | google-cloud-pipeline-components==1.0.39 5 | kfp==1.8.19 6 | google-cloud-aiplatform==1.22.0 7 | scikit-learn 8 | -------------------------------------------------------------------------------- /02_activate/21_challenge/terraform.tfvars: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | project_id = "" 18 | delete_contents_on_destroy = true 19 | -------------------------------------------------------------------------------- /02_activate/21_challenge/variables.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | variable "project_id" { 18 | description = "Project where the dataset and table are created." 19 | } 20 | 21 | variable "delete_contents_on_destroy" { 22 | description = "(Optional) If set to true, delete all the tables in the dataset when destroying the resource; otherwise, destroying the resource will fail if tables are present." 23 | type = bool 24 | default = null 25 | } 26 | 27 | variable "force_destroy" { 28 | description = "When deleting a bucket, this boolean option will delete all contained objects. If false, Terraform will fail to delete buckets which contain objects." 29 | type = bool 30 | default = false 31 | } 32 | 33 | variable "gcp_region" { 34 | description = "GCP region to deploy resources in." 
35 | type = string 36 | default = "europe-west1" 37 | } -------------------------------------------------------------------------------- /02_activate/22_solution/README.md: -------------------------------------------------------------------------------- 1 | # Developing and deploying Machine Learning Models on GCP 2 | 3 | Welcome to the second part of Hack Your Pipe! 4 | 5 | So far you have discovered multiple options to ingest and transform data efficiently. 6 | In this section you go one step further with your data while continuing to build on those learnings. 7 | You will train and deploy Machine Learning models that detect anomalies in the incoming click stream. 8 | 9 | Throughout, we will focus on the automation, simplicity and reliability of every step in the Machine Learning Lifecycle. 10 | 11 | The architecture you are going to implement will look something like this: 12 | 13 | ![Hack Your Pipe architecture](../../rsc/hyp_ml_architecture.png) 14 | 15 | 16 | 17 | ## Prerequisites: Create Synthetic Data 18 | 19 | You will use the click stream data from the [ingest and transform section](https://github.com/NucleusEngineering/hack-your-pipe/tree/main/01_ingest_and_transform) as an example. 20 | 21 | If you haven't worked through the ingest and transform chapter, follow [`01_ingest_and_transform/12_solution/README.md`](https://github.com/NucleusEngineering/hack-your-pipe/blob/main/01_ingest_and_transform/12_solution/README.md). 22 | 23 | Before moving on, make sure that your BigQuery project has a dataset `ecommerce_sink` with the tables `cloud_run`, `dataflow` and `pubsub_direct`. 24 | The tables should be populated with at least 1000 data points each. 25 | 26 | ## Git clone repo 27 | 28 | ``` 29 | git clone https://github.com/NucleusEngineering/hack-your-pipe.git 30 | cd hack-your-pipe 31 | ``` 32 | 33 | ## Set-up Cloud Environment 34 | 35 | ### Initialize your account and project 36 | 37 | If you are using the Google Cloud Shell you can skip this step. 38 | 39 | ``` 40 | gcloud init 41 | ``` 42 | 43 | ### Set Google Cloud Project 44 | Enter your GCP Project ID as `GCP_PROJECT` in `./config_env.sh` and set the environment variables. 45 | ``` 46 | source config_env.sh 47 | ``` 48 | 49 | ``` 50 | gcloud config set project $GCP_PROJECT 51 | ``` 52 | 53 | ### Enable Google Cloud APIs 54 | 55 | ``` 56 | gcloud services enable aiplatform.googleapis.com storage.googleapis.com notebooks.googleapis.com dataflow.googleapis.com artifactregistry.googleapis.com 57 | ``` 58 | 59 | ### Set compute zone 60 | 61 | ``` 62 | gcloud config set compute/zone $GCP_REGION 63 | ``` 64 | 65 | ### Create a service account. 66 | 67 | ``` 68 | gcloud iam service-accounts create retailpipeline-hyp \ 69 | --display-name="retailpipeline-hyp" 70 | ``` 71 | You might already have this service account from running the ingest and transform section. In that case, just add the permissions below. 72 | 73 | ### ... with the necessary permissions.
74 | ``` 75 | gcloud projects add-iam-policy-binding $GCP_PROJECT \ 76 | --member="serviceAccount:retailpipeline-hyp@$GCP_PROJECT.iam.gserviceaccount.com" \ 77 | --role="roles/storage.objectAdmin" 78 | 79 | ``` 80 | 81 | ``` 82 | gcloud projects add-iam-policy-binding $GCP_PROJECT \ 83 | --member="serviceAccount:retailpipeline-hyp@$GCP_PROJECT.iam.gserviceaccount.com" \ 84 | --role="roles/aiplatform.user" 85 | 86 | ``` 87 | 88 | ``` 89 | gcloud projects add-iam-policy-binding $GCP_PROJECT \ 90 | --member="serviceAccount:retailpipeline-hyp@$GCP_PROJECT.iam.gserviceaccount.com" \ 91 | --role="roles/automl.serviceAgent" 92 | 93 | ``` 94 | 95 | 108 | 109 | ### Adjusting all the configs - important! 110 | 111 | Set your GCP project id in the following files in `hack-your-pipe/02_activate/22_solution/` 112 | 113 | * `processing_service/config.py` 114 | * `inf_processing_service_custom/config.py` 115 | * `inf_processing_service/config.py` 116 | * `custom_train/trainer/config.py` 117 | * `custom_train/prediction/config.py` 118 | * `config.py` 119 | 120 | 121 | ## Run ML Pipeline 122 | 123 | ### Set pipeline config options 124 | 125 | Set the config options in [`02_activate/22_solution/config.py`](https://github.com/NucleusEngineering/hack-your-pipe/blob/main/02_activate/22_solution/config.py). 126 | 127 | 128 | ### Run Kubeflow Pipeline in Vertex (BigQueryML model) 129 | 130 | [Vertex Pipelines](https://cloud.google.com/vertex-ai/docs/pipelines/introduction) is an end-to-end and serverless ML orchestration tool. It supports the open source frameworks [Kubeflow](https://www.kubeflow.org/) and [TFX](https://www.tensorflow.org/tfx). 131 | 132 | The full process from model training to deployment can be orchestrated using Vertex Pipelines. 133 | 134 | To kick off the pipeline, simply install the dependencies 135 | ``` 136 | pip install -r ./requirements.txt 137 | ``` 138 | 139 | and then run 140 | 141 | ``` 142 | python3 kf_pipe.py 143 | ``` 144 | 145 | ## Set up processing pipe for real time inference 146 | 147 | Once the model is trained and deployed, you will include a real time inference call in the data pipeline and again stream the results to BigQuery. 148 | 149 | Use Terraform to create a new BigQuery table as a sink for your predictions. 150 | 151 | ``` 152 | terraform init 153 | ``` 154 | 155 | ``` 156 | terraform plan 157 | ``` 158 | 159 | ``` 160 | terraform apply -var-file terraform.tfvars 161 | ``` 162 | 163 | 164 | To include real time inference in your pipeline, you have to update the Cloud Run processing service. 165 | That means you need to build and deploy a new container version to your service. Don't forget to update the `inf_processing_service_custom/config.py`. 166 | 167 | Build the container and deploy it on Cloud Run (note that you are just replacing the container image of the previous inference service with this new inference service). 168 | 169 | ``` 170 | gcloud builds submit $RUN_INFERENCE_PROCESSING_SERVICE_CUSTOM --tag gcr.io/$GCP_PROJECT/inference-processing-service-custom 171 | ``` 172 | 173 | ``` 174 | gcloud run deploy hyp-run-service-data-processing --image=gcr.io/$GCP_PROJECT/inference-processing-service-custom:latest --region=$GCP_REGION --allow-unauthenticated 175 | ``` 176 | 177 | ## Run Kubeflow Pipeline in Vertex (Custom Container) 178 | 179 | Two additional steps are needed to run the pipeline with custom training and prediction. We start by preparing the code to create custom training and prediction containers.
180 | Containers give you a way to write your own data processing and model training with your preferred library and environment. 181 | 182 | Build the containers 183 | 184 | ``` 185 | gcloud builds submit custom_train/trainer/. --tag $TRAIN_IMAGE_URI 186 | ``` 187 | ``` 188 | gcloud builds submit custom_train/prediction/. --tag $PREDICT_IMAGE_URI 189 | ``` 190 | 191 | 192 | And kick off the pipeline the same way as before 193 | ``` 194 | pip install -r ./requirements.txt 195 | ``` 196 | 197 | and then run 198 | 199 | ``` 200 | python3 kf_pipe_custom.py 201 | ``` 202 | -------------------------------------------------------------------------------- /02_activate/22_solution/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import os 15 | 16 | GCP_PROJECT = os.environ['GCP_PROJECT'] 17 | GCP_REGION = os.environ['GCP_REGION'] 18 | PIPELINE_ROOT_PATH = f"gs://{GCP_PROJECT}-ecommerce-events" 19 | SERVICE_ACCOUNT=f"retailpipeline-hyp@{GCP_PROJECT}.iam.gserviceaccount.com" 20 | MACHINE_TYPE = "n1-standard-4" -------------------------------------------------------------------------------- /02_activate/22_solution/config_custom.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
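# All values below are derived from the environment exported by config_env.sh
# (GCP_PROJECT, GCP_REGION, TRAIN_IMAGE_URI, PREDICT_IMAGE_URI).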
14 | 15 | 16 | import os 17 | 18 | PROJECT_ID = os.environ['GCP_PROJECT'] 19 | REGION = os.environ['GCP_REGION'] 20 | PIPELINE_ROOT_PATH=f'gs://{PROJECT_ID}-ai-bucket/pipeline_root_custom/' 21 | 22 | TRAIN_IMAGE_URI=os.environ['TRAIN_IMAGE_URI'] 23 | PREDICT_IMAGE_URI=os.environ['PREDICT_IMAGE_URI'] 24 | AIP_STORAGE_URI=f'gs://{PROJECT_ID}-ai-bucket/vtx-artifacts' 25 | 26 | SERVICE_ACCOUNT=f"retailpipeline-hyp@{PROJECT_ID}.iam.gserviceaccount.com" 27 | MACHINE_TYPE = "n1-standard-4" 28 | DATA_URI=f"{PROJECT_ID}.ecommerce_sink.anomaly_data" -------------------------------------------------------------------------------- /02_activate/22_solution/config_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export GCP_PROJECT="" 4 | export ENDPOINT_URL="" # doesn't need to be defined in the very beginning 5 | export PUSH_ENDPOINT='' # doesn't need to be defined in the very beginning 6 | export GCP_REGION=europe-west1 7 | export RUN_PROXY_DIR=cloud-run-pubsub-proxy 8 | export RUN_PROCESSING_DIR=processing-service 9 | export DATAFLOW_TEMPLATE=beam 10 | export RUN_INFERENCE_PROCESSING_SERVICE=inf_processing_service 11 | export RUN_INFERENCE_PROCESSING_SERVICE_CUSTOM=inf_processing_service_custom 12 | 13 | export TRAIN_IMAGE_URI=gcr.io/$GCP_PROJECT/custom_train:v1 14 | export PREDICT_IMAGE_URI=gcr.io/$GCP_PROJECT/custom_predict:v1 -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/prediction/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tiangolo/uvicorn-gunicorn-fastapi:python3.7 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | COPY / /app 8 | WORKDIR /app 9 | COPY . ./ 10 | 11 | # Install production dependencies. 12 | RUN pip install --upgrade pip 13 | RUN pip install --no-cache-dir -r requirements.txt 14 | 15 | # Run the web service on container startup. Here we use the gunicorn 16 | # webserver, with one worker process and 8 threads. 17 | # For environments with multiple CPU cores, increase the number of workers 18 | # to be equal to the cores available. 19 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 20 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"] 21 | 22 | EXPOSE 8080 -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/prediction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/02_activate/22_solution/custom_train/prediction/__init__.py -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/prediction/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | PROJECT_ID="" 17 | REGION="europe-west1" -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/prediction/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | from fastapi import Request, FastAPI 20 | import json 21 | import os 22 | from joblib import load 23 | import sys 24 | import pandas as pd 25 | from google.cloud import storage 26 | from tempfile import TemporaryFile 27 | import os 28 | import config 29 | 30 | app = FastAPI() 31 | 32 | model_directory = f"{os.environ['AIP_STORAGE_URI']}/model_dir" 33 | storage_path = os.path.join(model_directory, "model.joblib") 34 | 35 | storage_client = storage.Client(project=config.PROJECT_ID) 36 | blob = storage.blob.Blob.from_string(storage_path, client=storage_client) 37 | 38 | blob.download_to_filename("model.joblib") 39 | model =load(open("model.joblib",'rb')) 40 | 41 | @app.get('/') 42 | def get_root(): 43 | return {'message': 'Welcome to custom anomaly detection'} 44 | 45 | @app.get('/health_check') 46 | def health(): 47 | return 200 48 | 49 | if os.environ.get('AIP_PREDICT_ROUTE') is not None: 50 | method = os.environ['AIP_PREDICT_ROUTE'] 51 | else: 52 | method = '/predict' 53 | 54 | @app.post(method) 55 | async def predict(request: Request): 56 | print("----------------- PREDICTING -----------------") 57 | body = await request.json() 58 | # prepare data 59 | instances = pd.DataFrame(body["instances"]) 60 | 61 | # retrieving predictions 62 | outputs = model.predict(instances) 63 | 64 | response = outputs.tolist() 65 | print("----------------- OUTPUTS -----------------") 66 | return {"predictions": response} -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/prediction/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery 6 | google-cloud-aiplatform 7 | google-cloud-storage 8 | scikit-learn 9 | joblib 10 | gcsfs -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/trainer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/deeplearning-platform-release/sklearn-cpu.0-23 2 | WORKDIR / 3 | 4 | # Allow statements and log messages to immediately 
appear in the Knative logs 5 | ENV PYTHONUNBUFFERED True 6 | 7 | # Copies the trainer code to the docker image. 8 | COPY / /trainer 9 | COPY . ./ 10 | 11 | # Install production dependencies. 12 | RUN pip install --upgrade pip 13 | RUN pip install --no-cache-dir -r requirements.txt 14 | 15 | # Sets up the entry point to invoke the trainer. 16 | CMD ["python", "trainer/main.py"] -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/02_activate/22_solution/custom_train/trainer/__init__.py -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/trainer/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | PROJECT_ID="" 17 | REGION="europe-west1" 18 | AIP_STORAGE_URI=f'gs://{PROJECT_ID}-ai-bucket/vtx-artifacts' 19 | # training data: 20 | DATA_URI=f"{PROJECT_ID}.ecommerce_sink.anomaly_data" -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/trainer/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
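# Training entrypoint: preprocess.prep_data() pulls the anomaly_data table from BigQuery
# and splits it, then train.train_model() fits the scikit-learn model and uploads
# model.joblib to the bucket referenced by AIP_STORAGE_URI.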
14 | 15 | from sklearn.tree import DecisionTreeClassifier 16 | from sklearn.metrics import roc_curve 17 | from sklearn.model_selection import train_test_split 18 | from google.cloud import bigquery 19 | from google.cloud import storage 20 | from joblib import dump 21 | 22 | import preprocess 23 | import train 24 | import config 25 | 26 | import os 27 | import pandas as pd 28 | import sys 29 | 30 | # data uri 31 | data_uri = config.DATA_URI 32 | 33 | # bq client 34 | bqclient = bigquery.Client(project=config.PROJECT_ID) 35 | storage_client = storage.Client(project=config.PROJECT_ID) 36 | 37 | ## Download & prep data 38 | print('[INFO] ------ Preparing Data', file=sys.stderr) 39 | train_data, train_labels, test_data, test_labels = preprocess.prep_data(bqclient, storage_client, data_uri) 40 | 41 | ## Train model and save it in Google Cloud Storage 42 | print('[INFO] ------ Training & Saving Model', file=sys.stderr) 43 | train.train_model(train_data, train_labels, test_data, test_labels, storage_client) -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/trainer/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
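# preprocess.py: download_table() reads a BigQuery table (stripping an
# optional "bq://" prefix) into a pandas DataFrame, and prep_data() splits it
# with train_test_split, popping the "anomaly" column off as the label for
# both the train and the test partition.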
14 | 15 | from sklearn.tree import DecisionTreeClassifier 16 | from sklearn.metrics import roc_curve 17 | from sklearn.model_selection import train_test_split 18 | from google.cloud import bigquery 19 | from google.cloud import storage 20 | from joblib import dump 21 | 22 | import os 23 | import pandas as pd 24 | 25 | def download_table(bqclient, storage_client, bq_table_uri: str): 26 | 27 | prefix = "bq://" 28 | if bq_table_uri.startswith(prefix): 29 | bq_table_uri = bq_table_uri[len(prefix):] 30 | 31 | table = bigquery.TableReference.from_string(bq_table_uri) 32 | rows = bqclient.list_rows( 33 | table, 34 | ) 35 | return rows.to_dataframe(create_bqstorage_client=False) 36 | 37 | def prep_data(bqclient, storage_client, data_uri: str): 38 | 39 | # Download data into Pandas DataFrames, split into train / test 40 | df, test_df = train_test_split(download_table(bqclient, storage_client, data_uri)) 41 | labels = df.pop("anomaly").tolist() 42 | data = df.values.tolist() 43 | test_labels = test_df.pop("anomaly").tolist() 44 | test_data = test_df.values.tolist() 45 | 46 | return data, labels, test_data, test_labels -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/trainer/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn 2 | google-cloud-bigquery 3 | joblib 4 | pandas 5 | google-cloud-storage -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/trainer/train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
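# train.py: train_model() fits a scikit-learn DecisionTreeClassifier on the
# prepared data, prints the accuracy on the held-out split, serializes the
# model with joblib and uploads it to "{AIP_STORAGE_URI}/model_dir/model.joblib"
# in Cloud Storage. AIP_STORAGE_URI is expected to be set in the container
# environment; the custom pipeline's worker pool spec (see kf_pipe_custom.py)
# injects it.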
14 | 15 | from sklearn.tree import DecisionTreeClassifier 16 | from sklearn.metrics import roc_curve 17 | from sklearn.model_selection import train_test_split 18 | from google.cloud import bigquery 19 | from google.cloud import storage 20 | from joblib import dump 21 | 22 | import os 23 | import pandas as pd 24 | 25 | def train_model(data, labels, test_data, test_labels, storage_client): 26 | 27 | # Define and train the Scikit model 28 | skmodel = DecisionTreeClassifier() 29 | skmodel.fit(data, labels) 30 | score = skmodel.score(test_data, test_labels) 31 | print('accuracy is:',score) 32 | 33 | # Storage location 34 | model_directory = f"{os.environ['AIP_STORAGE_URI']}/model_dir" 35 | storage_path = os.path.join(model_directory, "model.joblib") 36 | 37 | # Save the model to a local file 38 | dump(skmodel, 'model.joblib') 39 | 40 | blob = storage.blob.Blob.from_string(storage_path, client=storage_client) 41 | blob.upload_from_filename("model.joblib") 42 | 43 | return(skmodel) -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | ENV APP_HOME /app 8 | WORKDIR $APP_HOME 9 | COPY . ./ 10 | COPY ./requirements.txt ./ 11 | 12 | # Install production dependencies. 13 | RUN pip install --upgrade pip 14 | RUN pip install --no-cache-dir -r requirements.txt 15 | 16 | # Run the web service on container startup. Here we use the gunicorn 17 | # webserver, with one worker process and 8 threads. 18 | # For environments with multiple CPU cores, increase the number of workers 19 | # to be equal to the cores available. 20 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 21 | CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service/README.md: -------------------------------------------------------------------------------- 1 | The processing service defines a Cloud Run Service to process each incoming datapoint. 2 | 3 | 4 | main.py defines the public facing webserver to listen for requests. 5 | 6 | synth_data_stream.py creates a synthetic data stream of events randomly chosen from the datalayer. It also randomly includes anomalies in the data. 7 | 8 | 9 | Command to start data stream: 10 | python3 synth_data_stream.py --endpoint {Pub/Sub endpoint link}' -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = '' 16 | location = 'europe-west1' 17 | bq_dataset = 'ecommerce_sink' 18 | bq_table = 'cloud_run' 19 | bq_table_anomaly = 'cloud_run_anomaly' 20 | endpoind_id = '' 21 | -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import time 17 | import base64 18 | import json 19 | import datetime 20 | import config 21 | 22 | from flask import Flask, request 23 | 24 | from google.cloud import bigquery, aiplatform 25 | 26 | 27 | app = Flask(__name__) 28 | 29 | 30 | @app.route("/hw", methods=['GET', 'POST']) 31 | def hello_world(): 32 | world = request.args.get('world') 33 | return f"Hello {world}!" 34 | 35 | 36 | @app.route("/", methods=["POST"]) 37 | def index(): 38 | envelope = request.get_json() 39 | print(envelope) 40 | print(type(envelope)) 41 | 42 | if not envelope: 43 | msg = "no Pub/Sub message received" 44 | print(f"error: {msg}") 45 | return f"Bad Request: {msg}", 400 46 | 47 | ps_message = envelope['message'] 48 | 49 | record = base64.b64decode(ps_message["data"]).decode("utf-8").strip() 50 | record = json.loads(record) 51 | 52 | record["weekday"] = datetime.datetime.strptime(record["event_datetime"], "%Y-%m-%d %H:%M:%S").strftime('%A') 53 | 54 | rows_to_insert = [record] 55 | 56 | client = bigquery.Client(project=config.project_id, location=config.location) 57 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table 58 | 59 | errors = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 60 | 61 | 62 | # Create record that includes anomaly detection inference. 
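    # For purchase events the tax, shipping and value fields are sent to the
    # Vertex AI endpoint (config.endpoind_id) serving the exported BQML k-means
    # model. The nearest_centroid_id returned in the prediction is mapped to an
    # anomaly flag below (centroid 1 -> True, centroid 2 -> False); which
    # centroid ends up representing the anomalous cluster depends on the
    # individual training run.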
63 | if record["event"] == "purchase": 64 | record_to_predict = [ 65 | {"tax": record["ecommerce"]["purchase"]["tax"], 66 | "shipping": record["ecommerce"]["purchase"]["shipping"], 67 | "value":record["ecommerce"]["purchase"]["value"]} 68 | ] 69 | 70 | aiplatform.init(project=config.project_id, location=config.location) 71 | 72 | endpoint = aiplatform.Endpoint( 73 | endpoint_name=f"projects/{config.project_id}/locations/{config.location}/endpoints/{config.endpoind_id}", 74 | project = config.project_id, 75 | location=config.location, 76 | ) 77 | 78 | endpoint_response = endpoint.predict( 79 | instances=record_to_predict 80 | ) 81 | 82 | centroid = endpoint_response.predictions[0]["nearest_centroid_id"][0] 83 | 84 | if centroid == 1: 85 | anomaly = True 86 | if centroid == 2: 87 | anomaly = False 88 | 89 | print(anomaly) 90 | 91 | anomaly_record = {"tax": record["ecommerce"]["purchase"]["tax"], "shipping": record["ecommerce"]["purchase"]["shipping"], "value":record["ecommerce"]["purchase"]["value"], "anomaly": anomaly} 92 | 93 | rows_to_insert = [anomaly_record] 94 | 95 | client = bigquery.Client(project=config.project_id, location=config.location) 96 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table_anomaly 97 | 98 | errors_an = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 99 | 100 | 101 | if errors_an == []: 102 | print(f"{time.time()} New rows with prediction have been added.") 103 | return ("", 204) 104 | else: 105 | print("Encountered errors while inserting rows: {}".format(errors)) 106 | return f"Bad Request: {envelope}", 400 107 | 108 | if errors == []: 109 | print(f"{time.time()} New rows have been added.") 110 | return ("", 204) 111 | else: 112 | print("Encountered errors while inserting rows: {}".format(errors)) 113 | return f"Bad Request: {envelope}", 400 114 | 115 | 116 | if __name__ == "__main__": 117 | app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) 118 | 119 | -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery 6 | google-cloud-aiplatform 7 | scikit-learn -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service_custom/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | ENV APP_HOME /app 8 | WORKDIR $APP_HOME 9 | COPY . ./ 10 | COPY ./requirements.txt ./ 11 | 12 | # Install production dependencies. 13 | RUN pip install --upgrade pip 14 | RUN pip install --no-cache-dir -r requirements.txt 15 | 16 | # Run the web service on container startup. Here we use the gunicorn 17 | # webserver, with one worker process and 8 threads. 18 | # For environments with multiple CPU cores, increase the number of workers 19 | # to be equal to the cores available. 20 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 
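# Cloud Run injects the PORT environment variable at runtime (8080 by default),
# so the CMD below binds gunicorn to whichever port the platform assigns.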
21 | CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service_custom/README.md: -------------------------------------------------------------------------------- 1 | The processing service defines a Cloud Run Service to process each incoming datapoint. 2 | 3 | 4 | main.py defines the public facing webserver to listen for requests. 5 | 6 | synth_data_stream.py creates a synthetic data stream of events randomly chosen from the datalayer. It also randomly includes anomalies in the data. 7 | 8 | 9 | Command to start data stream: 10 | python3 synth_data_stream.py --endpoint {Pub/Sub endpoint link}' -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service_custom/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = '' 16 | location = 'europe-west1' 17 | bq_dataset = 'ecommerce_sink' 18 | bq_table = 'cloud_run' 19 | bq_table_anomaly = 'cloud_run_anomaly_custom' 20 | endpoind_id = '' 21 | -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service_custom/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import time 17 | import base64 18 | import json 19 | import datetime 20 | import config 21 | 22 | from flask import Flask, request 23 | 24 | from google.cloud import bigquery, aiplatform 25 | 26 | 27 | app = Flask(__name__) 28 | 29 | 30 | @app.route("/hw", methods=['GET', 'POST']) 31 | def hello_world(): 32 | world = request.args.get('world') 33 | return f"Hello {world}!" 
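# index() below handles Pub/Sub push deliveries: it base64-decodes the message
# payload, adds a "weekday" field, streams the raw event into BigQuery and, for
# purchase events, calls the custom-trained model's Vertex AI endpoint. Unlike
# the BQML variant, the custom predictor returns the anomaly label directly as
# endpoint_response.predictions[0].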
34 | 35 | 36 | @app.route("/", methods=["POST"]) 37 | def index(): 38 | envelope = request.get_json() 39 | print(envelope) 40 | print(type(envelope)) 41 | 42 | if not envelope: 43 | msg = "no Pub/Sub message received" 44 | print(f"error: {msg}") 45 | return f"Bad Request: {msg}", 400 46 | 47 | ps_message = envelope['message'] 48 | 49 | record = base64.b64decode(ps_message["data"]).decode("utf-8").strip() 50 | record = json.loads(record) 51 | 52 | record["weekday"] = datetime.datetime.strptime(record["event_datetime"], "%Y-%m-%d %H:%M:%S").strftime('%A') 53 | 54 | rows_to_insert = [record] 55 | 56 | client = bigquery.Client(project=config.project_id, location=config.location) 57 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table 58 | 59 | errors = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 60 | 61 | 62 | # Create record that includes anomaly detection inference. 63 | if record["event"] == "purchase": 64 | record_to_predict = [ 65 | {"tax": record["ecommerce"]["purchase"]["tax"], 66 | "shipping": record["ecommerce"]["purchase"]["shipping"], 67 | "value":record["ecommerce"]["purchase"]["value"]} 68 | ] 69 | 70 | aiplatform.init(project=config.project_id, location=config.location) 71 | 72 | endpoint = aiplatform.Endpoint( 73 | endpoint_name=f"projects/{config.project_id}/locations/{config.location}/endpoints/{config.endpoind_id}", 74 | project = config.project_id, 75 | location=config.location, 76 | ) 77 | 78 | endpoint_response = endpoint.predict( 79 | instances=record_to_predict 80 | ) 81 | 82 | anomaly = endpoint_response.predictions[0] 83 | 84 | anomaly_record = {"tax": record["ecommerce"]["purchase"]["tax"], "shipping": record["ecommerce"]["purchase"]["shipping"], "value":record["ecommerce"]["purchase"]["value"], "anomaly": anomaly} 85 | 86 | rows_to_insert = [anomaly_record] 87 | 88 | client = bigquery.Client(project=config.project_id, location=config.location) 89 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table_anomaly 90 | 91 | errors_an = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 92 | 93 | 94 | if errors_an == []: 95 | print(f"{time.time()} New rows with prediction have been added.") 96 | return ("", 204) 97 | else: 98 | print("Encountered errors while inserting rows: {}".format(errors)) 99 | return f"Bad Request: {envelope}", 400 100 | 101 | if errors == []: 102 | print(f"{time.time()} New rows have been added.") 103 | return ("", 204) 104 | else: 105 | print("Encountered errors while inserting rows: {}".format(errors)) 106 | return f"Bad Request: {envelope}", 400 107 | 108 | 109 | if __name__ == "__main__": 110 | app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) 111 | 112 | -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service_custom/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery 6 | google-cloud-aiplatform 7 | scikit-learn -------------------------------------------------------------------------------- /02_activate/22_solution/kf_pipe.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import time 16 | from typing import Iterable, Dict, NamedTuple 17 | 18 | import config 19 | 20 | # import kfp 21 | from kfp.v2 import compiler, dsl 22 | from kfp.v2.dsl import component 23 | from kfp.v2.components import importer_node 24 | import google.cloud.aiplatform as aip 25 | from google_cloud_pipeline_components import aiplatform as gcc_aip 26 | 27 | from google_cloud_pipeline_components.v1 import endpoint, bigquery 28 | from google_cloud_pipeline_components.types.artifact_types import VertexModel, VertexEndpoint, UnmanagedContainerModel 29 | 30 | 31 | 32 | # TODO: Check for resources & create if needed before pipeline 33 | 34 | def compile_pipe(): 35 | # Define the workflow of the pipeline. 36 | @dsl.pipeline( 37 | name="anomaly-detection-test", 38 | pipeline_root=config.PIPELINE_ROOT_PATH) 39 | 40 | def pipeline(project_id: str, region: str, timestamp_id: str, artifact_staging_location:str): 41 | 42 | aip.init(project=config.GCP_PROJECT, location=config.GCP_REGION) 43 | 44 | bqml_query = f""" 45 | CREATE OR REPLACE MODEL 46 | `{config.GCP_PROJECT}.ecommerce_sink.anomaly_detection` 47 | OPTIONS 48 | ( MODEL_TYPE='KMEANS', 49 | NUM_CLUSTERS=2 ) AS 50 | SELECT 51 | ecommerce.purchase.tax AS tax, 52 | ecommerce.purchase.shipping AS shipping, 53 | ecommerce.purchase.value AS value 54 | FROM `{config.GCP_PROJECT}.ecommerce_sink.cloud_run` 55 | WHERE event='purchase' 56 | ; 57 | """ 58 | 59 | bqml_model = bigquery.BigqueryCreateModelJobOp( 60 | project=project_id, 61 | location=region, 62 | query=bqml_query 63 | ) 64 | 65 | bq_export = bigquery.BigqueryExportModelJobOp( 66 | project=project_id, 67 | location=region, 68 | model=bqml_model.outputs["model"], 69 | model_destination_path=f"{config.PIPELINE_ROOT_PATH}/bq_model-artifacts" 70 | ) 71 | 72 | import_unmanaged_model_task = importer_node.importer( 73 | artifact_uri=f"{config.PIPELINE_ROOT_PATH}/bq_model-artifacts", 74 | artifact_class=UnmanagedContainerModel, 75 | metadata={ 76 | "containerSpec": { 77 | "imageUri": "europe-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-9:latest", 78 | }, 79 | }, 80 | ).after(bq_export) 81 | 82 | model_upload = gcc_aip.ModelUploadOp( 83 | project=project_id, 84 | location=region, 85 | display_name=f"anomaly_detection_{timestamp_id}", 86 | unmanaged_container_model=import_unmanaged_model_task.output, 87 | ) 88 | 89 | # endpoint_uri = "https://europe-west1-aiplatform.googleapis.com/v1/projects/37042627607/locations/europe-west1/endpoints/2381190342041927680" 90 | # endpoint = dsl.importer( 91 | # artifact_uri=endpoint_uri, 92 | # artifact_class=VertexEndpoint, 93 | # metadata={ 94 | # "resourceName": "projects/37042627607/locations/europe-west1/endpoints/2381190342041927680" 95 | # } 96 | # ) 97 | 98 | new_endpoint = endpoint.EndpointCreateOp( 99 | project=config.GCP_PROJECT, 100 | location=config.GCP_REGION, 101 | display_name=f'hyp_inference{int(time.time())}', 102 | # network='terraform-network' 103 | ) 104 | 105 | # Deploy models on endpoint 106 | _ = gcc_aip.ModelDeployOp( 107 | model=model_upload.outputs["model"], 108 | 
endpoint=new_endpoint.outputs["endpoint"], 109 | dedicated_resources_min_replica_count=1, 110 | dedicated_resources_max_replica_count=1, 111 | dedicated_resources_machine_type=config.MACHINE_TYPE, 112 | traffic_split={"0": 100} 113 | ) 114 | 115 | compiler.Compiler().compile(pipeline_func=pipeline, package_path="hyp-anomaly-detection.json") 116 | 117 | 118 | if __name__ == "__main__": 119 | # Initialize aiplatform credentials. 120 | aip.init(project=config.GCP_PROJECT, location=config.GCP_REGION) 121 | 122 | # Compile pipeline code. 123 | compile_pipe() 124 | 125 | # Unique ident for pipeline run 126 | timestamp_id = str(int(time.time())) 127 | 128 | # Prepare the pipeline job. 129 | job = aip.PipelineJob( 130 | display_name=f"{timestamp_id}-hyp-anomaly-detection", 131 | template_path="hyp-anomaly-detection.json", 132 | pipeline_root=config.PIPELINE_ROOT_PATH, 133 | parameter_values={ 134 | 'project_id': config.GCP_PROJECT, 135 | 'region': config.GCP_REGION, 136 | 'timestamp_id': timestamp_id, 137 | 'artifact_staging_location': config.PIPELINE_ROOT_PATH 138 | } 139 | ) 140 | 141 | job.submit(service_account=config.SERVICE_ACCOUNT) 142 | -------------------------------------------------------------------------------- /02_activate/22_solution/kf_pipe_custom.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import time 16 | from typing import Iterable, Dict, NamedTuple 17 | 18 | import config_custom 19 | 20 | # import kfp 21 | from kfp.v2 import compiler, dsl 22 | from kfp.v2.dsl import component 23 | from kfp.v2.components import importer_node 24 | import google.cloud.aiplatform as aip 25 | from google_cloud_pipeline_components import aiplatform as gcc_aip 26 | 27 | from google_cloud_pipeline_components.types import artifact_types 28 | from google_cloud_pipeline_components.v1.custom_job import CustomTrainingJobOp 29 | 30 | ## Training Worker Specs 31 | WORKER_POOL_SPECS = [ 32 | { 33 | "machine_spec": { 34 | "machine_type": "n1-standard-4" 35 | }, 36 | "replica_count": "1", 37 | "container_spec": { 38 | "image_uri": config_custom.TRAIN_IMAGE_URI, 39 | "env": [ 40 | { 41 | "name": "AIP_STORAGE_URI", 42 | "value": config_custom.AIP_STORAGE_URI 43 | }, 44 | ] 45 | } 46 | } 47 | ] 48 | 49 | def compile_pipe(): 50 | # Define the workflow of the pipeline. 
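    # Pipeline stages, in order: CustomTrainingJobOp runs the trainer container
    # described by WORKER_POOL_SPECS; importer_node wraps the resulting model
    # artifacts together with the custom prediction container (predict route
    # /predict, health route /health_check); ModelUploadOp registers the model;
    # EndpointCreateOp and ModelDeployOp then serve it on a single
    # n1-standard-2 replica receiving 100% of the traffic.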
51 | @dsl.pipeline( 52 | name="anomaly-detection-custom-test", 53 | pipeline_root=config_custom.PIPELINE_ROOT_PATH) 54 | 55 | def pipeline( 56 | project_id: str, 57 | region: str, 58 | timestamp_id: str, 59 | artifact_staging_location:str, 60 | bq_source: str, 61 | aip_storage_uri: str, 62 | predict_image_uri: str 63 | ): 64 | 65 | # Model training 66 | train_job = CustomTrainingJobOp( 67 | display_name="pipeline-anomaly-custom-train", 68 | project=project_id, 69 | location=region, 70 | worker_pool_specs=WORKER_POOL_SPECS 71 | ) 72 | 73 | # Model evaluation 74 | # Ideally here you can evaluate the model and decide on deployment/or not for CI/CD purposes 75 | # example: https://www.cloudskillsboost.google/focuses/21234?parent=catalog 76 | 77 | # Import with the custom predict container 78 | import_unmanaged_model_op = importer_node.importer( 79 | artifact_uri=aip_storage_uri, 80 | artifact_class=artifact_types.UnmanagedContainerModel, 81 | metadata={ 82 | "containerSpec": { 83 | "imageUri": predict_image_uri, 84 | "env": [ 85 | { 86 | "name": "PROJECT_ID", 87 | "value": project_id}, 88 | ], 89 | "predictRoute": "/predict", 90 | "healthRoute": "/health_check", 91 | "ports": [ 92 | { 93 | "containerPort": 8080 94 | } 95 | ] 96 | }, 97 | }, 98 | ).after(train_job) 99 | 100 | # Upload the model into the registry 101 | custom_model_upload_job = gcc_aip.ModelUploadOp( 102 | project=project_id, 103 | location=region, 104 | display_name=f"anomaly-detection-custom-model_{timestamp_id}", 105 | unmanaged_container_model=import_unmanaged_model_op.outputs["artifact"], 106 | ).after(import_unmanaged_model_op) 107 | 108 | # Create an endpoint where the model will be deployed 109 | endpoint_create_job = gcc_aip.EndpointCreateOp( 110 | project=project_id, 111 | display_name="anomaly-detection-custom-endpoint", 112 | location=region 113 | ) 114 | 115 | # Deploy the model on the endpoint 116 | _ = gcc_aip.ModelDeployOp( 117 | model=custom_model_upload_job.outputs["model"], 118 | endpoint=endpoint_create_job.outputs["endpoint"], 119 | deployed_model_display_name="anomaly-detection-custom-deploy", 120 | dedicated_resources_min_replica_count=1, 121 | dedicated_resources_max_replica_count=1, 122 | dedicated_resources_machine_type="n1-standard-2", 123 | traffic_split={"0": 100} 124 | ) 125 | 126 | compiler.Compiler().compile(pipeline_func=pipeline, package_path="hyp-custom-anomaly-detection.json") 127 | 128 | if __name__ == "__main__": 129 | # Initialize aiplatform credentials. 130 | aip.init(project=config_custom.PROJECT_ID, location=config_custom.REGION) 131 | 132 | # Compile pipeline code. 133 | compile_pipe() 134 | 135 | # Unique ident for pipeline run 136 | timestamp_id = str(int(time.time())) 137 | 138 | # Prepare the pipeline job. 
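    # The compiled JSON template is submitted as a Vertex AI PipelineJob, with
    # the values from config_custom.py passed in as runtime parameters, and
    # job.submit() runs it under the configured service account. Typical
    # invocation, assuming config_custom.py is filled in and the trainer and
    # prediction images have been pushed: python3 kf_pipe_custom.py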
139 | job = aip.PipelineJob( 140 | display_name=f"{timestamp_id}-hyp-custom-anomaly-detection", 141 | template_path="hyp-custom-anomaly-detection.json", 142 | pipeline_root=config_custom.PIPELINE_ROOT_PATH, 143 | parameter_values={ 144 | 'project_id': config_custom.PROJECT_ID, 145 | 'region': config_custom.REGION, 146 | 'timestamp_id': timestamp_id, 147 | 'bq_source': config_custom.DATA_URI, 148 | 'aip_storage_uri' : config_custom.AIP_STORAGE_URI, 149 | 'predict_image_uri' : config_custom.PREDICT_IMAGE_URI, 150 | 'artifact_staging_location': config_custom.PIPELINE_ROOT_PATH 151 | } 152 | ) 153 | 154 | job.submit(service_account=config_custom.SERVICE_ACCOUNT) 155 | -------------------------------------------------------------------------------- /02_activate/22_solution/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | terraform { 18 | required_providers { 19 | google = { 20 | source = "hashicorp/google" 21 | version = "4.32.0" 22 | } 23 | } 24 | } 25 | 26 | provider "google" { 27 | project = var.project_id 28 | region = var.gcp_region 29 | } 30 | 31 | data "google_project" "project" { 32 | } 33 | 34 | # Enabling APIs 35 | resource "google_project_service" "aiplatform" { 36 | service = "aiplatform.googleapis.com" 37 | 38 | disable_on_destroy = false 39 | } 40 | 41 | resource "google_project_service" "storage" { 42 | service = "storage.googleapis.com" 43 | 44 | disable_on_destroy = false 45 | } 46 | 47 | resource "google_project_service" "notebooks" { 48 | service = "notebooks.googleapis.com" 49 | 50 | disable_on_destroy = false 51 | } 52 | 53 | resource "google_project_service" "dataflow" { 54 | service = "dataflow.googleapis.com" 55 | 56 | disable_on_destroy = false 57 | } 58 | 59 | resource "google_project_service" "artifactregistry" { 60 | service = "artifactregistry.googleapis.com" 61 | 62 | disable_on_destroy = false 63 | } 64 | 65 | 66 | # Creating BigQuery Table 67 | resource "google_bigquery_table" "bq_table_run_anomaly" { 68 | dataset_id = "ecommerce_sink" 69 | table_id = "cloud_run_anomaly" 70 | deletion_protection = false 71 | 72 | labels = { 73 | env = "default" 74 | } 75 | 76 | schema = < to see your current agreements on file or 13 | to sign a new one. 14 | 15 | You generally only need to submit a CLA once, so if you've already submitted one 16 | (even if it was for a different project), you probably don't need to do it 17 | again. 18 | 19 | ## Code Reviews 20 | 21 | All submissions, including submissions by project members, require review. We 22 | use GitHub pull requests for this purpose. Consult 23 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 24 | information on using pull requests. 25 | 26 | ## Community Guidelines 27 | 28 | This project follows 29 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/). 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Welcome to Hack Your Pipe! 2 | 3 | In this repo you will find a series of demos and workshops on Data and ML Engineering best practices on GCP. 4 | 5 | For your webshop you will develop an end-to-end data pipeline, from collection, through transformation, to activation of the interaction data. 6 | 7 | ![Hack Your Pipe architecture](./rsc/hyp_architecture.png) 8 | 9 | The workshop is split into two parts: first, ingest and transform; second, activate. 10 | 11 | In both folders you will find the challenge lab and a working sample solution. 12 | Every sample solution comes with the instructions and Terraform scripts needed for replication. 13 | 14 | The challenge labs separate the solution development into interactive steps. 15 | To maximize your learning, aim to solve the challenge labs independently before looking at the solutions. 16 | 17 | ### Good luck and have fun!! -------------------------------------------------------------------------------- /rsc/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/.DS_Store -------------------------------------------------------------------------------- /rsc/cloudrun_processing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/cloudrun_processing.png -------------------------------------------------------------------------------- /rsc/dataflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/dataflow.png -------------------------------------------------------------------------------- /rsc/efficient_pipelines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/efficient_pipelines.png -------------------------------------------------------------------------------- /rsc/hyp_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/hyp_architecture.png -------------------------------------------------------------------------------- /rsc/hyp_ml_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/hyp_ml_architecture.png -------------------------------------------------------------------------------- /rsc/ingestion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/ingestion.png -------------------------------------------------------------------------------- /rsc/pubsub_direct.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/pubsub_direct.png -------------------------------------------------------------------------------- /rsc/pubsub_metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/pubsub_metrics.png --------------------------------------------------------------------------------