├── 01_ingest_and_transform ├── 11_challenge │ ├── README.md │ ├── beam │ │ ├── Dockerfile │ │ ├── beam_processing.py │ │ ├── config.py │ │ └── requirements.txt │ ├── cloud-run-pubsub-proxy │ │ ├── .gitignore │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── app.js │ │ ├── index.js │ │ └── package.json │ ├── config_env.sh │ ├── datalayer │ │ ├── .gitignore │ │ ├── README.md │ │ ├── add_to_cart.json │ │ ├── ecommerce_events_bq_schema.json │ │ ├── purchase.json │ │ ├── purchase_anomaly.json │ │ ├── synth_data_stream.py │ │ └── view_item.json │ ├── main.tf │ ├── processing-service │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── config.py │ │ ├── main.py │ │ └── requirements.txt │ ├── terraform.tfvars │ └── variables.tf └── 12_solution │ ├── README.md │ ├── beam │ ├── Dockerfile │ ├── beam_processing.py │ ├── config.py │ └── requirements.txt │ ├── cloud-run-pubsub-proxy │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ ├── app.js │ ├── index.js │ └── package.json │ ├── config_env.sh │ ├── datalayer │ ├── .gitignore │ ├── README.md │ ├── add_to_cart.json │ ├── ecommerce_events_bq_schema.json │ ├── purchase.json │ ├── purchase_anomaly.json │ ├── synth_data_stream.py │ └── view_item.json │ ├── main.tf │ ├── processing-service │ ├── Dockerfile │ ├── README.md │ ├── config.py │ ├── main.py │ └── requirements.txt │ ├── terraform.tfvars │ └── variables.tf ├── 02_activate ├── 21_challenge │ ├── README.md │ ├── cloud-run-pubsub-proxy │ │ ├── .gitignore │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── app.js │ │ ├── index.js │ │ └── package.json │ ├── config_custom.py │ ├── config_env.sh │ ├── custom_train │ │ ├── prediction │ │ │ ├── Dockerfile │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── main.py │ │ │ └── requirements.txt │ │ └── trainer │ │ │ ├── Dockerfile │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── main.py │ │ │ ├── preprocess.py │ │ │ ├── requirements.txt │ │ │ └── train.py │ ├── datalayer │ │ ├── .gitignore │ │ ├── README.md │ │ ├── add_to_cart.json │ │ ├── ecommerce_events_bq_schema.json │ │ ├── purchase.json │ │ ├── purchase_anomaly.json │ │ ├── synth_data_stream.py │ │ └── view_item.json │ ├── inf_processing_service │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── config.py │ │ ├── main.py │ │ └── requirements.txt │ ├── inf_processing_service_custom │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── config.py │ │ ├── main.py │ │ └── requirements.txt │ ├── kf_pipe_custom.py │ ├── main.tf │ ├── processing-service │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── config.py │ │ ├── main.py │ │ └── requirements.txt │ ├── requirements.txt │ ├── terraform.tfvars │ └── variables.tf └── 22_solution │ ├── README.md │ ├── config.py │ ├── config_custom.py │ ├── config_env.sh │ ├── custom_train │ ├── prediction │ │ ├── Dockerfile │ │ ├── __init__.py │ │ ├── config.py │ │ ├── main.py │ │ └── requirements.txt │ └── trainer │ │ ├── Dockerfile │ │ ├── __init__.py │ │ ├── config.py │ │ ├── main.py │ │ ├── preprocess.py │ │ ├── requirements.txt │ │ └── train.py │ ├── inf_processing_service │ ├── Dockerfile │ ├── README.md │ ├── config.py │ ├── main.py │ └── requirements.txt │ ├── inf_processing_service_custom │ ├── Dockerfile │ ├── README.md │ ├── config.py │ ├── main.py │ └── requirements.txt │ ├── kf_pipe.py │ ├── kf_pipe_custom.py │ ├── main.tf │ ├── requirements.txt │ ├── terraform.tfvars │ └── variables.tf ├── CONTRIBUTING.md ├── LICENSE ├── README.md └── rsc ├── .DS_Store ├── cloudrun_processing.png ├── dataflow.png ├── efficient_pipelines.png ├── hyp_architecture.png ├── hyp_ml_architecture.png ├── ingestion.png 
├── pubsub_direct.png └── pubsub_metrics.png /01_ingest_and_transform/11_challenge/beam/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/dataflow-templates-base/python39-template-launcher-base:latest as template_launcher 2 | 3 | ARG WORKDIR=/dataflow/template 4 | RUN mkdir -p ${WORKDIR} 5 | WORKDIR ${WORKDIR} 6 | 7 | # Copy local code to the container image. 8 | COPY . ./ 9 | COPY ./requirements.txt ./ 10 | COPY ./beam_processing.py ./ 11 | 12 | # Flex Template ENV Vars 13 | ENV FLEX_TEMPLATE_PYTHON_PY_FILE="${WORKDIR}/beam_processing.py" 14 | ENV FLEX_TEMPLATE_PYTHON_REQUIREMENTS_FILE="${WORKDIR}/requirements.txt" 15 | 16 | # Install requirements 17 | RUN pip install --upgrade pip 18 | RUN pip install --no-cache-dir -r requirements.txt 19 | 20 | ENTRYPOINT ["/opt/google/dataflow/python_template_launcher"] -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/beam/beam_processing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import json 16 | import time 17 | 18 | import config 19 | 20 | import apache_beam as beam 21 | from apache_beam.options.pipeline_options import PipelineOptions 22 | from apache_beam.transforms import trigger 23 | from apache_beam.io.gcp.pubsub import ReadFromPubSub 24 | from apache_beam.io.gcp.bigquery import BigQueryDisposition, WriteToBigQuery 25 | from apache_beam.runners import DataflowRunner 26 | 27 | # Defining event filter functions. 28 | 29 | 30 | def is_item_view(event): 31 | return event['event'] == 'view_item' 32 | 33 | 34 | def is_add_to_cart(event): 35 | return event['event'] == 'add_to_cart' 36 | 37 | 38 | def is_purchase(event): 39 | return event['event'] == 'purchase' 40 | 41 | 42 | class ExtractValueFn(beam.DoFn): 43 | def process(self, element): 44 | print(f"ExtractValueFn: {element['ecommerce']['purchase']['value']}") 45 | return [element['ecommerce']['purchase']['value']] 46 | 47 | 48 | class ExtractAndSumValue(beam.PTransform): 49 | """A transform to extract key/score information and sum the scores. 50 | The constructor argument `field` determines whether 'team' or 'user' info is 51 | extracted. 52 | """ 53 | 54 | def expand(self, pcoll): 55 | sum_val = ( 56 | pcoll 57 | | beam.Map(lambda elem: (elem['user_id'], elem['ecommerce']['purchase']['value'])) 58 | | '' 59 | return(sum_val) 60 | 61 | 62 | class FormatByRow(beam.PTransform): 63 | """A transform to reformat the data to column name/value format. 
64 | """ 65 | 66 | def expand(self, pcoll): 67 | row_val = ( 68 | pcoll 69 | | beam.Map(lambda elem: {'user_id': elem[0], 70 | 'summed_value': elem[1] 71 | }) 72 | ) 73 | return(row_val) 74 | 75 | 76 | def streaming_pipeline(project, region): 77 | 78 | subscription = "projects/{}/subscriptions/hyp_subscription_dataflow".format( 79 | project) 80 | 81 | bucket = "gs://{}-ecommerce-events/tmp_dir".format(project) 82 | 83 | # Defining pipeline options. 84 | options = PipelineOptions( 85 | streaming=True, 86 | project=project, 87 | region=region, 88 | staging_location="%s/staging" % bucket, 89 | temp_location="%s/temp" % bucket, 90 | subnetwork='regions/europe-west1/subnetworks/terraform-network', 91 | service_account_email='retailpipeline-hyp@{}.iam.gserviceaccount.com'.format( 92 | project), 93 | max_num_workers=1 94 | ) 95 | 96 | # Defining pipeline. 97 | p = beam.Pipeline(DataflowRunner(), options=options) 98 | 99 | # Receiving message from Pub/Sub & parsing json from string. 100 | json_message = (p 101 | # Listining to Pub/Sub. 102 | | "Read Topic" >> '' 103 | # Parsing json from message string. 104 | | "Parse json" >> beam.Map(json.loads) 105 | ) 106 | 107 | # Extracting Item Views. 108 | item_views = (json_message 109 | | 'Filter for item views' >> beam.Filter(is_item_view) 110 | | "item view row" >> beam.Map(lambda input: {'event_datetime': input['event_datetime'], # Dropping and renaming columns. 111 | 'event': input['event'], 112 | 'user_id': input['user_id'], 113 | 'client_id': input['client_id'], 114 | 'page': input['page'], 115 | 'page_previous': input['page_previous'], 116 | "item_name": input['ecommerce']['items'][0]["item_name"], 117 | "item_id": input['ecommerce']['items'][0]["item_id"], 118 | "price": input['ecommerce']['items'][0]["price"], 119 | "item_brand": input['ecommerce']['items'][0]["item_brand"], 120 | "item_category": input['ecommerce']['items'][0]["item_category"], 121 | "item_category_2": input['ecommerce']['items'][0]["item_category_2"], 122 | "item_category_3": input['ecommerce']['items'][0]["item_category_3"], 123 | "item_category_4": input['ecommerce']['items'][0]["item_category_4"], 124 | "item_variant": input['ecommerce']['items'][0]["item_variant"], 125 | "item_list_name": input['ecommerce']['items'][0]["item_list_name"], 126 | "item_list_id": input['ecommerce']['items'][0]["item_list_id"], 127 | "quantity": input['ecommerce']['items'][0]["quantity"] 128 | }) 129 | ) 130 | 131 | fixed_windowed_items = (json_message 132 | | 'Filter for purchase' >> beam.Filter(is_purchase) 133 | | 'Global Window' >> '' 134 | | 'ExtractAndSumValue' >> ExtractAndSumValue() 135 | | 'FormatByRow' >> FormatByRow() 136 | ) 137 | 138 | # Writing summed values to BigQuery 139 | aggregated_schema = "user_id:STRING, summed_value:FLOAT" 140 | aggregated_table = "{}:ecommerce_sink.beam_aggregated".format(project) 141 | 142 | fixed_windowed_items | "Write Summed Values To BigQuery" >> WriteToBigQuery(table=aggregated_table, schema=aggregated_schema, 143 | create_disposition=BigQueryDisposition.CREATE_IF_NEEDED, 144 | write_disposition=BigQueryDisposition.WRITE_APPEND) 145 | 146 | # Writing the PCollections to two differnt BigQuery tables. 
147 | item_views_table = "{}:ecommerce_sink.beam_item_views".format(project) 148 | schema = "event_datetime:DATETIME, event:STRING, user_id:STRING, client_id:STRING, page:STRING, page_previous:STRING, " \ 149 | "item_name:STRING, item_id:STRING, price:STRING, item_brand:STRING, item_category:STRING, item_category_2:STRING, item_category_3:STRING, " \ 150 | "item_category_4:STRING, item_variant:STRING, item_list_name:STRING, item_list_id:STRING, quantity:STRING" 151 | 152 | item_views | "Write Items Views To BigQuery" >> WriteToBigQuery(table=item_views_table, schema=schema, 153 | create_disposition=BigQueryDisposition.CREATE_IF_NEEDED, 154 | write_disposition=BigQueryDisposition.WRITE_APPEND) 155 | 156 | return p.run() 157 | 158 | 159 | if __name__ == '__main__': 160 | GCP_PROJECT = config.project_id 161 | GCP_REGION = config.location 162 | 163 | streaming_pipeline(project=GCP_PROJECT, region=GCP_REGION) 164 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/beam/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = '' 16 | location = 'europe-west1' 17 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/beam/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam 2 | google-apitools -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/cloud-run-pubsub-proxy/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/cloud-run-pubsub-proxy/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official lightweight Node.js 12 image. 2 | # https://hub.docker.com/_/node 3 | FROM node:12-slim 4 | 5 | # Create and change to the app directory. 6 | WORKDIR /usr/src/app 7 | 8 | # Copy application dependency manifests to the container image. 9 | # A wildcard is used to ensure both package.json AND package-lock.json are copied. 10 | # Copying this separately prevents re-running npm install on every code change. 11 | COPY package*.json ./ 12 | 13 | # Install production dependencies. 14 | RUN npm install --only=production 15 | 16 | # Copy local code to the container image. 17 | COPY . ./ 18 | 19 | # Run the web service on container startup. 
20 | CMD [ "npm", "start" ] -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/cloud-run-pubsub-proxy/README.md: -------------------------------------------------------------------------------- 1 | Cloud Run Proxy is a express webserver that listenes to incoming requests and publishes them to a chosen Pub/Sub topic. -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/cloud-run-pubsub-proxy/app.js: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | const express = require('express'); 16 | const bodyParser = require('body-parser'); 17 | const app = express(); 18 | 19 | app.use(bodyParser.json()); 20 | 21 | app.get('/', (req, res) => { 22 | console.log('Hello world received a request.'); 23 | 24 | const target = process.env.TARGET || 'World'; 25 | res.send(`Hello ${target}!`); 26 | }); 27 | 28 | app.post('/json', (req, res) => { 29 | const dataLayer = JSON.stringify(req.body) 30 | console.log(`proxy POST request received dataLayer: ${dataLayer}`) 31 | 32 | const {PubSub} = require('@google-cloud/pubsub'); 33 | 34 | // Instantiates a client 35 | const pubsub = new PubSub(); 36 | 37 | const {Buffer} = require('safe-buffer'); 38 | 39 | // Set Pub/Sub topic name 40 | let topicName = 'hyp-pubsub-topic'; 41 | 42 | // References an existing topic 43 | const topic = pubsub.topic(topicName); 44 | 45 | // Publishes the message as a string, 46 | const dataBuffer = Buffer.from(dataLayer); 47 | 48 | // Add two custom attributes, origin and username, to the message 49 | const customAttributes = { 50 | origin: 'gtm-cloud-run', 51 | username: 'gcp-demo', 52 | }; 53 | 54 | // Publishes a message to Pub/Sub 55 | return topic 56 | .publishMessage({data: dataBuffer}) 57 | .then(() => res.status(200).send(`{"message": "pubsub message sent: ${dataBuffer}"}`)) 58 | .catch(err => { 59 | console.error(err); 60 | res.status(500).send(err); 61 | return Promise.reject(err); 62 | }); 63 | }) 64 | 65 | 66 | module.exports = app; 67 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/cloud-run-pubsub-proxy/index.js: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | const app = require('./app.js'); 16 | const PORT = process.env.PORT || 8080; 17 | 18 | app.listen(PORT, () => console.log(`pubsub proxy app listening on port ${PORT}`)); 19 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/cloud-run-pubsub-proxy/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pubsub-proxy", 3 | "version": "1.0.0", 4 | "description": "Cloud Run app to send messages to Pub/Sub topic using Node", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "node index.js" 8 | }, 9 | "author": "", 10 | "license": "Apache-2.0", 11 | "dependencies": { 12 | "express": "^4.17.1", 13 | "body-parser": "^1.19.0", 14 | "@google-cloud/pubsub": "^3.3.0", 15 | "safe-buffer": "5.1.2" 16 | } 17 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/config_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export GCP_PROJECT="" 4 | export ENDPOINT_URL="" # doesn't need to be defined in the very beginning 5 | export PUSH_ENDPOINT='' # doesn't need to be defined in the very beginning 6 | export GCP_REGION=europe-west1 7 | export RUN_PROXY_DIR=cloud-run-pubsub-proxy 8 | export RUN_PROCESSING_DIR=processing-service 9 | export DATAFLOW_TEMPLATE=beam 10 | export RUN_INFERENCE_PROCESSING_SERVICE=inf_processing_service 11 | 12 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/datalayer/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/datalayer/README.md: -------------------------------------------------------------------------------- 1 | Datalayer defines the json events that could occur to be fed into the pipeline. 
2 | 3 | Four types of events are included: 4 | * add to cart 5 | * made purchase 6 | * made purcase with anomaly (artifical mistake in data to be identified later) 7 | * view item -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/datalayer/add_to_cart.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "add_to_cart", 4 | "user_id": "UID00003", 5 | "client_id": "CID00003", 6 | "page":"/product-67890", 7 | "page_previous": "/category-tshirts", 8 | "ecommerce": { 9 | "items": [{ 10 | "item_name": "Donut Friday Scented T-Shirt", 11 | "item_id": "67890", 12 | "price": 33.75, 13 | "item_brand": "Google", 14 | "item_category": "Apparel", 15 | "item_category_2": "Mens", 16 | "item_category_3": "Shirts", 17 | "item_category_4": "Tshirts", 18 | "item_variant": "Black", 19 | "item_list_name": "Search Results", 20 | "item_list_id": "SR123", 21 | "index": 1, 22 | "quantity": 2 23 | }] 24 | } 25 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/datalayer/ecommerce_events_bq_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "event_datetime", 4 | "type": "TIMESTAMP", 5 | "mode": "NULLABLE" 6 | }, 7 | { 8 | "name": "event", 9 | "type": "STRING", 10 | "mode": "REQUIRED" 11 | }, 12 | { 13 | "name": "user_id", 14 | "type": "STRING", 15 | "mode": "REQUIRED" 16 | }, 17 | { 18 | "name": "client_id", 19 | "type": "STRING", 20 | "mode": "NULLABLE" 21 | }, 22 | { 23 | "name": "page", 24 | "type": "STRING", 25 | "mode": "NULLABLE" 26 | }, 27 | { 28 | "name": "page_previous", 29 | "type": "STRING", 30 | "mode": "NULLABLE" 31 | }, 32 | { 33 | "name": "ecommerce", 34 | "type": "RECORD", 35 | "mode": "NULLABLE", 36 | "fields": [ 37 | { 38 | "mode": "REPEATED", 39 | "name": "items", 40 | "type": "RECORD", 41 | "fields": [ 42 | { 43 | "mode": "NULLABLE", 44 | "name": "index", 45 | "type": "INTEGER" 46 | }, 47 | 48 | { 49 | "mode": "NULLABLE", 50 | "name": "item_id", 51 | "type": "INTEGER" 52 | }, 53 | { 54 | "mode": "NULLABLE", 55 | "name": "item_name", 56 | "type": "STRING" 57 | }, 58 | { 59 | "mode": "NULLABLE", 60 | "name": "item_list_name", 61 | "type": "STRING" 62 | }, 63 | { 64 | "mode": "NULLABLE", 65 | "name": "item_list_id", 66 | "type": "STRING" 67 | }, 68 | { 69 | "mode": "NULLABLE", 70 | "name": "price", 71 | "type": "FLOAT" 72 | }, 73 | { 74 | "mode": "NULLABLE", 75 | "name": "item_variant", 76 | "type": "STRING" 77 | }, 78 | { 79 | "mode": "NULLABLE", 80 | "name": "quantity", 81 | "type": "INTEGER" 82 | }, 83 | { 84 | "mode": "NULLABLE", 85 | "name": "item_brand", 86 | "type": "STRING" 87 | }, 88 | { 89 | "mode": "NULLABLE", 90 | "name": "item_category", 91 | "type": "STRING" 92 | }, 93 | { 94 | "mode": "NULLABLE", 95 | "name": "item_category_2", 96 | "type": "STRING" 97 | }, 98 | { 99 | "mode": "NULLABLE", 100 | "name": "item_category_3", 101 | "type": "STRING" 102 | }, 103 | { 104 | "mode": "NULLABLE", 105 | "name": "item_category_4", 106 | "type": "STRING" 107 | } 108 | ] 109 | }, 110 | { 111 | "mode": "NULLABLE", 112 | "name": "purchase", 113 | "type": "RECORD", 114 | "fields": [ 115 | { 116 | "fields": [ 117 | { 118 | "mode": "NULLABLE", 119 | "name": "item_coupon", 120 | "type": "STRING" 121 | }, 122 | { 123 | "mode": "NULLABLE", 124 | "name": "quantity", 125 | "type": "INTEGER" 126 | }, 127 | { 128 
| "mode": "NULLABLE", 129 | "name": "item_variant", 130 | "type": "STRING" 131 | }, 132 | { 133 | "mode": "NULLABLE", 134 | "name": "item_category", 135 | "type": "STRING" 136 | }, 137 | { 138 | "mode": "NULLABLE", 139 | "name": "item_name", 140 | "type": "STRING" 141 | }, 142 | { 143 | "mode": "NULLABLE", 144 | "name": "item_id", 145 | "type": "INTEGER" 146 | }, 147 | { 148 | "mode": "NULLABLE", 149 | "name": "item_brand", 150 | "type": "STRING" 151 | }, 152 | { 153 | "mode": "NULLABLE", 154 | "name": "item_price", 155 | "type": "FLOAT" 156 | } 157 | ], 158 | "mode": "REPEATED", 159 | "name": "items", 160 | "type": "RECORD" 161 | }, 162 | { 163 | "mode": "NULLABLE", 164 | "name": "coupon", 165 | "type": "STRING" 166 | }, 167 | { 168 | "mode": "NULLABLE", 169 | "name": "tax", 170 | "type": "FLOAT" 171 | }, 172 | { 173 | "mode": "NULLABLE", 174 | "name": "shipping", 175 | "type": "FLOAT" 176 | }, 177 | { 178 | "mode": "NULLABLE", 179 | "name": "value", 180 | "type": "FLOAT" 181 | }, 182 | { 183 | "mode": "NULLABLE", 184 | "name": "affiliation", 185 | "type": "STRING" 186 | }, 187 | { 188 | "mode": "NULLABLE", 189 | "name": "currency", 190 | "type": "STRING" 191 | }, 192 | { 193 | "mode": "NULLABLE", 194 | "name": "transaction_id", 195 | "type": "STRING" 196 | } 197 | ] 198 | } 199 | ] 200 | } 201 | ] 202 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/datalayer/purchase.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "purchase", 4 | "user_id": "UID00001", 5 | "client_id": "CID00003", 6 | "page":"/checkout", 7 | "page_previous": "/order-confirmation", 8 | "ecommerce": { 9 | "purchase": { 10 | "transaction_id": "T12345", 11 | "affiliation": "Online Store", 12 | "value": 35.43, 13 | "tax": 4.90, 14 | "shipping": 5.99, 15 | "currency": "EUR", 16 | "coupon": "SUMMER_SALE", 17 | "items": [{ 18 | "item_name": "Triblend Android T-Shirt", 19 | "item_id": "12345", 20 | "item_price": 15.25, 21 | "item_brand": "Google", 22 | "item_category": "Apparel", 23 | "item_variant": "Gray", 24 | "quantity": 1, 25 | "item_coupon": "" 26 | }, { 27 | "item_name": "Donut Friday Scented T-Shirt", 28 | "item_id": "67890", 29 | "item_price": 33.75, 30 | "item_brand": "Google", 31 | "item_category": "Apparel", 32 | "item_variant": "Black", 33 | "quantity": 1 34 | }] 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/datalayer/purchase_anomaly.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "purchase", 4 | "user_id": "UID00001", 5 | "client_id": "CID00003", 6 | "page":"/checkout", 7 | "page_previous": "/order-confirmation", 8 | "ecommerce": { 9 | "purchase": { 10 | "transaction_id": "T12345", 11 | "affiliation": "Online Store", 12 | "value": 1000000.10, 13 | "tax": 4.90, 14 | "shipping": 5.99, 15 | "currency": "EUR", 16 | "coupon": "SUMMER_SALE", 17 | "items": [{ 18 | "item_name": "Triblend Android T-Shirt", 19 | "item_id": "12345", 20 | "item_price": 15.25, 21 | "item_brand": "Google", 22 | "item_category": "Apparel", 23 | "item_variant": "Gray", 24 | "quantity": 1, 25 | "item_coupon": "" 26 | }, { 27 | "item_name": "Donut Friday Scented T-Shirt", 28 | "item_id": "67890", 29 | "item_price": 33.75, 30 | "item_brand": "Google", 31 | "item_category": "Apparel", 32 
| "item_variant": "Black", 33 | "quantity": 1 34 | }] 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/datalayer/synth_data_stream.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import random 16 | import requests 17 | import json 18 | import time 19 | import argparse 20 | 21 | 22 | def main(endpoint): 23 | draw = round(random.uniform(0, 1), 2) 24 | 25 | uid = f'UID0000{int(round(random.uniform(0, 5), 0))}' 26 | 27 | if 0 <= draw < 1 / 3: 28 | # get view payload 29 | view_item_f = open('./datalayer/view_item.json') 30 | view_item_payload = json.load(view_item_f) 31 | 32 | view_item_payload['user_id'] = uid 33 | 34 | # send view 35 | r = requests.post(endpoint, json=view_item_payload) 36 | 37 | elif 1 / 3 <= draw < 2 / 3: 38 | # get add to cart payload 39 | add_to_cart_f = open('./datalayer/add_to_cart.json') 40 | add_to_cart_payload = json.load(add_to_cart_f) 41 | 42 | add_to_cart_payload['user_id'] = uid 43 | 44 | # send add to cart 45 | r = requests.post(endpoint, json=add_to_cart_payload) 46 | 47 | else: 48 | # decide between anomaly or no anomaly 49 | if draw < 0.95: 50 | # get payload 51 | purchase_f = open('./datalayer/purchase.json') 52 | purchase_payload = json.load(purchase_f) 53 | 54 | purchase_payload['user_id'] = uid 55 | 56 | # send request 57 | r = requests.post(endpoint, json=purchase_payload) 58 | else: 59 | # get payload 60 | purchase_anomaly_f = open('./datalayer/purchase_anomaly.json') 61 | purchase_anomaly_payload = json.load(purchase_anomaly_f) 62 | 63 | purchase_anomaly_payload['user_id'] = uid 64 | 65 | # send request 66 | r = requests.post(endpoint, json=purchase_anomaly_payload) 67 | 68 | # print(r.text) 69 | print(f'{time.time()} -- {r.status_code}') 70 | 71 | 72 | if __name__ == "__main__": 73 | # Parse Arguments 74 | parser = argparse.ArgumentParser() 75 | parser.add_argument("--endpoint", help="Target Endpoint") 76 | 77 | args = parser.parse_args() 78 | 79 | endpoint = args.endpoint + '/json' 80 | 81 | while True: 82 | main(endpoint) 83 | time.sleep(2) 84 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/datalayer/view_item.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 22:59:59", 3 | "event": "view_item", 4 | "user_id": "UID00003", 5 | "client_id": "CID00003", 6 | "page":"/product-67890", 7 | "page_previous": "/category-tshirts", 8 | "ecommerce": { 9 | "items": [{ 10 | "item_name": "Donut Friday Scented T-Shirt", 11 | "item_id": "67890", 12 | "price": 33.75, 13 | "item_brand": "Google", 14 | "item_category": "Apparel", 15 | "item_category_2": "Mens", 16 | "item_category_3": "Shirts", 17 | "item_category_4": "Tshirts", 18 | "item_variant": "Black", 19 | "item_list_name": 
"Search Results", 20 | "item_list_id": "SR123", 21 | "index": 1, 22 | "quantity": 1 23 | }] 24 | } 25 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | terraform { 18 | required_providers { 19 | google = { 20 | source = "hashicorp/google" 21 | version = "4.32.0" 22 | } 23 | } 24 | } 25 | 26 | provider "google" { 27 | project = var.project_id 28 | region = var.gcp_region 29 | } 30 | 31 | data "google_project" "project" { 32 | } 33 | 34 | resource "google_compute_network" "vpc_network" { 35 | name = "terraform-network" 36 | } 37 | 38 | resource "google_compute_firewall" "vpc_network_firewall" { 39 | name = "firewall" 40 | 41 | network = google_compute_network.vpc_network.name 42 | 43 | source_service_accounts = ["${google_service_account.data_pipeline_access.email}"] 44 | 45 | allow { 46 | protocol = "tcp" 47 | ports = ["12345", "12346"] 48 | } 49 | } 50 | 51 | resource "google_service_account" "data_pipeline_access" { 52 | project = var.project_id 53 | account_id = "retailpipeline-hyp" 54 | display_name = "Retail app data pipeline access" 55 | } 56 | 57 | 58 | # Set permissions. 
59 | resource "google_project_iam_member" "dataflow_admin_role" { 60 | project = var.project_id 61 | role = "roles/dataflow.admin" 62 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 63 | } 64 | 65 | resource "google_project_iam_member" "dataflow_worker_role" { 66 | project = var.project_id 67 | role = "roles/dataflow.worker" 68 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 69 | } 70 | 71 | resource "google_project_iam_member" "dataflow_bigquery_role" { 72 | project = var.project_id 73 | role = "roles/bigquery.dataEditor" 74 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 75 | } 76 | 77 | resource "google_project_iam_member" "dataflow_pub_sub_subscriber" { 78 | project = var.project_id 79 | role = "roles/pubsub.subscriber" 80 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 81 | } 82 | 83 | resource "google_project_iam_member" "dataflow_pub_sub_viewer" { 84 | project = var.project_id 85 | role = "roles/pubsub.viewer" 86 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 87 | } 88 | 89 | resource "google_project_iam_member" "dataflow_storage_object_admin" { 90 | project = var.project_id 91 | role = "roles/storage.objectAdmin" 92 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 93 | } 94 | 95 | data "google_compute_default_service_account" "default" { 96 | } 97 | 98 | resource "google_project_iam_member" "gce_pub_sub_admin" { 99 | project = var.project_id 100 | role = "roles/pubsub.admin" 101 | member = "serviceAccount:${data.google_compute_default_service_account.default.email}" 102 | } 103 | 104 | 105 | # Enabling APIs 106 | resource "google_project_service" "compute" { 107 | service = "compute.googleapis.com" 108 | 109 | disable_on_destroy = false 110 | } 111 | 112 | resource "google_project_service" "run" { 113 | service = "run.googleapis.com" 114 | 115 | disable_on_destroy = false 116 | } 117 | 118 | resource "google_project_service" "dataflow" { 119 | service = "dataflow.googleapis.com" 120 | 121 | disable_on_destroy = false 122 | } 123 | 124 | resource "google_project_service" "pubsub" { 125 | service = "pubsub.googleapis.com" 126 | disable_on_destroy = false 127 | } 128 | 129 | resource "google_project_iam_member" "viewer" { 130 | project = var.project_id 131 | role = "roles/bigquery.metadataViewer" 132 | member = "serviceAccount:service-${data.google_project.project.number}@gcp-sa-pubsub.iam.gserviceaccount.com" 133 | } 134 | 135 | resource "google_project_iam_member" "editor" { 136 | project = var.project_id 137 | role = "roles/bigquery.dataEditor" 138 | member = "serviceAccount:service-${data.google_project.project.number}@gcp-sa-pubsub.iam.gserviceaccount.com" 139 | } 140 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/processing-service/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | ENV APP_HOME /app 8 | WORKDIR $APP_HOME 9 | COPY . ./ 10 | COPY ./requirements.txt ./ 11 | 12 | # Install production dependencies. 13 | RUN pip install --upgrade pip 14 | RUN pip install --no-cache-dir -r requirements.txt 15 | 16 | # Run the web service on container startup. 
Here we use the gunicorn 17 | # webserver, with one worker process and 8 threads. 18 | # For environments with multiple CPU cores, increase the number of workers 19 | # to be equal to the cores available. 20 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 21 | CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/processing-service/README.md: -------------------------------------------------------------------------------- 1 | The processing service defines a Cloud Run Service to process each incoming datapoint. 2 | 3 | 4 | main.py defines the public facing webserver to listen for requests. 5 | 6 | synth_data_stream.py creates a synthetic data stream of events randomly chosen from the datalayer. It also randomly includes anomalies in the data. 7 | 8 | 9 | Command to start data stream: 10 | python3 synth_data_stream.py --endpoint {Pub/Sub endpoint link}' -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/processing-service/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = '' 16 | location = 'europe-west1' 17 | bq_dataset = 'ecommerce_sink' 18 | bq_table = 'cloud_run' 19 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/processing-service/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import time 17 | import base64 18 | import json 19 | 20 | import config 21 | 22 | from flask import Flask, request 23 | 24 | from google.cloud import bigquery 25 | 26 | app = Flask(__name__) 27 | 28 | 29 | @app.route("/hw", methods=['GET', 'POST']) 30 | def hello_world(): 31 | world = request.args.get('world') 32 | return f"Hello {world}!" 
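# The "/" route below follows the standard Pub/Sub push-subscription pattern:
# Pub/Sub POSTs a JSON envelope whose "message" field wraps the event, with the
# original payload base64-encoded in "message.data". The handler decodes and parses
# that payload and is expected to stream it into the BigQuery table configured in
# config.py. The empty-string placeholders (client, errors) are the challenge blanks;
# the `errors == []` check hints at a streaming insert along the lines of
# bigquery.Client().insert_rows_json(table_id, rows_to_insert).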
33 | 34 | 35 | @app.route("/", methods=["POST"]) 36 | def index(): 37 | envelope = request.get_json() 38 | print(envelope) 39 | print(type(envelope)) 40 | 41 | if not envelope: 42 | msg = "no Pub/Sub message received" 43 | print(f"error: {msg}") 44 | return f"Bad Request: {msg}", 400 45 | 46 | ps_message = envelope['message'] 47 | print(ps_message) 48 | print(type(ps_message)) 49 | 50 | record = base64.b64decode(ps_message["data"]).decode("utf-8").strip() 51 | record = json.loads(record) 52 | 53 | print(record) 54 | print(type(record)) 55 | 56 | rows_to_insert = [record] 57 | 58 | client = '' 59 | 60 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table 61 | 62 | errors = '' # Make an API request. 63 | 64 | if errors == []: 65 | print(f"{time.time()} New rows have been added.") 66 | return ("", 204) 67 | else: 68 | print("Encountered errors while inserting rows: {}".format(errors)) 69 | return f"Bad Request: {envelope}", 400 70 | 71 | 72 | if __name__ == "__main__": 73 | app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) 74 | 75 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/processing-service/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/terraform.tfvars: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | project_id = "" 18 | delete_contents_on_destroy = true 19 | -------------------------------------------------------------------------------- /01_ingest_and_transform/11_challenge/variables.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | variable "project_id" { 18 | description = "Project where the dataset and table are created." 19 | } 20 | 21 | variable "delete_contents_on_destroy" { 22 | description = "(Optional) If set to true, delete all the tables in the dataset when destroying the resource; otherwise, destroying the resource will fail if tables are present." 
23 | type = bool 24 | default = null 25 | } 26 | 27 | variable "force_destroy" { 28 | description = "When deleting a bucket, this boolean option will delete all contained objects. If false, Terraform will fail to delete buckets which contain objects." 29 | type = bool 30 | default = false 31 | } 32 | 33 | variable "gcp_region" { 34 | description = "GCP region to deploy resources in." 35 | type = string 36 | default = "europe-west1" 37 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/README.md: -------------------------------------------------------------------------------- 1 | # Developing *efficient* Data Pipelines on GCP 2 | 3 | Google Cloud Platform offers numerous possibilities and sample architectures for designing data pipelines. 4 | As always, there is no **ONE** perfect data architecture. It always depends! 5 | 6 | The right architecture depends on the data type, data volume and more. 7 | Business and tech requirements, such as the number of data producers and consumers or the intended data activation, are also essential. 8 | 9 | This repo provides practical guidance and sample architectures for the most common pipeline requirements I have come across with my customers. 10 | 11 | ## Three unique solutions for ingesting click-stream data into BigQuery 12 | 13 | Each example is a unique solution for **ingesting click-stream data from a web store into BigQuery**. 14 | 15 | Imagine you are a Data Engineer tasked with monitoring specific click-stream data from your company's web store. 16 | You already set up a Cloud Run Proxy Service that can be configured as a custom tag in Google Tag Manager. 17 | You also defined a Pub/Sub topic as the central event hub. 18 | Triggered events flow from Google Tag Manager through your Cloud Run Proxy to your Pub/Sub topic. 19 | 20 | Once events have arrived in your central event hub, you need to decide how to process them and move them to BigQuery. 21 | 22 | 23 | ![Efficient GCP Data Pipelines Architecture Overview](../../rsc/efficient_pipelines.png) 24 | 25 | 26 | ### I. Lean ELT pipelines with raw data in BigQuery 27 | 28 | Think of a scenario in which we aim to build the cheapest, lowest-maintenance data pipeline. 29 | Our only requirement might be to transport the raw data into BigQuery. 30 | For example, to design a lakehouse structure. 31 | 32 | Introducing the direct Pub/Sub to BigQuery subscription: 33 | 34 | **Strengths:** 35 | - No data processing tool = major cost saving 36 | - No ETL maintenance 37 | - Raw data in the lakehouse allows view-based processing on a use-case basis 38 | - Ingestion scales down to 0 and up without limits 39 | 40 | **Weaknesses:** 41 | - No processing or aggregation of ingested data before it lands in BigQuery 42 | - Raw data volume in the lakehouse might grow quickly 43 | - Only limited sanity checks possible when ingesting data 44 | 45 | 46 | ### II. Elastic ELT pipeline with Cloud Run 47 | 48 | You might want to develop a pipeline that scales up and down easily, but still allows you to apply simple transformations. 49 | For example, you might want to run data sanity checks, apply default cleaning or run ML inference over your data.
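As an illustration of such a lightweight check, a per-event sanity filter based on the purchase payloads in `./datalayer` might look like the sketch below (the threshold and function name are hypothetical, not part of this repo):

```
MAX_PLAUSIBLE_VALUE = 10_000.0  # hypothetical cut-off; the anomaly sample uses 1000000.10

def is_plausible_purchase(event: dict) -> bool:
    """Flag purchase events whose total order value looks anomalous."""
    if event.get("event") != "purchase":
        return True  # only purchase events carry ecommerce.purchase.value
    value = event["ecommerce"]["purchase"]["value"]
    return 0 < value <= MAX_PLAUSIBLE_VALUE
```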
50 | 51 | Introducing Cloud Run as a data processing tool: 52 | 53 | **Strengths:** 54 | - Scales down to 0 and up with (almost) no limits 55 | - Easy integration of data transformations in any language and with any dependencies 56 | - Easy entry, no steep learning curve as with Kubernetes-like container orchestration 57 | 58 | **Weaknesses:** 59 | - No graphic interface to explore data transformation steps 60 | - Only one-at-a-time data point handling; aggregations over multiple data points are only possible once the data is in BigQuery 61 | 62 | 63 | ### III. High-Volume ETL pipelines with complex aggregations using Dataflow 64 | 65 | 66 | **Strengths:** 67 | - Apache Beam allows for on-the-fly aggregations and windowing 68 | - Dataflow offers a user interface, great for troubleshooting 69 | 70 | **Weaknesses:** 71 | - Dataflow never scales down to 0 72 | - Despite the serverless nature of Dataflow, managing machines is extra overhead compared to e.g. Cloud Run 73 | 74 | 75 | This repo provides an end-to-end example for streaming data from a web store to BigQuery. It contains the following components, which can be deployed all at once using Terraform or serve as individual examples. 76 | 77 | - Cloud Run service that can be set up as a custom tag in Google Tag Manager 78 | - Pub/Sub topic to consume the data 79 | - Pub/Sub subscription to pull the data from the topic 80 | - Dataflow streaming job using a Pub/Sub to BigQuery template 81 | - BigQuery events table to store the data 82 | - BigQuery SQL query to analyze the events 83 | 84 | The data structure is based on the [Data Layer E-Commerce](https://developers.google.com/tag-manager/ecommerce-ga4) format recommended for Google Tag Manager. 85 | 86 | ## Git clone repo 87 | 88 | ``` 89 | git clone https://github.com/NucleusEngineering/hack-your-pipe.git 90 | cd hack-your-pipe/01_ingest_and_transform/12_solution 91 | ``` 92 | 93 | ## Set-up Cloud Environment 94 | 95 | ### Initialize your account and project 96 | 97 | If you are using the Google Cloud Shell, you can skip this step. 98 | 99 | ``` 100 | gcloud init 101 | ``` 102 | 103 | ### Set Google Cloud Project 104 | 105 | Enter your GCP Project ID in `./config_env.sh` & set all necessary environment variables. 106 | ``` 107 | source config_env.sh 108 | ``` 109 | 110 | Set the default GCP project.
111 | ``` 112 | gcloud config set project $GCP_PROJECT 113 | ``` 114 | 115 | ### Enable Google Cloud APIs 116 | 117 | ``` 118 | gcloud services enable compute.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com dataflow.googleapis.com 119 | ``` 120 | 121 | ### Set compute region 122 | 123 | ``` 124 | gcloud config set compute/zone $GCP_REGION 125 | ``` 126 | 127 | 140 | 141 | # Build the Cloud Run Containers 142 | 143 | Update the default project ID in the following files to match your project ID: [beam/config.py](https://github.com/NucleusEngineering/hack-your-pipe/blob/main/01_ingest_and_transform/12_solution/beam/config.py), [processing_service/config.py](https://github.com/NucleusEngineering/hack-your-pipe/blob/main/01_ingest_and_transform/12_solution/processing-service/config.py) 144 | 145 | Check that the file has been saved with the updated project ID value 146 | 147 | ``` 148 | cat beam/config.py 149 | cat processing_service/config.py 150 | ``` 151 | 152 | ## Pub/Sub proxy service container 153 | 154 | ``` 155 | gcloud builds submit $RUN_PROXY_DIR --tag gcr.io/$GCP_PROJECT/pubsub-proxy 156 | ``` 157 | 158 | ## Data Processing service container 159 | 160 | ``` 161 | gcloud builds submit $RUN_PROCESSING_DIR --tag gcr.io/$GCP_PROJECT/data-processing-service 162 | ``` 163 | 164 | ## Dataflow Template container 165 | 166 | ``` 167 | gcloud builds submit $DATAFLOW_TEMPLATE --tag gcr.io/$GCP_PROJECT/beam-processing-flex-template 168 | ``` 169 | 170 | ``` 171 | gsutil mb -c standard -l $GCP_REGION gs://$GCP_PROJECT-ecommerce-events 172 | ``` 173 | 174 | ``` 175 | gcloud dataflow flex-template build gs://$GCP_PROJECT-ecommerce-events/df_templates/dataflow_template.json --image=gcr.io/$GCP_PROJECT/beam-processing-flex-template --sdk-language=PYTHON 176 | ``` 177 | 178 | ### List containers 179 | 180 | Check that the containers were successfully created. 181 | 182 | ``` 183 | gcloud container images list 184 | ``` 185 | 186 | You should see the following output: 187 | 188 | ``` 189 | NAME: gcr.io//beam-processing-flex-template 190 | NAME: gcr.io//data-processing-service 191 | NAME: gcr.io//pubsub-proxy 192 | Only listing images in gcr.io/. Use --repository to list images in other repositories. 193 | ``` 194 | 195 | 196 | ## Deploy using Terraform 197 | 198 | Use Terraform to deploy the following services defined in the `main.tf` file 199 | 200 | - Cloud Run 1: Pub/Sub Proxy 201 | - Cloud Run 2: Data Processing Service 202 | - Pub/Sub Topic 203 | - Pub/Sub Push Subscription 204 | - Pub/Sub Pull Subscription 205 | - Pub/Sub BigQuery Subscription 206 | - Google Cloud Storage 207 | - Dataflow Job 208 | - BigQuery Table per pipeline 209 | 210 | ### Install Terraform 211 | 212 | If you are using the Google Cloud Shell Terraform is already installed. 213 | 214 | Follow the instructions to [install the Terraform cli](https://learn.hashicorp.com/tutorials/terraform/install-cli?in=terraform/gcp-get-started). 215 | 216 | This repo has been tested on Terraform version `1.3.6` and the Google provider version `4.32.0` 217 | 218 | ### Update Project ID in terraform.tfvars 219 | 220 | Rename the `terraform.tfvars.example` file to `terraform.tfvars` and update the default project ID in the file to match your project ID. 
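For reference, a filled-in `terraform.tfvars` would look roughly like this (the project ID below is a placeholder; the variables themselves are declared in `variables.tf`):

```
project_id                 = "your-gcp-project-id"
delete_contents_on_destroy = true
```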
221 | 222 | Check that the file has been saved with the updated project ID value 223 | 224 | ``` 225 | cat terraform.tfvars 226 | ``` 227 | 228 | ### Initialize Terraform 229 | 230 | ``` 231 | terraform init 232 | ``` 233 | 234 | ### Create resources in Google Cloud 235 | 236 | Run the plan cmd to see what resources will be created in your project. 237 | 238 | **Important: Make sure you have updated the Project ID in terraform.tfvars before running this** 239 | 240 | ``` 241 | terraform plan 242 | ``` 243 | 244 | Run the apply cmd and point to your `.tfvars` file to deploy all the resources in your project. 245 | 246 | ``` 247 | terraform apply -var-file terraform.tfvars 248 | ``` 249 | 250 | This will show you a plan of everything that will be created and then the following notification where you should enter `yes` to proceed: 251 | 252 | ``` 253 | Plan: 20 to add, 0 to change, 0 to destroy. 254 | 255 | Do you want to perform these actions? 256 | Terraform will perform the actions described above. 257 | Only 'yes' will be accepted to approve. 258 | 259 | Enter a value: 260 | ``` 261 | 262 | ### Terraform output 263 | 264 | Once everything has successfully run you should see the following output: 265 | 266 | ``` 267 | google_compute_network.vpc_network: Creating... 268 | . 269 | . 270 | . 271 | Apply complete! Resources: 20 added, 0 changed, 0 destroyed. 272 | 273 | Outputs: 274 | 275 | cloud_run_proxy_url = https://pubsub-proxy-my-service--uc.a.run.app 276 | ``` 277 | 278 | ## Simulate sending e-commerce events to Cloud Run Pub/Sub proxy using curl 279 | 280 | Use the `cloud_run_proxy_url` value from the Terraform output to simulate sending e-commerce events to the Cloud Run Pub/Sub proxy. 281 | 282 | #### Set Cloud Run Proxy URL 283 | 284 | Enter your the proxy service URL as `ENDPOINT_URL` in `./config_env.sh` & reset the environment variables. 285 | ``` 286 | source config_env.sh 287 | ``` 288 | 289 | #### Create artificial event stream 290 | 291 | Run the script `./datalayer/synth_data_stream.py` to direct a synthetic stream of events to the created endpoint. 292 | 293 | ``` 294 | python3 ./datalayer/synth_data_stream.py --endpoint=$ENDPOINT_URL 295 | ``` 296 | 297 | The program will generate and send a random event based on the samples in `./datalayer` every two seconds. 298 | 299 | After a minute or two you should find the BigQuery event tables populated. 300 | 301 | ### Terraform Destroy 302 | 303 | Use Terraform to destroy all resources 304 | 305 | ``` 306 | terraform destroy 307 | ``` 308 | 309 | You might have to delete the BigQuery tables and rerun the command to destroy the resources. -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/beam/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/dataflow-templates-base/python39-template-launcher-base:latest as template_launcher 2 | 3 | ARG WORKDIR=/dataflow/template 4 | RUN mkdir -p ${WORKDIR} 5 | WORKDIR ${WORKDIR} 6 | 7 | # Copy local code to the container image. 8 | COPY . 
./ 9 | COPY ./requirements.txt ./ 10 | COPY ./beam_processing.py ./ 11 | 12 | # Flex Template ENV Vars 13 | ENV FLEX_TEMPLATE_PYTHON_PY_FILE="${WORKDIR}/beam_processing.py" 14 | ENV FLEX_TEMPLATE_PYTHON_REQUIREMENTS_FILE="${WORKDIR}/requirements.txt" 15 | 16 | # Install requirements 17 | RUN pip install --upgrade pip 18 | RUN pip install --no-cache-dir -r requirements.txt 19 | 20 | ENTRYPOINT ["/opt/google/dataflow/python_template_launcher"] -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/beam/beam_processing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import json 16 | import time 17 | 18 | import config 19 | 20 | import apache_beam as beam 21 | from apache_beam.options.pipeline_options import PipelineOptions 22 | from apache_beam.transforms import trigger 23 | from apache_beam.io.gcp.pubsub import ReadFromPubSub 24 | from apache_beam.io.gcp.bigquery import BigQueryDisposition, WriteToBigQuery 25 | from apache_beam.runners import DataflowRunner 26 | 27 | # Defining event filter functions. 28 | 29 | 30 | def is_item_view(event): 31 | return event['event'] == 'view_item' 32 | 33 | 34 | def is_add_to_cart(event): 35 | return event['event'] == 'add_to_cart' 36 | 37 | 38 | def is_purchase(event): 39 | return event['event'] == 'purchase' 40 | 41 | 42 | class ExtractValueFn(beam.DoFn): 43 | def process(self, element): 44 | print(f"ExtractValueFn: {element['ecommerce']['purchase']['value']}") 45 | return [element['ecommerce']['purchase']['value']] 46 | 47 | 48 | class ExtractAndSumValue(beam.PTransform): 49 | """A transform to extract key/score information and sum the scores. 50 | The constructor argument `field` determines whether 'team' or 'user' info is 51 | extracted. 52 | """ 53 | 54 | def expand(self, pcoll): 55 | sum_val = ( 56 | pcoll 57 | | beam.Map(lambda elem: (elem['user_id'], elem['ecommerce']['purchase']['value'])) 58 | | beam.CombinePerKey(sum)) 59 | return(sum_val) 60 | 61 | 62 | class FormatByRow(beam.PTransform): 63 | """A transform to reformat the data to column name/value format. 64 | """ 65 | 66 | def expand(self, pcoll): 67 | row_val = ( 68 | pcoll 69 | | beam.Map(lambda elem: {'user_id': elem[0], 70 | 'summed_value': elem[1] 71 | }) 72 | ) 73 | return(row_val) 74 | 75 | 76 | def streaming_pipeline(project, region): 77 | 78 | subscription = "projects/{}/subscriptions/hyp_subscription_dataflow".format( 79 | project) 80 | 81 | bucket = "gs://{}-ecommerce-events/tmp_dir".format(project) 82 | 83 | # Defining pipeline options. 
84 | options = PipelineOptions( 85 | streaming=True, 86 | project=project, 87 | region=region, 88 | staging_location="%s/staging" % bucket, 89 | temp_location="%s/temp" % bucket, 90 | subnetwork='regions/europe-west1/subnetworks/terraform-network', 91 | service_account_email='retailpipeline-hyp@{}.iam.gserviceaccount.com'.format( 92 | project), 93 | max_num_workers=1 94 | ) 95 | 96 | # Defining pipeline. 97 | p = beam.Pipeline(DataflowRunner(), options=options) 98 | 99 | # Receiving message from Pub/Sub & parsing json from string. 100 | json_message = (p 101 | # Listining to Pub/Sub. 102 | | "Read Topic" >> ReadFromPubSub(subscription=subscription) 103 | # Parsing json from message string. 104 | | "Parse json" >> beam.Map(json.loads) 105 | ) 106 | 107 | # Extracting Item Views. 108 | item_views = (json_message 109 | | 'Filter for item views' >> beam.Filter(is_item_view) 110 | | "item view row" >> beam.Map(lambda input: {'event_datetime': input['event_datetime'], # Dropping and renaming columns. 111 | 'event': input['event'], 112 | 'user_id': input['user_id'], 113 | 'client_id': input['client_id'], 114 | 'page': input['page'], 115 | 'page_previous': input['page_previous'], 116 | "item_name": input['ecommerce']['items'][0]["item_name"], 117 | "item_id": input['ecommerce']['items'][0]["item_id"], 118 | "price": input['ecommerce']['items'][0]["price"], 119 | "item_brand": input['ecommerce']['items'][0]["item_brand"], 120 | "item_category": input['ecommerce']['items'][0]["item_category"], 121 | "item_category_2": input['ecommerce']['items'][0]["item_category_2"], 122 | "item_category_3": input['ecommerce']['items'][0]["item_category_3"], 123 | "item_category_4": input['ecommerce']['items'][0]["item_category_4"], 124 | "item_variant": input['ecommerce']['items'][0]["item_variant"], 125 | "item_list_name": input['ecommerce']['items'][0]["item_list_name"], 126 | "item_list_id": input['ecommerce']['items'][0]["item_list_id"], 127 | "quantity": input['ecommerce']['items'][0]["quantity"] 128 | }) 129 | ) 130 | 131 | fixed_windowed_items = (json_message 132 | | 'Filter for purchase' >> beam.Filter(is_purchase) 133 | | 'Global Window' >> beam.WindowInto(beam.window.GlobalWindows(), 134 | trigger=trigger.Repeatedly( 135 | trigger.AfterCount(10)), 136 | accumulation_mode=trigger.AccumulationMode.ACCUMULATING) 137 | | 'ExtractAndSumValue' >> ExtractAndSumValue() 138 | | 'FormatByRow' >> FormatByRow() 139 | ) 140 | 141 | # Writing summed values to BigQuery 142 | aggregated_schema = "user_id:STRING, summed_value:FLOAT" 143 | aggregated_table = "{}:ecommerce_sink.beam_aggregated".format(project) 144 | 145 | fixed_windowed_items | "Write Summed Values To BigQuery" >> WriteToBigQuery(table=aggregated_table, schema=aggregated_schema, 146 | create_disposition=BigQueryDisposition.CREATE_IF_NEEDED, 147 | write_disposition=BigQueryDisposition.WRITE_APPEND) 148 | 149 | # Writing the PCollections to two differnt BigQuery tables. 
150 | item_views_table = "{}:ecommerce_sink.beam_item_views".format(project) 151 | schema = "event_datetime:DATETIME, event:STRING, user_id:STRING, client_id:STRING, page:STRING, page_previous:STRING, " \ 152 | "item_name:STRING, item_id:STRING, price:STRING, item_brand:STRING, item_category:STRING, item_category_2:STRING, item_category_3:STRING, " \ 153 | "item_category_4:STRING, item_variant:STRING, item_list_name:STRING, item_list_id:STRING, quantity:STRING" 154 | 155 | item_views | "Write Items Views To BigQuery" >> WriteToBigQuery(table=item_views_table, schema=schema, 156 | create_disposition=BigQueryDisposition.CREATE_IF_NEEDED, 157 | write_disposition=BigQueryDisposition.WRITE_APPEND) 158 | 159 | return p.run() 160 | 161 | 162 | if __name__ == '__main__': 163 | GCP_PROJECT = config.project_id 164 | GCP_REGION = config.location 165 | 166 | streaming_pipeline(project=GCP_PROJECT, region=GCP_REGION) 167 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/beam/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = 'poerschmann-hyp-test3' 16 | location = 'europe-west1' 17 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/beam/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-beam 2 | google-apitools -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/cloud-run-pubsub-proxy/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/cloud-run-pubsub-proxy/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official lightweight Node.js 12 image. 2 | # https://hub.docker.com/_/node 3 | FROM node:12-slim 4 | 5 | # Create and change to the app directory. 6 | WORKDIR /usr/src/app 7 | 8 | # Copy application dependency manifests to the container image. 9 | # A wildcard is used to ensure both package.json AND package-lock.json are copied. 10 | # Copying this separately prevents re-running npm install on every code change. 11 | COPY package*.json ./ 12 | 13 | # Install production dependencies. 14 | RUN npm install --only=production 15 | 16 | # Copy local code to the container image. 17 | COPY . ./ 18 | 19 | # Run the web service on container startup. 
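# "npm start" (see package.json) runs index.js, which serves the Express app
# from app.js on the port Cloud Run injects via $PORT (8080 by default).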
20 | CMD [ "npm", "start" ] -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/cloud-run-pubsub-proxy/README.md: -------------------------------------------------------------------------------- 1 | Cloud Run Proxy is a express webserver that listenes to incoming requests and publishes them to a chosen Pub/Sub topic. -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/cloud-run-pubsub-proxy/app.js: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | const express = require('express'); 16 | const bodyParser = require('body-parser'); 17 | const app = express(); 18 | 19 | app.use(bodyParser.json()); 20 | 21 | app.get('/', (req, res) => { 22 | console.log('Hello world received a request.'); 23 | 24 | const target = process.env.TARGET || 'World'; 25 | res.send(`Hello ${target}!`); 26 | }); 27 | 28 | app.post('/json', (req, res) => { 29 | const dataLayer = JSON.stringify(req.body) 30 | console.log(`proxy POST request received dataLayer: ${dataLayer}`) 31 | 32 | const {PubSub} = require('@google-cloud/pubsub'); 33 | 34 | // Instantiates a client 35 | const pubsub = new PubSub(); 36 | 37 | const {Buffer} = require('safe-buffer'); 38 | 39 | // Set Pub/Sub topic name 40 | let topicName = 'hyp-pubsub-topic'; 41 | 42 | // References an existing topic 43 | const topic = pubsub.topic(topicName); 44 | 45 | // Publishes the message as a string, 46 | const dataBuffer = Buffer.from(dataLayer); 47 | 48 | // Add two custom attributes, origin and username, to the message 49 | const customAttributes = { 50 | origin: 'gtm-cloud-run', 51 | username: 'gcp-demo', 52 | }; 53 | 54 | // Publishes a message to Pub/Sub 55 | return topic 56 | .publishMessage({data: dataBuffer}) 57 | .then(() => res.status(200).send(`{"message": "pubsub message sent: ${dataBuffer}"}`)) 58 | .catch(err => { 59 | console.error(err); 60 | res.status(500).send(err); 61 | return Promise.reject(err); 62 | }); 63 | }) 64 | 65 | 66 | module.exports = app; 67 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/cloud-run-pubsub-proxy/index.js: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | const app = require('./app.js'); 16 | const PORT = process.env.PORT || 8080; 17 | 18 | app.listen(PORT, () => console.log(`pubsub proxy app listening on port ${PORT}`)); 19 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/cloud-run-pubsub-proxy/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pubsub-proxy", 3 | "version": "1.0.0", 4 | "description": "Cloud Run app to send messages to Pub/Sub topic using Node", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "node index.js" 8 | }, 9 | "author": "", 10 | "license": "Apache-2.0", 11 | "dependencies": { 12 | "express": "^4.17.1", 13 | "body-parser": "^1.19.0", 14 | "@google-cloud/pubsub": "^3.3.0", 15 | "safe-buffer": "5.1.2" 16 | } 17 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/config_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export GCP_PROJECT="" 4 | export ENDPOINT_URL="" # doesn't need to be defined in the very beginning 5 | export PUSH_ENDPOINT='' # doesn't need to be defined in the very beginning 6 | export GCP_REGION=europe-west1 7 | export RUN_PROXY_DIR=cloud-run-pubsub-proxy 8 | export RUN_PROCESSING_DIR=processing-service 9 | export DATAFLOW_TEMPLATE=beam 10 | export RUN_INFERENCE_PROCESSING_SERVICE=inf_processing_service 11 | 12 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/datalayer/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/datalayer/README.md: -------------------------------------------------------------------------------- 1 | Datalayer defines the json events that could occur to be fed into the pipeline. 
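Each of these payloads enters the pipeline as the JSON body of a POST request to the Cloud Run proxy's /json route, which is exactly what synth_data_stream.py automates. A minimal sketch, assuming the placeholder URL is replaced with your deployed proxy endpoint:

    import json
    import requests

    # Load one of the sample events from this folder and post it to the proxy.
    with open("view_item.json") as f:
        payload = json.load(f)

    requests.post("https://<your-proxy-url>/json", json=payload)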
2 | 3 | Four types of events are included: 4 | * add to cart 5 | * made purchase 6 | * made purcase with anomaly (artifical mistake in data to be identified later) 7 | * view item -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/datalayer/add_to_cart.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "add_to_cart", 4 | "user_id": "UID00003", 5 | "client_id": "CID00003", 6 | "page":"/product-67890", 7 | "page_previous": "/category-tshirts", 8 | "ecommerce": { 9 | "items": [{ 10 | "item_name": "Donut Friday Scented T-Shirt", 11 | "item_id": "67890", 12 | "price": 33.75, 13 | "item_brand": "Google", 14 | "item_category": "Apparel", 15 | "item_category_2": "Mens", 16 | "item_category_3": "Shirts", 17 | "item_category_4": "Tshirts", 18 | "item_variant": "Black", 19 | "item_list_name": "Search Results", 20 | "item_list_id": "SR123", 21 | "index": 1, 22 | "quantity": 2 23 | }] 24 | } 25 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/datalayer/ecommerce_events_bq_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "event_datetime", 4 | "type": "TIMESTAMP", 5 | "mode": "NULLABLE" 6 | }, 7 | { 8 | "name": "event", 9 | "type": "STRING", 10 | "mode": "REQUIRED" 11 | }, 12 | { 13 | "name": "user_id", 14 | "type": "STRING", 15 | "mode": "REQUIRED" 16 | }, 17 | { 18 | "name": "client_id", 19 | "type": "STRING", 20 | "mode": "NULLABLE" 21 | }, 22 | { 23 | "name": "page", 24 | "type": "STRING", 25 | "mode": "NULLABLE" 26 | }, 27 | { 28 | "name": "page_previous", 29 | "type": "STRING", 30 | "mode": "NULLABLE" 31 | }, 32 | { 33 | "name": "weekday", 34 | "type": "STRING", 35 | "mode": "NULLABLE" 36 | }, 37 | { 38 | "name": "ecommerce", 39 | "type": "RECORD", 40 | "mode": "NULLABLE", 41 | "fields": [ 42 | { 43 | "mode": "REPEATED", 44 | "name": "items", 45 | "type": "RECORD", 46 | "fields": [ 47 | { 48 | "mode": "NULLABLE", 49 | "name": "index", 50 | "type": "INTEGER" 51 | }, 52 | 53 | { 54 | "mode": "NULLABLE", 55 | "name": "item_id", 56 | "type": "INTEGER" 57 | }, 58 | { 59 | "mode": "NULLABLE", 60 | "name": "item_name", 61 | "type": "STRING" 62 | }, 63 | { 64 | "mode": "NULLABLE", 65 | "name": "item_list_name", 66 | "type": "STRING" 67 | }, 68 | { 69 | "mode": "NULLABLE", 70 | "name": "item_list_id", 71 | "type": "STRING" 72 | }, 73 | { 74 | "mode": "NULLABLE", 75 | "name": "price", 76 | "type": "FLOAT" 77 | }, 78 | { 79 | "mode": "NULLABLE", 80 | "name": "item_variant", 81 | "type": "STRING" 82 | }, 83 | { 84 | "mode": "NULLABLE", 85 | "name": "quantity", 86 | "type": "INTEGER" 87 | }, 88 | { 89 | "mode": "NULLABLE", 90 | "name": "item_brand", 91 | "type": "STRING" 92 | }, 93 | { 94 | "mode": "NULLABLE", 95 | "name": "item_category", 96 | "type": "STRING" 97 | }, 98 | { 99 | "mode": "NULLABLE", 100 | "name": "item_category_2", 101 | "type": "STRING" 102 | }, 103 | { 104 | "mode": "NULLABLE", 105 | "name": "item_category_3", 106 | "type": "STRING" 107 | }, 108 | { 109 | "mode": "NULLABLE", 110 | "name": "item_category_4", 111 | "type": "STRING" 112 | } 113 | ] 114 | }, 115 | { 116 | "mode": "NULLABLE", 117 | "name": "purchase", 118 | "type": "RECORD", 119 | "fields": [ 120 | { 121 | "fields": [ 122 | { 123 | "mode": "NULLABLE", 124 | "name": "item_coupon", 125 | "type": "STRING" 126 | }, 127 | { 128 | 
"mode": "NULLABLE", 129 | "name": "quantity", 130 | "type": "INTEGER" 131 | }, 132 | { 133 | "mode": "NULLABLE", 134 | "name": "item_variant", 135 | "type": "STRING" 136 | }, 137 | { 138 | "mode": "NULLABLE", 139 | "name": "item_category", 140 | "type": "STRING" 141 | }, 142 | { 143 | "mode": "NULLABLE", 144 | "name": "item_name", 145 | "type": "STRING" 146 | }, 147 | { 148 | "mode": "NULLABLE", 149 | "name": "item_id", 150 | "type": "INTEGER" 151 | }, 152 | { 153 | "mode": "NULLABLE", 154 | "name": "item_brand", 155 | "type": "STRING" 156 | }, 157 | { 158 | "mode": "NULLABLE", 159 | "name": "item_price", 160 | "type": "FLOAT" 161 | } 162 | ], 163 | "mode": "REPEATED", 164 | "name": "items", 165 | "type": "RECORD" 166 | }, 167 | { 168 | "mode": "NULLABLE", 169 | "name": "coupon", 170 | "type": "STRING" 171 | }, 172 | { 173 | "mode": "NULLABLE", 174 | "name": "tax", 175 | "type": "FLOAT" 176 | }, 177 | { 178 | "mode": "NULLABLE", 179 | "name": "shipping", 180 | "type": "FLOAT" 181 | }, 182 | { 183 | "mode": "NULLABLE", 184 | "name": "value", 185 | "type": "FLOAT" 186 | }, 187 | { 188 | "mode": "NULLABLE", 189 | "name": "affiliation", 190 | "type": "STRING" 191 | }, 192 | { 193 | "mode": "NULLABLE", 194 | "name": "currency", 195 | "type": "STRING" 196 | }, 197 | { 198 | "mode": "NULLABLE", 199 | "name": "transaction_id", 200 | "type": "STRING" 201 | } 202 | ] 203 | } 204 | ] 205 | } 206 | ] 207 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/datalayer/purchase.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "purchase", 4 | "user_id": "UID00001", 5 | "client_id": "CID00003", 6 | "page":"/checkout", 7 | "page_previous": "/order-confirmation", 8 | "ecommerce": { 9 | "purchase": { 10 | "transaction_id": "T12345", 11 | "affiliation": "Online Store", 12 | "value": 35.43, 13 | "tax": 4.90, 14 | "shipping": 5.99, 15 | "currency": "EUR", 16 | "coupon": "SUMMER_SALE", 17 | "items": [{ 18 | "item_name": "Triblend Android T-Shirt", 19 | "item_id": "12345", 20 | "item_price": 15.25, 21 | "item_brand": "Google", 22 | "item_category": "Apparel", 23 | "item_variant": "Gray", 24 | "quantity": 1, 25 | "item_coupon": "" 26 | }, { 27 | "item_name": "Donut Friday Scented T-Shirt", 28 | "item_id": "67890", 29 | "item_price": 33.75, 30 | "item_brand": "Google", 31 | "item_category": "Apparel", 32 | "item_variant": "Black", 33 | "quantity": 1 34 | }] 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/datalayer/purchase_anomaly.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "purchase", 4 | "user_id": "UID00001", 5 | "client_id": "CID00003", 6 | "page":"/checkout", 7 | "page_previous": "/order-confirmation", 8 | "ecommerce": { 9 | "purchase": { 10 | "transaction_id": "T12345", 11 | "affiliation": "Online Store", 12 | "value": 1000000.10, 13 | "tax": 4.90, 14 | "shipping": 5.99, 15 | "currency": "EUR", 16 | "coupon": "SUMMER_SALE", 17 | "items": [{ 18 | "item_name": "Triblend Android T-Shirt", 19 | "item_id": "12345", 20 | "item_price": 15.25, 21 | "item_brand": "Google", 22 | "item_category": "Apparel", 23 | "item_variant": "Gray", 24 | "quantity": 1, 25 | "item_coupon": "" 26 | }, { 27 | "item_name": "Donut Friday Scented T-Shirt", 28 | "item_id": "67890", 
29 | "item_price": 33.75, 30 | "item_brand": "Google", 31 | "item_category": "Apparel", 32 | "item_variant": "Black", 33 | "quantity": 1 34 | }] 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/datalayer/synth_data_stream.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import random 16 | import requests 17 | import json 18 | import time 19 | import argparse 20 | 21 | 22 | def main(endpoint): 23 | draw = round(random.uniform(0, 1), 2) 24 | 25 | uid = f'UID0000{int(round(random.uniform(0, 5), 0))}' 26 | 27 | if 0 <= draw < 1 / 3: 28 | # get view payload 29 | view_item_f = open('./datalayer/view_item.json') 30 | view_item_payload = json.load(view_item_f) 31 | 32 | view_item_payload['user_id'] = uid 33 | 34 | # send view 35 | r = requests.post(endpoint, json=view_item_payload) 36 | 37 | elif 1 / 3 <= draw < 2 / 3: 38 | # get add to cart payload 39 | add_to_cart_f = open('./datalayer/add_to_cart.json') 40 | add_to_cart_payload = json.load(add_to_cart_f) 41 | 42 | add_to_cart_payload['user_id'] = uid 43 | 44 | # send add to cart 45 | r = requests.post(endpoint, json=add_to_cart_payload) 46 | 47 | else: 48 | # decide between anomaly or no anomaly 49 | if draw < 0.95: 50 | # get payload 51 | purchase_f = open('./datalayer/purchase.json') 52 | purchase_payload = json.load(purchase_f) 53 | 54 | purchase_payload['user_id'] = uid 55 | 56 | # send request 57 | r = requests.post(endpoint, json=purchase_payload) 58 | else: 59 | # get payload 60 | purchase_anomaly_f = open('./datalayer/purchase_anomaly.json') 61 | purchase_anomaly_payload = json.load(purchase_anomaly_f) 62 | 63 | purchase_anomaly_payload['user_id'] = uid 64 | 65 | # send request 66 | r = requests.post(endpoint, json=purchase_anomaly_payload) 67 | 68 | # print(r.text) 69 | print(f'{time.time()} -- {r.status_code}') 70 | 71 | 72 | if __name__ == "__main__": 73 | # Parse Arguments 74 | parser = argparse.ArgumentParser() 75 | parser.add_argument("--endpoint", help="Target Endpoint") 76 | 77 | args = parser.parse_args() 78 | 79 | endpoint = args.endpoint + '/json' 80 | 81 | while True: 82 | main(endpoint) 83 | time.sleep(2) 84 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/datalayer/view_item.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 22:59:59", 3 | "event": "view_item", 4 | "user_id": "UID00003", 5 | "client_id": "CID00003", 6 | "page":"/product-67890", 7 | "page_previous": "/category-tshirts", 8 | "ecommerce": { 9 | "items": [{ 10 | "item_name": "Donut Friday Scented T-Shirt", 11 | "item_id": "67890", 12 | "price": 33.75, 13 | "item_brand": "Google", 14 | "item_category": "Apparel", 15 | "item_category_2": "Mens", 16 | "item_category_3": "Shirts", 17 
| "item_category_4": "Tshirts", 18 | "item_variant": "Black", 19 | "item_list_name": "Search Results", 20 | "item_list_id": "SR123", 21 | "index": 1, 22 | "quantity": 1 23 | }] 24 | } 25 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | terraform { 18 | required_providers { 19 | google = { 20 | source = "hashicorp/google" 21 | version = "4.32.0" 22 | } 23 | } 24 | } 25 | 26 | provider "google" { 27 | project = var.project_id 28 | region = var.gcp_region 29 | } 30 | 31 | data "google_project" "project" { 32 | } 33 | 34 | resource "google_compute_network" "vpc_network" { 35 | name = "terraform-network" 36 | } 37 | 38 | resource "google_compute_firewall" "vpc_network_firewall" { 39 | name = "firewall" 40 | 41 | network = google_compute_network.vpc_network.name 42 | 43 | source_service_accounts = ["${google_service_account.data_pipeline_access.email}"] 44 | 45 | allow { 46 | protocol = "tcp" 47 | ports = ["12345", "12346"] 48 | } 49 | } 50 | 51 | resource "google_service_account" "data_pipeline_access" { 52 | project = var.project_id 53 | account_id = "retailpipeline-hyp" 54 | display_name = "Retail app data pipeline access" 55 | } 56 | 57 | 58 | # Set permissions. 
59 | resource "google_project_iam_member" "dataflow_admin_role" { 60 | project = var.project_id 61 | role = "roles/dataflow.admin" 62 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 63 | } 64 | 65 | resource "google_project_iam_member" "dataflow_worker_role" { 66 | project = var.project_id 67 | role = "roles/dataflow.worker" 68 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 69 | } 70 | 71 | resource "google_project_iam_member" "dataflow_bigquery_role" { 72 | project = var.project_id 73 | role = "roles/bigquery.dataEditor" 74 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 75 | } 76 | 77 | resource "google_project_iam_member" "dataflow_pub_sub_subscriber" { 78 | project = var.project_id 79 | role = "roles/pubsub.subscriber" 80 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 81 | } 82 | 83 | resource "google_project_iam_member" "dataflow_pub_sub_viewer" { 84 | project = var.project_id 85 | role = "roles/pubsub.viewer" 86 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 87 | } 88 | 89 | resource "google_project_iam_member" "dataflow_storage_object_admin" { 90 | project = var.project_id 91 | role = "roles/storage.objectAdmin" 92 | member = "serviceAccount:${google_service_account.data_pipeline_access.email}" 93 | } 94 | 95 | data "google_compute_default_service_account" "default" { 96 | } 97 | 98 | resource "google_project_iam_member" "gce_pub_sub_admin" { 99 | project = var.project_id 100 | role = "roles/pubsub.admin" 101 | member = "serviceAccount:${data.google_compute_default_service_account.default.email}" 102 | } 103 | 104 | 105 | # Enabling APIs 106 | resource "google_project_service" "compute" { 107 | service = "compute.googleapis.com" 108 | 109 | disable_on_destroy = false 110 | } 111 | 112 | resource "google_project_service" "run" { 113 | service = "run.googleapis.com" 114 | 115 | disable_on_destroy = false 116 | } 117 | 118 | resource "google_project_service" "dataflow" { 119 | service = "dataflow.googleapis.com" 120 | 121 | disable_on_destroy = false 122 | } 123 | 124 | resource "google_project_service" "pubsub" { 125 | service = "pubsub.googleapis.com" 126 | disable_on_destroy = false 127 | } 128 | 129 | 130 | # Define common resources used by all pipeline options. 
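# Shared by every pipeline option: the Cloud Run Pub/Sub proxy that receives
# incoming events, the "ecommerce_sink" BigQuery dataset and the
# "hyp-pubsub-topic" Pub/Sub topic.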
131 | # Cloud Run Proxy 132 | resource "google_cloud_run_service" "pubsub_proxy_hyp" { 133 | name = "hyp-run-service-pubsub-proxy" 134 | location = var.gcp_region 135 | 136 | template { 137 | spec { 138 | containers { 139 | image = "gcr.io/${var.project_id}/pubsub-proxy" 140 | } 141 | } 142 | } 143 | 144 | traffic { 145 | percent = 100 146 | latest_revision = true 147 | } 148 | 149 | depends_on = [google_project_service.run] 150 | } 151 | 152 | data "google_iam_policy" "noauth" { 153 | binding { 154 | role = "roles/run.invoker" 155 | members = [ 156 | "allUsers", 157 | ] 158 | } 159 | } 160 | 161 | resource "google_cloud_run_service_iam_policy" "noauth" { 162 | location = google_cloud_run_service.pubsub_proxy_hyp.location 163 | project = google_cloud_run_service.pubsub_proxy_hyp.project 164 | service = google_cloud_run_service.pubsub_proxy_hyp.name 165 | policy_data = data.google_iam_policy.noauth.policy_data 166 | } 167 | 168 | output "cloud_run_proxy_url" { 169 | value = google_cloud_run_service.pubsub_proxy_hyp.status[0].url 170 | } 171 | 172 | # BigQuery Dataset 173 | resource "google_bigquery_dataset" "bq_dataset" { 174 | dataset_id = "ecommerce_sink" 175 | friendly_name = "ecommerce sink" 176 | description = "Destination dataset for all pipeline options" 177 | location = var.gcp_region 178 | 179 | delete_contents_on_destroy = true 180 | 181 | labels = { 182 | env = "default" 183 | } 184 | } 185 | 186 | # Pub/Sub Topic 187 | resource "google_pubsub_topic" "ps_topic" { 188 | name = "hyp-pubsub-topic" 189 | 190 | labels = { 191 | created = "terraform" 192 | } 193 | 194 | depends_on = [google_project_service.pubsub] 195 | } 196 | 197 | 198 | # Pipeline 1: Cloud Run Proxy -> Pub/Sub -> Dataflow -> BigQuery 199 | resource "google_pubsub_subscription" "hyp_sub_dataflow" { 200 | name = "hyp_subscription_dataflow" 201 | topic = google_pubsub_topic.ps_topic.name 202 | 203 | labels = { 204 | created = "terraform" 205 | } 206 | 207 | retain_acked_messages = false 208 | 209 | ack_deadline_seconds = 20 210 | 211 | 212 | retry_policy { 213 | minimum_backoff = "10s" 214 | } 215 | 216 | enable_message_ordering = false 217 | } 218 | 219 | resource "google_dataflow_flex_template_job" "dataflow_stream" { 220 | provider = google-beta 221 | name = "ecommerce-events-ps-to-bq-stream" 222 | container_spec_gcs_path = "gs://${var.project_id}-ecommerce-events/df_templates/dataflow_template.json" 223 | region = var.gcp_region 224 | project = var.project_id 225 | depends_on = [google_project_service.compute, google_project_service.dataflow] 226 | parameters = { 227 | "on_delete" = "cancel" 228 | "service_account_email" = "${google_service_account.data_pipeline_access.email}" 229 | "network" = "${google_compute_network.vpc_network.name}" 230 | "max_workers" = 1 231 | "temp_location" = "gs://${var.project_id}-ecommerce-events/df_tmp_dir" 232 | "runner" = "DataflowRunner" 233 | } 234 | } 235 | 236 | 237 | # Pipeline 2: Cloud Run Proxy -> Pub/Sub -> BigQuery 238 | resource "google_bigquery_table" "bq_table_bqdirect" { 239 | dataset_id = google_bigquery_dataset.bq_dataset.dataset_id 240 | table_id = "pubsub_direct" 241 | deletion_protection = false 242 | 243 | labels = { 244 | env = "default" 245 | } 246 | 247 | schema = < Pub/Sub -> Cloud Run Processing -> BigQuery 297 | resource "google_cloud_run_service" "hyp_run_service_data_processing" { 298 | name = "hyp-run-service-data-processing" 299 | location = var.gcp_region 300 | 301 | template { 302 | spec { 303 | containers { 304 | image = 
"gcr.io/${var.project_id}/data-processing-service" 305 | } 306 | service_account_name = "${google_service_account.data_pipeline_access.email}" 307 | } 308 | } 309 | 310 | traffic { 311 | percent = 100 312 | latest_revision = true 313 | } 314 | 315 | depends_on = [google_project_service.run] 316 | } 317 | 318 | resource "google_cloud_run_service_iam_policy" "noauth_dp" { 319 | location = google_cloud_run_service.hyp_run_service_data_processing.location 320 | project = google_cloud_run_service.hyp_run_service_data_processing.project 321 | service = google_cloud_run_service.hyp_run_service_data_processing.name 322 | policy_data = data.google_iam_policy.noauth.policy_data 323 | } 324 | 325 | resource "google_pubsub_subscription" "hyp_sub_cloud_run" { 326 | name = "hyp_subscription_cloud_run" 327 | topic = google_pubsub_topic.ps_topic.name 328 | 329 | labels = { 330 | created = "terraform" 331 | } 332 | 333 | push_config { 334 | push_endpoint = google_cloud_run_service.hyp_run_service_data_processing.status[0].url 335 | 336 | attributes = { 337 | x-goog-version = "v1" 338 | } 339 | } 340 | 341 | retain_acked_messages = false 342 | 343 | ack_deadline_seconds = 20 344 | 345 | 346 | retry_policy { 347 | minimum_backoff = "10s" 348 | } 349 | 350 | enable_message_ordering = false 351 | } 352 | 353 | resource "google_bigquery_table" "bq_table_cloud_run" { 354 | dataset_id = google_bigquery_dataset.bq_dataset.dataset_id 355 | table_id = "cloud_run" 356 | deletion_protection = false 357 | 358 | time_partitioning { 359 | type = "DAY" 360 | field = "event_datetime" 361 | } 362 | 363 | labels = { 364 | env = "default" 365 | } 366 | 367 | schema = file("./datalayer/ecommerce_events_bq_schema.json") 368 | 369 | } -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/processing-service/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | ENV APP_HOME /app 8 | WORKDIR $APP_HOME 9 | COPY . ./ 10 | COPY ./requirements.txt ./ 11 | 12 | # Install production dependencies. 13 | RUN pip install --upgrade pip 14 | RUN pip install --no-cache-dir -r requirements.txt 15 | 16 | # Run the web service on container startup. Here we use the gunicorn 17 | # webserver, with one worker process and 8 threads. 18 | # For environments with multiple CPU cores, increase the number of workers 19 | # to be equal to the cores available. 20 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 21 | CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/processing-service/README.md: -------------------------------------------------------------------------------- 1 | The processing service defines a Cloud Run Service to process each incoming datapoint. 2 | 3 | 4 | main.py defines the public facing webserver to listen for requests. 5 | 6 | synth_data_stream.py creates a synthetic data stream of events randomly chosen from the datalayer. It also randomly includes anomalies in the data. 
7 | 8 | 9 | Command to start data stream: 10 | python3 synth_data_stream.py --endpoint {Pub/Sub endpoint link}' -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/processing-service/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = 'poerschmann-hyp-test3' 16 | location = 'europe-west1' 17 | bq_dataset = 'ecommerce_sink' 18 | bq_table = 'cloud_run' 19 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/processing-service/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import time 17 | import base64 18 | import json 19 | import datetime 20 | import config 21 | 22 | from flask import Flask, request 23 | 24 | from google.cloud import bigquery 25 | 26 | app = Flask(__name__) 27 | 28 | 29 | @app.route("/hw", methods=['GET', 'POST']) 30 | def hello_world(): 31 | world = request.args.get('world') 32 | return f"Hello {world}!" 33 | 34 | 35 | @app.route("/", methods=["POST"]) 36 | def index(): 37 | envelope = request.get_json() 38 | print(envelope) 39 | print(type(envelope)) 40 | 41 | if not envelope: 42 | msg = "no Pub/Sub message received" 43 | print(f"error: {msg}") 44 | return f"Bad Request: {msg}", 400 45 | 46 | ps_message = envelope['message'] 47 | 48 | record = base64.b64decode(ps_message["data"]).decode("utf-8").strip() 49 | record = json.loads(record) 50 | 51 | record["weekday"] = datetime.datetime.strptime(record["event_datetime"], "%Y-%m-%d %H:%M:%S").strftime('%A') 52 | 53 | rows_to_insert = [record] 54 | 55 | client = bigquery.Client(project=config.project_id, location=config.location) 56 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table 57 | 58 | errors = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 
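# insert_rows_json streams the rows through the BigQuery insertAll API and
# returns a list with one entry per failed row; an empty list means all rows
# were written successfully.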
59 | if errors == []: 60 | print(f"{time.time()} New rows have been added.") 61 | return ("", 204) 62 | else: 63 | print("Encountered errors while inserting rows: {}".format(errors)) 64 | return f"Bad Request: {envelope}", 400 65 | 66 | 67 | if __name__ == "__main__": 68 | app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) 69 | 70 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/processing-service/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/terraform.tfvars: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | project_id = "poerschmann-hyp-test3" 18 | delete_contents_on_destroy = true 19 | -------------------------------------------------------------------------------- /01_ingest_and_transform/12_solution/variables.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | variable "project_id" { 18 | description = "Project where the dataset and table are created." 19 | } 20 | 21 | variable "delete_contents_on_destroy" { 22 | description = "(Optional) If set to true, delete all the tables in the dataset when destroying the resource; otherwise, destroying the resource will fail if tables are present." 23 | type = bool 24 | default = null 25 | } 26 | 27 | variable "force_destroy" { 28 | description = "When deleting a bucket, this boolean option will delete all contained objects. If false, Terraform will fail to delete buckets which contain objects." 29 | type = bool 30 | default = false 31 | } 32 | 33 | variable "gcp_region" { 34 | description = "GCP region to deploy resources in." 
35 | type = string 36 | default = "europe-west1" 37 | } -------------------------------------------------------------------------------- /02_activate/21_challenge/cloud-run-pubsub-proxy/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /02_activate/21_challenge/cloud-run-pubsub-proxy/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official lightweight Node.js 12 image. 2 | # https://hub.docker.com/_/node 3 | FROM node:12-slim 4 | 5 | # Create and change to the app directory. 6 | WORKDIR /usr/src/app 7 | 8 | # Copy application dependency manifests to the container image. 9 | # A wildcard is used to ensure both package.json AND package-lock.json are copied. 10 | # Copying this separately prevents re-running npm install on every code change. 11 | COPY package*.json ./ 12 | 13 | # Install production dependencies. 14 | RUN npm install --only=production 15 | 16 | # Copy local code to the container image. 17 | COPY . ./ 18 | 19 | # Run the web service on container startup. 20 | CMD [ "npm", "start" ] -------------------------------------------------------------------------------- /02_activate/21_challenge/cloud-run-pubsub-proxy/README.md: -------------------------------------------------------------------------------- 1 | Cloud Run Proxy is a express webserver that listenes to incoming requests and publishes them to a chosen Pub/Sub topic. -------------------------------------------------------------------------------- /02_activate/21_challenge/cloud-run-pubsub-proxy/app.js: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
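// Express proxy app: GET / is a hello-style health endpoint, POST /json takes
// a dataLayer event as its JSON body and republishes it to the
// "hyp-pubsub-topic" Pub/Sub topic.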
14 | 15 | const express = require('express'); 16 | const bodyParser = require('body-parser'); 17 | const app = express(); 18 | 19 | app.use(bodyParser.json()); 20 | 21 | app.get('/', (req, res) => { 22 | console.log('Hello world received a request.'); 23 | 24 | const target = process.env.TARGET || 'World'; 25 | res.send(`Hello ${target}!`); 26 | }); 27 | 28 | app.post('/json', (req, res) => { 29 | const dataLayer = JSON.stringify(req.body) 30 | console.log(`proxy POST request received dataLayer: ${dataLayer}`) 31 | 32 | const {PubSub} = require('@google-cloud/pubsub'); 33 | 34 | // Instantiates a client 35 | const pubsub = new PubSub(); 36 | 37 | const {Buffer} = require('safe-buffer'); 38 | 39 | // Set Pub/Sub topic name 40 | let topicName = 'hyp-pubsub-topic'; 41 | 42 | // References an existing topic 43 | const topic = pubsub.topic(topicName); 44 | 45 | // Publishes the message as a string, 46 | const dataBuffer = Buffer.from(dataLayer); 47 | 48 | // Add two custom attributes, origin and username, to the message 49 | const customAttributes = { 50 | origin: 'gtm-cloud-run', 51 | username: 'gcp-demo', 52 | }; 53 | 54 | // Publishes a message to Pub/Sub 55 | return topic 56 | .publishMessage({data: dataBuffer}) 57 | .then(() => res.status(200).send(`{"message": "pubsub message sent: ${dataBuffer}"}`)) 58 | .catch(err => { 59 | console.error(err); 60 | res.status(500).send(err); 61 | return Promise.reject(err); 62 | }); 63 | }) 64 | 65 | 66 | module.exports = app; 67 | -------------------------------------------------------------------------------- /02_activate/21_challenge/cloud-run-pubsub-proxy/index.js: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Google 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
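// Server entry point: starts the Express app from app.js on the port Cloud Run
// provides via PORT (8080 by default).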
14 | 15 | const app = require('./app.js'); 16 | const PORT = process.env.PORT || 8080; 17 | 18 | app.listen(PORT, () => console.log(`pubsub proxy app listening on port ${PORT}`)); 19 | -------------------------------------------------------------------------------- /02_activate/21_challenge/cloud-run-pubsub-proxy/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pubsub-proxy", 3 | "version": "1.0.0", 4 | "description": "Cloud Run app to send messages to Pub/Sub topic using Node", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "node index.js" 8 | }, 9 | "author": "", 10 | "license": "Apache-2.0", 11 | "dependencies": { 12 | "express": "^4.17.1", 13 | "body-parser": "^1.19.0", 14 | "@google-cloud/pubsub": "^3.3.0", 15 | "safe-buffer": "5.1.2" 16 | } 17 | } -------------------------------------------------------------------------------- /02_activate/21_challenge/config_custom.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import os 17 | 18 | PROJECT_ID = os.environ['GCP_PROJECT'] 19 | REGION = os.environ['GCP_REGION'] 20 | PIPELINE_ROOT_PATH=f'gs://{PROJECT_ID}-ai-bucket/pipeline_root_custom/' 21 | 22 | TRAIN_IMAGE_URI=os.environ['TRAIN_IMAGE_URI'] 23 | PREDICT_IMAGE_URI=os.environ['PREDICT_IMAGE_URI'] 24 | AIP_STORAGE_URI=f'gs://{PROJECT_ID}-ai-bucket/vtx-artifacts' 25 | 26 | SERVICE_ACCOUNT=f"retailpipeline-hyp@{PROJECT_ID}.iam.gserviceaccount.com" 27 | MACHINE_TYPE = "n1-standard-4" 28 | DATA_URI=f"{PROJECT_ID}.ecommerce_sink.anomaly_data" -------------------------------------------------------------------------------- /02_activate/21_challenge/config_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export GCP_PROJECT="" 4 | export ENDPOINT_URL="" # doesn't need to be defined in the very beginning 5 | export PUSH_ENDPOINT='' # doesn't need to be defined in the very beginning 6 | export GCP_REGION=europe-west1 7 | export RUN_PROXY_DIR=cloud-run-pubsub-proxy 8 | export RUN_PROCESSING_DIR=processing-service 9 | export DATAFLOW_TEMPLATE=beam 10 | export RUN_INFERENCE_PROCESSING_SERVICE=inf_processing_service 11 | export RUN_INFERENCE_PROCESSING_SERVICE_CUSTOM=inf_processing_service_custom 12 | 13 | export TRAIN_IMAGE_URI=gcr.io/$GCP_PROJECT/custom_train:v1 14 | export PREDICT_IMAGE_URI=gcr.io/$GCP_PROJECT/custom_predict:v1 15 | -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/prediction/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tiangolo/uvicorn-gunicorn-fastapi:python3.7 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | COPY / /app 8 | WORKDIR /app 9 | COPY . 
./ 10 | 11 | # Install production dependencies. 12 | RUN pip install --upgrade pip 13 | RUN pip install --no-cache-dir -r requirements.txt 14 | 15 | # Run the web service on container startup. Here we use the gunicorn 16 | # webserver, with one worker process and 8 threads. 17 | # For environments with multiple CPU cores, increase the number of workers 18 | # to be equal to the cores available. 19 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 20 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"] 21 | 22 | EXPOSE 8080 -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/prediction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/02_activate/21_challenge/custom_train/prediction/__init__.py -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/prediction/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | PROJECT_ID="" 17 | REGION="europe-west1" -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/prediction/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
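# FastAPI serving container: downloads model.joblib from ${AIP_STORAGE_URI}/model_dir
# at startup and exposes health and predict routes for Vertex AI; the predict
# handler below is left for the challenge to complete.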
17 | 18 | 19 | from fastapi import Request, FastAPI 20 | import json 21 | import os 22 | from joblib import load 23 | import sys 24 | import pandas as pd 25 | from google.cloud import storage 26 | from tempfile import TemporaryFile 27 | import os 28 | import config 29 | 30 | app = FastAPI() 31 | 32 | model_directory = f"{os.environ['AIP_STORAGE_URI']}/model_dir" 33 | storage_path = os.path.join(model_directory, "model.joblib") 34 | 35 | storage_client = storage.Client(project=config.PROJECT_ID) 36 | blob = storage.blob.Blob.from_string(storage_path, client=storage_client) 37 | 38 | blob.download_to_filename("model.joblib") 39 | model =load(open("model.joblib",'rb')) 40 | 41 | @app.get('/') 42 | def get_root(): 43 | return {'message': 'Welcome to custom anomaly detection'} 44 | 45 | @app.get('/health_check') 46 | def health(): 47 | return 200 48 | 49 | if os.environ.get('AIP_PREDICT_ROUTE') is not None: 50 | method = os.environ['AIP_PREDICT_ROUTE'] 51 | else: 52 | method = '/predict' 53 | 54 | @app.post(method) 55 | async def predict(request: Request): 56 | print("----------------- PREDICTING -----------------") 57 | body = await request.json() 58 | # prepare data 59 | instances = pd.DataFrame(body["instances"]) 60 | 61 | # retrieving predictions 62 | outputs = "<3. add the code that predicts anomalies, using the model, and the input from the app>" 63 | 64 | response = outputs.tolist() 65 | print("----------------- OUTPUTS -----------------") 66 | return {"predictions": response} -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/prediction/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery 6 | google-cloud-aiplatform 7 | google-cloud-storage 8 | scikit-learn 9 | joblib 10 | gcsfs -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/trainer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/deeplearning-platform-release/sklearn-cpu.0-23 2 | WORKDIR / 3 | 4 | # Allow statements and log messages to immediately appear in the Knative logs 5 | ENV PYTHONUNBUFFERED True 6 | 7 | # Copies the trainer code to the docker image. 8 | COPY / /trainer 9 | COPY . ./ 10 | 11 | # Install production dependencies. 12 | RUN pip install --upgrade pip 13 | RUN pip install --no-cache-dir -r requirements.txt 14 | 15 | # Sets up the entry point to invoke the trainer. 16 | CMD ["python", "trainer/main.py"] -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/02_activate/21_challenge/custom_train/trainer/__init__.py -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/trainer/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | PROJECT_ID="" 17 | REGION="europe-west1" 18 | AIP_STORAGE_URI=f'gs://{PROJECT_ID}-ai-bucket/vtx-artifacts' 19 | # training data: 20 | DATA_URI=f"{PROJECT_ID}.ecommerce_sink.anomaly_data" -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/trainer/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from sklearn.tree import DecisionTreeClassifier 16 | from sklearn.metrics import roc_curve 17 | from sklearn.model_selection import train_test_split 18 | from google.cloud import bigquery 19 | from google.cloud import storage 20 | from joblib import dump 21 | 22 | import preprocess 23 | import train 24 | import config 25 | 26 | import os 27 | import pandas as pd 28 | import sys 29 | 30 | # data uri 31 | data_uri = config.DATA_URI 32 | 33 | # bq client 34 | bqclient = bigquery.Client(project=config.PROJECT_ID) 35 | storage_client = storage.Client(project=config.PROJECT_ID) 36 | 37 | ## Download & prep data 38 | print('[INFO] ------ Preparing Data', file=sys.stderr) 39 | train_data, train_labels, test_data, test_labels = preprocess.prep_data(bqclient, storage_client, data_uri) 40 | 41 | ## Train model and save it in Google Cloud Storage 42 | print('[INFO] ------ Training & Saving Model', file=sys.stderr) 43 | '<2. train the model with train and test data and labels, calling the relevant client>' -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/trainer/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
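# Helpers used by trainer/main.py: download_table reads a BigQuery table into a
# pandas DataFrame and prep_data splits it into train/test sets, popping the
# "anomaly" column as the labels.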
14 | 15 | from sklearn.tree import DecisionTreeClassifier 16 | from sklearn.metrics import roc_curve 17 | from sklearn.model_selection import train_test_split 18 | from google.cloud import bigquery 19 | from google.cloud import storage 20 | from joblib import dump 21 | 22 | import os 23 | import pandas as pd 24 | 25 | def download_table(bqclient, storage_client, bq_table_uri: str): 26 | 27 | prefix = "bq://" 28 | if bq_table_uri.startswith(prefix): 29 | bq_table_uri = bq_table_uri[len(prefix):] 30 | 31 | table = bigquery.TableReference.from_string(bq_table_uri) 32 | rows = bqclient.list_rows( 33 | table, 34 | ) 35 | return rows.to_dataframe(create_bqstorage_client=False) 36 | 37 | def prep_data(bqclient, storage_client, data_uri: str): 38 | 39 | # Download data into Pandas DataFrames, split into train / test 40 | df, test_df = train_test_split(download_table(bqclient, storage_client, data_uri)) 41 | labels = df.pop("anomaly").tolist() 42 | data = df.values.tolist() 43 | test_labels = test_df.pop("anomaly").tolist() 44 | test_data = test_df.values.tolist() 45 | 46 | return data, labels, test_data, test_labels -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/trainer/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn 2 | google-cloud-bigquery 3 | joblib 4 | pandas 5 | google-cloud-storage -------------------------------------------------------------------------------- /02_activate/21_challenge/custom_train/trainer/train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from sklearn.tree import DecisionTreeClassifier 16 | from sklearn.metrics import roc_curve 17 | from sklearn.model_selection import train_test_split 18 | from google.cloud import bigquery 19 | from google.cloud import storage 20 | from joblib import dump 21 | 22 | import os 23 | import pandas as pd 24 | 25 | def train_model(data, labels, test_data, test_labels, storage_client): 26 | 27 | # Define and train the Scikit model 28 | skmodel = '<1. initialize the model by calling the model operator>' 29 | '<1. 
fit the model with data and labels>' 30 | score = skmodel.score(test_data, test_labels) 31 | print('accuracy is:',score) 32 | 33 | # Storage location 34 | model_directory = f"{os.environ['AIP_STORAGE_URI']}/model_dir" 35 | storage_path = os.path.join(model_directory, "model.joblib") 36 | 37 | # Save the model to a local file 38 | dump(skmodel, 'model.joblib') 39 | 40 | blob = storage.blob.Blob.from_string(storage_path, client=storage_client) 41 | blob.upload_from_filename("model.joblib") 42 | 43 | return(skmodel) -------------------------------------------------------------------------------- /02_activate/21_challenge/datalayer/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /02_activate/21_challenge/datalayer/README.md: -------------------------------------------------------------------------------- 1 | Datalayer defines the json events that could occur to be fed into the pipeline. 2 | 3 | Four types of events are included: 4 | * add to cart 5 | * made purchase 6 | * made purcase with anomaly (artifical mistake in data to be identified later) 7 | * view item -------------------------------------------------------------------------------- /02_activate/21_challenge/datalayer/add_to_cart.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "add_to_cart", 4 | "user_id": "UID00003", 5 | "client_id": "CID00003", 6 | "page":"/product-67890", 7 | "page_previous": "/category-tshirts", 8 | "ecommerce": { 9 | "items": [{ 10 | "item_name": "Donut Friday Scented T-Shirt", 11 | "item_id": "67890", 12 | "price": 33.75, 13 | "item_brand": "Google", 14 | "item_category": "Apparel", 15 | "item_category_2": "Mens", 16 | "item_category_3": "Shirts", 17 | "item_category_4": "Tshirts", 18 | "item_variant": "Black", 19 | "item_list_name": "Search Results", 20 | "item_list_id": "SR123", 21 | "index": 1, 22 | "quantity": 2 23 | }] 24 | } 25 | } -------------------------------------------------------------------------------- /02_activate/21_challenge/datalayer/ecommerce_events_bq_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "event_datetime", 4 | "type": "TIMESTAMP", 5 | "mode": "NULLABLE" 6 | }, 7 | { 8 | "name": "event", 9 | "type": "STRING", 10 | "mode": "REQUIRED" 11 | }, 12 | { 13 | "name": "user_id", 14 | "type": "STRING", 15 | "mode": "REQUIRED" 16 | }, 17 | { 18 | "name": "client_id", 19 | "type": "STRING", 20 | "mode": "NULLABLE" 21 | }, 22 | { 23 | "name": "page", 24 | "type": "STRING", 25 | "mode": "NULLABLE" 26 | }, 27 | { 28 | "name": "page_previous", 29 | "type": "STRING", 30 | "mode": "NULLABLE" 31 | }, 32 | { 33 | "name": "weekday", 34 | "type": "STRING", 35 | "mode": "NULLABLE" 36 | }, 37 | { 38 | "name": "ecommerce", 39 | "type": "RECORD", 40 | "mode": "NULLABLE", 41 | "fields": [ 42 | { 43 | "mode": "REPEATED", 44 | "name": "items", 45 | "type": "RECORD", 46 | "fields": [ 47 | { 48 | "mode": "NULLABLE", 49 | "name": "index", 50 | "type": "INTEGER" 51 | }, 52 | 53 | { 54 | "mode": "NULLABLE", 55 | "name": "item_id", 56 | "type": "INTEGER" 57 | }, 58 | { 59 | "mode": "NULLABLE", 60 | "name": "item_name", 61 | "type": "STRING" 62 | }, 63 | { 64 | "mode": "NULLABLE", 65 | "name": "item_list_name", 66 | "type": "STRING" 67 | }, 68 | { 69 | "mode": "NULLABLE", 70 | "name": "item_list_id", 71 | "type": 
"STRING" 72 | }, 73 | { 74 | "mode": "NULLABLE", 75 | "name": "price", 76 | "type": "FLOAT" 77 | }, 78 | { 79 | "mode": "NULLABLE", 80 | "name": "item_variant", 81 | "type": "STRING" 82 | }, 83 | { 84 | "mode": "NULLABLE", 85 | "name": "quantity", 86 | "type": "INTEGER" 87 | }, 88 | { 89 | "mode": "NULLABLE", 90 | "name": "item_brand", 91 | "type": "STRING" 92 | }, 93 | { 94 | "mode": "NULLABLE", 95 | "name": "item_category", 96 | "type": "STRING" 97 | }, 98 | { 99 | "mode": "NULLABLE", 100 | "name": "item_category_2", 101 | "type": "STRING" 102 | }, 103 | { 104 | "mode": "NULLABLE", 105 | "name": "item_category_3", 106 | "type": "STRING" 107 | }, 108 | { 109 | "mode": "NULLABLE", 110 | "name": "item_category_4", 111 | "type": "STRING" 112 | } 113 | ] 114 | }, 115 | { 116 | "mode": "NULLABLE", 117 | "name": "purchase", 118 | "type": "RECORD", 119 | "fields": [ 120 | { 121 | "fields": [ 122 | { 123 | "mode": "NULLABLE", 124 | "name": "item_coupon", 125 | "type": "STRING" 126 | }, 127 | { 128 | "mode": "NULLABLE", 129 | "name": "quantity", 130 | "type": "INTEGER" 131 | }, 132 | { 133 | "mode": "NULLABLE", 134 | "name": "item_variant", 135 | "type": "STRING" 136 | }, 137 | { 138 | "mode": "NULLABLE", 139 | "name": "item_category", 140 | "type": "STRING" 141 | }, 142 | { 143 | "mode": "NULLABLE", 144 | "name": "item_name", 145 | "type": "STRING" 146 | }, 147 | { 148 | "mode": "NULLABLE", 149 | "name": "item_id", 150 | "type": "INTEGER" 151 | }, 152 | { 153 | "mode": "NULLABLE", 154 | "name": "item_brand", 155 | "type": "STRING" 156 | }, 157 | { 158 | "mode": "NULLABLE", 159 | "name": "item_price", 160 | "type": "FLOAT" 161 | } 162 | ], 163 | "mode": "REPEATED", 164 | "name": "items", 165 | "type": "RECORD" 166 | }, 167 | { 168 | "mode": "NULLABLE", 169 | "name": "coupon", 170 | "type": "STRING" 171 | }, 172 | { 173 | "mode": "NULLABLE", 174 | "name": "tax", 175 | "type": "FLOAT" 176 | }, 177 | { 178 | "mode": "NULLABLE", 179 | "name": "shipping", 180 | "type": "FLOAT" 181 | }, 182 | { 183 | "mode": "NULLABLE", 184 | "name": "value", 185 | "type": "FLOAT" 186 | }, 187 | { 188 | "mode": "NULLABLE", 189 | "name": "affiliation", 190 | "type": "STRING" 191 | }, 192 | { 193 | "mode": "NULLABLE", 194 | "name": "currency", 195 | "type": "STRING" 196 | }, 197 | { 198 | "mode": "NULLABLE", 199 | "name": "transaction_id", 200 | "type": "STRING" 201 | } 202 | ] 203 | } 204 | ] 205 | } 206 | ] 207 | -------------------------------------------------------------------------------- /02_activate/21_challenge/datalayer/purchase.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "purchase", 4 | "user_id": "UID00001", 5 | "client_id": "CID00003", 6 | "page":"/checkout", 7 | "page_previous": "/order-confirmation", 8 | "ecommerce": { 9 | "purchase": { 10 | "transaction_id": "T12345", 11 | "affiliation": "Online Store", 12 | "value": 35.43, 13 | "tax": 4.90, 14 | "shipping": 5.99, 15 | "currency": "EUR", 16 | "coupon": "SUMMER_SALE", 17 | "items": [{ 18 | "item_name": "Triblend Android T-Shirt", 19 | "item_id": "12345", 20 | "item_price": 15.25, 21 | "item_brand": "Google", 22 | "item_category": "Apparel", 23 | "item_variant": "Gray", 24 | "quantity": 1, 25 | "item_coupon": "" 26 | }, { 27 | "item_name": "Donut Friday Scented T-Shirt", 28 | "item_id": "67890", 29 | "item_price": 33.75, 30 | "item_brand": "Google", 31 | "item_category": "Apparel", 32 | "item_variant": "Black", 33 | "quantity": 1 34 | }] 35 | } 36 | } 37 
| } -------------------------------------------------------------------------------- /02_activate/21_challenge/datalayer/purchase_anomaly.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 20:59:59", 3 | "event": "purchase", 4 | "user_id": "UID00001", 5 | "client_id": "CID00003", 6 | "page":"/checkout", 7 | "page_previous": "/order-confirmation", 8 | "ecommerce": { 9 | "purchase": { 10 | "transaction_id": "T12345", 11 | "affiliation": "Online Store", 12 | "value": 1000000.10, 13 | "tax": 4.90, 14 | "shipping": 5.99, 15 | "currency": "EUR", 16 | "coupon": "SUMMER_SALE", 17 | "items": [{ 18 | "item_name": "Triblend Android T-Shirt", 19 | "item_id": "12345", 20 | "item_price": 15.25, 21 | "item_brand": "Google", 22 | "item_category": "Apparel", 23 | "item_variant": "Gray", 24 | "quantity": 1, 25 | "item_coupon": "" 26 | }, { 27 | "item_name": "Donut Friday Scented T-Shirt", 28 | "item_id": "67890", 29 | "item_price": 33.75, 30 | "item_brand": "Google", 31 | "item_category": "Apparel", 32 | "item_variant": "Black", 33 | "quantity": 1 34 | }] 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /02_activate/21_challenge/datalayer/synth_data_stream.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
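# Synthetic click-stream generator: each call to main() draws a uniform random number
# and posts one JSON event from the datalayer to the proxy endpoint -- roughly one third
# view_item, one third add_to_cart and one third purchase, with draws >= 0.95 (about 5%
# of all events) sent as anomalous purchases.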
14 | 15 | import random 16 | import requests 17 | import json 18 | import time 19 | import argparse 20 | 21 | 22 | def main(endpoint): 23 | draw = round(random.uniform(0, 1), 2) 24 | 25 | uid = f'UID0000{int(round(random.uniform(0, 5), 0))}' 26 | 27 | if 0 <= draw < 1 / 3: 28 | # get view payload 29 | view_item_f = open('./datalayer/view_item.json') 30 | view_item_payload = json.load(view_item_f) 31 | 32 | view_item_payload['user_id'] = uid 33 | 34 | # send view 35 | r = requests.post(endpoint, json=view_item_payload) 36 | 37 | elif 1 / 3 <= draw < 2 / 3: 38 | # get add to cart payload 39 | add_to_cart_f = open('./datalayer/add_to_cart.json') 40 | add_to_cart_payload = json.load(add_to_cart_f) 41 | 42 | add_to_cart_payload['user_id'] = uid 43 | 44 | # send add to cart 45 | r = requests.post(endpoint, json=add_to_cart_payload) 46 | 47 | else: 48 | # decide between anomaly or no anomaly 49 | if draw < 0.95: 50 | # get payload 51 | purchase_f = open('./datalayer/purchase.json') 52 | purchase_payload = json.load(purchase_f) 53 | 54 | purchase_payload['user_id'] = uid 55 | 56 | # send request 57 | r = requests.post(endpoint, json=purchase_payload) 58 | else: 59 | # get payload 60 | purchase_anomaly_f = open('./datalayer/purchase_anomaly.json') 61 | purchase_anomaly_payload = json.load(purchase_anomaly_f) 62 | 63 | purchase_anomaly_payload['user_id'] = uid 64 | 65 | # send request 66 | r = requests.post(endpoint, json=purchase_anomaly_payload) 67 | 68 | # print(r.text) 69 | print(f'{time.time()} -- {r.status_code}') 70 | 71 | 72 | if __name__ == "__main__": 73 | # Parse Arguments 74 | parser = argparse.ArgumentParser() 75 | parser.add_argument("--endpoint", help="Target Endpoint") 76 | 77 | args = parser.parse_args() 78 | 79 | endpoint = args.endpoint + '/json' 80 | 81 | while True: 82 | main(endpoint) 83 | time.sleep(.1) 84 | -------------------------------------------------------------------------------- /02_activate/21_challenge/datalayer/view_item.json: -------------------------------------------------------------------------------- 1 | { 2 | "event_datetime":"2020-11-16 22:59:59", 3 | "event": "view_item", 4 | "user_id": "UID00003", 5 | "client_id": "CID00003", 6 | "page":"/product-67890", 7 | "page_previous": "/category-tshirts", 8 | "ecommerce": { 9 | "items": [{ 10 | "item_name": "Donut Friday Scented T-Shirt", 11 | "item_id": "67890", 12 | "price": 33.75, 13 | "item_brand": "Google", 14 | "item_category": "Apparel", 15 | "item_category_2": "Mens", 16 | "item_category_3": "Shirts", 17 | "item_category_4": "Tshirts", 18 | "item_variant": "Black", 19 | "item_list_name": "Search Results", 20 | "item_list_id": "SR123", 21 | "index": 1, 22 | "quantity": 1 23 | }] 24 | } 25 | } -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | ENV APP_HOME /app 8 | WORKDIR $APP_HOME 9 | COPY . ./ 10 | COPY ./requirements.txt ./ 11 | 12 | # Install production dependencies. 13 | RUN pip install --upgrade pip 14 | RUN pip install --no-cache-dir -r requirements.txt 15 | 16 | # Run the web service on container startup. Here we use the gunicorn 17 | # webserver, with one worker process and 8 threads. 
18 | # For environments with multiple CPU cores, increase the number of workers 19 | # to be equal to the cores available. 20 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 21 | CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service/README.md: -------------------------------------------------------------------------------- 1 | The processing service defines a Cloud Run Service to process each incoming datapoint. 2 | 3 | 4 | main.py defines the public facing webserver to listen for requests. 5 | 6 | synth_data_stream.py creates a synthetic data stream of events randomly chosen from the datalayer. It also randomly includes anomalies in the data. 7 | 8 | 9 | Command to start data stream: 10 | python3 synth_data_stream.py --endpoint {Pub/Sub endpoint link}' -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = '' 16 | location = 'europe-west1' 17 | bq_dataset = 'ecommerce_sink' 18 | bq_table = 'cloud_run' 19 | bq_table_anomaly = 'cloud_run_anomaly' 20 | endpoint_id = '' 21 | -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import time 17 | import base64 18 | import json 19 | import datetime 20 | import config 21 | 22 | from flask import Flask, request 23 | 24 | from google.cloud import bigquery, aiplatform 25 | 26 | 27 | app = Flask(__name__) 28 | 29 | 30 | @app.route("/hw", methods=['GET', 'POST']) 31 | def hello_world(): 32 | world = request.args.get('world') 33 | return f"Hello {world}!" 
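# A possible way to fill in the placeholders in index() below, sketched as an unused
# helper so the challenge code keeps working as-is. It mirrors the pattern used in
# inf_processing_service_custom/main.py and assumes config.endpoint_id points at the
# deployed anomaly-detection endpoint; treat it as an illustration, not the official
# solution.
def _predict_and_store_sketch(record, record_to_predict):
    # Initialize the Vertex AI SDK and reference the prediction endpoint.
    aiplatform.init(project=config.project_id, location=config.location)
    endpoint = aiplatform.Endpoint(
        endpoint_name=f"projects/{config.project_id}/locations/{config.location}/endpoints/{config.endpoint_id}",
        project=config.project_id,
        location=config.location,
    )

    # Online prediction; the k-means model returns the nearest centroid id per instance.
    endpoint_response = endpoint.predict(instances=record_to_predict)
    centroid = endpoint_response.predictions[0]["nearest_centroid_id"][0]
    anomaly = centroid == 1

    # Stream the enriched record into the anomaly table.
    anomaly_record = {
        "tax": record["ecommerce"]["purchase"]["tax"],
        "shipping": record["ecommerce"]["purchase"]["shipping"],
        "value": record["ecommerce"]["purchase"]["value"],
        "anomaly": anomaly,
    }
    client = bigquery.Client(project=config.project_id, location=config.location)
    table_id = f"{config.project_id}.{config.bq_dataset}.{config.bq_table_anomaly}"
    return client.insert_rows_json(table_id, [anomaly_record])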
34 | 35 | 36 | @app.route("/", methods=["POST"]) 37 | def index(): 38 | envelope = request.get_json() 39 | print(envelope) 40 | print(type(envelope)) 41 | 42 | if not envelope: 43 | msg = "no Pub/Sub message received" 44 | print(f"error: {msg}") 45 | return f"Bad Request: {msg}", 400 46 | 47 | ps_message = envelope['message'] 48 | 49 | record = base64.b64decode(ps_message["data"]).decode("utf-8").strip() 50 | record = json.loads(record) 51 | 52 | record["weekday"] = datetime.datetime.strptime(record["event_datetime"], "%Y-%m-%d %H:%M:%S").strftime('%A') 53 | 54 | rows_to_insert = [record] 55 | 56 | client = bigquery.Client(project=config.project_id, location=config.location) 57 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table 58 | 59 | errors = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 60 | 61 | 62 | # Create record that includes anomaly detection inference. 63 | if record["event"] == "purchase": 64 | record_to_predict = [ 65 | {"tax": record["ecommerce"]["purchase"]["tax"], 66 | "shipping": record["ecommerce"]["purchase"]["shipping"], 67 | "value":record["ecommerce"]["purchase"]["value"]} 68 | ] 69 | 70 | # 71 | 72 | # < vertex endpoint definition > 73 | 74 | # < calling prediction from endpoint > 75 | 76 | centroid = endpoint_response.predictions[0]["nearest_centroid_id"][0] 77 | 78 | if centroid == 1: 79 | anomaly = True 80 | if centroid == 2: 81 | anomaly = False 82 | 83 | print(anomaly) 84 | 85 | anomaly_record = {"tax": record["ecommerce"]["purchase"]["tax"], "shipping": record["ecommerce"]["purchase"]["shipping"], "value":record["ecommerce"]["purchase"]["value"], "anomaly": anomaly} 86 | 87 | rows_to_insert = [anomaly_record] 88 | 89 | # < defining Big Query client > 90 | # < setting table id > 91 | # < api request to insert rows in BigQuery destination table > 92 | 93 | if errors_an == []: 94 | print(f"{time.time()} New rows with prediction have been added.") 95 | return ("", 204) 96 | else: 97 | print("Encountered errors while inserting rows: {}".format(errors)) 98 | return f"Bad Request: {envelope}", 400 99 | 100 | if errors == []: 101 | print(f"{time.time()} New rows have been added.") 102 | return ("", 204) 103 | else: 104 | print("Encountered errors while inserting rows: {}".format(errors)) 105 | return f"Bad Request: {envelope}", 400 106 | 107 | 108 | if __name__ == "__main__": 109 | app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) 110 | 111 | -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery 6 | google-cloud-aiplatform -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service_custom/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | ENV APP_HOME /app 8 | WORKDIR $APP_HOME 9 | COPY . ./ 10 | COPY ./requirements.txt ./ 11 | 12 | # Install production dependencies. 13 | RUN pip install --upgrade pip 14 | RUN pip install --no-cache-dir -r requirements.txt 15 | 16 | # Run the web service on container startup. 
Here we use the gunicorn 17 | # webserver, with one worker process and 8 threads. 18 | # For environments with multiple CPU cores, increase the number of workers 19 | # to be equal to the cores available. 20 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 21 | CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service_custom/README.md: -------------------------------------------------------------------------------- 1 | The processing service defines a Cloud Run Service to process each incoming datapoint. 2 | 3 | 4 | main.py defines the public facing webserver to listen for requests. 5 | 6 | synth_data_stream.py creates a synthetic data stream of events randomly chosen from the datalayer. It also randomly includes anomalies in the data. 7 | 8 | 9 | Command to start data stream: 10 | python3 synth_data_stream.py --endpoint {Pub/Sub endpoint link}' -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service_custom/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = '' 16 | location = 'europe-west1' 17 | bq_dataset = 'ecommerce_sink' 18 | bq_table = 'cloud_run' 19 | bq_table_anomaly = 'cloud_run_anomaly_custom' 20 | endpoind_id = '' 21 | -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service_custom/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import time 17 | import base64 18 | import json 19 | import datetime 20 | import config 21 | 22 | from flask import Flask, request 23 | 24 | from google.cloud import bigquery, aiplatform 25 | 26 | 27 | app = Flask(__name__) 28 | 29 | 30 | @app.route("/hw", methods=['GET', 'POST']) 31 | def hello_world(): 32 | world = request.args.get('world') 33 | return f"Hello {world}!" 
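# index() below parses the Pub/Sub push envelope, streams the raw event into
# config.bq_table, and for purchase events calls the custom Vertex AI endpoint and
# writes the returned prediction to config.bq_table_anomaly.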
34 | 35 | 36 | @app.route("/", methods=["POST"]) 37 | def index(): 38 | envelope = request.get_json() 39 | print(envelope) 40 | print(type(envelope)) 41 | 42 | if not envelope: 43 | msg = "no Pub/Sub message received" 44 | print(f"error: {msg}") 45 | return f"Bad Request: {msg}", 400 46 | 47 | ps_message = envelope['message'] 48 | 49 | record = base64.b64decode(ps_message["data"]).decode("utf-8").strip() 50 | record = json.loads(record) 51 | 52 | record["weekday"] = datetime.datetime.strptime(record["event_datetime"], "%Y-%m-%d %H:%M:%S").strftime('%A') 53 | 54 | rows_to_insert = [record] 55 | 56 | client = bigquery.Client(project=config.project_id, location=config.location) 57 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table 58 | 59 | errors = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 60 | 61 | 62 | # Create record that includes anomaly detection inference. 63 | if record["event"] == "purchase": 64 | record_to_predict = [ 65 | {"tax": record["ecommerce"]["purchase"]["tax"], 66 | "shipping": record["ecommerce"]["purchase"]["shipping"], 67 | "value":record["ecommerce"]["purchase"]["value"]} 68 | ] 69 | 70 | aiplatform.init(project=config.project_id, location=config.location) 71 | 72 | endpoint = aiplatform.Endpoint( 73 | endpoint_name=f"projects/{config.project_id}/locations/{config.location}/endpoints/{config.endpoind_id}", 74 | project = config.project_id, 75 | location=config.location, 76 | ) 77 | 78 | endpoint_response = endpoint.predict( 79 | instances=record_to_predict 80 | ) 81 | 82 | anomaly = endpoint_response.predictions[0] 83 | 84 | anomaly_record = {"tax": record["ecommerce"]["purchase"]["tax"], "shipping": record["ecommerce"]["purchase"]["shipping"], "value":record["ecommerce"]["purchase"]["value"], "anomaly": anomaly} 85 | 86 | rows_to_insert = [anomaly_record] 87 | 88 | client = bigquery.Client(project=config.project_id, location=config.location) 89 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table_anomaly 90 | 91 | errors_an = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 92 | 93 | 94 | if errors_an == []: 95 | print(f"{time.time()} New rows with prediction have been added.") 96 | return ("", 204) 97 | else: 98 | print("Encountered errors while inserting rows: {}".format(errors)) 99 | return f"Bad Request: {envelope}", 400 100 | 101 | if errors == []: 102 | print(f"{time.time()} New rows have been added.") 103 | return ("", 204) 104 | else: 105 | print("Encountered errors while inserting rows: {}".format(errors)) 106 | return f"Bad Request: {envelope}", 400 107 | 108 | 109 | if __name__ == "__main__": 110 | app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) 111 | 112 | -------------------------------------------------------------------------------- /02_activate/21_challenge/inf_processing_service_custom/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery 6 | google-cloud-aiplatform 7 | scikit-learn -------------------------------------------------------------------------------- /02_activate/21_challenge/kf_pipe_custom.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import time 16 | from typing import Iterable, Dict, NamedTuple 17 | 18 | import config_custom 19 | 20 | # import kfp 21 | from kfp.v2 import compiler, dsl 22 | from kfp.v2.dsl import component 23 | from kfp.v2.components import importer_node 24 | import google.cloud.aiplatform as aip 25 | from google_cloud_pipeline_components import aiplatform as gcc_aip 26 | 27 | from google_cloud_pipeline_components.types import artifact_types 28 | from google_cloud_pipeline_components.v1.custom_job import CustomTrainingJobOp 29 | 30 | ## Training Worker Specs 31 | WORKER_POOL_SPECS = [ 32 | { 33 | "machine_spec": { 34 | "machine_type": "n1-standard-4" 35 | }, 36 | "replica_count": "1", 37 | "container_spec": { 38 | "image_uri": config_custom.TRAIN_IMAGE_URI, 39 | "env": [ 40 | { 41 | "name": "AIP_STORAGE_URI", 42 | "value": config_custom.AIP_STORAGE_URI 43 | }, 44 | ] 45 | } 46 | } 47 | ] 48 | 49 | def compile_pipe(): 50 | # Define the workflow of the pipeline. 51 | @dsl.pipeline( 52 | name="anomaly-detection-custom-test", 53 | pipeline_root=config_custom.PIPELINE_ROOT_PATH) 54 | 55 | def pipeline( 56 | project_id: str, 57 | region: str, 58 | timestamp_id: str, 59 | artifact_staging_location:str, 60 | bq_source: str, 61 | aip_storage_uri: str, 62 | predict_image_uri: str 63 | ): 64 | 65 | # Model training 66 | train_job = '<1. Add the training job with a display name, project, location and worker pool defined.>' 67 | 68 | # Model evaluation 69 | # Ideally here you can evaluate the model and decide on deployment/or not for CI/CD purposes 70 | # example: https://www.cloudskillsboost.google/focuses/21234?parent=catalog 71 | 72 | # Import with the custom predict container 73 | import_unmanaged_model_op = importer_node.importer( 74 | artifact_uri=aip_storage_uri, 75 | artifact_class=artifact_types.UnmanagedContainerModel, 76 | metadata={ 77 | "containerSpec": { 78 | "imageUri": predict_image_uri, 79 | "env": [ 80 | { 81 | "name": "PROJECT_ID", 82 | "value": project_id}, 83 | ], 84 | "predictRoute": "/predict", 85 | "healthRoute": "/health_check", 86 | "ports": [ 87 | { 88 | "containerPort": 8080 89 | } 90 | ] 91 | }, 92 | }, 93 | ).after(train_job) 94 | 95 | # Upload the model into the registry 96 | custom_model_upload_job = gcc_aip.'<2. Find the correct operator>'( 97 | project=project_id, 98 | location=region, 99 | display_name=f"anomaly-detection-custom-model_{timestamp_id}", 100 | unmanaged_container_model=import_unmanaged_model_op.outputs["artifact"], 101 | ).after(import_unmanaged_model_op) 102 | 103 | # Create an endpoint where the model will be deployed 104 | endpoint_create_job = gcc_aip.'<3. Find the correct operator>'( 105 | project=project_id, 106 | display_name="anomaly-detection-custom-endpoint", 107 | location=region 108 | ) 109 | 110 | # Deploy the model on the endpoint 111 | _ = gcc_aip.'<4. 
Find the correct operator>'( 112 | model=custom_model_upload_job.outputs["model"], 113 | endpoint=endpoint_create_job.outputs["endpoint"], 114 | deployed_model_display_name="anomaly-detection-custom-deploy", 115 | dedicated_resources_min_replica_count=1, 116 | dedicated_resources_max_replica_count=1, 117 | dedicated_resources_machine_type="n1-standard-2", 118 | traffic_split={"0": 100} 119 | ) 120 | 121 | compiler.Compiler().compile(pipeline_func=pipeline, package_path="hyp-custom-anomaly-detection.json") 122 | 123 | if __name__ == "__main__": 124 | # Initialize aiplatform credentials. 125 | aip.init(project=config_custom.PROJECT_ID, location=config_custom.REGION) 126 | 127 | # Compile pipeline code. 128 | compile_pipe() 129 | 130 | # Unique ident for pipeline run 131 | timestamp_id = str(int(time.time())) 132 | 133 | # Prepare the pipeline job. 134 | job = aip.PipelineJob( 135 | display_name=f"{timestamp_id}-hyp-custom-anomaly-detection", 136 | template_path="hyp-custom-anomaly-detection.json", 137 | pipeline_root=config_custom.PIPELINE_ROOT_PATH, 138 | parameter_values={ 139 | 'project_id': config_custom.PROJECT_ID, 140 | 'region': config_custom.REGION, 141 | 'timestamp_id': timestamp_id, 142 | 'bq_source': config_custom.DATA_URI, 143 | 'aip_storage_uri' : config_custom.AIP_STORAGE_URI, 144 | 'predict_image_uri' : config_custom.PREDICT_IMAGE_URI, 145 | 'artifact_staging_location': config_custom.PIPELINE_ROOT_PATH 146 | } 147 | ) 148 | 149 | job.submit(service_account=config_custom.SERVICE_ACCOUNT) 150 | -------------------------------------------------------------------------------- /02_activate/21_challenge/processing-service/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | ENV APP_HOME /app 8 | WORKDIR $APP_HOME 9 | COPY . ./ 10 | COPY ./requirements.txt ./ 11 | 12 | # Install production dependencies. 13 | RUN pip install --upgrade pip 14 | RUN pip install --no-cache-dir -r requirements.txt 15 | 16 | # Run the web service on container startup. Here we use the gunicorn 17 | # webserver, with one worker process and 8 threads. 18 | # For environments with multiple CPU cores, increase the number of workers 19 | # to be equal to the cores available. 20 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 21 | CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app -------------------------------------------------------------------------------- /02_activate/21_challenge/processing-service/README.md: -------------------------------------------------------------------------------- 1 | The processing service defines a Cloud Run Service to process each incoming datapoint. 2 | 3 | 4 | main.py defines the public facing webserver to listen for requests. 5 | 6 | synth_data_stream.py creates a synthetic data stream of events randomly chosen from the datalayer. It also randomly includes anomalies in the data. 
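To send a single test event by hand instead of the full stream, you can mirror what synth_data_stream.py does with Python's requests library; the proxy URL below is a placeholder for your own endpoint:

```
import json
import requests

# Load one of the sample events from the datalayer and post it to the proxy's /json route.
with open('./datalayer/view_item.json') as f:
    payload = json.load(f)

r = requests.post('https://<pubsub-proxy-url>/json', json=payload)
print(r.status_code)
```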
7 | 8 | 9 | Command to start data stream: 10 | python3 synth_data_stream.py --endpoint {Pub/Sub endpoint link}' -------------------------------------------------------------------------------- /02_activate/21_challenge/processing-service/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = '' 16 | location = 'europe-west1' 17 | bq_dataset = 'ecommerce_sink' 18 | bq_table = 'cloud_run' 19 | -------------------------------------------------------------------------------- /02_activate/21_challenge/processing-service/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import time 17 | import base64 18 | import json 19 | import datetime 20 | import config 21 | 22 | from flask import Flask, request 23 | 24 | from google.cloud import bigquery 25 | 26 | app = Flask(__name__) 27 | 28 | 29 | @app.route("/hw", methods=['GET', 'POST']) 30 | def hello_world(): 31 | world = request.args.get('world') 32 | return f"Hello {world}!" 33 | 34 | 35 | @app.route("/", methods=["POST"]) 36 | def index(): 37 | envelope = request.get_json() 38 | print(envelope) 39 | print(type(envelope)) 40 | 41 | if not envelope: 42 | msg = "no Pub/Sub message received" 43 | print(f"error: {msg}") 44 | return f"Bad Request: {msg}", 400 45 | 46 | ps_message = envelope['message'] 47 | 48 | record = base64.b64decode(ps_message["data"]).decode("utf-8").strip() 49 | record = json.loads(record) 50 | 51 | record["weekday"] = datetime.datetime.strptime(record["event_datetime"], "%Y-%m-%d %H:%M:%S").strftime('%A') 52 | 53 | rows_to_insert = [record] 54 | 55 | client = bigquery.Client(project=config.project_id, location=config.location) 56 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table 57 | 58 | errors = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 
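    # insert_rows_json() returns a list of per-row errors; an empty list means the streaming insert succeeded.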
59 | if errors == []: 60 | print(f"{time.time()} New rows have been added.") 61 | return ("", 204) 62 | else: 63 | print("Encountered errors while inserting rows: {}".format(errors)) 64 | return f"Bad Request: {envelope}", 400 65 | 66 | 67 | if __name__ == "__main__": 68 | app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) 69 | 70 | -------------------------------------------------------------------------------- /02_activate/21_challenge/processing-service/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery -------------------------------------------------------------------------------- /02_activate/21_challenge/requirements.txt: -------------------------------------------------------------------------------- 1 | kfp 2 | google-cloud-aiplatform 3 | protobuf==3.20.3 4 | google-cloud-pipeline-components==1.0.39 5 | kfp==1.8.19 6 | google-cloud-aiplatform==1.22.0 7 | scikit-learn 8 | -------------------------------------------------------------------------------- /02_activate/21_challenge/terraform.tfvars: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | project_id = "" 18 | delete_contents_on_destroy = true 19 | -------------------------------------------------------------------------------- /02_activate/21_challenge/variables.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | variable "project_id" { 18 | description = "Project where the dataset and table are created." 19 | } 20 | 21 | variable "delete_contents_on_destroy" { 22 | description = "(Optional) If set to true, delete all the tables in the dataset when destroying the resource; otherwise, destroying the resource will fail if tables are present." 23 | type = bool 24 | default = null 25 | } 26 | 27 | variable "force_destroy" { 28 | description = "When deleting a bucket, this boolean option will delete all contained objects. If false, Terraform will fail to delete buckets which contain objects." 29 | type = bool 30 | default = false 31 | } 32 | 33 | variable "gcp_region" { 34 | description = "GCP region to deploy resources in." 
35 | type = string 36 | default = "europe-west1" 37 | } -------------------------------------------------------------------------------- /02_activate/22_solution/README.md: -------------------------------------------------------------------------------- 1 | # Developing and deploying Machine Learning Models on GCP 2 | 3 | Welcome to the second part of Hack Your Pipe! 4 | 5 | So far you have discovered multiple options to ingest and transform data efficiently. 6 | In this section you go one step further with your data while continuing to build on those learnings. 7 | You will train and deploy Machine Learning models that detect anomalies in the incoming click stream. 8 | 9 | Throughout, we will focus on the automation, simplicity and reliability of every step in the Machine Learning Lifecycle. 10 | 11 | The architecture you are going to implement will look something like this: 12 | 13 | ![Hack Your Pipe architecture](../../rsc/hyp_ml_architecture.png) 14 | 15 | 16 | 17 | ## Prerequisites: Create Synthetic Data 18 | 19 | You will use the click stream data from the [ingest and transform section](https://github.com/NucleusEngineering/hack-your-pipe/tree/main/01_ingest_and_transform) as an example. 20 | 21 | If you haven't worked through the ingest and transform chapter, follow [`01_ingest_and_transform/12_solution/README.md`](https://github.com/NucleusEngineering/hack-your-pipe/blob/main/01_ingest_and_transform/12_solution/README.md). 22 | 23 | Before moving on, make sure that your BigQuery project has a dataset `ecommerce_sink` with the tables `cloud_run`, `dataflow` and `pubsub_direct`. 24 | The tables should be populated with at least 1000 data points each. 25 | 26 | ## Git clone repo 27 | 28 | ``` 29 | git clone https://github.com/NucleusEngineering/hack-your-pipe.git 30 | cd hack-your-pipe 31 | ``` 32 | 33 | ## Set-up Cloud Environment 34 | 35 | ### Initialize your account and project 36 | 37 | If you are using the Google Cloud Shell you can skip this step. 38 | 39 | ``` 40 | gcloud init 41 | ``` 42 | 43 | ### Set Google Cloud Project 44 | Enter your GCP Project ID as `GCP_PROJECT` in `./config_env.sh` and set the environment variables. 45 | ``` 46 | source config_env.sh 47 | ``` 48 | 49 | ``` 50 | gcloud config set project $GCP_PROJECT 51 | ``` 52 | 53 | ### Enable Google Cloud APIs 54 | 55 | ``` 56 | gcloud services enable aiplatform.googleapis.com storage.googleapis.com notebooks.googleapis.com dataflow.googleapis.com artifactregistry.googleapis.com 57 | ``` 58 | 59 | ### Set compute zone 60 | 61 | ``` 62 | gcloud config set compute/zone $GCP_REGION 63 | ``` 64 | 65 | ### Create a service account. 66 | 67 | ``` 68 | gcloud iam service-accounts create retailpipeline-hyp \ 69 | --display-name="retailpipeline-hyp" 70 | ``` 71 | You might already have this service account from running the ingest and transform section. In that case, just add the permissions below. 72 | 73 | ### ... with the necessary permissions.
74 | ``` 75 | gcloud projects add-iam-policy-binding $GCP_PROJECT \ 76 | --member="serviceAccount:retailpipeline-hyp@$GCP_PROJECT.iam.gserviceaccount.com" \ 77 | --role="roles/storage.objectAdmin" 78 | 79 | ``` 80 | 81 | ``` 82 | gcloud projects add-iam-policy-binding $GCP_PROJECT \ 83 | --member="serviceAccount:retailpipeline-hyp@$GCP_PROJECT.iam.gserviceaccount.com" \ 84 | --role="roles/aiplatform.user" 85 | 86 | ``` 87 | 88 | ``` 89 | gcloud projects add-iam-policy-binding $GCP_PROJECT \ 90 | --member="serviceAccount:retailpipeline-hyp@$GCP_PROJECT.iam.gserviceaccount.com" \ 91 | --role="roles/automl.serviceAgent" 92 | 93 | ``` 94 | 95 | 108 | 109 | ### Adjusting all the configs - important! 110 | 111 | Set your GCP project id in the following files in `hack-your-pipe/02_activate/22_solution/` 112 | 113 | * `processing_service/config.py` 114 | * `inf_processing_service_custom/config.py` 115 | * `inf_processing_service/config.py` 116 | * `custom_train/trainer/config.py` 117 | * `custom_train/prediction/config.py` 118 | * `config.py` 119 | 120 | 121 | ## Run ML Pipeline 122 | 123 | ### Set pipeline config options 124 | 125 | Set the config options in [`02_activate/22_solution/config.py`](https://github.com/NucleusEngineering/hack-your-pipe/blob/main/02_activate/22_solution/config.py). 126 | 127 | 128 | ### Run Kubeflow Pipeline in Vertex (BigQueryML model) 129 | 130 | [Vertex Pipelines](https://cloud.google.com/vertex-ai/docs/pipelines/introduction) is an end-to-end and serverless ML orchestration tool. It supports the open source frameworks [Kubeflow](https://www.kubeflow.org/) and [TFX](https://www.tensorflow.org/tfx). 131 | 132 | The full process from model training to deployment can be orchestrated using Vertex Pipelines. 133 | 134 | To kick off the pipeline, simply install the dependencies 135 | ``` 136 | pip install -r ./requirements.txt 137 | ``` 138 | 139 | and then run 140 | 141 | ``` 142 | python3 kf_pipe.py 143 | ``` 144 | 145 | ## Set up processing pipe for real time inference 146 | 147 | Once the model is trained and deployed, you will include a real time inference call in the data pipeline and again stream the results to BigQuery. 148 | 149 | Use Terraform to create a new BigQuery table as a sink for your predictions. 150 | 151 | ``` 152 | terraform init 153 | ``` 154 | 155 | ``` 156 | terraform plan 157 | ``` 158 | 159 | ``` 160 | terraform apply -var-file terraform.tfvars 161 | ``` 162 | 163 | 164 | To include real time inference in your pipeline, you have to update the Cloud Run processing service. 165 | That means you need to build and deploy a new container version to your service. Don't forget to update the `inf_processing_service_custom/config.py`. 166 | 167 | Build the container and deploy it on Cloud Run (note that you are just replacing the container image of the previous inference service with this new inference service). 168 | 169 | ``` 170 | gcloud builds submit $RUN_INFERENCE_PROCESSING_SERVICE_CUSTOM --tag gcr.io/$GCP_PROJECT/inference-processing-service-custom 171 | ``` 172 | 173 | ``` 174 | gcloud run deploy hyp-run-service-data-processing --image=gcr.io/$GCP_PROJECT/inference-processing-service-custom:latest --region=$GCP_REGION --allow-unauthenticated 175 | ``` 176 | 177 | ## Run Kubeflow Pipeline in Vertex (Custom Container) 178 | 179 | Two additional steps are needed to run the pipeline with custom training and prediction. We start by preparing the code to create custom training and prediction containers.
180 | Containers give you a way to write your own data processing and model training with your preferred library and environment. 181 | 182 | Build the containers 183 | 184 | ``` 185 | gcloud builds submit custom_train/trainer/. --tag $TRAIN_IMAGE_URI 186 | ``` 187 | ``` 188 | gcloud builds submit custom_train/prediction/. --tag $PREDICT_IMAGE_URI 189 | ``` 190 | 191 | 192 | And kick off the pipeline the same way as before 193 | ``` 194 | pip install -r ./requirements.txt 195 | ``` 196 | 197 | and then run 198 | 199 | ``` 200 | python3 kf_pipe_custom.py 201 | ``` 202 | -------------------------------------------------------------------------------- /02_activate/22_solution/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import os 15 | 16 | GCP_PROJECT = os.environ['GCP_PROJECT'] 17 | GCP_REGION = os.environ['GCP_REGION'] 18 | PIPELINE_ROOT_PATH = f"gs://{GCP_PROJECT}-ecommerce-events" 19 | SERVICE_ACCOUNT=f"retailpipeline-hyp@{GCP_PROJECT}.iam.gserviceaccount.com" 20 | MACHINE_TYPE = "n1-standard-4" -------------------------------------------------------------------------------- /02_activate/22_solution/config_custom.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
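# All values below are derived from the environment exported by config_env.sh
# (GCP_PROJECT, GCP_REGION, TRAIN_IMAGE_URI, PREDICT_IMAGE_URI).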
14 | 15 | 16 | import os 17 | 18 | PROJECT_ID = os.environ['GCP_PROJECT'] 19 | REGION = os.environ['GCP_REGION'] 20 | PIPELINE_ROOT_PATH=f'gs://{PROJECT_ID}-ai-bucket/pipeline_root_custom/' 21 | 22 | TRAIN_IMAGE_URI=os.environ['TRAIN_IMAGE_URI'] 23 | PREDICT_IMAGE_URI=os.environ['PREDICT_IMAGE_URI'] 24 | AIP_STORAGE_URI=f'gs://{PROJECT_ID}-ai-bucket/vtx-artifacts' 25 | 26 | SERVICE_ACCOUNT=f"retailpipeline-hyp@{PROJECT_ID}.iam.gserviceaccount.com" 27 | MACHINE_TYPE = "n1-standard-4" 28 | DATA_URI=f"{PROJECT_ID}.ecommerce_sink.anomaly_data" -------------------------------------------------------------------------------- /02_activate/22_solution/config_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export GCP_PROJECT="" 4 | export ENDPOINT_URL="" # doesn't need to be defined in the very beginning 5 | export PUSH_ENDPOINT='' # doesn't need to be defined in the very beginning 6 | export GCP_REGION=europe-west1 7 | export RUN_PROXY_DIR=cloud-run-pubsub-proxy 8 | export RUN_PROCESSING_DIR=processing-service 9 | export DATAFLOW_TEMPLATE=beam 10 | export RUN_INFERENCE_PROCESSING_SERVICE=inf_processing_service 11 | export RUN_INFERENCE_PROCESSING_SERVICE_CUSTOM=inf_processing_service_custom 12 | 13 | export TRAIN_IMAGE_URI=gcr.io/$GCP_PROJECT/custom_train:v1 14 | export PREDICT_IMAGE_URI=gcr.io/$GCP_PROJECT/custom_predict:v1 -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/prediction/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tiangolo/uvicorn-gunicorn-fastapi:python3.7 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | COPY / /app 8 | WORKDIR /app 9 | COPY . ./ 10 | 11 | # Install production dependencies. 12 | RUN pip install --upgrade pip 13 | RUN pip install --no-cache-dir -r requirements.txt 14 | 15 | # Run the web service on container startup. Here we use the gunicorn 16 | # webserver, with one worker process and 8 threads. 17 | # For environments with multiple CPU cores, increase the number of workers 18 | # to be equal to the cores available. 19 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 20 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"] 21 | 22 | EXPOSE 8080 -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/prediction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/02_activate/22_solution/custom_train/prediction/__init__.py -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/prediction/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | PROJECT_ID="" 17 | REGION="europe-west1" -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/prediction/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | from fastapi import Request, FastAPI 20 | import json 21 | import os 22 | from joblib import load 23 | import sys 24 | import pandas as pd 25 | from google.cloud import storage 26 | from tempfile import TemporaryFile 27 | import os 28 | import config 29 | 30 | app = FastAPI() 31 | 32 | model_directory = f"{os.environ['AIP_STORAGE_URI']}/model_dir" 33 | storage_path = os.path.join(model_directory, "model.joblib") 34 | 35 | storage_client = storage.Client(project=config.PROJECT_ID) 36 | blob = storage.blob.Blob.from_string(storage_path, client=storage_client) 37 | 38 | blob.download_to_filename("model.joblib") 39 | model =load(open("model.joblib",'rb')) 40 | 41 | @app.get('/') 42 | def get_root(): 43 | return {'message': 'Welcome to custom anomaly detection'} 44 | 45 | @app.get('/health_check') 46 | def health(): 47 | return 200 48 | 49 | if os.environ.get('AIP_PREDICT_ROUTE') is not None: 50 | method = os.environ['AIP_PREDICT_ROUTE'] 51 | else: 52 | method = '/predict' 53 | 54 | @app.post(method) 55 | async def predict(request: Request): 56 | print("----------------- PREDICTING -----------------") 57 | body = await request.json() 58 | # prepare data 59 | instances = pd.DataFrame(body["instances"]) 60 | 61 | # retrieving predictions 62 | outputs = model.predict(instances) 63 | 64 | response = outputs.tolist() 65 | print("----------------- OUTPUTS -----------------") 66 | return {"predictions": response} -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/prediction/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery 6 | google-cloud-aiplatform 7 | google-cloud-storage 8 | scikit-learn 9 | joblib 10 | gcsfs -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/trainer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/deeplearning-platform-release/sklearn-cpu.0-23 2 | WORKDIR / 3 | 4 | # Allow statements and log messages to immediately 
appear in the Knative logs 5 | ENV PYTHONUNBUFFERED True 6 | 7 | # Copies the trainer code to the docker image. 8 | COPY / /trainer 9 | COPY . ./ 10 | 11 | # Install production dependencies. 12 | RUN pip install --upgrade pip 13 | RUN pip install --no-cache-dir -r requirements.txt 14 | 15 | # Sets up the entry point to invoke the trainer. 16 | CMD ["python", "trainer/main.py"] -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/02_activate/22_solution/custom_train/trainer/__init__.py -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/trainer/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | PROJECT_ID="" 17 | REGION="europe-west1" 18 | AIP_STORAGE_URI=f'gs://{PROJECT_ID}-ai-bucket/vtx-artifacts' 19 | # training data: 20 | DATA_URI=f"{PROJECT_ID}.ecommerce_sink.anomaly_data" -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/trainer/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
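# Training entrypoint: preprocess.prep_data() pulls the anomaly_data table from BigQuery
# and splits it, then train.train_model() fits the scikit-learn model and uploads
# model.joblib to the bucket referenced by AIP_STORAGE_URI.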
14 | 15 | from sklearn.tree import DecisionTreeClassifier 16 | from sklearn.metrics import roc_curve 17 | from sklearn.model_selection import train_test_split 18 | from google.cloud import bigquery 19 | from google.cloud import storage 20 | from joblib import dump 21 | 22 | import preprocess 23 | import train 24 | import config 25 | 26 | import os 27 | import pandas as pd 28 | import sys 29 | 30 | # data uri 31 | data_uri = config.DATA_URI 32 | 33 | # bq client 34 | bqclient = bigquery.Client(project=config.PROJECT_ID) 35 | storage_client = storage.Client(project=config.PROJECT_ID) 36 | 37 | ## Download & prep data 38 | print('[INFO] ------ Preparing Data', file=sys.stderr) 39 | train_data, train_labels, test_data, test_labels = preprocess.prep_data(bqclient, storage_client, data_uri) 40 | 41 | ## Train model and save it in Google Cloud Storage 42 | print('[INFO] ------ Training & Saving Model', file=sys.stderr) 43 | train.train_model(train_data, train_labels, test_data, test_labels, storage_client) -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/trainer/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
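# preprocess.py: download_table() reads a BigQuery table (stripping an
# optional "bq://" prefix) into a pandas DataFrame, and prep_data() splits it
# with train_test_split, popping the "anomaly" column off as the label for
# both the train and the test partition.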
14 | 15 | from sklearn.tree import DecisionTreeClassifier 16 | from sklearn.metrics import roc_curve 17 | from sklearn.model_selection import train_test_split 18 | from google.cloud import bigquery 19 | from google.cloud import storage 20 | from joblib import dump 21 | 22 | import os 23 | import pandas as pd 24 | 25 | def download_table(bqclient, storage_client, bq_table_uri: str): 26 | 27 | prefix = "bq://" 28 | if bq_table_uri.startswith(prefix): 29 | bq_table_uri = bq_table_uri[len(prefix):] 30 | 31 | table = bigquery.TableReference.from_string(bq_table_uri) 32 | rows = bqclient.list_rows( 33 | table, 34 | ) 35 | return rows.to_dataframe(create_bqstorage_client=False) 36 | 37 | def prep_data(bqclient, storage_client, data_uri: str): 38 | 39 | # Download data into Pandas DataFrames, split into train / test 40 | df, test_df = train_test_split(download_table(bqclient, storage_client, data_uri)) 41 | labels = df.pop("anomaly").tolist() 42 | data = df.values.tolist() 43 | test_labels = test_df.pop("anomaly").tolist() 44 | test_data = test_df.values.tolist() 45 | 46 | return data, labels, test_data, test_labels -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/trainer/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn 2 | google-cloud-bigquery 3 | joblib 4 | pandas 5 | google-cloud-storage -------------------------------------------------------------------------------- /02_activate/22_solution/custom_train/trainer/train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
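# train.py: train_model() fits a scikit-learn DecisionTreeClassifier on the
# prepared data, prints the accuracy on the held-out split, serializes the
# model with joblib and uploads it to "{AIP_STORAGE_URI}/model_dir/model.joblib"
# in Cloud Storage. AIP_STORAGE_URI is expected to be set in the container
# environment; the custom pipeline's worker pool spec (see kf_pipe_custom.py)
# injects it.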
14 | 15 | from sklearn.tree import DecisionTreeClassifier 16 | from sklearn.metrics import roc_curve 17 | from sklearn.model_selection import train_test_split 18 | from google.cloud import bigquery 19 | from google.cloud import storage 20 | from joblib import dump 21 | 22 | import os 23 | import pandas as pd 24 | 25 | def train_model(data, labels, test_data, test_labels, storage_client): 26 | 27 | # Define and train the Scikit model 28 | skmodel = DecisionTreeClassifier() 29 | skmodel.fit(data, labels) 30 | score = skmodel.score(test_data, test_labels) 31 | print('accuracy is:',score) 32 | 33 | # Storage location 34 | model_directory = f"{os.environ['AIP_STORAGE_URI']}/model_dir" 35 | storage_path = os.path.join(model_directory, "model.joblib") 36 | 37 | # Save the model to a local file 38 | dump(skmodel, 'model.joblib') 39 | 40 | blob = storage.blob.Blob.from_string(storage_path, client=storage_client) 41 | blob.upload_from_filename("model.joblib") 42 | 43 | return(skmodel) -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | ENV APP_HOME /app 8 | WORKDIR $APP_HOME 9 | COPY . ./ 10 | COPY ./requirements.txt ./ 11 | 12 | # Install production dependencies. 13 | RUN pip install --upgrade pip 14 | RUN pip install --no-cache-dir -r requirements.txt 15 | 16 | # Run the web service on container startup. Here we use the gunicorn 17 | # webserver, with one worker process and 8 threads. 18 | # For environments with multiple CPU cores, increase the number of workers 19 | # to be equal to the cores available. 20 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 21 | CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service/README.md: -------------------------------------------------------------------------------- 1 | The processing service defines a Cloud Run Service to process each incoming datapoint. 2 | 3 | 4 | main.py defines the public facing webserver to listen for requests. 5 | 6 | synth_data_stream.py creates a synthetic data stream of events randomly chosen from the datalayer. It also randomly includes anomalies in the data. 7 | 8 | 9 | Command to start data stream: 10 | python3 synth_data_stream.py --endpoint {Pub/Sub endpoint link}' -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = '' 16 | location = 'europe-west1' 17 | bq_dataset = 'ecommerce_sink' 18 | bq_table = 'cloud_run' 19 | bq_table_anomaly = 'cloud_run_anomaly' 20 | endpoind_id = '' 21 | -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import time 17 | import base64 18 | import json 19 | import datetime 20 | import config 21 | 22 | from flask import Flask, request 23 | 24 | from google.cloud import bigquery, aiplatform 25 | 26 | 27 | app = Flask(__name__) 28 | 29 | 30 | @app.route("/hw", methods=['GET', 'POST']) 31 | def hello_world(): 32 | world = request.args.get('world') 33 | return f"Hello {world}!" 34 | 35 | 36 | @app.route("/", methods=["POST"]) 37 | def index(): 38 | envelope = request.get_json() 39 | print(envelope) 40 | print(type(envelope)) 41 | 42 | if not envelope: 43 | msg = "no Pub/Sub message received" 44 | print(f"error: {msg}") 45 | return f"Bad Request: {msg}", 400 46 | 47 | ps_message = envelope['message'] 48 | 49 | record = base64.b64decode(ps_message["data"]).decode("utf-8").strip() 50 | record = json.loads(record) 51 | 52 | record["weekday"] = datetime.datetime.strptime(record["event_datetime"], "%Y-%m-%d %H:%M:%S").strftime('%A') 53 | 54 | rows_to_insert = [record] 55 | 56 | client = bigquery.Client(project=config.project_id, location=config.location) 57 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table 58 | 59 | errors = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 60 | 61 | 62 | # Create record that includes anomaly detection inference. 
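    # For purchase events the tax, shipping and value fields are sent to the
    # Vertex AI endpoint (config.endpoind_id) serving the exported BQML k-means
    # model. The nearest_centroid_id returned in the prediction is mapped to an
    # anomaly flag below (centroid 1 -> True, centroid 2 -> False); which
    # centroid ends up representing the anomalous cluster depends on the
    # individual training run.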
63 | if record["event"] == "purchase": 64 | record_to_predict = [ 65 | {"tax": record["ecommerce"]["purchase"]["tax"], 66 | "shipping": record["ecommerce"]["purchase"]["shipping"], 67 | "value":record["ecommerce"]["purchase"]["value"]} 68 | ] 69 | 70 | aiplatform.init(project=config.project_id, location=config.location) 71 | 72 | endpoint = aiplatform.Endpoint( 73 | endpoint_name=f"projects/{config.project_id}/locations/{config.location}/endpoints/{config.endpoind_id}", 74 | project = config.project_id, 75 | location=config.location, 76 | ) 77 | 78 | endpoint_response = endpoint.predict( 79 | instances=record_to_predict 80 | ) 81 | 82 | centroid = endpoint_response.predictions[0]["nearest_centroid_id"][0] 83 | 84 | if centroid == 1: 85 | anomaly = True 86 | if centroid == 2: 87 | anomaly = False 88 | 89 | print(anomaly) 90 | 91 | anomaly_record = {"tax": record["ecommerce"]["purchase"]["tax"], "shipping": record["ecommerce"]["purchase"]["shipping"], "value":record["ecommerce"]["purchase"]["value"], "anomaly": anomaly} 92 | 93 | rows_to_insert = [anomaly_record] 94 | 95 | client = bigquery.Client(project=config.project_id, location=config.location) 96 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table_anomaly 97 | 98 | errors_an = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 99 | 100 | 101 | if errors_an == []: 102 | print(f"{time.time()} New rows with prediction have been added.") 103 | return ("", 204) 104 | else: 105 | print("Encountered errors while inserting rows: {}".format(errors)) 106 | return f"Bad Request: {envelope}", 400 107 | 108 | if errors == []: 109 | print(f"{time.time()} New rows have been added.") 110 | return ("", 204) 111 | else: 112 | print("Encountered errors while inserting rows: {}".format(errors)) 113 | return f"Bad Request: {envelope}", 400 114 | 115 | 116 | if __name__ == "__main__": 117 | app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) 118 | 119 | -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery 6 | google-cloud-aiplatform 7 | scikit-learn -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service_custom/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | # Allow statements and log messages to immediately appear in the Knative logs 4 | ENV PYTHONUNBUFFERED True 5 | 6 | # Copy local code to the container image. 7 | ENV APP_HOME /app 8 | WORKDIR $APP_HOME 9 | COPY . ./ 10 | COPY ./requirements.txt ./ 11 | 12 | # Install production dependencies. 13 | RUN pip install --upgrade pip 14 | RUN pip install --no-cache-dir -r requirements.txt 15 | 16 | # Run the web service on container startup. Here we use the gunicorn 17 | # webserver, with one worker process and 8 threads. 18 | # For environments with multiple CPU cores, increase the number of workers 19 | # to be equal to the cores available. 20 | # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 
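# Cloud Run injects the PORT environment variable at runtime (8080 by default),
# so the CMD below binds gunicorn to whichever port the platform assigns.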
21 | CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service_custom/README.md: -------------------------------------------------------------------------------- 1 | The processing service defines a Cloud Run Service to process each incoming datapoint. 2 | 3 | 4 | main.py defines the public facing webserver to listen for requests. 5 | 6 | synth_data_stream.py creates a synthetic data stream of events randomly chosen from the datalayer. It also randomly includes anomalies in the data. 7 | 8 | 9 | Command to start data stream: 10 | python3 synth_data_stream.py --endpoint {Pub/Sub endpoint link}' -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service_custom/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | project_id = '' 16 | location = 'europe-west1' 17 | bq_dataset = 'ecommerce_sink' 18 | bq_table = 'cloud_run' 19 | bq_table_anomaly = 'cloud_run_anomaly_custom' 20 | endpoind_id = '' 21 | -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service_custom/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import time 17 | import base64 18 | import json 19 | import datetime 20 | import config 21 | 22 | from flask import Flask, request 23 | 24 | from google.cloud import bigquery, aiplatform 25 | 26 | 27 | app = Flask(__name__) 28 | 29 | 30 | @app.route("/hw", methods=['GET', 'POST']) 31 | def hello_world(): 32 | world = request.args.get('world') 33 | return f"Hello {world}!" 
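# index() below handles Pub/Sub push deliveries: it base64-decodes the message
# payload, adds a "weekday" field, streams the raw event into BigQuery and, for
# purchase events, calls the custom-trained model's Vertex AI endpoint. Unlike
# the BQML variant, the custom predictor returns the anomaly label directly as
# endpoint_response.predictions[0].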
34 | 35 | 36 | @app.route("/", methods=["POST"]) 37 | def index(): 38 | envelope = request.get_json() 39 | print(envelope) 40 | print(type(envelope)) 41 | 42 | if not envelope: 43 | msg = "no Pub/Sub message received" 44 | print(f"error: {msg}") 45 | return f"Bad Request: {msg}", 400 46 | 47 | ps_message = envelope['message'] 48 | 49 | record = base64.b64decode(ps_message["data"]).decode("utf-8").strip() 50 | record = json.loads(record) 51 | 52 | record["weekday"] = datetime.datetime.strptime(record["event_datetime"], "%Y-%m-%d %H:%M:%S").strftime('%A') 53 | 54 | rows_to_insert = [record] 55 | 56 | client = bigquery.Client(project=config.project_id, location=config.location) 57 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table 58 | 59 | errors = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 60 | 61 | 62 | # Create record that includes anomaly detection inference. 63 | if record["event"] == "purchase": 64 | record_to_predict = [ 65 | {"tax": record["ecommerce"]["purchase"]["tax"], 66 | "shipping": record["ecommerce"]["purchase"]["shipping"], 67 | "value":record["ecommerce"]["purchase"]["value"]} 68 | ] 69 | 70 | aiplatform.init(project=config.project_id, location=config.location) 71 | 72 | endpoint = aiplatform.Endpoint( 73 | endpoint_name=f"projects/{config.project_id}/locations/{config.location}/endpoints/{config.endpoind_id}", 74 | project = config.project_id, 75 | location=config.location, 76 | ) 77 | 78 | endpoint_response = endpoint.predict( 79 | instances=record_to_predict 80 | ) 81 | 82 | anomaly = endpoint_response.predictions[0] 83 | 84 | anomaly_record = {"tax": record["ecommerce"]["purchase"]["tax"], "shipping": record["ecommerce"]["purchase"]["shipping"], "value":record["ecommerce"]["purchase"]["value"], "anomaly": anomaly} 85 | 86 | rows_to_insert = [anomaly_record] 87 | 88 | client = bigquery.Client(project=config.project_id, location=config.location) 89 | table_id = config.project_id + '.' + config.bq_dataset + '.' + config.bq_table_anomaly 90 | 91 | errors_an = client.insert_rows_json(table_id, rows_to_insert) # Make an API request. 92 | 93 | 94 | if errors_an == []: 95 | print(f"{time.time()} New rows with prediction have been added.") 96 | return ("", 204) 97 | else: 98 | print("Encountered errors while inserting rows: {}".format(errors)) 99 | return f"Bad Request: {envelope}", 400 100 | 101 | if errors == []: 102 | print(f"{time.time()} New rows have been added.") 103 | return ("", 204) 104 | else: 105 | print("Encountered errors while inserting rows: {}".format(errors)) 106 | return f"Bad Request: {envelope}", 400 107 | 108 | 109 | if __name__ == "__main__": 110 | app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) 111 | 112 | -------------------------------------------------------------------------------- /02_activate/22_solution/inf_processing_service_custom/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | Flask==2.1.0 4 | gunicorn==20.1.0 5 | google-cloud-bigquery 6 | google-cloud-aiplatform 7 | scikit-learn -------------------------------------------------------------------------------- /02_activate/22_solution/kf_pipe.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import time 16 | from typing import Iterable, Dict, NamedTuple 17 | 18 | import config 19 | 20 | # import kfp 21 | from kfp.v2 import compiler, dsl 22 | from kfp.v2.dsl import component 23 | from kfp.v2.components import importer_node 24 | import google.cloud.aiplatform as aip 25 | from google_cloud_pipeline_components import aiplatform as gcc_aip 26 | 27 | from google_cloud_pipeline_components.v1 import endpoint, bigquery 28 | from google_cloud_pipeline_components.types.artifact_types import VertexModel, VertexEndpoint, UnmanagedContainerModel 29 | 30 | 31 | 32 | # TODO: Check for resources & create if needed before pipeline 33 | 34 | def compile_pipe(): 35 | # Define the workflow of the pipeline. 36 | @dsl.pipeline( 37 | name="anomaly-detection-test", 38 | pipeline_root=config.PIPELINE_ROOT_PATH) 39 | 40 | def pipeline(project_id: str, region: str, timestamp_id: str, artifact_staging_location:str): 41 | 42 | aip.init(project=config.GCP_PROJECT, location=config.GCP_REGION) 43 | 44 | bqml_query = f""" 45 | CREATE OR REPLACE MODEL 46 | `{config.GCP_PROJECT}.ecommerce_sink.anomaly_detection` 47 | OPTIONS 48 | ( MODEL_TYPE='KMEANS', 49 | NUM_CLUSTERS=2 ) AS 50 | SELECT 51 | ecommerce.purchase.tax AS tax, 52 | ecommerce.purchase.shipping AS shipping, 53 | ecommerce.purchase.value AS value 54 | FROM `{config.GCP_PROJECT}.ecommerce_sink.cloud_run` 55 | WHERE event='purchase' 56 | ; 57 | """ 58 | 59 | bqml_model = bigquery.BigqueryCreateModelJobOp( 60 | project=project_id, 61 | location=region, 62 | query=bqml_query 63 | ) 64 | 65 | bq_export = bigquery.BigqueryExportModelJobOp( 66 | project=project_id, 67 | location=region, 68 | model=bqml_model.outputs["model"], 69 | model_destination_path=f"{config.PIPELINE_ROOT_PATH}/bq_model-artifacts" 70 | ) 71 | 72 | import_unmanaged_model_task = importer_node.importer( 73 | artifact_uri=f"{config.PIPELINE_ROOT_PATH}/bq_model-artifacts", 74 | artifact_class=UnmanagedContainerModel, 75 | metadata={ 76 | "containerSpec": { 77 | "imageUri": "europe-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-9:latest", 78 | }, 79 | }, 80 | ).after(bq_export) 81 | 82 | model_upload = gcc_aip.ModelUploadOp( 83 | project=project_id, 84 | location=region, 85 | display_name=f"anomaly_detection_{timestamp_id}", 86 | unmanaged_container_model=import_unmanaged_model_task.output, 87 | ) 88 | 89 | # endpoint_uri = "https://europe-west1-aiplatform.googleapis.com/v1/projects/37042627607/locations/europe-west1/endpoints/2381190342041927680" 90 | # endpoint = dsl.importer( 91 | # artifact_uri=endpoint_uri, 92 | # artifact_class=VertexEndpoint, 93 | # metadata={ 94 | # "resourceName": "projects/37042627607/locations/europe-west1/endpoints/2381190342041927680" 95 | # } 96 | # ) 97 | 98 | new_endpoint = endpoint.EndpointCreateOp( 99 | project=config.GCP_PROJECT, 100 | location=config.GCP_REGION, 101 | display_name=f'hyp_inference{int(time.time())}', 102 | # network='terraform-network' 103 | ) 104 | 105 | # Deploy models on endpoint 106 | _ = gcc_aip.ModelDeployOp( 107 | model=model_upload.outputs["model"], 108 | 
endpoint=new_endpoint.outputs["endpoint"], 109 | dedicated_resources_min_replica_count=1, 110 | dedicated_resources_max_replica_count=1, 111 | dedicated_resources_machine_type=config.MACHINE_TYPE, 112 | traffic_split={"0": 100} 113 | ) 114 | 115 | compiler.Compiler().compile(pipeline_func=pipeline, package_path="hyp-anomaly-detection.json") 116 | 117 | 118 | if __name__ == "__main__": 119 | # Initialize aiplatform credentials. 120 | aip.init(project=config.GCP_PROJECT, location=config.GCP_REGION) 121 | 122 | # Compile pipeline code. 123 | compile_pipe() 124 | 125 | # Unique ident for pipeline run 126 | timestamp_id = str(int(time.time())) 127 | 128 | # Prepare the pipeline job. 129 | job = aip.PipelineJob( 130 | display_name=f"{timestamp_id}-hyp-anomaly-detection", 131 | template_path="hyp-anomaly-detection.json", 132 | pipeline_root=config.PIPELINE_ROOT_PATH, 133 | parameter_values={ 134 | 'project_id': config.GCP_PROJECT, 135 | 'region': config.GCP_REGION, 136 | 'timestamp_id': timestamp_id, 137 | 'artifact_staging_location': config.PIPELINE_ROOT_PATH 138 | } 139 | ) 140 | 141 | job.submit(service_account=config.SERVICE_ACCOUNT) 142 | -------------------------------------------------------------------------------- /02_activate/22_solution/kf_pipe_custom.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import time 16 | from typing import Iterable, Dict, NamedTuple 17 | 18 | import config_custom 19 | 20 | # import kfp 21 | from kfp.v2 import compiler, dsl 22 | from kfp.v2.dsl import component 23 | from kfp.v2.components import importer_node 24 | import google.cloud.aiplatform as aip 25 | from google_cloud_pipeline_components import aiplatform as gcc_aip 26 | 27 | from google_cloud_pipeline_components.types import artifact_types 28 | from google_cloud_pipeline_components.v1.custom_job import CustomTrainingJobOp 29 | 30 | ## Training Worker Specs 31 | WORKER_POOL_SPECS = [ 32 | { 33 | "machine_spec": { 34 | "machine_type": "n1-standard-4" 35 | }, 36 | "replica_count": "1", 37 | "container_spec": { 38 | "image_uri": config_custom.TRAIN_IMAGE_URI, 39 | "env": [ 40 | { 41 | "name": "AIP_STORAGE_URI", 42 | "value": config_custom.AIP_STORAGE_URI 43 | }, 44 | ] 45 | } 46 | } 47 | ] 48 | 49 | def compile_pipe(): 50 | # Define the workflow of the pipeline. 
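    # Pipeline stages, in order: CustomTrainingJobOp runs the trainer container
    # described by WORKER_POOL_SPECS; importer_node wraps the resulting model
    # artifacts together with the custom prediction container (predict route
    # /predict, health route /health_check); ModelUploadOp registers the model;
    # EndpointCreateOp and ModelDeployOp then serve it on a single
    # n1-standard-2 replica receiving 100% of the traffic.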
51 | @dsl.pipeline( 52 | name="anomaly-detection-custom-test", 53 | pipeline_root=config_custom.PIPELINE_ROOT_PATH) 54 | 55 | def pipeline( 56 | project_id: str, 57 | region: str, 58 | timestamp_id: str, 59 | artifact_staging_location:str, 60 | bq_source: str, 61 | aip_storage_uri: str, 62 | predict_image_uri: str 63 | ): 64 | 65 | # Model training 66 | train_job = CustomTrainingJobOp( 67 | display_name="pipeline-anomaly-custom-train", 68 | project=project_id, 69 | location=region, 70 | worker_pool_specs=WORKER_POOL_SPECS 71 | ) 72 | 73 | # Model evaluation 74 | # Ideally here you can evaluate the model and decide on deployment/or not for CI/CD purposes 75 | # example: https://www.cloudskillsboost.google/focuses/21234?parent=catalog 76 | 77 | # Import with the custom predict container 78 | import_unmanaged_model_op = importer_node.importer( 79 | artifact_uri=aip_storage_uri, 80 | artifact_class=artifact_types.UnmanagedContainerModel, 81 | metadata={ 82 | "containerSpec": { 83 | "imageUri": predict_image_uri, 84 | "env": [ 85 | { 86 | "name": "PROJECT_ID", 87 | "value": project_id}, 88 | ], 89 | "predictRoute": "/predict", 90 | "healthRoute": "/health_check", 91 | "ports": [ 92 | { 93 | "containerPort": 8080 94 | } 95 | ] 96 | }, 97 | }, 98 | ).after(train_job) 99 | 100 | # Upload the model into the registry 101 | custom_model_upload_job = gcc_aip.ModelUploadOp( 102 | project=project_id, 103 | location=region, 104 | display_name=f"anomaly-detection-custom-model_{timestamp_id}", 105 | unmanaged_container_model=import_unmanaged_model_op.outputs["artifact"], 106 | ).after(import_unmanaged_model_op) 107 | 108 | # Create an endpoint where the model will be deployed 109 | endpoint_create_job = gcc_aip.EndpointCreateOp( 110 | project=project_id, 111 | display_name="anomaly-detection-custom-endpoint", 112 | location=region 113 | ) 114 | 115 | # Deploy the model on the endpoint 116 | _ = gcc_aip.ModelDeployOp( 117 | model=custom_model_upload_job.outputs["model"], 118 | endpoint=endpoint_create_job.outputs["endpoint"], 119 | deployed_model_display_name="anomaly-detection-custom-deploy", 120 | dedicated_resources_min_replica_count=1, 121 | dedicated_resources_max_replica_count=1, 122 | dedicated_resources_machine_type="n1-standard-2", 123 | traffic_split={"0": 100} 124 | ) 125 | 126 | compiler.Compiler().compile(pipeline_func=pipeline, package_path="hyp-custom-anomaly-detection.json") 127 | 128 | if __name__ == "__main__": 129 | # Initialize aiplatform credentials. 130 | aip.init(project=config_custom.PROJECT_ID, location=config_custom.REGION) 131 | 132 | # Compile pipeline code. 133 | compile_pipe() 134 | 135 | # Unique ident for pipeline run 136 | timestamp_id = str(int(time.time())) 137 | 138 | # Prepare the pipeline job. 
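    # The compiled JSON template is submitted as a Vertex AI PipelineJob, with
    # the values from config_custom.py passed in as runtime parameters, and
    # job.submit() runs it under the configured service account. Typical
    # invocation, assuming config_custom.py is filled in and the trainer and
    # prediction images have been pushed: python3 kf_pipe_custom.py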
139 | job = aip.PipelineJob( 140 | display_name=f"{timestamp_id}-hyp-custom-anomaly-detection", 141 | template_path="hyp-custom-anomaly-detection.json", 142 | pipeline_root=config_custom.PIPELINE_ROOT_PATH, 143 | parameter_values={ 144 | 'project_id': config_custom.PROJECT_ID, 145 | 'region': config_custom.REGION, 146 | 'timestamp_id': timestamp_id, 147 | 'bq_source': config_custom.DATA_URI, 148 | 'aip_storage_uri' : config_custom.AIP_STORAGE_URI, 149 | 'predict_image_uri' : config_custom.PREDICT_IMAGE_URI, 150 | 'artifact_staging_location': config_custom.PIPELINE_ROOT_PATH 151 | } 152 | ) 153 | 154 | job.submit(service_account=config_custom.SERVICE_ACCOUNT) 155 | -------------------------------------------------------------------------------- /02_activate/22_solution/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2023 Google 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | terraform { 18 | required_providers { 19 | google = { 20 | source = "hashicorp/google" 21 | version = "4.32.0" 22 | } 23 | } 24 | } 25 | 26 | provider "google" { 27 | project = var.project_id 28 | region = var.gcp_region 29 | } 30 | 31 | data "google_project" "project" { 32 | } 33 | 34 | # Enabling APIs 35 | resource "google_project_service" "aiplatform" { 36 | service = "aiplatform.googleapis.com" 37 | 38 | disable_on_destroy = false 39 | } 40 | 41 | resource "google_project_service" "storage" { 42 | service = "storage.googleapis.com" 43 | 44 | disable_on_destroy = false 45 | } 46 | 47 | resource "google_project_service" "notebooks" { 48 | service = "notebooks.googleapis.com" 49 | 50 | disable_on_destroy = false 51 | } 52 | 53 | resource "google_project_service" "dataflow" { 54 | service = "dataflow.googleapis.com" 55 | 56 | disable_on_destroy = false 57 | } 58 | 59 | resource "google_project_service" "artifactregistry" { 60 | service = "artifactregistry.googleapis.com" 61 | 62 | disable_on_destroy = false 63 | } 64 | 65 | 66 | # Creating BigQuery Table 67 | resource "google_bigquery_table" "bq_table_run_anomaly" { 68 | dataset_id = "ecommerce_sink" 69 | table_id = "cloud_run_anomaly" 70 | deletion_protection = false 71 | 72 | labels = { 73 | env = "default" 74 | } 75 | 76 | schema = < to see your current agreements on file or 13 | to sign a new one. 14 | 15 | You generally only need to submit a CLA once, so if you've already submitted one 16 | (even if it was for a different project), you probably don't need to do it 17 | again. 18 | 19 | ## Code Reviews 20 | 21 | All submissions, including submissions by project members, require review. We 22 | use GitHub pull requests for this purpose. Consult 23 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 24 | information on using pull requests. 25 | 26 | ## Community Guidelines 27 | 28 | This project follows 29 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/). 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Welcome to Hack Your Pipe! 2 | 3 | In this repo you will find a series of demos and workshops on Data and ML Engineering best practices on GCP. 4 | 5 | For your webshop you will develop an end-to-end data pipeline, from collection, through transformation, to activation of the interaction data. 6 | 7 | ![Hack Your Pipe architecture](./rsc/hyp_architecture.png) 8 | 9 | The workshop is split into two parts: first, ingest and transform; second, activate. 10 | 11 | In both folders you will find the challenge lab and a working sample solution. 12 | Every sample solution comes with the instructions and Terraform scripts needed for replication. 13 | 14 | The challenge labs separate the solution development into interactive steps. 15 | To maximize your learning, aim to solve the challenge labs independently before looking at the solutions. 16 | 17 | ### Good luck and have fun!! -------------------------------------------------------------------------------- /rsc/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/.DS_Store -------------------------------------------------------------------------------- /rsc/cloudrun_processing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/cloudrun_processing.png -------------------------------------------------------------------------------- /rsc/dataflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/dataflow.png -------------------------------------------------------------------------------- /rsc/efficient_pipelines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/efficient_pipelines.png -------------------------------------------------------------------------------- /rsc/hyp_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/hyp_architecture.png -------------------------------------------------------------------------------- /rsc/hyp_ml_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/hyp_ml_architecture.png -------------------------------------------------------------------------------- /rsc/ingestion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/ingestion.png -------------------------------------------------------------------------------- /rsc/pubsub_direct.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/pubsub_direct.png -------------------------------------------------------------------------------- /rsc/pubsub_metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NucleusEngineering/hack-your-pipe/dfd667a1545dcc45cca8a2a16ab2c7f70983211f/rsc/pubsub_metrics.png --------------------------------------------------------------------------------