├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── gaming
│   └── propensity-model
│       └── bqml
│           ├── README.md
│           ├── bqml_ga4_gaming_propensity_to_churn.ipynb
│           └── images
│               └── workflow.PNG
└── retail
    ├── clustering
    │   └── bqml
    │       ├── README.md
    │       └── bqml_scaled_clustering.ipynb
    ├── ltv
    │   └── bqml
    │       ├── README.md
    │       ├── notebooks
    │       │   └── bqml_automl_ltv_activate_lookalike.ipynb
    │       └── scripts
    │           ├── 00_procedure_persist.sql
    │           ├── 10_procedure_match.sql
    │           ├── 20_procedure_prepare.sql
    │           ├── 30_procedure_train.sql
    │           ├── 40_procedure_predict.sql
    │           ├── 50_procedure_top.sql
    │           └── run.sh
    ├── propensity-model
    │   └── bqml
    │       ├── README.md
    │       ├── bqml_kfp_retail_propensity_to_purchase.ipynb
    │       └── images
    │           ├── DataExploration.png
    │           ├── DataVisualization.png
    │           ├── KFP-Function_Params.png
    │           ├── KFP-Graph.png
    │           └── MLOPs-Pipeline-Architecture.png
    ├── recommendation-system
    │   ├── bqml-mlops
    │   │   ├── README.md
    │   │   ├── dockerfile
    │   │   ├── kfp_tutorial.ipynb
    │   │   ├── part_2
    │   │   │   ├── Dockerfile
    │   │   │   ├── README.md
    │   │   │   ├── cicd.png
    │   │   │   ├── cloudbuild.yaml
    │   │   │   ├── dockerbuild.sh
    │   │   │   └── pipeline.py
    │   │   ├── part_3
    │   │   │   ├── Dockerfile
    │   │   │   ├── README.md
    │   │   │   ├── dockerbuild.sh
    │   │   │   ├── pipeline.png
    │   │   │   └── vertex_ai_pipeline.ipynb
    │   │   └── pipeline.png
    │   ├── bqml-scann
    │   │   ├── .gitignore
    │   │   ├── 00_prep_bq_and_datastore.ipynb
    │   │   ├── 00_prep_bq_procedures.ipynb
    │   │   ├── 01_train_bqml_mf_pmi.ipynb
    │   │   ├── 02_export_bqml_mf_embeddings.ipynb
    │   │   ├── 03_create_embedding_lookup_model.ipynb
    │   │   ├── 04_build_embeddings_scann.ipynb
    │   │   ├── 05_deploy_lookup_and_scann_caip.ipynb
    │   │   ├── README.md
    │   │   ├── ann01_create_index.ipynb
    │   │   ├── ann02_run_pipeline.ipynb
    │   │   ├── ann_grpc
    │   │   │   ├── match_pb2.py
    │   │   │   └── match_pb2_grpc.py
    │   │   ├── ann_setup.md
    │   │   ├── embeddings_exporter
    │   │   │   ├── __init__.py
    │   │   │   ├── pipeline.py
    │   │   │   ├── runner.py
    │   │   │   └── setup.py
    │   │   ├── embeddings_lookup
    │   │   │   └── lookup_creator.py
    │   │   ├── figures
    │   │   │   ├── ann-flow.png
    │   │   │   ├── ann-tfx.png
    │   │   │   ├── diagram.png
    │   │   │   ├── feedback-matrix-columns.png
    │   │   │   ├── feedback-matrix-diagonals.png
    │   │   │   ├── feedback-matrix-rows.png
    │   │   │   ├── kfp.png
    │   │   │   └── tfx.png
    │   │   ├── index_builder
    │   │   │   ├── builder
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── indexer.py
    │   │   │   │   └── task.py
    │   │   │   ├── config.yaml
    │   │   │   └── setup.py
    │   │   ├── index_server
    │   │   │   ├── Dockerfile
    │   │   │   ├── cloudbuild.yaml
    │   │   │   ├── lookup.py
    │   │   │   ├── main.py
    │   │   │   ├── matching.py
    │   │   │   └── requirements.txt
    │   │   ├── perf_test.ipynb
    │   │   ├── requirements.txt
    │   │   ├── sql_scripts
    │   │   │   ├── sp_ComputePMI.sql
    │   │   │   ├── sp_ExractEmbeddings.sql
    │   │   │   └── sp_TrainItemMatchingModel.sql
    │   │   ├── tfx01_interactive.ipynb
    │   │   ├── tfx02_deploy_run.ipynb
    │   │   └── tfx_pipeline
    │   │       ├── Dockerfile
    │   │       ├── __init__.py
    │   │       ├── bq_components.py
    │   │       ├── config.py
    │   │       ├── item_matcher.py
    │   │       ├── lookup_creator.py
    │   │       ├── pipeline.py
    │   │       ├── runner.py
    │   │       ├── scann_evaluator.py
    │   │       ├── scann_indexer.py
    │   │       └── schema
    │   │           └── schema.pbtxt
    │   └── bqml
    │       ├── README.md
    │       └── bqml_retail_recommendation_system.ipynb
    └── time-series
        └── bqml-demand-forecasting
            ├── README.md
            ├── bqml_retail_demand_forecasting.ipynb
            └── images
                ├── bq_export_datastudio.png
                ├── datastudio_charts.png
                ├── datastudio_chartsettings.png
                ├── datastudio_filter_item.png
                ├── datastudio_fiveoclockvodka.png
                └── datastudio_missingdata.png
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | .DS_Store
3 | .vscode/
4 | **/*.cpython-37.pyc
5 | **/*.sqllite
6 | **/*.tar.gz
7 | retail/recommendation-system/bqml-scann/vocabulary.txt
8 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement. You (or your employer) retain the copyright to your contribution;
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 |
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 |
18 | ## Code reviews
19 |
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Consult
22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
23 | information on using pull requests.
24 |
25 | ## Community Guidelines
26 |
27 | This project follows [Google's Open Source Community
28 | Guidelines](https://opensource.google/conduct/).
29 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](LICENSE)
2 |
3 | ## Analytics Componentized Patterns
4 |
5 | From sample dataset to activation, these componentized patterns are designed to help you get the most out of [BigQuery ML](https://cloud.google.com/bigquery-ml/docs) and other Google Cloud products in production.
6 |
7 | ### Retail use cases
8 | * Recommendation systems:
9 |   * How to build an end-to-end recommendation system with a CI/CD MLOps pipeline on hotel data using BigQuery ML. ([Code][bqml_mlops_code] | [Blogpost][bqml_scann_guide])
10 |   * How to build a recommendation system on e-commerce data using BigQuery ML. ([Code][recomm_code] | [Blogpost][recomm_blog] | [Video][recomm_video])
11 |   * How to build an item-item real-time recommendation system on song playlist data using BigQuery ML. ([Code][bqml_scann_code] | [Reference Guide][bqml_scann_guide])
12 | * Propensity to purchase model:
13 |   * How to build an end-to-end propensity to purchase solution using BigQuery ML and Kubeflow Pipelines. ([Code][propen_code] | [Blogpost][propen_blog])
14 | * Activate on Lifetime Value predictions:
15 |   * How to predict the monetary value of your customers and extract the emails of your top customers to use, for example, in AdWords to create similar audiences. Automation is handled by a combination of BigQuery scripting, stored procedures, and a bash script. ([Code][ltv_code])
16 | * Clustering:
17 |   * How to build customer segmentation through k-means clustering using BigQuery ML. ([Code][clustering_code] | [Blogpost][clustering_blog])
18 | * Demand Forecasting:
19 |   * How to build a time series demand forecasting model using BigQuery ML. ([Code][demandforecasting_code] | [Blogpost][demandforecasting_blog] | [Video][demandforecasting_video])
20 |
21 |
22 | ### Gaming use cases
23 | * Propensity to churn model:
24 |   * Churn prediction for game developers using Google Analytics 4 (GA4) and BigQuery ML. ([Code][gaming_propen_code] | [Blogpost][gaming_propen_blog] | [Video][gaming_propen_video])
25 |
26 | ### Financial use cases
27 | * Fraud detection:
28 |   * How to build a real-time credit card fraud detection solution. ([Code][ccfraud_code] | [Blogpost][ccfraud_techblog] | [Video][ccfraud_video])
29 |
30 |
31 | [gaming_propen_code]: gaming/propensity-model/bqml
32 | [gaming_propen_blog]: https://cloud.google.com/blog/topics/developers-practitioners/churn-prediction-game-developers-using-google-analytics-4-ga4-and-bigquery-ml
33 | [gaming_propen_video]: https://www.youtube.com/watch?v=t5a0gwPM4I8
34 | [recomm_code]: retail/recommendation-system/bqml
35 | [recomm_blog]: https://medium.com/google-cloud/how-to-build-a-recommendation-system-on-e-commerce-data-using-bigquery-ml-df9af2b8c110
36 | [recomm_video]: https://youtube.com/watch?v=sEx8RwvT_-8
37 | [bqml_scann_code]: retail/recommendation-system/bqml-scann
38 | [bqml_mlops_code]: retail/recommendation-system/bqml-mlops
39 | [bqml_scann_guide]: https://cloud.google.com/solutions/real-time-item-matching
40 | [propen_code]: retail/propensity-model/bqml
41 | [propen_blog]: https://medium.com/google-cloud/how-to-build-an-end-to-end-propensity-to-purchase-solution-using-bigquery-ml-and-kubeflow-pipelines-cd4161f734d9
42 | [ltv_code]: retail/ltv/bqml
43 | [clustering_code]: retail/clustering/bqml
44 | [clustering_blog]: https://towardsdatascience.com/how-to-build-audience-clusters-with-website-data-using-bigquery-ml-6b604c6a084c
45 | [demandforecasting_code]: retail/time-series/bqml-demand-forecasting
46 | [demandforecasting_blog]: https://cloud.google.com/blog/topics/developers-practitioners/how-build-demand-forecasting-models-bigquery-ml
47 | [demandforecasting_video]: https://www.youtube.com/watch?v=dwOt68CevYA
48 | [ccfraud_code]: https://gitlab.qdatalabs.com/uk-gtm/patterns/cc_fraud_detection/tree/master
49 | [ccfraud_techblog]: https://cloud.google.com/blog/products/data-analytics/how-to-build-a-fraud-detection-solution
50 | [ccfraud_video]: https://youtu.be/qQnxq3COr9Q
51 |
52 |
53 |
54 |
55 | ## Questions? Feedback?
56 | If you have any questions or feedback, please open up a [new issue](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/issues).
57 |
58 | ## Disclaimer
59 | This is not an officially supported Google product.
60 |
61 | All files in this repository are under the [Apache License, Version 2.0](LICENSE) unless noted otherwise.
62 |
--------------------------------------------------------------------------------
/gaming/propensity-model/bqml/README.md:
--------------------------------------------------------------------------------
1 | ## License
2 | ```
3 | Copyright 2021 Google LLC
4 |
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 |
9 | https://www.apache.org/licenses/LICENSE-2.0
10 |
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS,
13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | See the License for the specific language governing permissions and
15 | limitations under the License.
16 | ```
17 | [](LICENSE)
18 |
19 | # Churn prediction for game developers using Google Analytics 4 (GA4) and BigQuery ML
20 |
21 | This notebook showcases how you can use BigQuery ML to run propensity models on Google Analytics 4 data from your gaming app to determine the likelihood of specific users returning to your app.
22 |
23 | Using this notebook, you'll learn how to:
24 |
25 | - Explore the BigQuery export dataset for Google Analytics 4
26 | - Prepare the training data using demographic and behavioural attributes
27 | - Train propensity models using BigQuery ML
28 | - Evaluate BigQuery ML models
29 | - Make predictions using the BigQuery ML models
30 | - Use the model insights in practical implementations
31 |
32 | ## Architecture Diagram
33 |
34 | ![Workflow](images/workflow.PNG)
35 |
36 | ## More resources
37 |
38 | If you’d like to learn more about any of the topics covered in this notebook, check out these resources:
39 |
40 | - [BigQuery export of Google Analytics data]
41 | - [BigQuery ML quickstart]
42 | - [Events automatically collected by Google Analytics 4]
43 |
44 | ## Questions? Feedback?
45 | If you have any questions or feedback, please open up a [new issue](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/issues).
46 |
47 | [BigQuery export of Google Analytics data]: https://support.google.com/analytics/answer/9358801
48 | [BigQuery ML quickstart]: https://cloud.google.com/bigquery-ml/docs/bigqueryml-web-ui-start
49 | [Events automatically collected by Google Analytics 4]: https://support.google.com/analytics/answer/9234069
50 |
--------------------------------------------------------------------------------
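For orientation, here is a minimal BigQuery ML sketch of the kind of propensity-to-churn model the notebook above builds. The table and column names (`project.dataset.ga4_train_data`, `churned`, `cnt_user_engagement`, and so on) are placeholders rather than the notebook's actual schema; the real feature preparation on the GA4 export lives in `bqml_ga4_gaming_propensity_to_churn.ipynb`.

```sql
-- Hypothetical sketch: train a logistic regression churn model in BigQuery ML.
-- All table and column names below are placeholders.
CREATE OR REPLACE MODEL `project.dataset.churn_model`
OPTIONS(
  MODEL_TYPE = 'LOGISTIC_REG',
  INPUT_LABEL_COLS = ['churned']
) AS
SELECT
  country,
  operating_system,
  cnt_user_engagement,
  churned
FROM
  `project.dataset.ga4_train_data`;

-- Score users: predicted_churned is the predicted class and
-- predicted_churned_probs holds the per-class probabilities.
SELECT
  user_pseudo_id,
  predicted_churned,
  predicted_churned_probs
FROM
  ML.PREDICT(MODEL `project.dataset.churn_model`,
             TABLE `project.dataset.ga4_train_data`);
```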
/gaming/propensity-model/bqml/images/workflow.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/gaming/propensity-model/bqml/images/workflow.PNG
--------------------------------------------------------------------------------
/retail/clustering/bqml/README.md:
--------------------------------------------------------------------------------
1 | A common marketing analytics challenge is to understand consumer behavior and develop customer attributes or archetypes. As organizations get better at tackling this problem, they can activate marketing strategies to incorporate additional customer knowledge into their campaigns. Building customer profiles is now easier than ever with BigQuery ML. In this notebook, you’ll learn how to build customer segments and how to use these audiences for marketing activation.
2 |
3 | The notebook can be found [here](bqml_scaled_clustering.ipynb).
4 |
5 | ## Questions? Feedback?
6 | If you have any questions or feedback, please open up a [new issue](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/issues).
--------------------------------------------------------------------------------
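As a rough sketch of the approach described above (not the notebook's exact query), k-means clustering in BigQuery ML looks like the following; the table and feature names are placeholders, and `bqml_scaled_clustering.ipynb` contains the real feature preparation.

```sql
-- Hypothetical sketch: segment customers with k-means in BigQuery ML.
-- Table and column names are placeholders.
CREATE OR REPLACE MODEL `project.dataset.customer_segments`
OPTIONS(
  MODEL_TYPE = 'KMEANS',
  NUM_CLUSTERS = 5,
  STANDARDIZE_FEATURES = TRUE
) AS
SELECT
  visits,
  total_spend,
  days_since_last_purchase
FROM
  `project.dataset.customer_features`;

-- Assign each customer to a segment (CENTROID_ID) for activation.
SELECT
  customer_id,
  CENTROID_ID AS segment
FROM
  ML.PREDICT(MODEL `project.dataset.customer_segments`,
             TABLE `project.dataset.customer_features`);
```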
/retail/ltv/bqml/README.md:
--------------------------------------------------------------------------------
1 | # Activate on LTV predictions
2 | This guide refactors the [final part][series_final] of an existing series about predicting Lifetime Value (LTV). The series uses TensorFlow and shows multiple approaches, such as statistical models and deep neural networks, to predict the monetary value of customers. The final part leverages [AutoML Tables][automl_tables].
3 |
4 | This document shows an opinionated way to predict the monetary value of your customers for a specific time in the future using historical data.
5 |
6 | This updated version differs in the following:
7 | - Predicts future monetary value for a specific period of time.
8 | - Minimizes development time by using AutoML directly from [BigQuery ML][bq_ml].
9 | - Uses two new datasets for sales and customer data.
10 | - Creates additional training examples by moving the date that separates input and target orders (more details in the notebook).
11 | - Shows how to activate the LTV predictions to create similar audiences in marketing tools.
12 |
13 | The end-to-end flow assumes that you start with a data dump stored in BigQuery and runs through the following steps:
14 |
15 | 1. Match your dataset to the sales dataset template.
16 | 1. Create features from a list of orders.
17 | 1. Train a model using monetary value as a label.
18 | 1. Predict future monetary value of customers.
19 | 1. Extract the emails of the top customers.
20 |
21 | For more general information about LTV, read the [first part][series_first] of the series.
22 |
23 | [series_final]:https://cloud.google.com/solutions/machine-learning/clv-prediction-with-automl-tables
24 | [automl_tables]:https://cloud.google.com/automl-tables
25 | [bq_ml]:https://cloud.google.com/bigquery-ml/docs/bigqueryml-intro
26 | [series_first]:https://cloud.google.com/solutions/machine-learning/clv-prediction-with-offline-training-intro
27 |
28 | ## Files
29 |
30 | There are two main sets of files:
31 |
32 | **[1. ./notebooks](./notebooks)**
33 |
34 | You can use the notebook in this folder to manually run the flow using example datasets. You can also use your own data.
35 |
36 | **[2. ./scripts](./scripts)**
37 |
38 | The scripts in this folder facilitate automation through BigQuery scripting, BigQuery stored procedures and bash scripting. Scripts use statements from the notebook to:
39 | 1. Transform data.
40 | 1. Train and use a model to predict LTV.
41 | 1. Extract emails of the top LTV customers.
42 |
43 | *Note: For production use cases, you can reuse the SQL statements from the scripts folder in pipeline tools such as Kubeflow Pipelines or Cloud Composer.*
44 |
45 | The scripts assume that you already have the sales and CRM datasets stored in BigQuery.
46 |
47 | ## Recommended flow
48 |
49 | 1. Do research in the Notebook.
50 | 1. Extract important SQL.
51 | 1. Write SQL scripts.
52 | 1. Test end-to-end flow through bash scripts.
53 | 1. Integrate into a data pipeline.
54 | 1. Run as part of a CI/CD pipeline.
55 |
56 | This code covers steps 1 to 4.
57 |
58 | ## Run code
59 |
60 | After you have gone through the notebook, you can run through all the steps at once using the [run.sh script][run_script].
61 |
62 | 1. If you use your own sales table, update the [matching query][matching_query] to transform your table into a table with a schema that the script understands.
63 | 1. Make sure that you can run the run.sh script
64 |
65 | ```chmod +x run.sh```
66 |
67 | 1. Check how to set parameters
68 |
69 | ```./run.sh --help```
70 |
71 | 1. Run the script
72 | ```./run.sh --project-id [YOUR_PROJECT_ID] --dataset-id [YOUR_DATASET_ID]```
73 |
74 |
75 | ## Questions? Feedback?
76 | If you have any questions or feedback, please open up a [new issue](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/issues).
77 |
78 |
79 | ## Disclaimer
80 | This is not an officially supported Google product.
81 |
82 | All files in this folder are under the Apache License, Version 2.0 unless noted otherwise.
83 |
84 | [run_script]:./scripts/run.sh
85 | [matching_query]:./scripts/10_procedure_match.sql
--------------------------------------------------------------------------------
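To make the five steps concrete, this is roughly the sequence of procedure calls that the run.sh script executes, condensed into one multi-statement query. Dataset, table, and model names are placeholders, and the parameter values are the run.sh defaults. Running the calls together matters because MatchFields and PrepareForML communicate through session-scoped temporary tables.

```sql
-- Condensed sketch of the flow driven by scripts/run.sh; names are placeholders.
CALL MatchFields('mydataset.sales');                              -- 1. Match data to the template schema.
CALL PrepareForML(500, 100, 0, 30, 90, 30,
                  'mydataset.aggred', 'mydataset.ml');            -- 2. Create features and targets.
CALL TrainLTV('mydataset.model_ltv', 'mydataset.ml');             -- 3. Train on monetary value as the label.
CALL PredictLTV('mydataset.model_ltv', 'mydataset.aggred',
                'NULL', 0, 'mydataset.predictions');              -- 4. Predict future monetary value.
CALL ExtractTopEmails(0.2, 'mydataset.predictions',
                      'mydataset.crm', 'mydataset.top_emails');   -- 5. Extract emails of the top customers.
```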
/retail/ltv/bqml/scripts/00_procedure_persist.sql:
--------------------------------------------------------------------------------
1 | -- Copyright 2020 Google LLC
2 | --
3 | -- Licensed under the Apache License, Version 2.0 (the "License");
4 | -- you may not use this file except in compliance with the License.
5 | -- You may obtain a copy of the License at
6 | --
7 | -- https://www.apache.org/licenses/LICENSE-2.0
8 | --
9 | -- Unless required by applicable law or agreed to in writing, software
10 | -- distributed under the License is distributed on an "AS IS" BASIS,
11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | -- See the License for the specific language governing permissions and
13 | -- limitations under the License.
14 | --
15 | -- Persists the data from a temporary table to a materialized table.
16 | CREATE OR REPLACE PROCEDURE PersistData(
17 | TEMP_TABLE STRING,
18 | DEST_TABLE STRING)
19 |
20 | BEGIN
21 | EXECUTE IMMEDIATE """
22 | CREATE OR REPLACE TABLE """|| DEST_TABLE || """ AS (
23 | SELECT * FROM """ || TEMP_TABLE || """)""";
24 | END
--------------------------------------------------------------------------------
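PersistData is deployed by run.sh but never called directly (the calls at the end of 20_procedure_prepare.sql are commented out). A hypothetical usage, with placeholder names, would be:

```sql
-- Hypothetical usage: materialize the session's temporary table `Featured`
-- into a permanent table. Must run in the same script/session that created
-- the temporary table. Names are placeholders.
CALL PersistData('Featured', 'mydataset.ltv_training_data');
```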
/retail/ltv/bqml/scripts/10_procedure_match.sql:
--------------------------------------------------------------------------------
1 | -- Copyright 2020 Google LLC
2 | --
3 | -- Licensed under the Apache License, Version 2.0 (the "License");
4 | -- you may not use this file except in compliance with the License.
5 | -- You may obtain a copy of the License at
6 | --
7 | -- https://www.apache.org/licenses/LICENSE-2.0
8 | --
9 | -- Unless required by applicable law or agreed to in writing, software
10 | -- distributed under the License is distributed on an "AS IS" BASIS,
11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | -- See the License for the specific language governing permissions and
13 | -- limitations under the License.
14 | --
15 | -- Transforms raw data to match fields of the template
16 |
17 | CREATE OR REPLACE PROCEDURE MatchFields(
18 | SOURCE_TABLE STRING)
19 |
20 | BEGIN
21 |
22 | -- || because dynamic table names are not supported.
23 | EXECUTE IMMEDIATE """
24 | CREATE OR REPLACE TEMP TABLE Orders AS
25 | SELECT
26 | CAST(customer_id AS STRING) AS customer_id,
27 | order_id AS order_id,
28 | transaction_date AS transaction_date,
29 | product_sku AS product_sku,
30 | qty AS qty,
31 | unit_price AS unit_price
32 | FROM """ ||
33 | SOURCE_TABLE;
34 |
35 | END
--------------------------------------------------------------------------------
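A hypothetical call, with a placeholder table name. The source table must expose the six columns the procedure selects (customer_id, order_id, transaction_date, product_sku, qty, unit_price); otherwise adapt the SELECT above to your schema, as the README suggests.

```sql
-- Hypothetical usage: map a raw sales table onto the template schema.
-- The resulting TEMP TABLE `Orders` lives only for the current session/script,
-- so call PrepareForML in the same multi-statement query (as run.sh does).
CALL MatchFields('mydataset.sales');
```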
/retail/ltv/bqml/scripts/20_procedure_prepare.sql:
--------------------------------------------------------------------------------
1 | -- Copyright 2020 Google LLC
2 | --
3 | -- Licensed under the Apache License, Version 2.0 (the "License");
4 | -- you may not use this file except in compliance with the License.
5 | -- You may obtain a copy of the License at
6 | --
7 | -- https://www.apache.org/licenses/LICENSE-2.0
8 | --
9 | -- Unless required by applicable law or agreed to in writing, software
10 | -- distributed under the License is distributed on an "AS IS" BASIS,
11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | -- See the License for the specific language governing permissions and
13 | -- limitations under the License.
14 |
15 | CREATE OR REPLACE PROCEDURE PrepareForML(
16 | MAX_STDV_MONETARY INT64,
17 | MAX_STDV_QTY INT64,
18 | WINDOW_LENGTH INT64, -- How many days back for inputs transactions.
19 | WINDOW_STEP INT64, -- How many days between thresholds.
20 | WINDOW_STEP_INITIAL INT64, -- How many days for the first window.
21 | LENGTH_FUTURE INT64, -- How many days to predict for.
22 | TABLE_FOR_PREDICTING STRING,
23 | TABLE_FOR_TRAINING STRING)
24 |
25 | BEGIN
26 |
27 | DECLARE MIN_DATE DATE; -- Date of the first order in the dataset.
28 | DECLARE MAX_DATE DATE; -- Date of the final order in the dataset.
29 | DECLARE THRESHOLD_DATE DATE; -- Date that separates inputs orders from target orders.
30 | DECLARE WINDOW_START DATE; -- Date at which an input transactions window starts.
31 | DECLARE STEP INT64 DEFAULT 1; -- Index of the window being run.
32 |
33 | -- Aggregates per date per customers.
34 | CREATE OR REPLACE TEMP TABLE Aggred AS
35 | SELECT
36 | customer_id,
37 | order_day,
38 | ROUND(day_value_after_returns, 2) AS value,
39 | day_qty_after_returns as qty_articles,
40 | day_num_returns AS num_returns,
41 | CEIL(avg_time_to_return) AS time_to_return
42 | FROM (
43 | SELECT
44 | customer_id,
45 | order_day,
46 | SUM(order_value_after_returns) AS day_value_after_returns,
47 | STDDEV(SUM(order_value_after_returns)) OVER(PARTITION BY customer_id ORDER BY SUM(order_value_after_returns)) AS stdv_value,
48 | SUM(order_qty_after_returns) AS day_qty_after_returns,
49 | STDDEV(SUM(order_qty_after_returns)) OVER(PARTITION BY customer_id ORDER BY SUM(order_qty_after_returns)) AS stdv_qty,
50 | CASE
51 | WHEN MIN(order_min_qty) < 0 THEN count(1)
52 | ELSE 0
53 | END AS day_num_returns,
54 | CASE
55 | WHEN MIN(order_min_qty) < 0 THEN AVG(time_to_return)
56 | ELSE NULL
57 | END AS avg_time_to_return
58 | FROM (
59 | SELECT
60 | customer_id,
61 | order_id,
62 | -- Gives the order date vs return(s) dates.
63 | MIN(transaction_date) AS order_day,
64 | MAX(transaction_date) AS return_final_day,
65 | DATE_DIFF(MAX(transaction_date), MIN(transaction_date), DAY) AS time_to_return,
66 | -- Aggregates all products in the order
67 | -- and all products returned later.
68 | SUM(qty * unit_price) AS order_value_after_returns,
69 | SUM(qty) AS order_qty_after_returns,
70 | -- If negative, order has qty return(s).
71 | MIN(qty) order_min_qty
72 | FROM
73 | Orders
74 | GROUP BY
75 | customer_id,
76 | order_id)
77 | GROUP BY
78 | customer_id,
79 | order_day)
80 | WHERE
81 | -- [Optional] Remove dates with outliers per customer.
82 | (stdv_value < MAX_STDV_MONETARY
83 | OR stdv_value IS NULL) AND
84 | (stdv_qty < MAX_STDV_QTY
85 | OR stdv_qty IS NULL);
86 |
87 | -- Creates the inputs and targets across multiple threshold dates.
88 | SET (MIN_DATE, MAX_DATE) = (
89 | SELECT AS STRUCT
90 | MIN(order_day) AS min_days,
91 | MAX(order_day) AS max_days
92 | FROM
93 | Aggred
94 | );
95 |
96 | SET THRESHOLD_DATE = MIN_DATE;
97 |
98 | CREATE OR REPLACE TEMP TABLE Featured
99 | (
100 | -- dataset STRING,
101 | customer_id STRING,
102 | monetary FLOAT64,
103 | frequency INT64,
104 | recency INT64,
105 | T INT64,
106 | time_between FLOAT64,
107 | avg_basket_value FLOAT64,
108 | avg_basket_size FLOAT64,
109 | has_returns STRING,
110 | avg_time_to_return FLOAT64,
111 | num_returns INT64,
112 | -- threshold DATE,
113 | -- step INT64,
114 | target_monetary FLOAT64,
115 | );
116 |
117 | LOOP
118 | -- Can choose a longer original window in case
119 | -- there were not many orders in the early days.
120 | IF STEP = 1 THEN
121 | SET THRESHOLD_DATE = DATE_ADD(THRESHOLD_DATE, INTERVAL WINDOW_STEP_INITIAL DAY);
122 | ELSE
123 | SET THRESHOLD_DATE = DATE_ADD(THRESHOLD_DATE, INTERVAL WINDOW_STEP DAY);
124 | END IF;
125 | SET STEP = STEP + 1;
126 |
127 | IF THRESHOLD_DATE >= DATE_SUB(MAX_DATE, INTERVAL (WINDOW_STEP) DAY) THEN
128 | LEAVE;
129 | END IF;
130 |
131 | -- Takes all transactions before the threshold date unless you decide
132 | -- to use a different window length to test model performance.
133 | IF WINDOW_LENGTH != 0 THEN
134 | SET WINDOW_START = DATE_SUB(THRESHOLD_DATE, INTERVAL WINDOW_LENGTH DAY);
135 | ELSE
136 | SET WINDOW_START = MIN_DATE;
137 | END IF;
138 |
139 | INSERT Featured
140 | SELECT
141 | -- CASE
142 | -- WHEN THRESHOLD_DATE <= DATE_SUB(MAX_DATE, INTERVAL LENGTH_FUTURE DAY) THEN 'UNASSIGNED'
143 | -- ELSE 'TEST'
144 | -- END AS dataset,
145 | tf.customer_id,
146 | ROUND(tf.monetary_orders, 2) AS monetary,
147 | tf.cnt_orders AS frequency,
148 | tf.recency,
149 | tf.T,
150 | ROUND(tf.recency/cnt_orders, 2) AS time_between,
151 | ROUND(tf.avg_basket_value, 2) AS avg_basket_value,
152 | ROUND(tf.avg_basket_size, 2) AS avg_basket_size,
153 | has_returns,
154 | CEIL(avg_time_to_return) AS avg_time_to_return,
155 | num_returns,
156 | -- THRESHOLD_DATE AS threshold,
157 | -- STEP - 1 AS step,
158 | ROUND(tt.target_monetary, 2) AS target_monetary,
159 | FROM (
160 | -- This SELECT uses only data before THRESHOLD_DATE to make features.
161 | SELECT
162 | customer_id,
163 | SUM(value) AS monetary_orders,
164 | DATE_DIFF(MAX(order_day), MIN(order_day), DAY) AS recency,
165 | DATE_DIFF(THRESHOLD_DATE, MIN(order_day), DAY) AS T,
166 | COUNT(DISTINCT order_day) AS cnt_orders,
167 | AVG(qty_articles) avg_basket_size,
168 | AVG(value) avg_basket_value,
169 | CASE
170 | WHEN SUM(num_returns) > 0 THEN 'y'
171 | ELSE 'n'
172 | END AS has_returns,
173 | AVG(time_to_return) avg_time_to_return,
174 | THRESHOLD_DATE AS threshold,
175 | SUM(num_returns) num_returns,
176 | FROM
177 | Aggred
178 | WHERE
179 | order_day <= THRESHOLD_DATE AND
180 | order_day >= WINDOW_START
181 | GROUP BY
182 | customer_id
183 | ) tf
184 | INNER JOIN (
185 | -- This SELECT uses all orders that happened between threshold and
186 | -- threshold + LENGTH_FUTURE to calculte the target monetary.
187 | SELECT
188 | customer_id,
189 | SUM(value) target_monetary
190 | FROM
191 | Aggred
192 | WHERE
193 | order_day <= DATE_ADD(THRESHOLD_DATE, INTERVAL LENGTH_FUTURE DAY)
194 | -- Overall value is similar to predicting only what's after the threshold,
195 | -- and the prediction performs better. We can subtract later.
196 | -- AND order_day > THRESHOLD_DATE
197 | GROUP BY
198 | customer_id) tt
199 | ON
200 | tf.customer_id = tt.customer_id;
201 |
202 | END LOOP;
203 |
204 | -- Persists the temporary ml table. Could do it directly from the above query
205 | -- but this tutorial tries to limit String SQL statements as much as possible
206 | -- and CREATE OR REPLACE TABLE without specifying a dataset is not supported.
207 | -- The TrainLTV needs the table to be persisted.
208 | EXECUTE IMMEDIATE """
209 | CREATE OR REPLACE TABLE """|| TABLE_FOR_PREDICTING || """ AS (
210 | SELECT * FROM Aggred )""";
211 |
212 | EXECUTE IMMEDIATE """
213 | CREATE OR REPLACE TABLE """|| TABLE_FOR_TRAINING || """ AS (
214 | SELECT * FROM Featured )""";
215 |
216 | -- CALL PersistData(TABLE_FOR_PREDICTING, "Aggred");
217 | -- CALL PersistData(TABLE_FOR_TRAINING, "Featured");
218 |
219 | END
--------------------------------------------------------------------------------
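A hypothetical call using the default parameter values from scripts/run.sh; the dataset and table names are placeholders. It must run after MatchFields in the same script, since it reads the temporary `Orders` table.

```sql
-- Hypothetical usage with the run.sh defaults.
CALL PrepareForML(
  500,                 -- MAX_STDV_MONETARY: drop a customer's days whose value deviation exceeds this.
  100,                 -- MAX_STDV_QTY: same idea for quantities.
  0,                   -- WINDOW_LENGTH: 0 = use all transactions back to the first order.
  30,                  -- WINDOW_STEP: days between two threshold dates.
  90,                  -- WINDOW_STEP_INITIAL: days before the first threshold date.
  30,                  -- LENGTH_FUTURE: days ahead to predict monetary value for.
  'mydataset.aggred',  -- TABLE_FOR_PREDICTING: persisted per-day aggregates.
  'mydataset.ml');     -- TABLE_FOR_TRAINING: persisted features and targets.
```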
/retail/ltv/bqml/scripts/30_procedure_train.sql:
--------------------------------------------------------------------------------
1 | -- Copyright 2020 Google LLC
2 | --
3 | -- Licensed under the Apache License, Version 2.0 (the "License");
4 | -- you may not use this file except in compliance with the License.
5 | -- You may obtain a copy of the License at
6 | --
7 | -- https://www.apache.org/licenses/LICENSE-2.0
8 | --
9 | -- Unless required by applicable law or agreed to in writing, software
10 | -- distributed under the License is distributed on an "AS IS" BASIS,
11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | -- See the License for the specific language governing permissions and
13 | -- limitations under the License.
14 |
15 | CREATE OR REPLACE PROCEDURE TrainLTV(
16 | MODEL_NAME STRING,
17 | DATA_TABLE STRING)
18 |
19 | BEGIN
20 |
21 | #[START TRAIN_MODEL]
22 | EXECUTE IMMEDIATE """
23 | CREATE OR REPLACE MODEL """ || MODEL_NAME || """
24 | OPTIONS(MODEL_TYPE='AUTOML_REGRESSOR',
25 | INPUT_LABEL_COLS=['target_monetary'],
26 | OPTIMIZATION_OBJECTIVE='MINIMIZE_MAE')
27 | AS
28 | SELECT
29 | * EXCEPT(customer_id)
30 | FROM """ || DATA_TABLE;
31 | #[END TRAIN_MODEL]
32 |
33 | END
--------------------------------------------------------------------------------
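TrainLTV only creates the model. As a hypothetical follow-up (not part of these scripts; names are placeholders), you can inspect regression metrics with ML.EVALUATE before running predictions:

```sql
-- Hypothetical check after: CALL TrainLTV('mydataset.model_ltv', 'mydataset.ml');
-- Returns metrics such as mean_absolute_error for the trained AutoML regressor.
SELECT *
FROM ML.EVALUATE(
  MODEL `mydataset.model_ltv`,
  (SELECT * EXCEPT(customer_id) FROM `mydataset.ml`));
```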
/retail/ltv/bqml/scripts/40_procedure_predict.sql:
--------------------------------------------------------------------------------
1 | -- Copyright 2020 Google LLC
2 | --
3 | -- Licensed under the Apache License, Version 2.0 (the "License");
4 | -- you may not use this file except in compliance with the License.
5 | -- You may obtain a copy of the License at
6 | --
7 | -- https://www.apache.org/licenses/LICENSE-2.0
8 | --
9 | -- Unless required by applicable law or agreed to in writing, software
10 | -- distributed under the License is distributed on an "AS IS" BASIS,
11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | -- See the License for the specific language governing permissions and
13 | -- limitations under the License.
14 |
15 | CREATE OR REPLACE PROCEDURE PredictLTV(
16 | MODEL_NAME STRING,
17 | TABLE_DATA STRING,
18 | PREDICT_FROM_DATE STRING,
19 | WINDOW_LENGTH INT64,
20 | TABLE_PREDICTION STRING)
21 |
22 | BEGIN
23 |
24 | -- Date at which an input transactions window starts.
25 | DECLARE WINDOW_START STRING;
26 |
27 | IF WINDOW_LENGTH != 0 THEN
28 | SET WINDOW_START = (SELECT CAST(DATE_SUB(PREDICT_FROM_DATE, INTERVAL WINDOW_LENGTH DAY) AS STRING));
29 | ELSE
30 | SET WINDOW_START = "1900-01-01";
31 | END IF;
32 |
33 | IF PREDICT_FROM_DATE = 'NULL' THEN
34 | SET PREDICT_FROM_DATE = (SELECT CAST(CURRENT_DATE() AS STRING));
35 | END IF;
36 |
37 | EXECUTE IMMEDIATE """
38 | CREATE OR REPLACE TABLE """ || TABLE_PREDICTION || """ AS
39 | SELECT
40 | customer_id,
41 | monetary AS monetary_so_far,
42 | ROUND(predicted_target_monetary, 2) AS monetary_predicted,
43 | ROUND(predicted_target_monetary - monetary, 2) AS monetary_future
44 | FROM
45 | ML.PREDICT(
46 | MODEL """ || MODEL_NAME || """,
47 | (
48 | SELECT
49 | customer_id,
50 | ROUND(monetary_orders, 2) AS monetary,
51 | cnt_orders AS frequency,
52 | recency,
53 | T,
54 | ROUND(recency/cnt_orders, 2) AS time_between,
55 | ROUND(avg_basket_value, 2) AS avg_basket_value,
56 | ROUND(avg_basket_size, 2) AS avg_basket_size,
57 | has_returns,
58 | CEIL(avg_time_to_return) AS avg_time_to_return,
59 | num_returns
60 | FROM (
61 | SELECT
62 | customer_id,
63 | SUM(value) AS monetary_orders,
64 | DATE_DIFF(MAX(order_day), MIN(order_day), DAY) AS recency,
65 | DATE_DIFF(PARSE_DATE('%Y-%m-%d', '""" || PREDICT_FROM_DATE || """'), MIN(order_day), DAY) AS T,
66 | COUNT(DISTINCT order_day) AS cnt_orders,
67 | AVG(qty_articles) avg_basket_size,
68 | AVG(value) avg_basket_value,
69 | CASE
70 | WHEN SUM(num_returns) > 0 THEN 'y'
71 | ELSE 'n'
72 | END AS has_returns,
73 | AVG(time_to_return) avg_time_to_return,
74 | SUM(num_returns) num_returns,
75 | FROM
76 | """ || TABLE_DATA || """
77 | WHERE
78 | order_day <= PARSE_DATE('%Y-%m-%d', '""" || PREDICT_FROM_DATE || """') AND
79 | order_day >= PARSE_DATE('%Y-%m-%d', '""" || WINDOW_START || """')
80 | GROUP BY
81 | customer_id
82 | )
83 | )
84 | )""";
85 |
86 | END
--------------------------------------------------------------------------------
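run.sh passes 'NULL' as PREDICT_FROM_DATE so the procedure predicts from the current date; you can also pin the date. A hypothetical call with placeholder names:

```sql
-- Hypothetical usage: predict LTV as of a fixed date, using all available history.
CALL PredictLTV(
  'mydataset.model_ltv',     -- model created by TrainLTV
  'mydataset.aggred',        -- per-day aggregates created by PrepareForML
  '2021-01-31',              -- PREDICT_FROM_DATE (YYYY-MM-DD), or 'NULL' for today
  0,                         -- WINDOW_LENGTH in days; 0 = no lower bound on history
  'mydataset.predictions');  -- output table
```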
/retail/ltv/bqml/scripts/50_procedure_top.sql:
--------------------------------------------------------------------------------
1 | -- Copyright 2020 Google LLC
2 | --
3 | -- Licensed under the Apache License, Version 2.0 (the "License");
4 | -- you may not use this file except in compliance with the License.
5 | -- You may obtain a copy of the License at
6 | --
7 | -- https://www.apache.org/licenses/LICENSE-2.0
8 | --
9 | -- Unless required by applicable law or agreed to in writing, software
10 | -- distributed under the License is distributed on an "AS IS" BASIS,
11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | -- See the License for the specific language governing permissions and
13 | -- limitations under the License.
14 | --
15 | -- Extracts top LTV customers.
16 | CREATE OR REPLACE PROCEDURE ExtractTopEmails(
17 | TOP_LTV_RATIO FLOAT64,
18 | TABLE_SOURCE STRING,
19 | TABLE_CRM STRING,
20 | TABLE_OUTPUT STRING)
21 |
22 | BEGIN
23 |
24 | EXECUTE IMMEDIATE """
25 | CREATE OR REPLACE TABLE """|| TABLE_OUTPUT || """ AS (
26 | SELECT
27 | p.customer_id,
28 | monetary_future,
29 | c.email AS email
30 | FROM (
31 | SELECT
32 | customer_id,
33 | monetary_future,
34 | PERCENT_RANK() OVER (ORDER BY monetary_future DESC) AS percent_rank_monetary
35 | FROM
36 | """ || TABLE_SOURCE || """ ) p
37 | -- This creates fake emails. You need to join with your own CRM table.
38 | INNER JOIN (
39 | SELECT
40 | customer_id,
41 | email
42 | FROM
43 | """ || TABLE_CRM || """ ) c
44 | ON
45 | p.customer_id = CAST(c.customer_id AS STRING)
46 | WHERE
47 | -- Decides the size of your list of emails. For similar-audience use cases
48 | -- where you need to find a minimum of matching emails, 20% should provide
49 | -- enough potential emails.
50 | percent_rank_monetary <= """ || TOP_LTV_RATIO || """
51 | ORDER BY
52 | monetary_future DESC
53 | )""";
54 |
55 | END
--------------------------------------------------------------------------------
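A hypothetical call followed by a query of the output table; names are placeholders and 0.2 is the default TOP_LTV_RATIO in run.sh:

```sql
-- Hypothetical usage: keep the top 20% of customers by predicted future value.
CALL ExtractTopEmails(
  0.2, 'mydataset.predictions', 'mydataset.crm', 'mydataset.top_emails');

-- Inspect the resulting list before exporting it to your marketing tool.
SELECT customer_id, email, monetary_future
FROM `mydataset.top_emails`
ORDER BY monetary_future DESC;
```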
/retail/ltv/bqml/scripts/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2020 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # https://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # Reads command line arguments.
17 | while [[ $# -gt 0 ]]
18 | do
19 | case ${1} in
20 | -p|--project-id)
21 | shift
22 | PROJECT_ID="${1}"
23 | ;;
24 | -d|--dataset-id)
25 | shift
26 | DATASET_ID="${1}"
27 | ;;
28 | -o|--table-sales)
29 | shift
30 | TABLE_SALES="${1}"
31 | ;;
32 | -c|--table-crm)
33 | shift
34 | TABLE_CRM="${1}"
35 | ;;
36 | -t|--train-model-id)
37 | shift
38 | TRAIN_MODEL_ID="${1}"
39 | ;;
40 | -u|--use-model-id)
41 | shift
42 | USE_MODEL_ID="${1}"
43 | ;;
44 | -w|--window-length)
45 | shift
46 | WINDOW_LENGTH="${1}"
47 | ;;
48 | -s|--window-step)
49 | shift
50 | WINDOW_STEP="${1}"
51 | ;;
52 | -i|--window-step-initial)
53 | shift
54 | WINDOW_STEP_INITIAL="${1}"
55 | ;;
56 | -l|--length-future)
57 | shift
58 | LENGTH_FUTURE="${1}"
59 | ;;
60 | -m|--max-stdv-monetary)
61 | shift
62 | MAX_STDV_MONETARY="${1}"
63 | ;;
64 | -q|--max-stdv-qty)
65 | shift
66 | MAX_STDV_QTY="${1}"
67 | ;;
68 | -r|--top-ltv-ratio)
69 | shift
70 | TOP_LTV_RATIO="${1}"
71 | ;;
72 | -h|--help)
73 | echo "FLAGS"
74 | echo -e " --project-id=PROJECT_ID, -p"
75 | echo -e " [Required] Project ID."
76 | echo -e " --dataset-id=DATASET_ID, -d"
77 | echo -e " [Required] Dataset ID."
78 | echo -e " --train-model-id=TRAIN_MODEL_ID, -t"
79 | echo -e " Name of the trained model. Set to *null* if you do not want to train a model."
80 | echo -e " --use-model-id=USE_MODEL_ID, -u"
81 | echo -e " Name of the model to use for predictions. Must include dataset: [DATASET].[MODEL]"
82 | echo -e " --window-length=WINDOW_LENGTH, -w"
83 | echo -e " Time range in days for input transactions."
84 | echo -e " --window-step=WINDOW_STEP, -s"
85 | echo -e " Time in days between two windows. Equivalent to the time between two threshold dates."
86 | echo -e " --window-step-initial=WINDOW_STEP_INITIAL, -i"
87 | echo -e " Initial time in days before setting the first threshold date."
88 | echo -e " --length-future=LENGTH_FUTURE, -l"
89 | echo -e " Time in days for which to make a prediction."
90 | echo -e " --max-stdv-monetary=MAX_STDV_MONETARY, -m"
91 | echo -e " Standard deviation of the monetary value per customer above which the script removes transactions."
92 | echo -e " --max-stdv-qty=MAX_STDV_QTY, -q"
93 | echo -e " Standard deviation of the quantity value per customer above which the script removes transactions."
94 | echo -e " --top-ltv-ratio=TOP_LTV_RATIO, -r"
95 | echo -e " Percentage of the top customers to extract."
96 | exit
97 | ;;
98 | *)
99 | echo "Flag not supported. See ./run.sh --help"
100 | exit 1
101 | shift
102 | ;;
103 | esac
104 | shift
105 | done
106 |
107 | # Sets default values if missing from the command line.
108 | NOW=$(date +"%Y%m%d%H%M%S")
109 | TRAIN_MODEL_ID="${TRAIN_MODEL_ID:-$DATASET_ID.model_$NOW}"
110 | USE_MODEL_ID="${USE_MODEL_ID:-$DATASET_ID.model_$NOW}"
111 | WINDOW_LENGTH=${WINDOW_LENGTH:-0}
112 | WINDOW_STEP=${WINDOW_STEP:-30}
113 | WINDOW_STEP_INITIAL=${WINDOW_STEP_INITIAL:-90}
114 | LENGTH_FUTURE=${LENGTH_FUTURE:-30}
115 | MAX_STDV_MONETARY=${MAX_STDV_MONETARY:-500}
116 | MAX_STDV_QTY=${MAX_STDV_QTY:-100}
117 | TOP_LTV_RATIO=${TOP_LTV_RATIO:-0.2}
118 |
119 | SOURCE_TABLE="${TABLE_SALES:-$DATASET_ID.sales}" # Original data dump.
120 | TABLE_CRM="${TABLE_CRM:-$DATASET_ID.crm}" # Table with user information.
121 | TABLE_AGGRED="${DATASET_ID}.aggred" # Used to create training dataset and to run predictions.
122 | TABLE_ML="${DATASET_ID}.ml" # Used to train the model
123 | TABLE_PREDICTIONS="${DATASET_ID}.predictions" # Table where to output predictions.
124 | TABLE_EMAILS="${DATASET_ID}.top_emails" # Table where to output top emails.
125 |
126 | # Project and Dataset IDs are required.
127 | if [ -z "$PROJECT_ID" ]; then
128 | echo "Please specify a project id. See ./run.sh --help"
129 | exit 1
130 | fi
131 |
132 | if [ -z "$DATASET_ID" ]; then
133 | echo "Please specify a dataset id. See ./run.sh --help"
134 | exit 1
135 | fi
136 |
137 | echo "---------------------------------------"
138 | echo "Runs script with the follow parameters."
139 | echo "PROJECT_ID: ${PROJECT_ID}"
140 | echo "DATASET_ID: ${DATASET_ID}"
141 | echo "WINDOW_STEP: ${WINDOW_STEP}"
142 | echo "WINDOW_STEP_INITIAL: ${WINDOW_STEP_INITIAL}"
143 | echo "LENGTH_FUTURE: ${LENGTH_FUTURE}"
144 | echo "MAX_STDV_MONETARY: ${MAX_STDV_MONETARY}"
145 | echo "MAX_STDV_QTY: ${MAX_STDV_QTY}"
146 | echo "TOP_LTV_RATIO: ${TOP_LTV_RATIO}"
147 | echo "Source table for transactions is: ${SOURCE_TABLE}"
148 | echo "Source table for CRM is: ${TABLE_CRM}"
149 | echo "Will train a model named: ${TRAIN_MODEL_ID}"
150 | echo "Will run predictions using model: ${USE_MODEL_ID}"
151 | echo "--------------------------------------"
152 |
153 | gcloud config set project $PROJECT_ID
154 |
155 | bq show ${PROJECT_ID}:${DATASET_ID} || bq mk ${PROJECT_ID}:${DATASET_ID}
156 |
157 | # Load example datasets from public GCS to BQ if they don't exist.
158 | bq show ${TABLE_CRM} || \
159 | bq load \
160 | --project_id $PROJECT_ID \
161 | --skip_leading_rows 1 \
162 | --max_bad_records 100000 \
163 | --replace \
164 | --field_delimiter "," \
165 | --autodetect \
166 | ${TABLE_CRM} \
167 | gs://solutions-public-assets/analytics-componentized-patterns/ltv/crm.csv
168 |
169 | bq show ${SOURCE_TABLE} || \
170 | bq load \
171 | --project_id $PROJECT_ID \
172 | --skip_leading_rows 1 \
173 | --max_bad_records 100000 \
174 | --replace \
175 | --field_delimiter "," \
176 | --autodetect \
177 | ${SOURCE_TABLE} \
178 | gs://solutions-public-assets/analytics-componentized-patterns/ltv/sales_*
179 |
180 |
181 | function store_procedure() {
182 | echo ""
183 | echo "------------------------------------------------------"
184 | echo "--- Deploys procedure ${1}. ---"
185 | echo ""
186 | bq query \
187 | --project_id ${PROJECT_ID} \
188 | --dataset_id ${DATASET_ID} \
189 | --use_legacy_sql=false \
190 | < ${1}
191 | }
192 |
193 | function run_action() {
194 | echo ""
195 | echo "--------------------------------------------"
196 | echo " Run the following procedure:"
197 | echo "$@"
198 | echo ""
199 | bq query \
200 | --project_id ${PROJECT_ID} \
201 | --dataset_id ${DATASET_ID} \
202 | --use_legacy_sql=false \
203 | "$@"
204 | }
205 |
206 | store_procedure 00_procedure_persist.sql
207 | store_procedure 10_procedure_match.sql
208 | store_procedure 20_procedure_prepare.sql
209 |
210 | run_action """
211 | CALL MatchFields('${SOURCE_TABLE}');
212 | CALL PrepareForML(
213 | CAST('${MAX_STDV_MONETARY}' AS INT64),
214 | CAST('${MAX_STDV_QTY}' AS INT64),
215 | CAST('${WINDOW_LENGTH}' AS INT64),
216 | CAST('${WINDOW_STEP}' AS INT64),
217 | CAST('${WINDOW_STEP_INITIAL}' AS INT64),
218 | CAST('${LENGTH_FUTURE}' AS INT64),
219 | '${TABLE_AGGRED}',
220 | '${TABLE_ML}');"""
221 |
222 | store_procedure 30_procedure_train.sql
223 | if [[ $TRAIN_MODEL_ID != "null" ]] ;then
224 | run_action """
225 | CALL TrainLTV(
226 | '${TRAIN_MODEL_ID}',
227 | '${TABLE_ML}');"""
228 | fi
229 |
230 | store_procedure 40_procedure_predict.sql
231 | if [[ $USE_MODEL_ID != "null" ]] ;then
232 | run_action """
233 | CALL PredictLTV(
234 | '${USE_MODEL_ID}',
235 | '${TABLE_AGGRED}',
236 | 'NULL',
237 | CAST('${WINDOW_LENGTH}' AS INT64),
238 | '${TABLE_PREDICTIONS}');"""
239 | fi
240 |
241 | store_procedure 50_procedure_top.sql
242 | run_action """
243 | DECLARE TOP_EMAILS ARRAY<STRING>;
244 |
245 | CALL ExtractTopEmails(
246 | CAST('${TOP_LTV_RATIO}' AS FLOAT64),
247 | '${TABLE_PREDICTIONS}',
248 | '${TABLE_CRM}',
249 | '${TABLE_EMAILS}');"""
--------------------------------------------------------------------------------
/retail/propensity-model/bqml/README.md:
--------------------------------------------------------------------------------
1 | ## License
2 | ```
3 | Copyright 2020 Google LLC
4 |
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 |
9 | https://www.apache.org/licenses/LICENSE-2.0
10 |
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS,
13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | See the License for the specific language governing permissions and
15 | limitations under the License.
16 | ```
17 | [](LICENSE)
18 |
19 | # How to build an end-to-end propensity to purchase solution using BigQuery ML and Kubeflow Pipelines
20 | You’ll learn how to build a propensity model (whether a customer is going to buy) and how to orchestrate an ML pipeline for doing so. You can use the code as a reference guide. Amend, replace, or add new AI Pipeline components according to your use case. Please refer to the [Notebook](bqml_kfp_retail_propensity_to_purchase.ipynb), which contains further documentation and detailed instructions.
21 |
22 | To see a step-by-step tutorial that walks you through implementing this solution, see [Predicting customer propensity to buy by using BigQuery ML and AI Platform](https://cloud.google.com/solutions/predicting-customer-propensity-to-buy).
23 |
24 | ### The notebook does the following:
25 |
26 | - Environment Setup
27 |
28 | - Set up Cloud AI Platform Pipelines (using the Cloud Console)
29 | - Install KFP client
30 | - Install Python packages for Google Cloud Services
31 |
32 | - Kubeflow Pipelines (KFP) Setup
33 |
34 | - Prepare Data for the training
35 |
36 | - Create/Validate a Google Cloud Storage Bucket/Folder
37 | - Create the input table in BigQuery
38 |
39 | - Train the model
40 | - Evaluate the model
41 | - Prepare the Model for batch prediction
42 |
43 | - Prepare a test dataset (a table)
44 | - Run predictions with the model in BigQuery
45 |
46 | - Prepare the Model for online prediction
47 |
48 | - Create a new revision (Model revision management)
49 | - Export the BigQuery Model
50 |
51 | - Export the Model from BigQuery to Google Cloud Storage
52 | - Export the Training Stats to Google Cloud Storage
53 | - Export the Eval Metrics to Google Cloud Storage
54 |
55 | - Deploy to Cloud AI Platform Prediction
56 | - Run predictions with the model in Cloud AI Platform Prediction
57 |
58 |
59 | - Data Exploration using BigQuery, Pandas, matplotlib
60 | - Adherence to SDLC methodologies (opinionated)
61 |
62 | - Variables naming conventions
63 |
64 | - Upper case names for immutable variables
65 | - Lower case names for mutable variables
66 | - Naming prefixes with rpm_ or RPM_
67 |
68 | - Unit Tests
69 | - Cleanup/Reset utility functions
70 |
71 | - KFP knowledge share (demonstration)
72 |
73 | - Pass input params through function args
74 | - Pass params through pipeline args
75 | - Pass output from one Component as input of another
76 | - Create an external shared volume available to all the Components
77 | - Use built-in Operators
78 | - Build lightweight Components
79 | - Set Component not to cache
80 |
81 |
82 |
83 | ### Architecture of the pipeline
84 | 
85 |
86 | ### Data Exploration and Visualization in the notebook
87 |
88 | 
89 |
90 | 
91 |
92 | ## Running the Unit tests
93 |
94 | Create a local context and use it to unit test a KFP Pipeline component locally, outside of the deployed pipeline. Below is an example that tests the `create_gcs_bucket_folder` component:
95 | ```python
96 | # test locally create_gcs_bucket_folder
97 | local_context = get_local_context()
98 | import json
99 | update_local_context (create_gcs_bucket_folder(
100 | json.dumps(local_context),
101 | local_context['RPM_GCP_STORAGE_BUCKET'],
102 | local_context['RPM_GCP_PROJECT'],
103 | local_context['RPM_DEFAULT_BUCKET_EXT'],
104 | local_context['RPM_GCP_STORAGE_BUCKET_FOLDER'],
105 | local_context['RPM_DEFAULT_BUCKET_FOLDER_NAME']
106 | ))
107 | ```
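The helpers used above (`get_local_context` and `update_local_context`) are defined in the Notebook itself. For illustration only, here is a minimal sketch of what such local-context helpers could look like, assuming the context is a plain dict of `RPM_*` settings that is serialized to JSON for each component and merged back from the component's first output:

```python
# Hypothetical stand-ins for the Notebook's local-context helpers (illustration only).
import json

_local_context = {
    'RPM_GCP_PROJECT': 'your-project',               # placeholder values
    'RPM_GCP_STORAGE_BUCKET': 'your-bucket',
    'RPM_DEFAULT_BUCKET_EXT': 'your-project-bucket',
    'RPM_GCP_STORAGE_BUCKET_FOLDER': 'propensity',
    'RPM_DEFAULT_BUCKET_FOLDER_NAME': 'data',
}

def get_local_context():
    # Return a copy so a test can modify it without side effects.
    return dict(_local_context)

def update_local_context(component_outputs):
    # Assumes the component's first output is the updated context as JSON
    # (matching the 'rpm_context' output shown in the KFP snippets below).
    _local_context.update(json.loads(component_outputs[0]))
```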
108 |
109 | ### Utility functions
110 |
111 | Below is a utility function that purges test artifacts created during unit/integration testing; this example deletes a BigQuery table:
112 |
113 | ```python
114 | #delete BQ Table if not needed...!!!BE CAREFUL!!!
115 | def delete_table(table_id):
116 | from google.cloud import bigquery
117 | # Construct a BigQuery client object.
118 | client = bigquery.Client()
119 | # client.delete_table(table_id, not_found_ok=True) # Make an API request.
120 | client.delete_table(table_id) # Make an API request.
121 | print("Deleted table '{}'.".format(table_id))
122 | #delete the table in the bigquery
123 | delete_table(get_local_context()['rpm_table_id'])
124 | ```
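For Cloud Storage artifacts created during testing, a similar cleanup helper can be used. This is a hedged sketch: `delete_gcs_folder` is not part of the Notebook, and the bucket/folder keys are taken from the local context shown earlier:

```python
#delete GCS objects under a folder if not needed...!!!BE CAREFUL!!!
def delete_gcs_folder(bucket_name, folder_prefix):
    from google.cloud import storage
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    # Delete every object under the given folder prefix.
    for blob in bucket.list_blobs(prefix=folder_prefix):
        blob.delete()
        print("Deleted object '{}'.".format(blob.name))

# Purge the working folder used while testing.
ctx = get_local_context()
delete_gcs_folder(ctx['RPM_GCP_STORAGE_BUCKET'], ctx['RPM_GCP_STORAGE_BUCKET_FOLDER'])
```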
125 |
126 | ## Mandatory Variables
127 |
128 | You must set values for these parameters; refer to the instructions in the Notebook for details:
129 | ```python
130 | RPM_GCP_PROJECT=''
131 | RPM_GCP_KFP_HOST=''
132 | RPM_GCP_APPLICATION_CREDENTIALS=""
133 | ```
134 |
135 | ## A screen grab of the Output of the KFP pipeline
136 | 
137 |
138 | ## KFP Knowledge Share
139 |
140 | The code snippets below demonstrate various KFP syntaxes and show different ways to pass parameters. Use whichever approach works for you.
141 |
142 | ### Pass input params through function args example:
143 | ```python
144 | # create BQ DS only if it doesn't exist
145 | from typing import NamedTuple
146 | def create_bq_ds (ctx:str,
147 | RPM_GCP_PROJECT: str,
148 | RPM_BQ_DATASET_NAME: str,
149 | RPM_LOCATION: str
150 | ) -> NamedTuple('Outputs', [
151 | ('rpm_context', str),
152 | ('rpm_bq_ds_name', str),
153 | ]):
154 | ```
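A lightweight component with this signature typically returns a `collections.namedtuple` whose fields match the declared outputs. Below is a hedged, illustrative sketch of such a return, not the Notebook's actual implementation:

```python
# Illustration only: return values that satisfy the NamedTuple signature above.
def create_bq_ds_sketch(ctx: str,
                        RPM_GCP_PROJECT: str,
                        RPM_BQ_DATASET_NAME: str,
                        RPM_LOCATION: str):
    from collections import namedtuple
    # ... create the BigQuery dataset here if it does not already exist ...
    outputs = namedtuple('Outputs', ['rpm_context', 'rpm_bq_ds_name'])
    return outputs(rpm_context=ctx, rpm_bq_ds_name=RPM_BQ_DATASET_NAME)
```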
155 | 
156 |
157 | ### Pass params through pipeline args example:
158 | ```python
159 | def bq_googlestr_dataset_to_bq_to_caip_pipeline(
160 | data_path = all_vars['RPM_PVC_NAME'] #you can pass input variables
161 | ):
162 | ```
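A hedged example of overriding such a pipeline argument at submission time, assuming the KFP host and PVC name are available through the `all_vars` dictionary used elsewhere in this README:

```python
import kfp

# Submit the pipeline and override its data_path argument for this run.
client = kfp.Client(host=all_vars['RPM_GCP_KFP_HOST'])
client.create_run_from_pipeline_func(
    bq_googlestr_dataset_to_bq_to_caip_pipeline,
    arguments={'data_path': all_vars['RPM_PVC_NAME']})
```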
163 |
164 | ### Pass Output from one Kubeflow Pipelines Component as Input of another Kubeflow Pipelines Component example:
165 | The output `rpm_table_id` from the `load_bq_ds_op` component is passed as an input to the `create_bq_ml_op` component:
166 | ```python
167 | create_bq_ml_op = create_kfp_comp(create_bq_ml)(
168 | load_bq_ds_op.outputs['rpm_context'],
169 | all_vars['RPM_GCP_PROJECT'],
170 | all_vars['RPM_MODEL_NAME'],
171 | all_vars['RPM_DEFAULT_MODEL_NAME'],
172 | create_bq_ds_op.outputs['rpm_bq_ds_name'],
173 | load_bq_ds_op.outputs['rpm_table_id']
174 | )
175 | ```
176 |
177 | ### Create an external Shared Volume available to all the Kubeflow Pipelines Component example:
178 | ``` python
179 | #create a volume where the dataset will be temporarily stored.
180 | pvc_op = VolumeOp(
181 | name=all_vars['RPM_PVC_NAME'],
182 | resource_name=all_vars['RPM_PVC_NAME'],
183 | size="20Gi",
184 | modes=dsl.VOLUME_MODE_RWO
185 | )
186 | ```
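A hedged example of how a component could use that shared volume; the op below is illustrative (not one of the pipeline's real components) and mounts the volume at an assumed path with `add_pvolumes`:

```python
import kfp.dsl as dsl

# Illustrative container op that mounts the shared volume at /mnt/data and
# lists its contents; real components would read/write the dataset there.
check_volume_op = dsl.ContainerOp(
    name='check-shared-volume',
    image='google/cloud-sdk:latest',
    command=['ls', '-la', '/mnt/data'],
).add_pvolumes({'/mnt/data': pvc_op.volume})
```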
187 |
188 | ### Use built-in Ops example (`VolumeOp`, shown again below, is one of the Ops built into the KFP SDK):
189 | ``` python
190 | #create a volume where the dataset will be temporarily stored.
191 | pvc_op = VolumeOp(
192 | name=all_vars['RPM_PVC_NAME'],
193 | resource_name=all_vars['RPM_PVC_NAME'],
194 | size="20Gi",
195 | modes=dsl.VOLUME_MODE_RWO
196 | )
197 | ```
198 |
199 | ### Build a lightweight Kubeflow Pipelines Component example:
200 | ``` python
201 | # converting functions to container operations
202 | import kfp.components as comp
203 | def create_kfp_comp(rpm_comp):
204 | return comp.func_to_container_op(
205 | func=rpm_comp,
206 | base_image="google/cloud-sdk:latest")
207 | ```
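For example, a hypothetical invocation consistent with the snippets above wraps the `create_bq_ds` function as a containerized op inside the pipeline definition (the `all_vars` keys shown are assumptions):

```python
import json

# Wrap the plain Python function as a containerized op and call it with its inputs.
create_bq_ds_op = create_kfp_comp(create_bq_ds)(
    json.dumps(all_vars),                 # serialized context
    all_vars['RPM_GCP_PROJECT'],
    all_vars['RPM_BQ_DATASET_NAME'],
    all_vars['RPM_LOCATION'])
```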
208 |
209 | ### Set Kubeflow Pipelines Component not to cache example:
210 | ```python
211 | get_versioned_bqml_model_export_path_op.execution_options.caching_strategy.max_cache_staleness = "P0D"
212 | ```
213 |
214 | ## Questions? Feedback?
215 | If you have any questions or feedback, please open up a [new issue](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/issues).
--------------------------------------------------------------------------------
/retail/propensity-model/bqml/images/DataExploration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/propensity-model/bqml/images/DataExploration.png
--------------------------------------------------------------------------------
/retail/propensity-model/bqml/images/DataVisualization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/propensity-model/bqml/images/DataVisualization.png
--------------------------------------------------------------------------------
/retail/propensity-model/bqml/images/KFP-Function_Params.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/propensity-model/bqml/images/KFP-Function_Params.png
--------------------------------------------------------------------------------
/retail/propensity-model/bqml/images/KFP-Graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/propensity-model/bqml/images/KFP-Graph.png
--------------------------------------------------------------------------------
/retail/propensity-model/bqml/images/MLOPs-Pipeline-Architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/propensity-model/bqml/images/MLOPs-Pipeline-Architecture.png
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-mlops/README.md:
--------------------------------------------------------------------------------
1 | ```python
2 | # Copyright 2020 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # https://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | ```
16 |
17 | ## Tutorial Overview
18 |
19 | This is a three-part tutorial. Part one walks you through a complete end-to-end machine learning use case on Google Cloud Platform. You will learn how to build a hybrid recommendation model using an embedding technique with Google BigQuery Machine Learning, drawn from the book [“BigQuery: The Definitive Guide”](https://www.amazon.com/Google-BigQuery-Definitive-Warehousing-Analytics/dp/1492044466), a highly recommended book written by BigQuery and ML expert Valliappa Lakshmanan. We do not cover in detail typical machine learning steps such as data exploration and cleaning, feature selection, and feature engineering (other than the embedding technique shown here); we encourage you to explore these and see if you can improve the model quality and performance. Instead, we focus on showing you how to orchestrate the entire machine learning process with Kubeflow on Google AI Platform Pipelines. In [PART TWO](part_2/README.md), you will learn how to set up a CI/CD pipeline with Google Cloud Source Repositories and Google Cloud Build. In [PART THREE](part_3/README.md), you will learn how to run the same code as in Part One (with minor changes) in Google's new Vertex AI Pipelines.
20 |
21 | The use case is to predict the propensity of booking for any user/hotel combination. The intuition behind the embedding layer with matrix factorization is that if we can find similar hotels that are close in the embedding space, we can predict more accurately whether a user will book a given hotel.
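As a hedged illustration of this idea (the dataset, table, and column names below are placeholders rather than the tutorial's actual schema; see the notebook for the real model), a BigQuery ML matrix factorization model that learns such embeddings from implicit booking feedback can be created like this:

```python
from google.cloud import bigquery

client = bigquery.Client()

# Placeholder dataset/table/column names; adapt these to the tutorial's schema.
query = """
CREATE OR REPLACE MODEL `your_dataset.hotel_recommender`
OPTIONS(
  model_type = 'matrix_factorization',
  feedback_type = 'implicit',
  user_col = 'user_id',
  item_col = 'hotel_cluster',
  rating_col = 'rating',
  num_factors = 16)
AS
SELECT user_id, hotel_cluster, rating
FROM `your_dataset.hotel_bookings`
"""
client.query(query).result()  # Blocks until model training completes.
```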
22 |
23 | 
24 |
25 | ## Getting Started
26 | Use this [notebook](kfp_tutorial.ipynb) to get started.
27 |
28 | ## Questions? Feedback?
29 | If you have any questions or feedback, please open up a [new issue](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/issues).
30 |
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-mlops/dockerfile:
--------------------------------------------------------------------------------
1 | FROM gcr.io/deeplearning-platform-release/base-cpu
2 |
3 | RUN apt-get update -y && apt-get -y install kubectl
4 |
5 | RUN python -m pip install --no-cache -I kfp gcsfs
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-mlops/part_2/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.7
2 |
3 | # ensure local python is preferred over distribution python
4 | ENV PATH /usr/local/bin:$PATH
5 |
6 | RUN pip3 install fire pyyaml pathlib
7 | RUN pip3 install --upgrade kfp
8 | RUN pip3 install google-cloud-bigquery
9 |
10 |
11 |
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-mlops/part_2/README.md:
--------------------------------------------------------------------------------
1 | ```
2 | # Copyright 2020 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # https://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | ```
16 |
17 | ## Tutorial Overview
18 |
19 | This is part two of the tutorial, where you will learn how to set up a CI/CD pipeline with Google Cloud Source Repositories and Google Cloud Build. Google Cloud Source Repositories is a fully managed private Git repository service with integrations for continuous integration, delivery, and deployment. Google Cloud Build is a serverless CI/CD platform to build, test, and deploy code. In this tutorial, you will create a new repository in Google Cloud Source Repositories and a new Google Cloud Build trigger that compiles and deploys a KubeFlow Pipeline automatically when you check new code into the source repository.
20 |
21 | 
22 |
23 | ## Prerequisites
24 | * Follow the instructions in [Part One Prerequisites](../README.md) to ensure your environment is set up correctly before continuing, i.e. the datasets in BigQuery, AI Platform Pipelines, Google Cloud Storage, etc.
25 |
26 | ## Setup Instructions
27 | 1. Clone this repository and add only the content of this folder (part_2) to a new repository in Google Cloud Source Repositories. Follow the instructions [here](https://cloud.google.com/source-repositories/docs/quickstart) to create a new repository.
28 |
29 | 1. Open [pipeline.py](pipeline.py) and update the following line of code ```base_image = 'YOUR BASE IMAGE CONTAINER URL FOR COMPONENTS BELOW'```. This container is built as part of [Part One Prerequisites](../README.md).
30 |
31 | 1. Build and push a Docker image using [this dockerfile](Dockerfile) as the base image. It is used by Google Cloud Build to compile and deploy a KubeFlow Pipeline. To simplify this step, run the [dockerbuild.sh](dockerbuild.sh) script.
32 |
33 | 1. Create a new Google Cloud Build Trigger by following instructions [here](https://cloud.google.com/build/docs/automating-builds/create-manage-triggers)
34 |     - In the Source Repository section, select the repository created in the previous step
35 |     - In the Configuration section, enter cloudbuild.yaml as the Cloud Build configuration file location. This file instructs Google Cloud Build to download the container built earlier and execute the [pipeline.py](pipeline.py) Python file
36 |     - In the Advanced section, add the following substitution variables. These variables are passed by cloudbuild.yaml to the pipeline at runtime for dynamic configuration
37 | - **_DATASET_NAME** YOUR_BQ_DATASET_NAME
38 | - **_KFP_HOST** YOUR_AI_PLATFORM_PIPELINE_URL
39 | - **_MODEL_STORAGE** YOUR_MODEL_EXPORT_GCS_LOCATION
40 | - **_PIPELINE_IMAGE** YOUR_CLOUD_BUILD_DOCKER_IMAGE_URL_ABOVE
41 | - **_TAG** YOUR_CLOUD_BUILD_DOCKER_IMAGE_TAG_ABOVE
42 | - **_PROJECT_ID** YOUR_GCP_PROJECT_ID
43 | - Leave everything else default and save the trigger
44 |
45 | 1. In the Google Cloud Build settings, make sure Kubernetes Engine is ENABLED with the Kubernetes Engine Developer role. This allows Cloud Build to deploy the KubeFlow pipeline. To learn more about permissions, see [here](https://cloud.google.com/build/docs/securing-builds/configure-access-for-cloud-build-service-account).
46 |
47 | 1. You can now either update the source code and check it in to trigger the Google Cloud Build process, **OR** click the Run button in the Google Cloud Build Trigger console.
48 |
49 |
50 | ## Cleaning up
51 |
52 | * Delete the Google Cloud Trigger
53 | * Delete the Google Cloud Source Repository
54 | * Follow instructions in [Part One Clean Up](../README.md) section to delete the rest of cloud services
55 |
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-mlops/part_2/cicd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/recommendation-system/bqml-mlops/part_2/cicd.png
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-mlops/part_2/cloudbuild.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | steps:
17 | - name: '${_PIPELINE_IMAGE}:${_TAG}'
18 | entrypoint: 'python3'
19 | args: ['pipeline.py',
20 | '--project_id', '${_PROJECT_ID}',
21 | '--dataset_name', '${_DATASET_NAME}',
22 | '--model_storage', '${_MODEL_STORAGE}',
23 | '--kfp_host', '${_KFP_HOST}']
24 | id: 'compile pipeline'
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-mlops/part_2/dockerbuild.sh:
--------------------------------------------------------------------------------
1 | export PROJECT_ID=$(gcloud config list project --format "value(core.project)")
2 | export IMAGE_REPO_NAME=hotel_recommender_pipeline_container
3 | export IMAGE_TAG=python3
4 | export IMAGE_URI=gcr.io/$PROJECT_ID/$IMAGE_REPO_NAME:$IMAGE_TAG
5 |
6 | docker build --no-cache -f Dockerfile -t $IMAGE_URI ./
7 | docker push $IMAGE_URI
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-mlops/part_3/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.7
2 |
3 | # Install Scikit-Learn
4 | # Scikit-learn 0.20 was the last version to support Python 2.7 and Python 3.4.
5 | # Scikit-learn now requires Python 3.6 or newer.
6 | RUN python -m pip install --no-cache -I scikit-learn==0.23.2
7 |
8 | # Install pandas
9 | RUN python -m pip install --no-cache -I pandas==1.0.5
10 |
11 | # Install Google SDK
12 | RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && apt-get update -y && apt-get install google-cloud-sdk -y
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-mlops/part_3/README.md:
--------------------------------------------------------------------------------
1 | ```python
2 | # Copyright 2020 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # https://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | ```
16 |
17 | ## Tutorial Overview
18 |
19 | This is part three of the tutorial, where you will learn how to run the same code as in [Part One](../README.md) (with minor changes) in Google's new Vertex AI Pipelines. Vertex Pipelines helps you automate, monitor, and govern your ML systems by orchestrating your ML workflow in a serverless manner and storing your workflow's artifacts using Vertex ML Metadata. By storing the artifacts of your ML workflow in Vertex ML Metadata, you can analyze the lineage of your workflow's artifacts. For example, an ML model's lineage may include the training data, hyperparameters, and code that were used to create the model.
20 |
21 | You will also learn how to export the final BQML model and host it on a Google Vertex AI Endpoint.
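As a hedged sketch of that workflow (project, region, bucket, and pipeline names below are placeholders; the notebook is the source of truth), a KFP v2 pipeline is compiled to JSON and submitted as a Vertex AI Pipelines run roughly like this:

```python
from kfp.v2 import dsl, compiler
from google.cloud import aiplatform

@dsl.component
def say_hello(text: str) -> str:
    return text

@dsl.pipeline(name='hotel-recs-demo')
def demo_pipeline(text: str = 'hello vertex'):
    say_hello(text=text)

# Compile the pipeline definition to the Vertex-compatible JSON format.
compiler.Compiler().compile(pipeline_func=demo_pipeline,
                            package_path='demo_pipeline.json')

# Submit it as a Vertex AI Pipelines run.
aiplatform.init(project='your-project', location='us-central1')
job = aiplatform.PipelineJob(
    display_name='demo-pipeline',
    template_path='demo_pipeline.json',
    pipeline_root='gs://your-bucket/pipeline_root')
job.run()
```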
22 |
23 | 
24 |
25 | ## Getting Started
26 | Use this [notebook](vertex_ai_pipeline.ipynb) to get started.
27 |
28 | ## Questions? Feedback?
29 | If you have any questions or feedback, please open up a [new issue](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/issues).
30 |
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-mlops/part_3/dockerbuild.sh:
--------------------------------------------------------------------------------
1 | export PROJECT_ID=$(gcloud config list project --format "value(core.project)")
2 | export IMAGE_REPO_NAME=hotel_recommender_vertexai_container
3 | export IMAGE_TAG=latest
4 | export IMAGE_URI=gcr.io/$PROJECT_ID/$IMAGE_REPO_NAME:$IMAGE_TAG
5 |
6 | docker build --no-cache -f Dockerfile -t $IMAGE_URI ./
7 | docker push $IMAGE_URI
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-mlops/part_3/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/recommendation-system/bqml-mlops/part_3/pipeline.png
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-mlops/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/recommendation-system/bqml-mlops/pipeline.png
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | .DS_Store
3 | .idea/
4 | *.pyc
5 | *.egg-info/
6 | workspace/
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/00_prep_bq_procedures.ipynb:
--------------------------------------------------------------------------------
1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"environment":{"name":"tf2-gpu.2-3.m61","type":"gcloud","uri":"gcr.io/deeplearning-platform-release/tf2-gpu.2-3:m61"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.9"},"colab":{"name":"00_prep_bq_procedures.ipynb","provenance":[],"collapsed_sections":[],"toc_visible":true}},"cells":[{"cell_type":"markdown","metadata":{"id":"T7ya4ArX8p1h"},"source":["# Create BigQuery stored procedures\n","\n","This notebook is the second of two notebooks that guide you through completing the prerequisites for running the [Real-time Item-to-item Recommendation with BigQuery ML Matrix Factorization and ScaNN](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/tree/master/retail/recommendation-system/bqml-scann) solution.\n","\n","Use this notebook to create the following stored procedures that are needed by the solution:\n","\n","+ `sp_ComputePMI` - Computes [pointwise mutual information (PMI)](https://en.wikipedia.org/wiki/Pointwise_mutual_information) from item co-occurence data. This data is used by a matrix factorization model to learn item embeddings.\n","+ `sp_TrainItemMatchingModel` - Creates the `item_embedding_model` [matrix factorization](https://en.wikipedia.org/wiki/Matrix_factorization_(recommender_systems)) model. This model learns item embeddings based on the PMI data computed by `sp_ComputePMI`. \n","+ `sp_ExractEmbeddings` - Extracts the item embedding values from the `item_embedding_model` model, aggregates these values to produce a single embedding vector for each item, and stores these vectors in the `item_embeddings` table. The vector data is later exported to Cloud Storage to be used for item embedding lookup.\n","\n","Before starting this notebook, you must run the [00_prep_bq_and_datastore](00_prep_bq_and_datastore.ipynb) notebook to complete the first part of the prerequisites.\n","\n","After completing this notebook, you can run the solution either step-by-step or with a TFX pipeline:\n","\n","+ To start running the solution step-by-step, run the [01_train_bqml_mf_pmi](01_train_bqml_mf_pmi.ipynb) notebook to create item embeddings.\n","+ To run the solution by using a TFX pipeline, run the [tfx01_interactive](tfx01_interactive.ipynb) notebook to create the pipeline."]},{"cell_type":"markdown","metadata":{"id":"8XDNl5508p1q"},"source":["## Setup\r\n","\r\n","Install the required Python packages, configure the environment variables, and authenticate your GCP account."]},{"cell_type":"code","metadata":{"id":"ciAORQac8p1r"},"source":["!pip install -q -U google-cloud-bigquery pyarrow"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"dn791d8i8p1s"},"source":["### Import libraries"]},{"cell_type":"code","metadata":{"id":"XdHb5Au58p1t"},"source":["import os\n","from google.cloud import bigquery"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"9UFVH4xM8p1u"},"source":["### Configure GCP environment settings\r\n","\r\n","Update the following variables to reflect the values for your GCP environment:\r\n","\r\n","+ `PROJECT_ID`: The ID of the Google Cloud project you are using to implement this solution.\r\n","+ `BUCKET`: The name of the Cloud Storage bucket you created to use with this solution. 
The `BUCKET` value should be just the bucket name, so `myBucket` rather than `gs://myBucket`.\r\n"]},{"cell_type":"code","metadata":{"id":"gZDEzHun8p1v"},"source":["PROJECT_ID = 'yourProject' # Change to your project.\n","BUCKET = 'yourBucketName' # Change to the bucket you created.\n","SQL_SCRIPTS_DIR = 'sql_scripts'\n","BQ_DATASET_NAME = 'recommendations'\n","\n","!gcloud config set project $PROJECT_ID"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"kiCcnPua8p1v"},"source":["### Authenticate your GCP account\n","This is required if you run the notebook in Colab. If you use an AI Platform notebook, you should already be authenticated."]},{"cell_type":"code","metadata":{"id":"iSg7I1e38p1w"},"source":["try:\n"," from google.colab import auth\n"," auth.authenticate_user()\n"," print(\"Colab user is authenticated.\")\n","except: pass"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"gvIYbnii8p1x"},"source":["## Create the stored procedure dependencies"]},{"cell_type":"code","metadata":{"id":"NxvKwbdf8p1y"},"source":["%%bigquery --project $PROJECT_ID\n","\n","CREATE TABLE IF NOT EXISTS recommendations.item_cooc\n","AS SELECT 0 AS item1_Id, 0 AS item2_Id, 0 AS cooc, 0 AS pmi;"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"qc5rdQap8p1z"},"source":["%%bigquery --project $PROJECT_ID\n","\n","CREATE MODEL IF NOT EXISTS recommendations.item_matching_model\n","OPTIONS(\n"," MODEL_TYPE='matrix_factorization', \n"," USER_COL='item1_Id', \n"," ITEM_COL='item2_Id',\n"," RATING_COL='score'\n",")\n","AS\n","SELECT 0 AS item1_Id, 0 AS item2_Id, 0 AS score;"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"-W2Rajhs8p1z"},"source":["## Create the stored procedures\r\n","\r\n","Run the scripts that create the BigQuery stored procedures."]},{"cell_type":"code","metadata":{"id":"Cp87zCIu8p10"},"source":["client = bigquery.Client(project=PROJECT_ID)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"VKXjLIqU8p11"},"source":["sql_scripts = dict()\n","\n","for script_file in [file for file in os.listdir(SQL_SCRIPTS_DIR) if '.sql' in file]:\n"," script_file_path = os.path.join(SQL_SCRIPTS_DIR, script_file)\n"," sql_script = open(script_file_path, 'r').read()\n"," sql_script = sql_script.replace('@DATASET_NAME', BQ_DATASET_NAME)\n"," sql_scripts[script_file] = sql_script"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"mHq4rJYf8p12"},"source":["for script_file in sql_scripts:\n"," print(f'Executing {script_file} script...')\n"," query = sql_scripts[script_file]\n"," query_job = client.query(query)\n"," result = query_job.result()\n","\n","print('Done.')"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"1a3kqq5Q8p12"},"source":["### List the stored procedures"]},{"cell_type":"code","metadata":{"id":"Jm5crAur8p13"},"source":["query = f'SELECT * FROM {BQ_DATASET_NAME}.INFORMATION_SCHEMA.ROUTINES;'\n","query_job = client.query(query)\n","query_job.result().to_dataframe()"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"qeJmhunE961u"},"source":["You can also verify that the stored procedures have been created by viewing them in the [BigQuery console](https://pantheon.corp.google.com/bigquery).\r\n"]},{"cell_type":"markdown","metadata":{"id":"mxd9Wvpi8p13"},"source":["## License\n","\n","Copyright 2020 Google LLC\n","\n","Licensed under the Apache License, Version 2.0 (the 
\"License\");\n","you may not use this file except in compliance with the License. You may obtain a copy of the License at: http://www.apache.org/licenses/LICENSE-2.0\n","\n","Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. \n","\n","See the License for the specific language governing permissions and limitations under the License.\n","\n","**This is not an official Google product but sample code provided for an educational purpose**"]}]}
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/02_export_bqml_mf_embeddings.ipynb:
--------------------------------------------------------------------------------
1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"02_export_bqml_mf_embeddings.ipynb","provenance":[],"collapsed_sections":[],"toc_visible":true},"environment":{"name":"tf2-2-3-gpu.2-3.m59","type":"gcloud","uri":"gcr.io/deeplearning-platform-release/tf2-2-3-gpu.2-3:m59"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.8"}},"cells":[{"cell_type":"markdown","metadata":{"id":"dkgce5cdOcW7"},"source":["# Part 2: Process the item embedding data in BigQuery and export it to Cloud Storage\n","\n","This notebook is the second of five notebooks that guide you through running the [Real-time Item-to-item Recommendation with BigQuery ML Matrix Factorization and ScaNN](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/tree/master/retail/recommendation-system/bqml-scann) solution.\n","\n","Use this notebook to complete the following tasks:\n","\n","1. Process the song embeddings data in BigQuery to generate a single embedding vector for each song.\n","1. Use a Dataflow pipeline to write the embedding vector data to CSV files and export the files to a Cloud Storage bucket. \n","\n","Before starting this notebook, you must run the [01_train_bqml_mf_pmi](01_train_bqml_mf_pmi.ipynb) notebook to calculate item PMI data and then train a matrix factorization model with it.\n","\n","After completing this notebook, run the [03_create_embedding_lookup_model](03_create_embedding_lookup_model.ipynb) notebook to create a model to serve the item embedding data.\n","\n"]},{"cell_type":"markdown","metadata":{"id":"SW1RHsqGPNzE"},"source":["## Setup\r\n","\r\n","Import the required libraries, configure the environment variables, and authenticate your GCP account.\r\n","\r\n"]},{"cell_type":"code","metadata":{"id":"Mp6ETYF2R-0q"},"source":["!pip install -U -q apache-beam[gcp]"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"zdSKSzqvR_qY"},"source":["### Import libraries"]},{"cell_type":"code","metadata":{"id":"OcUKzLnuR_wa"},"source":["import os\n","import numpy as np\n","import tensorflow.io as tf_io\n","import apache_beam as beam\n","from datetime import datetime"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"22rDpO3JPcy9"},"source":["### Configure GCP environment settings\r\n","\r\n","Update the following variables to reflect the values for your GCP environment:\r\n","\r\n","+ `PROJECT_ID`: The ID of the Google Cloud project you are using to implement this solution.\r\n","+ `BUCKET`: The name of the Cloud Storage bucket you created to use with this solution. The `BUCKET` value should be just the bucket name, so `myBucket` rather than `gs://myBucket`.\r\n","+ `REGION`: The region to use for the Dataflow job."]},{"cell_type":"code","metadata":{"id":"Nyx4vEd7Oa9I"},"source":["PROJECT_ID = 'yourProject' # Change to your project.\n","BUCKET = 'yourBucketName' # Change to the bucket you created.\n","REGION = 'yourDataflowRegion' # Change to your Dataflow region.\n","BQ_DATASET_NAME = 'recommendations'\n","\n","!gcloud config set project $PROJECT_ID"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"3d89ZwydPhQX"},"source":["### Authenticate your GCP account\n","This is required if you run the notebook in Colab. 
If you use an AI Platform notebook, you should already be authenticated."]},{"cell_type":"code","metadata":{"id":"6ICvdRicPhl8"},"source":["try:\n"," from google.colab import auth\n"," auth.authenticate_user()\n"," print(\"Colab user is authenticated.\")\n","except: pass"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"R1gmEmHbSaQD"},"source":["## Process the item embeddings data\r\n","\r\n","You run the [sp_ExractEmbeddings](sql_scripts/sp_ExractEmbeddings.sql) stored procedure to process the item embeddings data and write the results to the `item_embeddings` table.\r\n","\r\n","This stored procedure works as follows:\r\n","\r\n","1. Uses the [ML.WEIGHTS](https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-weights) function to extract the item embedding matrices from the `item_matching_model` model.\r\n","1. Aggregates these matrices to generate a single embedding vector for each item.\r\n","\r\n"," Because BigQuery ML matrix factorization models are designed for user-item recommendation use cases, they generate two embedding matrices, one for users, and the other of items. However, in this use case, both embedding matrices represent items, but in different axes of the feedback matrix. For more information about how the feedback matrix is calculated, see [Understanding item embeddings](https://cloud.google.com/solutions/real-time-item-matching#understanding_item_embeddings).\r\n"]},{"cell_type":"markdown","metadata":{"id":"utkyuwJUyTlb"},"source":["### Run the `sp_ExractEmbeddings` stored procedure"]},{"cell_type":"code","metadata":{"id":"DK0olptba8qi"},"source":["%%bigquery --project $PROJECT_ID\n","\n","CALL recommendations.sp_ExractEmbeddings() "],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"0UvHD7BJ8Gk0"},"source":["Get a count of the records in the `item_embeddings` table:"]},{"cell_type":"code","metadata":{"id":"pQsJenNFzVJ7"},"source":["%%bigquery --project $PROJECT_ID\n","\n","SELECT COUNT(*) embedding_count\n","FROM recommendations.item_embeddings;"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"sx8JNJbA8PxC"},"source":["See a sample of the data in the `item_embeddings` table:"]},{"cell_type":"code","metadata":{"id":"Y4kTGcaRzVJ7"},"source":["%%bigquery --project $PROJECT_ID\n","\n","SELECT *\n","FROM recommendations.item_embeddings\n","LIMIT 5;"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"i3LKaxlNSkrv"},"source":["## Export the item embedding vector data\n","\n","Export the item embedding data to Cloud Storage by using a Dataflow pipeline. This pipeline does the following:\n","\n","1. Reads the item embedding records from the `item_embeddings` table in BigQuery.\n","1. Writes each item embedding record to a CSV file.\n","1. 
Writes the item embedding CSV files to a Cloud Storage bucket.\n","\n","The pipeline in implemented in the [embeddings_exporter/pipeline.py](embeddings_exporter/pipeline.py) module."]},{"cell_type":"markdown","metadata":{"id":"G8HLFGGl5oac"},"source":["### Configure the pipeline variables\r\n","\r\n","Configure the variables needed by the pipeline:"]},{"cell_type":"code","metadata":{"id":"2ZKaoBwnSk6U"},"source":["runner = 'DataflowRunner'\n","timestamp = datetime.utcnow().strftime('%y%m%d%H%M%S')\n","job_name = f'ks-bqml-export-embeddings-{timestamp}'\n","bq_dataset_name = BQ_DATASET_NAME\n","embeddings_table_name = 'item_embeddings'\n","output_dir = f'gs://{BUCKET}/bqml/item_embeddings'\n","project = PROJECT_ID\n","temp_location = os.path.join(output_dir, 'tmp')\n","region = REGION\n","\n","print(f'runner: {runner}')\n","print(f'job_name: {job_name}')\n","print(f'bq_dataset_name: {bq_dataset_name}')\n","print(f'embeddings_table_name: {embeddings_table_name}')\n","print(f'output_dir: {output_dir}')\n","print(f'project: {project}')\n","print(f'temp_location: {temp_location}')\n","print(f'region: {region}')"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"OyiIh-ATzVJ8"},"source":["try: os.chdir(os.path.join(os.getcwd(), 'embeddings_exporter'))\n","except: pass"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"AHxODaoFzVJ8"},"source":["### Run the pipeline"]},{"cell_type":"markdown","metadata":{"id":"OBarPrE_-LJr"},"source":["It takes about 5 minutes to run the pipeline. You can see the graph for the running pipeline in the [Dataflow Console](https://console.cloud.google.com/dataflow/jobs)."]},{"cell_type":"code","metadata":{"id":"WngoWnt2zVJ9"},"source":["if tf_io.gfile.exists(output_dir):\n"," print(\"Removing {} contents...\".format(output_dir))\n"," tf_io.gfile.rmtree(output_dir)\n","\n","print(\"Creating output: {}\".format(output_dir))\n","tf_io.gfile.makedirs(output_dir)\n","\n","!python runner.py \\\n"," --runner={runner} \\\n"," --job_name={job_name} \\\n"," --bq_dataset_name={bq_dataset_name} \\\n"," --embeddings_table_name={embeddings_table_name} \\\n"," --output_dir={output_dir} \\\n"," --project={project} \\\n"," --temp_location={temp_location} \\\n"," --region={region}"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"PLXGq4CA_Oz0"},"source":["### List the CSV files that were written to Cloud Storage"]},{"cell_type":"code","metadata":{"id":"Ee89jHK5zVJ9"},"source":["!gsutil ls {output_dir}/embeddings-*.csv"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"Fp1bOyVCgBnH"},"source":["## License\n","\n","Copyright 2020 Google LLC\n","\n","Licensed under the Apache License, Version 2.0 (the \"License\");\n","you may not use this file except in compliance with the License. You may obtain a copy of the License at: http://www.apache.org/licenses/LICENSE-2.0\n","\n","Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. \n","\n","See the License for the specific language governing permissions and limitations under the License.\n","\n","**This is not an official Google product but sample code provided for an educational purpose**"]}]}
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/03_create_embedding_lookup_model.ipynb:
--------------------------------------------------------------------------------
1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"03_create_embedding_lookup_model.ipynb","provenance":[],"collapsed_sections":[],"toc_visible":true},"environment":{"name":"tf2-2-3-gpu.2-3.m59","type":"gcloud","uri":"gcr.io/deeplearning-platform-release/tf2-2-3-gpu.2-3:m59"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.8"}},"cells":[{"cell_type":"markdown","metadata":{"id":"2WVKxCwTsFVK"},"source":["# Part 3: Create a model to serve the item embedding data\n","\n","This notebook is the third of five notebooks that guide you through running the [Real-time Item-to-item Recommendation with BigQuery ML Matrix Factorization and ScaNN](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/tree/master/retail/recommendation-system/bqml-scann) solution.\n","\n","Use this notebook to wrap the item embeddings data in a Keras model that can act as an item-embedding lookup, then export the model as a SavedModel.\n","\n","Before starting this notebook, you must run the [02_export_bqml_mf_embeddings](02_export_bqml_mf_embeddings.ipynb) notebook to process the item embeddings data and export it to Cloud Storage.\n","\n","After completing this notebook, run the [04_build_embeddings_scann](04_build_embeddings_scann.ipynb) notebook to create an approximate nearest neighbor index for the item embeddings.\n","\n"]},{"cell_type":"markdown","metadata":{"id":"vLLtPpTQSQM-"},"source":["## Setup\r\n","\r\n","Import the required libraries, configure the environment variables, and authenticate your GCP account."]},{"cell_type":"code","metadata":{"id":"ZlnTyUeAdfnO"},"source":["!pip install -q -U pip\n","!pip install -q tensorflow==2.2.0\n","!pip install -q -U google-auth google-api-python-client google-api-core"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"lq0qelfbSSnR"},"source":["### Import libraries"]},{"cell_type":"code","metadata":{"id":"bup8vvpRSWg2"},"source":["import os\n","import tensorflow as tf\n","import numpy as np\n","print(f'Tensorflow version: {tf.__version__}')"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"Ty3pxeh3Sej9"},"source":["### Configure GCP environment settings\r\n","\r\n","Update the following variables to reflect the values for your GCP environment:\r\n","\r\n","+ `PROJECT_ID`: The ID of the Google Cloud project you are using to implement this solution.\r\n","+ `BUCKET`: The name of the Cloud Storage bucket you created to use with this solution. The `BUCKET` value should be just the bucket name, so `myBucket` rather than `gs://myBucket`."]},{"cell_type":"code","metadata":{"id":"Yx83a_PasCBa"},"source":["PROJECT_ID = 'yourProject' # Change to your project.\n","BUCKET = 'yourBucketName' # Change to the bucket you created.\n","EMBEDDING_FILES_PATH = f'gs://{BUCKET}/bqml/item_embeddings/embeddings-*'\n","MODEL_OUTPUT_DIR = f'gs://{BUCKET}/bqml/embedding_lookup_model'\n","\n","!gcloud config set project $PROJECT_ID"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"My7F5vUCShdv"},"source":["### Authenticate your GCP account\n","This is required if you run the notebook in Colab. 
If you use an AI Platform notebook, you should already be authenticated."]},{"cell_type":"code","metadata":{"id":"PZAUnfyFShls"},"source":["try:\n"," from google.colab import auth\n"," auth.authenticate_user()\n"," print(\"Colab user is authenticated.\")\n","except: pass"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"hCgB7fjuV9fP"},"source":["## Create the embedding lookup model\r\n","\r\n","You use the `EmbeddingLookup` class to create the item embedding lookup model. The `EmbeddingLookup` class inherits from [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model), and is implemented in the\r\n","[lookup_creator.py](embeddings_lookup/lookup_creator.py)\r\n","module.\r\n","\r\n","The `EmbeddingLookup `class works as follows:\r\n","\r\n","1. Accepts the `embedding_files_prefix` variable in the class constructor. This variable points to the Cloud Storage location of the CSV files containing the item embedding data. \r\n","1. Reads and parses the item embedding CSV files.\r\n","1. Populates the `vocabulary` and `embeddings` class variables. `vocabulary` is an array of item IDs, while `embeddings` is a Numpy array with the shape (*number of embeddings*, *embedding dimensions*). \r\n","1. Appends the `oov_embedding` variable to the `embeddings` variable. The `oov_embedding` variable value is all zeros, and it represents the out of vocabulary (OOV) embedding vector. The `oov_embedding` variable is used when an invalid (\"out of vocabulary\", or OOV) item ID is submitted, in which case an embedding vector of zeros is returned.\r\n","1. Writes the `vocabulary` value to a file, one array element per line, so it can be used as a model asset by the SavedModel.\r\n","1. Uses `token_to_idx`, a `tf.lookup.StaticHashTable` object, to map the\r\n"," item ID to the index of the embedding vector in the `embeddings` Numpy array.\r\n","1. Accepts a list of strings with the `__call__` method of the model. Each string represents the item ID(s) for which the embeddings are to be retrieved. If the input list contains _N_ strings, then _N_ embedding vectors are returned. \r\n","\r\n"," Note that each string in the input list may contain one or more space-separated item IDs. If multiple item IDs are present, the embedding vectors of these item IDs are retrieved and _combined_ (by averaging) into a single embedding vector. 
This makes it possible to fetch an embedding vector representing a set of items (like a playlist) rather than just a single item."]},{"cell_type":"markdown","metadata":{"id":"x-zb5lLRKUbr"},"source":["### Clear the model export directory"]},{"cell_type":"code","metadata":{"id":"koSO5kd7V9fP"},"source":["if tf.io.gfile.exists(MODEL_OUTPUT_DIR):\n"," print(\"Removing {} contents...\".format(MODEL_OUTPUT_DIR))\n"," tf.io.gfile.rmtree(MODEL_OUTPUT_DIR)"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"vYnm0eqkK2kx"},"source":["### Create the model and export the SavedModel file\r\n","\r\n","Call the `export_saved_model` method, which uses the `EmbeddingLookup` class to create the model and then exports the resulting SavedModel file:"]},{"cell_type":"code","metadata":{"id":"IW1amfSCYMn5"},"source":["from embeddings_lookup import lookup_creator\n","lookup_creator.export_saved_model(EMBEDDING_FILES_PATH, MODEL_OUTPUT_DIR)"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"0MjA6nlUgAM3"},"source":["Inspect the exported SavedModel using the `saved_model_cli` command line tool:\r\n"]},{"cell_type":"code","metadata":{"id":"3Y1o5lVCZqbY"},"source":["!saved_model_cli show --dir {MODEL_OUTPUT_DIR} --tag_set serve --signature_def serving_default"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"q2waV_yOOluG"},"source":["### Test the SavedModel file"]},{"cell_type":"markdown","metadata":{"id":"yX9jAKkkgbyE"},"source":["Test the SavedModel by loading it and then calling it with input item IDs:\r\n"]},{"cell_type":"code","metadata":{"id":"serXfA5jfy0h"},"source":["loaded_model = tf.saved_model.load(MODEL_OUTPUT_DIR)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"oVveXWDIqE1V"},"source":["input_items = ['2114406', '2114402 2120788', 'abc123']\n","output = loaded_model(input_items)\n","print(f'Embeddings retrieved: {output.shape}')\n","for idx, embedding in enumerate(output):\n"," print(f'{input_items[idx]}: {embedding[:5]}')"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"4Azup3yqgjnJ"},"source":["The output shows the output embedding vector (the first five elements of each vector) for each input item. Note the following:\r\n","\r\n","+ The second entry in the input list contains two item IDs, `2114402` and `2120788`. The returned vector is the average of the embeddings of these two items.\r\n","+ The third entry in the input list, `abc123`, is an invalid item ID, so the returned embedding vector contains zeros.\r\n"]},{"cell_type":"markdown","metadata":{"id":"2zkAH5zH5n4g"},"source":["## License\n","\n","Copyright 2020 Google LLC\n","\n","Licensed under the Apache License, Version 2.0 (the \"License\");\n","you may not use this file except in compliance with the License. You may obtain a copy of the License at: http://www.apache.org/licenses/LICENSE-2.0\n","\n","Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. \n","\n","See the License for the specific language governing permissions and limitations under the License.\n","\n","**This is not an official Google product but sample code provided for an educational purpose**"]}]}
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/04_build_embeddings_scann.ipynb:
--------------------------------------------------------------------------------
1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"environment":{"name":"tf2-2-3-gpu.2-3.m59","type":"gcloud","uri":"gcr.io/deeplearning-platform-release/tf2-2-3-gpu.2-3:m59"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.8"},"colab":{"name":"04_build_embeddings_scann.ipynb","provenance":[],"collapsed_sections":[],"toc_visible":true}},"cells":[{"cell_type":"markdown","metadata":{"id":"Ai1o-fGapfNV"},"source":["# Part 4: Create an approximate nearest neighbor index for the item embeddings\n","\n","This notebook is the fourth of five notebooks that guide you through running the [Real-time Item-to-item Recommendation with BigQuery ML Matrix Factorization and ScaNN](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/tree/master/retail/recommendation-system/bqml-scann) solution.\n","\n","Use this notebook to create an approximate nearest neighbor (ANN) index for the item embeddings by using the [ScaNN](https://github.com/google-research/google-research/tree/master/scann) framework. You create the index as a model, train the model on AI Platform Training, then export the index to Cloud Storage so that it can serve ANN information.\n","\n","Before starting this notebook, you must run the [03_create_embedding_lookup_model](03_create_embedding_lookup_model.ipynb) notebook to process the item embeddings data and export it to Cloud Storage.\n","\n","After completing this notebook, run the [05_deploy_lookup_and_scann_caip](05_deploy_lookup_and_scann_caip.ipynb) notebook to deploy the solution. Once deployed, you can submit song IDs to the solution and get similar song recommendations in return, based on the ANN index.\n"]},{"cell_type":"markdown","metadata":{"id":"Pk9Wij8ppfNY"},"source":["## Setup\r\n","\r\n","Import the required libraries, configure the environment variables, and authenticate your GCP account."]},{"cell_type":"code","metadata":{"id":"M-H_wPdmpfNY"},"source":["!pip install -q scann"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"gzoXTCD5pfNZ"},"source":["### Import libraries"]},{"cell_type":"code","metadata":{"id":"_7vHSotqpfNa"},"source":["import tensorflow as tf\n","import numpy as np\n","from datetime import datetime"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"4_tk1WOppfNa"},"source":["### Configure GCP environment settings\r\n","\r\n","Update the following variables to reflect the values for your GCP environment:\r\n","\r\n","+ `PROJECT_ID`: The ID of the Google Cloud project you are using to implement this solution.\r\n","+ `BUCKET`: The name of the Cloud Storage bucket you created to use with this solution. 
The `BUCKET` value should be just the bucket name, so `myBucket` rather than `gs://myBucket`.\r\n","+ `REGION`: The region to use for the AI Platform Training job."]},{"cell_type":"code","metadata":{"id":"FxhEWsL4pfNb"},"source":["PROJECT_ID = 'yourProject' # Change to your project.\n","BUCKET = 'yourBucketName' # Change to the bucket you created.\n","REGION = 'yourTrainingRegion' # Change to your AI Platform Training region.\n","EMBEDDING_FILES_PREFIX = f'gs://{BUCKET}/bqml/item_embeddings/embeddings-*'\n","OUTPUT_INDEX_DIR = f'gs://{BUCKET}/bqml/scann_index'"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"wPdz87S3pfNb"},"source":["### Authenticate your GCP account\n","This is required if you run the notebook in Colab. If you use an AI Platform notebook, you should already be authenticated."]},{"cell_type":"code","metadata":{"id":"fBXROib7pfNc"},"source":["try:\n"," from google.colab import auth\n"," auth.authenticate_user()\n"," print(\"Colab user is authenticated.\")\n","except: pass"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"xaiIfZw9pfNc"},"source":["## Build the ANN index\r\n","\r\n","Use the `build` method implemented in the [indexer.py](index_builder/builder/indexer.py) module to load the embeddings from the CSV files, create the ANN index model and train it on the embedding data, and save the SavedModel file to Cloud Storage. You pass the following three parameters to this method:\r\n","\r\n","+ `embedding_files_path`, which specifies the Cloud Storage location from which to load the embedding vectors.\r\n","+ `num_leaves`, which provides the value for a hyperparameter that tunes the model based on the trade-off between retrieval latency and recall. A higher `num_leaves` value will use more data and provide better recall, but will also increase latency. If `num_leaves` is set to `None` or `0`, the `num_leaves` value is the square root of the number of items.\r\n","+ `output_dir`, which specifies the Cloud Storage location to write the ANN index SavedModel file to.\r\n","\r\n","Other configuration options for the model are set based on the [rules-of-thumb](https://github.com/google-research/google-research/blob/master/scann/docs/algorithms.md#rules-of-thumb) provided by ScaNN."]},{"cell_type":"markdown","metadata":{"id":"PwcdyrDiGcep"},"source":["### Build the index locally"]},{"cell_type":"code","metadata":{"id":"l5TGzINqpfNc"},"source":["from index_builder.builder import indexer\n","indexer.build(EMBEDDING_FILES_PREFIX, OUTPUT_INDEX_DIR)"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"aonq2MptpfNd"},"source":["### Build the index using AI Platform Training\r\n","\r\n","Submit an AI Platform Training job to build the ScaNN index at scale. 
The [index_builder](index_builder) directory contains the expected [training application packaging structure](https://cloud.google.com/ai-platform/training/docs/packaging-trainer) for submitting the AI Platform Training job."]},{"cell_type":"code","metadata":{"id":"uCVEI3mjpfNd"},"source":["if tf.io.gfile.exists(OUTPUT_INDEX_DIR):\n"," print(\"Removing {} contents...\".format(OUTPUT_INDEX_DIR))\n"," tf.io.gfile.rmtree(OUTPUT_INDEX_DIR)\n","\n","print(\"Creating output: {}\".format(OUTPUT_INDEX_DIR))\n","tf.io.gfile.makedirs(OUTPUT_INDEX_DIR)\n","\n","timestamp = datetime.utcnow().strftime('%y%m%d%H%M%S')\n","job_name = f'ks_bqml_build_scann_index_{timestamp}'\n","\n","!gcloud ai-platform jobs submit training {job_name} \\\n"," --project={PROJECT_ID} \\\n"," --region={REGION} \\\n"," --job-dir={OUTPUT_INDEX_DIR}/jobs/ \\\n"," --package-path=index_builder/builder \\\n"," --module-name=builder.task \\\n"," --config='index_builder/config.yaml' \\\n"," --runtime-version=2.2 \\\n"," --python-version=3.7 \\\n"," --\\\n"," --embedding-files-path={EMBEDDING_FILES_PREFIX} \\\n"," --output-dir={OUTPUT_INDEX_DIR} \\\n"," --num-leaves=500"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"MzS533hgpfNe"},"source":["After the AI Platform Training job finishes, check that the `scann_index` folder has been created in your Cloud Storage bucket:"]},{"cell_type":"code","metadata":{"id":"hgEdM632pfNe"},"source":["!gsutil ls {OUTPUT_INDEX_DIR}"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"KcSPTZeDpfNf"},"source":["## Test the ANN index\r\n","\r\n","Test the ANN index by using the `ScaNNMatcher` class implemented in the [index_server/matching.py](index_server/matching.py) module.\r\n","\r\n","Run the following code snippets to create an item embedding from random generated values and pass it to `scann_matcher`, which returns the items IDs for the five items that are the approximate nearest neighbors of the embedding you submitted."]},{"cell_type":"code","metadata":{"id":"nQXdRKV4pfNf"},"source":["from index_server.matching import ScaNNMatcher\n","scann_matcher = ScaNNMatcher(OUTPUT_INDEX_DIR)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"xlUc_GsBpfNf"},"source":["vector = np.random.rand(50)\n","scann_matcher.match(vector, 5)"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"CHXed9gdpfNg"},"source":["## License\n","\n","Copyright 2020 Google LLC\n","\n","Licensed under the Apache License, Version 2.0 (the \"License\");\n","you may not use this file except in compliance with the License. You may obtain a copy of the License at: http://www.apache.org/licenses/LICENSE-2.0\n","\n","Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. \n","\n","See the License for the specific language governing permissions and limitations under the License.\n","\n","**This is not an official Google product but sample code provided for an educational purpose**"]}]}
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/ann_grpc/match_pb2_grpc.py:
--------------------------------------------------------------------------------
1 | # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
2 | """Client and server classes corresponding to protobuf-defined services."""
3 | import grpc
4 |
5 | import match_pb2 as match__pb2
6 |
7 |
8 | class MatchServiceStub(object):
9 | """MatchService is a Google managed service for efficient vector similarity
10 | search at scale.
11 | """
12 |
13 | def __init__(self, channel):
14 | """Constructor.
15 |
16 | Args:
17 | channel: A grpc.Channel.
18 | """
19 | self.Match = channel.unary_unary(
20 | '/google.cloud.aiplatform.container.v1alpha1.MatchService/Match',
21 | request_serializer=match__pb2.MatchRequest.SerializeToString,
22 | response_deserializer=match__pb2.MatchResponse.FromString,
23 | )
24 | self.BatchMatch = channel.unary_unary(
25 | '/google.cloud.aiplatform.container.v1alpha1.MatchService/BatchMatch',
26 | request_serializer=match__pb2.BatchMatchRequest.SerializeToString,
27 | response_deserializer=match__pb2.BatchMatchResponse.FromString,
28 | )
29 |
30 |
31 | class MatchServiceServicer(object):
32 | """MatchService is a Google managed service for efficient vector similarity
33 | search at scale.
34 | """
35 |
36 | def Match(self, request, context):
37 | """Returns the nearest neighbors for the query. If it is a sharded
38 | deployment, calls the other shards and aggregates the responses.
39 | """
40 | context.set_code(grpc.StatusCode.UNIMPLEMENTED)
41 | context.set_details('Method not implemented!')
42 | raise NotImplementedError('Method not implemented!')
43 |
44 | def BatchMatch(self, request, context):
45 | """Returns the nearest neighbors for batch queries. If it is a sharded
46 | deployment, calls the other shards and aggregates the responses.
47 | """
48 | context.set_code(grpc.StatusCode.UNIMPLEMENTED)
49 | context.set_details('Method not implemented!')
50 | raise NotImplementedError('Method not implemented!')
51 |
52 |
53 | def add_MatchServiceServicer_to_server(servicer, server):
54 | rpc_method_handlers = {
55 | 'Match': grpc.unary_unary_rpc_method_handler(
56 | servicer.Match,
57 | request_deserializer=match__pb2.MatchRequest.FromString,
58 | response_serializer=match__pb2.MatchResponse.SerializeToString,
59 | ),
60 | 'BatchMatch': grpc.unary_unary_rpc_method_handler(
61 | servicer.BatchMatch,
62 | request_deserializer=match__pb2.BatchMatchRequest.FromString,
63 | response_serializer=match__pb2.BatchMatchResponse.SerializeToString,
64 | ),
65 | }
66 | generic_handler = grpc.method_handlers_generic_handler(
67 | 'google.cloud.aiplatform.container.v1alpha1.MatchService', rpc_method_handlers)
68 | server.add_generic_rpc_handlers((generic_handler,))
69 |
70 |
71 | # This class is part of an EXPERIMENTAL API.
72 | class MatchService(object):
73 | """MatchService is a Google managed service for efficient vector similarity
74 | search at scale.
75 | """
76 |
77 | @staticmethod
78 | def Match(request,
79 | target,
80 | options=(),
81 | channel_credentials=None,
82 | call_credentials=None,
83 | insecure=False,
84 | compression=None,
85 | wait_for_ready=None,
86 | timeout=None,
87 | metadata=None):
88 | return grpc.experimental.unary_unary(request, target, '/google.cloud.aiplatform.container.v1alpha1.MatchService/Match',
89 | match__pb2.MatchRequest.SerializeToString,
90 | match__pb2.MatchResponse.FromString,
91 | options, channel_credentials,
92 | insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
93 |
94 | @staticmethod
95 | def BatchMatch(request,
96 | target,
97 | options=(),
98 | channel_credentials=None,
99 | call_credentials=None,
100 | insecure=False,
101 | compression=None,
102 | wait_for_ready=None,
103 | timeout=None,
104 | metadata=None):
105 | return grpc.experimental.unary_unary(request, target, '/google.cloud.aiplatform.container.v1alpha1.MatchService/BatchMatch',
106 | match__pb2.BatchMatchRequest.SerializeToString,
107 | match__pb2.BatchMatchResponse.FromString,
108 | options, channel_credentials,
109 | insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
110 |
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/ann_setup.md:
--------------------------------------------------------------------------------
1 | ## Setting up the ANN Service Experimental release
2 |
3 | This document outlines the steps required to enable and configure the Experimental release of the AI Platform ANN service.
4 | These instructions will be updated when the service moves to Preview and General Availability.
5 |
6 | ### Allow-listing the project
7 |
8 | Contact your Google representative to allow-list your project and user id(s).
9 |
10 | ### Enabling the Cloud APIs required by the ANN Service
11 |
12 | You need to enable the following APIs to use the ANN service:
13 |
14 | * aiplatform.googleapis.com
15 | * servicenetworking.googleapis.com
16 | * compute.googleapis.com
17 |
18 | ### Configuring private IP access to the ANN Service
19 |
20 | In the Experimental release, the ANN service is accessible only through private endpoints. Before using the service, you need a [VPC network](https://cloud.google.com/vpc) configured with [private services access](https://cloud.google.com/vpc/docs/configure-private-services-access). You can use the `default` VPC network or create a new one.
21 |
22 | The instructions below are for a VPC that was created with auto subnets and regional dynamic routing mode (the defaults). It is recommended that you run the following commands from Cloud Shell, using an account that has the `roles/compute.networkAdmin` role.
23 |
24 | 1. Set environment variables for your project ID, the name of your VPC network, and the name of your reserved range of addresses. The reserved range name is arbitrary and is used for display purposes only.
25 |
26 | ```
27 | PROJECT_ID=
28 | gcloud config set project $PROJECT_ID
29 | NETWORK_NAME=
30 | PEERING_RANGE_NAME=google-reserved-range
31 |
32 | ```
33 |
34 | 2. Reserve an IP range for Google services. The reserved range should be large enough to accommodate all peered services. The command below reserves a CIDR block with a /16 mask:
35 |
36 | ```
37 | gcloud compute addresses create $PEERING_RANGE_NAME \
38 | --global \
39 | --prefix-length=16 \
40 | --description="peering range for Google service: AI Platform Online Prediction" \
41 | --network=$NETWORK_NAME \
42 | --purpose=VPC_PEERING \
43 | --project=$PROJECT_ID
44 |
45 | ```
46 |
47 | 3. Create a private connection to establish a VPC Network Peering between your VPC network and the Google services network.
48 |
49 | ```
50 | gcloud services vpc-peerings connect \
51 | --service=servicenetworking.googleapis.com \
52 | --network=$NETWORK_NAME \
53 | --ranges=$PEERING_RANGE_NAME \
54 | --project=$PROJECT_ID
55 |
56 | ```
57 |
58 |
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/embeddings_exporter/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/recommendation-system/bqml-scann/embeddings_exporter/__init__.py
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/embeddings_exporter/pipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | import os
17 | import apache_beam as beam
18 |
19 | EMBEDDING_FILE_PREFIX = 'embeddings'
20 |
21 | def get_query(dataset_name, table_name):
22 | query = f'''
23 | SELECT
24 | item_Id,
25 | embedding
26 | FROM
27 | `{dataset_name}.{table_name}`;
28 | '''
29 | return query
30 |
31 |
32 | def to_csv(entry):
33 | item_Id = entry['item_Id']
34 | embedding = entry['embedding']
35 | csv_string = f'{item_Id},'
36 | csv_string += ','.join([str(value) for value in embedding])
37 | return csv_string
38 |
39 |
40 | def run(bq_dataset_name, embeddings_table_name, output_dir, pipeline_args):
41 |
42 | pipeline_options = beam.options.pipeline_options.PipelineOptions(pipeline_args)
43 | project = pipeline_options.get_all_options()['project']
44 | with beam.Pipeline(options=pipeline_options) as pipeline:
45 |
46 | query = get_query(bq_dataset_name, embeddings_table_name)
47 | output_prefix = os.path.join(output_dir, EMBEDDING_FILE_PREFIX)
48 |
49 | _ = (
50 | pipeline
51 | | 'ReadFromBigQuery' >> beam.io.ReadFromBigQuery(
52 | project=project, query=query, use_standard_sql=True, flatten_results=False)
53 | | 'ConvertToCsv' >> beam.Map(to_csv)
54 | | 'WriteToCloudStorage' >> beam.io.WriteToText(
55 | file_path_prefix = output_prefix,
56 | file_name_suffix = ".csv")
57 | )
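
For reference, here is a minimal sketch of invoking this exporter directly with the Beam DirectRunner; the project, bucket, and dataset names are placeholders, and the source table is assumed to have the `item_Id` and `embedding` columns produced by the embedding extraction procedure:

```python
# Sketch: run the exporter locally with the DirectRunner (placeholder names).
import pipeline

pipeline.run(
    bq_dataset_name='recommendations',                 # placeholder dataset
    embeddings_table_name='item_embeddings',           # placeholder table
    output_dir='gs://my-bucket/bqml/item_embeddings',  # placeholder bucket
    pipeline_args=[
        '--runner=DirectRunner',
        '--project=my-project',                        # placeholder project ID
        '--temp_location=gs://my-bucket/beam/tmp',
    ])
```

In practice, `runner.py` (below) wraps this call behind a command-line interface and adds the `--setup_file` argument needed when the pipeline runs remotely on Dataflow.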
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/embeddings_exporter/runner.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | import argparse
17 | import pipeline
18 |
19 | SETUP_FILE_PATH = './setup.py'
20 |
21 |
22 | def get_args(argv):
23 |
24 | args_parser = argparse.ArgumentParser()
25 |
26 | args_parser.add_argument('--bq_dataset_name',
27 | help='BigQuery dataset name.',
28 | required=True)
29 |
30 | args_parser.add_argument('--embeddings_table_name',
31 | help='BigQuery table name where the embeddings are stored.',
32 | required=True)
33 |
34 | args_parser.add_argument('--output_dir',
35 | help='GCS location where the embedding CSV files will be stored.',
36 | required=True)
37 |
38 | return args_parser.parse_known_args()
39 |
40 |
41 | def main(argv=None):
42 | args, pipeline_args = get_args(argv)
43 | pipeline_args.append('--setup_file={}'.format(SETUP_FILE_PATH))
44 |
45 | pipeline.run(
46 | args.bq_dataset_name,
47 | args.embeddings_table_name,
48 | args.output_dir,
49 | pipeline_args)
50 |
51 |
52 | if __name__ == '__main__':
53 | main()
54 |
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/embeddings_exporter/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | import setuptools
17 |
18 | REQUIRED_PACKAGES = []
19 |
20 | setuptools.setup(
21 | name='embedding_exporter',
22 | description='Export embeddings from BigQuery to Cloud Storage.',
23 | version='0.1',
24 | install_requires=REQUIRED_PACKAGES,
25 | py_modules=['pipeline'],
26 | )
27 |
28 |
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/embeddings_lookup/lookup_creator.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | import tensorflow as tf
17 | import numpy as np
18 |
19 | VOCABULARY_FILE_NAME = 'vocabulary.txt'
20 |
21 |
22 | class EmbeddingLookup(tf.keras.Model):
23 |
24 | def __init__(self, embedding_files_prefix, **kwargs):
25 | super(EmbeddingLookup, self).__init__(**kwargs)
26 |
27 | vocabulary = list()
28 | embeddings = list()
29 |
30 | # Read embeddings from csv files.
31 | print('Loading embeddings from files...')
32 | for embedding_file in tf.io.gfile.glob(embedding_files_prefix):
33 | print(f'Loading embeddings in {embedding_file} ...')
34 | with tf.io.gfile.GFile(embedding_file, 'r') as lines:
35 | for line in lines:
36 | try:
37 | line_parts = line.split(',')
38 | item = line_parts[0]
39 | embedding = np.array([float(v) for v in line_parts[1:]])
40 | vocabulary.append(item)
41 | embeddings.append(embedding)
42 | except: pass
43 | print('Embeddings loaded.')
44 |
45 | embedding_size = len(embeddings[0])
46 | oov_embedding = np.zeros((1, embedding_size))
47 | self.embeddings = np.append(np.array(embeddings), oov_embedding, axis=0)
48 | print(f'Embeddings: {self.embeddings.shape}')
49 |
50 | # Write vocabulary file.
51 | print('Writing vocabulary to file...')
52 | with open(VOCABULARY_FILE_NAME, 'w') as f:
53 | for item in vocabulary:
54 | f.write(f'{item}\n')
55 | print('Vocabulary file written and will be added as a model asset.')
56 |
57 | self.vocabulary_file = tf.saved_model.Asset(VOCABULARY_FILE_NAME)
58 | initializer = tf.lookup.KeyValueTensorInitializer(
59 | keys=vocabulary, values=list(range(len(vocabulary))))
60 | self.token_to_id = tf.lookup.StaticHashTable(
61 | initializer, default_value=len(vocabulary))
62 |
63 | @tf.function(input_signature=[tf.TensorSpec([None], tf.string)])
64 | def __call__(self, inputs):
65 | tokens = tf.strings.split(inputs, sep=None).to_sparse()
66 | ids = self.token_to_id.lookup(tokens)
67 | embeddings = tf.nn.embedding_lookup_sparse(
68 | params=self.embeddings,
69 | sp_ids=ids,
70 | sp_weights=None,
71 | combiner="mean"
72 | )
73 | return embeddings
74 |
75 |
76 |
77 | def export_saved_model(embedding_files_path, model_output_dir):
78 | print('Instantiating embedding lookup model...')
79 | embedding_lookup_model = EmbeddingLookup(embedding_files_path)
80 | print('Model is Instantiated.')
81 |
82 | signatures = {
83 | 'serving_default': embedding_lookup_model.__call__.get_concrete_function(),
84 | }
85 |
86 | print('Exporting embedding lookup model as a SavedModel...')
87 | tf.saved_model.save(embedding_lookup_model, model_output_dir, signatures=signatures)
88 | print('SavedModel is exported.')
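
As a usage sketch, the exported SavedModel can be loaded back and called directly, the same way the performance test notebook uses it; the Cloud Storage path below is a placeholder:

```python
# Sketch: load the exported lookup model and fetch embeddings (placeholder path).
import tensorflow as tf

lookup = tf.saved_model.load('gs://my-bucket/bqml/embedding_lookup_model')

# One vector per input string; a string with several space-separated item Ids
# returns the mean of their embeddings, and unknown Ids map to the zero
# out-of-vocabulary vector.
vectors = lookup(tf.constant(['2114406', '2114402 2120788']))
print(vectors.shape)  # (2, embedding_dimension)
```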
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/figures/ann-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/recommendation-system/bqml-scann/figures/ann-flow.png
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/figures/ann-tfx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/recommendation-system/bqml-scann/figures/ann-tfx.png
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/figures/diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/recommendation-system/bqml-scann/figures/diagram.png
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/figures/feedback-matrix-columns.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/recommendation-system/bqml-scann/figures/feedback-matrix-columns.png
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/figures/feedback-matrix-diagonals.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/recommendation-system/bqml-scann/figures/feedback-matrix-diagonals.png
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/figures/feedback-matrix-rows.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/recommendation-system/bqml-scann/figures/feedback-matrix-rows.png
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/figures/kfp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/recommendation-system/bqml-scann/figures/kfp.png
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/figures/tfx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/recommendation-system/bqml-scann/figures/tfx.png
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/index_builder/builder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/recommendation-system/bqml-scann/index_builder/builder/__init__.py
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/index_builder/builder/indexer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | import scann
17 | import tensorflow as tf
18 | import numpy as np
19 | import math
20 | import pickle
21 |
22 | METRIC = 'dot_product'
23 | DIMENSIONS_PER_BLOCK = 2
24 | ANISOTROPIC_QUANTIZATION_THRESHOLD = 0.2
25 | NUM_NEIGHBOURS = 10
26 | NUM_LEAVES_TO_SEARCH = 200
27 | REORDER_NUM_NEIGHBOURS = 200
28 | TOKENS_FILE_NAME = 'tokens'
29 |
30 |
31 | def load_embeddings(embedding_files_pattern):
32 |
33 | embedding_list = list()
34 | tokens = list()
35 | embed_files = tf.io.gfile.glob(embedding_files_pattern)
36 | print(f'{len(embed_files)} embedding files are found.')
37 |
38 | for file_idx, embed_file in enumerate(embed_files):
39 | print(f'Loading embeddings in file {file_idx+1} of {len(embed_files)}...')
40 | with tf.io.gfile.GFile(embed_file, 'r') as file_reader:
41 | lines = file_reader.readlines()
42 | for line in lines:
43 | parts = line.split(',')
44 | item_Id = parts[0]
45 | embedding = parts[1:]
46 | embedding = np.array([float(v) for v in embedding])
47 | normalized_embedding = embedding / np.linalg.norm(embedding)
48 | embedding_list.append(normalized_embedding)
49 | tokens.append(item_Id)
50 |
51 | print(f'{len(embedding_list)} embeddings are loaded.')
52 |
53 | return tokens, np.array(embedding_list)
54 |
55 |
56 | def build_index(embeddings, num_leaves):
57 |
58 | data_size = embeddings.shape[0]
59 | if not num_leaves:
60 | num_leaves = int(math.sqrt(data_size))
61 |
62 | print('Start building the ScaNN index...')
63 | scann_builder = scann.scann_ops.builder(embeddings, NUM_NEIGHBOURS, METRIC)
64 | scann_builder = scann_builder.tree(
65 | num_leaves=num_leaves,
66 | num_leaves_to_search=NUM_LEAVES_TO_SEARCH,
67 | training_sample_size=data_size)
68 | scann_builder = scann_builder.score_ah(
69 | DIMENSIONS_PER_BLOCK,
70 | anisotropic_quantization_threshold=ANISOTROPIC_QUANTIZATION_THRESHOLD)
71 | scann_builder = scann_builder.reorder(REORDER_NUM_NEIGHBOURS)
72 | scann_index = scann_builder.build()
73 | print('ScaNN index is built.')
74 |
75 | return scann_index
76 |
77 |
78 | def save_index(index, tokens, output_dir):
79 | print('Saving index as a SavedModel...')
80 | module = index.serialize_to_module()
81 | tf.saved_model.save(
82 | module, output_dir, signatures=None, options=None
83 | )
84 | print(f'Index is saved to {output_dir}')
85 |
86 | print(f'Saving tokens file...')
87 | tokens_file_path = os.path.join(output_dir, TOKENS_FILE_NAME)
88 | with tf.io.gfile.GFile(tokens_file_path, 'wb') as handle:
89 | pickle.dump(tokens, handle, protocol=pickle.HIGHEST_PROTOCOL)
90 | print(f'Item file is saved to {tokens_file_path}.')
91 |
92 |
93 | def build(embedding_files_pattern, output_dir, num_leaves=None):
94 | print("Indexer started...")
95 | tokens, embeddings = load_embeddings(embedding_files_pattern)
96 | index = build_index(embeddings, num_leaves)
97 | save_index(index, tokens, output_dir)
98 | print("Indexer finished.")
99 |
100 |
101 |
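
The files matched by `embedding_files_pattern` are the headerless CSVs written by the embeddings exporter: each line holds an `item_Id` followed by its embedding values. A tiny illustrative sketch with made-up Ids and values (real embeddings have the dimensionality chosen when training the matrix factorization model):

```python
# Sketch: the headerless CSV layout that load_embeddings() parses (made-up values).
sample_lines = [
    '2114406,0.012,-0.034,0.221,0.108',
    '2120788,-0.145,0.067,0.009,0.301',
]
with open('/tmp/embeddings-00000-of-00001.csv', 'w') as f:
    f.write('\n'.join(sample_lines) + '\n')

# The builder can then be pointed at the matching pattern, for example:
# build('/tmp/embeddings-*.csv', '/tmp/scann_index')
```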
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/index_builder/builder/task.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import argparse
16 | from . import indexer
17 |
18 | def get_args():
19 |
20 | args_parser = argparse.ArgumentParser()
21 |
22 | args_parser.add_argument(
23 | '--embedding-files-path',
24 | help='GCS or local paths to embedding files',
25 | required=True
26 | )
27 |
28 | args_parser.add_argument(
29 | '--output-dir',
30 | help='GCS or local paths to output index file',
31 | required=True
32 | )
33 |
34 | args_parser.add_argument(
35 | '--num-leaves',
36 |     help='Number of leaves (partitions) to build in the ScaNN index',
37 | default=250,
38 | type=int
39 | )
40 |
41 | args_parser.add_argument(
42 | '--job-dir',
43 | help='GCS or local paths to job package'
44 | )
45 |
46 | return args_parser.parse_args()
47 |
48 |
49 | def main():
50 | args = get_args()
51 | indexer.build(
52 | embedding_files_pattern=args.embedding_files_path,
53 | output_dir=args.output_dir,
54 | num_leaves=args.num_leaves
55 | )
56 |
57 | if __name__ == '__main__':
58 | main()
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/index_builder/config.yaml:
--------------------------------------------------------------------------------
1 | trainingInput:
2 | scaleTier: CUSTOM
3 | masterType: n1-standard-8
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/index_builder/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from setuptools import find_packages
16 | from setuptools import setup
17 |
18 | REQUIRED_PACKAGES = ['scann==1.1.1']
19 |
20 | setup(
21 |     name='scann-index-builder',
22 | version='v1',
23 | install_requires=REQUIRED_PACKAGES,
24 | packages=find_packages(),
25 | include_package_data=True,
26 | description=''
27 | )
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/index_server/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.8-slim
2 |
3 | COPY requirements.txt .
4 | RUN pip install -r requirements.txt
5 |
6 | COPY . ./
7 |
8 | ARG PORT
9 | ENV PORT=$PORT
10 |
11 | CMD exec gunicorn --bind :$PORT main:app --workers=1 --threads 8 --timeout 1800
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/index_server/cloudbuild.yaml:
--------------------------------------------------------------------------------
1 | steps:
2 |
3 | - name: 'gcr.io/cloud-builders/docker'
4 | args: ['build', '--tag', '${_IMAGE_URL}', '.', '--build-arg=PORT=${_PORT}']
5 | dir: 'index_server'
6 |
7 | images: ['${_IMAGE_URL}']
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/index_server/lookup.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import googleapiclient.discovery
16 | from google.api_core.client_options import ClientOptions
17 |
18 |
19 | class EmbeddingLookup(object):
20 |
21 | def __init__(self, project, region, model_name, version):
22 | api_endpoint = f'https://{region}-ml.googleapis.com'
23 | client_options = ClientOptions(api_endpoint=api_endpoint)
24 | self.service = googleapiclient.discovery.build(
25 | serviceName='ml', version='v1', client_options=client_options)
26 | self.name = f'projects/{project}/models/{model_name}/versions/{version}'
27 | print(f'Embedding lookup service {self.name} is initialized.')
28 |
29 | def lookup(self, instances):
30 | request_body = {'instances': instances}
31 | response = self.service.projects().predict(name=self.name, body=request_body).execute()
32 |
33 | if 'error' in response:
34 | raise RuntimeError(response['error'])
35 |
36 | return response['predictions']
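
As a usage sketch, assuming an embedding lookup model has already been deployed to AI Platform Prediction, the class is used as follows (all names are placeholders):

```python
# Sketch: query a deployed embedding lookup model (placeholder names).
from lookup import EmbeddingLookup

lookup = EmbeddingLookup(
    project='my-project',
    region='us-central1',
    model_name='item_embedding_lookup',
    version='v1')

# Each instance is a string of one or more space-separated item Ids;
# the service returns one embedding vector per instance.
vectors = lookup.lookup(['2114406', '2114402 2120788'])
print(len(vectors), len(vectors[0]))
```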
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/index_server/main.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | from flask import Flask
17 | from flask import request
18 | from flask import jsonify
19 |
20 | from lookup import EmbeddingLookup
21 | from matching import ScaNNMatcher
22 |
23 | PROJECT_ID = os.environ['PROJECT_ID']
24 | REGION = os.environ['REGION']
25 | EMBEDDNIG_LOOKUP_MODEL_NAME = os.environ['EMBEDDNIG_LOOKUP_MODEL_NAME']
26 | EMBEDDNIG_LOOKUP_MODEL_VERSION = os.environ['EMBEDDNIG_LOOKUP_MODEL_VERSION']
27 | INDEX_DIR = os.environ['INDEX_DIR']
28 | PORT = os.environ['PORT']
29 |
30 |
31 | scann_matcher = ScaNNMatcher(INDEX_DIR)
32 | embedding_lookup = EmbeddingLookup(
33 | PROJECT_ID, REGION, EMBEDDNIG_LOOKUP_MODEL_NAME, EMBEDDNIG_LOOKUP_MODEL_VERSION)
34 |
35 | app = Flask(__name__)
36 |
37 |
38 | @app.route("/v1/models/<model>/versions/<version>", methods=["GET"])
39 | def health(model, version):
40 | return jsonify({})
41 |
42 |
43 | @app.route("/v1/models/<model>/versions/<version>:predict", methods=["POST"])
44 | def predict(model, version):
45 | result = 'predictions'
46 | try:
47 | data = request.get_json()['instances'][0]
48 | query = data.get('query', None)
49 | show = data.get('show', 10)
50 | if not str(show).isdigit(): show = 10
51 |
52 | is_valid, error = validate_request(query, show)
53 |
54 | if not is_valid:
55 | value = error
56 | else:
57 | vector = embedding_lookup.lookup([query])[0]
58 | value = scann_matcher.match(vector, int(show))
59 |
60 | except Exception as error:
61 | value = 'Unexpected error: {}'.format(error)
62 | result = 'error'
63 |
64 | response = jsonify({result: value})
65 | return response
66 |
67 |
68 | def validate_request(query, show):
69 | is_valid = True
70 | error = ''
71 |
72 | if not query:
73 | is_valid = False
74 | error = 'You need to provide the item Id(s) in the query!'
75 |
76 | return is_valid, error
77 |
78 |
79 | if __name__ == '__main__':
80 | app.run(host='0.0.0.0', port=PORT)
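
For reference, requests to the `:predict` route follow the AI Platform Prediction request format. A sketch using the `requests` library, assuming the server is running locally with its environment variables set; the host, port, model, and version path segments are placeholders:

```python
# Sketch: call the serving endpoint (placeholder host, port, model, and version).
import requests

payload = {'instances': [{'query': '2114406', 'show': 5}]}
response = requests.post(
    'http://localhost:8080/v1/models/recommender/versions/v1:predict',
    json=payload)
print(response.json())  # {'predictions': [...]} on success, {'error': '...'} otherwise
```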
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/index_server/matching.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import tensorflow as tf
16 | import numpy as np
17 | import scann
18 | import pickle
19 | import os
20 |
21 | TOKENS_FILE_NAME = 'tokens'
22 |
23 |
24 | class ScaNNMatcher(object):
25 |
26 | def __init__(self, index_dir):
27 | print('Loading ScaNN index...')
28 | scann_module = tf.saved_model.load(index_dir)
29 | self.scann_index = scann.scann_ops.searcher_from_module(scann_module)
30 | tokens_file_path = os.path.join(index_dir, TOKENS_FILE_NAME)
31 | with tf.io.gfile.GFile(tokens_file_path, 'rb') as handle:
32 | self.tokens = pickle.load(handle)
33 |     print('ScaNN index is loaded.')
34 |
35 | def match(self, vector, num_matches=10):
36 | embedding = np.array(vector)
37 | query = embedding / np.linalg.norm(embedding)
38 | matche_indices, _ = self.scann_index.search(query, final_num_neighbors=num_matches)
39 | match_tokens = [self.tokens[match_idx] for match_idx in matche_indices.numpy()]
40 | return match_tokens
41 |
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/index_server/requirements.txt:
--------------------------------------------------------------------------------
1 | pip==20.2.4
2 | Flask==1.1.2
3 | gunicorn==20.0.4
4 | google-api-python-client==1.12.5
5 | scann==1.1.1
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/perf_test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Test the Retrieval Latency of Approximate vs Exact Matching "
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import tensorflow as tf\n",
17 | "import time"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": null,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 |     "PROJECT_ID = 'yourProject' # Change to your project.\n",
27 |     "BUCKET = 'yourBucketName' # Change to the bucket you created.\n",
28 | "INDEX_DIR = f'gs://{BUCKET}/bqml/scann_index'\n",
29 | "BQML_MODEL_DIR = f'gs://{BUCKET}/bqml/item_matching_model'\n",
30 | "LOOKUP_MODEL_DIR = f'gs://{BUCKET}/bqml/embedding_lookup_model'"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "songs = {\n",
40 | " '2114406': 'Metallica: Nothing Else Matters',\n",
41 | " '2114402': 'Metallica: The Unforgiven',\n",
42 | " '2120788': 'Limp Bizkit: My Way',\n",
43 | " '2120786': 'Limp Bizkit: My Generation',\n",
44 | " '1086322': 'Jacques Brel: Ne Me Quitte Pas',\n",
45 | " '3129954': 'Édith Piaf: Non, Je Ne Regrette Rien',\n",
46 | " '53448': 'France Gall: Ella, Elle l\\'a',\n",
47 | " '887688': 'Enrique Iglesias: Tired Of Being Sorry',\n",
48 | " '562487': 'Shakira: Hips Don\\'t Lie',\n",
49 | " '833391': 'Ricky Martin: Livin\\' la Vida Loca',\n",
50 | " '1098069': 'Snoop Dogg: Drop It Like It\\'s Hot',\n",
51 | " '910683': '2Pac: California Love',\n",
52 | " '1579481': 'Dr. Dre: The Next Episode',\n",
53 | " '2675403': 'Eminem: Lose Yourself',\n",
54 | " '2954929': 'Black Sabbath: Iron Man',\n",
55 | " '625169': 'Black Sabbath: Paranoid',\n",
56 | "}"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "## Exact Matching"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "class ExactMatcher(object):\n",
73 | " def __init__(self, model_dir):\n",
74 | " print(\"Loading exact matchg model...\")\n",
75 | " self.model = tf.saved_model.load(model_dir)\n",
76 | " print(\"Exact matchg model is loaded.\")\n",
77 | " \n",
78 | " def match(self, instances):\n",
79 | " outputs = self.model.signatures['serving_default'](tf.constant(instances, tf.dtypes.int64))\n",
80 | " return outputs['predicted_item2_Id'].numpy()"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "exact_matcher = ExactMatcher(BQML_MODEL_DIR)"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "exact_matches = {}\n",
99 | "\n",
100 | "start_time = time.time()\n",
101 | "for i in range(100):\n",
102 | " for song in songs:\n",
103 | " matches = exact_matcher.match([int(song)])\n",
104 | " exact_matches[song] = matches.tolist()[0]\n",
105 | "end_time = time.time()\n",
106 | "exact_elapsed_time = end_time - start_time\n",
107 | "\n",
108 | "print(f'Elapsed time: {round(exact_elapsed_time, 3)} seconds - average time: {exact_elapsed_time / (100 * len(songs))} seconds')"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "## Approximate Matching (ScaNN)"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "from index_server.matching import ScaNNMatcher\n",
125 | "scann_matcher = ScaNNMatcher(INDEX_DIR)\n",
126 | "embedding_lookup = tf.saved_model.load(LOOKUP_MODEL_DIR)"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "approx_matches = dict()\n",
136 | "\n",
137 | "start_time = time.time()\n",
138 | "for i in range(100):\n",
139 | " for song in songs:\n",
140 | " vector = embedding_lookup([song]).numpy()[0]\n",
141 | " matches = scann_matcher.match(vector, 50)\n",
142 | " approx_matches[song] = matches\n",
143 | "end_time = time.time()\n",
144 | "scann_elapsed_time = end_time - start_time\n",
145 | "\n",
146 | "print(f'Elapsed time: {round(scann_elapsed_time, 3)} seconds - average time: {scann_elapsed_time / (100 * len(songs))} seconds')"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "speedup_percent = round(exact_elapsed_time / scann_elapsed_time, 1)\n",
156 | "print(f'ScaNN speedup: {speedup_percent}x')"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {},
162 | "source": [
163 | "## License\n",
164 | "\n",
165 | "Copyright 2020 Google LLC\n",
166 | "\n",
167 | "Licensed under the Apache License, Version 2.0 (the \"License\");\n",
168 | "you may not use this file except in compliance with the License. You may obtain a copy of the License at: http://www.apache.org/licenses/LICENSE-2.0\n",
169 | "\n",
170 | "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. \n",
171 | "\n",
172 | "See the License for the specific language governing permissions and limitations under the License.\n",
173 | "\n",
174 | "**This is not an official Google product but sample code provided for an educational purpose**"
175 | ]
176 | }
177 | ],
178 | "metadata": {
179 | "environment": {
180 | "name": "tf2-2-3-gpu.2-3.m58",
181 | "type": "gcloud",
182 | "uri": "gcr.io/deeplearning-platform-release/tf2-2-3-gpu.2-3:m58"
183 | },
184 | "kernelspec": {
185 | "display_name": "Python 3",
186 | "language": "python",
187 | "name": "python3"
188 | },
189 | "language_info": {
190 | "codemirror_mode": {
191 | "name": "ipython",
192 | "version": 3
193 | },
194 | "file_extension": ".py",
195 | "mimetype": "text/x-python",
196 | "name": "python",
197 | "nbconvert_exporter": "python",
198 | "pygments_lexer": "ipython3",
199 | "version": "3.7.8"
200 | }
201 | },
202 | "nbformat": 4,
203 | "nbformat_minor": 4
204 | }
205 |
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow==2.4.0
2 | tfx==0.25.0
3 | apache-beam[gcp]
4 | google-cloud-bigquery
5 | pyarrow
6 | google-auth
7 | google-api-python-client
8 | google-api-core
9 | scann
10 | kfp==1.1.2
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/sql_scripts/sp_ComputePMI.sql:
--------------------------------------------------------------------------------
1 | CREATE OR REPLACE PROCEDURE @DATASET_NAME.sp_ComputePMI(
2 | IN min_item_frequency INT64,
3 | IN max_group_size INT64
4 | )
5 |
6 | BEGIN
7 |
8 | DECLARE total INT64;
9 |
10 | # Get items with minimum frequency
11 | CREATE OR REPLACE TABLE @DATASET_NAME.valid_item_groups
12 | AS
13 |
14 | # Create valid item set
15 | WITH
16 | valid_items AS (
17 | SELECT item_Id, COUNT(group_Id) AS item_frequency
18 | FROM @DATASET_NAME.vw_item_groups
19 | GROUP BY item_Id
20 | HAVING item_frequency >= min_item_frequency
21 | ),
22 |
23 | # Create valid group set
24 | valid_groups AS (
25 | SELECT group_Id, COUNT(item_Id) AS group_size
26 | FROM @DATASET_NAME.vw_item_groups
27 | WHERE item_Id IN (SELECT item_Id FROM valid_items)
28 | GROUP BY group_Id
29 | HAVING group_size BETWEEN 2 AND max_group_size
30 | )
31 |
32 | SELECT item_Id, group_Id
33 | FROM @DATASET_NAME.vw_item_groups
34 | WHERE item_Id IN (SELECT item_Id FROM valid_items)
35 | AND group_Id IN (SELECT group_Id FROM valid_groups);
36 |
37 | # Compute pairwise cooc
38 | CREATE OR REPLACE TABLE @DATASET_NAME.item_cooc
39 | AS
40 | SELECT item1_Id, item2_Id, SUM(cooc) AS cooc
41 | FROM
42 | (
43 | SELECT
44 | a.item_Id item1_Id,
45 | b.item_Id item2_Id,
46 | 1 as cooc
47 | FROM @DATASET_NAME.valid_item_groups a
48 | JOIN @DATASET_NAME.valid_item_groups b
49 | ON a.group_Id = b.group_Id
50 | AND a.item_Id < b.item_Id
51 | )
52 | GROUP BY item1_Id, item2_Id;
53 |
54 | ###################################
55 |
56 | # Compute item frequencies
57 | CREATE OR REPLACE TABLE @DATASET_NAME.item_frequency
58 | AS
59 | SELECT item_Id, COUNT(group_Id) AS frequency
60 | FROM @DATASET_NAME.valid_item_groups
61 | GROUP BY item_Id;
62 |
63 | ###################################
64 |
65 | # Compute total frequency |D|
66 | SET total = (
67 | SELECT SUM(frequency) AS total
68 | FROM @DATASET_NAME.item_frequency
69 | );
70 |
71 | ###################################
72 |
73 | # Add mirror item-pair cooc and same item frequency as cooc
74 | CREATE OR REPLACE TABLE @DATASET_NAME.item_cooc
75 | AS
76 | SELECT item1_Id, item2_Id, cooc
77 | FROM @DATASET_NAME.item_cooc
78 | UNION ALL
79 | SELECT item2_Id as item1_Id, item1_Id AS item2_Id, cooc
80 | FROM @DATASET_NAME.item_cooc
81 | UNION ALL
82 | SELECT item_Id as item1_Id, item_Id AS item2_Id, frequency as cooc
83 | FROM @DATASET_NAME.item_frequency;
84 |
85 | ###################################
86 |
87 | # Compute PMI
88 | CREATE OR REPLACE TABLE @DATASET_NAME.item_cooc
89 | AS
90 | SELECT
91 | a.item1_Id,
92 | a.item2_Id,
93 | a.cooc,
94 | LOG(a.cooc, 2) - LOG(b.frequency, 2) - LOG(c.frequency, 2) + LOG(total, 2) AS pmi
95 | FROM @DATASET_NAME.item_cooc a
96 | JOIN @DATASET_NAME.item_frequency b
97 | ON a.item1_Id = b.item_Id
98 | JOIN @DATASET_NAME.item_frequency c
99 | ON a.item2_Id = c.item_Id;
100 | END
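
The `pmi` expression above is the base-2 pointwise mutual information, log2(cooc * total / (frequency1 * frequency2)). A tiny numeric sketch in Python with made-up counts:

```python
import math

# Made-up counts illustrating the PMI expression used above (base-2 logs).
cooc = 40                # co-occurrences of item1 and item2
freq1, freq2 = 100, 200  # individual item frequencies
total = 10000            # |D|: the sum of all item frequencies

pmi = math.log2(cooc) - math.log2(freq1) - math.log2(freq2) + math.log2(total)
print(round(pmi, 3))  # ~4.322, i.e. the pair co-occurs about 20x more often than chance
```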
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/sql_scripts/sp_ExractEmbeddings.sql:
--------------------------------------------------------------------------------
1 | CREATE OR REPLACE PROCEDURE @DATASET_NAME.sp_ExractEmbeddings()
2 | BEGIN
3 | CREATE OR REPLACE TABLE @DATASET_NAME.item_embeddings AS
4 | WITH
5 | step1 AS
6 | (
7 | SELECT
8 | feature AS item_Id,
9 | factor_weights,
10 | intercept AS bias,
11 | FROM
12 | ML.WEIGHTS(MODEL `@DATASET_NAME.item_matching_model`)
13 | WHERE feature != 'global__INTERCEPT__'
14 | ),
15 |
16 | step2 AS
17 | (
18 | SELECT
19 | item_Id,
20 | factor,
21 | SUM(weight) AS weight,
22 | SUM(bias) AS bias
23 | FROM step1,
24 | UNNEST(step1.factor_weights) AS embedding
25 | GROUP BY
26 | item_Id,
27 | factor
28 | )
29 |
30 | SELECT
31 | item_Id,
32 | ARRAY_AGG(weight ORDER BY factor ASC) embedding,
33 | bias
34 | FROM step2
35 | GROUP BY item_Id, bias;
36 | END
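
After this procedure runs, the `item_embeddings` table holds one row per item with an `embedding` array whose length equals the number of factors used to train the matrix factorization model. A quick inspection sketch using the BigQuery client library (the project and dataset names are placeholders):

```python
# Sketch: spot-check the extracted embeddings (placeholder project and dataset).
from google.cloud import bigquery

client = bigquery.Client(project='my-project')
query = '''
SELECT item_Id, ARRAY_LENGTH(embedding) AS dimensions
FROM `recommendations.item_embeddings`
LIMIT 5
'''
for row in client.query(query).result():
    print(row.item_Id, row.dimensions)
```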
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/sql_scripts/sp_TrainItemMatchingModel.sql:
--------------------------------------------------------------------------------
1 | CREATE OR REPLACE PROCEDURE @DATASET_NAME.sp_TrainItemMatchingModel(
2 | IN dimensions INT64
3 | )
4 |
5 | BEGIN
6 |
7 | CREATE OR REPLACE MODEL @DATASET_NAME.item_matching_model
8 | OPTIONS(
9 | MODEL_TYPE='matrix_factorization',
10 | FEEDBACK_TYPE='implicit',
11 | WALS_ALPHA=1,
12 | NUM_FACTORS=(dimensions),
13 | USER_COL='item1_Id',
14 | ITEM_COL='item2_Id',
15 | RATING_COL='score',
16 | DATA_SPLIT_METHOD='no_split'
17 | )
18 | AS
19 | SELECT
20 | item1_Id,
21 | item2_Id,
22 | cooc * pmi AS score
23 | FROM @DATASET_NAME.item_cooc;
24 |
25 | END
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/tfx02_deploy_run.ipynb:
--------------------------------------------------------------------------------
1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"environment":{"name":"tf2-2-3-gpu.2-3.m59","type":"gcloud","uri":"gcr.io/deeplearning-platform-release/tf2-2-3-gpu.2-3:m59"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.8"},"colab":{"name":"tfx02_deploy_run.ipynb","provenance":[],"collapsed_sections":[],"toc_visible":true}},"cells":[{"cell_type":"markdown","metadata":{"id":"Ocb1x0zeP7Fe"},"source":["# Compile and deploy the TFX pipeline to Kubeflow Pipelines\n","\n","This notebook is the second of two notebooks that guide you through automating the [Real-time Item-to-item Recommendation with BigQuery ML Matrix Factorization and ScaNN](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/tree/master/retail/recommendation-system/bqml-scann) solution with a pipeline.\n","\n","Use this notebook to compile the TFX pipeline to a Kubeflow Pipelines (KFP) package. This process creates an Argo YAML file in a .tar.gz package, and is accomplished through the following steps:\n","\n","1. Build a custom container image that includes the solution modules.\n","2. Compile the TFX Pipeline using the TFX command-line interface (CLI).\n","3. Deploy the compiled pipeline to KFP.\n","\n","The pipeline workflow is implemented in the [pipeline.py](tfx_pipeline/pipeline.py) module. The [runner.py](tfx_pipeline/runner.py) module reads the configuration settings from the [config.py](tfx_pipeline/config.py) module, defines the runtime parameters of the pipeline, and creates a KFP format that is executable on AI Platform pipelines. \n","\n","Before starting this notebook, you must run the [tfx01_interactive](tfx01_interactive.ipynb) notebook to create the TFX pipeline.\n"]},{"cell_type":"markdown","metadata":{"id":"qdYregr9P7Fl"},"source":["## Install required libraries"]},{"cell_type":"code","metadata":{"id":"oFAdKJSdP7Fm"},"source":["%load_ext autoreload\n","%autoreload 2"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"KfrHaJmyP7Fm"},"source":["!pip install -q -U kfp"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"C-Ibd8heP7Fn"},"source":["## Set environment variables\r\n","\r\n","Update the following variables to reflect the values for your GCP environment:\r\n","\r\n","+ `PROJECT_ID`: The ID of the Google Cloud project you are using to implement this solution.\r\n","+ `BUCKET`: The name of the Cloud Storage bucket you created to use with this solution. The `BUCKET` value should be just the bucket name, so `myBucket` rather than `gs://myBucket`.\r\n","+ `GKE_CLUSTER_NAME`: The name of the Kubernetes Engine cluster used by the AI Platform pipeline. You can find this by looking at the **Cluster** column of the `kubeflow-pipelines` pipeline instance on the AI Platform Pipelines page.\r\n","+ `GKE_CLUSTER_ZONE`: The zone of the Kubernetes Engine cluster used by the AI Platform pipeline. 
You can find this by looking at the **Zone** column of the `kubeflow-pipelines` pipeline instance on the AI Platform Pipelines page."]},{"cell_type":"code","metadata":{"id":"VHES7S4iP7Fn"},"source":["import os\n","\n","os.environ['PROJECT_ID'] = 'yourProject' # Set your project.\n","os.environ['BUCKET'] = 'yourBucket' # Set your bucket.\n","os.environ['GKE_CLUSTER_NAME'] = 'yourCluster' # Set your GKE cluster name.\n","os.environ['GKE_CLUSTER_ZONE'] = 'yourClusterZone' # Set your GKE cluster zone.\n","\n","os.environ['IMAGE_NAME'] = 'tfx-ml'\n","os.environ['TAG'] = 'tfx0.25.0'\n","os.environ['ML_IMAGE_URI']=f'gcr.io/{os.environ.get(\"PROJECT_ID\")}/{os.environ.get(\"IMAGE_NAME\")}:{os.environ.get(\"TAG\")}'\n","\n","os.environ['NAMESPACE'] = 'kubeflow-pipelines'\n","os.environ['ARTIFACT_STORE_URI'] = f'gs://{os.environ.get(\"BUCKET\")}/tfx_artifact_store'\n","os.environ['GCS_STAGING_PATH'] = f'{os.environ.get(\"ARTIFACT_STORE_URI\")}/staging'\n","\n","os.environ['RUNTIME_VERSION'] = '2.2'\n","os.environ['PYTHON_VERSION'] = '3.7'\n","os.environ['BEAM_RUNNER'] = 'DirectRunner'\n","os.environ['MODEL_REGISTRY_URI'] = f'{os.environ.get(\"ARTIFACT_STORE_URI\")}/model_registry'\n","\n","os.environ['PIPELINE_NAME'] = 'tfx_bqml_scann'"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"OQua8zUuP7Fo"},"source":["from tfx_pipeline import config\n","\n","for key, value in config.__dict__.items():\n"," if key.isupper(): print(f'{key}: {value}')"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"jiOIUg-fP7Fo"},"source":["## Run the Pipeline locally by using the Beam runner"]},{"cell_type":"code","metadata":{"id":"zuheADbDP7Fp"},"source":["import kfp\n","import tfx\n","from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner\n","from tfx_pipeline import pipeline as pipeline_module\n","import tensorflow as tf\n","import ml_metadata as mlmd\n","from ml_metadata.proto import metadata_store_pb2\n","import logging\n","\n","logging.getLogger().setLevel(logging.INFO)\n","\n","print(\"TFX Version:\", tfx.__version__)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"K5UbLvJ8P7Fp"},"source":["pipeline_root = f'{config.ARTIFACT_STORE_URI}/{config.PIPELINE_NAME}_beamrunner'\n","model_regisrty_uri = f'{config.MODEL_REGISTRY_URI}_beamrunner'\n","local_mlmd_sqllite = 'mlmd/mlmd.sqllite'\n","\n","print(f'Pipeline artifacts root: {pipeline_root}') \n","print(f'Model registry location: {model_regisrty_uri}') \n","\n","if tf.io.gfile.exists(pipeline_root):\n"," print(\"Removing previous artifacts...\")\n"," tf.io.gfile.rmtree(pipeline_root)\n","if tf.io.gfile.exists('mlmd'):\n"," print(\"Removing local mlmd SQLite...\")\n"," tf.io.gfile.rmtree('mlmd')\n","print(\"Creating mlmd directory...\")\n","tf.io.gfile.mkdir('mlmd')\n","\n","metadata_connection_config = metadata_store_pb2.ConnectionConfig()\n","metadata_connection_config.sqlite.filename_uri = local_mlmd_sqllite\n","metadata_connection_config.sqlite.connection_mode = 3\n","print(\"ML metadata store is ready.\")\n","\n","beam_pipeline_args = [\n"," f'--runner=DirectRunner',\n"," f'--project={config.PROJECT_ID}',\n"," f'--temp_location={config.ARTIFACT_STORE_URI}/beam/tmp'\n","]\n","\n","pipeline_module.SCHEMA_DIR = 'tfx_pipeline/schema'\n","pipeline_module.LOOKUP_CREATOR_MODULE = 'tfx_pipeline/lookup_creator.py'\n","pipeline_module.SCANN_INDEXER_MODULE = 
'tfx_pipeline/scann_indexer.py'"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"BA3IbMZZP7Fq"},"source":["runner = BeamDagRunner()\n","\n","pipeline = pipeline_module.create_pipeline(\n"," pipeline_name=config.PIPELINE_NAME,\n"," pipeline_root=pipeline_root,\n"," project_id=config.PROJECT_ID,\n"," bq_dataset_name=config.BQ_DATASET_NAME,\n"," min_item_frequency=15,\n"," max_group_size=10,\n"," dimensions=50,\n"," num_leaves=500,\n"," eval_min_recall=0.8,\n"," eval_max_latency=0.001,\n"," ai_platform_training_args=None,\n"," beam_pipeline_args=beam_pipeline_args,\n"," model_regisrty_uri=model_regisrty_uri,\n"," metadata_connection_config=metadata_connection_config,\n"," enable_cache=True\n",")\n","\n","runner.run(pipeline)"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"sDHQOTlzP7Fr"},"source":["## Build the container image\n","\n","The pipeline uses a custom container image, which is a derivative of the [tensorflow/tfx:0.25.0](https://hub.docker.com/r/tensorflow/tfx) image, as a runtime execution environment for the pipeline's components. The container image is defined in a [Dockerfile](tfx_pipeline/Dockerfile).\n","\n","The container image installs the required libraries and copies over the modules from the solution's [tfx_pipeline](tfx_pipeline) directory, where the custom components are implemented. The container image is also used by AI Platform Training for executing the training jobs. \n","\n","Build the container image using Cloud Build and then store it in Cloud Container Registry:\n","\n"]},{"cell_type":"code","metadata":{"scrolled":true,"id":"PW_sUGUtP7Fr"},"source":["!gcloud builds submit --tag $ML_IMAGE_URI tfx_pipeline"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"-_bmd8YmP7Fr"},"source":["## Compile the TFX pipeline using the TFX CLI\r\n","\r\n","Use the TFX CLI to compile the TFX pipeline to the KFP format, which allows the pipeline to be deployed and executed on AI Platform Pipelines. The output is a .tar.gz package containing an Argo definition of your pipeline.\r\n"]},{"cell_type":"code","metadata":{"id":"n5QGIAclP7Fs"},"source":["!rm ${PIPELINE_NAME}.tar.gz\n","!tfx pipeline compile \\\n"," --engine=kubeflow \\\n"," --pipeline_path=tfx_pipeline/runner.py "],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"qo6z0fQcP7Fs"},"source":["## Deploy the compiled pipeline to KFP\r\n","\r\n","Use the KFP CLI to deploy the pipeline to a hosted instance of KFP on AI Platform Pipelines:\r\n"]},{"cell_type":"code","metadata":{"id":"baYgjczHP7Fs"},"source":["%%bash\n","\n","gcloud container clusters get-credentials ${GKE_CLUSTER_NAME} --zone ${GKE_CLUSTER_ZONE}\n","export KFP_ENDPOINT=$(kubectl describe configmap inverse-proxy-config -n ${NAMESPACE} | grep \"googleusercontent.com\")\n","\n","kfp --namespace=${NAMESPACE} --endpoint=${KFP_ENDPOINT} \\\n"," pipeline upload \\\n"," --pipeline-name=${PIPELINE_NAME} \\\n"," ${PIPELINE_NAME}.tar.gz"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"xuSbXIvdbYqL"},"source":["After deploying the pipeline, you can browse it by following these steps:\r\n","\r\n","1. Open the [AI Platform Pipelines page](https://pantheon.corp.google.com/ai-platform/pipelines/clusters).\r\n","1. For the `kubeflow-pipelines` instance, click **Open Pipelines Dashboard**.\r\n","1. 
Click **Pipelines** and confirm that `tfx_bqml_scann` appears on the list of pipelines."]},{"cell_type":"markdown","metadata":{"id":"YlNx68crP7Ft"},"source":["## Run the deployed pipeline\r\n","\r\n","Run the pipeline by using the KFP UI:\r\n","\r\n","1. Open the [AI Platform Pipelines page](https://pantheon.corp.google.com/ai-platform/pipelines/clusters).\r\n","1. For the `kubeflow-pipelines` instance, click **Open Pipelines Dashboard**.\r\n","1. Click **Experiments**.\r\n","1. Click **Create Run**.\r\n","1. For **Pipeline**, choose **tfx_bqml_scann** and then click **Use this pipeline**.\r\n","1. For **Pipeline Version**, choose **tfx_bqml_scann**.\r\n","1. For **Run name**, type `run of tfx_bqml_scann`.\r\n","1. For **Experiment**, choose **Default** and then click **Use this experiment**.\r\n","1. Click **Start**.\r\n","\r\n"]},{"cell_type":"markdown","metadata":{"id":"HmudthxFlnTm"},"source":["The pipelines dashboard displays a list of pipeline runs. In the list, click the name of your run to see a graph of the run displayed. While your run is still in progress, the graph changes as each step executes. Click any step to explore the run's inputs, outputs, logs, etc."]},{"cell_type":"markdown","metadata":{"id":"b9lcrRA_P7Fu"},"source":["## License\n","\n","Copyright 2020 Google LLC\n","\n","Licensed under the Apache License, Version 2.0 (the \"License\");\n","you may not use this file except in compliance with the License. You may obtain a copy of the License at: http://www.apache.org/licenses/LICENSE-2.0\n","\n","Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. \n","\n","See the License for the specific language governing permissions and limitations under the License.\n","\n","**This is not an official Google product but sample code provided for an educational purpose**"]}]}
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/tfx_pipeline/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM tensorflow/tfx:0.25.0
2 |
3 | RUN pip install scann==1.1.1 google-cloud-bigquery==1.26.1 protobuf==3.13.0
4 |
5 | WORKDIR /pipeline
6 | COPY ./ ./
7 | ENV PYTHONPATH="/pipeline:${PYTHONPATH}"
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/tfx_pipeline/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/recommendation-system/bqml-scann/tfx_pipeline/__init__.py
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/tfx_pipeline/bq_components.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """BigQuery components."""
15 |
16 | import os
17 | import warnings
18 | import logging
19 |
20 | from google.cloud import bigquery
21 |
22 | import tfx
23 | import tensorflow as tf
24 | from tfx.types.experimental.simple_artifacts import Dataset
25 | from tfx.types.standard_artifacts import Artifact
26 | from tfx.dsl.component.experimental.decorators import component
27 | from tfx.dsl.component.experimental.annotations import InputArtifact, OutputArtifact, Parameter
28 |
29 | from tfx.types.standard_artifacts import Model as BQModel
30 |
31 |
32 | @component
33 | def compute_pmi(
34 | project_id: Parameter[str],
35 | bq_dataset: Parameter[str],
36 | min_item_frequency: Parameter[int],
37 | max_group_size: Parameter[int],
38 | item_cooc: OutputArtifact[Dataset]):
39 |
40 | stored_proc = f'{bq_dataset}.sp_ComputePMI'
41 | query = f'''
42 | DECLARE min_item_frequency INT64;
43 | DECLARE max_group_size INT64;
44 |
45 | SET min_item_frequency = {min_item_frequency};
46 | SET max_group_size = {max_group_size};
47 |
48 | CALL {stored_proc}(min_item_frequency, max_group_size);
49 | '''
50 | result_table = 'item_cooc'
51 |
52 | logging.info('Starting PMI computation...')
53 |
54 | client = bigquery.Client(project=project_id)
55 | query_job = client.query(query)
56 | query_job.result() # Wait for the job to complete
57 |
58 | logging.info(f'Items PMI computation completed. Output in {bq_dataset}.{result_table}.')
59 |
60 | # Write the location of the output table to metadata.
61 | item_cooc.set_string_custom_property('bq_dataset', bq_dataset)
62 | item_cooc.set_string_custom_property('bq_result_table', result_table)
63 |
64 |
65 | @component
66 | def train_item_matching_model(
67 | project_id: Parameter[str],
68 | bq_dataset: Parameter[str],
69 | dimensions: Parameter[int],
70 | item_cooc: InputArtifact[Dataset],
71 | bq_model: OutputArtifact[BQModel]):
72 |
73 | item_cooc_table = item_cooc.get_string_custom_property('bq_result_table')
74 | stored_proc = f'{bq_dataset}.sp_TrainItemMatchingModel'
75 | query = f'''
76 | DECLARE dimensions INT64 DEFAULT {dimensions};
77 | CALL {stored_proc}(dimensions);
78 | '''
79 | model_name = 'item_matching_model'
80 |
81 | logging.info(f'Using item co-occurrence table: {bq_dataset}.{item_cooc_table}')
82 | logging.info(f'Starting training of the model...')
83 |
84 | client = bigquery.Client(project=project_id)
85 | query_job = client.query(query)
86 | query_job.result()
87 |
88 | logging.info(f'Model training completed. Output in {bq_dataset}.{model_name}.')
89 |
90 | # Write the location of the model to metadata.
91 | bq_model.set_string_custom_property('bq_dataset', bq_dataset)
92 | bq_model.set_string_custom_property('bq_model_name', model_name)
93 |
94 |
95 | @component
96 | def extract_embeddings(
97 | project_id: Parameter[str],
98 | bq_dataset: Parameter[str],
99 | bq_model: InputArtifact[BQModel],
100 | item_embeddings: OutputArtifact[Dataset]):
101 |
102 | embedding_model_name = bq_model.get_string_custom_property('bq_model_name')
103 | stored_proc = f'{bq_dataset}.sp_ExractEmbeddings'
104 | query = f'''
105 | CALL {stored_proc}();
106 | '''
107 | result_table = 'item_embeddings'
108 |
109 | logging.info(f'Extracting item embedding from: {bq_dataset}.{embedding_model_name}')
110 | logging.info('Starting embeddings export...')
111 |
112 | client = bigquery.Client(project=project_id)
113 | query_job = client.query(query)
114 | query_job.result() # Wait for the job to complete
115 |
116 | logging.info(f'Embeddings export completed. Output in {bq_dataset}.{result_table}')
117 |
118 | # Write the location of the output table to metadata.
119 | item_embeddings.set_string_custom_property('bq_dataset', bq_dataset)
120 | item_embeddings.set_string_custom_property('bq_result_table', result_table)
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/tfx_pipeline/config.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """The pipeline configurations."""
15 |
16 | import os
17 |
18 |
19 | PIPELINE_NAME=os.getenv('PIPELINE_NAME', 'bqml_scann_embedding_matching')
20 | EMBEDDING_LOOKUP_MODEL_NAME=os.getenv('EMBEDDING_LOOKUP_MODEL_NAME', 'embeddings_lookup')
21 | SCANN_INDEX_MODEL_NAME=os.getenv('SCANN_INDEX_MODEL_NAME', 'embeddings_scann')
22 | PROJECT_ID=os.getenv('PROJECT_ID', 'tfx-cloudml')
23 | REGION=os.getenv('REGION', 'europe-west1')
24 | BQ_DATASET_NAME=os.getenv('BQ_DATASET_NAME', 'recommendations')
25 | ARTIFACT_STORE_URI=os.getenv('ARTIFACT_STORE_URI', 'gs://tfx-cloudml-artifacts')
26 | RUNTIME_VERSION=os.getenv('RUNTIME_VERSION', '2.2')
27 | PYTHON_VERSION=os.getenv('PYTHON_VERSION', '3.7')
28 | USE_KFP_SA=os.getenv('USE_KFP_SA', 'False')
29 | ML_IMAGE_URI=os.getenv('ML_IMAGE_URI', 'tensorflow/tfx:0.23.0')
30 | BEAM_RUNNER=os.getenv('BEAM_RUNNER', 'DirectRunner')
31 | MODEL_REGISTRY_URI=os.getenv('MODEL_REGISTRY_URI', 'gs://tfx-cloudml-artifacts/model_registry')
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/tfx_pipeline/item_matcher.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """ScaNN index matchers."""
15 |
16 | import tensorflow as tf
17 | import numpy as np
18 | import scann
19 | import pickle
20 | import os
21 | import logging
22 |
23 | TOKENS_FILE_NAME = 'tokens'
24 |
25 |
26 | class ScaNNMatcher(object):
27 |
28 | def __init__(self, index_dir):
29 | logging.info('Loading ScaNN index...')
30 | scann_module = tf.saved_model.load(index_dir)
31 | self.scann_index = scann.scann_ops.searcher_from_module(scann_module)
32 | tokens_file_path = os.path.join(index_dir, TOKENS_FILE_NAME)
33 | with tf.io.gfile.GFile(tokens_file_path, 'rb') as handle:
34 | self.tokens = pickle.load(handle)
35 | logging.info('ScaNN index is loaded.')
36 |
37 | def match(self, vector, num_matches=10):
38 | embedding = np.array(vector)
39 | query = embedding / np.linalg.norm(embedding)
40 | match_indices, _ = self.scann_index.search(query, final_num_neighbors=num_matches)
41 | match_tokens = [self.tokens[match_idx] for match_idx in match_indices.numpy()]
42 | return match_tokens
43 |
44 |
45 | class ExactMatcher(object):
46 |
47 | def __init__(self, embeddings, tokens):
48 | logging.info('Loading Exact index...')
49 | self.embeddings = embeddings
50 | self.tokens = tokens
51 | logging.info('Embeddings and vocabulary are loaded.')
52 |
53 | def match(self, vector, num_matches=10):
54 | embedding = np.array(vector)
55 | query = embedding / np.linalg.norm(embedding)
56 | similarities = np.dot(self.embeddings, query.T)
57 | matches = list(zip(self.tokens, list(similarities)))
58 | matches = sorted(
59 | matches, key=lambda kv: kv[1], reverse=True)[:num_matches]
60 | return [kv[0] for kv in matches]
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/tfx_pipeline/lookup_creator.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Embedding lookup model."""
15 |
16 |
17 | import tensorflow as tf
18 | import tensorflow_data_validation as tfdv
19 | from tensorflow_transform.tf_metadata import schema_utils
20 | import numpy as np
21 | import logging
22 |
23 | VOCABULARY_FILE_NAME = 'vocabulary.txt'
24 | class EmbeddingLookup(tf.keras.Model):
25 |
26 | def __init__(self, embedding_files_prefix, schema_file_path, **kwargs):
27 | super(EmbeddingLookup, self).__init__(**kwargs)
28 |
29 | vocabulary = list()
30 | embeddings = list()
31 |
32 | logging.info('Loading schema...')
33 | schema = tfdv.load_schema_text(schema_file_path)
34 | feature_spec = schema_utils.schema_as_feature_spec(schema).feature_spec
35 | logging.info('Schema is loaded.')
36 |
37 | def _gzip_reader_fn(filenames):
38 | return tf.data.TFRecordDataset(filenames, compression_type='GZIP')
39 |
40 | dataset = tf.data.experimental.make_batched_features_dataset(
41 | embedding_files_prefix,
42 | batch_size=1,
43 | num_epochs=1,
44 | features=feature_spec,
45 | reader=_gzip_reader_fn,
46 | shuffle=False
47 | )
48 |
49 | # Read embeddings from tfrecord files.
50 | logging.info('Loading embeddings from files ...')
51 | for tfrecord_batch in dataset:
52 | vocabulary.append(tfrecord_batch["item_Id"].numpy()[0][0].decode())
53 | embeddings.append(tfrecord_batch["embedding"].numpy()[0])
54 | logging.info('Embeddings loaded.')
55 |
56 | embedding_size = len(embeddings[0])
57 | oov_embedding = np.zeros((1, embedding_size))
58 | self.embeddings = np.append(np.array(embeddings), oov_embedding, axis=0)
59 | logging.info(f'Embeddings: {self.embeddings.shape}')
60 |
61 | # Write vocabulary file.
62 | logging.info('Writing vocabulary to file ...')
63 | with open(VOCABULARY_FILE_NAME, 'w') as f:
64 | for item in vocabulary:
65 | f.write(f'{item}\n')
66 | logging.info('Vocabulary file written and will be added as a model asset.')
67 |
68 | self.vocabulary_file = tf.saved_model.Asset(VOCABULARY_FILE_NAME)
69 | initializer = tf.lookup.KeyValueTensorInitializer(
70 | keys=vocabulary, values=list(range(len(vocabulary))))
71 | self.token_to_id = tf.lookup.StaticHashTable(
72 | initializer, default_value=len(vocabulary))
73 |
74 | @tf.function(input_signature=[tf.TensorSpec([None], tf.string)])
75 | def __call__(self, inputs):
76 | tokens = tf.strings.split(inputs, sep=None).to_sparse()
77 | ids = self.token_to_id.lookup(tokens)
78 | embeddings = tf.nn.embedding_lookup_sparse(
79 | params=self.embeddings,
80 | sp_ids=ids,
81 | sp_weights=None,
82 | combiner="mean"
83 | )
84 | return embeddings
85 |
86 |
87 |
88 | # TFX will call this function
89 | def run_fn(params):
90 |
91 | embedding_files_path = params.train_files
92 | model_output_dir = params.serving_model_dir
93 | schema_file_path = params.schema_file
94 |
95 | logging.info('Instantiating embedding lookup model...')
96 | embedding_lookup_model = EmbeddingLookup(embedding_files_path, schema_file_path)
97 | logging.info('Model is instantiated.')
98 |
99 | signatures = {
100 | 'serving_default': embedding_lookup_model.__call__.get_concrete_function(),
101 | }
102 |
103 | logging.info('Exporting embedding lookup model as a SavedModel...')
104 | tf.saved_model.save(embedding_lookup_model, model_output_dir, signatures=signatures)
105 | logging.info('SavedModel is exported.')
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/tfx_pipeline/pipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """TFX pipeline DSL."""
15 |
16 | import os
17 | import sys
18 | from typing import Dict, List, Text, Optional
19 | from kfp import gcp
20 | import tfx
21 | from tfx.proto import example_gen_pb2, infra_validator_pb2
22 | from tfx.orchestration import pipeline, data_types
23 | from tfx.dsl.components.base import executor_spec
24 | from tfx.components.trainer import executor as trainer_executor
25 | from tfx.extensions.google_cloud_ai_platform.trainer import executor as ai_platform_trainer_executor
26 | from tfx.extensions.google_cloud_big_query.example_gen.component import BigQueryExampleGen
27 | from ml_metadata.proto import metadata_store_pb2
28 |
29 | try:
30 | from . import bq_components
31 | from . import scann_evaluator
32 | except:
33 | import bq_components
34 | import scann_evaluator
35 |
36 |
37 | EMBEDDING_LOOKUP_MODEL_NAME = 'embeddings_lookup'
38 | SCANN_INDEX_MODEL_NAME = 'embeddings_scann'
39 | LOOKUP_CREATOR_MODULE = 'lookup_creator.py'
40 | SCANN_INDEXER_MODULE = 'scann_indexer.py'
41 | SCHEMA_DIR = 'schema'
42 |
43 |
44 | def create_pipeline(pipeline_name: Text,
45 | pipeline_root: Text,
46 | project_id: Text,
47 | bq_dataset_name: Text,
48 | min_item_frequency: data_types.RuntimeParameter,
49 | max_group_size: data_types.RuntimeParameter,
50 | dimensions: data_types.RuntimeParameter,
51 | num_leaves: data_types.RuntimeParameter,
52 | eval_min_recall: data_types.RuntimeParameter,
53 | eval_max_latency: data_types.RuntimeParameter,
54 | ai_platform_training_args: Dict[Text, Text],
55 | beam_pipeline_args: List[Text],
56 | model_regisrty_uri: Text,
57 | metadata_connection_config: Optional[
58 | metadata_store_pb2.ConnectionConfig] = None,
59 | enable_cache: Optional[bool] = False) -> pipeline.Pipeline:
60 | """Implements the online news pipeline with TFX."""
61 |
62 |
63 | local_executor_spec = executor_spec.ExecutorClassSpec(
64 | trainer_executor.GenericExecutor)
65 |
66 | caip_executor_spec = executor_spec.ExecutorClassSpec(
67 | ai_platform_trainer_executor.GenericExecutor)
68 |
69 | # Compute the PMI.
70 | pmi_computer = bq_components.compute_pmi(
71 | project_id=project_id,
72 | bq_dataset=bq_dataset_name,
73 | min_item_frequency=min_item_frequency,
74 | max_group_size=max_group_size
75 | )
76 |
77 | # Train the BQML Matrix Factorization model.
78 | bqml_trainer = bq_components.train_item_matching_model(
79 | project_id=project_id,
80 | bq_dataset=bq_dataset_name,
81 | item_cooc=pmi_computer.outputs.item_cooc,
82 | dimensions=dimensions,
83 | )
84 |
85 | # Extract the embeddings from the BQML model to a table.
86 | embeddings_extractor = bq_components.extract_embeddings(
87 | project_id=project_id,
88 | bq_dataset=bq_dataset_name,
89 | bq_model=bqml_trainer.outputs.bq_model
90 | )
91 |
92 | # Export embeddings from BigQuery to Cloud Storage.
93 | embeddings_exporter = BigQueryExampleGen(
94 | query=f'''
95 | SELECT item_Id, embedding, bias,
96 | FROM {bq_dataset_name}.item_embeddings
97 | ''',
98 | output_config=example_gen_pb2.Output(
99 | split_config=example_gen_pb2.SplitConfig(splits=[
100 | example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=1)]))
101 | )
102 |
103 | # Add dependency from embeddings_exporter to embeddings_extractor.
104 | embeddings_exporter.add_upstream_node(embeddings_extractor)
105 |
106 | # Import embeddings schema.
107 | schema_importer = tfx.components.ImporterNode(
108 | source_uri=SCHEMA_DIR,
109 | artifact_type=tfx.types.standard_artifacts.Schema,
110 | instance_name='ImportSchema',
111 | )
112 |
113 | # Generate stats for the embeddings for validation.
114 | stats_generator = tfx.components.StatisticsGen(
115 | examples=embeddings_exporter.outputs.examples,
116 | )
117 |
118 | # Validate the embeddings stats against the schema.
119 | stats_validator = tfx.components.ExampleValidator(
120 | statistics=stats_generator.outputs.statistics,
121 | schema=schema_importer.outputs.result,
122 | )
123 |
124 | # Create an embedding lookup SavedModel.
125 | embedding_lookup_creator = tfx.components.Trainer(
126 | custom_executor_spec=local_executor_spec,
127 | module_file=LOOKUP_CREATOR_MODULE,
128 | train_args={'splits': ['train'], 'num_steps': 0},
129 | eval_args={'splits': ['train'], 'num_steps': 0},
130 | schema=schema_importer.outputs.result,
131 | examples=embeddings_exporter.outputs.examples
132 | )
133 | embedding_lookup_creator.id = 'CreateEmbeddingLookup'
134 |
135 | # Add dependency from stats_validator to embedding_lookup_creator.
136 | embedding_lookup_creator.add_upstream_node(stats_validator)
137 |
138 | # Infra-validate the embedding lookup model.
139 | infra_validator = tfx.components.InfraValidator(
140 | model=embedding_lookup_creator.outputs.model,
141 | serving_spec=infra_validator_pb2.ServingSpec(
142 | tensorflow_serving=infra_validator_pb2.TensorFlowServing(
143 | tags=['latest']),
144 | local_docker=infra_validator_pb2.LocalDockerConfig(),
145 | ),
146 | validation_spec=infra_validator_pb2.ValidationSpec(
147 | max_loading_time_seconds=60,
148 | num_tries=3,
149 | )
150 | )
151 |
152 | # Push the embedding lookup model to model registry location.
153 | embedding_lookup_pusher = tfx.components.Pusher(
154 | model=embedding_lookup_creator.outputs.model,
155 | infra_blessing=infra_validator.outputs.blessing,
156 | push_destination=tfx.proto.pusher_pb2.PushDestination(
157 | filesystem=tfx.proto.pusher_pb2.PushDestination.Filesystem(
158 | base_directory=os.path.join(model_regisrty_uri, EMBEDDING_LOOKUP_MODEL_NAME))
159 | )
160 | )
161 | embedding_lookup_pusher.id = 'PushEmbeddingLookup'
162 |
163 | # Build the ScaNN index.
164 | scann_indexer = tfx.components.Trainer(
165 | custom_executor_spec=caip_executor_spec if ai_platform_training_args else local_executor_spec,
166 | module_file=SCANN_INDEXER_MODULE,
167 | train_args={'splits': ['train'], 'num_steps': num_leaves},  # num_steps carries num_leaves to scann_indexer.run_fn.
168 | eval_args={'splits': ['train'], 'num_steps': 0},
169 | schema=schema_importer.outputs.result,
170 | examples=embeddings_exporter.outputs.examples,
171 | custom_config={'ai_platform_training_args': ai_platform_training_args}
172 | )
173 | scann_indexer.id = 'BuildScaNNIndex'
174 |
175 | # Add dependency from stats_validator to scann_indexer.
176 | scann_indexer.add_upstream_node(stats_validator)
177 |
178 | # Evaluate and validate the ScaNN index.
179 | index_evaluator = scann_evaluator.IndexEvaluator(
180 | examples=embeddings_exporter.outputs.examples,
181 | schema=schema_importer.outputs.result,
182 | model=scann_indexer.outputs.model,
183 | min_recall=eval_min_recall,
184 | max_latency=eval_max_latency
185 | )
186 |
187 | # Push the ScaNN index to model registry location.
188 | scann_index_pusher = tfx.components.Pusher(
189 | model=scann_indexer.outputs.model,
190 | model_blessing=index_evaluator.outputs.blessing,
191 | push_destination=tfx.proto.pusher_pb2.PushDestination(
192 | filesystem=tfx.proto.pusher_pb2.PushDestination.Filesystem(
193 | base_directory=os.path.join(model_regisrty_uri, SCANN_INDEX_MODEL_NAME))
194 | )
195 | )
196 | scann_index_pusher.id = 'PushScaNNIndex'
197 |
198 | components=[
199 | pmi_computer,
200 | bqml_trainer,
201 | embeddings_extractor,
202 | embeddings_exporter,
203 | schema_importer,
204 | stats_generator,
205 | stats_validator,
206 | embedding_lookup_creator,
207 | infra_validator,
208 | embedding_lookup_pusher,
209 | scann_indexer,
210 | index_evaluator,
211 | scann_index_pusher
212 | ]
213 |
214 | print('The pipeline consists of the following components:')
215 | print([component.id for component in components])
216 |
217 | return pipeline.Pipeline(
218 | pipeline_name=pipeline_name,
219 | pipeline_root=pipeline_root,
220 | components=components,
221 | beam_pipeline_args=beam_pipeline_args,
222 | metadata_connection_config=metadata_connection_config,
223 | enable_cache=enable_cache
224 | )
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/tfx_pipeline/runner.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """KFP runner"""
15 |
16 | import kfp
17 | from kfp import gcp
18 | from tfx.orchestration import data_types
19 | from tfx.orchestration.kubeflow import kubeflow_dag_runner
20 |
21 | from typing import Optional, Dict, List, Text
22 |
23 | import config
24 | import pipeline
25 |
26 | if __name__ == '__main__':
27 |
28 | # Set the values for the compile time parameters.
29 |
30 | ai_platform_training_args = {
31 | 'project': config.PROJECT_ID,
32 | 'region': config.REGION,
33 | 'masterConfig': {
34 | 'imageUri': config.ML_IMAGE_URI
35 | }
36 | }
37 |
38 | beam_pipeline_args = [
39 | f'--runner={config.BEAM_RUNNER}',
40 | '--experiments=shuffle_mode=auto',
41 | f'--project={config.PROJECT_ID}',
42 | f'--temp_location={config.ARTIFACT_STORE_URI}/beam/tmp',
43 | f'--region={config.REGION}',
44 | ]
45 |
46 |
47 | # Set the default values for the pipeline runtime parameters.
48 |
49 | min_item_frequency = data_types.RuntimeParameter(
50 | name='min-item-frequency',
51 | default=15,
52 | ptype=int
53 | )
54 |
55 | max_group_size = data_types.RuntimeParameter(
56 | name='max_group_size',
57 | default=100,
58 | ptype=int
59 | )
60 |
61 | dimensions = data_types.RuntimeParameter(
62 | name='dimensions',
63 | default=50,
64 | ptype=int
65 | )
66 |
67 | num_leaves = data_types.RuntimeParameter(
68 | name='num-leaves',
69 | default=0,
70 | ptype=int
71 | )
72 |
73 | eval_min_recall = data_types.RuntimeParameter(
74 | name='eval-min-recall',
75 | default=0.8,
76 | ptype=float
77 | )
78 |
79 | eval_max_latency = data_types.RuntimeParameter(
80 | name='eval-max-latency',
81 | default=0.01,
82 | ptype=float
83 | )
84 |
85 | pipeline_root = f'{config.ARTIFACT_STORE_URI}/{config.PIPELINE_NAME}/{kfp.dsl.RUN_ID_PLACEHOLDER}'
86 |
87 | # Set KubeflowDagRunner settings
88 | metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config()
89 |
90 | runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
91 | kubeflow_metadata_config = metadata_config,
92 | pipeline_operator_funcs = kubeflow_dag_runner.get_default_pipeline_operator_funcs(
93 | config.USE_KFP_SA == 'True'),
94 | tfx_image=config.ML_IMAGE_URI
95 | )
96 |
97 | # Compile the pipeline
98 | kubeflow_dag_runner.KubeflowDagRunner(config=runner_config).run(
99 | pipeline.create_pipeline(
100 | pipeline_name=config.PIPELINE_NAME,
101 | pipeline_root=pipeline_root,
102 | project_id=config.PROJECT_ID,
103 | bq_dataset_name=config.BQ_DATASET_NAME,
104 | min_item_frequency=min_item_frequency,
105 | max_group_size=max_group_size,
106 | dimensions=dimensions,
107 | num_leaves=num_leaves,
108 | eval_min_recall=eval_min_recall,
109 | eval_max_latency=eval_max_latency,
110 | ai_platform_training_args=ai_platform_training_args,
111 | beam_pipeline_args=beam_pipeline_args,
112 | model_regisrty_uri=config.MODEL_REGISTRY_URI)
113 | )
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/tfx_pipeline/scann_evaluator.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """ScaNN index evaluator custom component."""
15 |
16 | import os
17 | import time
18 | from typing import Any, Dict, List, Optional, Text, Union
19 | import logging
20 | import json
21 |
22 | import tfx
23 | from tfx.types import standard_artifacts
24 | from tfx.types.component_spec import ChannelParameter
25 | from tfx.types.component_spec import ExecutionParameter
26 | from tfx.dsl.components.base import base_executor
27 | from tfx.dsl.components.base import base_component
28 | from tfx.dsl.components.base import executor_spec
29 | from tfx.types import artifact_utils
30 | from tfx.utils import io_utils
31 | from typing import Optional
32 | from tfx import types
33 |
34 | import tensorflow as tf
35 | import numpy as np
36 | import tensorflow_data_validation as tfdv
37 | from tensorflow_transform.tf_metadata import schema_utils
38 |
39 | try:
40 | from . import item_matcher
41 | from . import scann_indexer
42 | except:
43 | import item_matcher
44 | import scann_indexer
45 |
46 |
47 | QUERIES_SAMPLE_RATIO = 0.01
48 | MAX_NUM_QUERIES = 10000
49 | NUM_NEIGHBOURS = 20
50 |
51 |
52 | class IndexEvaluatorSpec(tfx.types.ComponentSpec):
53 |
54 | INPUTS = {
55 | 'examples': ChannelParameter(type=standard_artifacts.Examples),
56 | 'schema': ChannelParameter(type=standard_artifacts.Schema),
57 | 'model': ChannelParameter(type=standard_artifacts.Model),
58 | }
59 |
60 | OUTPUTS = {
61 | 'evaluation': ChannelParameter(type=standard_artifacts.ModelEvaluation),
62 | 'blessing': ChannelParameter(type=standard_artifacts.ModelBlessing),
63 | }
64 |
65 | PARAMETERS = {
66 | 'min_recall': ExecutionParameter(type=float),
67 | 'max_latency': ExecutionParameter(type=float),
68 | }
69 |
70 |
71 | class ScaNNIndexEvaluatorExecutor(base_executor.BaseExecutor):
72 |
73 | def Do(self,
74 | input_dict: Dict[Text, List[types.Artifact]],
75 | output_dict: Dict[Text, List[types.Artifact]],
76 | exec_properties: Dict[Text, Any]) -> None:
77 |
78 | if 'examples' not in input_dict:
79 | raise ValueError('Examples is missing from input dict.')
80 | if 'model' not in input_dict:
81 | raise ValueError('Model is missing from input dict.')
82 | if 'evaluation' not in output_dict:
83 | raise ValueError('Evaluation is missing from output dict.')
84 | if 'blessing' not in output_dict:
85 | raise ValueError('Blessing is missing from output dict.')
86 |
87 | valid = True
88 |
89 | self._log_startup(input_dict, output_dict, exec_properties)
90 |
91 | embedding_files_pattern = io_utils.all_files_pattern(
92 | artifact_utils.get_split_uri(input_dict['examples'], 'train'))
93 |
94 | schema_file_path = artifact_utils.get_single_instance(
95 | input_dict['schema']).uri + '/schema.pbtxt'
96 |
97 | vocabulary, embeddings = scann_indexer.load_embeddings(
98 | embedding_files_pattern, schema_file_path)
99 |
100 | num_embeddings = embeddings.shape[0]
101 | logging.info(f'{num_embeddings} embeddings are loaded.')
102 | num_queries = int(min(num_embeddings * QUERIES_SAMPLE_RATIO, MAX_NUM_QUERIES))
103 | logging.info(f'Sampling {num_queries} query embeddings for evaluation...')
104 | query_embedding_indices = np.random.choice(num_embeddings, num_queries)
105 | query_embeddings = np.take(embeddings, query_embedding_indices, axis=0)
106 |
107 | # Load Exact matcher
108 | exact_matcher = item_matcher.ExactMatcher(embeddings, vocabulary)
109 | exact_matches = []
110 | logging.info(f'Computing exact matches for the queries...')
111 | for query in query_embeddings:
112 | exact_matches.append(exact_matcher.match(query, NUM_NEIGHBOURS))
113 | logging.info(f'Exact matches are computed.')
114 | del num_embeddings, exact_matcher
115 |
116 | # Load ScaNN index matcher
117 | index_artifact = artifact_utils.get_single_instance(input_dict['model'])
118 | ann_matcher = item_matcher.ScaNNMatcher(index_artifact.uri + '/serving_model_dir')
119 | scann_matches = []
120 | logging.info(f'Computing ScaNN matches for the queries...')
121 | start_time = time.time()
122 | for query in query_embeddings:
123 | scann_matches.append(ann_matcher.match(query, NUM_NEIGHBOURS))
124 | end_time = time.time()
125 | logging.info(f'ScaNN matches are computed.')
126 |
127 | # Compute average latency
128 | elapsed_time = end_time - start_time
129 | current_latency = elapsed_time / num_queries
130 |
131 | # Compute recall
132 | current_recall = 0
133 | for exact, approx in zip(exact_matches, scann_matches):
134 | current_recall += len(set(exact).intersection(set(approx))) / NUM_NEIGHBOURS
135 | current_recall /= num_queries
136 |
137 | metrics = {
138 | 'recall': current_recall,
139 | 'latency': current_latency
140 | }
141 |
142 | min_recall = exec_properties['min_recall']
143 | max_latency = exec_properties['max_latency']
144 |
145 | logging.info(f'Average latency per query: {current_latency}. Maximum latency allowed: {max_latency}')
146 | logging.info(f'Recall achieved: {current_recall}. Minimum recall allowed: {min_recall}')
147 |
148 | # Validate index latency and recall
149 | valid = (current_latency <= max_latency) and (current_recall >= min_recall)
150 | logging.info(f'Model is valid: {valid}')
151 |
152 | # Output the evaluation artifact.
153 | evaluation = artifact_utils.get_single_instance(output_dict['evaluation'])
154 | evaluation.set_string_custom_property('index_model_uri', index_artifact.uri)
155 | evaluation.set_int_custom_property('index_model_id', index_artifact.id)
156 | io_utils.write_string_file(
157 | os.path.join(evaluation.uri, 'metrics'), json.dumps(metrics))
158 |
159 | # Output the blessing artifact.
160 | blessing = artifact_utils.get_single_instance(output_dict['blessing'])
161 | blessing.set_string_custom_property('index_model_uri', index_artifact.uri)
162 | blessing.set_int_custom_property('index_model_id', index_artifact.id)
163 |
164 | if valid:
165 | io_utils.write_string_file(os.path.join(blessing.uri, 'BLESSED'), '')
166 | blessing.set_int_custom_property('blessed', 1)
167 | else:
168 | io_utils.write_string_file(os.path.join(blessing.uri, 'NOT_BLESSED'), '')
169 | blessing.set_int_custom_property('blessed', 0)
170 |
171 |
172 | class IndexEvaluator(base_component.BaseComponent):
173 |
174 | SPEC_CLASS = IndexEvaluatorSpec
175 | EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(ScaNNIndexEvaluatorExecutor)
176 |
177 | def __init__(self,
178 | examples: types.channel,
179 | schema: types.channel,
180 | model: types.channel,
181 | min_recall: float,
182 | max_latency: float,
183 | evaluation: Optional[types.Channel] = None,
184 | blessing: Optional[types.Channel] = None,
185 | instance_name=None):
186 |
187 | blessing = blessing or types.Channel(
188 | type=standard_artifacts.ModelBlessing,
189 | artifacts=[standard_artifacts.ModelBlessing()])
190 |
191 | evaluation = evaluation or types.Channel(
192 | type=standard_artifacts.ModelEvaluation,
193 | artifacts=[standard_artifacts.ModelEvaluation()])
194 |
195 | spec = IndexEvaluatorSpec(
196 | examples=examples,
197 | schema=schema,
198 | model=model,
199 | evaluation=evaluation,
200 | blessing=blessing,
201 | min_recall=min_recall,
202 | max_latency=max_latency
203 | )
204 |
205 | super().__init__(spec=spec, instance_name=instance_name)
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/tfx_pipeline/scann_indexer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """ScaNN index builder."""
15 |
16 | import os
17 | import sys
18 | import scann
19 | import tensorflow as tf
20 | import tensorflow_data_validation as tfdv
21 | from tensorflow_transform.tf_metadata import schema_utils
22 | import numpy as np
23 | import math
24 | import pickle
25 | import logging
26 |
27 | METRIC = 'dot_product'
28 | DIMENSIONS_PER_BLOCK = 2
29 | ANISOTROPIC_QUANTIZATION_THRESHOLD = 0.2
30 | NUM_NEIGHBOURS = 10
31 | NUM_LEAVES_TO_SEARCH = 250
32 | REORDER_NUM_NEIGHBOURS = 250
33 | TOKENS_FILE_NAME = 'tokens'
34 |
35 |
36 | def load_embeddings(embedding_files_pattern, schema_file_path):
37 |
38 | embeddings = list()
39 | vocabulary = list()
40 |
41 | logging.info('Loading schema...')
42 | schema = tfdv.load_schema_text(schema_file_path)
43 | feature_spec = schema_utils.schema_as_feature_spec(schema).feature_spec
44 | logging.info('Schema is loaded.')
45 |
46 | def _gzip_reader_fn(filenames):
47 | return tf.data.TFRecordDataset(filenames, compression_type='GZIP')
48 |
49 | dataset = tf.data.experimental.make_batched_features_dataset(
50 | embedding_files_pattern,
51 | batch_size=1,
52 | num_epochs=1,
53 | features=feature_spec,
54 | reader=_gzip_reader_fn,
55 | shuffle=False
56 | )
57 |
58 | # Read embeddings from tfrecord files.
59 | logging.info('Loading embeddings from files...')
60 | for tfrecord_batch in dataset:
61 | vocabulary.append(tfrecord_batch["item_Id"].numpy()[0][0].decode())
62 | embedding = tfrecord_batch["embedding"].numpy()[0]
63 | normalized_embedding = embedding / np.linalg.norm(embedding)
64 | embeddings.append(normalized_embedding)
65 | logging.info('Embeddings loaded.')
66 | embeddings = np.array(embeddings)
67 |
68 | return vocabulary, embeddings
69 |
70 |
71 | def build_index(embeddings, num_leaves):
72 |
73 | data_size = embeddings.shape[0]
74 | if not num_leaves:
75 | num_leaves = int(math.sqrt(data_size))
76 | logging.info(f'Indexing {data_size} embeddings with {num_leaves} leaves.')
77 |
78 | logging.info('Start building the ScaNN index...')
79 | scann_builder = scann.scann_ops.builder(embeddings, NUM_NEIGHBOURS, METRIC).tree(
80 | num_leaves=num_leaves,
81 | num_leaves_to_search=NUM_LEAVES_TO_SEARCH,
82 | training_sample_size=data_size).score_ah(
83 | DIMENSIONS_PER_BLOCK,
84 | anisotropic_quantization_threshold=ANISOTROPIC_QUANTIZATION_THRESHOLD).reorder(REORDER_NUM_NEIGHBOURS)
85 | scann_index = scann_builder.build()
86 | logging.info('ScaNN index is built.')
87 |
88 | return scann_index
89 |
90 |
91 | def save_index(index, tokens, output_dir):
92 | logging.info('Saving index as a SavedModel...')
93 | module = index.serialize_to_module()
94 | tf.saved_model.save(
95 | module, output_dir, signatures=None, options=None
96 | )
97 | logging.info(f'Index is saved to {output_dir}')
98 |
99 | logging.info(f'Saving tokens file...')
100 | tokens_file_path = os.path.join(output_dir, TOKENS_FILE_NAME)
101 | with tf.io.gfile.GFile(tokens_file_path, 'wb') as handle:
102 | pickle.dump(tokens, handle, protocol=pickle.HIGHEST_PROTOCOL)
103 | logging.info(f'Tokens file is saved to {tokens_file_path}.')
104 |
105 |
106 |
107 | # TFX will call this function
108 | def run_fn(params):
109 | embedding_files_path = params.train_files
110 | output_dir = params.serving_model_dir
111 | num_leaves = params.train_steps  # num_leaves is passed in via the Trainer's train num_steps.
112 | schema_file_path = params.schema_file
113 |
114 | logging.info("Indexer started...")
115 | tokens, embeddings = load_embeddings(embedding_files_path, schema_file_path)
116 | index = build_index(embeddings, num_leaves)
117 | save_index(index, tokens, output_dir)
118 | logging.info("Indexer finished.")
119 |
120 |
121 |
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml-scann/tfx_pipeline/schema/schema.pbtxt:
--------------------------------------------------------------------------------
1 | feature {
2 | name: "item_Id"
3 | type: BYTES
4 | int_domain {
5 | }
6 | presence {
7 | min_fraction: 1.0
8 | min_count: 1
9 | }
10 | shape {
11 | dim {
12 | size: 1
13 | }
14 | }
15 | }
16 | feature {
17 | name: "embedding"
18 | type: FLOAT
19 | presence {
20 | min_fraction: 1.0
21 | min_count: 1
22 | }
23 | shape {
24 | dim {
25 | size: 50
26 | }
27 | }
28 | }
29 |
30 | feature {
31 | name: "bias"
32 | type: FLOAT
33 | presence {
34 | min_fraction: 1.0
35 | min_count: 1
36 | }
37 | shape {
38 | dim {
39 | size: 1
40 | }
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/retail/recommendation-system/bqml/README.md:
--------------------------------------------------------------------------------
1 | ## License
2 | ```
3 | Copyright 2020 Google LLC
4 |
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 |
9 | https://www.apache.org/licenses/LICENSE-2.0
10 |
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS,
13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | See the License for the specific language governing permissions and
15 | limitations under the License.
16 | ```
17 | [](LICENSE)
18 |
19 | # How to build a recommendation system on e-commerce data using BigQuery ML
20 | With your data already in BigQuery, BigQuery ML makes machine learning workflows easier than ever. In this [notebook](bqml_retail_recommendation_system.ipynb) you’ll learn how to build a product recommendation system in a retail scenario using matrix factorization, and how to use the predicted recommendations to drive marketing activation.
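
As a rough sketch of the core idea only (the notebook builds this out step by step, and its dataset, table, and column names will differ), a matrix factorization model can be trained with a single BigQuery ML `CREATE MODEL` statement issued from Python:

```python
# Minimal sketch, not the notebook's exact code: train a BigQuery ML matrix
# factorization model on implicit feedback. All dataset, table, and column
# names below are placeholders.
from google.cloud import bigquery

client = bigquery.Client(project="your-project-id")  # placeholder project

query = """
CREATE OR REPLACE MODEL `your_dataset.product_recommender`
OPTIONS (
  model_type = 'matrix_factorization',
  feedback_type = 'implicit',
  user_col = 'user_id',
  item_col = 'item_id',
  rating_col = 'interaction_score',
  num_factors = 15
)
AS
SELECT user_id, item_id, interaction_score
FROM `your_dataset.user_item_interactions`;
"""

client.query(query).result()  # wait for the training job to finish
```

Predicted recommendations can then be read back with `ML.RECOMMEND` and joined to your activation data.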
21 |
22 | ## Questions? Feedback?
23 | If you have any questions or feedback, please open up a [new issue](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/issues).
--------------------------------------------------------------------------------
/retail/time-series/bqml-demand-forecasting/README.md:
--------------------------------------------------------------------------------
1 | # How to build a time series demand forecasting model using BigQuery ML
2 |
3 | The goal of this repo is to provide an end-to-end solution for forecasting the demand of multiple retail products, using [this notebook](bqml_retail_demand_forecasting.ipynb) to walk through the steps. Learn how to use BigQuery ML to train a time series model on historical sales data of liquor products, and how to visualize the forecasted values in a dashboard. For an overview of the use case, see [Overview of a demand forecasting solution](https://cloud.google.com/architecture/demand-forecasting-overview).
4 |
5 | After completing the notebook, you will know how to:
6 |
7 | * Pre-process time series data into the correct format needed to create the model.
8 | * Train the time series model in BigQuery ML.
9 | * Evaluate the model.
10 | * Make predictions about future demand using the model.
11 | * Create a dashboard to visualize the forecasted demand using Data Studio.
12 |
13 | This solution is intended for data engineers, data scientists, and data analysts
14 | who build machine learning (ML) datasets and models to support business
15 | decisions. It assumes that you have basic knowledge of the following:
16 |
17 | * Machine learning concepts
18 | * Python
19 | * Standard SQL
20 |
21 | ## Dataset
22 |
23 | This tutorial uses the public
24 | [Iowa Liquor Sales](https://console.cloud.google.com/marketplace/product/iowa-department-of-commerce/iowa-liquor-sales)
25 | dataset that is hosted on BigQuery. This dataset contains the
26 | spirits purchase information of Iowa Class "E" liquor licensees from
27 | January 1, 2012 until the present. For more information, see the
28 | [official documentation by the State of Iowa](https://data.iowa.gov/Sales-Distribution/Iowa-Liquor-Sales/m3tr-qhgy).
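
For orientation only (the notebook defines its own queries and table names), daily sales per product can be aggregated from the public table, which is commonly available as `bigquery-public-data.iowa_liquor_sales.sales`:

```python
# Minimal sketch, assuming the standard public table name and columns
# (`date`, `item_description`, `bottles_sold`); the notebook's queries differ.
from google.cloud import bigquery

client = bigquery.Client(project="your-project-id")  # placeholder project

query = """
SELECT
  date,
  item_description,
  SUM(bottles_sold) AS total_bottles_sold
FROM `bigquery-public-data.iowa_liquor_sales.sales`
WHERE date BETWEEN '2016-01-01' AND '2017-12-31'
GROUP BY date, item_description
"""

daily_sales = client.query(query).to_dataframe()
print(daily_sales.head())
```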
29 |
30 | ## Using forecasting for inventory management
31 |
32 | In the retail business, it is important to find a balance when it comes to
33 | inventory: don't stock too much, but don't stock too little. For a large
34 | business, this can mean making decisions about inventory levels for
35 | potentially millions of products.
36 |
37 | To inform inventory level decisions, you can take advantage of historical
38 | data about items purchased by consumers over time. You can use this data about
39 | past customer behavior to make predictions about likely future purchases, which
40 | you can then use to make decisions about how much inventory to stock. In this
41 | scenario, time series forecasting is the right tool to use.
42 |
43 | Time series forecasting depends on the creation of machine learning (ML) models.
44 | If you are a member of a data science team that supports inventory decisions,
45 | this can mean not only producing large numbers of forecasts, but also procuring
46 | and managing the infrastructure to handle model training and prediction. To
47 | save time and effort, you can use BigQuery ML SQL statements to
48 | train, evaluate and deploy models in BigQuery ML instead of
49 | configuring a separate ML infrastructure.
50 |
51 | ## How time series modeling works in BigQuery ML
52 |
53 | When you train a time series model with BigQuery ML, multiple
54 | components are involved, including an
55 | [Autoregressive integrated moving average (ARIMA)](https://en.wikipedia.org/wiki/Autoregressive_integrated_moving_average)
56 | model. The BigQuery ML model creation pipeline uses the following
57 | components, listed in the order that they are run:
58 |
59 | 1. Pre-processing: Automatic cleaning adjustments to the input time
60 | series, which addresses issues like missing values, duplicated timestamps,
61 | spike anomalies, and abrupt level changes in the time series
62 | history.
63 | 1. Holiday effects: Time series modeling in BigQuery ML can also account
64 | for holiday effects. By default, holiday effects modeling is disabled. But
65 | since this data is from the United States and includes at least
66 | one year of daily data, you can also specify an optional `HOLIDAY_REGION`.
67 | With holiday effects enabled, spike and dip anomalies that appear during
68 | holidays will no longer be treated as anomalies. For more information, see
69 | [HOLIDAY_REGION](https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create-time-series#holiday_region).
70 | 1. Seasonal and trend decomposition using the
71 | [Seasonal and Trend decomposition using Loess (STL)](https://otexts.com/fpp2/stl.html)
72 | algorithm. Seasonality extrapolation using the
73 | [double exponential smoothing (ETS)](https://en.wikipedia.org/wiki/Exponential_smoothing#Double_exponential_smoothing)
74 | algorithm.
75 | 1. Trend modeling using the ARIMA model and the
76 | [auto.ARIMA](https://otexts.com/fpp2/arima-r.html)
77 | algorithm for automatic hyper-parameter tuning. In auto.ARIMA, dozens of
78 | candidate models are trained and evaluated in parallel. The model with the lowest
79 | [Akaike information criterion (AIC)](https://wikipedia.org/wiki/Akaike_information_criterion)
80 | is selected as the best model.
81 |
82 | You can use a single SQL statement to train the model to forecast a single
83 | product or to forecast multiple products at the same time. For more
84 | information, see
85 | [The CREATE MODEL statement for time series models](https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create-time-series).
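
As a hedged illustration of that single statement (the notebook's actual options, dataset, and table names may differ, and older BigQuery ML releases use the `'ARIMA'` model type rather than `'ARIMA_PLUS'`), training and forecasting can look like this:

```python
# Minimal sketch, not the notebook's exact code: train one time series model
# per product and forecast 30 days ahead. The `bqmlforecast` dataset name comes
# from this tutorial; the `daily_sales` table and its columns are placeholders.
from google.cloud import bigquery

client = bigquery.Client(project="your-project-id")  # placeholder project

train_query = """
CREATE OR REPLACE MODEL `bqmlforecast.arima_demand_model`
OPTIONS (
  model_type = 'ARIMA_PLUS',
  time_series_timestamp_col = 'date',
  time_series_data_col = 'total_bottles_sold',
  time_series_id_col = 'item_description',  -- one series (and model) per product
  holiday_region = 'US'                     -- enable US holiday effects
)
AS
SELECT date, item_description, total_bottles_sold
FROM `bqmlforecast.daily_sales`;
"""
client.query(train_query).result()  # wait for training to finish

forecast_query = """
SELECT *
FROM ML.FORECAST(MODEL `bqmlforecast.arima_demand_model`,
                 STRUCT(30 AS horizon, 0.9 AS confidence_level));
"""
forecast = client.query(forecast_query).to_dataframe()
print(forecast.head())
```

Because `time_series_id_col` is set, the single statement fits a separate model for every `item_description` value, and `ML.FORECAST` returns forecasts for all of them.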
86 |
87 | ## Costs
88 |
89 | This tutorial uses billable components of Google Cloud Platform (GCP):
90 |
91 | * AI Platform
92 | * BigQuery
93 | * BigQuery ML
94 |
95 | Learn about [BigQuery pricing](https://cloud.google.com/bigquery/pricing), [BigQuery ML
96 | pricing](https://cloud.google.com/bigquery-ml/pricing) and use the [Pricing
97 | Calculator](https://cloud.google.com/products/calculator/)
98 | to generate a cost estimate based on your projected usage.
99 |
100 | ## Set up the GCP environment
101 |
102 | 1. [If you don't want to use an existing project, create a new GCP project.](https://console.cloud.google.com/cloud-resource-manager) When you first create an account, you get a $300 free credit towards your compute/storage costs.
103 | 1. [If you created a new project, make sure that billing is enabled for it.](https://cloud.google.com/billing/docs/how-to/modify-project)
104 | 1. [Enable the AI Platform, AI Platform Notebooks, and Compute Engine APIs in the project you plan to use.](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,notebooks.googleapis.com,compute_component)
105 |
106 | ## Run the notebook
107 |
108 | 1. Open the [bqml_retail_demand_forecasting.ipynb](bqml_retail_demand_forecasting.ipynb) notebook.
109 | 1. Click **Run on AI Platform Notebooks**.
110 | 1. In the GCP console, select the project you want to use to run this notebook.
111 | 1. If you have existing notebook instances in this project, select **Create a new notebook instance**.
112 | 1. For **Instance name**, type `demand-forecasting`.
113 | 1. Click **Create**.
114 |
115 | ## Clean up the GCP environment
116 |
117 | Unless you plan to continue using the resources you created while using this notebook, you should delete them once you are done
118 | to avoid incurring charges to your GCP account. You can either delete the project containing the resources, or
119 | keep the project but delete just those resources.
120 |
121 | Either way, you should remove the resources so you won't be billed for them in
122 | the future. The following sections describe how to delete these resources.
123 |
124 | ### Delete the project
125 |
126 | The easiest way to eliminate billing is to delete the project you created for
127 | the solution.
128 |
129 | 1. In the Cloud Console, go to the [Manage resources page](https://console.cloud.google.com/cloud-resource-manager).
130 | 1. In the project list, select the project that you want to delete, and then click **Delete**.
131 | 1. In the dialog, type the project ID, and then click **Shut down** to delete the project.
132 |
133 | ### Delete the components
134 |
135 | If you don't want to delete the project, delete the billable components of the solution.
136 | These include:
137 |
138 | 1. The `demand-forecasting` AI Platform notebook instance.
139 | 2. The `bqmlforecast` BigQuery dataset.
140 |
141 | ## Disclaimer
142 | This is not an officially supported Google product.
143 |
144 | All files in this folder are under the Apache License, Version 2.0 unless noted otherwise.
145 |
146 | ## Questions? Feedback?
147 | If you have any questions or feedback, please open up a [new issue](https://github.com/GoogleCloudPlatform/analytics-componentized-patterns/issues).
148 |
--------------------------------------------------------------------------------
/retail/time-series/bqml-demand-forecasting/images/bq_export_datastudio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/time-series/bqml-demand-forecasting/images/bq_export_datastudio.png
--------------------------------------------------------------------------------
/retail/time-series/bqml-demand-forecasting/images/datastudio_charts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/time-series/bqml-demand-forecasting/images/datastudio_charts.png
--------------------------------------------------------------------------------
/retail/time-series/bqml-demand-forecasting/images/datastudio_chartsettings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/time-series/bqml-demand-forecasting/images/datastudio_chartsettings.png
--------------------------------------------------------------------------------
/retail/time-series/bqml-demand-forecasting/images/datastudio_filter_item.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/time-series/bqml-demand-forecasting/images/datastudio_filter_item.png
--------------------------------------------------------------------------------
/retail/time-series/bqml-demand-forecasting/images/datastudio_fiveoclockvodka.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/time-series/bqml-demand-forecasting/images/datastudio_fiveoclockvodka.png
--------------------------------------------------------------------------------
/retail/time-series/bqml-demand-forecasting/images/datastudio_missingdata.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/analytics-componentized-patterns/e869574dc67ecabdb913b50bfb058c7e9d3e47e1/retail/time-series/bqml-demand-forecasting/images/datastudio_missingdata.png
--------------------------------------------------------------------------------