├── .gitignore ├── CONTRIBUTING ├── LICENSE ├── README.md ├── clv_automl ├── __init__.py ├── clv_automl.py └── to_predict.csv ├── clv_mle ├── __init__.py ├── clv_ml_engine.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ ├── requires.txt │ └── top_level.txt ├── config.yaml ├── config_tune.json ├── dist │ └── clv_ml_engine-0.1.tar.gz ├── setup.py ├── to_predict.csv ├── to_predict.json └── trainer │ ├── README.md │ ├── __init__.py │ ├── btyd.py │ ├── context.py │ ├── model.py │ └── task.py ├── linear.py ├── notebooks ├── Exploration.ipynb ├── clv_automl.ipynb └── linear_model.ipynb ├── preparation └── sql │ ├── .DS_Store │ ├── common │ ├── benchmark.sql │ ├── clean.sql │ └── features_n_target.sql │ └── dnn │ ├── split_eval.sql │ ├── split_test.sql │ └── split_train.sql ├── requirements.txt └── run ├── airflow ├── dags │ ├── 01_build_train_deploy.py │ └── 02_predict_serve.py ├── gcs_datastore_transform.js ├── requirements.txt └── schema_source.json └── mltrain.sh /.gitignore: -------------------------------------------------------------------------------- 1 | *pyc 2 | data/* 3 | jobs/* 4 | trained/* 5 | .DS_Store 6 | images/ 7 | nul 8 | .ipynb_checkpoints 9 | mykey.json 10 | run/airflow/*cfg 11 | run/airflow/airflow.db 12 | default.profraw 13 | -------------------------------------------------------------------------------- /CONTRIBUTING: -------------------------------------------------------------------------------- 1 | Want to contribute? Great! First, read this page (including the small print at the end). 2 | 3 | ## Before you contribute 4 | Before we can use your code, you must sign the [Google Individual Contributor License Agreement] (https://cla.developers.google.com/about/google-individual) (CLA), which you can do online. The CLA is necessary mainly because you own the copyright to your changes, even after your contribution becomes part of our codebase, so we need your permission to use and distribute your code. We also need to be sure of various other things—for instance that you'll tell us if you know that your code infringes on other people's patents. You don't have to sign the CLA until after you've submitted your code for review and a member has approved it, but you must do it before we can put your code into our codebase. Before you start working on a larger contribution, you should get in touch with us first through the issue tracker with your idea so that we can help out and possibly guide you. Coordinating up front makes it much easier to avoid frustration later on. 5 | 6 | ## Code reviews 7 | All submissions, including submissions by project members, require review. We use Github pull requests for this purpose. 8 | 9 | ## The small print 10 | Contributions made by corporations are covered by a different agreement than the one above, the [Software Grant and Corporate Contributor License Agreement] (https://cla.developers.google.com/about/google-corporate). -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This code supports the three-part solution [Predicting Customer Lifetime Value with Cloud ML Engine](https://cloud.google.com/solutions/machine-learning/clv-prediction-with-offline-training-intro) published on cloud.google.com. 2 | 3 | This code is also used in the updated solution [Predicting Customer Lifetime Value with AutoML Tables ](https://cloud.google.com/solutions/machine-learning/clv-prediction-with-automl-tables) published on cloud.google.com. 4 | 5 | # Customer Lifetime Value Prediction on GCP 6 | 7 | This project shows how to use ML models to predict customer lifetime value in the following context: 8 | - We apply the models using [this data set](http://archive.ics.uci.edu/ml/datasets/Online+Retail) [1]. 9 | - We provide an implementation using a TensorFlow DNN model with batch normalization and dropout. 10 | - We provide an implementation, using the [Lifetimes library](https://github.com/CamDavidsonPilon/lifetimes) in Python, of [statistical models](https://rdrr.io/cran/BTYD/) commonly used in industry to perform lifetime value prediction. 11 | - We also provide an implementation using [AutoML Tables](https://cloud.google.com/automl-tables). 12 | 13 | The project also shows how to deploy a production-ready data processing pipeline for lifetime value prediction on Google Cloud Platform, using BigQuery and DataStore with orchestration provided by Cloud Composer. 14 | 15 | ## Install 16 | 17 | ### install Miniconda 18 | 19 | The code works with python 2/3. 
Using Miniconda2: 20 | 21 | ``` 22 | sudo apt-get install -y git bzip2 23 | wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh 24 | bash Miniconda2-latest-Linux-x86_64.sh -b 25 | export PATH=~/miniconda2/bin:$PATH 26 | ``` 27 | 28 | ### Create dev environment 29 | 30 | ``` 31 | conda create -y -n clv 32 | source activate clv 33 | conda install -y -n clv python=2.7 pip 34 | pip install -r requirements.txt 35 | ``` 36 | 37 | ### Enable the required APIs in your GCP Project 38 | - Cloud Composer API 39 | - Machine Learning API (for TensorFlow / Lifetimes models) 40 | - Dataflow API 41 | - AutoML Tables API (for AutoML Tables models) 42 | 43 | 44 | ### Environment setup 45 | Before running the training and Airflow scripts, you need some environment variables: 46 | 47 | ``` 48 | export PROJECT=$(gcloud config get-value project 2> /dev/null) 49 | export BUCKET=gs://${PROJECT}_data_final 50 | export REGION=us-central1 51 | export DATASET_NAME=ltv 52 | 53 | export COMPOSER_NAME="clv-final" 54 | export COMPOSER_BUCKET_NAME=${PROJECT}_composer_final 55 | export COMPOSER_BUCKET=gs://${COMPOSER_BUCKET_NAME} 56 | export DF_STAGING=${COMPOSER_BUCKET}/dataflow_staging 57 | export DF_ZONE=${REGION}-a 58 | export SQL_MP_LOCATION="sql" 59 | 60 | export LOCAL_FOLDER=$(pwd) 61 | ``` 62 | 63 | 64 | ### Data setup 65 | Creating the BigQuery workspace: 66 | 67 | ``` 68 | gsutil mb -l ${REGION} -p ${PROJECT} ${BUCKET} 69 | gsutil mb -l ${REGION} -p ${PROJECT} ${COMPOSER_BUCKET} 70 | bq --location=US mk --dataset ${PROJECT}:${DATASET_NAME} 71 | ``` 72 | 73 | Create a datastore database as detailed in the [Datastore documentation](https://cloud.google.com/datastore/docs/quickstart) 74 | 75 | 76 | ### Copy the raw dataset 77 | ``` 78 | gsutil cp gs://solutions-public-assets/ml-clv/db_dump.csv ${BUCKET} 79 | gsutil cp ${BUCKET}/db_dump.csv ${COMPOSER_BUCKET} 80 | ``` 81 | 82 | ### Copy the dataset to be predicted. Replace with your own. 83 | ``` 84 | gsutil cp clv_mle/to_predict.json ${BUCKET}/predictions/ 85 | gsutil cp ${BUCKET}/predictions/to_predict.json ${COMPOSER_BUCKET}/predictions/ 86 | gsutil cp clv_mle/to_predict.csv ${BUCKET}/predictions/ 87 | gsutil cp ${BUCKET}/predictions/to_predict.csv ${COMPOSER_BUCKET}/predictions/ 88 | 89 | ``` 90 | 91 | ### Create a service account 92 | Creating a service account is important to make sure that your Cloud Composer instance can perform the required tasks within BigQuery, AutoML Tables, ML Engine, Dataflow, Cloud Storage and Datastore. It is also needed to run training for AutoML locally. 93 | 94 | The following creates a service account called composer@[YOUR-PROJECT-ID].iam.gserviceaccount.com. and assigns the required roles to the service account. 
95 | 96 | ``` 97 | gcloud iam service-accounts create composer --display-name composer --project ${PROJECT} 98 | 99 | gcloud projects add-iam-policy-binding ${PROJECT} \ 100 | --member serviceAccount:composer@${PROJECT}.iam.gserviceaccount.com \ 101 | --role roles/composer.worker 102 | 103 | gcloud projects add-iam-policy-binding ${PROJECT} \ 104 | --member serviceAccount:composer@${PROJECT}.iam.gserviceaccount.com \ 105 | --role roles/bigquery.dataEditor 106 | 107 | gcloud projects add-iam-policy-binding ${PROJECT} \ 108 | --member serviceAccount:composer@${PROJECT}.iam.gserviceaccount.com \ 109 | --role roles/bigquery.jobUser 110 | 111 | gcloud projects add-iam-policy-binding ${PROJECT} \ 112 | --member serviceAccount:composer@${PROJECT}.iam.gserviceaccount.com \ 113 | --role roles/storage.admin 114 | 115 | gcloud projects add-iam-policy-binding ${PROJECT} \ 116 | --member serviceAccount:composer@${PROJECT}.iam.gserviceaccount.com \ 117 | --role roles/ml.developer 118 | 119 | gcloud projects add-iam-policy-binding ${PROJECT} \ 120 | --member serviceAccount:composer@${PROJECT}.iam.gserviceaccount.com \ 121 | --role roles/dataflow.developer 122 | 123 | gcloud projects add-iam-policy-binding ${PROJECT} \ 124 | --member serviceAccount:composer@${PROJECT}.iam.gserviceaccount.com \ 125 | --role roles/compute.viewer 126 | 127 | gcloud projects add-iam-policy-binding ${PROJECT} \ 128 | --member serviceAccount:composer@${PROJECT}.iam.gserviceaccount.com \ 129 | --role roles/storage.objectAdmin 130 | 131 | gcloud projects add-iam-policy-binding ${PROJECT} \ 132 | --member serviceAccount:composer@${PROJECT}.iam.gserviceaccount.com \ 133 | --role='roles/automl.editor' 134 | ``` 135 | 136 | Wait until the service account has all the proper roles setup. 137 | 138 | 139 | ### Download API Key for AutoML Tables 140 | 141 | [Create a service account API key](https://cloud.google.com/iam/docs/creating-managing-service-account-keys) and download the json keyfile to run training for AutoML locally. 142 | 143 | 144 | ### Upload Machine Learning Engine packaged file 145 | If using the TensorFlow or Lifetimes model, do this once. If you make changes to any of the Python files in clv_mle, you need to repeat. 146 | 147 | ``` 148 | cd ${LOCAL_FOLDER}/clv_mle 149 | rm -rf clv_ml_engine.egg-info/ 150 | rm -rf dist 151 | python setup.py sdist 152 | gsutil cp dist/* ${COMPOSER_BUCKET}/code/ 153 | ``` 154 | 155 | 156 | ## [Optional] launch Jupyter 157 | The ```notebooks``` folder contains notebooks for data exploration and modeling with linear models and AutoML Tables. 158 | 159 | ``` 160 | jupyter notebook 161 | ``` 162 | 163 | If you are interested in using Jupyter with Datalab, you can do the following: 164 | 165 | ``` 166 | jupyter nbextension install --py datalab.notebook --sys-prefix 167 | jupyter notebook 168 | ``` 169 | 170 | And enter in the first cell of your Notebook 171 | 172 | ``` 173 | %load_ext google.datalab.kernel 174 | ``` 175 | 176 | ## Train and Tune Models 177 | 178 | ### AutoML Tables 179 | You can train the model using the script clv_automl/clv_automl.py. This takes several parameters. See usage for full params and default values. 180 | 181 | Make sure you have downloaded the json API key. By default this is assumed to be in a file ```mykey.json``` in the same directory as the script. 
182 | 183 | For example: 184 | 185 | ``` 186 | cd ${LOCAL_FOLDER}/clv_automl 187 | python clv_automl.py --project_id [YOUR_PROJECT] 188 | ``` 189 | 190 | ### TensorFlow DNN/Lifetimes 191 | To run training or hypertuning for the non-automl models you can use the mltrain.sh script. It must be run from the top level directory, as in the examples below. For ML Engine jobs you must supply a bucket on GCS. The job data folder will be gs://bucket/data and the job directory will be gs://bucket/jobs. So your data files must already be in gs://bucket/data. If you use ${COMPOSER_BUCKET}, and the DAG has been run at least once, the data files will be present. For DNN models the data should be named 'train.csv', 'eval.csv' and 'test.csv', for probablistic models the file must be 'btyd.csv'. 192 | 193 | For example: 194 | 195 | Train the DNN model on local data: 196 | 197 | ``` 198 | cd ${LOCAL_FOLDER} 199 | gsutil -m cp -r ${COMPOSER_BUCKET}/data . 200 | run/mltrain.sh local data 201 | ``` 202 | 203 | Train the DNN model in a Cloud ML Engine job on data in the ${COMPOSER_BUCKET}: 204 | 205 | ``` 206 | run/mltrain.sh train ${COMPOSER_BUCKET} 207 | ``` 208 | 209 | Run hyperparameter tuning on Cloud ML Engine: 210 | 211 | ``` 212 | run/mltrain.sh tune gs://your-bucket 213 | ``` 214 | 215 | For statistical models: 216 | 217 | ``` 218 | run/mltrain.sh local data --model_type paretonbd_model --threshold_date 2011-08-08 --predict_end 2011-12-12 219 | ``` 220 | 221 | 222 | ## Automation with AirFlow 223 | This code is set to run automatically using Cloud Composer, a google-managed version of Airflow. The following steps describe how to go from your own copy of the data to a deployed model with results exported both in Datastore and BigQuery. 224 | 225 | See [part three of the solution](https://cloud.google.com/solutions/machine-learning/clv-prediction-with-offline-training-deploy) for more details. 226 | 227 | 228 | ### Set up Cloud Composer 229 | 230 | #### Create a composer instance with the service account 231 | This will take a while. This project assumes Airflow 1.9.0, which is the default for Cloud Composer as of March 2019. 232 | 233 | ``` 234 | gcloud composer environments create ${COMPOSER_NAME} \ 235 | --location ${REGION} \ 236 | --zone ${REGION}-f \ 237 | --machine-type n1-standard-1 \ 238 | --service-account=composer@${PROJECT}.iam.gserviceaccount.com 239 | ``` 240 | 241 | #### Make SQL files available to the DAG 242 | There are various ways of calling BigQuery queries. This solutions leverages BigQuery files directly. For them to be accessible by the DAGs, they need to be in the same folder. 243 | 244 | The following command line, copies the entire sql folder as a subfolder in the Airflow dags folder. 245 | 246 | ``` 247 | cd ${LOCAL_FOLDER}/preparation 248 | 249 | gcloud composer environments storage dags import \ 250 | --environment ${COMPOSER_NAME} \ 251 | --source ${SQL_MP_LOCATION} \ 252 | --location ${REGION} \ 253 | --project ${PROJECT} 254 | ``` 255 | 256 | #### Other files 257 | Some files are important when running the DAG. They need to be placed in the composer bucket: 258 | 259 | 1 - The BigQuery schema file used to load data into BigQuery 260 | 261 | ``` 262 | cd ${LOCAL_FOLDER} 263 | gsutil cp ./run/airflow/schema_source.json ${COMPOSER_BUCKET} 264 | ``` 265 | 266 | 2 - A Javascript file used by the Dataflow template for processing. 
267 | 268 | ``` 269 | gsutil cp ./run/airflow/gcs_datastore_transform.js ${COMPOSER_BUCKET} 270 | ``` 271 | 272 | #### Set Composer environment variables 273 | 274 | Region where things happen 275 | 276 | ``` 277 | gcloud composer environments run ${COMPOSER_NAME} --location ${REGION} variables set \ 278 | -- \ 279 | region ${REGION} 280 | ``` 281 | 282 | Staging location for Dataflow 283 | 284 | ``` 285 | gcloud composer environments run ${COMPOSER_NAME} --location ${REGION} variables set \ 286 | -- \ 287 | df_temp_location ${DF_STAGING} 288 | ``` 289 | 290 | Zone where Dataflow should run 291 | 292 | ``` 293 | gcloud composer environments run ${COMPOSER_NAME} --location ${REGION} variables set \ 294 | -- \ 295 | df_zone ${DF_ZONE} 296 | ``` 297 | 298 | BigQuery working dataset 299 | 300 | ``` 301 | gcloud composer environments run ${COMPOSER_NAME} --location ${REGION} variables set \ 302 | -- \ 303 | dataset ${DATASET_NAME} 304 | ``` 305 | 306 | Composer bucket 307 | 308 | ``` 309 | gcloud composer environments run ${COMPOSER_NAME} --location ${REGION} variables set \ 310 | -- \ 311 | composer_bucket_name ${COMPOSER_BUCKET_NAME} 312 | ``` 313 | 314 | #### (for AutoML Tables) Composer environment variables 315 | 316 | AutoML Dataset name 317 | 318 | ``` 319 | gcloud composer environments run ${COMPOSER_NAME} --location ${REGION} variables set \ 320 | -- \ 321 | automl_dataset "clv_solution" 322 | ``` 323 | 324 | AutoML Model name 325 | 326 | ``` 327 | gcloud composer environments run ${COMPOSER_NAME} --location ${REGION} variables set \ 328 | -- \ 329 | automl_model "clv_model" 330 | ``` 331 | 332 | AutoML training budget 333 | 334 | ``` 335 | gcloud composer environments run ${COMPOSER_NAME} --location ${REGION} variables set \ 336 | -- \ 337 | automl_training_budget "1" 338 | ``` 339 | 340 | #### (for AutoML Tables) Import AutoML libraries 341 | ``` 342 | gcloud composer environments storage dags import \ 343 | --environment ${COMPOSER_NAME} \ 344 | --source clv_automl \ 345 | --location ${REGION} \ 346 | --project ${PROJECT} 347 | 348 | gcloud composer environments update ${COMPOSER_NAME} \ 349 | --update-pypi-packages-from-file run/airflow/requirements.txt \ 350 | --location ${REGION} \ 351 | --project ${PROJECT} 352 | ``` 353 | 354 | #### Import DAGs 355 | You need to run this for all your dag files. This solution has two DAGs located in the [run/airflow/dags](run/airflow/dags) folder. 356 | 357 | ``` 358 | gcloud composer environments storage dags import \ 359 | --environment ${COMPOSER_NAME} \ 360 | --source run/airflow/dags/01_build_train_deploy.py \ 361 | --location ${REGION} \ 362 | --project ${PROJECT} 363 | 364 | gcloud composer environments storage dags import \ 365 | --environment ${COMPOSER_NAME} \ 366 | --source run/airflow/dags/02_predict_serve.py \ 367 | --location ${REGION} \ 368 | --project ${PROJECT} 369 | ``` 370 | 371 | 372 | ### Run DAGs 373 | You now should have both DAGs and the SQL files in the Cloud Composer's reserved bucket. Because you probably want to run training and prediction tasks independently, you can run the following script as needed. For more automatic runs (like daily for example), refer to the Airflow documentation to setup your DAGs accordingly. 374 | 375 | Airflow can take various parameters as inputs. 
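Parameters are passed at trigger time with `--conf` (see the trigger commands below) and are available inside a DAG either through Jinja templating or, in Python callables, from the task context. A minimal, hypothetical sketch — not the actual code in `run/airflow/dags/01_build_train_deploy.py`:

```
# Hypothetical PythonOperator callable illustrating how a triggered DAG run
# exposes the --conf JSON payload to its tasks (Airflow 1.9, provide_context=True).
def read_trigger_conf(**context):
    conf = context['dag_run'].conf or {}
    model_type = conf.get('model_type', 'automl')
    threshold_date = conf.get('threshold_date')
    return model_type, threshold_date
```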
376 | 377 | The following are used within the .sql files through the syntax {{ dag_run.conf['PARAMETER-NAME'] }} 378 | 379 | - project: Project ID where the data is located 380 | - dataset: Dataset that is used to write and read the data 381 | - predict_end: When is the final date of the whole sales dataset 382 | - threshold_date: What is the data used to split the data 383 | 384 | Other variables are important as they depend on your environment and are passed directly to the Operators: 385 | 386 | - model_type: Name of the model that you want to use. Should be either 'automl' or one of the options from model.py 387 | - project: Your project id 388 | - dataset: Your dataset id 389 | - threshold_date: Date that separates features from target 390 | - predict_end: End date of the dataset 391 | - model_name: Name of the model saved to AutoML Tables or Machine Learning Engine 392 | - model_version: Name of the version of model_name save to Machine Learning Engine (not used for AutoML Tables) 393 | - tf_version: Tensorflow version to be used 394 | - max_monetary: Monetary cap to discard all customers that spend more than that amount 395 | 396 | ``` 397 | gcloud composer environments run ${COMPOSER_NAME} \ 398 | --project ${PROJECT} \ 399 | --location ${REGION} \ 400 | dags trigger \ 401 | -- \ 402 | build_train_deploy \ 403 | --conf '{"model_type":"automl", "project":"'${PROJECT}'", "dataset":"'${DATASET_NAME}'", "threshold_date":"2011-08-08", "predict_end":"2011-12-12", "model_name":"clv_automl", "model_version":"v1", "tf_version":"1.10", "max_monetary":"15000"}' 404 | ``` 405 | 406 | ``` 407 | gcloud composer environments run ${COMPOSER_NAME} \ 408 | --project ${PROJECT} \ 409 | --location ${REGION} \ 410 | dags trigger \ 411 | -- \ 412 | predict_serve \ 413 | --conf '{"model_name":"clv_automl", "model_version":"v1", "dataset":"'${DATASET_NAME}'"}' 414 | ``` 415 | 416 | 417 | ### Disclaimer: This is not an official Google product 418 | 419 | [1]: Dua, D. and Karra Taniskidou, E. (2017). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science. 420 | -------------------------------------------------------------------------------- /clv_automl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/tensorflow-lifetime-value/51c3980e725ed2da117b9592f8bf9ccbee1fa509/clv_automl/__init__.py -------------------------------------------------------------------------------- /clv_automl/clv_automl.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
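# This module trains a CLV regression model on AutoML Tables end to end:
# create_automl_model() builds a Tables dataset from a BigQuery table, sets
# target_monetary as the label and trains the model, deploy_model() deploys it,
# get_model_evaluation() prints its evaluation metrics, and
# do_batch_prediction() runs a batch prediction between the Cloud Storage URIs
# passed on the command line. See main() for the end-to-end flow.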
14 | 15 | from __future__ import division 16 | from __future__ import print_function 17 | from __future__ import absolute_import 18 | 19 | import argparse 20 | from google.cloud.automl_v1beta1 import AutoMlClient, PredictionServiceClient 21 | import sys 22 | import time 23 | 24 | # parameter defaults 25 | KEY_FILE = 'mykey.json' 26 | LOCATION = 'us-central1' 27 | BQ_DATASET = 'ltv_edu_auto' 28 | BQ_TABLE = 'features_n_target' 29 | AUTOML_DATASET = 'clv_solution' 30 | TARGET_LABEL = 'target_monetary' 31 | AUTOML_MODEL = 'clv_model' 32 | BATCH_GCS_INPUT = 'gs://' 33 | BATCH_GCS_OUTPUT = 'gs://' 34 | 35 | def create_automl_model(client, 36 | project_id, 37 | location, 38 | bq_dataset, 39 | bq_table, 40 | automl_dataset, 41 | automl_model, 42 | training_budget): 43 | """ 44 | Create an AutoML Tables dataset based on the data in BigQuery. 45 | Create a model to predict CLV based on that dataset. 46 | 47 | Returns: 48 | The name of the created model. 49 | """ 50 | location_path = client.location_path(project_id, location) 51 | dataset_display_name = automl_dataset 52 | 53 | # create dataset 54 | create_dataset_response = client.create_dataset( 55 | location_path, 56 | {'display_name': dataset_display_name, 'tables_dataset_metadata': {}}) 57 | print("Creating AutoML Tables dataset...") 58 | dataset_name = create_dataset_response.name 59 | print("Done") 60 | 61 | # import data 62 | dataset_bq_input_uri = 'bq://{}.{}.{}'.format(project_id, bq_dataset, bq_table) 63 | input_config = { 64 | 'bigquery_source': { 65 | 'input_uri': dataset_bq_input_uri}} 66 | 67 | print("Importing data...") 68 | import_data_response = client.import_data(dataset_name, input_config) 69 | while import_data_response.done() is False: 70 | time.sleep(1) 71 | print("Done") 72 | 73 | # get column specs 74 | list_table_specs_response = client.list_table_specs(dataset_name) 75 | table_specs = [s for s in list_table_specs_response] 76 | table_spec_name = table_specs[0].name 77 | list_column_specs_response = client.list_column_specs(table_spec_name) 78 | column_specs = {s.display_name: s for s in list_column_specs_response} 79 | 80 | # update dataset to assign a label 81 | label_column_name = TARGET_LABEL 82 | label_column_spec = column_specs[label_column_name] 83 | label_column_id = label_column_spec.name.rsplit('/', 1)[-1] 84 | update_dataset_dict = { 85 | 'name': dataset_name, 86 | 'tables_dataset_metadata': { 87 | 'target_column_spec_id': label_column_id 88 | } 89 | } 90 | print("Setting label...") 91 | update_dataset_response = client.update_dataset(update_dataset_dict) 92 | print("Done") 93 | 94 | # define the features used to train the model 95 | feat_list = list(column_specs.keys()) 96 | feat_list.remove('target_monetary') 97 | feat_list.remove('customer_id') 98 | feat_list.remove('monetary_btyd') 99 | feat_list.remove('frequency_btyd') 100 | feat_list.remove('frequency_btyd_clipped') 101 | feat_list.remove('monetary_btyd_clipped') 102 | feat_list.remove('target_monetary_clipped') 103 | 104 | # create and train the model 105 | model_display_name = automl_model 106 | model_training_budget = training_budget * 1000 107 | model_dict = { 108 | 'display_name': model_display_name, 109 | 'dataset_id': dataset_name.rsplit('/', 1)[-1], 110 | 'tables_model_metadata': { 111 | 'target_column_spec': column_specs['target_monetary'], 112 | 'input_feature_column_specs': [ 113 | column_specs[x] for x in feat_list], 114 | 'train_budget_milli_node_hours': model_training_budget, 115 | 'optimization_objective': 'MINIMIZE_MAE' 116 | } 117 | } 118 
| print("Creating AutoML Tables model...") 119 | create_model_response = client.create_model(location_path, model_dict) 120 | while create_model_response.done() is False: 121 | time.sleep(10) 122 | print("Done") 123 | 124 | create_model_result = create_model_response.result() 125 | model_name = create_model_result.name 126 | 127 | return model_name 128 | 129 | 130 | def deploy_model(client, model_name): 131 | """ 132 | Deploy model for predictions. 133 | """ 134 | print("Deploying AutoML Tables model...") 135 | deploy_model_response = client.deploy_model(model_name) 136 | api = client.transport._operations_client 137 | while deploy_model_response.done is False: 138 | deploy_model_response = api.get_operation(deploy_model_response.name) 139 | time.sleep(10) 140 | print("Done") 141 | 142 | 143 | def get_model_evaluation(client, model_name): 144 | """ 145 | Get the evaluation stats for the model. 146 | """ 147 | model_evaluations = [e for e in client.list_model_evaluations(model_name)] 148 | model_evaluation = model_evaluations[0] 149 | print("Model evaluation:") 150 | print(model_evaluation) 151 | return model_evaluation 152 | 153 | 154 | def do_batch_prediction(prediction_client, 155 | model_name, 156 | gcs_input_uri, 157 | gcs_output_uri_prefix): 158 | 159 | # Define input source. 160 | batch_prediction_input_source = { 161 | 'gcs_source': { 162 | 'input_uris': [gcs_input_uri] 163 | } 164 | } 165 | # Define output target. 166 | batch_prediction_output_target = { 167 | 'gcs_destination': { 168 | 'output_uri_prefix': gcs_output_uri_prefix 169 | } 170 | } 171 | 172 | # initiate batch predict 173 | print('Performing AutoML Tables batch predict...') 174 | batch_predict_response = prediction_client.batch_predict( 175 | model_name, batch_prediction_input_source, batch_prediction_output_target) 176 | 177 | # Wait until batch prediction is done. 178 | while batch_predict_response.done() is False: 179 | time.sleep(1) 180 | print('Done') 181 | 182 | batch_predict_result = batch_predict_response.result() 183 | return batch_predict_result 184 | 185 | 186 | def create_parser(): 187 | """Initialize command line parser using argparse. 188 | 189 | Returns: 190 | An argparse.ArgumentParser. 
191 | """ 192 | parser = argparse.ArgumentParser() 193 | 194 | # required args 195 | parser.add_argument('--project_id', 196 | help='Project id for project containing BQ data', 197 | default=KEY_FILE, 198 | type=str, 199 | required=True) 200 | 201 | # data and model args 202 | parser.add_argument('--training_budget', 203 | help='Training budget in hours', 204 | default=1, 205 | type=int) 206 | parser.add_argument('--key_file', 207 | help='JSON key file for API access', 208 | default=KEY_FILE, 209 | type=str) 210 | parser.add_argument('--location', 211 | help='GCP region to run', 212 | default=LOCATION, 213 | type=str) 214 | parser.add_argument('--automl_dataset', 215 | help='Name of AutoML dataset', 216 | default=AUTOML_DATASET, 217 | type=str) 218 | parser.add_argument('--automl_model', 219 | help='Name of AutoML model', 220 | default=AUTOML_MODEL, 221 | type=str) 222 | parser.add_argument('--bq_dataset', 223 | help='BigQuery dataset to import from', 224 | default=BQ_DATASET, 225 | type=str) 226 | parser.add_argument('--bq_table', 227 | help='BigQuery table to import from', 228 | default=BQ_TABLE, 229 | type=str) 230 | parser.add_argument('--batch_gcs_input', 231 | help='GCS URI for batch predict CSV', 232 | default=BATCH_GCS_INPUT, 233 | type=str) 234 | parser.add_argument('--batch_gcs_output', 235 | help='GCS URI for batch predict output', 236 | default=BATCH_GCS_OUTPUT, 237 | type=str) 238 | return parser 239 | 240 | 241 | def main(argv=None): 242 | """Create and train the CLV model on AutoML Tables.""" 243 | argv = sys.argv if argv is None else argv 244 | args = create_parser().parse_args(args=argv[1:]) 245 | 246 | # create and configure client 247 | keyfile_name = args.key_file 248 | client = AutoMlClient.from_service_account_file(keyfile_name) 249 | 250 | # create and deploy model 251 | model_name = create_automl_model(client, 252 | args.project_id, 253 | args.location, 254 | args.bq_dataset, 255 | args.bq_table, 256 | args.automl_dataset, 257 | args.automl_model, 258 | args.training_budget) 259 | 260 | # deploy model 261 | deploy_model(client, model_name) 262 | 263 | # get model evaluations 264 | model_evaluation = get_model_evaluation(client, model_name) 265 | 266 | # make predictions 267 | prediction_client = PredictionServiceClient.from_service_account_file( 268 | keyfile_name) 269 | do_batch_prediction(prediction_client, 270 | model_name, 271 | args.batch_gcs_input, 272 | args.batch_gcs_output) 273 | 274 | if __name__ == '__main__': 275 | main() 276 | -------------------------------------------------------------------------------- /clv_automl/to_predict.csv: -------------------------------------------------------------------------------- 1 | customer_id,monetary,recency,frequency,avg_basket_value,avg_basket_size,date_range,cnt_orders,cnt_returns,has_returned 2 | 0123456789,1000.0,10,2,240,4.0,240.0,13.0,1,1 3 | 1234567890,500.0,20,3,250,5.0,250.0,15.0,1,1 -------------------------------------------------------------------------------- /clv_mle/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. -------------------------------------------------------------------------------- /clv_mle/clv_ml_engine.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: clv-ml-engine 3 | Version: 0.1 4 | Summary: A trainer application package for CLV prediction on ML Engine 5 | Home-page: UNKNOWN 6 | Author: UNKNOWN 7 | Author-email: UNKNOWN 8 | License: UNKNOWN 9 | Description: UNKNOWN 10 | Platform: UNKNOWN 11 | -------------------------------------------------------------------------------- /clv_mle/clv_ml_engine.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | clv_ml_engine.egg-info/PKG-INFO 3 | clv_ml_engine.egg-info/SOURCES.txt 4 | clv_ml_engine.egg-info/dependency_links.txt 5 | clv_ml_engine.egg-info/requires.txt 6 | clv_ml_engine.egg-info/top_level.txt 7 | trainer/__init__.py 8 | trainer/btyd.py 9 | trainer/context.py 10 | trainer/model.py 11 | trainer/task.py -------------------------------------------------------------------------------- /clv_mle/clv_ml_engine.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /clv_mle/clv_ml_engine.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | sh 2 | lifetimes==0.9.0.0 3 | numpy==1.14.5 4 | tensorflow==1.10 5 | -------------------------------------------------------------------------------- /clv_mle/clv_ml_engine.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | trainer 2 | -------------------------------------------------------------------------------- /clv_mle/config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
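# Cloud ML Engine training input: request a single "standard" machine type
# for the master node of the training job.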
14 | 15 | trainingInput: 16 | masterType: standard -------------------------------------------------------------------------------- /clv_mle/config_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "trainingInput": { 3 | "scaleTier": "CUSTOM", 4 | "masterType": "complex_model_m", 5 | "hyperparameters": { 6 | "goal": "MINIMIZE", 7 | "hyperparameterMetricTag": "rmse", 8 | "params": [ 9 | { 10 | "parameterName": "learning_rate", 11 | "type": "DOUBLE", 12 | "minValue": "0.0001", 13 | "maxValue": "0.1", 14 | "scaleType": "UNIT_REVERSE_LOG_SCALE" 15 | }, 16 | { 17 | "parameterName": "l1_regularization", 18 | "type": "DOUBLE", 19 | "minValue": "0.0001", 20 | "maxValue": "0.1", 21 | "scaleType": "UNIT_REVERSE_LOG_SCALE" 22 | }, 23 | { 24 | "parameterName": "l2_regularization", 25 | "type": "DOUBLE", 26 | "minValue": "0.0001", 27 | "maxValue": "0.1", 28 | "scaleType": "UNIT_REVERSE_LOG_SCALE" 29 | }, 30 | { 31 | "parameterName": "dropout", 32 | "minValue": 0.01, 33 | "maxValue": 0.99, 34 | "type": "DOUBLE", 35 | "scaleType": "UNIT_REVERSE_LOG_SCALE" 36 | }, 37 | { 38 | "parameterName": "learning_decay_rate", 39 | "minValue": 0.6, 40 | "maxValue": 0.99, 41 | "type": "DOUBLE", 42 | "scaleType": "UNIT_LINEAR_SCALE" 43 | }, 44 | { 45 | "parameterName": "hidden_units", 46 | "type": "CATEGORICAL", 47 | "categoricalValues": ["256 128 64 32", "128 64 32 16", "256 128 64 32 16", "128 64 32 16 16 8"] 48 | }, 49 | { 50 | "parameterName": "batch_size", 51 | "type": "DISCRETE", 52 | "discreteValues": ["5", "10", "15", "20", "25", "30"], 53 | } 54 | ], 55 | "maxTrials": 1000, 56 | "maxParallelTrials": 10 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /clv_mle/dist/clv_ml_engine-0.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/tensorflow-lifetime-value/51c3980e725ed2da117b9592f8bf9ccbee1fa509/clv_mle/dist/clv_ml_engine-0.1.tar.gz -------------------------------------------------------------------------------- /clv_mle/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from setuptools import find_packages 16 | from setuptools import setup 17 | 18 | REQUIRED_PACKAGES = ['sh', 'lifetimes==0.9.0.0', 'numpy==1.14.5', 'tensorflow==1.10'] 19 | 20 | setup( 21 | name='clv_ml_engine', 22 | version='0.1', 23 | install_requires=REQUIRED_PACKAGES, 24 | packages=find_packages(), 25 | include_package_data=True, 26 | description='A trainer application package for CLV prediction on ML Engine' 27 | ) -------------------------------------------------------------------------------- /clv_mle/to_predict.csv: -------------------------------------------------------------------------------- 1 | 0123456789,1000.0,10,2,240,4.0,240.0,13.0,1,1 2 | 1234567890,500.0,20,3,250,5.0,250.0,15.0,1,1 -------------------------------------------------------------------------------- /clv_mle/to_predict.json: -------------------------------------------------------------------------------- 1 | {"customer_id":"abc", "monetary": 1000.0, "recency": 20.0, "frequency": 3.0, "avg_basket_value": 250.0, "avg_basket_size": 5.0, "date_range": 250, "cnt_orders": 15, "cnt_returns": 1, "has_returned": 1} 2 | {"customer_id":"cde", "monetary": 500.0, "recency": 20.0, "frequency": 3.0, "avg_basket_value": 250.0, "avg_basket_size": 5.0, "date_range": 250, "cnt_orders": 15, "cnt_returns": 1, "has_returned": 1} -------------------------------------------------------------------------------- /clv_mle/trainer/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/tensorflow-lifetime-value/51c3980e725ed2da117b9592f8bf9ccbee1fa509/clv_mle/trainer/README.md -------------------------------------------------------------------------------- /clv_mle/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. -------------------------------------------------------------------------------- /clv_mle/trainer/btyd.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Core functions for Probabilistic (BTYD) models.""" 16 | 17 | from __future__ import print_function 18 | from __future__ import absolute_import 19 | 20 | from datetime import datetime 21 | from lifetimes import BetaGeoFitter, ParetoNBDFitter, GammaGammaFitter 22 | import math 23 | import numpy as np 24 | import os 25 | import pandas as pd 26 | import tensorflow as tf 27 | 28 | from .model import PARETO, BGNBD 29 | 30 | PENALIZER_COEF = 0.01 31 | DISCOUNT_RATE = 0.01 32 | 33 | TRAINING_DATA_FILE = 'btyd.csv' 34 | OUTPUT_FILE = 'predictions.csv' 35 | 36 | 37 | def load_data(datapath): 38 | """Loads data from CSV data file. 39 | 40 | Args: 41 | datapath: Location of the training file 42 | Returns: 43 | summary dataframe containing RFM data for btyd models 44 | actuals_df containing additional data columns for calculating error 45 | """ 46 | # Does not used the summary_data_from_transaction_data from the Lifetimes 47 | # library as it wouldn't scale as well. The pre-processing done in BQ instead. 48 | tf.logging.info('Loading data...') 49 | 50 | ft_file = '{0}/{1}'.format(datapath, TRAINING_DATA_FILE) 51 | #[START prob_selec] 52 | df_ft = pd.read_csv(ft_file) 53 | 54 | # Extracts relevant dataframes for RFM: 55 | # - summary has aggregated values before the threshold date 56 | # - actual_df has values of the overall period. 57 | summary = df_ft[['customer_id', 'frequency_btyd', 'recency', 'T', 58 | 'monetary_btyd']] 59 | #[END prob_selec] 60 | summary.columns = ['customer_id', 'frequency', 'recency', 'T', 61 | 'monetary_value'] 62 | summary = summary.set_index('customer_id') 63 | 64 | # additional columns needed for calculating error 65 | actual_df = df_ft[['customer_id', 'frequency_btyd', 'monetary_dnn', 66 | 'target_monetary']] 67 | actual_df.columns = ['customer_id', 'train_frequency', 'train_monetary', 68 | 'act_target_monetary'] 69 | 70 | tf.logging.info('Data loaded.') 71 | 72 | return summary, actual_df 73 | 74 | 75 | def bgnbd_model(summary): 76 | """Instantiate and fit a BG/NBD model. 77 | 78 | Args: 79 | summary: RFM transaction data 80 | Returns: 81 | bgnbd model fit to the data 82 | """ 83 | bgf = BetaGeoFitter(penalizer_coef=PENALIZER_COEF) 84 | bgf.fit(summary['frequency'], summary['recency'], summary['T']) 85 | return bgf 86 | 87 | 88 | def paretonbd_model(summary): 89 | """Instantiate and fit a Pareto/NBD model. 90 | 91 | Args: 92 | summary: RFM transaction data 93 | Returns: 94 | bgnbd model fit to the data 95 | """ 96 | #[START run_btyd] 97 | paretof = ParetoNBDFitter(penalizer_coef=PENALIZER_COEF) 98 | paretof.fit(summary['frequency'], summary['recency'], summary['T']) 99 | return paretof 100 | #[END run_btyd] 101 | 102 | def run_btyd(model_type, data_src, threshold_date, predict_end): 103 | """Run selected BTYD model on data files located in args.data_src. 
104 | 105 | Args: 106 | model_type: model type (PARETO, BGNBD) 107 | data_src: path to data 108 | threshold_date: end date for training data 'YYYY-mm-dd' 109 | predict_end: end date for predictions 'YYYY-mm-dd' 110 | """ 111 | train_end_date = datetime.strptime(threshold_date, '%Y-%m-%d') 112 | predict_end_date = datetime.strptime(predict_end, '%Y-%m-%d') 113 | 114 | # load training transaction data 115 | summary, actual_df = load_data(data_src) 116 | 117 | # train fitter for selected model 118 | tf.logging.info('Fitting model...') 119 | 120 | if model_type == PARETO: 121 | fitter = paretonbd_model(summary) 122 | elif model_type == BGNBD: 123 | fitter = bgnbd_model(summary) 124 | 125 | tf.logging.info('Done.') 126 | 127 | # 128 | # use trained fitter to compute actual vs predicted ltv for each user 129 | # 130 | 131 | # compute the number of days in the prediction period 132 | time_days = (predict_end_date - train_end_date).days 133 | time_months = int(math.ceil(time_days / 30.0)) 134 | 135 | # fit gamma-gamma model 136 | tf.logging.info('Fitting GammaGamma model...') 137 | 138 | ggf = GammaGammaFitter(penalizer_coef=0) 139 | ggf.fit(summary['frequency'], summary['monetary_value']) 140 | 141 | tf.logging.info('Done.') 142 | 143 | ltv, rmse = predict_value(summary, 144 | actual_df, 145 | fitter, 146 | ggf, 147 | time_days, 148 | time_months) 149 | 150 | # output results to csv 151 | output_file = os.path.join(data_src, OUTPUT_FILE) 152 | ltv.to_csv(output_file, index=False) 153 | 154 | # log results 155 | tf.logging.info('BTYD RMSE error for %s model: %.2f', model_type, rmse) 156 | print('RMSE prediction error: %.2f' % rmse) 157 | 158 | 159 | def predict_value(summary, actual_df, fitter, ggf, time_days, time_months): 160 | """Predict lifetime values for customers. 
161 | 162 | Args: 163 | summary: RFM transaction data 164 | actual_df: dataframe containing data fields for customer id, 165 | actual customer values 166 | fitter: lifetimes fitter, previously fit to data 167 | ggf: lifetimes gamma/gamma fitter, already fit to data 168 | time_days: time to predict purchases in days 169 | time_months: time to predict value in months 170 | Returns: 171 | ltv: dataframe with predicted values for each customer, along with actual 172 | values and error 173 | rmse: root mean squared error summed over all customers 174 | """ 175 | # setup dataframe to hold results 176 | ltv = pd.DataFrame(data=np.zeros([actual_df.shape[0], 6]), 177 | columns=['customer_id', 178 | 'actual_total', 179 | 'predicted_num_purchases', 180 | 'predicted_value', 181 | 'predicted_total', 182 | 'error'], dtype=np.float32) 183 | 184 | predicted_num_purchases = fitter.predict(time_days, 185 | summary['frequency'], 186 | summary['recency'], 187 | summary['T']) 188 | 189 | predicted_value = ggf.customer_lifetime_value(fitter, 190 | summary['frequency'], 191 | summary['recency'], 192 | summary['T'], 193 | summary['monetary_value'], 194 | time=time_months, 195 | discount_rate=DISCOUNT_RATE) 196 | 197 | ltv['customer_id'] = actual_df['customer_id'] 198 | ltv['actual_total'] = actual_df['act_target_monetary'] 199 | ltv['predicted_num_purchases'] = predicted_num_purchases.values 200 | ltv['predicted_value'] = predicted_value.values 201 | ltv['predicted_total'] = actual_df['train_monetary'] + ltv['predicted_value'] 202 | ltv['error'] = ltv['actual_total'] - ltv['predicted_total'] 203 | 204 | mse = pd.Series.sum(ltv['error'] * ltv['error']) / ltv.shape[0] 205 | rmse = math.sqrt(mse) 206 | 207 | return ltv, rmse 208 | -------------------------------------------------------------------------------- /clv_mle/trainer/context.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Feature definition and processing.""" 16 | 17 | from tensorflow import feature_column as tfc 18 | from six import iteritems 19 | 20 | class CLVFeatures(object): 21 | """Encapulates the features for Estimator models.""" 22 | 23 | # Columns 24 | HEADERS = ['customer_id', 'monetary_dnn', 'monetary_btyd', 'frequency_dnn', 25 | 'frequency_btyd', 'recency', 'T', 'time_between', 26 | 'avg_basket_value', 'avg_basket_size', 'cnt_returns', 27 | 'has_returned', 'frequency_btyd_clipped', 'monetary_btyd_clipped', 28 | 'target_monetary_clipped', 'target_monetary'] 29 | 30 | HEADERS_DEFAULT = [[''], [0.0], [0.0], [0], 31 | [0], [0], [0], [0.0], 32 | [0.0], [0.0], [0], 33 | [-1], [0], [0.0], 34 | [0.0], [0.0]] 35 | 36 | NUMERICS = { 37 | 'monetary_dnn': [], 38 | 'recency': [], 39 | 'frequency_dnn': [], 40 | 'T': [], 41 | 'time_between': [], 42 | 'avg_basket_value': [], 43 | 'avg_basket_size': [], 44 | 'cnt_returns': []} 45 | 46 | CATEGORICALS_W_LIST = { 47 | 'has_returned': [0, 1]} 48 | 49 | # Columns to cross (name, bucket_size, boundaries) 50 | # Note that boundaries is None if we have all the values. This will helps 51 | # using between categorical_column_with_identity vs bucketized_column 52 | # max(recency)=33383 53 | # max(frequency) = 300 54 | # max(monetary) = 3809291.2 55 | CROSSED = [] 56 | 57 | KEY = 'customer_id' 58 | 59 | UNUSED = [KEY, 'monetary_btyd', 'frequency_btyd', 'frequency_btyd_clipped', 60 | 'monetary_btyd_clipped', 'target_monetary_clipped'] 61 | 62 | TARGET_NAME = 'target_monetary' 63 | 64 | def __init__(self, ignore_crosses=False, is_dnn=None): 65 | """Initialize CLVFeatures. 66 | 67 | Args: 68 | ignore_crosses: Whether to apply crosses or not 69 | is_dnn: Whether the model is a dnn one or not. 70 | """ 71 | if not is_dnn: 72 | return 73 | 74 | self.ignore_crosses = ignore_crosses 75 | 76 | # Initializes features names that will be used. 77 | (self.headers, self.numerics_names, 78 | self.categoricals_names) = self._keep_used() 79 | 80 | # Creates the base continuous and categorical features 81 | self.continuous, self.categorical = self._make_base_features() 82 | 83 | # Creates the crossed features for both wide and deep. 84 | if not self.ignore_crosses: 85 | self.crossed_for_wide, self.crossed_for_deep = self._make_crossed() 86 | 87 | def _keep_used(self): 88 | """Returns only the used headers names. 89 | 90 | Returns: 91 | used_headers names 92 | """ 93 | headers = [h for h in self.HEADERS if h not in self.UNUSED] 94 | numerics_names = { 95 | k: v for k, v in iteritems(self.NUMERICS) 96 | if (k not in self.UNUSED) and (k != self.TARGET_NAME) 97 | } 98 | categoricals_names = { 99 | k: v for k, v in iteritems(self.CATEGORICALS_W_LIST) 100 | if k not in self.UNUSED 101 | } 102 | 103 | return headers, numerics_names, categoricals_names 104 | 105 | def get_key(self): 106 | return self.KEY 107 | 108 | def get_used_headers(self, with_key=False, with_target=False): 109 | """Returns headers that are useful to the model. 110 | 111 | Possibly includes the key and the target. 
112 | 113 | Args: 114 | with_key: include KEY column 115 | with_target: include target column 116 | Returns: 117 | used_headers 118 | """ 119 | used_headers = [h for h in self.headers if h != self.TARGET_NAME] 120 | 121 | if with_key: 122 | used_headers.insert(0, self.KEY) 123 | 124 | if with_target: 125 | used_headers.append(self.TARGET_NAME) 126 | 127 | return used_headers 128 | 129 | def get_defaults(self, headers_names=None, with_key=False): 130 | """Returns default values based on indexes taken from the headers to keep. 131 | 132 | If key and target are to keep, it is decided in get_used_headers. 133 | 134 | Args: 135 | headers_names: column header names 136 | with_key: include KEY column 137 | Returns: 138 | default values 139 | """ 140 | if headers_names is None: 141 | headers_names = self.get_used_headers(with_key) 142 | 143 | keep_indexes = [self.HEADERS.index(n) for n in headers_names] 144 | return [self.HEADERS_DEFAULT[i] for i in keep_indexes] 145 | 146 | def get_all_names(self): 147 | return self.HEADERS 148 | 149 | def get_all_defaults(self): 150 | return self.HEADERS_DEFAULT 151 | 152 | def get_unused(self): 153 | return self.UNUSED 154 | 155 | def get_target_name(self): 156 | return self.TARGET_NAME 157 | 158 | ##################### 159 | # Features creation # 160 | ##################### 161 | # dense columns = numeric columns + embedding columns 162 | # categorical columns = vocabolary list columns + bucketized columns 163 | # sparse columns = hashed categorical columns + crossed columns 164 | # categorical columns => indicator columns 165 | # deep columns = dense columns + indicator columns 166 | # wide columns = categorical columns + sparse columns 167 | 168 | def _make_base_features(self): 169 | """Make base features. 170 | 171 | Returns: 172 | base features 173 | """ 174 | # Continuous columns 175 | continuous = {key_name: tfc.numeric_column(key_name) 176 | for key_name in self.numerics_names.keys()} 177 | 178 | # Categorical columns (can contain all categorical_column_with_*) 179 | categorical = { 180 | key_name: tfc.categorical_column_with_vocabulary_list( 181 | key=key_name, 182 | vocabulary_list=voc) 183 | for key_name, voc in self.categoricals_names.items() 184 | } 185 | 186 | return continuous, categorical 187 | 188 | def get_base_features(self): 189 | # Could create bucket or/and hash here before return 190 | return self.continuous, self.categorical 191 | 192 | def _prepare_for_crossing(self, key_name, num_bck, boundaries): 193 | """Prepares features for crossing. 194 | 195 | Whether they're continuous or categorical matters, and 196 | whether we have the whole dictionary or not. 197 | 198 | Args: 199 | key_name: A string representing the name of the feature 200 | num_bck: How many buckets to use when we know # of distinct values 201 | boundaries: Range used for boundaries when bucketinizing 202 | Returns: 203 | key name 204 | """ 205 | key = None 206 | if key_name in self.continuous.keys(): 207 | if boundaries is not None: 208 | # Note that cont[key_name] is a source column 209 | key = tfc.bucketized_column(self.continuous[key_name], boundaries) 210 | else: 211 | # We can count all the values in the dataset. Ex: boolean. 212 | # Note that key_name is a string 213 | key = tfc.categorical_column_with_identity(key_name, num_bck) 214 | elif key_name in self.categorical.keys(): 215 | # It is also possible to use the categorical column instead of the 216 | # column name. 
i.e key = cat[key_name] 217 | key = key_name 218 | else: 219 | key = key_name 220 | 221 | return key 222 | 223 | def _make_crossed(self): 224 | """Makes crossed features for both Wide or Deep network. 225 | 226 | Returns: 227 | Tuple (crossed columns for Wide, its dimension) 228 | """ 229 | # Crossed columns 230 | f_crossed_for_wide = [] 231 | f_crossed_for_deep = [] 232 | for to_cross in self.CROSSED: 233 | keys = [] 234 | bck_size = 1 235 | for (key, bck, bnd) in to_cross: 236 | keys.append(self._prepare_for_crossing(key, bck, bnd)) 237 | bck_size *= bck 238 | 239 | # We can't go crazy on the dim for crossed_column so use a min 240 | # **0.25 is a rule of thumb for bucket size vs dimension 241 | t_crossed = tfc.crossed_column(keys, min(bck_size, 10000)) 242 | t_dimension = int(bck_size**0.25) 243 | f_crossed_for_wide.append(t_crossed) 244 | f_crossed_for_deep.append(tfc.embedding_column(t_crossed, t_dimension)) 245 | 246 | return f_crossed_for_wide, f_crossed_for_deep 247 | 248 | def get_wide_features(self): 249 | """Creates wide features. 250 | 251 | Sparse (ie. hashed categorical + crossed) + categorical. 252 | 253 | Returns: 254 | A list of wide features 255 | """ 256 | # Base sparse (ie categorical) feature columns + crossed 257 | wide_features = self.categorical.values() 258 | 259 | if not self.ignore_crosses: 260 | wide_features += self.crossed_for_wide 261 | 262 | return wide_features 263 | 264 | def get_deep_features(self, with_continuous=True): 265 | """Creates deep features: dense(ie numeric + embedding) + indicator. 266 | 267 | Args: 268 | with_continuous: include continuous columns 269 | Returns: 270 | features for DNN 271 | """ 272 | # Multi-hot representation of categories. We know all the values so use 273 | # indicator_column. If the vocabulary could be bigger in the outside 274 | # world, we'd use embedding_column 275 | deep_features = [tfc.indicator_column(f) for f in self.categorical.values()] 276 | 277 | # Creates deep feature lists 278 | if with_continuous: 279 | deep_features += self.continuous.values() 280 | 281 | if not self.ignore_crosses: 282 | deep_features += self.crossed_for_deep 283 | 284 | return deep_features 285 | -------------------------------------------------------------------------------- /clv_mle/trainer/model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
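# Note on the constants defined below: MODEL_TYPES is unpacked positionally,
# so CANNED_DEEP = 'DNNRegressor', LINEAR = 'Linear', DEEP = 'dnn_model',
# PARETO = 'paretonbd_model', BGNBD = 'bgnbd_model'.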
14 | 15 | """DNN Estimator model code.""" 16 | 17 | from __future__ import print_function 18 | from __future__ import absolute_import 19 | 20 | import tensorflow as tf 21 | 22 | from .context import CLVFeatures 23 | 24 | # Possible estimators: 25 | # Canned: https://www.tensorflow.org/api_docs/python/tf/estimator or custom ones 26 | CANNED_MODEL_TYPES = ['DNNRegressor', 'Linear'] 27 | MODEL_TYPES = CANNED_MODEL_TYPES[:] + ['dnn_model', 'paretonbd_model', 28 | 'bgnbd_model'] 29 | CANNED_DEEP, LINEAR, DEEP, PARETO, BGNBD = MODEL_TYPES 30 | PROBABILISTIC_MODEL_TYPES = [PARETO, BGNBD] 31 | 32 | # Either a custom function or a canned estimator name 33 | # Used as default it not passed as an argument when calling the task 34 | MODEL_TYPE = DEEP 35 | 36 | # Features 37 | clvf = CLVFeatures( 38 | ignore_crosses=True, is_dnn=MODEL_TYPE not in PROBABILISTIC_MODEL_TYPES) 39 | 40 | 41 | def parse_csv(csv_row): 42 | """Parse CSV data row. 43 | 44 | tf.data.Dataset.map takes a function as an input so need to call parse_fn 45 | using map(lamba x: parse_fn(x)) or do def parse_fn and return the function 46 | as we do here. 47 | Builds a pair (feature dictionary, label) tensor for each example. 48 | 49 | Args: 50 | csv_row: one example as a csv row coming from the Dataset.map() 51 | Returns: 52 | features and targets 53 | """ 54 | columns = tf.decode_csv(csv_row, record_defaults=clvf.get_all_defaults()) 55 | features = dict(zip(clvf.get_all_names(), columns)) 56 | 57 | # Remove the headers that we don't use 58 | for column_name in clvf.get_unused(): 59 | features.pop(column_name) 60 | 61 | target = features.pop(clvf.get_target_name()) 62 | 63 | return features, target 64 | 65 | 66 | def dataset_input_fn(data_folder, prefix=None, mode=None, params=None, count=None): 67 | """Creates a dataset reading example from filenames. 68 | 69 | Args: 70 | data_folder: Location of the files finishing with a '/' 71 | prefix: Start of the file names 72 | mode: tf.estimator.ModeKeys(TRAIN, EVAL) 73 | params: hyperparameters 74 | Returns: 75 | features and targets 76 | """ 77 | shuffle = True if mode == tf.estimator.ModeKeys.TRAIN else False 78 | 79 | # Read CSV files into a Dataset 80 | filenames = tf.matching_files('{}{}*.csv'.format(data_folder, prefix)) 81 | dataset = tf.data.TextLineDataset(filenames) 82 | 83 | # Parse the record into tensors. 
84 | dataset = dataset.map(parse_csv) 85 | 86 | # Shuffle the dataset 87 | if shuffle: 88 | dataset = dataset.shuffle(buffer_size=params.buffer_size) 89 | 90 | # Repeat the input indefinitely if count is None 91 | dataset = dataset.repeat(count=count) 92 | 93 | # Generate batches 94 | dataset = dataset.batch(params.batch_size) 95 | 96 | # Create a one-shot iterator 97 | iterator = dataset.make_one_shot_iterator() 98 | 99 | # Get batch X and y 100 | features, target = iterator.get_next() 101 | 102 | return features, target 103 | 104 | 105 | def read_train(data_folder, params): 106 | """Returns a shuffled dataset for training.""" 107 | return dataset_input_fn( 108 | data_folder=data_folder, 109 | prefix='train', 110 | params=params, 111 | mode=tf.estimator.ModeKeys.TRAIN) 112 | 113 | 114 | def read_eval(data_folder, params): 115 | """Returns a dataset for evaluation.""" 116 | return dataset_input_fn(data_folder=data_folder, 117 | prefix='eval', 118 | params=params) 119 | 120 | 121 | def read_test(data_folder, params): 122 | """Returns a dataset for test.""" 123 | return dataset_input_fn(data_folder=data_folder, 124 | prefix='test', 125 | params=params, 126 | count=1) 127 | 128 | ##################### 129 | # Model Definitions # 130 | ##################### 131 | def dnn_model(features, mode, params): 132 | """Creates a DNN regressor model. 133 | 134 | Args: 135 | features: list of feature_columns 136 | mode: tf.estimator.ModeKeys(TRAIN, EVAL) 137 | params: hyperparameters 138 | 139 | Returns: 140 | output tensor 141 | """ 142 | # Make features 143 | feature_columns = clvf.get_deep_features() 144 | 145 | # Creates the input layers from the features. 146 | h = tf.feature_column.input_layer(features=features, 147 | feature_columns=feature_columns) 148 | 149 | # Loops through the layers. 150 | for size in params.hidden_units: 151 | h = tf.layers.dense(h, size, activation=None) 152 | h = tf.layers.batch_normalization(h, training=( 153 | mode == tf.estimator.ModeKeys.TRAIN)) 154 | h = tf.nn.relu(h) 155 | if (params.dropout is not None) and (mode == tf.estimator.ModeKeys.TRAIN): 156 | h = tf.layers.dropout(h, params.dropout) 157 | 158 | logits = tf.layers.dense(h, 1, activation=None) 159 | return logits 160 | 161 | 162 | def model_fn(features, labels, mode, params): 163 | """Model function for custom Estimator. 164 | 165 | Args: 166 | features: given by dataset_input_fn() tuple 167 | labels: given by dataset_input_fn() tuple 168 | mode: given when calling the estimator.train/predict/evaluate function 169 | params: hyperparameters 170 | Returns: 171 | EstimatorSpec that can be used by tf.estimator.Estimator. 172 | """ 173 | # Build the dnn model and get output logits 174 | logits = dnn_model(features, mode, params) 175 | 176 | # Reshape output layer to 1-dim Tensor to return predictions 177 | output = tf.squeeze(logits) 178 | 179 | # Returns an estimator spec for PREDICT. 180 | if mode == tf.estimator.ModeKeys.PREDICT: 181 | 182 | #[START prediction_output_format] 183 | predictions = { 184 | 'customer_id': tf.squeeze(features[clvf.get_key()]), 185 | 'predicted_monetary': output 186 | } 187 | export_outputs = { 188 | 'predictions': tf.estimator.export.PredictOutput(predictions) 189 | } 190 | 191 | return tf.estimator.EstimatorSpec(mode=mode, 192 | predictions=predictions, 193 | export_outputs=export_outputs) 194 | #[END prediction_output_format] 195 | 196 | # Calculates loss using mean squared error between the given labels 197 | # and the calculated output. 
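  # i.e. loss = mean((labels - output)^2); the 'rmse' metric reported below
  # is the square root of this same quantity.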
198 | loss = tf.losses.mean_squared_error(labels, output) 199 | 200 | # Create Optimizer and thhe train operation 201 | optimizer = get_optimizer(params) 202 | 203 | # add update ops for batch norm stats 204 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 205 | with tf.control_dependencies(update_ops): 206 | train_op = optimizer.minimize(loss=loss, 207 | global_step=tf.train.get_global_step()) 208 | 209 | # Root mean square error eval metric 210 | eval_metric_ops = { 211 | 'rmse': tf.metrics.root_mean_squared_error(labels, output) 212 | } 213 | 214 | # Returns an estimator spec for EVAL and TRAIN modes. 215 | return tf.estimator.EstimatorSpec(mode=mode, 216 | loss=loss, 217 | train_op=train_op, 218 | eval_metric_ops=eval_metric_ops) 219 | 220 | 221 | def rmse_evaluator(labels, predictions): 222 | """Metric for RMSE. 223 | 224 | Args: 225 | labels: Truth provided by the estimator when adding the metric 226 | predictions: Predicted values. Provided by the estimator silently 227 | Returns: 228 | metric_fn that can be used to add the metrics to an existing Estimator 229 | """ 230 | pred_values = predictions['predictions'] 231 | return {'rmse': tf.metrics.root_mean_squared_error(labels, pred_values)} 232 | 233 | 234 | def get_learning_rate(params): 235 | """Get learning rate given hyperparams. 236 | 237 | Args: 238 | params: hyperparameters 239 | 240 | Returns: 241 | learning_rate tensor if params.learning_rate_decay, 242 | else a constant. 243 | """ 244 | if params.learning_rate_decay: 245 | global_step = tf.train.get_global_step() 246 | learning_rate = tf.train.exponential_decay( 247 | learning_rate=params.learning_rate, 248 | global_step=global_step, 249 | decay_steps=params.checkpoint_steps, 250 | decay_rate=params.learning_decay_rate, 251 | staircase=True 252 | ) 253 | else: 254 | learning_rate = params.learning_rate 255 | return learning_rate 256 | 257 | 258 | def get_optimizer(params): 259 | """Get optimizer given hyperparams. 260 | 261 | Args: 262 | params: hyperparameters 263 | 264 | Returns: 265 | optimizer object 266 | 267 | Raises: 268 | ValueError: if params.optimizer is not supported. 269 | """ 270 | if params.optimizer == 'ProximalAdagrad': 271 | optimizer = tf.train.ProximalAdagradOptimizer( 272 | learning_rate=get_learning_rate(params), 273 | l1_regularization_strength=params.l1_regularization, 274 | l2_regularization_strength=params.l2_regularization 275 | ) 276 | elif params.optimizer == 'SGD': 277 | optimizer = tf.train.GradientDescentOptimizer(get_learning_rate(params)) 278 | elif params.optimizer == 'Adam': 279 | optimizer = tf.train.AdamOptimizer(learning_rate=get_learning_rate(params)) 280 | elif params.optimizer == 'RMSProp': 281 | optimizer = tf.train.RMSPropOptimizer( 282 | learning_rate=get_learning_rate(params)) 283 | else: 284 | raise ValueError('Invalid optimizer: %s' % params.optimizer) 285 | return optimizer 286 | 287 | 288 | def get_estimator(estimator_name, config, params, model_dir): 289 | """Return one of the TF-provided canned estimators defined by MODEL_TYPE. 
290 | 291 | Args: 292 | estimator_name: estimator model type 293 | config: run config 294 | params: hyperparams 295 | model_dir: model directory 296 | 297 | Returns: 298 | Estimator object 299 | """ 300 | print('-- Running training with estimator {} --'.format(estimator_name)) 301 | 302 | if estimator_name not in CANNED_MODEL_TYPES: 303 | estimator = tf.estimator.Estimator(model_fn=model_fn, 304 | config=config, 305 | params=params, 306 | model_dir=model_dir) 307 | else: 308 | if estimator_name == CANNED_DEEP: 309 | estimator = tf.estimator.DNNRegressor( 310 | feature_columns=clvf.get_deep_features(), 311 | hidden_units=params.hidden_units, 312 | config=config, 313 | model_dir=model_dir, 314 | optimizer=lambda: get_optimizer(params), 315 | batch_norm=True, 316 | dropout=params.dropout) 317 | else: 318 | estimator = tf.estimator.LinearRegressor( 319 | feature_columns=clvf.get_wide_features(), 320 | config=config, 321 | model_dir=model_dir) 322 | 323 | # Add RMSE for metric for canned estimators 324 | estimator = tf.contrib.estimator.add_metrics(estimator, rmse_evaluator) 325 | return estimator 326 | -------------------------------------------------------------------------------- /clv_mle/trainer/task.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Entry point for CMLE jobs for CLV.""" 16 | 17 | from __future__ import division 18 | from __future__ import print_function 19 | from __future__ import absolute_import 20 | 21 | import sys 22 | 23 | import argparse 24 | import json 25 | import os 26 | import shutil 27 | import tensorflow as tf 28 | 29 | from .btyd import run_btyd 30 | from .context import CLVFeatures 31 | from .model import get_estimator, read_train, read_eval, read_test 32 | from .model import MODEL_TYPE, MODEL_TYPES, PROBABILISTIC_MODEL_TYPES 33 | 34 | # Training defaults 35 | 36 | # 100000 is the approximate size of our training set (to nearest 1000). 37 | #[START hyperparams] 38 | TRAIN_SIZE = 100000 39 | NUM_EPOCHS = 70 40 | BATCH_SIZE = 5 41 | NUM_EVAL = 20 42 | 43 | LEARNING_DECAY_RATE = 0.7 44 | HIDDEN_UNITS = '128 64 32 16' 45 | LEARNING_RATE = 0.00135 46 | L1_REGULARIZATION = 0.0216647 47 | L2_REGULARIZATION = 0.0673949 48 | DROPOUT = 0.899732 49 | SHUFFLE_BUFFER_SIZE = 10000 50 | #[END hyperparams] 51 | # TRAIN_SIZE = 100000 52 | # NUM_EPOCHS = 70 53 | # BATCH_SIZE = 20 54 | # NUM_EVAL = 20 55 | # HIDDEN_UNITS = '128 64 32 16' 56 | # LEARNING_RATE = 0.096505 57 | # L1_REGULARIZATION = 0.0026019 58 | # L2_REGULARIZATION = 0.0102146 59 | # DROPOUT = 0.843251 60 | # SHUFFLE_BUFFER_SIZE = 10000 61 | 62 | 63 | def create_parser(): 64 | """Initialize command line parser using arparse. 65 | 66 | Returns: 67 | An argparse.ArgumentParser. 
68 | """ 69 | parser = argparse.ArgumentParser() 70 | 71 | parser.add_argument('--model_type', 72 | help='Model type to train on', 73 | choices=MODEL_TYPES, 74 | default=MODEL_TYPE) 75 | 76 | parser.add_argument('--job-dir', type=str, required=True) 77 | parser.add_argument('--data-src', type=str, required=True) 78 | 79 | # The following parameters are required for BTYD. 80 | parser.add_argument('--predict_end', type=str, required=False, 81 | help='Predict end date YYYY-mm-dd') 82 | parser.add_argument('--threshold_date', type=str, required=False, 83 | help='Threshold date YYYY-mm-dd') 84 | 85 | # hyper params 86 | parser.add_argument('--hidden_units', 87 | help='List of hidden units per fully connected layer.', 88 | default=HIDDEN_UNITS, 89 | type=str) 90 | parser.add_argument('--learning_rate', 91 | help='Learning rate for the optimizer', 92 | default=LEARNING_RATE, 93 | type=float) 94 | parser.add_argument('--learning_rate_decay', 95 | type=str, 96 | help='Use learning rate decay [True|False]', 97 | default='True') 98 | parser.add_argument('--learning_decay_rate', 99 | help='Learning decay rate', 100 | type=float, 101 | default=LEARNING_DECAY_RATE) 102 | parser.add_argument('--train_size', 103 | help='(Approximate) size of training set', 104 | default=TRAIN_SIZE, 105 | type=int) 106 | parser.add_argument('--batch_size', 107 | help='Number of input records used per batch', 108 | default=BATCH_SIZE, 109 | type=int) 110 | parser.add_argument('--buffer_size', 111 | help='Size of the buffer for training shuffle.', 112 | default=SHUFFLE_BUFFER_SIZE, 113 | type=float) 114 | parser.add_argument('--train_set_size', 115 | help='Number of samples on the train dataset.', 116 | type=int) 117 | parser.add_argument('--l1_regularization', 118 | help='L1 Regularization (for ProximalAdagrad)', 119 | type=float, 120 | default=L1_REGULARIZATION) 121 | parser.add_argument('--l2_regularization', 122 | help='L2 Regularization (for ProximalAdagrad)', 123 | type=float, 124 | default=L2_REGULARIZATION) 125 | parser.add_argument('--dropout', 126 | help='Dropout probability, 0.0 = No dropout layer', 127 | type=float, 128 | default=DROPOUT) 129 | parser.add_argument('--hypertune', 130 | action='store_true', 131 | help='Perform hyperparam tuning', 132 | default=False) 133 | parser.add_argument('--optimizer', 134 | help='Optimizer: [Adam, ProximalAdagrad, SGD, RMSProp]', 135 | type=str, 136 | default='ProximalAdagrad') 137 | parser.add_argument('--num_epochs', 138 | help='Number of epochs', 139 | default=NUM_EPOCHS, 140 | type=int) 141 | parser.add_argument('--ignore_crosses', 142 | action='store_true', 143 | default=False, 144 | help='Whether to ignore crosses (linear model only).') 145 | parser.add_argument('--verbose-logging', 146 | action='store_true', 147 | default=False, 148 | help='Turn on debug logging') 149 | parser.add_argument('--labels', 150 | type=str, 151 | default='', 152 | help='Labels for job') 153 | parser.add_argument('--resume', 154 | action='store_true', 155 | default=False, 156 | help='Resume training on saved model.') 157 | return parser 158 | 159 | 160 | def csv_serving_input_fn(): 161 | """Defines how the model gets exported and the required prediction inputs. 162 | 163 | Required to have a saved_model.pdtxt file that can be used for prediction. 164 | 165 | Returns: 166 | ServingInputReceiver for exporting model. 
167 | """ 168 | #[START csv_serving_fn] 169 | clvf = CLVFeatures(ignore_crosses=True, 170 | is_dnn=MODEL_TYPE not in PROBABILISTIC_MODEL_TYPES) 171 | used_headers = clvf.get_used_headers(with_key=True, with_target=False) 172 | default_values = clvf.get_defaults(used_headers) 173 | 174 | rows_string_tensor = tf.placeholder(dtype=tf.string, shape=[None], 175 | name='csv_rows') 176 | receiver_tensor = {'csv_rows': rows_string_tensor} 177 | 178 | row_columns = tf.expand_dims(rows_string_tensor, -1) 179 | columns = tf.decode_csv(row_columns, record_defaults=default_values) 180 | 181 | features = dict(zip(used_headers, columns)) 182 | 183 | return tf.estimator.export.ServingInputReceiver(features, receiver_tensor) 184 | #[END csv_serving_fn] 185 | 186 | 187 | def main(argv=None): 188 | """Run the CLV model.""" 189 | argv = sys.argv if argv is None else argv 190 | args = create_parser().parse_args(args=argv[1:]) 191 | 192 | # Set logging mode 193 | tf.logging.set_verbosity(tf.logging.INFO) 194 | 195 | # execute non-estimator models 196 | if args.model_type in PROBABILISTIC_MODEL_TYPES: 197 | run_btyd(args.model_type, args.data_src, args.threshold_date, 198 | args.predict_end) 199 | return 200 | 201 | if args.hypertune: 202 | # if tuning, join the trial number to the output path 203 | config = json.loads(os.environ.get('TF_CONFIG', '{}')) 204 | trial = config.get('task', {}).get('trial', '') 205 | model_dir = os.path.join(args.job_dir, trial) 206 | else: 207 | model_dir = args.job_dir 208 | 209 | print('Running training with model {}'.format(args.model_type)) 210 | 211 | # data path 212 | data_folder = '{}/'.format(args.data_src) 213 | 214 | # Calculate train steps and checkpoint steps based on approximate 215 | # training set size, batch size, and requested number of training 216 | # epochs. 
217 | train_steps = (args.train_size/args.batch_size) * args.num_epochs 218 | checkpoint_steps = int((args.train_size/args.batch_size) * ( 219 | args.num_epochs/NUM_EVAL)) 220 | 221 | # create RunConfig 222 | config = tf.estimator.RunConfig( 223 | save_checkpoints_steps=checkpoint_steps 224 | ) 225 | 226 | hidden_units = [int(n) for n in args.hidden_units.split()] 227 | 228 | # Hyperparameters 229 | params = tf.contrib.training.HParams( 230 | num_epochs=args.num_epochs, 231 | train_steps=train_steps, 232 | batch_size=args.batch_size, 233 | hidden_units=hidden_units, 234 | learning_rate=args.learning_rate, 235 | ignore_crosses=args.ignore_crosses, 236 | buffer_size=args.buffer_size, 237 | learning_rate_decay=( 238 | args.learning_rate_decay == 'True'), 239 | learning_decay_rate=args.learning_decay_rate, 240 | l1_regularization=args.l1_regularization, 241 | l2_regularization=args.l2_regularization, 242 | optimizer=args.optimizer, 243 | dropout=( 244 | None if args.dropout == 0.0 else args.dropout), 245 | checkpoint_steps=checkpoint_steps) 246 | 247 | print(params) 248 | print('') 249 | print('Dataset Size:', args.train_size) 250 | print('Batch Size:', args.batch_size) 251 | print('Steps per Epoch:', args.train_size/args.batch_size) 252 | print('Total Train Steps:', train_steps) 253 | print('Required Evaluation Steps:', NUM_EVAL) 254 | print('Perform evaluation step after each', args.num_epochs/NUM_EVAL, 255 | 'epochs') 256 | print('Save Checkpoint After', checkpoint_steps, 'steps') 257 | print('**********************************************') 258 | 259 | # Creates the relevant estimator (canned or custom) 260 | estimator = None 261 | 262 | # get model estimator 263 | #[START choose_model] 264 | estimator = get_estimator(estimator_name=args.model_type, 265 | config=config, 266 | params=params, 267 | model_dir=model_dir) 268 | #[END choose_model] 269 | # Creates the training and eval specs by reading the relevant datasets 270 | # Note that TrainSpec needs max_steps otherwise it runs forever. 271 | train_spec = tf.estimator.TrainSpec( 272 | input_fn=lambda: read_train(data_folder, params), 273 | max_steps=train_steps) 274 | 275 | eval_spec = tf.estimator.EvalSpec( 276 | input_fn=lambda: read_eval(data_folder, params), 277 | exporters=[ 278 | tf.estimator.LatestExporter( 279 | name='estimate', 280 | serving_input_receiver_fn=csv_serving_input_fn, 281 | exports_to_keep=1, 282 | as_text=True 283 | ) 284 | ], 285 | steps=1000, 286 | throttle_secs=1, 287 | start_delay_secs=1 288 | ) 289 | 290 | if not args.resume: 291 | print('Removing previous trained model...') 292 | shutil.rmtree(model_dir, ignore_errors=True) 293 | else: 294 | print('Resuming training...') 295 | 296 | # Runs the training and evaluation using the chosen estimator. 297 | # Saves model data into export/estimate/1234567890/... 
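  # train_and_evaluate alternates training with an evaluation (and an export
  # via the LatestExporter configured above) each time a new checkpoint is
  # written, i.e. roughly every checkpoint_steps steps under the RunConfig
  # created earlier.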
298 | tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) 299 | 300 | # Evaluate the test set for final metrics 301 | estimator.evaluate(lambda: read_test(data_folder, params), name="Test Set") 302 | 303 | if __name__ == '__main__': 304 | main() 305 | -------------------------------------------------------------------------------- /linear.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.lib.io import file_io 2 | import pandas 3 | from pandas.compat import StringIO 4 | import json 5 | import math 6 | import numpy as np 7 | from sklearn.linear_model import LinearRegression 8 | 9 | c_names =['customer_id', 'monetary_dnn', 'monetary_btyd', 'frequency_dnn', 10 | 'frequency_btyd', 'recency', 'T', 'time_between', 'avg_basket_value', 11 | 'avg_basket_size', 'cnt_returns', 'has_returned', 12 | 'frequency_btyd_clipped', 'monetary_btyd_clipped', 13 | 'target_monetary_clipped', 'target_monetary'] 14 | 15 | 16 | train_df = file_io.FileIO( 17 | 'data/train.csv', 18 | mode='r').read() 19 | train_df = pandas.read_csv( 20 | StringIO(train_df), 21 | header = None, 22 | names = c_names, 23 | delimiter=',', 24 | na_filter=True) 25 | 26 | test_df = file_io.FileIO( 27 | 'data/eval.csv', 28 | mode='r').read() 29 | test_df = pandas.read_csv( 30 | StringIO(test_df), 31 | header = None, 32 | names = c_names, 33 | delimiter=',', 34 | na_filter=True) 35 | 36 | reg = LinearRegression().fit( 37 | train_df.values[:, [1,3,5,6,7,8,9,10,11]], 38 | train_df.values[:, -1]) 39 | 40 | error = 0 41 | i = 0 42 | for p in reg.predict(test_df.values[:, [1,3,5,6,7,8,9,10,11]]): 43 | error = error + math.pow(p - test_df.values[i, -1], 2) 44 | i = i +1 45 | 46 | print "RMSE = ", math.sqrt(error/test_df.values.shape[0]) -------------------------------------------------------------------------------- /notebooks/clv_automl.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from google.cloud import automl_v1beta1\n", 10 | "import os\n", 11 | "import time" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Create and authenticate clients " 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "keyfile_name = 'mykey.json'\n", 28 | "client = automl_v1beta1.AutoMlClient.from_service_account_file(keyfile_name)\n", 29 | "prediction_client = automl_v1beta1.PredictionServiceClient.from_service_account_file(keyfile_name)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## Initialize some variables" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "project_id = 'ml-clv'\n", 46 | "location = 'us-central1'\n", 47 | "location_path = client.location_path(project_id, location)\n", 48 | "\n", 49 | "dataset_display_name = 'clv_solution_test'\n", 50 | "model_display_name = 'clv_model_test2'" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "location_path" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Create AutoML Dataset" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 
null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "create_dataset_response = client.create_dataset(\n", 76 | " location_path,\n", 77 | " {'display_name': dataset_display_name, 'tables_dataset_metadata': {}})\n", 78 | "dataset_name = create_dataset_response.name" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## ... or alternatively, use an existing Dataset" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "dataset_list_response = client.list_datasets(location_path)\n", 95 | "dataset_list = [d for d in dataset_list_response]\n", 96 | "dataset = [d for d in dataset_list if d.display_name == dataset_display_name][0]\n", 97 | "dataset_name = dataset.name" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "dataset_name" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "## Import data from BigQuery" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "dataset_bq_input_uri = 'bq://ml-clv.clv_auto.features_n_target'\n", 123 | "input_config = {\n", 124 | " 'bigquery_source': {\n", 125 | " 'input_uri': dataset_bq_input_uri}}\n", 126 | "import_data_response = client.import_data(dataset_name, input_config)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "while import_data_response.done() is False:\n", 136 | " time.sleep(1)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Get column specs for Dataset" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "list_table_specs_response = client.list_table_specs(dataset_name)\n", 153 | "table_specs = [s for s in list_table_specs_response]\n", 154 | "table_spec_name = table_specs[0].name\n", 155 | "list_column_specs_response = client.list_column_specs(table_spec_name)\n", 156 | "column_specs = {s.display_name: s for s in list_column_specs_response}" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "### Example of updating column spec..." 
164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "# update column spec for 'has_returned'\n", 173 | "update_column_spec_dict = {\n", 174 | " \"name\": column_specs['has_returned'].name,\n", 175 | " \"data_type\": {\n", 176 | " \"type_code\": \"CATEGORY\"\n", 177 | " }\n", 178 | "}\n", 179 | "update_column_response = client.update_column_spec(update_column_spec_dict)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "## Assign a training label" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "label_column_name = 'target_monetary'\n", 196 | "label_column_spec = column_specs[label_column_name]\n", 197 | "label_column_id = label_column_spec.name.rsplit('/', 1)[-1]\n", 198 | "print('Label column ID: {}'.format(label_column_id))\n", 199 | "update_dataset_dict = {\n", 200 | " 'name': dataset_name,\n", 201 | " 'tables_dataset_metadata': {\n", 202 | " 'target_column_spec_id': label_column_id\n", 203 | " }\n", 204 | "}\n", 205 | "update_dataset_response = client.update_dataset(update_dataset_dict)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "## Select features for training" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "feat_list = list(column_specs.keys())\n", 222 | "feat_list.remove('target_monetary')\n", 223 | "feat_list.remove('customer_id')\n", 224 | "feat_list.remove('monetary_btyd')\n", 225 | "feat_list.remove('frequency_btyd')\n", 226 | "feat_list.remove('frequency_btyd_clipped')\n", 227 | "feat_list.remove('monetary_btyd_clipped')\n", 228 | "feat_list.remove('target_monetary_clipped')" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "feat_list" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "## Train the model" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "model_dict = {\n", 254 | " 'display_name': model_display_name,\n", 255 | " 'dataset_id': dataset_name.rsplit('/', 1)[-1],\n", 256 | " 'tables_model_metadata': {\n", 257 | " 'target_column_spec': column_specs['target_monetary'],\n", 258 | " 'input_feature_column_specs': [\n", 259 | " column_specs[x] for x in feat_list],\n", 260 | " 'train_budget_milli_node_hours': 10000,\n", 261 | " 'optimization_objective': 'MINIMIZE_MAE'\n", 262 | " }\n", 263 | "}" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "create_model_response = client.create_model(location_path, model_dict)\n", 273 | "while create_model_response.done() is False:\n", 274 | " time.sleep(10)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "create_model_result = create_model_response.result()\n", 284 | "model_name = create_model_result.name\n", 285 | "create_model_result.name" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "## ... 
or alternatively get an existing trained Model " 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "model_list_response = client.list_models(location_path)\n", 302 | "model_list = [m for m in model_list_response]\n", 303 | "model = [m for m in model_list if m.display_name == model_display_name][0]\n", 304 | "model_name = model.name" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "## Get evalutions for model" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "model_evaluations = [e for e in client.list_model_evaluations(model_name)]\n", 321 | "model_evaluations[0]" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "## Deploy the model" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": { 335 | "scrolled": true 336 | }, 337 | "outputs": [], 338 | "source": [ 339 | "deploy_model_response = client.deploy_model(model_name)\n", 340 | "api = client.transport._operations_client\n", 341 | "while deploy_model_response.done is False:\n", 342 | " deploy_model_response = api.get_operation(deploy_model_response.name)\n", 343 | " time.sleep(1)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "## Run batch predictions" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "gcs_input_uri = \"gs://ml-clv_composer_final/predictions/to_predict.csv\"\n", 360 | "gcs_output_uri_prefix = \"gs://ml-clv_composer_final/predictions\"\n", 361 | "\n", 362 | "# Define input source.\n", 363 | "batch_prediction_input_source = {\n", 364 | " 'gcs_source': {\n", 365 | " 'input_uris': [gcs_input_uri]\n", 366 | " }\n", 367 | "}\n", 368 | "\n", 369 | "# Define output target.\n", 370 | "batch_prediction_output_target = {\n", 371 | " 'gcs_destination': {\n", 372 | " 'output_uri_prefix': gcs_output_uri_prefix\n", 373 | " }\n", 374 | "}\n", 375 | "\n", 376 | "batch_predict_response = prediction_client.batch_predict(\n", 377 | " model_name, batch_prediction_input_source, batch_prediction_output_target)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "while batch_predict_response.done() is False:\n", 387 | " time.sleep(1)" 388 | ] 389 | } 390 | ], 391 | "metadata": { 392 | "kernelspec": { 393 | "display_name": "Python 2", 394 | "language": "python", 395 | "name": "python2" 396 | }, 397 | "language_info": { 398 | "codemirror_mode": { 399 | "name": "ipython", 400 | "version": 2 401 | }, 402 | "file_extension": ".py", 403 | "mimetype": "text/x-python", 404 | "name": "python", 405 | "nbconvert_exporter": "python", 406 | "pygments_lexer": "ipython2", 407 | "version": "2.7.13" 408 | } 409 | }, 410 | "nbformat": 4, 411 | "nbformat_minor": 2 412 | } 413 | -------------------------------------------------------------------------------- /notebooks/linear_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from tensorflow.python.lib.io import file_io\n", 10 | "import 
pandas\n", 11 | "from pandas.compat import StringIO\n", 12 | "import json\n", 13 | "import math\n", 14 | "import numpy as np" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "Requirement already satisfied: sklearn in /Users/lramsey/miniconda2/envs/clv/lib/python2.7/site-packages\n", 27 | "Requirement already satisfied: scikit-learn in /Users/lramsey/miniconda2/envs/clv/lib/python2.7/site-packages (from sklearn)\n", 28 | "Requirement already satisfied: scipy>=0.13.3 in /Users/lramsey/miniconda2/envs/clv/lib/python2.7/site-packages (from scikit-learn->sklearn)\n", 29 | "Requirement already satisfied: numpy>=1.8.2 in /Users/lramsey/miniconda2/envs/clv/lib/python2.7/site-packages (from scikit-learn->sklearn)\n", 30 | "\u001b[33mYou are using pip version 9.0.1, however version 19.0.1 is available.\n", 31 | "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "!pip install sklearn" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "from sklearn.linear_model import LinearRegression" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "c_names =['customer_id', 'monetary_dnn', 'monetary_btyd', 'frequency_dnn',\n", 55 | " 'frequency_btyd', 'recency', 'T', 'time_between', 'avg_basket_value',\n", 56 | " 'avg_basket_size', 'cnt_returns', 'has_returned',\n", 57 | " 'frequency_btyd_clipped', 'monetary_btyd_clipped',\n", 58 | " 'target_monetary_clipped', 'target_monetary']" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "train_df = file_io.FileIO(\n", 68 | " '../data/train.csv',\n", 69 | " mode='r').read()\n", 70 | "train_df = pandas.read_csv(\n", 71 | " StringIO(train_df),\n", 72 | " header = None,\n", 73 | " names = c_names,\n", 74 | " delimiter=',',\n", 75 | " na_filter=True)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 6, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "eval_df = file_io.FileIO(\n", 85 | " '../data/eval.csv',\n", 86 | " mode='r').read()\n", 87 | "eval_df = pandas.read_csv(\n", 88 | " StringIO(eval_df),\n", 89 | " header = None,\n", 90 | " names = c_names,\n", 91 | " delimiter=',',\n", 92 | " na_filter=True)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 7, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "test_df = file_io.FileIO(\n", 102 | " '../data/test.csv',\n", 103 | " mode='r').read()\n", 104 | "test_df = pandas.read_csv(\n", 105 | " StringIO(test_df),\n", 106 | " header = None,\n", 107 | " names = c_names,\n", 108 | " delimiter=',',\n", 109 | " na_filter=True)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 8, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "reg = LinearRegression().fit(\n", 119 | " train_df.values[:, [1,3,5,6,7,8,9,10,11]],\n", 120 | " train_df.values[:, -1])" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 9, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "1399.0002542993266" 132 | ] 133 | }, 134 | "execution_count": 9, 135 | "metadata": {}, 136 | "output_type": 
"execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "error = 0\n", 141 | "i = 0\n", 142 | "for p in reg.predict(eval_df.values[:, [1,3,5,6,7,8,9,10,11]]):\n", 143 | " error = error + math.pow(p - eval_df.values[i, -1], 2)\n", 144 | " i = i +1\n", 145 | "math.sqrt(error/eval_df.values.shape[0])" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 10, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "961.9763923040274" 157 | ] 158 | }, 159 | "execution_count": 10, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "error = 0\n", 166 | "i = 0\n", 167 | "for p in reg.predict(test_df.values[:, [1,3,5,6,7,8,9,10,11]]):\n", 168 | " error = error + math.pow(p - test_df.values[i, -1], 2)\n", 169 | " i = i +1\n", 170 | "math.sqrt(error/test_df.values.shape[0])" 171 | ] 172 | } 173 | ], 174 | "metadata": { 175 | "kernelspec": { 176 | "display_name": "Python 2", 177 | "language": "python", 178 | "name": "python2" 179 | }, 180 | "language_info": { 181 | "codemirror_mode": { 182 | "name": "ipython", 183 | "version": 2 184 | }, 185 | "file_extension": ".py", 186 | "mimetype": "text/x-python", 187 | "name": "python", 188 | "nbconvert_exporter": "python", 189 | "pygments_lexer": "ipython2", 190 | "version": "2.7.13" 191 | } 192 | }, 193 | "nbformat": 4, 194 | "nbformat_minor": 2 195 | } 196 | -------------------------------------------------------------------------------- /preparation/sql/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/tensorflow-lifetime-value/51c3980e725ed2da117b9592f8bf9ccbee1fa509/preparation/sql/.DS_Store -------------------------------------------------------------------------------- /preparation/sql/common/benchmark.sql: -------------------------------------------------------------------------------- 1 | -- Copyright 2018 Google Inc. All Rights Reserved. 2 | -- 3 | -- Licensed under the Apache License, Version 2.0 (the "License"); 4 | -- you may not use this file except in compliance with the License. 5 | -- You may obtain a copy of the License at 6 | -- 7 | -- http://www.apache.org/licenses/LICENSE-2.0 8 | -- 9 | -- Unless required by applicable law or agreed to in writing, software 10 | -- distributed under the License is distributed on an "AS IS" BASIS, 11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | -- See the License for the specific language governing permissions and 13 | -- limitations under the License. 
14 | 15 | --[START benchmark] 16 | SELECT 17 | ROUND(SQRT( SUM(POW(predicted_monetary - target_monetary, 2)) / COUNT(1) ), 2) as rmse 18 | FROM ( 19 | SELECT 20 | tf.customer_id, 21 | avg_basket_value * ( cnt_orders * (1 + target_days/feature_days) ) AS predicted_monetary, 22 | ROUND(tt.target_monetary, 2) AS target_monetary 23 | --[END benchmark] 24 | FROM ( 25 | -- This SELECT takes records that are used for features later 26 | SELECT 27 | customer_id, 28 | AVG(order_value) avg_basket_value, 29 | COUNT(DISTINCT order_date) AS cnt_orders 30 | FROM 31 | `{{ dag_run.conf['project'] }}.{{ dag_run.conf['dataset'] }}.data_cleaned` 32 | WHERE 33 | order_date <= DATE('{{ dag_run.conf['threshold_date'] }}') 34 | GROUP BY 35 | customer_id) tf, 36 | ( 37 | -- This SELECT takes records that are used for target later 38 | SELECT 39 | customer_id, 40 | SUM(order_value) target_monetary 41 | FROM 42 | `{{ dag_run.conf['project'] }}.{{ dag_run.conf['dataset'] }}.data_cleaned` 43 | --WHERE order_date > '2013-01-31' 44 | GROUP BY 45 | customer_id) tt, 46 | ( 47 | SELECT 48 | DATE_DIFF(DATE('{{ dag_run.conf['threshold_date'] }}'), MIN(order_date), DAY) feature_days, 49 | DATE_DIFF(DATE('{{ dag_run.conf['predict_end'] }}'), DATE('{{ dag_run.conf['threshold_date'] }}'), DAY) target_days 50 | FROM 51 | `{{ dag_run.conf['project'] }}.{{ dag_run.conf['dataset'] }}.data_cleaned` ) AS days 52 | WHERE 53 | tf.customer_id = tt.customer_id ) -------------------------------------------------------------------------------- /preparation/sql/common/clean.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | customer_id, 3 | order_date, 4 | order_value, 5 | order_qty_articles 6 | FROM 7 | ( 8 | SELECT 9 | CustomerID AS customer_id, 10 | PARSE_DATE("%m/%d/%y", SUBSTR(InvoiceDate, 0, 8)) AS order_date, 11 | ROUND(SUM(UnitPrice * Quantity), 2) AS order_value, 12 | SUM(Quantity) AS order_qty_articles, 13 | ( 14 | SELECT 15 | MAX(PARSE_DATE("%m/%d/%y", SUBSTR(InvoiceDate, 0, 8))) 16 | FROM 17 | `{{ dag_run.conf['project'] }}.{{ dag_run.conf['dataset'] }}.data_source` tl 18 | WHERE 19 | tl.CustomerID = t.CustomerID 20 | ) latest_order 21 | FROM 22 | `{{ dag_run.conf['project'] }}.{{ dag_run.conf['dataset'] }}.data_source` t 23 | GROUP BY 24 | CustomerID, 25 | order_date 26 | ) a 27 | 28 | INNER JOIN ( 29 | -- Only customers with more than one positive order values before threshold. 30 | SELECT 31 | CustomerID 32 | FROM ( 33 | -- Customers and how many positive order values before threshold. 34 | SELECT 35 | CustomerID, 36 | SUM(positive_value) cnt_positive_value 37 | FROM ( 38 | -- Customer with whether order was positive or not at each date. 39 | SELECT 40 | CustomerID, 41 | ( 42 | CASE 43 | WHEN SUM(UnitPrice * Quantity) > 0 THEN 1 44 | ELSE 0 45 | END ) positive_value 46 | FROM 47 | `{{ dag_run.conf['project'] }}.{{ dag_run.conf['dataset'] }}.data_source` 48 | WHERE 49 | PARSE_DATE("%m/%d/%y", SUBSTR(InvoiceDate, 0, 8)) < DATE('{{ dag_run.conf['threshold_date'] }}') 50 | GROUP BY 51 | CustomerID, 52 | SUBSTR(InvoiceDate, 0, 8) ) 53 | GROUP BY 54 | CustomerID ) 55 | WHERE 56 | cnt_positive_value > 1 57 | ) b 58 | ON 59 | a.customer_id = b. CustomerID 60 | --[START common_clean] 61 | WHERE 62 | -- Bought in the past 3 months 63 | DATE_DIFF(DATE('{{ dag_run.conf['predict_end'] }}'), latest_order, DAY) <= 90 64 | -- Make sure returns are consistent. 
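  -- i.e. quantity and value must share the same sign; rows where one is
  -- positive and the other negative are dropped as inconsistent.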
65 | AND ( 66 | (order_qty_articles > 0 and order_Value > 0) OR 67 | (order_qty_articles < 0 and order_Value < 0) 68 | ) 69 | --[END common_clean] -------------------------------------------------------------------------------- /preparation/sql/common/features_n_target.sql: -------------------------------------------------------------------------------- 1 | -- Copyright 2018 Google Inc. All Rights Reserved. 2 | -- 3 | -- Licensed under the Apache License, Version 2.0 (the "License"); 4 | -- you may not use this file except in compliance with the License. 5 | -- You may obtain a copy of the License at 6 | -- 7 | -- http://www.apache.org/licenses/LICENSE-2.0 8 | -- 9 | -- Unless required by applicable law or agreed to in writing, software 10 | -- distributed under the License is distributed on an "AS IS" BASIS, 11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | -- See the License for the specific language governing permissions and 13 | -- limitations under the License. 14 | 15 | -- Keep all records before a threshold date for Features 16 | -- And all records before a threshold date for Target 17 | -- Threshold taken at {{ dag_run.conf['threshold_date'] }} ex: 2013-01-31 18 | -- {{ dag_run.conf['threshold_date'] }} is understood by Airflow 19 | SELECT 20 | tf.customer_id, 21 | -- For training period 22 | -- Copying the calculations from Lifetimes where first orders are ignored 23 | -- See https://github.com/CamDavidsonPilon/lifetimes/blob/master/lifetimes/utils.py#L246 24 | --[START features_target] 25 | tf.monetary_dnn, 26 | tf.monetary_btyd, 27 | tf.cnt_orders AS frequency_dnn, 28 | tf.cnt_orders - 1 AS frequency_btyd, 29 | tf.recency, 30 | tf.T, 31 | ROUND(tf.recency/cnt_orders, 2) AS time_between, 32 | ROUND(tf.avg_basket_value, 2) AS avg_basket_value, 33 | ROUND(tf.avg_basket_size, 2) AS avg_basket_size, 34 | tf.cnt_returns, 35 | (CASE 36 | WHEN tf.cnt_returns > 0 THEN 1 37 | ELSE 0 END) AS has_returned, 38 | 39 | -- Used by BTYD mainly, potentially DNN if clipped improve results 40 | (CASE 41 | WHEN tf.cnt_orders - 1 > 600 THEN 600 42 | ELSE tf.cnt_orders - 1 END) AS frequency_btyd_clipped, 43 | (CASE 44 | WHEN tf.monetary_btyd > 100000 THEN 100000 45 | ELSE ROUND(tf.monetary_btyd, 2) END) AS monetary_btyd_clipped, 46 | (CASE 47 | WHEN tt.target_monetary > 100000 THEN 100000 48 | ELSE ROUND(tt.target_monetary, 2) END) AS target_monetary_clipped, 49 | 50 | -- Target calculated for overall period 51 | ROUND(tt.target_monetary, 2) as target_monetary 52 | --[END features_target] 53 | FROM 54 | -- This SELECT uses only data before threshold to make features. 
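  -- Following the Lifetimes convention cited above, the first order is zeroed
  -- out of monetary_btyd and excluded from frequency_btyd. For example
  -- (illustrative): three orders on distinct dates worth 10, 20 and 30 give
  -- frequency_btyd = 2 and monetary_btyd = (20 + 30) / 2 = 25.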
55 | ( 56 | SELECT 57 | customer_id, 58 | SUM(order_value) AS monetary_dnn, 59 | (CASE 60 | WHEN COUNT(DISTINCT order_date) = 1 THEN 0 61 | ELSE SUM(order_value_btyd) / (COUNT(DISTINCT order_date) -1) END) AS monetary_btyd, 62 | DATE_DIFF(MAX(order_date), MIN(order_date), DAY) AS recency, 63 | DATE_DIFF(DATE('{{ dag_run.conf['threshold_date'] }}'), MIN(order_date), DAY) AS T, 64 | COUNT(DISTINCT order_date) AS cnt_orders, 65 | AVG(order_qty_articles) avg_basket_size, 66 | AVG(order_value) avg_basket_value, 67 | SUM(CASE 68 | WHEN order_value < 1 THEN 1 69 | ELSE 0 END) AS cnt_returns 70 | FROM 71 | -- Makes the order value = 0 if it is the first one 72 | ( 73 | SELECT 74 | a.*, 75 | (CASE 76 | WHEN a.order_date = c.order_date_min THEN 0 77 | ELSE a.order_value END) AS order_value_btyd 78 | --[START airflow_params] 79 | FROM 80 | `{{ dag_run.conf['project'] }}.{{ dag_run.conf['dataset'] }}.data_cleaned` a 81 | --[END airflow_params] 82 | INNER JOIN ( 83 | SELECT 84 | customer_id, 85 | MIN(order_date) AS order_date_min 86 | FROM 87 | `{{ dag_run.conf['project'] }}.{{ dag_run.conf['dataset'] }}.data_cleaned` 88 | GROUP BY 89 | customer_id) c 90 | ON 91 | c.customer_id = a.customer_id 92 | ) 93 | WHERE 94 | --[START threshold_date] 95 | order_date <= DATE('{{ dag_run.conf['threshold_date'] }}') 96 | --[END threshold_date] 97 | GROUP BY 98 | customer_id) tf, 99 | 100 | -- This SELECT uses all records to calculate the target (could also use data after threshold ) 101 | ( 102 | SELECT 103 | customer_id, 104 | SUM(order_value) target_monetary 105 | FROM 106 | `{{ dag_run.conf['project'] }}.{{ dag_run.conf['dataset'] }}.data_cleaned` 107 | --WHERE order_date > DATE('{{ dag_run.conf['threshold_date'] }}') 108 | GROUP BY 109 | customer_id) tt 110 | WHERE 111 | tf.customer_id = tt.customer_id 112 | AND tf.monetary_dnn > 0 113 | AND tf.monetary_dnn <= {{ dag_run.conf['max_monetary'] }} 114 | AND tf.monetary_btyd > 0 -------------------------------------------------------------------------------- /preparation/sql/dnn/split_eval.sql: -------------------------------------------------------------------------------- 1 | -- Copyright 2018 Google Inc. All Rights Reserved. 2 | -- 3 | -- Licensed under the Apache License, Version 2.0 (the "License"); 4 | -- you may not use this file except in compliance with the License. 5 | -- You may obtain a copy of the License at 6 | -- 7 | -- http://www.apache.org/licenses/LICENSE-2.0 8 | -- 9 | -- Unless required by applicable law or agreed to in writing, software 10 | -- distributed under the License is distributed on an "AS IS" BASIS, 11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | -- See the License for the specific language governing permissions and 13 | -- limitations under the License. 14 | 15 | SELECT 16 | * 17 | FROM 18 | `{{ dag_run.conf['project'] }}.{{ dag_run.conf['dataset'] }}.features_n_target` 19 | WHERE 20 | -- TRAIN 21 | MOD(ABS(FARM_FINGERPRINT(CAST(customer_id AS STRING))), 100000) > 70000 AND 22 | MOD(ABS(FARM_FINGERPRINT(CAST(customer_id AS STRING))), 100000) < 85000 23 | -------------------------------------------------------------------------------- /preparation/sql/dnn/split_test.sql: -------------------------------------------------------------------------------- 1 | -- Copyright 2018 Google Inc. All Rights Reserved. 2 | -- 3 | -- Licensed under the Apache License, Version 2.0 (the "License"); 4 | -- you may not use this file except in compliance with the License. 
5 | -- You may obtain a copy of the License at 6 | -- 7 | -- http://www.apache.org/licenses/LICENSE-2.0 8 | -- 9 | -- Unless required by applicable law or agreed to in writing, software 10 | -- distributed under the License is distributed on an "AS IS" BASIS, 11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | -- See the License for the specific language governing permissions and 13 | -- limitations under the License. 14 | 15 | SELECT 16 | * 17 | FROM 18 | `{{ dag_run.conf['project'] }}.{{ dag_run.conf['dataset'] }}.features_n_target` 19 | WHERE 20 | -- TEST 21 | MOD(ABS(FARM_FINGERPRINT(CAST(customer_id AS STRING))), 100000) > 85000 AND 22 | MOD(ABS(FARM_FINGERPRINT(CAST(customer_id AS STRING))), 100000) < 100000 23 | -------------------------------------------------------------------------------- /preparation/sql/dnn/split_train.sql: -------------------------------------------------------------------------------- 1 | -- Copyright 2018 Google Inc. All Rights Reserved. 2 | -- 3 | -- Licensed under the Apache License, Version 2.0 (the "License"); 4 | -- you may not use this file except in compliance with the License. 5 | -- You may obtain a copy of the License at 6 | -- 7 | -- http://www.apache.org/licenses/LICENSE-2.0 8 | -- 9 | -- Unless required by applicable law or agreed to in writing, software 10 | -- distributed under the License is distributed on an "AS IS" BASIS, 11 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | -- See the License for the specific language governing permissions and 13 | -- limitations under the License. 14 | 15 | -- Save to table 'dnn_train' 16 | --[START split_train] 17 | SELECT 18 | * 19 | FROM 20 | `{{ dag_run.conf['project'] }}.{{ dag_run.conf['dataset'] }}.features_n_target` 21 | WHERE 22 | -- TRAIN 23 | MOD(ABS(FARM_FINGERPRINT(CAST(customer_id AS STRING))), 100000) <= 70000 24 | --[END split_train] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-automl>=0.1.2 2 | tensorflow==1.13.1 3 | jupyter 4 | apache-beam==2.2.0 5 | ipdb 6 | lifetimes==0.9.0.0 7 | google-api-python-client==1.7.3 8 | google-cloud-bigquery==0.32.0 9 | seaborn==0.8.1 10 | pandas-gbq==0.5.0 11 | matplotlib==2.2.3 -------------------------------------------------------------------------------- /run/airflow/dags/01_build_train_deploy.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | import datetime, json, re, logging 16 | from airflow import models 17 | from airflow.operators.python_operator import PythonOperator, BranchPythonOperator 18 | from airflow.hooks.base_hook import BaseHook 19 | from airflow.contrib.operators import bigquery_operator 20 | from airflow.contrib.operators import bigquery_get_data 21 | from airflow.contrib.operators import gcs_to_bq 22 | from airflow.contrib.operators import bigquery_to_gcs 23 | from airflow.contrib.operators import mlengine_operator 24 | from airflow.contrib.operators import mlengine_operator_utils 25 | from airflow.contrib.hooks.gcp_mlengine_hook import MLEngineHook 26 | from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook 27 | from airflow.operators import bash_operator 28 | from airflow.operators.dummy_operator import DummyOperator 29 | from airflow.utils import trigger_rule 30 | 31 | from google.cloud.automl_v1beta1 import AutoMlClient 32 | from clv_automl import clv_automl 33 | 34 | def _get_project_id(): 35 | """Get project ID from default GCP connection.""" 36 | 37 | extras = BaseHook.get_connection('google_cloud_default').extra_dejson 38 | key = 'extra__google_cloud_platform__project' 39 | if key in extras: 40 | project_id = extras[key] 41 | else: 42 | raise ValueError('Must configure project_id in google_cloud_default ' 43 | 'connection from Airflow Console') 44 | return project_id 45 | 46 | PROJECT = _get_project_id() 47 | REGION = models.Variable.get('region') 48 | DATASET = models.Variable.get('dataset') 49 | COMPOSER_BUCKET_NAME = models.Variable.get('composer_bucket_name') 50 | GCS_SQL = 'sql' 51 | DB_DUMP_FILENAME = 'db_dump.csv' 52 | 53 | LOCATION_TRAINING_DATA = '{}/data'.format(COMPOSER_BUCKET_NAME) 54 | 55 | PREFIX_JOBS_EXPORT = 'jobs/clv-composer' 56 | PREFIX_FINAL_MODEL = '{}/final'.format(PREFIX_JOBS_EXPORT) 57 | 58 | MODEL_PACKAGE_NAME = 'clv_ml_engine-0.1.tar.gz' # Matches name in setup.py 59 | 60 | AUTOML_DATASET = models.Variable.get('automl_dataset') 61 | AUTOML_MODEL = models.Variable.get('automl_model') 62 | AUTOML_TRAINING_BUDGET = int(models.Variable.get('automl_training_budget')) 63 | 64 | 65 | #[START dag_build_train_deploy] 66 | default_dag_args = { 67 | 'start_date': datetime.datetime(2050, 1, 1), 68 | 'schedule_interval': None, 69 | 'provide_context': True 70 | } 71 | 72 | dag = models.DAG( 73 | 'build_train_deploy', 74 | default_args = default_dag_args) 75 | #[END dag_build_train_deploy] 76 | 77 | 78 | # Loads the database dump from Cloud Storage to BigQuery 79 | t1 = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( 80 | task_id="db_dump_to_bigquery", 81 | bucket=COMPOSER_BUCKET_NAME, 82 | source_objects=[DB_DUMP_FILENAME], 83 | schema_object="schema_source.json", 84 | source_format="CSV", 85 | skip_leading_rows=1, 86 | destination_project_dataset_table="{}.{}.{}".format(PROJECT, 87 | DATASET, 88 | 'data_source'), 89 | create_disposition="CREATE_IF_NEEDED", 90 | write_disposition="WRITE_TRUNCATE", 91 | dag=dag 92 | ) 93 | 94 | # Cleans the data from one BigQuery table into another 95 | t2 = bigquery_operator.BigQueryOperator( 96 | task_id='bq_from_source_to_clean', 97 | bql='{}/common/clean.sql'.format(GCS_SQL), 98 | use_legacy_sql=False, 99 | allow_large_results=True, 100 | destination_dataset_table="{}.{}.{}".format(PROJECT, 101 | DATASET, 102 | 'data_cleaned'), 103 | create_disposition="CREATE_IF_NEEDED", 104 | write_disposition="WRITE_TRUNCATE", 105 | dag=dag 106 | ) 107 | 108 | # Creates the split between features and target and also aggregates both sides.
109 | # The threshold date is passed as an arg when triggering the Airflow DAG and 110 | # is templated dynamically within the .sql file. 111 | # Ideally we would pass query_params but we run into various problems: 112 | # - if using BigQueryOperator, we cannot pass dag_run.conf['threshold_date'] 113 | # - if using hooks, run_query does not accept a .sql file and needs the query string 114 | # So the workaround is to add {{ dag_run.conf['threshold_date'] }} directly into the 115 | # .sql file, which Airflow picks up when running the operator. 116 | t3 = bigquery_operator.BigQueryOperator( 117 | task_id='bq_from_clean_to_features', 118 | bql='{}/common/features_n_target.sql'.format(GCS_SQL), 119 | use_legacy_sql=False, 120 | allow_large_results=True, 121 | destination_dataset_table="{}.{}.{}".format(PROJECT, 122 | DATASET, 123 | 'features_n_target'), 124 | create_disposition="CREATE_IF_NEEDED", 125 | write_disposition="WRITE_TRUNCATE", 126 | dag=dag 127 | ) 128 | 129 | 130 | def get_model_type(**kwargs): 131 | model_type = kwargs['dag_run'].conf.get('model_type') 132 | if model_type == 'automl': 133 | model_train_task = 'train_automl' 134 | else: 135 | model_train_task = 'train_ml_engine' 136 | return model_train_task 137 | 138 | t4_train_cond = BranchPythonOperator(task_id='train_branch', dag=dag, python_callable=get_model_type) 139 | 140 | # 141 | # Train the model using AutoML 142 | # 143 | def do_train_automl(**kwargs): 144 | """ 145 | Create, train and deploy the AutoML model. 146 | """ 147 | # instantiate automl client 148 | automl_client = AutoMlClient() 149 | 150 | model_name = clv_automl.create_automl_model(automl_client, 151 | PROJECT, 152 | REGION, 153 | DATASET, 154 | 'features_n_target', 155 | AUTOML_DATASET, 156 | AUTOML_MODEL, 157 | AUTOML_TRAINING_BUDGET) 158 | clv_automl.deploy_model(automl_client, model_name) 159 | 160 | t4_automl = PythonOperator( 161 | task_id='train_automl', dag=dag, python_callable=do_train_automl) 162 | 163 | 164 | t4_ml_engine = DummyOperator(task_id='train_ml_engine', dag=dag) 165 | 166 | # Split the data into training, evaluation and test sets within BigQuery 167 | t4a = bigquery_operator.BigQueryOperator( 168 | task_id='bq_dnn_train', 169 | bql='{}/dnn/split_train.sql'.format(GCS_SQL), 170 | use_legacy_sql=False, 171 | allow_large_results=True, 172 | destination_dataset_table="{}.{}.{}".format(PROJECT, 173 | DATASET, 174 | 'dnn_train'), 175 | create_disposition="CREATE_IF_NEEDED", 176 | write_disposition="WRITE_TRUNCATE", 177 | dag=dag 178 | ) 179 | 180 | t4b = bigquery_operator.BigQueryOperator( 181 | task_id='bq_dnn_eval', 182 | bql='{}/dnn/split_eval.sql'.format(GCS_SQL), 183 | use_legacy_sql=False, 184 | allow_large_results=True, 185 | destination_dataset_table="{}.{}.{}".format(PROJECT, 186 | DATASET, 187 | 'dnn_eval'), 188 | create_disposition="CREATE_IF_NEEDED", 189 | write_disposition="WRITE_TRUNCATE", 190 | dag=dag 191 | ) 192 | 193 | t4c = bigquery_operator.BigQueryOperator( 194 | task_id='bq_dnn_test', 195 | bql='{}/dnn/split_test.sql'.format(GCS_SQL), 196 | use_legacy_sql=False, 197 | allow_large_results=True, 198 | destination_dataset_table="{}.{}.{}".format(PROJECT, 199 | DATASET, 200 | 'dnn_test'), 201 | create_disposition="CREATE_IF_NEEDED", 202 | write_disposition="WRITE_TRUNCATE", 203 | dag=dag 204 | ) 205 | 206 | # TODO: Currently all data steps are done whether BTYD or DNN is used.
It would 207 | # be better to have a condition to call only one task or the other using 'model_type' 208 | data_btyd_location = ['gs://{}/{}'.format(LOCATION_TRAINING_DATA, 'btyd.csv')] 209 | data_train_locations = ['gs://{}/{}'.format(LOCATION_TRAINING_DATA, 'train.csv')] 210 | data_eval_locations = ['gs://{}/{}'.format(LOCATION_TRAINING_DATA, 'eval.csv')] 211 | data_test_locations = ['gs://{}/{}'.format(LOCATION_TRAINING_DATA, 'test.csv')] 212 | 213 | t5a = bigquery_to_gcs.BigQueryToCloudStorageOperator( 214 | task_id='bq_dnn_train_to_gcs', 215 | source_project_dataset_table="{}.{}.{}".format(PROJECT, DATASET, 'dnn_train'), 216 | destination_cloud_storage_uris=data_train_locations, 217 | print_header=False, 218 | dag=dag 219 | ) 220 | 221 | t5b = bigquery_to_gcs.BigQueryToCloudStorageOperator( 222 | task_id='bq_dnn_eval_to_gcs', 223 | source_project_dataset_table="{}.{}.{}".format(PROJECT, DATASET, 'dnn_eval'), 224 | destination_cloud_storage_uris=data_eval_locations, 225 | print_header=False, 226 | dag=dag 227 | ) 228 | 229 | t5c = bigquery_to_gcs.BigQueryToCloudStorageOperator( 230 | task_id='bq_dnn_test_to_gcs', 231 | source_project_dataset_table="{}.{}.{}".format(PROJECT, DATASET, 'dnn_test'), 232 | destination_cloud_storage_uris=data_test_locations, 233 | print_header=False, 234 | dag=dag 235 | ) 236 | 237 | t5d = bigquery_to_gcs.BigQueryToCloudStorageOperator( 238 | task_id='bq_btyd_to_gcs', 239 | source_project_dataset_table="{}.{}.{}".format(PROJECT, DATASET, 'features_n_target'), 240 | destination_cloud_storage_uris=data_btyd_location, 241 | print_header=True, 242 | dag=dag 243 | ) 244 | 245 | 246 | # 247 | # Train the model using ML Engine (TensorFlow DNN or Lifetimes BTYD) 248 | # 249 | def do_train_ml_engine(**kwargs): 250 | """ 251 | """ 252 | job_id = 'clv-{}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M')) 253 | 254 | mlengine_operator.MLEngineTrainingOperator( 255 | task_id='train_ml_engine_job', 256 | project_id=PROJECT, 257 | job_id=job_id, 258 | package_uris=['gs://{}/code/{}'.format(COMPOSER_BUCKET_NAME, MODEL_PACKAGE_NAME)], 259 | training_python_module='trainer.task', 260 | region=REGION, 261 | training_args=['--job-dir', 'gs://{}/{}/{}'.format(COMPOSER_BUCKET_NAME, PREFIX_JOBS_EXPORT, job_id), 262 | '--data-src', 'gs://{}'.format(LOCATION_TRAINING_DATA), 263 | '--model_type', kwargs['dag_run'].conf.get('model_type')], 264 | dag=dag 265 | ).execute(kwargs) 266 | 267 | t6 = PythonOperator( 268 | task_id='train_ml_engine_task', dag=dag, python_callable=do_train_ml_engine) 269 | 270 | # 271 | # Copies the latest model to a consistent 'final' bucket 272 | # 273 | def do_copy_model_to_final(**kwargs): 274 | gcs = GoogleCloudStorageHook() 275 | 276 | # Returns all the objects within the bucket. All sub-buckets are considered 277 | # as prefix of the leaves. 
The listing does not differentiate files from sub-folders. 278 | all_jobs_files = gcs.list( 279 | bucket=COMPOSER_BUCKET_NAME, 280 | prefix='{}/export/estimate'.format(PREFIX_JOBS_EXPORT) 281 | ) 282 | 283 | # Extract the latest model folder, the parent of variables/ and saved_model.pbtxt 284 | # The max() string contains the latest model folder (a timestamp like 1234567890); 285 | # we need to extract it using a regex 286 | # ex: jobs/clv-composer/export/estimate/1234567890/variables/variables.index 287 | # returns /1234567890/ 288 | latest_model_bucket = re.findall(r'/\d+/', max(all_jobs_files))[0] 289 | 290 | # List all the files that need to be copied (only files in the latest folder, 291 | # skipping the entries that are not files but sub-folders) 292 | for c in [f for f in all_jobs_files 293 | if latest_model_bucket in f and f[-1] != '/']: 294 | 295 | # The model used for training is saved into a 'final' sub-folder of the 296 | # export folder. 297 | dest_object = c.split(latest_model_bucket)[1] 298 | dest_object = '{}/{}'.format(PREFIX_FINAL_MODEL, dest_object) 299 | 300 | logging.info("Copying {} to {} ...".format(c, dest_object)) 301 | 302 | gcs.copy( 303 | source_bucket=COMPOSER_BUCKET_NAME, 304 | source_object=c, 305 | destination_object=dest_object 306 | ) 307 | 308 | # Note that this could also be done in TensorFlow using tf.gfile after the 309 | # model is created, but for flexibility it was decided to do this in the 310 | # wider workflow. This way, it is also possible to pick other models. 311 | t7 = PythonOperator( 312 | task_id='copy_model_to_final', 313 | python_callable=do_copy_model_to_final, 314 | dag=dag) 315 | 316 | # 317 | # Model Creation 318 | # 319 | 320 | def do_check_model(**kwargs): 321 | """ Check if a model with the name exists using Hooks instead of operators. 322 | Uses xcom_push to pass it to the next step. Could also use a plain return if no key were needed. 323 | """ 324 | # pushes an XCom with key 'is_project' for the next task to pull 325 | mle = MLEngineHook() 326 | model_name = kwargs['dag_run'].conf.get('model_name') 327 | # return bool(mle.get_model(PROJECT, MODEL_DNN_NAME)) 328 | project = mle.get_model(PROJECT, model_name) 329 | kwargs['ti'].xcom_push(key='is_project', value=bool(project)) 330 | 331 | 332 | def do_create_model(**kwargs): 333 | """ Creates a model only if one with the same name does not exist. 334 | It leverages the check from the previous task pushed using xcom. 335 | """ 336 | model_params = { 337 | 'name': kwargs['dag_run'].conf.get('model_name'), 338 | 'description': 'A custom DNN regressor model', 339 | 'regions': [REGION] 340 | } 341 | 342 | ti = kwargs['ti'] 343 | 344 | is_model = ti.xcom_pull(key='is_project', task_ids='check_model') 345 | if not is_model: 346 | mle = MLEngineHook() 347 | mle.create_model(PROJECT, model_params) 348 | 349 | # Checks if model exists using Hook instead of GCP operators due to conditional. 350 | t8 = PythonOperator( 351 | task_id='check_model', dag=dag, python_callable=do_check_model) 352 | 353 | # Creates model if it does not exist using Hook instead of GCP operators 354 | t9 = PythonOperator( 355 | task_id='create_model', dag=dag, python_callable=do_create_model) 356 | 357 | # 358 | # Version Creation 359 | # 360 | 361 | def do_list_versions(**kwargs): 362 | """ Check if a version with the name exists using Hooks instead of operators. 363 | Uses xcom_push to pass it to the next step. Could also use a plain return if no key were needed.
364 | """ 365 | mle = MLEngineHook() 366 | model_name = kwargs['dag_run'].conf.get('model_name') 367 | model_versions = mle.list_versions(PROJECT, model_name) 368 | kwargs['ti'].xcom_push(key='model_versions', value=model_versions) 369 | 370 | 371 | def do_create_version(**kwargs): 372 | """ Creates a new version or overwrite if existing one. It leverages the 373 | check from the previous task pushed using xcom. 374 | """ 375 | version_params = { 376 | "name": kwargs['dag_run'].conf.get('model_version'), 377 | "description": 'Version 1', 378 | "runtimeVersion": kwargs['dag_run'].conf.get('tf_version'), 379 | "deploymentUri": 'gs://{}/{}'.format(COMPOSER_BUCKET_NAME, PREFIX_FINAL_MODEL) 380 | } 381 | 382 | ti = kwargs['ti'] 383 | 384 | mle = MLEngineHook() 385 | 386 | model_name = kwargs['dag_run'].conf.get('model_name') 387 | model_versions = ti.xcom_pull(key='model_versions', task_ids='list_versions') 388 | 389 | version_path = 'projects/{}/models/{}/versions/{}'.format(PROJECT, 390 | model_name, 391 | version_params['name']) 392 | 393 | if version_path in [v['name'] for v in model_versions]: 394 | logging.info("Delete previously version of the model to overwrite.") 395 | mle.delete_version(PROJECT, model_name, version_params['name']) 396 | 397 | mle.create_version(PROJECT, model_name, version_params) 398 | 399 | # Checks if model exists using Hook instead of GCP operators due to conditional. 400 | t10 = PythonOperator( 401 | task_id='list_versions', dag=dag, python_callable=do_list_versions) 402 | 403 | # Creates model if it does not exist using Hook instead of GCP operators 404 | t11 = PythonOperator( 405 | task_id='create_version', dag=dag, python_callable=do_create_version) 406 | 407 | # Create task graph 408 | t1.set_downstream(t2) 409 | t2.set_downstream(t3) 410 | t3.set_downstream(t4_train_cond) 411 | t4_train_cond.set_downstream([t4_ml_engine, t4_automl]) 412 | t4_ml_engine.set_downstream([t4a, t4b, t4c]) 413 | t4_ml_engine.set_downstream(t5d) 414 | t4a.set_downstream(t5a) 415 | t4b.set_downstream(t5b) 416 | t4c.set_downstream(t5c) 417 | t6.set_upstream([t5a, t5b, t5c, t5d]) 418 | t6.set_downstream(t7) 419 | t7.set_downstream(t8) 420 | t9.set_upstream(t8) 421 | t9.set_downstream(t10) 422 | t10.set_downstream(t11) 423 | -------------------------------------------------------------------------------- /run/airflow/dags/02_predict_serve.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import datetime, json, logging 16 | from airflow import models 17 | from airflow.operators.python_operator import PythonOperator, BranchPythonOperator 18 | from airflow.hooks.base_hook import BaseHook 19 | from airflow.contrib.operators import mlengine_operator 20 | from airflow.contrib.operators import mlengine_operator_utils 21 | from airflow.contrib.operators import dataflow_operator 22 | from airflow.contrib.operators import gcs_to_bq 23 | # TODO: Add when Composer is on Airflow 2.0 and more Hooks are available 24 | # from airflow.contrib.operators import gcs_list_operator 25 | from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook 26 | from airflow.utils import trigger_rule 27 | 28 | from google.cloud.automl_v1beta1 import AutoMlClient, PredictionServiceClient 29 | from clv_automl import clv_automl 30 | 31 | 32 | def _get_project_id(): 33 | """Get project ID from default GCP connection.""" 34 | 35 | extras = BaseHook.get_connection('google_cloud_default').extra_dejson 36 | key = 'extra__google_cloud_platform__project' 37 | if key in extras: 38 | project_id = extras[key] 39 | else: 40 | raise ValueError('Must configure project_id in google_cloud_default ' 41 | 'connection from Airflow Console') 42 | return project_id 43 | 44 | PROJECT = _get_project_id() 45 | REGION = models.Variable.get('region') 46 | DF_ZONE = models.Variable.get('df_zone') 47 | DF_TEMP = models.Variable.get('df_temp_location') 48 | COMPOSER_BUCKET_NAME = models.Variable.get('composer_bucket_name') 49 | 50 | #[START dag_predict_serve] 51 | default_dag_args = { 52 | 'start_date': datetime.datetime(2050, 1, 1), 53 | 'schedule_interval': None, 54 | 'provide_context': True, 55 | 'dataflow_default_options': { 56 | 'project': PROJECT, 57 | 'zone': DF_ZONE, 58 | 'tempLocation': DF_TEMP 59 | } 60 | } 61 | 62 | dag = models.DAG( 63 | 'predict_serve', 64 | default_args = default_dag_args) 65 | #[END dag_predict_serve] 66 | 67 | # 68 | # Runs prediction. 69 | # 70 | 71 | def get_model_type(**kwargs): 72 | model_type = kwargs['dag_run'].conf.get('model_type') 73 | if model_type == 'automl': 74 | model_train_task = 'predict_automl' 75 | else: 76 | model_train_task = 'predict_ml_engine' 77 | return model_train_task 78 | 79 | t0_predict_cond = BranchPythonOperator(task_id='predict_branch', dag=dag, python_callable=get_model_type) 80 | 81 | 82 | def do_predict_mle(**kwargs): 83 | """ Runs a batch prediction on new data and saves the results as CSV into 84 | output_path.
85 | """ 86 | job_id = 'clv-{}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M')) 87 | gcs_prediction_input = 'gs://{}/predictions/to_predict.csv'.format(COMPOSER_BUCKET_NAME) 88 | gcs_prediction_output = 'gs://{}/predictions/output'.format(COMPOSER_BUCKET_NAME) 89 | model_name = kwargs['dag_run'].conf.get('model_name') 90 | model_version = kwargs['dag_run'].conf.get('model_version') 91 | 92 | logging.info("Running prediction using {}:{}...".format(model_name, 93 | model_version)) 94 | 95 | mlengine_operator.MLEngineBatchPredictionOperator( 96 | task_id='predict_dnn', 97 | project_id=PROJECT, 98 | job_id=job_id, 99 | region=REGION, 100 | data_format='TEXT', 101 | input_paths=gcs_prediction_input, 102 | output_path=gcs_prediction_output, 103 | model_name=model_name, 104 | version_name=model_version, 105 | #uri=gs://WHERE_MODEL_IS_IF_NOT_ML_ENGINE 106 | #runtime_version=TF_VERSION, 107 | dag=dag 108 | ).execute(kwargs) 109 | 110 | 111 | def do_predict_automl(**kwargs): 112 | # get automl clients 113 | automl_client = AutoMlClient() 114 | automl_predict_client = PredictionServiceClient() 115 | 116 | # get model resource name 117 | automl_model = models.Variable.get('automl_model') 118 | location_path = automl_client.location_path(PROJECT, REGION) 119 | model_list_response = automl_client.list_models(location_path) 120 | model_list = [m for m in model_list_response] 121 | model = [m for m in model_list if m.display_name == automl_model][0] 122 | 123 | # run batch prediction 124 | gcs_prediction_input = 'gs://{}/predictions/to_predict.csv'.format(COMPOSER_BUCKET_NAME) 125 | gcs_prediction_output = 'gs://{}/predictions/output'.format(COMPOSER_BUCKET_NAME) 126 | clv_automl.do_batch_prediction(automl_predict_client, 127 | model.name, 128 | gcs_prediction_input, 129 | gcs_prediction_output) 130 | 131 | t1a = PythonOperator( 132 | task_id='predict_ml_engine', dag=dag, python_callable=do_predict_mle) 133 | 134 | t1b = PythonOperator( 135 | task_id='predict_automl', dag=dag, python_callable=do_predict_automl) 136 | 137 | 138 | # 139 | # Load the predictions from GCS to Datastore. 140 | # 141 | 142 | def do_load_to_datastore(**kwargs): 143 | """ Saves the predictions results into Datastore. Because there is no way to 144 | directly load a CSV to Datastore, we use Apache Beam on Dataflow with 145 | templates gs://dataflow-templates/latest/GCS_Text_to_Datastore. 
146 | https://cloud.google.com/dataflow/docs/templates/provided-templates#gcstexttodatastore 147 | """ 148 | gcs_prediction_output = 'gs://{}/predictions/output'.format(COMPOSER_BUCKET_NAME) 149 | template = 'gs://dataflow-templates/latest/GCS_Text_to_Datastore' 150 | 151 | df_template_params = { 152 | 'textReadPattern': '{}/prediction.results*'.format(gcs_prediction_output), 153 | 'javascriptTextTransformGcsPath': 'gs://{}/gcs_datastore_transform.js'.format(COMPOSER_BUCKET_NAME), 154 | 'javascriptTextTransformFunctionName': 'from_prediction_output_to_datastore_object', 155 | 'datastoreWriteProjectId': PROJECT, 156 | 'errorWritePath': 'gs://{}/errors/serving_load'.format(COMPOSER_BUCKET_NAME) 157 | } 158 | 159 | dataflow_operator.DataflowTemplateOperator( 160 | task_id='gcs_predictions_df_transform', 161 | project_id=PROJECT, 162 | template=template, 163 | parameters=df_template_params, 164 | dag=dag 165 | ).execute(kwargs) 166 | 167 | t2 = PythonOperator( 168 | task_id='load_to_datastore', dag=dag, python_callable=do_load_to_datastore) 169 | 170 | # 171 | # Loads the predictions from Cloud Storage to BigQuery 172 | # 173 | 174 | def do_list_predictions_files(**kwargs): 175 | """ Retrieves all the prediction files that should be loaded to BigQuery. 176 | Cannot use GoogleCloudStorageToBigQueryOperator directly because there may be 177 | multiple files. 178 | """ 179 | # List all relevant files 180 | # TODO Add when Composer is on Airflow 2.0 181 | # predictions_files = gcs_list_operator.GoogleCloudStorageListOperator( 182 | # task_id='predictions_files', 183 | # bucket=COMPOSER_BUCKET_NAME, 184 | # prefix='predictions/output/prediction.results-' 185 | # ) 186 | # TODO Remove when Composer is on Airflow 2.0 187 | gcs = GoogleCloudStorageHook() 188 | predictions_files = gcs.list( 189 | bucket=COMPOSER_BUCKET_NAME, 190 | prefix='predictions/output/prediction.results-' 191 | ) 192 | 193 | logging.info("Prediction files are: {}".format(predictions_files)) 194 | 195 | # Push an XCom so the list can be used by the next task 196 | kwargs['ti'].xcom_push(key='predictions_files', value=predictions_files) 197 | 198 | 199 | def do_load_to_bq(**kwargs): 200 | """ Loads the prediction files to BigQuery using the list output from 201 | do_list_predictions_files.
202 | """ 203 | dataset = kwargs['dag_run'].conf.get('dataset') 204 | 205 | # Reads files from the variables saved in the previous task 206 | ti = kwargs['ti'] 207 | predictions_files = ti.xcom_pull(key='predictions_files', 208 | task_ids='list_predictions_files') 209 | 210 | gcs_to_bq.GoogleCloudStorageToBigQueryOperator( 211 | task_id="load_gcs_predictions_to_bigquery", 212 | bucket=COMPOSER_BUCKET_NAME, 213 | source_objects=predictions_files, 214 | schema_fields=[{ 215 | 'name':'customer_id', 216 | 'type':'STRING' 217 | },{ 218 | 'name':'predicted_monetary', 219 | 'type':'FLOAT' 220 | },{ 221 | 'name':'predictions', 222 | 'type':'FLOAT' 223 | }], 224 | source_format="NEWLINE_DELIMITED_JSON", 225 | skip_leading_rows=1, 226 | destination_project_dataset_table="{}.{}.{}".format(PROJECT, 227 | dataset, 228 | 'predictions'), 229 | create_disposition="CREATE_IF_NEEDED", 230 | write_disposition="WRITE_TRUNCATE", 231 | dag=dag 232 | ).execute(kwargs) 233 | 234 | t3 = PythonOperator( 235 | task_id='list_predictions_files', dag=dag, python_callable=do_list_predictions_files) 236 | 237 | t4 = PythonOperator( 238 | task_id='load_to_bq', dag=dag, python_callable=do_load_to_bq) 239 | 240 | # How to link them 241 | t0_predict_cond.set_downstream([t1a, t1b]) 242 | t2.set_upstream([t1a, t1b]) 243 | t3.set_upstream([t1a, t1b]) 244 | t3.set_downstream(t4) 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | -------------------------------------------------------------------------------- /run/airflow/gcs_datastore_transform.js: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | // To be copied to GCS so it can be used with the Dataflow template 16 | function from_prediction_output_to_datastore_object(prediction_row, entity){ 17 | /* 18 | * prediction_row looks like what is given by the export function in model.py 19 | * {"properties": 427.7606201171875, "key": ["'abc'"]} 20 | * entity should match https://cloud.google.com/datastore/docs/reference/data/rest/v1/Entity 21 | */ 22 | 23 | //[START row_to_ds] 24 | var prediction_object = JSON.parse(prediction_row); 25 | 26 | var to_write = { 27 | key: { 28 | path: [{ 29 | //id: prediction_object.key, 30 | kind: 'clv', 31 | name: prediction_object.customer_id 32 | }] 33 | }, 34 | properties: { 35 | predicted_monetary: {doubleValue: prediction_object.predicted_monetary} 36 | } 37 | }; 38 | 39 | return JSON.stringify(to_write); 40 | //[END row_to_ds] 41 | } -------------------------------------------------------------------------------- /run/airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-automl>=0.2.0 2 | -------------------------------------------------------------------------------- /run/airflow/schema_source.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "mode": "NULLABLE", 4 | "name": "InvoiceNo", 5 | "type": "STRING" 6 | }, 7 | { 8 | "mode": "NULLABLE", 9 | "name": "StockCode", 10 | "type": "STRING" 11 | }, 12 | { 13 | "mode": "NULLABLE", 14 | "name": "Description", 15 | "type": "STRING" 16 | }, 17 | { 18 | "mode": "NULLABLE", 19 | "name": "Quantity", 20 | "type": "INTEGER" 21 | }, 22 | { 23 | "mode": "NULLABLE", 24 | "name": "InvoiceDate", 25 | "type": "STRING" 26 | }, 27 | { 28 | "mode": "NULLABLE", 29 | "name": "UnitPrice", 30 | "type": "FLOAT" 31 | }, 32 | { 33 | "mode": "NULLABLE", 34 | "name": "CustomerID", 35 | "type": "STRING" 36 | }, 37 | { 38 | "mode": "NULLABLE", 39 | "name": "Country", 40 | "type": "STRING" 41 | } 42 | ] -------------------------------------------------------------------------------- /run/mltrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2018 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | usage () { 19 | echo "usage: run/mltrain.sh [local | train | tune] [gs://]data_folder_or_bucket [args] 20 | 21 | Use 'local' to train locally with a local data folder, and 'train' and 'tune' to 22 | run on ML Engine. 23 | 24 | For ML Engine jobs you must supply a bucket on GCS. The job data 25 | folder will be gs://bucket/data and the job directory will be gs://bucket/jobs. 26 | So your data files must already be in gs://bucket/data. For DNN models the 27 | data should be named 'train.csv', 'eval.csv' and 'test.csv'; for probabilistic 28 | models the file must be 'btyd.csv'. 29 | 30 | For probabilistic models, specify '--model_type paretonbd_model' and include 31 | --threshold_date and --predict_end args.
32 | 33 | Examples: 34 | 35 | # train locally 36 | run/mltrain.sh local data 37 | 38 | # train on ML Engine 39 | run/mltrain.sh train gs://your_bucket 40 | 41 | # tune hyperparams on ML Engine: 42 | run/mltrain.sh tune gs://your_bucket 43 | 44 | # train using btyd models 45 | run/mltrain.sh local data --model_type paretonbd_model --threshold_date 2011-08-08 --predict_end 2011-12-12 46 | " 47 | 48 | } 49 | 50 | date 51 | 52 | TIME=`date +"%Y%m%d_%H%M%S"` 53 | 54 | 55 | if [[ $# == 0 || $# == 1 ]]; then 56 | usage 57 | exit 1 58 | fi 59 | 60 | # set job vars 61 | TRAIN_JOB="$1" 62 | DATA_DIR="$2" 63 | BUCKET="$2" 64 | JOB_NAME=clv_${TRAIN_JOB}_${TIME} 65 | REGION=us-central1 66 | 67 | # queue additional args 68 | shift; shift 69 | 70 | if [[ ${TRAIN_JOB} == "local" ]]; then 71 | 72 | ARGS="--data-src ${DATA_DIR} --verbose-logging $@" 73 | 74 | mkdir -p jobs/${JOB_NAME} 75 | 76 | gcloud ml-engine local train \ 77 | --job-dir jobs/${JOB_NAME} \ 78 | --module-name clv_mle.trainer.task \ 79 | --package-path trainer \ 80 | -- \ 81 | ${ARGS} 82 | 83 | elif [[ ${TRAIN_JOB} == "train" ]]; then 84 | 85 | ARGS="--data-src ${BUCKET}/data --verbose-logging $@" 86 | 87 | gcloud beta ml-engine jobs submit training ${JOB_NAME} \ 88 | --job-dir ${BUCKET}/jobs/${JOB_NAME} \ 89 | --region $REGION \ 90 | --scale-tier=CUSTOM \ 91 | --module-name trainer.task \ 92 | --package-path clv_mle/trainer \ 93 | --config clv_mle/config.yaml \ 94 | --runtime-version 1.10 \ 95 | -- \ 96 | ${ARGS} 97 | 98 | elif [[ $TRAIN_JOB == "tune" ]]; then 99 | 100 | ARGS="--data-src ${BUCKET}/data --verbose-logging $@" 101 | 102 | # set configuration for tuning 103 | CONFIG_TUNE="clv_mle/config_tune.json" 104 | 105 | gcloud beta ml-engine jobs submit training ${JOB_NAME} \ 106 | --job-dir ${BUCKET}/jobs/${JOB_NAME} \ 107 | --region ${REGION} \ 108 | --scale-tier=CUSTOM \ 109 | --module-name trainer.task \ 110 | --package-path clv_mle/trainer \ 111 | --config ${CONFIG_TUNE} \ 112 | --runtime-version 1.10 \ 113 | -- \ 114 | --hypertune \ 115 | ${ARGS} 116 | 117 | else 118 | usage 119 | fi 120 | 121 | date 122 | --------------------------------------------------------------------------------
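Both DAGs and the SQL templates above read their runtime parameters from dag_run.conf: the SQL files expect project, dataset, threshold_date and max_monetary, while the DAG code reads model_type, model_name, model_version, tf_version and dataset. The snippet below is a minimal sketch of how such values might be supplied when triggering build_train_deploy through the Cloud Composer CLI; the environment name, location and every parameter value are illustrative placeholders, not values taken from this repository.

# Hypothetical example: trigger the build_train_deploy DAG with the conf keys
# referenced by the SQL templates and the DAG code (all values are placeholders).
gcloud composer environments run my-composer-env \
    --location us-central1 \
    trigger_dag -- build_train_deploy \
    --conf '{"project": "my-project", "dataset": "my_clv_dataset", "threshold_date": "2013-01-31", "max_monetary": 15000, "model_type": "dnn_model", "model_name": "clv_dnn", "model_version": "v1", "tf_version": "1.10"}'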