├── LICENSE ├── README.md ├── resources └── TensorBoard-Validation.png └── src ├── LICENSE └── TensorFlowOnSpark └── examples └── criteo └── spark ├── __init__.py ├── criteo_dist.py ├── criteo_spark.py └── requirements.txt /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by 10 | Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting 13 | the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are 16 | controlled by, or are under common control with that entity. For the purposes of this definition, 17 | "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, 18 | whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding 19 | shares, or (iii) beneficial ownership of such entity. 20 | 21 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 22 | 23 | "Source" form shall mean the preferred form for making modifications, including but not limited to software 24 | source code, documentation source, and configuration files. 25 | 26 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, 27 | including but not limited to compiled object code, generated documentation, and conversions to other media types. 28 | 29 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, 30 | as indicated by a copyright notice that is included in or attached to the work (an example is provided in 31 | the Appendix below). 32 | 33 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) 34 | the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, 35 | as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not 36 | include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work 37 | and Derivative Works thereof. 38 | 39 | "Contribution" shall mean any work of authorship, including the original version of the Work and any 40 | modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to 41 | Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to 42 | submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of 43 | electronic, verbal, or written communication sent to the Licensor or its representatives, including but not 44 | limited to communication on electronic mailing lists, source code control systems, and issue tracking systems 45 | that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but 46 | excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner 47 | as "Not a Contribution." 
48 | 49 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been 50 | received by Licensor and subsequently incorporated within the Work. 51 | 52 | 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby 53 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license 54 | to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute 55 | the Work and such Derivative Works in Source or Object form. 56 | 57 | 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby 58 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated 59 | in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer 60 | the Work, where such license applies only to those patent claims licensable by such Contributor that are 61 | necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work 62 | to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including 63 | a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the 64 | Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under 65 | this License for that Work shall terminate as of the date such litigation is filed. 66 | 67 | 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any 68 | medium, with or without modifications, and in Source or Object form, provided that You meet the following 69 | conditions: 70 | 71 | You must give any other recipients of the Work or Derivative Works a copy of this License; and 72 | You must cause any modified files to carry prominent notices stating that You changed the files; and 73 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, 74 | trademark, and attribution notices from the Source form of the Work, excluding those notices that do not 75 | pertain to any part of the Derivative Works; and 76 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You 77 | distribute must include a readable copy of the attribution notices contained within such NOTICE file, 78 | excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the 79 | following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source 80 | form or documentation, if provided along with the Derivative Works; or, within a display generated by the 81 | Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file 82 | are for informational purposes only and do not modify the License. You may add Your own attribution notices 83 | within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, 84 | provided that such additional attribution notices cannot be construed as modifying the License. 
85 | 86 | You may add Your own copyright statement to Your modifications and may provide additional or different 87 | license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such 88 | Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise 89 | complies with the conditions stated in this License. 90 | 91 | 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally 92 | submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this 93 | License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall 94 | supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding 95 | such Contributions. 96 | 97 | 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or 98 | product names of the Licensor, except as required for reasonable and customary use in describing the origin 99 | of the Work and reproducing the content of the NOTICE file. 100 | 101 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the 102 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 103 | OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, 104 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for 105 | determining the appropriateness of using or redistributing the Work and assume any risks associated with Your 106 | exercise of permissions under this License. 107 | 108 | 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), 109 | contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) 110 | or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, 111 | special, incidental, or consequential damages of any character arising as a result of this License or out 112 | of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work 113 | stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such 114 | Contributor has been advised of the possibility of such damages. 115 | 116 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, 117 | You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other 118 | liability obligations and/or rights consistent with this License. However, in accepting such obligations, 119 | You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, 120 | and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred 121 | by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional 122 | liability. 
123 |
124 | END OF TERMS AND CONDITIONS
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Learning Click-Through Rate at Scale with TensorFlow on Spark
2 | This project is no longer maintained by Criteo, and the datasets are no longer available for download. Please refer to more recent repositories for a supported project.
3 |
4 | ## Introduction
5 | This project consists of learning a click-through rate (CTR) model at scale using TensorFlowOnSpark.
6 |
7 | In 2013, Criteo released a 1TB dataset: http://labs.criteo.com/2013/12/download-terabyte-click-logs/.
8 | More recently, in order to promote Google Cloud technology, Google published a solution for training a model on this dataset at scale using their proprietary platform: https://cloud.google.com/blog/big-data/2017/02/using-google-cloud-machine-learning-to-predict-clicks-at-scale
9 |
10 | Instead, we propose a solution based on open-source technology that can be leveraged on any cloud,
11 | or on a private cluster running Spark.
12 |
13 | We demonstrate how TensorFlowOnSpark (https://github.com/yahoo/TensorFlowOnSpark) can be used to reach state-of-the-art performance in predicting the probability of a click at scale.
14 | Note that the goal here is not to produce the best pCTR predictor, but rather to establish an open, scalable method that users can build on.
15 | Hence, the proposed solution remains very simple and relies solely on basic feature extraction, cross-features, and hashing, trained with logistic regression.
16 |
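To make the feature engineering concrete, the encoding implemented in `criteo_dist.py` boils down to the simplified sketch below. The helper names (`bucketize`, `active_indices`) are illustrative only and do not appear in the repository; the constants mirror the values used in `criteo_dist.py`.

```
# Simplified sketch of the feature encoding used in criteo_dist.py (illustrative only).
import mmh3

NB_BUCKETS = 40            # buckets per integer feature
NB_HASHES_CAT = 2 ** 15    # hash space per categorical feature
NB_HASHES_CROSS = 2 ** 15  # shared hash space for all cross-features
INT_FEATURES, CAT_FEATURES = 13, 26
BOUNDARIES = [1.5 ** j - 0.51 for j in range(NB_BUCKETS)]

def bucketize(value):
    """Map an integer feature value to a bucket index on log-spaced boundaries."""
    for index, boundary in enumerate(BOUNDARIES):
        if value < boundary:
            return index
    return NB_BUCKETS - 1

def active_indices(int_feats, cat_feats):
    """Return the indices of the sparse one-hot input vector that are set to 1."""
    indices = []
    # 13 bucketized integer features
    for i, value in enumerate(int_feats):
        if value != '':
            indices.append(i * NB_BUCKETS + bucketize(int(value)))
    # 26 hashed categorical features
    offset = INT_FEATURES * NB_BUCKETS
    for i, value in enumerate(cat_feats):
        if value != '':
            indices.append(offset + i * NB_HASHES_CAT + mmh3.hash(value) % NB_HASHES_CAT)
    # hashed cross-features over every pair of non-empty features
    offset += CAT_FEATURES * NB_HASHES_CAT
    names = ["%d_%d" % (i, bucketize(int(v))) if i < INT_FEATURES else "%d_%s" % (i, v)
             for i, v in enumerate(list(int_feats) + list(cat_feats)) if v.strip() != '']
    for a in range(len(names)):
        for b in range(a + 1, len(names)):
            indices.append(offset + mmh3.hash(names[a] + "_" + names[b]) % NB_HASHES_CROSS)
    return indices
```

The resulting sparse one-hot vector (13 * 40 + 26 * 2^15 + 2^15 = 885,256 dimensions) is fed to a single linear layer with a sigmoid output, i.e. a logistic regression trained on the log loss with the Adam optimizer.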
17 | ## Install and test TensorFlowOnSpark
18 | Before making use of this code, please make sure you can install TensorFlowOnSpark on your cluster and
19 | run the MNIST example as illustrated here:
20 | https://github.com/yahoo/TensorFlowOnSpark/wiki/GetStarted_YARN
21 | In doing so, make sure that you have set up the following environment variables correctly:
22 |
23 | ```
24 | export JAVA_HOME=
25 | export HADOOP_HOME=
26 | export SPARK_HOME=
27 | export HADOOP_HDFS_HOME=
28 | export SPARK_HOME=
29 | export PYTHON_ROOT=./Python
30 | export PATH=${PATH}:${HADOOP_HOME}/bin:${SPARK_HOME}/bin:${HADOOP_HDFS_HOME}/bin:${SPARK_HOME}/bin:${PYTHON_ROOT}/bin
31 | export PYSPARK_PYTHON=${PYTHON_ROOT}/bin/python
32 | export SPARK_YARN_USER_ENV="PYSPARK_PYTHON=/usr/bin/python"
33 | export QUEUE=default
34 | export LIB_HDFS=
35 | export LIB_JVM=
36 | ```
37 |
38 | ## Data set
39 |
40 | The raw data can be accessed here: http://labs.criteo.com/2013/12/download-terabyte-click-logs/
41 |
42 | ### Download the data set
43 | ```
44 | for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23; do
45 | curl -O http://azuremlsampleexperiments.blob.core.windows.net/criteo/day_${i}.gz
46 | aws s3 mv day_${i}.gz s3://XXXXXXXXXXXXXXXXXXXXXXXXXX/released/
47 | done
48 | ```
49 |
50 | ### Upload training data to your AWS S3 bucket using Pig
51 |
52 | ```
53 | %declare awskey yourkey
54 | %declare awssecretkey yoursecretkey
55 | SET mapred.output.compress 'true';
56 | SET mapred.output.compression.codec 'org.apache.hadoop.io.compress.BZip2Codec';
57 | train_data = load 's3n://${awskey}:${awssecretkey}@XXXXXXXXXXXXXXXXXXXXXXXXXX/released/day_{[0-9],1[0-9],2[0-2]}.gz';
58 | train_data = FOREACH (GROUP train_data BY ROUND(10000* RANDOM()) PARALLEL 10000) GENERATE FLATTEN(train_data);
59 | store train_data into 's3n://${awskey}:${awssecretkey}@XXXXXXXXXXXXXXXXXXXXXXXXXX/data/training/' using PigStorage();
60 | ```
61 | Here we divide the training data into 10,000 chunks, which allows TensorFlowOnSpark to reduce its memory usage.
62 |
63 | ### Upload validation data to your AWS S3 bucket using Pig
64 | ```
65 | %declare awskey yourkey
66 | %declare awssecretkey yoursecretkey
67 | SET mapred.output.compress 'true';
68 | SET mapred.output.compression.codec 'org.apache.hadoop.io.compress.BZip2Codec';
69 | validation_data = load 's3n://${awskey}:${awssecretkey}@XXXXXXXXXXXXXXXXXXXXXXXXXX/released/day_23.gz';
70 | validation_data = FOREACH (GROUP validation_data BY ROUND(100* RANDOM()) PARALLEL 100) GENERATE FLATTEN(validation_data);
71 | store validation_data into 's3n://${awskey}:${awssecretkey}@XXXXXXXXXXXXXXXXXXXXXXXXXX/data/validation' using PigStorage();
72 | ```
73 |
74 |
75 |
76 |
77 |
78 |
79 | ## Running the example
80 |
81 | Set up the task variables:
82 | ```
83 | export TRAINING_DATA=hdfs_path_to_training_data_directory
84 | export VALIDATION_DATA=hdfs_path_to_validation_data_directory
85 | export MODEL_OUTPUT=hdfs://default/tmp/criteo_ctr_prediction
86 | ```
87 | Run the command:
88 |
89 | ```
90 | ${SPARK_HOME}/bin/spark-submit \
91 | --master yarn \
92 | --deploy-mode cluster \
93 | --queue ${QUEUE} \
94 | --num-executors 12 \
95 | --executor-memory 27G \
96 | --py-files TensorFlowOnSpark/tfspark.zip,TensorFlowOnSpark/examples/criteo/spark/criteo_dist.py \
97 | --conf spark.dynamicAllocation.enabled=false \
98 | --conf spark.yarn.maxAppAttempts=1 \
99 | --archives hdfs:///user/${USER}/Python.zip#Python \
100 | --conf spark.executorEnv.LD_LIBRARY_PATH="$LIB_HDFS:$LIB_JVM" \
101 | --conf spark.executorEnv.HADOOP_HDFS_HOME="$HADOOP_HDFS_HOME" \
102 | --conf spark.executorEnv.CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath --glob):${CLASSPATH}" \
103 | TensorFlowOnSpark/examples/criteo/spark/criteo_spark.py \
104 | --mode train \
105 | --data ${TRAINING_DATA} \
106 | --validation ${VALIDATION_DATA} \
107 | --steps 1000000 \
108 | --model ${MODEL_OUTPUT} --tensorboard \
109 | --tensorboardlogdir ${MODEL_OUTPUT}
110 | ```
111 | ## TensorBoard tracking
112 |
113 | By connecting to the web UI tracker of your application,
114 | you will be able to retrieve the TensorBoard URL from the stdout of the driver:
115 |
116 | ```
117 | TensorBoard running at: http://10.4.112.234:36911
118 | ```
119 |
120 | You can then track the validation loss as training progresses:
121 |
122 |
123 | ![TensorBoard validation loss](resources/TensorBoard-Validation.png)
124 |
125 |
126 |
--------------------------------------------------------------------------------
/resources/TensorBoard-Validation.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/criteo/CriteoDisplayCTR-TFOnSpark/ce3c6691d2964bda1bf19ad6100916fe1ed454e0/resources/TensorBoard-Validation.png
--------------------------------------------------------------------------------
/src/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 |
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by
10 | Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting
13 | the License.
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are 16 | controlled by, or are under common control with that entity. For the purposes of this definition, 17 | "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, 18 | whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding 19 | shares, or (iii) beneficial ownership of such entity. 20 | 21 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 22 | 23 | "Source" form shall mean the preferred form for making modifications, including but not limited to software 24 | source code, documentation source, and configuration files. 25 | 26 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, 27 | including but not limited to compiled object code, generated documentation, and conversions to other media types. 28 | 29 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, 30 | as indicated by a copyright notice that is included in or attached to the work (an example is provided in 31 | the Appendix below). 32 | 33 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) 34 | the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, 35 | as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not 36 | include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work 37 | and Derivative Works thereof. 38 | 39 | "Contribution" shall mean any work of authorship, including the original version of the Work and any 40 | modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to 41 | Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to 42 | submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of 43 | electronic, verbal, or written communication sent to the Licensor or its representatives, including but not 44 | limited to communication on electronic mailing lists, source code control systems, and issue tracking systems 45 | that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but 46 | excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner 47 | as "Not a Contribution." 48 | 49 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been 50 | received by Licensor and subsequently incorporated within the Work. 51 | 52 | 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby 53 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license 54 | to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute 55 | the Work and such Derivative Works in Source or Object form. 56 | 57 | 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby 58 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated 59 | in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer 60 | the Work, where such license applies only to those patent claims licensable by such Contributor that are 61 | necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work 62 | to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including 63 | a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the 64 | Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under 65 | this License for that Work shall terminate as of the date such litigation is filed. 66 | 67 | 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any 68 | medium, with or without modifications, and in Source or Object form, provided that You meet the following 69 | conditions: 70 | 71 | You must give any other recipients of the Work or Derivative Works a copy of this License; and 72 | You must cause any modified files to carry prominent notices stating that You changed the files; and 73 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, 74 | trademark, and attribution notices from the Source form of the Work, excluding those notices that do not 75 | pertain to any part of the Derivative Works; and 76 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You 77 | distribute must include a readable copy of the attribution notices contained within such NOTICE file, 78 | excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the 79 | following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source 80 | form or documentation, if provided along with the Derivative Works; or, within a display generated by the 81 | Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file 82 | are for informational purposes only and do not modify the License. You may add Your own attribution notices 83 | within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, 84 | provided that such additional attribution notices cannot be construed as modifying the License. 85 | 86 | You may add Your own copyright statement to Your modifications and may provide additional or different 87 | license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such 88 | Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise 89 | complies with the conditions stated in this License. 90 | 91 | 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally 92 | submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this 93 | License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall 94 | supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding 95 | such Contributions. 96 | 97 | 6. 
Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or 98 | product names of the Licensor, except as required for reasonable and customary use in describing the origin 99 | of the Work and reproducing the content of the NOTICE file. 100 | 101 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the 102 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 103 | OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, 104 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for 105 | determining the appropriateness of using or redistributing the Work and assume any risks associated with Your 106 | exercise of permissions under this License. 107 | 108 | 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), 109 | contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) 110 | or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, 111 | special, incidental, or consequential damages of any character arising as a result of this License or out 112 | of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work 113 | stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such 114 | Contributor has been advised of the possibility of such damages. 115 | 116 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, 117 | You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other 118 | liability obligations and/or rights consistent with this License. However, in accepting such obligations, 119 | You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, 120 | and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred 121 | by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional 122 | liability. 123 | 124 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /src/TensorFlowOnSpark/examples/criteo/spark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/criteo/CriteoDisplayCTR-TFOnSpark/ce3c6691d2964bda1bf19ad6100916fe1ed454e0/src/TensorFlowOnSpark/examples/criteo/spark/__init__.py -------------------------------------------------------------------------------- /src/TensorFlowOnSpark/examples/criteo/spark/criteo_dist.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Criteo 2 | # Licensed under the terms of the Apache 2.0 license. 3 | # Please see LICENSE file in the project root for terms. 
4 | # Distributed Criteo Display CTR prediction on grid based on TensorFlow on Spark 5 | # https://github.com/yahoo/TensorFlowOnSpark 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import nested_scopes 10 | from __future__ import print_function 11 | 12 | validation_file = None 13 | 14 | 15 | def print_log(worker_num, arg): 16 | print("{0}: {1}".format(worker_num, arg)) 17 | 18 | 19 | def map_fun(args, ctx): 20 | from datetime import datetime 21 | import math 22 | import tensorflow as tf 23 | import numpy as np 24 | import time 25 | from sklearn.metrics import roc_auc_score 26 | import mmh3 27 | 28 | class CircularFile(object): 29 | def __init__(self, filename): 30 | self.filename = filename 31 | self.file = None 32 | 33 | def readline(self): 34 | if (self.file is None): 35 | self.file = tf.gfile.GFile(self.filename, "r") 36 | 37 | p_line = self.file.readline() 38 | 39 | if p_line == "": 40 | self.file.close() 41 | self.file = tf.gfile.GFile(self.filename, "r") 42 | p_line = self.file.readline() 43 | return p_line 44 | 45 | def close(self): 46 | self.file.close() 47 | self.file = None 48 | 49 | 50 | worker_num = ctx.worker_num 51 | job_name = ctx.job_name 52 | task_index = ctx.task_index 53 | 54 | 55 | # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict) 56 | if job_name == "ps": 57 | time.sleep((worker_num + 1) * 5) 58 | 59 | vocabulary_size = 39 60 | # Feature indexes as defined in input file 61 | INDEX_CAT_FEATURES = 13 62 | 63 | # These parameters values have been selected for illustration purpose and have not been tuned. 64 | learning_rate = 0.0005 65 | droupout_rate = 0.4 66 | NB_OF_HASHES_CAT = 2 ** 15 67 | NB_OF_HASHES_CROSS = 2 ** 15 68 | NB_BUCKETS = 40 69 | 70 | boundaries_bucket = [1.5 ** j - 0.51 for j in range(NB_BUCKETS)] 71 | # Same as in: 72 | # [https://github.com/GoogleCloudPlatform/cloudml-samples/blob/c272e9f3bf670404fb1570698d8808ab62f0fc9a/criteo_tft/trainer/task.py#L163] 73 | 74 | nb_input_features = ((INDEX_CAT_FEATURES) * NB_BUCKETS) + ( 75 | (vocabulary_size - INDEX_CAT_FEATURES) * NB_OF_HASHES_CAT) + NB_OF_HASHES_CROSS 76 | 77 | 78 | batch_size = args.batch_size 79 | 80 | # Get TF cluster and server instances 81 | cluster, server = ctx.start_cluster_server(1, args.rdma) 82 | 83 | 84 | def get_index_bucket(feature_value): 85 | """ 86 | maps the input feature to a one hot encoding index 87 | :param feature_value: the value of the feature 88 | :return: the index of the one hot encoding that activates for the input value 89 | """ 90 | for index, boundary_value in enumerate(boundaries_bucket): 91 | if feature_value < boundary_value: 92 | return index 93 | return index 94 | 95 | 96 | def get_batch_validation(batch_size): 97 | """ 98 | :param batch_size: 99 | :return: a list of read lines, each lines being a list of the features as read from the input file 100 | """ 101 | global validation_file 102 | if validation_file is None: 103 | validation_file = CircularFile(args.validation) 104 | return [validation_file.readline().split('\t') for _ in range(batch_size)] 105 | 106 | def get_cross_feature_name(index, features): 107 | if index < INDEX_CAT_FEATURES: 108 | index_str = str(index) + "_" + str(get_index_bucket(int(features[index]))) 109 | else: 110 | index_str = str(index) + "_" + features[index] 111 | 112 | return index_str 113 | 114 | def get_next_batch(batch): 115 | """ 116 | maps the batch read from the input file to a data array, and a label array that are fed to 117 | 
the tf placeholders 118 | :param batch: 119 | :return: 120 | """ 121 | data = np.zeros((batch_size, nb_input_features)) 122 | labels = np.zeros(batch_size) 123 | 124 | index = 0 125 | while True: 126 | 127 | features = batch[index][1:] 128 | 129 | if len(features) != vocabulary_size: 130 | continue 131 | 132 | # BUCKETIZE CONTINIOUS FEATURES 133 | for f_index in range(0, INDEX_CAT_FEATURES ): 134 | if features[f_index]: 135 | bucket_index = get_index_bucket(int(features[f_index])) 136 | bucket_number_index = f_index * NB_BUCKETS 137 | bucket_index_offset = bucket_index + bucket_number_index 138 | data[index, bucket_index_offset] = 1 139 | 140 | # BUCKETIZE CATEGORY FEATURES 141 | offset = INDEX_CAT_FEATURES * NB_BUCKETS 142 | for f_index in range(INDEX_CAT_FEATURES, vocabulary_size): 143 | if features[f_index]: 144 | hash_index = mmh3.hash(features[f_index]) % NB_OF_HASHES_CAT 145 | hash_number_index = (f_index - INDEX_CAT_FEATURES) * NB_OF_HASHES_CAT + offset 146 | hash_index_offset = hash_index + hash_number_index 147 | data[index, hash_index_offset] = 1 148 | 149 | # BUCKETIZE CROSS CATEGORY AND CONTINIOUS 150 | offset = INDEX_CAT_FEATURES * NB_BUCKETS + (vocabulary_size - INDEX_CAT_FEATURES) * NB_OF_HASHES_CAT 151 | 152 | for index_i in range(0, vocabulary_size-1): 153 | for index_j in range(index_i + 1, vocabulary_size): 154 | if features[index_i].rstrip() == '' or features[index_j].rstrip() == '': 155 | continue 156 | 157 | index_str_i = get_cross_feature_name(index_i,features) 158 | index_str_j = get_cross_feature_name(index_j,features) 159 | 160 | hash_index = mmh3.hash(index_str_i + "_" + index_str_j) % NB_OF_HASHES_CROSS + offset 161 | data[index, hash_index] = 1 162 | 163 | labels[index] = batch[index][0] 164 | index += 1 165 | if index == batch_size: 166 | break 167 | 168 | return data.astype(int), labels.astype(int) 169 | 170 | 171 | 172 | if job_name == "ps": 173 | server.join() 174 | elif job_name == "worker": 175 | is_chiefing = (task_index == 0) 176 | with tf.device(tf.train.replica_device_setter( 177 | worker_device="/job:worker/task:%d" % task_index, 178 | cluster=cluster)): 179 | 180 | def lineartf(x, droupout_rate, is_training, name=None, reuse=None, dropout=None): 181 | """ 182 | Apply a simple lineartf transformation A*x+b to the input 183 | """ 184 | n_output = 1 185 | if len(x.get_shape()) != 2: 186 | x = tf.contrib.layers.flatten(x) 187 | 188 | n_input = x.get_shape().as_list()[1] 189 | 190 | with tf.variable_scope(name, reuse=reuse): 191 | W = tf.get_variable( 192 | name='W', 193 | shape=[n_input, n_output], 194 | dtype=tf.float32, 195 | initializer=tf.contrib.layers.xavier_initializer()) 196 | 197 | b = tf.get_variable( 198 | name='b', 199 | shape=[n_output], 200 | dtype=tf.float32, 201 | initializer=tf.constant_initializer(0.0)) 202 | 203 | h = tf.nn.bias_add( 204 | name='h', 205 | value=tf.matmul(x, W), 206 | bias=b) 207 | 208 | if dropout: 209 | h = tf.cond(is_training, lambda: tf.layers.dropout(h, rate=droupout_rate, training=True), 210 | lambda: tf.layers.dropout(h, rate=0.0, training=True)) 211 | 212 | return h, W 213 | 214 | is_training = tf.placeholder(tf.bool, shape=()) 215 | input_features = tf.placeholder(tf.float32, [None, nb_input_features], name="input_features") 216 | input_features_lineartf, _ = lineartf(input_features, droupout_rate=droupout_rate, 217 | name='linear_layer', 218 | is_training=is_training, 219 | dropout=None) 220 | 221 | y_true = tf.placeholder(tf.float32, shape=None) 222 | y_prediction = input_features_lineartf 223 | pCTR = 
tf.nn.sigmoid(y_prediction, name="pCTR") 224 | global_step = tf.Variable(0) 225 | cross_entropy = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y_true, logits=y_prediction)) 226 | tf.summary.scalar('cross_entropy', cross_entropy) 227 | adam_train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cross_entropy, 228 | global_step=global_step) 229 | 230 | saver = tf.train.Saver() 231 | summary_op = tf.summary.merge_all() 232 | init_op = tf.global_variables_initializer() 233 | 234 | logdir = ctx.absolute_path(args.model) 235 | print("Tensorflow model path: {0}".format(logdir)) 236 | 237 | if job_name == "worker" and is_chiefing: 238 | summary_writer = tf.summary.FileWriter(logdir + "/train", graph=tf.get_default_graph()) 239 | summary_val_writer = tf.summary.FileWriter(logdir + "/validation", graph=tf.get_default_graph()) 240 | 241 | options = dict(is_chief=is_chiefing, 242 | logdir=logdir, 243 | summary_op=None, 244 | saver=saver, 245 | global_step=global_step, 246 | stop_grace_secs=300, 247 | save_model_secs=0) 248 | 249 | if args.mode == "train": 250 | options['save_model_secs'] = 120 251 | options['init_op'] = init_op 252 | options['summary_writer'] = None 253 | 254 | sv = tf.train.Supervisor(**options) 255 | 256 | with sv.managed_session(server.target) as sess: 257 | 258 | print("{0} session ready".format(datetime.now().isoformat())) 259 | 260 | tf_feed = ctx.get_data_feed(args.mode == "train") 261 | step = 0 262 | while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps: 263 | batch_data, batch_labels = get_next_batch(tf_feed.next_batch(batch_size)) 264 | 265 | if len(batch_data) > 0: 266 | 267 | if args.mode == "train": 268 | 269 | if sv.is_chief: 270 | # Evaluate the current state of the model on the next validation batch (kept separate from the training batch) 271 | batch_val = get_batch_validation(batch_size) 272 | val_data, val_labels = get_next_batch(batch_val) 273 | feed = {input_features: val_data, y_true: val_labels, is_training: False} 274 | logloss, summary, step = sess.run([cross_entropy, summary_op, global_step], feed_dict=feed) 275 | summary_val_writer.add_summary(summary, step) 276 | print("validation loss: {0}".format(logloss)) 277 | 278 | feed = {input_features: batch_data, y_true: batch_labels, is_training: True} 279 | _, logloss, summary, step = sess.run([adam_train_step, cross_entropy, summary_op, global_step], 280 | feed_dict=feed) 281 | 282 | else: 283 | feed = {input_features: batch_data, y_true: batch_labels, is_training: False} 284 | yscore = sess.run(pCTR, feed_dict=feed) 285 | tf_feed.batch_results(yscore) 286 | 287 | if sv.should_stop() or step >= args.steps: 288 | tf_feed.terminate() 289 | if is_chiefing: 290 | summary_writer.close() 291 | summary_val_writer.close() 292 | 293 | print("{0} stopping supervisor".format(datetime.now().isoformat())) 294 | sv.stop() 295 | -------------------------------------------------------------------------------- /src/TensorFlowOnSpark/examples/criteo/spark/criteo_spark.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Criteo 2 | # Licensed under the terms of the Apache 2.0 license. 3 | # Please see LICENSE file in the project root for terms.
4 | 5 | # Distributed Criteo Display CTR prediction on grid based on TensorFlow on Spark 6 | # https://github.com/yahoo/TensorFlowOnSpark 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | from pyspark.context import SparkContext 13 | from pyspark.conf import SparkConf 14 | 15 | import argparse 16 | from datetime import datetime 17 | 18 | 19 | 20 | from tensorflowonspark import TFCluster 21 | 22 | 23 | import criteo_dist 24 | 25 | 26 | if __name__ == "__main__": 27 | sc = SparkContext(conf=SparkConf().setAppName("criteo_spark")) 28 | executors = sc._conf.get("spark.executor.instances") 29 | if executors is None: 30 | raise Exception("Could not retrieve the number of executors from the SparkContext") 31 | num_executors = int(executors) 32 | num_ps = 1 33 | 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument("-b", "--batch_size", help="number of records per batch", type=int, default=100) 36 | parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=1) 37 | parser.add_argument("-i", "--data", help="HDFS path to data in parallelized format") 38 | parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/inference", default="criteo_model") 39 | parser.add_argument("-v", "--validation", help="HDFS path to validation data") 40 | 41 | parser.add_argument("-n", "--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) 42 | parser.add_argument("-o", "--output", help="HDFS path to save test/inference output", default="predictions") 43 | parser.add_argument("-r", "--readers", help="number of reader/enqueue threads", type=int, default=1) 44 | parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000) 45 | parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", action="store_true") 46 | parser.add_argument("-X", "--mode", help="train|inference", default="train") 47 | parser.add_argument("-c", "--rdma", help="use rdma connection", default=False) 48 | parser.add_argument("-tbld", "--tensorboardlogdir", 49 | help="TensorBoard log directory. It should be on HDFS; thus, it must be prefixed with hdfs://default") 50 | 51 | args = parser.parse_args() 52 | print("args:", args) 53 | 54 | print("{0} ===== Start".format(datetime.now().isoformat())) 55 | 56 | dataRDD = sc.textFile(args.data).map(lambda ln: [x for x in ln.split('\t')]) 57 | 58 | cluster = TFCluster.run(sc, criteo_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, 59 | TFCluster.InputMode.SPARK, log_dir=args.model) 60 | if args.mode == "train": 61 | cluster.train(dataRDD, args.epochs) 62 | else: 63 | labelRDD = cluster.inference(dataRDD) 64 | labelRDD.saveAsTextFile(args.output) 65 | cluster.shutdown() 66 | print("{0} ===== Stop".format(datetime.now().isoformat())) -------------------------------------------------------------------------------- /src/TensorFlowOnSpark/examples/criteo/spark/requirements.txt: -------------------------------------------------------------------------------- 1 | mmh3==2.5.1 2 | tensorflow==1.2.1 3 | numpy==1.13.1 4 | scipy==1.0.0 5 | scikit-learn==0.19.1 6 | --------------------------------------------------------------------------------