├── README.md
├── run_local.sh
├── config.py
├── main.py
├── data_inputs.py
├── feature_processing.py
└── model.py

/README.md:
--------------------------------------------------------------------------------
# mvdssm
A Multi-View DSSM for recommendation systems, implemented with the TensorFlow Estimator API.
--------------------------------------------------------------------------------
/run_local.sh:
--------------------------------------------------------------------------------
#!/bin/sh
ckpt_dir=./ckpt
user_model_path=./user_model
item_model_path=./item_model

train_data=train_files.txt
eval_data=eval_files.txt
train_steps=100000000
batch_size=256
learning_rate=0.000005
save_steps=10000
embed_size=32

python main.py \
    --train_data=${train_data} \
    --eval_data=${eval_data} \
    --model_dir=${ckpt_dir} \
    --user_model_path=${user_model_path} \
    --item_model_path=${item_model_path} \
    --train_steps=${train_steps} \
    --save_checkpoints_steps=${save_steps} \
    --learning_rate=${learning_rate} \
    --batch_size=${batch_size} \
    --is_eval=False \
    --run_on_cluster=False \
    --train_eval=True \
    --export_user_model=True \
    --export_item_model=True \
    --embed_size=${embed_size} \
    --gpuid=3
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
# coding:utf-8
import tensorflow as tf

flags = tf.app.flags

flags.DEFINE_boolean("run_on_cluster", False, "Whether cluster info needs to be passed in as input")
flags.DEFINE_boolean("is_eval", False, "Whether to run evaluation only")
flags.DEFINE_boolean("train_eval", False, "Whether to train and evaluate the model")
flags.DEFINE_boolean("export_user_model", False, "Whether to export the user-side model")
flags.DEFINE_boolean("export_item_model", False, "Whether to export the item-side model")

flags.DEFINE_string("train_dir", "", "")
flags.DEFINE_string("data_dir", "", "")
flags.DEFINE_string("log_dir", "", "")
flags.DEFINE_string("ps_hosts", "", "Comma-separated list of hostname:port pairs; patterns like ps[1-5].example.com are also accepted")
flags.DEFINE_string("worker_hosts", "", "Comma-separated list of hostname:port pairs; patterns like worker[1-5].example.com are also accepted")
flags.DEFINE_string("job_name", "", "One of 'ps', 'worker'")
flags.DEFINE_integer("task_index", 0, "Index of task within the job")

flags.DEFINE_string("model_dir", "./ckpt/", "Base directory for the model.")
flags.DEFINE_string("user_model_path", "./user_model/", "Export path for the user-side SavedModel.")
flags.DEFINE_string("item_model_path", "./item_model/", "Export path for the item-side SavedModel.")

flags.DEFINE_string("train_data", "./train_files.txt", "File listing the training TFRecord paths/patterns, one per line.")
flags.DEFINE_string("eval_data", "./eval_files.txt", "File listing the evaluation TFRecord paths/patterns, one per line.")

flags.DEFINE_string("gpuid", "1", "gpuid")

flags.DEFINE_string("hidden_units", "512,256,128", "Hidden units of the user/item towers.")
flags.DEFINE_integer("train_steps", 100000, "Number of (global) training steps to perform")
flags.DEFINE_integer("batch_size", 512, "Training batch size")
flags.DEFINE_integer("NEG", 50, "Number of negative samples per positive")
flags.DEFINE_integer("embed_size", 32, "Embedding size")
flags.DEFINE_float("learning_rate", 0.0001, "Learning rate")
flags.DEFINE_integer("save_checkpoints_steps", 10000, "Save checkpoints every this many steps") 37 | 38 | FLAGS = flags.FLAGS 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | import os 3 | import json 4 | import math 5 | import numpy as np 6 | import tensorflow as tf 7 | from tensorflow import feature_column as fc 8 | import data_inputs 9 | from feature_processing import FeatureConfig 10 | import model 11 | import config 12 | 13 | FLAGS = config.FLAGS 14 | 15 | os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpuid 16 | 17 | if FLAGS.run_on_cluster: 18 | cluster = json.loads(os.environ["TF_CONFIG"]) 19 | task_index = int(os.environ["TF_INDEX"]) 20 | task_type = os.environ["TF_ROLE"] 21 | 22 | 23 | def main(unused_argv): 24 | feature_configs = FeatureConfig().create_features_columns() 25 | classifier = tf.estimator.Estimator(model_fn=model.model_fn, 26 | config=tf.estimator.RunConfig(model_dir=FLAGS.model_dir, 27 | save_checkpoints_steps=FLAGS.save_checkpoints_steps, 28 | keep_checkpoint_max=3), 29 | params={"feature_configs": feature_configs, 30 | "hidden_units": list(map(int, FLAGS.hidden_units.split(","))), 31 | "learning_rate": FLAGS.learning_rate} 32 | ) 33 | def train_eval_model(): 34 | train_spec = tf.estimator.TrainSpec(input_fn=lambda: data_inputs.train_input_fn(FLAGS.train_data, FLAGS.batch_size), 35 | max_steps=FLAGS.train_steps) 36 | eval_spec = tf.estimator.EvalSpec(input_fn=lambda: data_inputs.eval_input_fn(FLAGS.eval_data, FLAGS.batch_size), 37 | start_delay_secs=60, 38 | throttle_secs = 30, 39 | steps=1000) 40 | tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec) 41 | 42 | def train_model(): 43 | from tensorflow.python import debug as tf_debug 44 | debug_hook = tf_debug.LocalCLIDebugHook() 45 | classifier.train(input_fn=lambda: fe.train_input_fn(FLAGS.train_data, FLAGS.batch_size), steps=1000, hooks=[debug_hook,]) 46 | 47 | def eval_model(): 48 | classifier.evaluate(input_fn=lambda: fe.eval_input_fn(FLAGS.eval_data, FLAGS.batch_size), steps=1000) 49 | 50 | if FLAGS.is_eval: 51 | eval_model() 52 | 53 | if FLAGS.train_eval: 54 | train_eval_model() 55 | 56 | 57 | if __name__ == "__main__": 58 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" 59 | tf.logging.set_verbosity(tf.logging.INFO) 60 | tf.app.run(main=main) 61 | -------------------------------------------------------------------------------- /data_inputs.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 2 | import os, json, codecs 3 | import tensorflow as tf 4 | from tensorflow import feature_column as fc 5 | import config 6 | 7 | FLAGS = config.FLAGS 8 | 9 | def parse_exp(example): 10 | features_def = dict() 11 | features_def["label"] = tf.io.FixedLenFeature([1], tf.int64) 12 | 13 | features_def["user_classes"] = tf.io.FixedLenFeature([5], tf.int64) # 用户兴趣 14 | features_def["user_age"] = tf.io.FixedLenFeature([1], tf.int64) # 用户年龄 15 | features_def["user_gender"] = tf.io.FixedLenFeature([1], tf.int64) # 用户性别 16 | features_def["user_vector"] = tf.io.FixedLenFeature([128], tf.float32) # 用户向量 17 | features_def["item_classes"] = tf.io.FixedLenFeature([1], tf.int64) # item分类 18 | features_def["item_vector"] = tf.io.FixedLenFeature([128], tf.float32) # item向量 19 | 20 | features = tf.io.parse_single_example(example, features_def) 21 | label = features["label"] 22 | del features["label"] 23 | return 
/data_inputs.py:
--------------------------------------------------------------------------------
#encoding:utf-8
import tensorflow as tf
import config

FLAGS = config.FLAGS

def parse_exp(example):
    features_def = dict()
    features_def["label"] = tf.io.FixedLenFeature([1], tf.int64)

    features_def["user_classes"] = tf.io.FixedLenFeature([5], tf.int64)     # user interests
    features_def["user_age"] = tf.io.FixedLenFeature([1], tf.int64)         # user age
    features_def["user_gender"] = tf.io.FixedLenFeature([1], tf.int64)      # user gender
    features_def["user_vector"] = tf.io.FixedLenFeature([128], tf.float32)  # user vector
    features_def["item_classes"] = tf.io.FixedLenFeature([1], tf.int64)     # item category
    features_def["item_vector"] = tf.io.FixedLenFeature([128], tf.float32)  # item vector

    features = tf.io.parse_single_example(example, features_def)
    label = features.pop("label")
    return features, label


def train_input_fn(filenames=None,
                   batch_size=128,
                   shuffle_buffer_size=1000):
    # `filenames` is a text file listing the actual TFRecord paths/patterns.
    with tf.gfile.Open(filenames) as f:
        filenames = f.read().split()

    if FLAGS.run_on_cluster:
        # Shard the file list across workers so each worker reads a disjoint subset.
        files_all = []
        for f in filenames:
            files_all += tf.gfile.Glob(f)
        train_worker_num = len(FLAGS.worker_hosts.split(","))
        hash_id = FLAGS.task_index if FLAGS.job_name == "worker" else train_worker_num - 1
        files_shard = [files for i, files in enumerate(files_all) if i % train_worker_num == hash_id]
        files = tf.data.Dataset.list_files(files_shard)
        dataset = files.apply(tf.contrib.data.parallel_interleave(
            lambda x: tf.data.TFRecordDataset(x),
            cycle_length=10,
            buffer_output_elements=batch_size * 20,
            sloppy=True))
    else:
        files = tf.data.Dataset.list_files(filenames)
        dataset = files.apply(tf.contrib.data.parallel_interleave(
            lambda x: tf.data.TFRecordDataset(x),
            buffer_output_elements=batch_size * 4,
            cycle_length=4,
            sloppy=True))
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_exp, num_parallel_calls=8)
    dataset = dataset.batch(batch_size, drop_remainder=True).prefetch(1)
    return dataset


def eval_input_fn(filenames=None, batch_size=128):
    with tf.gfile.Open(filenames) as f:
        filenames = f.read().split()
    files = tf.data.Dataset.list_files(filenames)
    dataset = files.apply(tf.contrib.data.parallel_interleave(
        lambda filename: tf.data.TFRecordDataset(filename),
        buffer_output_elements=batch_size * 12,
        cycle_length=8))
    dataset = dataset.map(parse_exp, num_parallel_calls=4)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset
--------------------------------------------------------------------------------
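parse_exp above fixes the schema every record must satisfy: fixed-length features, with user_classes holding exactly 5 ids and the two vectors exactly 128 floats. For reference, a record matching that schema could be written like this (make_example is illustrative and the feature values are made-up placeholders):

    import tensorflow as tf

    def make_example():
        # Dummy values; list lengths must match parse_exp exactly.
        feature = {
            "label":        tf.train.Feature(int64_list=tf.train.Int64List(value=[1])),
            "user_classes": tf.train.Feature(int64_list=tf.train.Int64List(value=[3, 7, 11, 0, 0])),
            "user_age":     tf.train.Feature(int64_list=tf.train.Int64List(value=[2])),
            "user_gender":  tf.train.Feature(int64_list=tf.train.Int64List(value=[1])),
            "user_vector":  tf.train.Feature(float_list=tf.train.FloatList(value=[0.0] * 128)),
            "item_classes": tf.train.Feature(int64_list=tf.train.Int64List(value=[5])),
            "item_vector":  tf.train.Feature(float_list=tf.train.FloatList(value=[0.0] * 128)),
        }
        return tf.train.Example(features=tf.train.Features(feature=feature))

    with tf.python_io.TFRecordWriter("part-00000.tfrecord") as writer:
        writer.write(make_example().SerializeToString())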
/feature_processing.py:
--------------------------------------------------------------------------------
#encoding:utf-8
import tensorflow as tf
from tensorflow import feature_column as fc
import config

FLAGS = config.FLAGS

class FeatureConfig(object):
    def __init__(self):
        self.user_feature_columns = dict()
        self.item_feature_columns = dict()
        self.all_columns = dict()
        self.feature_spec = dict()

    def create_features_columns(self):
        """
        Input features (see parse_exp in data_inputs.py):
        user_classes : FixedLenFeature([5], tf.int64)      # user interests
        user_age     : FixedLenFeature([1], tf.int64)      # user age
        user_gender  : FixedLenFeature([1], tf.int64)      # user gender
        user_vector  : FixedLenFeature([128], tf.float32)  # user vector
        item_classes : FixedLenFeature([1], tf.int64)      # item category
        item_vector  : FixedLenFeature([128], tf.float32)  # item vector
        """
        initializer = tf.uniform_unit_scaling_initializer(factor=1e-5, seed=1, dtype=tf.float32)

        user_classes_embed = fc.embedding_column(
            fc.categorical_column_with_hash_bucket(key="user_classes", hash_bucket_size=40, dtype=tf.int64),
            dimension=64, combiner='mean', initializer=initializer)
        user_age_embed = fc.embedding_column(
            fc.categorical_column_with_identity(key="user_age", num_buckets=6),
            dimension=8, combiner='mean', initializer=initializer)
        user_gender_embed = fc.embedding_column(
            fc.categorical_column_with_identity(key="user_gender", num_buckets=3),
            dimension=8, combiner='mean', initializer=initializer)

        item_classes_embed = fc.embedding_column(
            fc.categorical_column_with_hash_bucket(key="item_classes", hash_bucket_size=40, dtype=tf.int64),
            dimension=64, combiner='mean', initializer=initializer)

        user_vector_input = fc.numeric_column(key="user_vector", shape=(128,), default_value=[0.0] * 128, dtype=tf.float32)
        item_vector_input = fc.numeric_column(key="item_vector", shape=(128,), default_value=[0.0] * 128, dtype=tf.float32)

        # Register the columns on their towers; without this, the loops below
        # (and the towers in model.py) would see empty column dicts.
        self.user_feature_columns["user_classes"] = user_classes_embed
        self.user_feature_columns["user_age"] = user_age_embed
        self.user_feature_columns["user_gender"] = user_gender_embed
        self.user_feature_columns["user_vector"] = user_vector_input
        self.item_feature_columns["item_classes"] = item_classes_embed
        self.item_feature_columns["item_vector"] = item_vector_input

        for key, value in self.user_feature_columns.items():
            self.all_columns[key] = value
        for key, value in self.item_feature_columns.items():
            self.all_columns[key] = value

        self.feature_spec = tf.feature_column.make_parse_example_spec(self.all_columns.values())
        return self
--------------------------------------------------------------------------------
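model.py, next, trains the two towers with the standard DSSM softmax objective: each user is scored against its positive item plus FLAGS.NEG in-batch negatives (built by rotating the item encodings), and the probability of the positive is maximized, with the temperature gamma = 20 hard-coded in the similarity block. In formula form:

    P(d^+ \mid u) = \frac{\exp\big(\gamma \cos(u, d^+)\big)}{\sum_{d \in \{d^+\} \cup D^-_u} \exp\big(\gamma \cos(u, d)\big)},
    \qquad
    \mathcal{L} = -\frac{1}{|B|} \sum_{(u, d^+) \in B} \log P(d^+ \mid u)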
/model.py:
--------------------------------------------------------------------------------
# coding:utf-8
import random
import tensorflow as tf
from tensorflow import feature_column as fc
import config
FLAGS = config.FLAGS


def build_user_model(features, mode, params):
    user_net = []
    user_inputs = params["feature_configs"].user_feature_columns
    # With an empty ps_hosts (local run) this is a single-shard partitioner.
    with tf.variable_scope("user_side", partitioner=tf.fixed_size_partitioner(len(FLAGS.ps_hosts.split(",")), axis=0)):
        for key, value in user_inputs.items():
            input_fea = fc.input_layer(features, value)
            user_net.append(input_fea)
        user_net = tf.concat(user_net, axis=1)
        for idx, units in enumerate(params["hidden_units"]):
            user_net = tf.layers.dense(user_net, units=units, activation=tf.nn.leaky_relu, name="user_fc_layer_%s" % idx)
        # Normalize each row so user embeddings live on the unit sphere.
        user_net = tf.nn.l2_normalize(user_net, axis=1)
    return user_net


def build_item_model(features, mode, params):
    item_net = []
    item_inputs = params["feature_configs"].item_feature_columns
    with tf.variable_scope("item_side", partitioner=tf.fixed_size_partitioner(len(FLAGS.ps_hosts.split(",")), axis=0)):
        for key, value in item_inputs.items():
            input_fea = fc.input_layer(features, value)
            item_net.append(input_fea)
        item_net = tf.concat(item_net, axis=1)
        for idx, units in enumerate(params["hidden_units"]):
            item_net = tf.layers.dense(item_net, units=units, activation=tf.nn.leaky_relu, name="item_fc_layer_%s" % idx)
        item_net = tf.nn.l2_normalize(item_net, axis=1)
    return item_net


def model_fn(features, labels, mode, params):
    # Predict: export one tower at a time.
    if mode == tf.estimator.ModeKeys.PREDICT:
        if FLAGS.export_user_model:
            user_encoder = build_user_model(features, mode, params)
            predictions = {"user_vector": user_encoder}
        elif FLAGS.export_item_model:
            item_encoder = build_item_model(features, mode, params)
            predictions = {"item_vector": item_encoder}
        else:
            raise ValueError("PREDICT mode requires --export_user_model or --export_item_model")
        export_outputs = {"predictions": tf.estimator.export.PredictOutput(outputs=predictions)}
        return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs)

    user_encoder = build_user_model(features, mode, params)
    item_encoder = build_item_model(features, mode, params)

    # Randomly sample negatives by rotating the in-batch item encodings: in each
    # of the NEG rounds, the batch is shifted by an offset fixed at graph
    # construction, pairing every user with other users' items as negatives.
    with tf.name_scope("rotate"):
        tmp = tf.tile(item_encoder, [1, 1])
        item_encoder_fd = item_encoder
        for i in range(FLAGS.NEG):
            rand = tf.cast(((random.random() + i) * tf.cast(FLAGS.batch_size, tf.float32) / FLAGS.NEG), tf.int32)
            item_encoder_fd = tf.concat([item_encoder_fd,
                                         tf.slice(tmp, [rand, 0], [FLAGS.batch_size - rand, -1]),
                                         tf.slice(tmp, [0, 0], [rand, -1])], axis=0)
        user_norm = tf.tile(tf.sqrt(tf.reduce_sum(tf.square(user_encoder), axis=1, keepdims=True)), [FLAGS.NEG + 1, 1])
        item_norm = tf.sqrt(tf.reduce_sum(tf.square(item_encoder_fd), axis=1, keepdims=True))
        prod = tf.reduce_sum(tf.multiply(tf.tile(user_encoder, [FLAGS.NEG + 1, 1]), item_encoder_fd), axis=1, keepdims=True)
        norm_prod = tf.multiply(user_norm, item_norm)
        cos_sim_raw = tf.truediv(prod, norm_prod)
        # Reshape to [batch_size, NEG + 1]; column 0 holds the positive item.
        # The factor 20 is the softmax temperature (gamma).
        cos_sim = tf.transpose(tf.reshape(tf.transpose(cos_sim_raw), [FLAGS.NEG + 1, -1])) * 20

    # Maximize the probability of the positive sample.
    with tf.name_scope("loss"):
        prob = tf.nn.softmax(cos_sim)
        hit_prob = tf.slice(prob, [0, 0], [-1, 1])
        loss = -tf.reduce_mean(tf.log(hit_prob))
        correct_prediction = tf.cast(tf.equal(tf.argmax(prob, 1), 0), tf.float32)
        accuracy = tf.reduce_mean(correct_prediction)

    # Eval
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss,
                                          eval_metric_ops={"accuracy": tf.metrics.mean(correct_prediction)})

    # Train
    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_global_step()
        learning_rate = tf.train.exponential_decay(params["learning_rate"], global_step, 100000, 0.9, staircase=True)
        train_op = tf.train.AdagradOptimizer(learning_rate).minimize(loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
--------------------------------------------------------------------------------
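Once the two towers are exported (see the export sketch after main.py), retrieval reduces to a dot product over the L2-normalized vectors. A minimal offline check, assuming TF 1.x with tf.contrib available; the export directory, make_example helper (from the TFRecord sketch above), and item matrix are all placeholders:

    import numpy as np
    from tensorflow.contrib import predictor

    # Placeholder path: export_savedmodel writes a timestamped subdirectory.
    user_fn = predictor.from_saved_model("./user_model/1577836800")

    # The parsing serving receiver expects serialized tf.train.Example protos
    # under the "examples" key; output key matches model_fn's PredictOutput.
    user_vec = user_fn({"examples": [make_example().SerializeToString()]})["user_vector"]

    # Stand-in for item vectors from the item tower (dim = last hidden layer, 128).
    item_matrix = np.random.rand(1000, 128).astype(np.float32)

    # Both towers end with l2_normalize, so ranking by cosine similarity is
    # the same as ranking by dot product.
    scores = item_matrix.dot(user_vec[0])
    top_k = np.argsort(-scores)[:10]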