├── README.md
├── run_local.sh
├── config.py
├── main.py
├── data_inputs.py
├── feature_processing.py
└── model.py


/README.md:
--------------------------------------------------------------------------------
1 | # mvdssm
2 | A Multi-View DSSM for Recommendation System with tensorflow estimator.


--------------------------------------------------------------------------------
/run_local.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | ckpt_dir=./ckpt
 3 | user_model_path=./user_model
 4 | item_model_path=./item_model
 5 | 
 6 | train_data=train_files.txt
 7 | eval_data=eval_files.txt
 8 | train_steps=100000000
 9 | batch_size=256
10 | learning_rate=0.000005
11 | save_steps=10000
12 | embed_size=32
13 | 
14 | python main.py \
15 |     --train_data=${train_data} \
16 |     --eval_data=${eval_data} \
17 |     --model_dir=${ckpt_dir} \
18 |     --user_model_path=${user_model_path} \
19 |     --item_model_path=${item_model_path} \
20 |     --train_steps=${train_steps} \
21 |     --save_checkpoints_steps=${save_steps} \
22 |     --learning_rate=${learning_rate} \
23 |     --batch_size=${batch_size} \
24 |     --is_eval=False \
25 |     --run_on_cluster=False \
26 |     --train_eval=True \
27 |     --export_user_model=True \
28 |     --export_item_model=True \
29 |     --embed_size=${embed_size} \
30 |     --gpuid=3
31 | 


--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | import json, os, re, codecs
 3 | import tensorflow as tf
 4 | 
 5 | flags = tf.app.flags
 6 | 
 7 | flags.DEFINE_boolean("run_on_cluster", False, "Whether the cluster info need to be passed in as input")
 8 | flags.DEFINE_boolean("is_eval", False, "Whether evaluate or not")
 9 | flags.DEFINE_boolean("train_eval", False, "Whether train and evaluate model or not")
10 | flags.DEFINE_boolean("export_user_model", False, "Whether export model or not")
11 | flags.DEFINE_boolean("export_item_model", False, "Whether export model or not")
12 | 
13 | flags.DEFINE_string("train_dir", "", "")
14 | flags.DEFINE_string("data_dir", "", "")
15 | flags.DEFINE_string("log_dir", "", "")
16 | flags.DEFINE_string("ps_hosts", "","Comma-separated list of hostname:port pairs, you can also specify pattern like ps[1-5].example.com")
17 | flags.DEFINE_string("worker_hosts", "","Comma-separated list of hostname:port pairs, you can also specify worker[1-5].example.co")
18 | flags.DEFINE_string("job_name", "", "One of 'ps', 'worker'")
19 | flags.DEFINE_integer("task_index", 0, "Index of task within the job")
20 | 
21 | flags.DEFINE_string("model_dir", "./ckpt/", "Base directory for the model.")
22 | flags.DEFINE_string("user_model_path", "./user_model/", "Saved model.")
23 | flags.DEFINE_string("item_model_path", "./item_model/", "Saved model.")
24 | 
25 | flags.DEFINE_string("train_data", "./train_files.txt", "Directory for storing mnist data")
26 | flags.DEFINE_string("eval_data", "./eval_files.txt", "Path to the evaluation data.")
27 | 
28 | flags.DEFINE_string("gpuid", "1", "gpuid")
29 | 
30 | flags.DEFINE_string("hidden_units", "512,256,128", "user hidden units.")
31 | flags.DEFINE_integer("train_steps",100000, "Number of (global) training steps to perform")
32 | flags.DEFINE_integer("batch_size", 512, "Training batch size")
33 | flags.DEFINE_integer("NEG", 50, "Negative Samples")
34 | flags.DEFINE_integer("embed_size", 32, "Embedding size for FM")
35 | flags.DEFINE_float("learning_rate", 0.0001, "learning rate")
36 | flags.DEFINE_integer("save_checkpoints_steps", 10000, "Save checkpoints every this many steps")
37 | 
38 | FLAGS = flags.FLAGS
39 | 
40 | 
41 | 
42 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | # encoding:utf-8
 2 | import os
 3 | import json
 4 | import math
 5 | import numpy as np
 6 | import tensorflow as tf
 7 | from tensorflow import feature_column as fc
 8 | import data_inputs
 9 | from feature_processing import FeatureConfig
10 | import model
11 | import config
12 | 
13 | FLAGS = config.FLAGS
14 | 
15 | os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpuid
16 | 
17 | if FLAGS.run_on_cluster:
18 |     cluster = json.loads(os.environ["TF_CONFIG"])
19 |     task_index = int(os.environ["TF_INDEX"])
20 |     task_type = os.environ["TF_ROLE"]
21 | 
22 | 
23 | def main(unused_argv):
24 |     feature_configs = FeatureConfig().create_features_columns()
25 |     classifier = tf.estimator.Estimator(model_fn=model.model_fn,
26 |                                         config=tf.estimator.RunConfig(model_dir=FLAGS.model_dir,
27 |                                                                       save_checkpoints_steps=FLAGS.save_checkpoints_steps,
28 |                                                                       keep_checkpoint_max=3),
29 |                                         params={"feature_configs": feature_configs,
30 |                                                 "hidden_units": list(map(int, FLAGS.hidden_units.split(","))),
31 |                                                 "learning_rate": FLAGS.learning_rate}
32 |                                         )
33 |     def train_eval_model():
34 |         train_spec = tf.estimator.TrainSpec(input_fn=lambda: data_inputs.train_input_fn(FLAGS.train_data, FLAGS.batch_size),
35 |                                             max_steps=FLAGS.train_steps)
36 |         eval_spec = tf.estimator.EvalSpec(input_fn=lambda: data_inputs.eval_input_fn(FLAGS.eval_data, FLAGS.batch_size),
37 |                                           start_delay_secs=60,
38 |                                           throttle_secs = 30,
39 |                                           steps=1000)
40 |         tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
41 | 
42 |     def train_model():
43 |         from tensorflow.python import debug as tf_debug
44 |         debug_hook = tf_debug.LocalCLIDebugHook()
45 |         classifier.train(input_fn=lambda: fe.train_input_fn(FLAGS.train_data, FLAGS.batch_size), steps=1000, hooks=[debug_hook,])
46 | 
47 |     def eval_model():
48 |         classifier.evaluate(input_fn=lambda: fe.eval_input_fn(FLAGS.eval_data, FLAGS.batch_size), steps=1000)
49 | 
50 |     if FLAGS.is_eval:
51 |         eval_model()
52 | 
53 |     if FLAGS.train_eval:
54 |         train_eval_model()
55 | 
56 | 
57 | if __name__ == "__main__":
58 |     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
59 |     tf.logging.set_verbosity(tf.logging.INFO)
60 |     tf.app.run(main=main)
61 | 


--------------------------------------------------------------------------------
/data_inputs.py:
--------------------------------------------------------------------------------
 1 | #encoding:utf-8
 2 | import os, json, codecs
 3 | import tensorflow as tf
 4 | from tensorflow import feature_column as fc
 5 | import config
 6 | 
 7 | FLAGS = config.FLAGS
 8 | 
 9 | def parse_exp(example):
10 |     features_def = dict()
11 |     features_def["label"] = tf.io.FixedLenFeature([1], tf.int64)
12 |     
13 |     features_def["user_classes"] = tf.io.FixedLenFeature([5], tf.int64)  # 用户兴趣
14 |     features_def["user_age"] = tf.io.FixedLenFeature([1], tf.int64)  # 用户年龄
15 |     features_def["user_gender"] = tf.io.FixedLenFeature([1], tf.int64)  # 用户性别
16 |     features_def["user_vector"] = tf.io.FixedLenFeature([128], tf.float32)  # 用户向量
17 |     features_def["item_classes"] = tf.io.FixedLenFeature([1], tf.int64)  # item分类
18 |     features_def["item_vector"] = tf.io.FixedLenFeature([128], tf.float32)  # item向量
19 |     
20 |     features = tf.io.parse_single_example(example, features_def)
21 |     label = features["label"]
22 |     del features["label"]
23 |     return features, label
24 | 
25 | 
26 | def train_input_fn(filenames=None,
27 |                    batch_size=128,
28 |                    shuffle_buffer_size=1000):
29 |     with tf.gfile.Open(filenames) as f:
30 |         filenames = f.read().split()
31 |     
32 |     if FLAGS.run_on_cluster:
33 |         files_all = []
34 |         for f in filenames:
35 |             files_all += tf.gfile.Glob(f)
36 |         train_worker_num = len(FLAGS.worker_hosts.split(","))
37 |         hash_id = FLAGS.task_index if FLAGS.job_name == "worker" else train_worker_num - 1
38 |         files_shard = [files for i, files in enumerate(files_all) if i % train_worker_num == hash_id]
39 |         files = tf.data.Dataset.list_files(files_shard)
40 |         dataset = files.apply(tf.contrib.data.parallel_interleave(lambda x: tf.data.TFRecordDataset(x), 
41 |                                                                   cycle_length=10,
42 |                                                                   buffer_output_elements=batch_size*20,
43 |                                                                   sloppy=True))
44 |         #dataset = tf.data.TFRecordDataset(files_shard)
45 |     else:
46 |         files = tf.data.Dataset.list_files(filenames)
47 |         dataset = files.apply(tf.contrib.data.parallel_interleave(lambda x: tf.data.TFRecordDataset(x), 
48 |                                                               buffer_output_elements=batch_size*4, 
49 |                                                               cycle_length=4,
50 |                                                               sloppy=True))
51 |     #dataset = dataset.shuffle(batch_size*10)
52 |     dataset = dataset.map(parse_exp, num_parallel_calls=8)
53 |     dataset = dataset.batch(batch_size, drop_remainder=True).prefetch(1)
54 |     return dataset
55 | 
56 | def eval_input_fn(filenames=None,
57 |                   batch_size=128):
58 |     with tf.gfile.Open(filenames) as f:
59 |         filenames = f.read().split()
60 |     files = tf.data.Dataset.list_files(filenames)
61 |     dataset = files.apply(tf.contrib.data.parallel_interleave(lambda filename: tf.data.TFRecordDataset(files), buffer_output_elements=batch_size*12, cycle_length=8))
62 |     dataset = dataset.map(parse_exp, num_parallel_calls=4)
63 |     dataset = dataset.batch(batch_size, drop_remainder=True)
64 |     return dataset
65 | 
66 | 
67 | 


--------------------------------------------------------------------------------
/feature_processing.py:
--------------------------------------------------------------------------------
 1 | #encoding:utf-8
 2 | import os, json, codecs
 3 | import tensorflow as tf
 4 | from tensorflow import feature_column as fc
 5 | import config
 6 | 
 7 | FLAGS = config.FLAGS
 8 | 
 9 | class FeatureConfig(object):
10 |     def __init__(self):
11 |         self.user_feature_columns = dict()
12 |         self.item_feature_columns = dict()
13 |         self.all_columns = dict()
14 |         self.feature_spec = dict()
15 | 
16 |     def create_features_columns(self):
17 |         """
18 |         features_def["user_classes"] = tf.io.FixedLenFeature([5], tf.int64)  # 用户兴趣
19 |         features_def["user_age"] = tf.io.FixedLenFeature([1], tf.int64)  # 用户年龄
20 |         features_def["user_gender"] = tf.io.FixedLenFeature([1], tf.int64)  # 用户性别
21 |         features_def["user_vector"] = tf.io.FixedLenFeature([128], tf.float32)  # 用户向量
22 |         features_def["item_classes"] = tf.io.FixedLenFeature([1], tf.int64)  # item分类
23 |         features_def["item_vector"] = tf.io.FixedLenFeature([128], tf.float32)  # item向量
24 |         """
25 |         user_classes_embed = fc.embedding_column(fc.categorical_column_with_hash_bucket(key="user_classes", 
26 |                                                                                        hash_bucket_size=40, dtype=tf.int64),
27 |                                                  dimension=64, combiner='mean', initializer=tf.uniform_unit_scaling_initializer(factor=1e-5, seed=1, dtype=tf.float32)
28 |                                                  )
29 |         user_age_embed = fc.embedding_column(fc.categorical_column_with_identity(key="user_age", 
30 |                                                                                  num_bucket=6, dtype=tf.int64),
31 |                                                  dimension=8, combiner='mean', initializer=tf.uniform_unit_scaling_initializer(factor=1e-5, seed=1, dtype=tf.float32)
32 |                                                  )
33 |         user_gender_embed = fc.embedding_column(fc.categorical_column_with_identity(key="user_gender", 
34 |                                                                                     num_bucket=3, dtype=tf.int64),
35 |                                                  dimension=8, combiner='mean', initializer=tf.uniform_unit_scaling_initializer(factor=1e-5, seed=1, dtype=tf.float32)
36 |                                                  )
37 |                                                  
38 |         item_classes_embed = fc.embedding_column(fc.categorical_column_with_hash_bucket(key="item_classes", 
39 |                                                                                         hash_bucket_size=40, dtype=tf.int64),
40 |                                                  dimension=64, combiner='mean', initializer=tf.uniform_unit_scaling_initializer(factor=1e-5, seed=1, dtype=tf.float32)
41 |                                                  )
42 |                                                  
43 |         user_vector_input = fc.numeric_column(key="user_vector", shape=(128,), default_value=[0.0]*128, dtype=tf.float32)
44 |         item_vector_input = fc.numeric_column(key="item_vector", shape=(128,), default_value=[0.0]*128, dtype=tf.float32)
45 | 
46 |         for key, value in self.user_feature_columns.items():
47 |             self.all_columns[key] = value
48 |         for key, value in self.item_feature_columns.items():
49 |             self.all_columns[key] = value
50 |             
51 |         self.feature_spec = tf.feature_column.make_parse_example_spec(self.all_columns.values())
52 |         return self
53 | 


--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
 1 | # coding:utf-8
 2 | import random
 3 | import tensorflow as tf
 4 | from tensorflow import feature_column as fc
 5 | import config
 6 | FLAGS = config.FLAGS
 7 | 
 8 | 
 9 | def build_user_model(features, mode, params):
10 |     user_net = []
11 |     user_inputs = params["feature_configs"].user_feature_columns
12 |     with tf.variable_scope("user_side", partitioner=tf.fixed_size_partitioner(len(FLAGS.ps_hosts.split(",")), axis=0)):
13 |         for key, value in user_inputs.items():
14 |             input_fea = fc.input_layer(features, value)
15 |             user_net.append(input_fea)
16 |         user_net = tf.concat(user_net, axis=1)
17 |         for idx, units in enumerate(params["hidden_units"]):
18 |             user_net = tf.layers.dense(user_net, units=units, activation=tf.nn.leaky_relu, name="user_fc_layer_%s"%idx)
19 |         user_net = tf.nn.l2_normalize(user_net)
20 |     return user_net
21 | 
22 | def build_item_model(features, mode, params):
23 |     item_net = []
24 |     item_inputs = params["feature_configs"].item_feature_columns
25 |     with tf.variable_scope("item_side", partitioner=tf.fixed_size_partitioner(len(FLAGS.ps_hosts.split(",")), axis=0)):
26 |         for key, value in item_inputs.items():
27 |             input_fea = fc.input_layer(features, value)
28 |             item_net.append(input_fea)
29 |         item_net = tf.concat(item_net, axis=1)
30 |         for idx, units in enumerate(params["hidden_units"]):
31 |             item_net = tf.layers.dense(item_net, units=units, activation=tf.nn.leaky_relu, name="item_fc_layer_%s"%idx)
32 |         item_net = tf.nn.l2_normalize(item_net)
33 |     return item_net
34 | 
35 | def model_fn(features, labels, mode, params):
36 |     # Predict
37 |     if mode == tf.estimator.ModeKeys.PREDICT:
38 |         if FLAGS.export_user_model:
39 |             user_encoder = build_user_model(features, mode, params)
40 |             predictions = {"user_vector": user_encoder}
41 |         elif FLAGS.export_item_model:
42 |             item_encoder = build_item_model(features, mode, params)
43 |             predictions = {"item_vector": item_encoder}
44 |         export_outputs = {"predictions": tf.estimator.export.PredictOutput(outputs=predictions)}
45 |         return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs)
46 | 
47 |     user_encoder = build_user_model(features, mode, params)
48 |     item_encoder = build_item_model(features, mode, params)
49 | 
50 |     # 随机采样负样本
51 |     with tf.name_scope("rotate"):
52 |         tmp = tf.tile(item_encoder, [1, 1])
53 |         item_encoder_fd = item_encoder
54 |         for i in range(FLAGS.NEG):
55 |             rand = tf.cast(((random.random() + i) * tf.cast(FLAGS.batch_size, tf.float32) / FLAGS.NEG), tf.int32)
56 |             item_encoder_fd = tf.concat([item_encoder_fd,
57 |                                       tf.slice(tmp, [rand, 0], [FLAGS.batch_size - rand, -1]),
58 |                                       tf.slice(tmp, [0, 0], [rand, -1])], axis=0)
59 |         user_norm = tf.tile(tf.sqrt(tf.reduce_sum(tf.square(user_encoder), axis=1, keepdims=True)),[FLAGS.NEG + 1, 1])
60 |         item_norm = tf.sqrt(tf.reduce_sum(tf.square(item_encoder_fd), axis=1, keepdims=True))
61 |         prod = tf.reduce_sum(tf.multiply(tf.tile(user_encoder, [FLAGS.NEG + 1, 1]), item_encoder_fd), axis=1,keepdims=True)
62 |         norm_prod = tf.multiply(user_norm, item_norm)
63 |         cos_sim_raw = tf.truediv(prod, norm_prod)
64 |         cos_sim = tf.transpose(tf.reshape(tf.transpose(cos_sim_raw), [FLAGS.NEG + 1, -1])) * 20
65 | 
66 |     # 最大化正样本概率
67 |     with tf.name_scope("loss"):
68 |         prob = tf.nn.softmax(cos_sim)
69 |         hit_prob = tf.slice(prob, [0, 0], [-1, 1])
70 |         loss = -tf.reduce_mean(tf.log(hit_prob))
71 |         correct_prediction = tf.cast(tf.equal(tf.argmax(prob, 1), 0), tf.float32)
72 |         accuracy = tf.reduce_mean(correct_prediction)
73 | 
74 |     # Eval
75 |     if mode == tf.estimator.ModeKeys.EVAL:
76 |         return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops={})
77 | 
78 |     # Train
79 |     if mode == tf.estimator.ModeKeys.TRAIN:
80 |         global_step = tf.train.get_global_step()
81 |         learning_rate = tf.train.exponential_decay(params["learning_rate"], global_step, 100000, 0.9, staircase=True)
82 |         train_op = (tf.train.AdagradOptimizer(learning_rate).minimize(loss, global_step=global_step))
83 |         return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
84 | 


--------------------------------------------------------------------------------