├── CIKM2019_Poster.pdf
├── README.md
├── SDM_CIKM2019.pdf
├── code
│   ├── config
│   │   └── task_config.json
│   ├── model_utils
│   │   ├── hyperparams.py
│   │   ├── model_helper.py
│   │   └── task_config.py
│   ├── models
│   │   ├── basic_modules.py
│   │   ├── deep_match.py
│   │   └── extra_modules.py
│   ├── parsers
│   │   └── model_feature_parser.py
│   └── train
│       ├── run.py
│       └── utils.py
└── data
    └── sample_data
        ├── sample_action.csv
        ├── sample_item.csv
        └── sample_user.csv
/CIKM2019_Poster.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alicogintel/SDM/ad898dd471d448ee2745ecc48c1a46b2af38e516/CIKM2019_Poster.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SDM: Sequential Deep Matching Model for Online Large-scale Recommender System
2 | ## New Released Code!!!
3 | Thanks to the [DeepMatch Group](https://github.com/shenweichen/DeepMatch) members for providing the [doc](https://zhuanlan.zhihu.com/p/141411747) and [code](https://github.com/shenweichen/DeepMatch).
4 | 
5 | ## Demo Code
6 | Code (Python 2.7, TensorFlow 1.4) of the Sequential Deep Matching (SDM) model for the recommender system at Taobao.
7 | The current version contains only the core code of our model. Data processing and evaluation are performed on our internal cloud platform [ODPS](https://www.alibabacloud.com/campaign/10-year-anniversary).
8 | 
9 | ## Paper
10 | Here is the arXiv [link](https://arxiv.org/abs/1909.00385) (accepted by CIKM 2019).
11 | 
12 | Citation:
13 | ```
14 | @inproceedings{lv2019sdm,
15 |   title={SDM: Sequential deep matching model for online large-scale recommender system},
16 |   author={Lv, Fuyu and Jin, Taiwei and Yu, Changlong and Sun, Fei and Lin, Quan and Yang, Keping and Ng, Wilfred},
17 |   booktitle={Proceedings of the 28th ACM International Conference on Information and Knowledge Management},
18 |   pages={2635--2643},
19 |   year={2019},
20 |   organization={ACM}
21 | }
22 | ```
23 | 
24 | ## Dataset
25 | 
26 | **JD Dataset:** [raw data](https://drive.google.com/open?id=19PemKrhA8j-RZj0i20_j4ERcnzaxl5JZ), [train and test data](https://drive.google.com/open?id=1pam-_ojsKooRLVeOXEvbh3AwJ6S4IZ7B) used in the paper (TFRecord format).
27 | The schema of the raw data is shown in data/sample_data/.
28 | 
29 | ## Disclaimer
30 | This is an implementation for offline experiments on the JD dataset rather than the official online version.
31 | There may be differences between the results reported in the paper and those of this released code,
32 | because the former were obtained with distributed TensorFlow on our internal deep learning platform [PAI](https://data.aliyun.com/product/learn).
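## How the Pieces Fit Together (sketch)
The snippet below is a minimal sketch, not part of the repository, of how the components under `code/` fit together with the default `config/task_config.json`; the real entry point is `code/train/run.py`, which wires the same pieces up through TF flags. The TFRecord path, the `validation`/`num_epochs` overrides, and the session loop are illustrative assumptions.

```python
# Minimal usage sketch (assumption: the code/ directory is on PYTHONPATH,
# as run.py arranges via sys.path.append).
import tensorflow as tf

from model_utils.task_config import TaskConfig
from model_utils.hyperparams import create_hparams
from parsers.model_feature_parser import ModelFeatureParser
from models.deep_match import DeepMatch

# Overrides here are illustrative: validation=False makes input_fn_dataset
# return a parsed feature dict; num_epochs=1 makes the input pipeline finite.
task_config = TaskConfig({"validation": False, "num_epochs": 1},
                         "code/config/task_config.json")
hparams = create_hparams(task_config)

parser = ModelFeatureParser(hparams)
model = DeepMatch(parser, hparams)

# Placeholder path: point this at the released train TFRecord files.
features = model.input_fn_dataset(["path/to/train.tfrecord"])
train_op, global_step, run_dict = model.model_fn_train(features)

with tf.train.MonitoredTrainingSession() as sess:
    while not sess.should_stop():
        _, loss = sess.run([train_op, run_dict["loss"]])
```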
33 | -------------------------------------------------------------------------------- /SDM_CIKM2019.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alicogintel/SDM/ad898dd471d448ee2745ecc48c1a46b2af38e516/SDM_CIKM2019.pdf -------------------------------------------------------------------------------- /code/config/task_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "comment": "model config for SDM", 3 | "parameters": { 4 | "mode":"train", 5 | "num_units": 128, 6 | "unit_type": "lstm", 7 | "num_layers": 2, 8 | "num_residual_layers": 1, 9 | "forget_bias": 1.0, 10 | "dropout": 0.2, 11 | "max_gradient_norm": 5.0, 12 | "optimizer": "adagrad", 13 | "learning_rate": 0.1, 14 | "num_samples": 20000, 15 | "batch_size": 256, 16 | "last_step": 15000000, 17 | "loss_by_example": false, 18 | "num_buckets": 20, 19 | "model": "rnn", 20 | "vocab_size": 100000000 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /code/model_utils/hyperparams.py: -------------------------------------------------------------------------------- 1 | import collections 2 | from model_utils.task_config import TaskConfig 3 | 4 | TrainingHParams = collections.namedtuple('TrainingHParams', [ 5 | 'ps_num', 6 | 'mode', 7 | 'model', 8 | 'init_op', 9 | 'seed', 10 | 'init_weight', 11 | 'num_partitions', 12 | 'min_slice_size', 13 | 'batch_size', 14 | 'num_units', 15 | 'vocab_size', 16 | 'unit_type', 17 | 'num_layers', 18 | 'num_residual_layers', 19 | 'forget_bias', 20 | 'dropout', 21 | 'num_samples', 22 | 'optimizer', 23 | 'start_decay_step', 24 | 'learning_rate', 25 | 'decay_steps', 26 | 'decay_factor', 27 | 'colocate_gradients_with_ops', 28 | 'max_gradient_norm', 29 | 'last_step', 30 | 'topK', 31 | 'num_epochs', 32 | 'shuffle', 33 | 'loss_by_example', 34 | 'attention_window_size', 35 | 'num_buckets', 36 | 'num_heads', 37 | 'max_length', 38 | 'input_fn', 39 | 'item_fc_trans', 40 | 'user_fc_trans', 41 | 'nn_init_op', 42 | "bucket_size", 43 | "embedding_size", 44 | "self_attn_ffn", 45 | "split_size", 46 | "num_labels", 47 | "softmax", 48 | "user_residual", 49 | "partn_strgy", 50 | "validation", 51 | "train_len", 52 | "test_interval", 53 | "STAMP", 54 | "NARM", 55 | "attn_layer_norm", 56 | "rnn_layer_norm", 57 | "user_attn", 58 | "prefer_avg_pooling", 59 | "rnn_hidden_units", 60 | "attn_fc", 61 | "num_multi_head", 62 | "wait_time", 63 | "user_id_only", 64 | "item_id_only", 65 | "fusion_op", 66 | "prefer_fc", 67 | "g_units_one", 68 | "input_user_feature", 69 | "use_user_id", 70 | ]) 71 | 72 | 73 | def create_hparams(task_config): 74 | 75 | return TrainingHParams( 76 | # basic 77 | ps_num=task_config.get_config_as_int("ps_num", 1), 78 | mode=task_config.get_config("mode", "train"), 79 | model=task_config.get_config("model", "rnn"), 80 | num_buckets=task_config.get_config_as_int("num_buckets", 10), 81 | max_length=task_config.get_config_as_int("max_length", 50), 82 | input_fn=task_config.get_config("input_fn", "data_set"), 83 | topK=task_config.get_config_as_int("topK", 20), 84 | num_epochs=task_config.get_config_as_int("num_epochs", None), 85 | shuffle=task_config.get_config_as_bool("shuffle", True), 86 | validation=task_config.get_config_as_bool("validation", True), 87 | train_len=task_config.get_config_as_int("train_len", None), 88 | test_interval=task_config.get_config_as_int("test_interval", 1), 89 | wait_time=task_config.get_config_as_int("wait_time", 1), 90 | 91 
| # initializer 92 | init_op=task_config.get_config("init_op", "uniform"), 93 | nn_init_op=task_config.get_config("nn_init_op", "orthogonal"), 94 | seed=task_config.get_config_as_int("seed", 2018), 95 | init_weight=task_config.get_config_as_float("init_weight", 0.1), 96 | 97 | # embedding partition 98 | num_partitions=task_config.get_config_as_int("num_partitions", None), 99 | min_slice_size=task_config.get_config_as_int("min_slice_size", 32), 100 | bucket_size={ 101 | "item": task_config.get_config_as_int("item_bucket_size", 10000000), 102 | "cate": task_config.get_config_as_int("cate_bucket_size", 60000), 103 | "brand": task_config.get_config_as_int("brand_bucket_size", 10000000), 104 | "shop": task_config.get_config_as_int("shop_bucket_size", 30000000), 105 | "user_id": task_config.get_config_as_int("user_id_bucket_size", 1000000), 106 | "age": task_config.get_config_as_int("age_bucket_size", 100), 107 | "sex": task_config.get_config_as_int("sex_bucket_size", 10), 108 | "user_lv_cd": task_config.get_config_as_int("user_lv_cd_bucket_size", 100), 109 | "city_level": task_config.get_config_as_int("city_level_bucket_size", 100), 110 | "province": task_config.get_config_as_int("province_bucket_size", 1000), 111 | "city": task_config.get_config_as_int("city_bucket_size", 1000), 112 | "country": task_config.get_config_as_int("country_bucket_size", 10000) 113 | }, 114 | embedding_size={ 115 | "item": task_config.get_config_as_int("item_embedding_size", 64), 116 | "cate": task_config.get_config_as_int("cate_embedding_size", 16), 117 | "brand": task_config.get_config_as_int("brand_embedding_size", 16), 118 | "shop": task_config.get_config_as_int("shop_embedding_size", 32), 119 | "user_id": task_config.get_config_as_int("user_id_embedding_size", 64), 120 | "age": task_config.get_config_as_int("age_embedding_size", 4), 121 | "sex": task_config.get_config_as_int("sex_embedding_size", 4), 122 | "user_lv_cd": task_config.get_config_as_int("user_lv_cd_embedding_size", 4), 123 | "city_level": task_config.get_config_as_int("city_level_embedding_size", 4), 124 | "province": task_config.get_config_as_int("province_embedding_size", 4), 125 | "city": task_config.get_config_as_int("city_embedding_size", 4), 126 | "country": task_config.get_config_as_int("country_embedding_size", 4) 127 | }, 128 | 129 | # network 130 | batch_size=task_config.get_config_as_int("batch_size", 256), 131 | num_units=task_config.get_config_as_int("num_units", 64), 132 | vocab_size=task_config.get_config_as_int("vocab_size"), 133 | unit_type=task_config.get_config("unit_type", "lstm"), 134 | num_layers=task_config.get_config_as_int("num_layers", 2), 135 | num_residual_layers=task_config.get_config_as_int("num_residual_layers", 1), 136 | forget_bias=task_config.get_config_as_float("forget_bias", 1.0), 137 | dropout=task_config.get_config_as_float("dropout", 0.2), 138 | num_samples=task_config.get_config_as_int("num_samples", 2000), 139 | attention_window_size=task_config.get_config_as_int("attention_window_size", None), 140 | num_heads=task_config.get_config_as_int("num_heads", 8), 141 | item_fc_trans=task_config.get_config_as_bool("item_fc_trans", False), 142 | user_fc_trans=task_config.get_config_as_bool("user_fc_trans", False), 143 | self_attn_ffn=task_config.get_config_as_bool("self_attn_ffn", False), 144 | user_residual=task_config.get_config_as_bool("user_residual", False), 145 | STAMP=task_config.get_config_as_bool("STAMP", False), 146 | NARM=task_config.get_config_as_bool("NARM", False), 147 | 
attn_layer_norm=task_config.get_config_as_bool("attn_layer_norm", True), 148 | rnn_layer_norm=task_config.get_config_as_bool("rnn_layer_norm", False), 149 | user_attn=task_config.get_config("user_attn", "general"), 150 | prefer_avg_pooling=task_config.get_config_as_bool("prefer_avg_pooling", False), 151 | rnn_hidden_units=task_config.get_config_as_int("rnn_hidden_units", 64), 152 | attn_fc=task_config.get_config_as_bool("attn_fc", False), 153 | num_multi_head=task_config.get_config_as_int("num_multi_head", 1), 154 | user_id_only=task_config.get_config_as_bool("user_id_only", False), 155 | item_id_only=task_config.get_config_as_bool("item_id_only", False), 156 | fusion_op=task_config.get_config("fusion_op", "gated"), 157 | prefer_fc=task_config.get_config_as_bool("prefer_fc", True), 158 | g_units_one=task_config.get_config_as_bool("g_units_one", False), 159 | input_user_feature=task_config.get_config_as_bool("input_user_feature", False), 160 | use_user_id=task_config.get_config_as_bool("use_user_id", True), 161 | 162 | # optimizer 163 | optimizer=task_config.get_config("optimizer", "adam"), 164 | start_decay_step=task_config.get_config_as_int("start_decay_step", 1600000), 165 | learning_rate=task_config.get_config_as_float("learning_rate", 1), 166 | decay_steps=task_config.get_config_as_int("decay_steps", 100000), 167 | decay_factor=task_config.get_config_as_float("decay_factor", 0.98), 168 | colocate_gradients_with_ops=task_config.get_config_as_bool("colocate_gradients_with_ops", True), 169 | max_gradient_norm=task_config.get_config_as_float("max_gradient_norm", 5.0), 170 | loss_by_example=task_config.get_config_as_bool("loss_by_example", False), 171 | last_step=task_config.get_config_as_int("last_step", 32000000), 172 | split_size=task_config.get_config_as_int("split_size", 1), 173 | num_labels=task_config.get_config_as_int("num_labels", 1), 174 | softmax=task_config.get_config("softmax", "sampled_softmax"), 175 | partn_strgy=task_config.get_config("partn_strgy", "mod") 176 | ) 177 | 178 | 179 | def create_flags(flags): 180 | flags.DEFINE_string("checkpointDir", "./", "checkpoint_dir") 181 | flags.DEFINE_string("model", "rnn,self_attn,personal,user_attn,prefer", "model") 182 | flags.DEFINE_string("mode", "train", "mode") 183 | flags.DEFINE_string("unit_type", "gru", "unit_type") 184 | flags.DEFINE_string("num_epochs", 10, "num_epochs") 185 | flags.DEFINE_string("batch_size", 256, "batch_size") 186 | flags.DEFINE_string("num_samples", 2000, "num_samples") 187 | flags.DEFINE_integer("split_size", 1, "split_size, batch split size, splited_samples share neg_samples") 188 | flags.DEFINE_integer("last_step", 15000000, "last_step") 189 | flags.DEFINE_string("user_id_embedding_size", 64, "user_id_embedding_size") 190 | flags.DEFINE_string("num_buckets", 1, "num_buckets") 191 | flags.DEFINE_string("shuffle", True, "shuffle") 192 | flags.DEFINE_string("loss_by_example", False, "loss_by_example") 193 | flags.DEFINE_string("user_residual", True, "user layer residual") 194 | flags.DEFINE_integer("vocab_size", 157371, "size of item pool") 195 | 196 | flags.DEFINE_string("learning_rate", 0.001, "learning_rate") 197 | flags.DEFINE_string("start_decay_step", 16000000, "start_decay_step") 198 | flags.DEFINE_string("decay_steps", 100000, "decay_steps") 199 | flags.DEFINE_string("decay_factor", 0.95, "decay_factor") 200 | flags.DEFINE_string("optimizer", "adagrad", "optimizer") 201 | flags.DEFINE_string("max_gradient_norm", 5.0, "max_gradient_norm") 202 | flags.DEFINE_integer("num_labels", 5, "multi labels") 
203 | flags.DEFINE_string("softmax", "sampled_softmax", "softmax layer") 204 | flags.DEFINE_string("partn_strgy", "div", "for inference or not") 205 | flags.DEFINE_string("validation", True, "validation or not") 206 | flags.DEFINE_integer("train_len", 1430824, "sample lens") 207 | flags.DEFINE_string("item_fc_trans", False, "itemid+general repre") 208 | flags.DEFINE_string("self_attn_ffn", False, "self_attn_ffn") 209 | flags.DEFINE_integer("test_interval", 1, "test_interval") 210 | 211 | flags.DEFINE_string("STAMP", False, "short term priority") 212 | flags.DEFINE_string("NARM", False, "neural attentive") 213 | flags.DEFINE_integer("num_heads", 4, "heads num for attention") 214 | flags.DEFINE_string("attn_layer_norm", True, "layer_norm attention") 215 | flags.DEFINE_string("rnn_layer_norm", False, "rnn_layer_norm") 216 | flags.DEFINE_string("user_attn", "general", "user attention layer choice") 217 | flags.DEFINE_string("prefer_avg_pooling", False, "prefer features avg pooling, otherwise user attn") 218 | flags.DEFINE_integer("rnn_hidden_units", 64, "rnn hidden size") 219 | flags.DEFINE_integer("num_layers", 1, "rnn layer num") 220 | flags.DEFINE_integer("num_residual_layers", 0, "residual layer num") 221 | flags.DEFINE_integer("item_embedding_size", 64, "residual layer num") 222 | flags.DEFINE_integer("num_units", 64, "softmax embedding size") 223 | flags.DEFINE_string("attn_fc", False, "attention fc") 224 | flags.DEFINE_integer("num_multi_head", 1, "number of transformers") 225 | flags.DEFINE_integer("wait_time", 1, "chief worker waiting time") 226 | flags.DEFINE_string("user_id_only", False, "only user id feature") 227 | flags.DEFINE_string("item_id_only", False, "only item id feature") 228 | flags.DEFINE_string("fusion_op", "gated", "fusion operation") 229 | flags.DEFINE_string("prefer_fc", True, "long rep fc to units") 230 | flags.DEFINE_string("g_units_one", False, "if scalar gate") 231 | flags.DEFINE_string("input_user_feature", False, "user feature added to input layer") 232 | flags.DEFINE_string("use_user_id", True, "user id feature") 233 | 234 | return flags 235 | 236 | 237 | def create_task_config(FLAGS, conf_file_path): 238 | FLAGS._parse_flags() 239 | task_config = TaskConfig(FLAGS.__flags, conf_file_path) 240 | return task_config 241 | -------------------------------------------------------------------------------- /code/model_utils/model_helper.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.ops import partitioned_variables 3 | from tensorflow.python.platform import tf_logging as logging 4 | from tensorflow.python.framework import dtypes 5 | from tensorflow.python.ops import variable_scope 6 | from tensorflow.contrib import layers 7 | 8 | 9 | def get_initializer(init_op, seed=None, init_weight=None): 10 | """Create an initializer. 
init_weight is only for uniform.""" 11 | if init_op is None: 12 | return None 13 | if init_op == "uniform": 14 | assert init_weight 15 | return tf.random_uniform_initializer(-init_weight, init_weight, seed=seed) 16 | elif init_op == "normal": 17 | return tf.random_normal_initializer(mean=0.0, stddev=0.01, seed=seed) 18 | elif init_op == "glorot_normal": 19 | return tf.contrib.keras.initializers.glorot_normal(seed=seed) 20 | elif init_op == "glorot_uniform": 21 | return tf.contrib.keras.initializers.glorot_uniform(seed=seed) 22 | elif init_op == "xavier": 23 | return tf.contrib.layers.xavier_initializer(seed=seed) 24 | elif init_op == "orthogonal": 25 | return tf.orthogonal_initializer() 26 | else: 27 | raise ValueError("Unknown init_op %s" % init_op) 28 | 29 | 30 | def get_emb_partitioner(num_partitions=None, min_slice_size=None, max_partitions=None): 31 | partitioner = None 32 | if num_partitions > 1: 33 | partitioner = tf.fixed_size_partitioner(num_partitions) 34 | elif min_slice_size is not None and max_partitions is not None: 35 | partitioner = partitioned_variables.min_max_variable_partitioner( 36 | max_partitions=max_partitions, 37 | min_slice_size=min_slice_size << 20) 38 | return partitioner 39 | 40 | 41 | def _single_cell(unit_type, num_units, forget_bias, dropout, 42 | mode, residual_connection=False): 43 | """Create an instance of a single RNN cell.""" 44 | # dropout (= 1 - keep_prob) is set to 0 during eval and infer 45 | logger_list = [] 46 | # Cell Type 47 | if unit_type == "lstm": 48 | logger_list.append(" LSTM, forget_bias=%g" % forget_bias) 49 | single_cell = tf.contrib.rnn.BasicLSTMCell(num_units, forget_bias=forget_bias) 50 | elif unit_type == "lstmblock": 51 | logger_list.append(" LSTM Block, forget_bias=%g" % forget_bias) 52 | single_cell = tf.contrib.rnn.LSTMBlockCell(num_units, forget_bias=forget_bias) 53 | elif unit_type == "lstmfused": 54 | logger_list.append(" LSTM Block Fused, forget_bias=%g" % forget_bias) 55 | single_cell = tf.contrib.rnn.LSTMBlockFusedCell(num_units, forget_bias=forget_bias) 56 | elif unit_type == "gru": 57 | logger_list.append(" GRU") 58 | single_cell = tf.contrib.rnn.GRUCell(num_units) 59 | elif unit_type == "layer_norm_lstm": 60 | logger_list.append(" Layer Normalized LSTM, forget_bias=%g" % forget_bias) 61 | single_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(num_units, forget_bias=forget_bias, layer_norm=True) 62 | else: 63 | raise ValueError("Unknown unit type %s!" 
% unit_type) 64 | 65 | # Dropout (= 1 - keep_prob) 66 | dropout = dropout if mode == "train" else 0 67 | single_cell = tf.contrib.rnn.DropoutWrapper(cell=single_cell, input_keep_prob=(1.0 - dropout)) 68 | logger_list.append(" %s " % type(single_cell).__name__) 69 | 70 | # Residual 71 | if residual_connection: 72 | single_cell = tf.contrib.rnn.ResidualWrapper(single_cell) 73 | logger_list.append(" %s" % type(single_cell).__name__) 74 | logging.info("".join(logger_list)) 75 | 76 | return single_cell 77 | 78 | 79 | def _cell_list(unit_type, num_units, num_layers, num_residual_layers, 80 | forget_bias, dropout, mode, single_cell_fn=None): 81 | """Create a list of RNN cells.""" 82 | if not single_cell_fn: 83 | single_cell_fn = _single_cell 84 | 85 | cell_list = [] 86 | for i in range(num_layers): 87 | logging.info(" cell %d" % i) 88 | single_cell = single_cell_fn( 89 | unit_type=unit_type, 90 | num_units=num_units, 91 | forget_bias=forget_bias, 92 | dropout=dropout, 93 | mode=mode, 94 | residual_connection=(i >= num_layers - num_residual_layers) 95 | ) 96 | cell_list.append(single_cell) 97 | 98 | return cell_list 99 | 100 | 101 | def create_rnn_cell(unit_type, num_units, num_layers, num_residual_layers, 102 | forget_bias, dropout, mode, attention_window_size, single_cell_fn=None): 103 | """Create multi-layer RNN cell. 104 | 105 | Args: 106 | unit_type: string representing the unit type, i.e. "lstm". 107 | num_units: the depth of each unit. 108 | num_layers: number of cells. 109 | num_residual_layers: Number of residual layers from top to bottom. For 110 | example, if `num_layers=4` and `num_residual_layers=2`, the last 2 RNN 111 | cells in the returned list will be wrapped with `ResidualWrapper`. 112 | forget_bias: the initial forget bias of the RNNCell(s). 113 | dropout: floating point value between 0.0 and 1.0: 114 | the probability of dropout. this is ignored if `mode != train`. 115 | mode: either train/predict 116 | single_cell_fn: single_cell_fn: allow for adding customized cell. 117 | When not specified, we default to model_helper._single_cell 118 | Returns: 119 | An `RNNCell` instance. 120 | """ 121 | cell_list = _cell_list(unit_type=unit_type, 122 | num_units=num_units, 123 | num_layers=num_layers, 124 | num_residual_layers=num_residual_layers, 125 | forget_bias=forget_bias, 126 | dropout=dropout, 127 | mode=mode, 128 | single_cell_fn=single_cell_fn) 129 | 130 | if len(cell_list) == 1: # Single layer. 131 | final_cell = cell_list[0] 132 | else: # Multi layers 133 | final_cell = tf.contrib.rnn.MultiRNNCell(cell_list) 134 | 135 | # Attention Wrapper Cell 136 | if attention_window_size is not None: 137 | final_cell = tf.contrib.rnn.AttentionCellWrapper(final_cell, attention_window_size) 138 | return final_cell 139 | 140 | 141 | def gradient_clip(gradients, max_gradient_norm): 142 | """Clipping gradients of a model.""" 143 | clipped_gradients, gradient_norm = tf.clip_by_global_norm(gradients, max_gradient_norm) 144 | tf.summary.scalar("grad_norm", gradient_norm) 145 | tf.summary.scalar("clipped_gradient", tf.global_norm(clipped_gradients)) 146 | 147 | return clipped_gradients 148 | 149 | 150 | def extract_axis_1(data, ind): 151 | """ 152 | Get specified elements along the first axis of tensor. 153 | :param data: Tensorflow tensor that will be subsetted. 154 | :param ind: Indices to take (one for each element along axis 0 of data). 155 | :return: Subsetted tensor. 
156 | """ 157 | batch_range = tf.range(tf.shape(data)[0], dtype=tf.int32) 158 | indices = tf.stack([batch_range, ind], axis=1) 159 | res = tf.gather_nd(data, indices) 160 | return res 161 | 162 | 163 | def get_optimizer(hparams, _global_step): 164 | _learning_rate = tf.constant(hparams.learning_rate) 165 | opt = tf.train.GradientDescentOptimizer(hparams.learning_rate) 166 | if hparams.optimizer == "sgd": 167 | _learning_rate = tf.cond( 168 | _global_step < hparams.start_decay_step, 169 | lambda: tf.constant(hparams.learning_rate), 170 | lambda: tf.train.exponential_decay( 171 | hparams.learning_rate, 172 | (_global_step - hparams.start_decay_step), 173 | hparams.decay_steps, 174 | hparams.decay_factor, 175 | staircase=True), 176 | name="learning_rate") 177 | opt = tf.train.GradientDescentOptimizer(_learning_rate) 178 | elif hparams.optimizer == "adam": 179 | assert float(hparams.learning_rate) <= 0.001, "! High Adam learning rate %g" % hparams.learning_rate 180 | opt = tf.train.AdamOptimizer(hparams.learning_rate) 181 | elif hparams.optimizer == 'adagrad': 182 | opt = tf.train.AdagradOptimizer(hparams.learning_rate) 183 | elif hparams.optimizer == 'adadelta': 184 | opt = tf.train.AdadeltaOptimizer(hparams.learning_rate) 185 | elif hparams.optimizer == 'RMSprop': 186 | opt = tf.train.RMSPropOptimizer(hparams.learning_rate) 187 | tf.summary.scalar("lr", _learning_rate) 188 | return opt, _learning_rate 189 | 190 | 191 | def hash_bucket_embedding(name, bucket_size, dim, use_hashmap=False): 192 | if use_hashmap: 193 | id_feature = tf.contrib.layers.sparse_column_with_hash_bucket( 194 | column_name=name, hash_bucket_size=bucket_size, use_hashmap=True) 195 | else: 196 | id_feature = tf.contrib.layers.sparse_column_with_hash_bucket( 197 | column_name=name, hash_bucket_size=bucket_size) 198 | return tf.contrib.layers.embedding_column(sparse_id_column=id_feature, dimension=dim) 199 | 200 | 201 | def learned_positional_encoding(inputs, max_length, num_units): 202 | outputs = tf.range(tf.shape(inputs)[1]) # (T_q) 203 | outputs = tf.where(tf.greater_equal(outputs, max_length), tf.fill(tf.shape(outputs), max_length - 1), outputs) 204 | outputs = tf.expand_dims(outputs, 0) # (1, T_q) 205 | outputs = tf.tile(outputs, [tf.shape(inputs)[0], 1]) # (N, T_q) 206 | with variable_scope.variable_scope("embeddings") as scope: 207 | pos_embedding = tf.get_variable(name="pos_embedding", shape=[max_length, num_units], 208 | dtype=tf.float32) 209 | encoded = tf.nn.embedding_lookup(pos_embedding, outputs) 210 | return encoded 211 | 212 | 213 | def pointwise_feedforward(inputs, drop_out, is_training, num_units=None, activation=None): 214 | # Inner layer 215 | # outputs = tf.layers.conv1d(inputs, num_units[0], kernel_size=1, activation=activation) 216 | outputs = tf.layers.dense(inputs, num_units[0], activation=activation) 217 | outputs = tf.layers.dropout(outputs, drop_out, training=is_training) 218 | # Readout layer 219 | # outputs = tf.layers.conv1d(outputs, num_units[1], kernel_size=1, activation=None) 220 | outputs = tf.layers.dense(outputs, num_units[1], activation=None) 221 | 222 | # drop_out before add&norm 223 | outputs = tf.layers.dropout(outputs, drop_out, training=is_training) 224 | # Residual connection 225 | outputs += inputs 226 | # Normalize 227 | outputs = layer_norm(outputs) 228 | return outputs 229 | 230 | 231 | def layer_norm(inputs, epsilon=1e-8): 232 | mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True) 233 | normalized = (inputs - mean) / (tf.sqrt(variance + epsilon)) 234 | 235 | 
params_shape = inputs.get_shape()[-1:] 236 | gamma = tf.get_variable('gamma', params_shape, tf.float32, tf.ones_initializer()) 237 | beta = tf.get_variable('beta', params_shape, tf.float32, tf.zeros_initializer()) 238 | 239 | outputs = gamma * normalized + beta 240 | return outputs 241 | 242 | 243 | def self_multi_head_attn(inputs, num_units, num_heads, key_masks, dropout_rate, is_training, is_layer_norm=True): 244 | """ 245 | Args: 246 | inputs(query): A 3d tensor with shape of [N, T_q, C_q] 247 | inputs(keys): A 3d tensor with shape of [N, T_k, C_k] 248 | """ 249 | if num_units is None: 250 | num_units = inputs.get_shape().as_list[-1] 251 | 252 | Q_K_V = tf.layers.dense(inputs, 3 * num_units) # tf.nn.relu 253 | Q, K, V = tf.split(Q_K_V, 3, -1) 254 | 255 | Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h) 256 | K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 257 | V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 258 | 259 | # (h*N, T_q, T_k) 260 | align = general_attention(Q_, K_) 261 | 262 | # (h*N, T_k) 263 | key_masks = tf.tile(key_masks, [num_heads, 1]) 264 | # (h*N, T_q, T_k) 265 | key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(inputs)[1], 1]) 266 | # (h*N, T_q, C/h) 267 | outputs = soft_max_weighted_sum(align, V_, key_masks, dropout_rate, is_training, future_binding=True) 268 | 269 | # Restore shape 270 | outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2) # (N, T_q, C) 271 | # output linear 272 | outputs = tf.layers.dense(outputs, num_units) 273 | 274 | # drop_out before residual and layernorm 275 | outputs = tf.layers.dropout(outputs, dropout_rate, training=is_training) 276 | # Residual connection 277 | outputs += inputs # (N, T_q, C) 278 | # Normalize 279 | if is_layer_norm: 280 | outputs = layer_norm(outputs) # (N, T_q, C) 281 | 282 | return outputs 283 | 284 | 285 | def concat_attention(query, key): 286 | """ 287 | :param query: [batch_size, 1, query_size] -> [batch_size, time, query_size] 288 | :param key: [batch_size, time, key_size] 289 | :return: [batch_size, 1, time] 290 | query_size should keep the same dim with key_size 291 | """ 292 | # TODO: only support 1D attention at present 293 | # query = tf.tile(query, [1, tf.shape(key)[1], 1]) 294 | # [batch_size, time, q_size+k_size] 295 | q_k = tf.concat([query, key], axis=-1) 296 | # [batch_size, time, 1] 297 | align = tf.layers.dense(q_k, 1, tf.nn.tanh) # tf.nn.relu old 298 | # scale (optional) 299 | align = align / (key.get_shape().as_list()[-1] ** 0.5) 300 | align = tf.transpose(align, [0, 2, 1]) 301 | return align 302 | 303 | 304 | def general_attention(query, key): 305 | """ 306 | :param query: [batch_size, None, query_size] 307 | :param key: [batch_size, time, key_size] 308 | :return: [batch_size, None, time] 309 | query_size should keep the same dim with key_size 310 | """ 311 | # [batch_size, None, time] 312 | align = tf.matmul(query, tf.transpose(key, [0, 2, 1])) 313 | # scale (optional) 314 | align = align / (key.get_shape().as_list()[-1] ** 0.5) 315 | return align 316 | 317 | 318 | def self_attention(inputs, num_units, key_masks, dropout_rate, is_training, is_layer_norm=True): 319 | """ 320 | Args: 321 | inputs(queries): A 3d tensor with shape of [N, T_q, C_q] 322 | inputs(keys): A 3d tensor with shape of [N, T_k, C_k] 323 | """ 324 | # if num_units is None: 325 | # num_units = inputs.get_shape().as_list[-1] 326 | 327 | # (N, T_q, C) 328 | # Q = tf.layers.dense(inputs, num_units, tf.nn.relu, name='unlinear_trans', 
reuse=tf.AUTO_REUSE) 329 | # (N, T_k, C) 330 | # K = tf.layers.dense(inputs, num_units, tf.nn.relu, name="unlinear_trans", reuse=tf.AUTO_REUSE) 331 | 332 | Q = inputs 333 | K = inputs 334 | V = inputs 335 | 336 | align = general_attention(Q, K) 337 | outputs = soft_max_weighted_sum(align, V, key_masks, dropout_rate, is_training, future_binding=True) 338 | 339 | # Residual connection 340 | # outputs += inputs # (N, T_q, C) 341 | if is_layer_norm: 342 | # Normalize 343 | outputs = layer_norm(outputs) # (N, T_q, C) 344 | return outputs 345 | 346 | 347 | def soft_max_weighted_sum(align, value, key_masks, drop_out, is_training, future_binding=False): 348 | """ 349 | :param align: [batch_size, None, time] 350 | :param value: [batch_size, time, units] 351 | :param key_masks: [batch_size, None, time] 352 | 2nd dim size with align 353 | :param drop_out: 354 | :param is_training: 355 | :param future_binding: TODO: only support 2D situation at present 356 | :return: weighted sum vector 357 | [batch_size, None, units] 358 | """ 359 | # exp(-large) -> 0 360 | paddings = tf.fill(tf.shape(align), float('-inf')) 361 | # [batch_size, None, time] 362 | align = tf.where(key_masks, align, paddings) 363 | 364 | if future_binding: 365 | length = tf.reshape(tf.shape(value)[1], [-1]) 366 | # [time, time] 367 | lower_tri = tf.ones(tf.concat([length, length], axis=0)) 368 | # [time, time] 369 | lower_tri = tf.contrib.linalg.LinearOperatorTriL(lower_tri).to_dense() 370 | # [batch_size, time, time] 371 | masks = tf.tile(tf.expand_dims(lower_tri, 0), [tf.shape(align)[0], 1, 1]) 372 | # [batch_size, time, time] 373 | align = tf.where(tf.equal(masks, 0), paddings, align) 374 | 375 | # soft_max and dropout 376 | # [batch_size, None, time] 377 | align = tf.nn.softmax(align) 378 | align = tf.layers.dropout(align, drop_out, training=is_training) 379 | # weighted sum 380 | # [batch_size, None, units] 381 | return tf.matmul(align, value) 382 | 383 | 384 | def sequence_feature_mask(columns_to_tensors, feature_columns, seq_len, avg_pooling=False, 385 | user_embedding=None, drop_out=0, is_training=True): 386 | # [batch_size, time, units] 387 | encoded = layers.sequence_input_from_feature_columns( 388 | columns_to_tensors=columns_to_tensors, 389 | feature_columns=feature_columns, 390 | scope="reuse_embedding" 391 | ) 392 | 393 | # [batch_size, time] 394 | key_masks = tf.sequence_mask(seq_len, tf.shape(encoded)[1], dtypes.float32) 395 | 396 | if avg_pooling: 397 | # [batch_size, time, 1] 398 | key_masks = tf.reshape(key_masks, [-1, tf.shape(encoded)[1], 1]) 399 | encoded = tf.multiply(encoded, key_masks) 400 | encoded = tf.reduce_sum(encoded, 1) / tf.reshape(tf.cast(seq_len, dtypes.float32), [-1, 1]) 401 | else: 402 | # [batch_size, 1, time] 403 | query = tf.tile(user_embedding, [1, tf.shape(encoded)[1], 1]) 404 | align = concat_attention(query, encoded) 405 | key_masks = tf.cast(key_masks, dtypes.bool) 406 | # [batch_size, 1, time] 407 | key_masks = tf.expand_dims(key_masks, 1) 408 | encoded = soft_max_weighted_sum(align, encoded, key_masks, drop_out, is_training) 409 | encoded = tf.squeeze(encoded, 1) 410 | # [batch_size, units] 411 | return encoded 412 | -------------------------------------------------------------------------------- /code/model_utils/task_config.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import json 3 | from tensorflow.python.lib.io import file_io 4 | from tensorflow.python.platform import tf_logging as logging 5 | 6 | 7 | class TaskConfig(object): 8 
| def __init__(self, param_map=None, conf_file_path=None): 9 | self._param_map = {} 10 | try: 11 | config = json.load(file_io.FileIO(conf_file_path, 'r')) 12 | if not config: 13 | logging.error("config file not exists") 14 | if config['parameters']: 15 | self._param_map = config['parameters'] 16 | if param_map: 17 | self._param_map.update(param_map) 18 | except: 19 | logging.info("load conf error!") 20 | 21 | def get_config(self, config_name, default=None): 22 | return self._param_map.get(config_name, default) 23 | 24 | def get_config_as_int(self, config_name, default=None): 25 | value_str = self.get_config(config_name, default) 26 | return int(value_str) if value_str else value_str 27 | 28 | def get_config_as_float(self, config_name, default=None): 29 | value_str = self.get_config(config_name, default) 30 | return float(value_str) if value_str else value_str 31 | 32 | def get_config_as_bool(self, config_name, default=None): 33 | raw_value = self.get_config(config_name, default) 34 | if raw_value and isinstance(raw_value, bool): 35 | return raw_value 36 | elif raw_value and (isinstance(raw_value, str) or isinstance(raw_value, unicode)): 37 | return ast.literal_eval(raw_value) 38 | else: 39 | return False 40 | 41 | def get_config_as_list(self, config_name, default=None): 42 | raw_value = self.get_config(config_name, default) 43 | if raw_value and isinstance(raw_value, list): 44 | return raw_value 45 | else: 46 | return ast.literal_eval(raw_value) 47 | 48 | def contains(self, config_name): 49 | return config_name in self._param_map 50 | 51 | def add_config(self, key, value): 52 | self._param_map[key] = value 53 | 54 | def add_if_not_contain(self, key, value): 55 | if not self.contains(key): 56 | self.add_config(key, value) 57 | -------------------------------------------------------------------------------- /code/models/basic_modules.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib import layers 3 | from tensorflow.python.ops import variable_scope 4 | from tensorflow.python.ops.nn_impl import sampled_softmax_loss 5 | from model_utils import model_helper 6 | from model_utils.model_helper import extract_axis_1, get_optimizer, layer_norm, sequence_feature_mask, \ 7 | self_multi_head_attn, self_attention, pointwise_feedforward, general_attention, concat_attention, \ 8 | soft_max_weighted_sum, learned_positional_encoding 9 | from tensorflow.python.framework import dtypes 10 | 11 | 12 | class BasicModules: 13 | def __init__(self, parser, hparams): 14 | self.hparams = hparams 15 | self.parser = parser 16 | self.num_units = self.hparams.num_units 17 | self.global_step = tf.train.get_or_create_global_step() 18 | self.initializer = model_helper.get_initializer(self.hparams.init_op, 19 | self.hparams.seed, 20 | self.hparams.init_weight) 21 | self.kernel_initializer = model_helper.get_initializer(self.hparams.nn_init_op, 22 | seed=self.hparams.seed) 23 | self.partitioner = model_helper.get_emb_partitioner(self.hparams.num_partitions, 24 | self.hparams.min_slice_size, 25 | self.hparams.ps_num) 26 | self.dropout = tf.placeholder(tf.float32, name="dropout") \ 27 | if self.hparams.validation else self.hparams.dropout 28 | self.is_training = self.hparams.mode == 'train' 29 | self.my_dict = {} 30 | 31 | def dataset_batch(self, params, dataset): 32 | def _parse_function(example_proto): 33 | features = tf.parse_single_example(example_proto, features=self.parser.feature_map) 34 | sparse2dense = {k: 
tf.sparse_tensor_to_dense(f, default_value=0) 35 | for k, f in features.iteritems() 36 | if isinstance(f, tf.SparseTensor) and f.dtype != tf.string} 37 | features.update(sparse2dense) 38 | sparse2dense = {k: tf.sparse_tensor_to_dense(f, default_value="0") 39 | for k, f in features.iteritems() 40 | if isinstance(f, tf.SparseTensor) and f.dtype == tf.string} 41 | features.update(sparse2dense) 42 | # tf.logging.info(features) 43 | return self.parser.output_one_example(features) 44 | 45 | # Bucket by source sequence length (buckets for lengths 0-9, 10-19, ...) 46 | def batching_func(x): 47 | tf_padded = {"fix": [], "int": tf.cast(0, tf.int64), "str": "0", 48 | "var": [None], "str_multi": "43,35,12,54,21"} 49 | padded_shapes = [] 50 | padded_values = [] 51 | for key in self.parser.input_keys: 52 | key_1 = key[1] 53 | key_2 = key[2] if key[0] != "multi_labels" else "str_multi" 54 | padded_shapes.append(tf_padded[key_1]) 55 | padded_values.append(tf_padded[key_2]) 56 | 57 | return x.padded_batch(params['batch_size'], 58 | padded_shapes=tuple(padded_shapes), 59 | padding_values=tuple(padded_values)) 60 | 61 | def key_func(src_len, *unused_list): 62 | # Calculate bucket_width by maximum source sequence length. 63 | # Pairs with length [0, bucket_width) go to bucket 0, length 64 | # [bucket_width, 2 * bucket_width) go to bucket 1, etc. Pairs with length 65 | # over ((num_bucket-1) * bucket_width) words all go into the last bucket. 66 | if self.hparams.max_length > 1: 67 | bucket_width = (self.hparams.max_length + params['num_buckets'] - 1) // params['num_buckets'] 68 | else: 69 | bucket_width = 5 70 | # Bucket sentence pairs by the length of their source sentence and target 71 | # sentence. 72 | bucket_id = src_len // bucket_width 73 | bucket_id = tf.cast(bucket_id, tf.int32) 74 | return tf.to_int64(tf.minimum(params['num_buckets'], bucket_id)) 75 | 76 | def reduce_func(unused_key, windowed_data): 77 | return batching_func(windowed_data) 78 | 79 | dataset = dataset.map(_parse_function, num_parallel_calls=32) 80 | dataset = dataset.repeat(params["epochs"]) 81 | if params['shuffle']: 82 | dataset = dataset.shuffle(buffer_size=10000, seed=self.hparams.seed) 83 | if params['num_buckets'] > 1 and params['mode'] == 'train': 84 | dataset = dataset.apply(tf.contrib.data.group_by_window(key_func=key_func, reduce_func=reduce_func, 85 | window_size=params["batch_size"])) 86 | elif params['mode'] == 'train' or params['mode'] == "test": 87 | dataset = batching_func(dataset) 88 | else: 89 | dataset = dataset.batch(params["batch_size"]) 90 | dataset = dataset.prefetch(buffer_size=1000) 91 | return dataset 92 | 93 | def input_fn_dataset(self, file_list, data_type="train"): 94 | if data_type == "test": 95 | params = {"mode": "test", "epochs": 1, "shuffle": False, "batch_size": 64, "num_buckets": 0} 96 | else: 97 | params = {"mode": self.hparams.mode, "epochs": self.hparams.num_epochs, "shuffle": self.hparams.shuffle, 98 | "batch_size": self.hparams.batch_size, "num_buckets": self.hparams.num_buckets} 99 | 100 | with tf.name_scope(data_type + '_input_fn') as scope: 101 | dataset = tf.data.TFRecordDataset(file_list) 102 | dataset = self.dataset_batch(params, dataset) 103 | if self.hparams.validation: 104 | return dataset 105 | else: 106 | iterator = dataset.make_one_shot_iterator() 107 | return self.parser.output_features(iterator) 108 | 109 | def create_item_embeddings(self, features): 110 | # soft_max 111 | with variable_scope.variable_scope("soft_max", values=None, partitioner=self.partitioner) as scope: 112 | 
nce_biases = tf.zeros([self.hparams.vocab_size], name='bias') 113 | nce_weights = tf.get_variable(name='weight', shape=[self.hparams.vocab_size, self.num_units], 114 | dtype=tf.float32, initializer=self.initializer) 115 | 116 | # input item embeddings 117 | with variable_scope.variable_scope("item_embeddings", partitioner=self.partitioner, 118 | initializer=self.initializer, reuse=tf.AUTO_REUSE) as scope: 119 | embeddings = self.parser.embedding_columns(feature_type="item") 120 | if self.hparams.item_id_only: 121 | encoded = layers.sequence_input_from_feature_columns( 122 | columns_to_tensors={"item_emb": features["item_ids"]}, 123 | feature_columns=[embeddings[0]], scope="reuse_embedding") 124 | else: 125 | encoded = layers.sequence_input_from_feature_columns( 126 | columns_to_tensors={"item_emb": features["item_ids"], 127 | "shop_emb": features["shop_ids"], 128 | "cate_emb": features["cate_ids"], 129 | "brand_emb": features["brand_ids"]}, 130 | feature_columns=embeddings, scope="reuse_embedding") 131 | if self.hparams.item_fc_trans: 132 | encoded = tf.layers.dense(encoded, self.num_units, tf.nn.tanh, 133 | kernel_initializer=self.kernel_initializer, 134 | name="item_fc") 135 | return nce_weights, nce_biases, encoded 136 | 137 | def create_user_embeddings(self, features): 138 | # input user embedding 139 | with variable_scope.variable_scope("user_embeddings", partitioner=self.partitioner, 140 | initializer=self.initializer) as scope: 141 | embeddings_fix = self.parser.embedding_columns(feature_type="user_fix", use_hashmap=True) 142 | if self.hparams.use_user_id and self.hparams.user_id_only: 143 | encoded = layers.input_from_feature_columns( 144 | columns_to_tensors={"user_id_emb": features["user_id"]}, 145 | feature_columns=[embeddings_fix[0]]) 146 | else: 147 | personal_encoded = [] 148 | profile_features = {} 149 | for fs_name in self.parser.embedding_user_features_fix: 150 | profile_features.update({fs_name + "_emb": features[fs_name]}) 151 | 152 | profile_encoded = layers.input_from_feature_columns( 153 | columns_to_tensors=profile_features, 154 | feature_columns=embeddings_fix) 155 | 156 | personal_encoded.append(profile_encoded) 157 | encoded = tf.concat(personal_encoded, -1) 158 | 159 | if self.hparams.user_fc_trans: 160 | encoded = tf.layers.dense(encoded, self.num_units, tf.nn.tanh, 161 | kernel_initializer=self.kernel_initializer) 162 | return encoded 163 | 164 | def create_prefer_embeddings(self, features, user_embedding): 165 | # input prefer item embeddings 166 | with variable_scope.variable_scope("item_embeddings", partitioner=self.partitioner, 167 | initializer=self.initializer, reuse=tf.AUTO_REUSE) as scope: 168 | embeddings = self.parser.embedding_columns(feature_type="item") 169 | feature_names = ["item", "shop", "brand", "cate"] 170 | if self.hparams.item_id_only: 171 | feature_names = [feature_names[0]] 172 | prefer_outputs = [] 173 | for i in range(len(feature_names)): 174 | key_emb = feature_names[i] + "_emb" 175 | value_emb = features["prefer_"+feature_names[i]+"s"] 176 | value_len = features[feature_names[i]+"s"+"_len"] 177 | prefer_encoded = sequence_feature_mask({key_emb: value_emb}, 178 | [embeddings[i]], 179 | value_len, 180 | avg_pooling=self.hparams.prefer_avg_pooling, 181 | user_embedding=user_embedding, 182 | drop_out=self.dropout, 183 | is_training=self.is_training) 184 | prefer_outputs.append(prefer_encoded) 185 | prefer_outputs = tf.concat(prefer_outputs, -1) 186 | if self.hparams.prefer_fc: 187 | prefer_outputs = tf.layers.dense(prefer_outputs, 
self.num_units, tf.nn.tanh, 188 | kernel_initializer=self.kernel_initializer, 189 | name="prefer_fc") 190 | return prefer_outputs 191 | 192 | def create_rnn_encoder(self, seq_len, inputs): 193 | with tf.variable_scope("encoder", values=None, 194 | initializer=model_helper.get_initializer(self.hparams.nn_init_op, seed=self.hparams.seed), 195 | partitioner=self.partitioner) as scope: 196 | cell = model_helper.create_rnn_cell(unit_type=self.hparams.unit_type, 197 | num_units=self.hparams.rnn_hidden_units, 198 | num_layers=self.hparams.num_layers, 199 | num_residual_layers=self.hparams.num_residual_layers, 200 | forget_bias=self.hparams.forget_bias, 201 | dropout=self.dropout, 202 | mode=self.hparams.mode, 203 | attention_window_size=self.hparams.attention_window_size) 204 | 205 | rnn_outputs, last_states = tf.nn.dynamic_rnn(cell=cell, dtype=tf.float32, 206 | sequence_length=seq_len, inputs=inputs) 207 | 208 | if self.hparams.rnn_layer_norm: 209 | rnn_outputs = layer_norm(rnn_outputs) 210 | 211 | return rnn_outputs, last_states 212 | 213 | def create_position_encoding(self, inputs): 214 | with tf.variable_scope('add_pos_encoding', initializer=self.initializer, partitioner=self.partitioner): 215 | pos_input = learned_positional_encoding(inputs, self.hparams.max_length, self.num_units) 216 | outputs = inputs + pos_input 217 | outputs = tf.layers.dropout(outputs, self.dropout, training=self.is_training) 218 | return outputs 219 | 220 | def create_self_attn(self, key_masks_1d, key_masks_2d, inputs): 221 | attn_outputs = inputs 222 | for layer in range(self.hparams.num_multi_head): 223 | with tf.variable_scope('self_attn_'+str(layer), partitioner=self.partitioner): 224 | if self.hparams.NARM: 225 | attn_outputs = self_attention(attn_outputs, num_units=self.num_units, 226 | key_masks=key_masks_2d, dropout_rate=self.dropout, 227 | is_training=self.is_training, 228 | is_layer_norm=self.hparams.attn_layer_norm) 229 | else: 230 | attn_outputs = self_multi_head_attn(attn_outputs, num_units=self.num_units, 231 | num_heads=self.hparams.num_heads, key_masks=key_masks_1d, 232 | dropout_rate=self.dropout, is_training=self.is_training, 233 | is_layer_norm=self.hparams.attn_layer_norm) 234 | with tf.variable_scope('ffn_'+str(layer), partitioner=self.partitioner): 235 | if self.hparams.self_attn_ffn: 236 | attn_outputs = pointwise_feedforward(attn_outputs, self.dropout, self.is_training, 237 | num_units=[self.num_units, self.num_units], # 4 * 238 | activation=tf.nn.relu) 239 | 240 | with tf.variable_scope('attn_concat', partitioner=self.partitioner): 241 | if self.hparams.STAMP: 242 | inputs = tf.layers.dense(inputs, self.num_units, tf.nn.tanh) 243 | attn_outputs = tf.layers.dense(attn_outputs, self.num_units, tf.nn.tanh) 244 | attn_outputs = tf.multiply(attn_outputs, inputs) 245 | 246 | if self.hparams.NARM and not self.hparams.STAMP: 247 | attn_outputs = tf.concat([attn_outputs, inputs], axis=-1) 248 | if self.hparams.attn_fc: 249 | attn_outputs = tf.layers.dense(attn_outputs, self.num_units) 250 | 251 | return attn_outputs 252 | 253 | def create_user_attn(self, key_masks, inputs, user_embedding_1d, user_embedding_2d): 254 | """ 255 | Args: 256 | user_embedding : [batch_size, user_embedding_size] 257 | inputs : [batch_size, time, num_units] 258 | key_masks: sequence mask, 2D tensor 259 | Returns: 260 | outputs : [batch_size, time, num_units] 261 | """ 262 | with tf.variable_scope('user_attn', partitioner=self.partitioner): 263 | # [batch_size, 1, num_units] 264 | # query = tf.expand_dims(user_embedding, 1) 265 
| key = inputs 266 | align = None 267 | if self.hparams.user_attn == 'general': 268 | query = tf.layers.dense(user_embedding_1d, self.num_units, tf.nn.tanh) 269 | align = general_attention(query, key) 270 | elif self.hparams.user_attn == 'concat': 271 | query = user_embedding_2d 272 | align = concat_attention(query, key) 273 | 274 | # [batch_size, time, time] 275 | align = tf.tile(align, [1, tf.shape(inputs)[1], 1]) 276 | outputs = soft_max_weighted_sum(align, key, key_masks, self.dropout, self.is_training, future_binding=True) 277 | 278 | if self.hparams.user_residual: 279 | outputs += inputs 280 | # outputs = layer_norm(outputs) 281 | 282 | return outputs 283 | 284 | def create_item_user_input(self, seq_input, user_embedding): 285 | with tf.variable_scope('item_user_feature', partitioner=self.partitioner): 286 | # user_embedding = tf.tile(tf.expand_dims(user_embedding, 1), [1, tf.shape(seq_input)[1], 1]) 287 | seq_input = tf.concat([seq_input, user_embedding], axis=-1) 288 | return seq_input 289 | 290 | def combine_long_short(self, short_rep, long_rep, user_embedding): 291 | """ 292 | short_rep: [batch_size, time, units] 293 | long_rep: [batch_size, units] 294 | user_embedding: [batch_size, units] 295 | """ 296 | with variable_scope.variable_scope("fusion", partitioner=self.partitioner) as scope: 297 | long_rep = tf.tile(tf.expand_dims(long_rep, 1), [1, tf.shape(short_rep)[1], 1]) 298 | if self.hparams.fusion_op == "add": 299 | outputs = long_rep + short_rep 300 | elif self.hparams.fusion_op == "multiply": 301 | outputs = tf.multiply(long_rep, short_rep) 302 | elif self.hparams.fusion_op == "concat": 303 | outputs = tf.concat([short_rep, long_rep], axis=-1) 304 | outputs = tf.layers.dense(outputs, self.num_units) 305 | elif self.hparams.fusion_op == "feature_gated": 306 | f_input = tf.concat([short_rep, long_rep], -1) 307 | f = tf.layers.dense(f_input, self.num_units, activation=tf.nn.tanh) 308 | g_input = tf.concat([short_rep, long_rep], -1) 309 | g = tf.layers.dense(g_input, self.num_units, activation=tf.sigmoid) 310 | outputs = tf.multiply(g, short_rep) + tf.multiply(1 - g, f) 311 | tf.summary.scalar("gate", tf.reduce_mean(g)) 312 | else: 313 | g_units = self.num_units 314 | if self.hparams.g_units_one: 315 | g_units = 1 316 | # user_embedding = tf.tile(tf.expand_dims(user_embedding, 1), [1, tf.shape(short_rep)[1], 1]) 317 | g_input = tf.concat([short_rep, long_rep, user_embedding], -1) 318 | g = tf.layers.dense(g_input, g_units, activation=tf.sigmoid) 319 | outputs = tf.multiply(g, short_rep) + tf.multiply(1 - g, long_rep) 320 | tf.summary.scalar("gate", tf.reduce_mean(g)) 321 | 322 | return outputs 323 | 324 | def calculate_loss(self, nce_weights, nce_biases, label_split, rnn_outputs_split, target_splits, batch_size, 325 | sampled_values=None): 326 | 327 | sampled_loss = sampled_softmax_loss( 328 | weights=nce_weights, 329 | biases=nce_biases, 330 | labels=label_split, 331 | inputs=rnn_outputs_split, 332 | num_sampled=self.hparams.num_samples, 333 | num_classes=self.hparams.vocab_size, 334 | num_true=self.hparams.num_labels, 335 | sampled_values=sampled_values, 336 | partition_strategy=self.hparams.partn_strgy 337 | ) 338 | 339 | sampled_loss = tf.reshape(sampled_loss, [batch_size, -1]) 340 | sampled_loss = tf.reduce_sum(sampled_loss * target_splits) 341 | 342 | return sampled_loss 343 | 344 | def create_split_optimizer(self, features, outputs, nce_weights, nce_biases): 345 | seq_len = tf.cast(features["seq_len"], dtypes.int32) 346 | batch_size = tf.shape(outputs)[0] 347 | 348 | 
with tf.variable_scope("loss") as scope: 349 | rnn_outputs_flat = tf.reshape(outputs, [-1, self.num_units]) 350 | num_labels = self.hparams.num_labels 351 | if num_labels > 1: 352 | multi_labels = tf.reshape(features["multi_labels"], [-1]) 353 | multi_labels = tf.string_split(multi_labels, delimiter=",").values 354 | multi_labels = tf.reshape(multi_labels, [-1, num_labels]) 355 | label_flat = tf.string_to_number(multi_labels, out_type=tf.int64) 356 | else: 357 | label_flat = tf.reshape(features["labels"], [-1, 1]) 358 | istarget = tf.sequence_mask(seq_len, tf.shape(outputs)[1], dtype=outputs.dtype) 359 | 360 | rnn_outputs_splits = tf.split(rnn_outputs_flat, num_or_size_splits=self.hparams.split_size, 361 | name="rnn_output_split", axis=0) 362 | label_splits = tf.split(label_flat, num_or_size_splits=self.hparams.split_size, 363 | name="label_split", axis=0) 364 | istarget_splits = tf.split(istarget, num_or_size_splits=self.hparams.split_size, 365 | name="istarget_split", axis=0) 366 | 367 | losses = [] 368 | 369 | i = 0 370 | for (rnn_outputs_split, label_split, target_split) in zip(rnn_outputs_splits, 371 | label_splits, 372 | istarget_splits): 373 | with tf.variable_scope("loss_" + str(i)) as scope: 374 | sampled_loss = self.calculate_loss(nce_weights, nce_biases, 375 | label_split, rnn_outputs_split, 376 | target_split, batch_size / self.hparams.split_size) 377 | losses.append(sampled_loss) 378 | i += 1 379 | 380 | all_loss = sum(losses) 381 | 382 | _mean_loss_by_example = all_loss / (tf.to_float(batch_size)) 383 | _mean_loss_by_pos = all_loss / (tf.reduce_sum(istarget)) 384 | if self.hparams.loss_by_example: 385 | _mean_loss = _mean_loss_by_example 386 | else: 387 | _mean_loss = _mean_loss_by_pos 388 | _mean_loss = tf.check_numerics(_mean_loss, "loss is nan of inf") 389 | 390 | with tf.variable_scope("metrics"): 391 | tf.summary.scalar("mean_loss_by_example", _mean_loss_by_example) 392 | tf.summary.scalar("mean_loss_by_pos", _mean_loss_by_pos) 393 | tf.summary.scalar("train_loss", _mean_loss) 394 | for i in range(self.hparams.split_size): 395 | tf.summary.scalar("sample_loss_" + str(i), losses[i]) 396 | 397 | with tf.variable_scope("optimizer") as scope: 398 | params = tf.trainable_variables() 399 | gradients = tf.gradients(_mean_loss, params, 400 | colocate_gradients_with_ops=self.hparams.colocate_gradients_with_ops) 401 | clipped_gradients = model_helper.gradient_clip(gradients=gradients, 402 | max_gradient_norm=self.hparams.max_gradient_norm) 403 | opt, _learning_rate = get_optimizer(self.hparams, self.global_step) 404 | train_op = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step) 405 | 406 | self.my_dict.update({ 407 | 'learning_rate': _learning_rate, 408 | 'loss': _mean_loss, 409 | 'drop_out': self.dropout 410 | }) 411 | 412 | if self.hparams.validation: 413 | with tf.variable_scope("validation"): 414 | last_output = extract_axis_1(outputs, seq_len - 1) 415 | logits = tf.matmul(last_output, tf.transpose(nce_weights)) + nce_biases 416 | top_item_ids = tf.nn.top_k(logits, k=self.hparams.topK).indices 417 | top_item_ids = tf.reshape(top_item_ids, [batch_size, self.hparams.topK]) 418 | 419 | self.my_dict.update({ 420 | 'top_items': top_item_ids, 421 | 'user_id': features['user_id'], 422 | 'ds': features['ds'], 423 | "weight": nce_weights.as_tensor(), 424 | 'user_embedding_output': last_output 425 | }) 426 | 427 | return train_op, self.my_dict 428 | -------------------------------------------------------------------------------- 
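The default "gated" branch of `combine_long_short` above implements the fusion gate o_t = G_t * s_t + (1 - G_t) * p, with G_t = sigmoid(W [s_t ; p ; e_u] + b), where s_t is the short-term sequence output, p the long-term preference representation, and e_u the user embedding. The following toy NumPy sketch (not repository code; the function name and shapes are illustrative) mirrors that computation:

```python
# Toy NumPy sketch (not repository code) of the default "gated" fusion in
# BasicModules.combine_long_short: a sigmoid gate mixes the short-term
# sequence output with the long-term preference vector.
import numpy as np

def gated_fusion(short_rep, long_rep, user_emb, W, b):
    """short_rep: [T, d]; long_rep, user_emb: [d]; W: [3d, d]; b: [d]."""
    T = short_rep.shape[0]
    long_tiled = np.tile(long_rep, (T, 1))   # tile p over time, as in the TF code
    user_tiled = np.tile(user_emb, (T, 1))   # tile e_u over time
    gate_in = np.concatenate([short_rep, long_tiled, user_tiled], axis=-1)
    gate = 1.0 / (1.0 + np.exp(-(np.dot(gate_in, W) + b)))  # sigmoid(dense(.))
    # o_t = G_t * s_t + (1 - G_t) * p
    return gate * short_rep + (1.0 - gate) * long_tiled

T, d = 5, 4
rng = np.random.RandomState(0)
out = gated_fusion(rng.randn(T, d), rng.randn(d), rng.randn(d),
                   rng.randn(3 * d, d), np.zeros(d))
print(out.shape)  # (5, 4)
```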
/code/models/deep_match.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from models.basic_modules import BasicModules 3 | from models.extra_modules import ExtraModules 4 | 5 | 6 | class DeepMatch(BasicModules, ExtraModules): 7 | 8 | def __init__(self, parser, hparams): 9 | BasicModules.__init__(self, parser, hparams) 10 | ExtraModules.__init__(self, parser, hparams) 11 | 12 | def create_sequence_mask(self, features, seq_input): 13 | with tf.variable_scope('seq_masks', partitioner=self.partitioner): 14 | seq_len = features['seq_len'] 15 | max_seq_len = tf.shape(seq_input)[1] 16 | if "shan" in self.hparams.model: 17 | seq_len = seq_len + 1 18 | max_seq_len = max_seq_len + 1 19 | # [batch_size, time] 20 | key_masks_1d = tf.sequence_mask(seq_len, max_seq_len) 21 | # [batch_size, time, time] 22 | key_masks_2d = tf.tile(tf.expand_dims(key_masks_1d, 1), [1, max_seq_len, 1]) 23 | return key_masks_1d, key_masks_2d 24 | 25 | def create_expand_seq_dim(self, input_1d, seq_input): 26 | with tf.variable_scope('expand_seq_dim', partitioner=self.partitioner): 27 | max_seq_len = tf.shape(seq_input)[1] 28 | input_1d = tf.expand_dims(input_1d, 1) 29 | input_2d = tf.tile(input_1d, [1, max_seq_len, 1]) 30 | return input_1d, input_2d 31 | 32 | def model_fn_train(self, features): 33 | nce_weights, nce_biases, seq_input = self.create_item_embeddings(features) 34 | user_embedding_1d, user_embedding_2d, user_embedding = None, None, None 35 | 36 | if "personal" in self.hparams.model: 37 | user_embedding = self.create_user_embeddings(features) 38 | user_embedding_1d, user_embedding_2d = self.create_expand_seq_dim(user_embedding, seq_input) 39 | if self.hparams.input_user_feature: 40 | seq_input = self.create_item_user_input(seq_input, user_embedding_2d) 41 | 42 | key_masks_1d, key_masks_2d = self.create_sequence_mask(features, seq_input) 43 | 44 | outputs = seq_input 45 | if "rnn" in self.hparams.model: 46 | outputs, last_states = self.create_rnn_encoder(features['seq_len'], seq_input) 47 | elif "dnn" in self.hparams.model: 48 | outputs = self.average_item_embedding(seq_input) 49 | elif "ahead_pos" in self.hparams.model: 50 | outputs = self.create_position_encoding(seq_input) 51 | 52 | if "self_attn" in self.hparams.model: 53 | outputs = self.create_self_attn(key_masks_1d, key_masks_2d, outputs) 54 | if "user_attn" in self.hparams.model: 55 | outputs = self.create_user_attn(key_masks_2d, outputs, user_embedding_1d, user_embedding_2d) 56 | 57 | if "prefer" in self.hparams.model: 58 | prefer_outputs = self.create_prefer_embeddings(features, user_embedding_1d) 59 | if "dnn" in self.hparams.model: 60 | outputs = self.create_dnn(outputs, prefer_outputs, user_embedding) 61 | elif "shan" in self.hparams.model: 62 | outputs = self.create_in_shan(outputs, prefer_outputs) 63 | outputs = self.create_user_attn(key_masks_2d, outputs, user_embedding_1d, user_embedding_2d) 64 | outputs = self.create_out_shan(outputs) 65 | else: 66 | outputs = self.combine_long_short(outputs, prefer_outputs, user_embedding_2d) 67 | 68 | train_op, my_dict = self.create_split_optimizer(features, outputs, nce_weights, nce_biases) 69 | return train_op, self.global_step, my_dict 70 | -------------------------------------------------------------------------------- /code/models/extra_modules.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.framework import dtypes 3 | 4 | 5 | class ExtraModules: 6 | 
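    # ExtraModules holds the auxiliary components that DeepMatch mixes in
    # alongside BasicModules via multiple inheritance:
    #   - average_item_embedding: masked prefix-average pooling over the item
    #     sequence, used by the "dnn" model variant;
    #   - create_dnn: fuses item, long-term preference and user vectors with
    #     stacked dropout + ReLU dense layers;
    #   - create_in_shan / create_out_shan: prepend the long-term preference
    #     vector as an extra timestep before user attention and strip it off
    #     afterwards (SHAN-style).
    # Attributes such as self.partitioner, self.dropout and self.num_units come
    # from BasicModules.__init__, which is why __init__ here is a no-op.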
7 | def __init__(self, parser, hparams): 8 | pass 9 | 10 | def average_item_embedding(self, inputs): 11 | """ 12 | :param inputs: [batch_size, time, units] item embedding sequence 13 | :return: [batch_size, time, units]; position t holds the average of the first t embeddings (causal masked average pooling) 14 | """ 15 | with tf.variable_scope('item_avg_pooling', partitioner=self.partitioner): 16 | length = tf.reshape(tf.shape(inputs)[1], [-1]) 17 | # [time, time] 18 | lower_tri = tf.ones(tf.concat([length, length], axis=0)) 19 | # [time, time] 20 | lower_tri = tf.contrib.linalg.LinearOperatorTriL(lower_tri).to_dense() 21 | # [batch_size, time, time] 22 | masks = tf.tile(tf.expand_dims(lower_tri, 0), [tf.shape(inputs)[0], 1, 1]) 23 | # [batch_size, time, units] 24 | output = tf.matmul(masks, inputs) 25 | # [time] 26 | avg_num = tf.range(1, 1 + tf.shape(inputs)[1]) 27 | avg_num = tf.cast(avg_num, dtypes.float32) 28 | # [1, time, 1] 29 | avg_num = tf.reshape(avg_num, [1, tf.shape(avg_num)[0], 1]) 30 | # [batch_size, time, 1] 31 | avg_num = tf.tile(avg_num, [tf.shape(inputs)[0], 1, 1]) 32 | # [batch_size, time, units] 33 | output = tf.divide(output, avg_num) 34 | return output 35 | 36 | def create_dnn(self, item_embedding, prefer_embedding, user_embedding): 37 | """ 38 | :param item_embedding: [batch_size, time, units] sequence representation 39 | :param prefer_embedding: [batch_size, units] long-term preference representation 40 | :param user_embedding: [batch_size, units] user profile embedding 41 | :return: [batch_size, time, units] fused representation 42 | """ 43 | with tf.variable_scope('dnn', partitioner=self.partitioner): 44 | user_embedding = tf.tile(tf.expand_dims(user_embedding, 1), [1, tf.shape(item_embedding)[1], 1]) 45 | prefer_embedding = tf.tile(tf.expand_dims(prefer_embedding, 1), [1, tf.shape(item_embedding)[1], 1]) 46 | output = tf.concat([item_embedding, prefer_embedding, user_embedding], -1) 47 | output = tf.layers.dropout(output, self.dropout, training=self.is_training) 48 | output = tf.layers.dense(output, 4 * self.num_units, tf.nn.relu) 49 | output = tf.layers.dropout(output, self.dropout, training=self.is_training) 50 | output = tf.layers.dense(output, 2 * self.num_units, tf.nn.relu) 51 | output = tf.layers.dropout(output, self.dropout, training=self.is_training) 52 | output = tf.layers.dense(output, self.num_units, tf.nn.relu) 53 | return output 54 | 55 | def create_in_shan(self, item_embedding, prefer_embedding): 56 | """ 57 | :param item_embedding: [batch_size, time, units] short-term sequence representation 58 | :param prefer_embedding: long-term preference representation, reshaped here to [batch_size, 1, units] 59 | :return: [batch_size, time + 1, units] sequence with the preference vector 60 | prepended as an extra position; create_out_shan removes it again after 61 | the user attention step 62 | """ 63 | with tf.variable_scope('shan_in', partitioner=self.partitioner): 64 | prefer_embedding = tf.reshape(prefer_embedding, [-1, 1, tf.shape(prefer_embedding)[-1]]) 65 | output = tf.concat([prefer_embedding, item_embedding], 1) 66 | return output 67 | 68 | def create_out_shan(self, inputs): 69 | """ 70 | :param inputs: [batch_size, time + 1, units] attention output over the SHAN input 71 | :return: [batch_size, time, units] with the prepended preference position removed 72 | """ 73 | with tf.variable_scope('shan_out', partitioner=self.partitioner): 74 | output = tf.slice(inputs, [0, 1, 0], [tf.shape(inputs)[0], tf.shape(inputs)[1] - 1, tf.shape(inputs)[2]]) 75 | return output 76 | -------------------------------------------------------------------------------- /code/parsers/model_feature_parser.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from model_utils import model_helper 3 | 4 | 5 | class ModelFeatureParser(object): 6 | def __init__(self, hparams): 7 | self.hparams = hparams 8 | # tf record input data schema 9 | self.len_fix_int_keys = [("seq_len", "fix", "int"), ("items_len", "fix", "int"), 10 | ("shops_len", "fix", "int"), ("cates_len", "fix", "int"), ("brands_len", "fix", "int")] 11 | 12 | self.label_var_int_keys = [("labels", "var", "int")] 13 | 14 | self.label_var_str_keys = 
[("multi_labels", "var", "str")] 15 | 16 | self.item_feature_var_str_keys = [("item_ids", "var", "str"), ("shop_ids", "var", "str"), 17 | ("cate_ids", "var", "str"), ("brand_ids", "var", "str")] 18 | 19 | self.user_feature_var_str_keys = [("prefer_items", "var", "str"), ("prefer_shops", "var", "str"), 20 | ("prefer_cates", "var", "str"), ("prefer_brands", "var", "str")] 21 | 22 | self.user_feature_fix_str_keys = [("user_id", "fix", "str"), ("age", "fix", "str"), ("sex", "fix", "str"), 23 | ("user_lv_cd", "fix", "str"), ("city_level", "fix", "str"), 24 | ("province", "fix", "str"), ("city", "fix", "str"), ("country", "fix", "str")] 25 | 26 | # distinct sparse id feature 27 | self.embedding_item_features = ["item", "shop", "brand", "cate"] 28 | self.embedding_user_features_fix = ["user_id", "age", "sex", "user_lv_cd", "city_level", "province", "city", "country"] 29 | 30 | self.input_keys = self.len_fix_int_keys + self.label_var_int_keys + \ 31 | self.label_var_str_keys + self.item_feature_var_str_keys + \ 32 | self.user_feature_var_str_keys + self.user_feature_fix_str_keys + [("ds", "fix", "str")] 33 | 34 | tf_feature = {"fix_int": tf.FixedLenFeature([], dtype=tf.int64), 35 | "var_int": tf.VarLenFeature(dtype=tf.int64), 36 | "var_str": tf.VarLenFeature(dtype=tf.string), 37 | "fix_str": tf.FixedLenFeature([], dtype=tf.string)} 38 | self.feature_map = {} 39 | if self.hparams.mode == "train": 40 | for key in self.input_keys: 41 | self.feature_map.update({key[0]: tf_feature[key[1] + '_' + key[2]]}) 42 | 43 | def embedding_columns(self, feature_type, use_hashmap=False): 44 | sparse_features_emb = [] 45 | embedding_features = {"item": self.embedding_item_features, "user_fix": self.embedding_user_features_fix} 46 | # item or feature 47 | for fs_name in embedding_features[feature_type]: 48 | new_emb = model_helper.hash_bucket_embedding(fs_name+'_emb', self.hparams.bucket_size[fs_name], 49 | self.hparams.embedding_size[fs_name], 50 | use_hashmap=use_hashmap) 51 | sparse_features_emb.append(new_emb) 52 | return sparse_features_emb 53 | 54 | def output_one_example(self, features): 55 | if self.hparams.mode == "train": 56 | example = [] 57 | for key in self.input_keys: 58 | example.append(features[key[0]]) 59 | return example 60 | 61 | def output_features(self, iterator): 62 | if self.hparams.mode == "train": 63 | features = iterator.get_next() 64 | return {self.input_keys[i][0]: features[i] for i in range(len(self.input_keys))} 65 | -------------------------------------------------------------------------------- /code/train/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import sys 4 | import tensorflow as tf 5 | import traceback 6 | import numpy as np 7 | 8 | currentPath = os.path.split(os.path.realpath(__file__))[0] 9 | sys.path.append(currentPath + os.sep + '../') 10 | sys.path.append(currentPath + os.sep + '../..') 11 | 12 | from parsers.model_feature_parser import ModelFeatureParser 13 | from model_utils.hyperparams import create_hparams, create_flags, create_task_config 14 | from tensorflow.python.platform import tf_logging as logging 15 | from train.utils import parent_directory 16 | from models.deep_match import DeepMatch 17 | 18 | flags = tf.app.flags 19 | FLAGS = create_flags(flags).FLAGS 20 | 21 | 22 | def main(unused_argv): 23 | tf.logging.set_verbosity(tf.logging.INFO) 24 | 25 | config = tf.ConfigProto() 26 | config.gpu_options.allow_growth = True 27 | 28 | # Parse config parameters 29 | current_dir = 
os.path.dirname(os.path.abspath(__file__)) 30 | conf_file_path = os.path.join(os.path.join(parent_directory(current_dir), 'config/task_config.json')) 31 | logging.info("will use task conf file %s" % conf_file_path) 32 | task_config = create_task_config(FLAGS, conf_file_path) 33 | hparams = create_hparams(task_config=task_config) 34 | print hparams 35 | 36 | parser = ModelFeatureParser(hparams) 37 | model = DeepMatch(parser, hparams) 38 | 39 | # start the training 40 | try: 41 | run_validating(hparams=hparams, model=model) 42 | except Exception, e: 43 | logging.error("catch a exception: %s" % e.message) 44 | logging.error("exception is: %s" % traceback.format_exc()) 45 | raise Exception("terminate process!") 46 | 47 | 48 | def run_validating(hparams, model): 49 | acc_keys = ["user_id", "ds", "user_embedding_output"] 50 | 51 | # user defined function 52 | # you should write your own code here for reading and writing data 53 | train_file = get_your_train_files() 54 | test_file = get_your_test_files() 55 | writer = open_your_test_result_file() 56 | 57 | if not train_file or len(train_file) == 0 or not test_file or len(test_file) == 0: 58 | logging.error("End training directly since no train files or test files!") 59 | return 60 | 61 | logging.info("current_train_file: {}".format(train_file)) 62 | logging.info("current_test_file: {}".format(test_file)) 63 | 64 | checkpointDir = FLAGS.checkpointDir 65 | if not tf.gfile.Exists(checkpointDir): 66 | tf.gfile.MakeDirs(checkpointDir) 67 | with tf.gfile.FastGFile(os.path.join(checkpointDir, "hyperparams"), 'w') as f: 68 | f.write(str(hparams)) 69 | f.flush() 70 | f.close() 71 | 72 | train_data = model.input_fn_dataset(train_file, data_type="train") 73 | test_data = model.input_fn_dataset(test_file, data_type="test") 74 | 75 | iterator = tf.data.Iterator.from_structure(train_data.output_types, train_data.output_shapes) 76 | train_init_op = iterator.make_initializer(train_data) 77 | test_init_op = iterator.make_initializer(test_data) 78 | 79 | features = model.parser.output_features(iterator) 80 | 81 | train_op, global_step, my_dict = model.model_fn_train(features) 82 | 83 | steps_per_epoch = hparams.train_len // hparams.batch_size 84 | test_interval = hparams.test_interval 85 | epochs = test_interval 86 | 87 | config = tf.ConfigProto() 88 | 89 | chief_only_hooks = [tf.train.StepCounterHook()] 90 | drop_zero_dict = { 91 | my_dict['drop_out']: 0.0 92 | } 93 | 94 | drop_dict = { 95 | my_dict['drop_out']: hparams.dropout 96 | } 97 | summary_dir = os.path.join(FLAGS.checkpointDir, 'train') 98 | 99 | with tf.train.MonitoredTrainingSession(chief_only_hooks=chief_only_hooks, config=config) as sess: 100 | train_writer = tf.summary.MetricsWriter(summary_dir, sess.graph) 101 | step_ = 0 102 | sess.run(train_init_op, feed_dict=drop_zero_dict) 103 | while step_ < steps_per_epoch * hparams.num_epochs + 5: 104 | _, loss_, step_, lr_ = sess.run([train_op, my_dict["loss"], global_step, my_dict['learning_rate']], 105 | feed_dict=drop_dict) 106 | train_writer.add_scalar("loss", loss_, step_) 107 | train_writer.add_scalar("learning_rate", lr_, step_) 108 | if random.randint(1, 200) == 1: 109 | logging.info("[Epoch {}] {}_sampled_mean_loss: {}".format(epochs, step_, loss_)) 110 | if step_ >= steps_per_epoch * epochs: 111 | logging.info("[Epoch {}] Testing...".format(epochs)) 112 | sess.run(test_init_op, feed_dict=drop_zero_dict) 113 | weight = sess.run(my_dict['weight'], feed_dict=drop_zero_dict) 114 | logging.info(weight.shape) 115 | test_batch_counter = 0 116 | try: 117 
| while True: 118 | test_batch_counter += 1 119 | user_id, ds, user_vector = sess.run([my_dict[j] for j in acc_keys], feed_dict=drop_zero_dict) 120 | user_id = user_id.tolist() 121 | ds = ds.tolist() 122 | arr = np.matmul(user_vector, np.transpose(weight)) 123 | indices = np.argpartition(arr, -hparams.topK, axis=1)[:, -hparams.topK:] 124 | for num, p in enumerate(zip(user_id, indices, ds)): 125 | writer.write([p[0], ','.join(map(str, p[1])), epochs, p[2]]) 126 | except tf.errors.OutOfRangeError: 127 | logging.info("[Epoch {}] test batch counter {}..".format(epochs, test_batch_counter)) 128 | pass 129 | sess.run(train_init_op, feed_dict=drop_zero_dict) 130 | logging.info("[Epoch {}] Back to train...".format(epochs)) 131 | epochs += test_interval 132 | 133 | logging.info("*" * 20 + "End training.") 134 | 135 | 136 | if __name__ == '__main__': 137 | tf.app.run() 138 | -------------------------------------------------------------------------------- /code/train/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | import os 5 | 6 | 7 | def parent_directory(path, times=1): 8 | backup_path = path 9 | for i in range(times): 10 | backup_path = os.path.dirname(backup_path) 11 | return backup_path 12 | -------------------------------------------------------------------------------- /data/sample_data/sample_action.csv: -------------------------------------------------------------------------------- 1 | user_id,item_id,action_time,session_id 2 | 937922,357022,2018/2/4 8:28,8107857 3 | 937922,73,2018/2/4 8:27,8107857 4 | 937922,29583,2018/2/4 8:26,8107857 5 | 937922,108763,2018/2/4 8:26,8107857 6 | 1369473,331139,2018/2/3 21:55,3712240 7 | 1330642,69016,2018/2/1 12:47,1844129 8 | 1330642,211690,2018/2/1 12:48,1844129 9 | 1330642,322692,2018/2/1 12:48,1844129 10 | 1330642,19643,2018/2/1 12:47,1844129 11 | 411741,320293,2018/3/10 21:20,4680176 12 | 411741,291893,2018/3/10 21:21,4680176 13 | 1552476,319649,2018/3/29 6:27,8194740 14 | 1552476,110244,2018/3/29 6:22,8194740 15 | 1552476,343178,2018/3/29 6:27,8194740 16 | 742645,13795,2018/3/19 23:26,10771503 17 | 742645,50764,2018/3/19 23:27,10771503 18 | 1349460,210131,2018/2/27 14:32,4392084 19 | 1455298,208441,2018/4/11 15:21,6190659 20 | 1455298,334318,2018/4/11 15:14,6190659 21 | 1455298,237755,2018/4/11 15:14,6190659 22 | 1455298,6422,2018/4/11 15:22,6190659 23 | 1455298,268566,2018/4/11 15:14,6190659 24 | 1455298,115915,2018/4/11 15:13,6190659 25 | 1455298,208254,2018/4/11 15:22,6190659 26 | 1455298,177209,2018/4/14 14:09,6628254 27 | 1455298,71793,2018/4/14 14:10,6628254 28 | 1455298,141950,2018/4/12 15:37,10207258 29 | 1455298,97236,2018/4/12 15:46,8129309 30 | 1068082,277346,2018/3/3 23:14,6364387 31 | 67623,314054,2018/3/12 17:29,3455782 32 | 914860,177884,2018/3/4 14:16,4186146 33 | 914860,211134,2018/3/4 14:18,4186146 34 | 914860,298918,2018/3/4 14:20,4186146 35 | 914860,326156,2018/3/3 15:42,5422447 36 | 914860,254818,2018/3/3 15:41,5422447 37 | 1068082,128673,2018/4/13 8:40,1449113 38 | 931145,243514,2018/3/26 10:36,2341021 39 | 931145,321897,2018/3/26 10:41,2341021 40 | 931145,19120,2018/3/26 10:37,2341021 41 | 931145,229189,2018/3/26 10:30,2341021 42 | 931145,168630,2018/3/26 10:38,2341021 43 | 931145,287459,2018/3/26 10:32,2341021 44 | 931145,53531,2018/3/26 10:31,2341021 45 | 931145,199243,2018/3/26 10:35,2341021 46 | 931145,255210,2018/3/26 10:30,2341021 47 | 
1030562,161532,2018/2/18 22:20,290794 48 | 48912,241353,2018/3/13 18:05,2408273 49 | 48912,198785,2018/3/13 18:04,2408273 50 | 48912,283695,2018/3/13 18:04,2408273 51 | 48912,330122,2018/3/13 18:01,2408273 52 | 48912,223701,2018/3/13 18:04,2408273 53 | 843174,201651,2018/4/15 7:25,3077407 54 | 188522,245398,2018/2/28 10:13,4115259 55 | 102717,272462,2018/3/18 18:54,8017199 56 | 102717,92989,2018/3/18 18:55,8017199 57 | 102717,167081,2018/3/18 18:59,8017199 58 | 1177411,252529,2018/3/24 5:42,6564504 59 | 669643,194478,2018/4/8 19:24,7861652 60 | 1105549,318390,2018/3/21 14:52,10266611 61 | 1105549,253645,2018/3/21 14:53,10266611 62 | 1105549,10592,2018/3/21 14:55,10266611 63 | 854108,142082,2018/2/27 22:48,8246488 64 | 861709,262378,2018/2/11 12:24,10053112 65 | 125241,158355,2018/2/26 15:55,4265977 66 | 125241,156646,2018/2/26 15:56,4265977 67 | 102717,68275,2018/3/1 16:01,3224127 68 | 914860,35907,2018/2/25 14:54,10115485 69 | 914860,155219,2018/2/25 16:00,6518843 70 | 914860,34820,2018/2/25 15:54,6518843 71 | 914860,275374,2018/2/25 15:56,6518843 72 | 1359480,7737,2018/3/25 13:31,6187812 73 | 780789,194477,2018/3/6 6:02,2431464 74 | 780789,910,2018/3/6 6:26,2431464 75 | 780789,358167,2018/3/6 5:58,2431464 76 | 780789,19643,2018/3/6 5:59,2431464 77 | 1164554,28660,2018/2/1 20:51,11253914 78 | 742645,33865,2018/3/20 21:06,7902443 79 | 742645,248059,2018/3/21 8:09,8880819 80 | 914860,245991,2018/3/1 14:56,4800040 81 | 914860,48661,2018/3/1 14:57,4800040 82 | 1359480,129456,2018/3/18 8:06,9296468 83 | 1359480,133804,2018/4/7 10:01,3976607 84 | 1359480,86536,2018/4/7 10:02,3976607 85 | 931145,92067,2018/4/5 8:52,5711637 86 | 931145,34516,2018/4/5 8:47,5711637 87 | 931145,2890,2018/4/5 8:43,5711637 88 | 931145,313787,2018/4/5 8:51,5711637 89 | 931145,281189,2018/4/5 8:52,5711637 90 | 843174,236680,2018/3/23 1:53,9598328 91 | 1177411,86758,2018/3/3 16:06,8652446 92 | 1177411,221085,2018/2/18 7:58,4312641 93 | 1177411,191329,2018/2/18 7:55,4312641 94 | 1177411,280994,2018/2/18 8:00,4312641 95 | 1177411,11435,2018/2/18 8:04,4312641 96 | 1177411,233176,2018/2/18 8:06,4312641 97 | 1177411,157263,2018/2/18 7:57,4312641 98 | 1105549,160369,2018/3/17 14:10,4361756 99 | 188522,164058,2018/2/24 20:06,8581424 100 | 188522,245398,2018/2/24 20:08,8581424 -------------------------------------------------------------------------------- /data/sample_data/sample_item.csv: -------------------------------------------------------------------------------- 1 | item_id,brand,shop_id,cate, 226519,6302,2399,79, 63114,9167,4216,79, 372345,2748,7125,79, 366931,2698,10252,79, 174979,8368,871,79, 295436,6302,2399,79, 282251,6302,2399,79, 146764,6302,2399,79, 130851,6302,2399,79, 150184,2748,9541,79, 114345,6574,4071,79, 98950,8368,871,79, 188988,6574,4071,79, 106591,8103,7575,79, 251912,6111,1570,79, 140059,4217,7011,79, 110630,4217,7011,79, 196053,5192,652,79, 227772,9167,4216,79, 228234,5192,652,79, 271298,5192,652,79, 133835,3274,6700,79, 208319,5192,652,79, 313911,9167,4216,79, 151783,9268,1259,79, 5374,10435,8501,79, 134395,2698,10252,79, 228138,6302,2399,79, 73526,10614,9389,79, 86302,6302,2399,79, 217122,6302,2399,79, 14750,10435,8330,79, 80215,6302,7129,79, 253840,6302,7129,79, 354373,10435,6075,79, 180753,6302,1345,79, 284293,6302,1345,79, 188267,2300,2375,79, 133191,6302,1345,79, 195692,6302,1345,79, 207669,6302,1345,79, 94010,10435,8330,79, 262860,6302,1345,79, 252344,9167,4216,79, 173463,6302,6004,79, 212106,6302,6515,79, 220171,6302,7579,79, 169499,6302,7579,79, 215755,6302,7579,79, 276694,6302,8382,79, 
222024,6302,8382,79, 294204,6302,8382,79, 109604,6302,8382,79, 166892,6302,8382,79, 194915,6302,8382,79, 176506,6302,8382,79, 334983,6302,8382,79, 220101,6302,8382,79, 142410,10435,8330,79, 292099,6302,4348,79, 149415,6302,4348,79, 63999,6302,4348,79, 111022,6302,4348,79, 6144,6302,8382,79, 30167,6302,9311,79, 51969,6302,9311,79, 226975,6302,1616,79, 296097,6302,6586,79, 251503,6302,6586,79, 352183,9167,4216,79, 51930,5762,8382,79, 14937,5762,8382,79, 332781,10435,7950,79, 154392,6302,7650,79, 33575,10179,216,79, 100728,9167,4385,79, 344292,9167,4385,79, 367542,10435,7950,79, 325100,9167,4385,79, 352254,9167,4385,79, 58610,9167,4385,79, 179737,10435,6126,79, 315108,10435,6075,79, 359267,10435,6126,79, 9362,10435,6126,79, 101011,10435,1884,79, 135892,10435,6075,79, 343553,5762,6826,79, 39192,10435,7950,79, 362725,10435,7950,79, 117009,10435,7950,79, 254757,10435,7950,79, 229056,10435,7950,79, 287141,6302,5568,79, 166195,6302,5568,79, 345583,6302,5568,79, 305000,6302,9234,79, 19069,10435,5048,79, 331913,6302,9234,79, 2 | -------------------------------------------------------------------------------- /data/sample_data/sample_user.csv: -------------------------------------------------------------------------------- 1 | user_id,age,sex,user_lv_cd,city_level,province,city,county 2 | 1117472,5,1,6,5,30,149,2407 3 | 203661,5,1,6,5,30,149,2407 4 | 882636,2,0,7,5,30,149,2407 5 | 1495035,2,0,1,5,30,149,2407 6 | 929985,2,1,1,5,30,149,2407 7 | 684456,5,1,1,5,30,149,2407 8 | 740847,5,0,7,5,30,149,2407 9 | 538600,4,0,6,5,30,149,2407 10 | 402514,5,1,5,5,30,149,2407 11 | 1429509,4,0,5,5,30,149,2407 12 | 580012,5,1,1,5,30,149,2407 13 | 1054779,5,0,7,5,30,149,2407 14 | 146414,5,0,6,5,30,149,2407 15 | 1570039,4,0,1,5,30,149,2407 16 | 789137,6,0,6,5,30,149,2407 17 | 80381,5,0,1,5,30,149,2407 18 | 1146021,2,0,5,5,30,149,2407 19 | 730149,5,1,7,5,30,149,2407 20 | 1229930,5,1,5,5,30,149,2407 21 | 903792,5,1,5,5,30,149,2407 22 | 656442,5,1,1,5,30,149,2407 23 | 512385,6,0,1,5,30,149,2407 24 | 334786,5,0,6,5,30,149,2407 25 | 44483,5,1,1,5,30,149,2407 26 | 523737,4,1,5,5,30,149,2407 27 | 702347,5,1,1,5,30,149,2407 28 | 415747,4,0,1,5,30,149,2407 29 | 1569153,6,1,6,5,30,149,2407 30 | 1382694,4,1,1,5,30,149,2407 31 | 464712,6,0,5,5,30,149,2407 32 | 913142,4,1,6,5,30,149,2407 33 | 729097,4,0,7,5,30,149,2407 34 | 696529,5,1,1,5,30,149,2407 35 | 461675,1,0,1,5,30,149,2407 36 | 808711,4,1,6,5,30,149,2407 37 | 532217,6,1,6,5,30,149,2407 38 | 85693,2,0,6,5,30,149,2407 39 | 100963,5,0,1,5,30,149,2407 40 | 751194,5,0,6,5,30,149,2407 41 | 829948,2,-1,6,5,30,149,2407 42 | 506921,5,1,6,5,30,149,2407 43 | 537878,4,1,6,5,30,149,2407 44 | 352351,1,0,6,5,30,149,2407 45 | 208907,6,0,5,5,30,149,2407 46 | 185796,2,1,6,5,30,149,2407 47 | 564309,6,0,5,5,30,149,2407 48 | 1298329,6,1,1,5,30,149,2407 49 | 1365052,6,0,1,5,30,149,2407 50 | 951094,4,0,1,5,30,149,2407 51 | 1075853,6,1,1,5,30,149,2407 52 | 478383,1,1,6,5,30,149,2407 53 | 43966,6,1,6,5,30,149,2407 54 | 179008,4,0,1,5,30,149,2407 55 | 31279,4,1,6,5,30,149,2407 56 | 526369,1,0,4,5,30,149,2407 57 | 234619,4,0,6,5,30,149,2407 58 | 1269373,5,1,6,5,30,149,2407 59 | 115392,5,1,5,5,30,149,2407 60 | 1192799,5,0,6,5,30,149,2407 61 | 1452587,5,0,5,5,30,149,2407 62 | 829598,5,1,1,5,30,149,2407 63 | 1469900,5,0,6,5,30,149,2407 64 | 327722,5,1,6,5,30,149,2407 65 | 371501,6,1,5,5,30,149,2407 66 | 541536,5,0,7,5,30,149,2407 67 | 1161472,1,0,6,5,30,149,2407 68 | 1320654,6,0,5,5,30,149,2407 69 | 968528,4,1,6,5,30,149,2407 70 | 965120,4,1,6,5,30,149,2407 71 | 992568,5,0,1,5,30,149,2407 72 | 
14591,1,1,6,5,30,149,2407 73 | 893716,6,0,1,5,30,149,2407 74 | 145613,5,1,1,5,30,149,2407 75 | 671215,6,1,1,5,30,149,2407 76 | 1096950,5,0,1,5,30,149,2407 77 | 842543,6,1,1,5,30,149,2407 78 | 2655,5,0,5,5,30,149,2407 79 | 790750,6,0,5,5,30,149,2407 80 | 266009,5,0,1,5,30,149,2407 81 | 800645,5,0,1,5,30,149,2407 82 | 1543388,4,1,1,5,30,149,2407 83 | 91222,6,0,5,5,30,149,2407 84 | 502292,6,1,7,5,30,149,2407 85 | 222058,6,0,6,5,30,149,2407 86 | 430407,4,0,6,5,30,149,2407 87 | 1490779,5,1,5,5,30,149,2407 88 | 718115,4,1,7,5,30,149,2407 89 | 46428,4,1,6,5,30,149,2407 90 | 1086362,4,1,1,5,30,149,2407 91 | 1013903,6,1,5,5,30,149,2407 92 | 1215558,5,1,5,5,30,149,2407 93 | 594560,5,1,1,5,30,149,2407 94 | 100004,5,1,5,5,30,149,2407 95 | 395926,5,1,5,5,30,149,2407 96 | 311773,4,1,1,5,30,149,2407 97 | 633035,5,1,1,5,30,149,2407 98 | 1479930,5,1,1,5,30,149,2407 99 | 977783,5,1,1,5,30,149,2407 100 | 27711,6,0,1,5,30,149,2407 --------------------------------------------------------------------------------
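For reference, `ModelFeatureParser` (code/parsers/model_feature_parser.py) parses tfrecord examples with fixed-length int64 features (`seq_len`, `items_len`, `shops_len`, `cates_len`, `brands_len`), variable-length label features (`labels`, `multi_labels`), variable-length per-position item-side lists (`item_ids`, `shop_ids`, `cate_ids`, `brand_ids`), variable-length long-term preference lists (`prefer_*`), fixed-length string user-profile fields, and a `ds` partition field. (Note that the parser uses the key `country`, while data/sample_data/sample_user.csv names the corresponding column `county`.) The sketch below only illustrates that schema with made-up field values; the released train/test tfrecords were produced by the authors' ODPS pipeline, so the actual preprocessing may differ.

```python
# Illustrative only: build one tf.train.Example with the keys/types that
# ModelFeatureParser.feature_map expects. All field values here are made up.
import tensorflow as tf


def _int64(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))


def _bytes(values):
    return tf.train.Feature(bytes_list=tf.train.BytesList(
        value=[v.encode("utf-8") for v in values]))


example = tf.train.Example(features=tf.train.Features(feature={
    # fixed-length int features
    "seq_len": _int64([4]), "items_len": _int64([4]), "shops_len": _int64([4]),
    "cates_len": _int64([4]), "brands_len": _int64([4]),
    # labels (variable length)
    "labels": _int64([357022]),
    "multi_labels": _bytes(["357022"]),
    # short-term session features, one id per position
    "item_ids": _bytes(["108763", "29583", "73", "357022"]),
    "shop_ids": _bytes(["2399", "4216", "7125", "871"]),
    "cate_ids": _bytes(["79", "79", "79", "79"]),
    "brand_ids": _bytes(["6302", "9167", "2748", "8368"]),
    # long-term preference features
    "prefer_items": _bytes(["331139", "69016"]),
    "prefer_shops": _bytes(["10252", "1570"]),
    "prefer_cates": _bytes(["79"]),
    "prefer_brands": _bytes(["2698", "6111"]),
    # fixed-length user profile features (keys follow the parser, not the raw csv header)
    "user_id": _bytes(["937922"]), "age": _bytes(["5"]), "sex": _bytes(["1"]),
    "user_lv_cd": _bytes(["6"]), "city_level": _bytes(["5"]),
    "province": _bytes(["30"]), "city": _bytes(["149"]), "country": _bytes(["2407"]),
    # partition/date field
    "ds": _bytes(["20180204"]),
}))

writer = tf.python_io.TFRecordWriter("sample_train.tfrecord")
writer.write(example.SerializeToString())
writer.close()
```

Parsing such a record with `tf.parse_single_example(serialized, parser.feature_map)` should yield the features consumed by `DeepMatch.model_fn_train` when `mode` is `train`.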
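train/run.py intentionally leaves data I/O to the user: `get_your_train_files`, `get_your_test_files` and `open_your_test_result_file` are placeholders ("you should write your own code here"), because the original experiments read and wrote tables on ODPS. A minimal local stand-in could look like the sketch below; it assumes the released tfrecord files sit in plain directories and that the result writer only needs to accept the list `[user_id, top-K indices, epoch, ds]` that `run_validating` passes to `writer.write`. All paths and names here are assumptions, not part of the released code.

```python
# Hypothetical local replacements for the user-defined hooks in train/run.py.
import glob
import os


def get_your_train_files(data_dir="./jd_tfrecord/train"):
    # All train-split tfrecord files under a local directory (path is an assumption).
    return sorted(glob.glob(os.path.join(data_dir, "*.tfrecord")))


def get_your_test_files(data_dir="./jd_tfrecord/test"):
    # All test-split tfrecord files under a local directory (path is an assumption).
    return sorted(glob.glob(os.path.join(data_dir, "*.tfrecord")))


class _TsvResultWriter(object):
    """Accepts the list run_validating passes in: [user_id, top-K indices, epoch, ds]."""

    def __init__(self, path):
        self._f = open(path, "w")

    def write(self, fields):
        self._f.write("\t".join(str(x) for x in fields) + "\n")

    def close(self):
        self._f.close()


def open_your_test_result_file(path="./test_user_topk.tsv"):
    return _TsvResultWriter(path)
```

These helpers would need to be importable from train/run.py (for example, by placing them in train/utils.py) before the demo can run outside the internal platform.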