├── LICENSE ├── README.md ├── data_reader.py ├── hyper_param.py ├── id_allocator.py ├── item_recommender.py ├── layer_util.py └── sample.data /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## R3S 3 | 4 | Users of recommendation systems usually focus on one topic at a time. When finishing reading an item, users may want to access more relevant items related to the last read one as extended reading. 5 | 6 | However, conventional recommendation systems are hard to provide the continuous extended reading function of these relevant items, since the main recommendation results should be diversified. 7 | 8 | In this paper, we propose a new task named recommendation suggestion, which aims to (1) predict whether users want extended reading, and (2) provide appropriate relevant items as suggestions. 9 | 10 | These recommended relevant items are arranged in a relevant box and instantly inserted below the clicked item in the main feed. 11 | 12 | The challenge of recommendation suggestion on relevant items is that it should further consider semantic relevance and information gain besides CTR-related factors. 
Moreover, the real-time relevant box insertion may also harm the overall performance when users do not want extended reading. 13 | 14 | To address these issues, we propose a novel Real-time relevant recommendation suggestion (R3S) framework, which consists of an Item recommender and a Box trigger. We extract features from multiple aspects including feature interaction, semantic similarity and information gain as different experts, and propose a new Multi-critic multi-gate mixture-of-experts (M3oE) strategy to jointly consider different experts with multi-head critics. 15 | 16 | In experiments, we conduct both offline and online evaluations on a real-world recommendation system with detailed ablation tests. The significant improvements in item/box related metrics verify the effectiveness of R3S. Moreover, we have deployed R3S on WeChat Top Stories, which affects millions of users. 17 | 18 | ### Requirements: 19 | - Python 3.8 20 | - TensorFlow 2.4.1 21 | 22 | ## Note 23 | 24 | In the actual online system, R3S is a complex re-ranking framework implemented in C++. 25 | All models are trained based on a deeply customized version of distributed TensorFlow supporting large-scale sparse features. 26 | 27 | Without massive data and machine resources, training R3S is not realistic. 28 | 29 | Therefore, the open-source code here only implements a simplified version for interested researchers. If you find any errors, please contact me. Thanks! 
# README.md, closing section (kept from the listing):
#
#   ## About
#
#   "Real-time Relevant Recommendation Suggestion" ([WSDM 2021](http://nlp.csai.tsinghua.edu.cn/~xrb/publications/WSDM-21_R3S.pdf))
#
# The listing packs data_reader.py, hyper_param.py and id_allocator.py onto the
# same physical lines, so the three modules are laid out together below,
# dependency-first.  In the repository data_reader.py obtains its two
# collaborators with
#     from id_allocator import IdAllocator
#     from hyper_param import param_dict as pd

import sys


# ----------------------------- /id_allocator.py -----------------------------

class IdAllocator(object):
    '''Maps feature strings to stable integer ids.

    Ids handed out by `allocate` start at 2; ids 0 and 1 are never assigned
    (DataReader.parse_feature uses 0 as the "no feature" id).
    '''

    def __init__(self):
        self._id = 1    # last id handed out; the first allocation yields 2
        self._tbl = {}  # feature string -> id

    def allocate(self, x):
        '''Return the id of `x`, assigning the next free id on first sight.

        x: feature string.

        Raises
          TypeError if `x` is not a str.
        '''
        if not isinstance(x, str):
            raise TypeError('only str is supported in IdAllocator.')
        if x not in self._tbl:
            self._id += 1
            self._tbl[x] = self._id
        return self._tbl[x]

    def unique_id_num(self):
        '''Return the size of the id space, i.e. an exclusive upper bound on ids.

        Valid ids lie in [0, self._id].  The value is used as the row count of
        the embedding table, so it has to be one larger than the largest id;
        returning the largest id itself left the newest feature out of range.
        '''
        return self._id + 1


# ------------------------------ /hyper_param.py ------------------------------
# Model Hyper Parameter Dict
#
# April 2021
# modric10zhang@gmail.com

param_dict = {
    'feat_dim': 6,  # feature embedding dimension
    'user_field_num': 5,  # number of user feature fields
    'doc_field_num': 5,  # number of doc feature fields
    'con_field_num': 5,  # number of context feature fields
    'expert_dim': 90,  # expert subnetwork dimension
    'expert_num': 3,  # number of experts
    'critic_num': 4,  # number of critic networks(gates)
    'num_epochs': 100,  # training epoch
    'batch_size': 128,  # batch size
    'lr': 0.0002,  # learning rate of network
    'dropout': 0.3,  # dropout ratio
    'grad_clip': 5.0,  # grad clip
    'head_num': 3,  # head number for all self-attention units
}


# ------------------------------ /data_reader.py ------------------------------
# Data Reader
#
# April 2021
# modric10zhang@gmail.com

pd = param_dict  # same alias data_reader.py uses for the hyper-parameters


class DataReader(object):
    '''Loads samples from a text file and serves them in fixed-size batches.'''

    def __init__(self, batch_num):
        '''batch_num: maximum number of samples returned by one `next()` call.'''
        self._id_tool = IdAllocator()
        self._data = []
        self._batch = batch_num

    def unique_feature_num(self):
        '''Return the size of the feature-id space (see IdAllocator.unique_id_num).'''
        return self._id_tool.unique_id_num()

    def parse_feature(self, raw_feature):
        '''Turn a comma-separated feature string into a set of feature ids.

        Empty tokens are ignored; a field with no tokens at all yields {0}.
        Previously '' was allocated an id of its own, which made the {0}
        fallback unreachable.
        '''
        feature = {self._id_tool.allocate(tok) for tok in raw_feature.split(',') if tok}
        if not feature:
            feature.add(0)
        return feature

    def load(self, sample_path):
        '''Append every sample found in `sample_path` to the pending data.

        Each non-blank line holds `key:value` tokens separated by tabs and/or
        spaces.  Required keys are uf0.., rf0.., cf0.., sf0.. (counts taken
        from the hyper-parameters) and `dwell_time`.  A stored sample is
        [user_sets, doc_sets, context_sets, seed_sets, dwell_time], where
        dwell_time is kept as the raw string from the file.

        Raises
          ValueError if a required feature field is missing.
          KeyError if `dwell_time` is missing.
        '''
        # (key prefix, number of fields); seed items reuse the doc field count
        groups = (('uf', pd['user_field_num']),
                  ('rf', pd['doc_field_num']),
                  ('cf', pd['con_field_num']),
                  ('sf', pd['doc_field_num']))
        with open(sample_path, 'r') as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    # a blank line used to fail with "field uf0 not exist."
                    continue
                skv = {}
                for token in line.replace('\t', ' ').split(' '):
                    key, sep, value = token.partition(':')
                    if not sep:
                        # no ':' in the token; str.find() returned -1 here and
                        # produced a truncated key
                        continue
                    skv[key] = value
                sample = []
                for prefix, count in groups:
                    field_sets = []
                    for i in range(count):
                        fk = '%s%s' % (prefix, i)
                        if fk not in skv:
                            raise ValueError('field %s not exist.' % fk)
                        field_sets.append(self.parse_feature(skv[fk]))
                    sample.append(field_sets)
                sample.append(skv['dwell_time'])
                self._data.append(sample)

    def next(self):
        '''Pop and return up to `batch_num` pending samples, or None when empty.'''
        if not self._data:
            return None
        nb = self._data[:self._batch]
        self._data = self._data[self._batch:]
        return nb

# Next file in the listing: /item_recommender.py
# The listing interleaves item_recommender.py and layer_util.py on shared
# physical lines, so both are laid out together here with the layer helpers
# first.  In the repository item_recommender.py pulls the helpers in with
# `from layer_util import *`; that line is not needed in this combined layout.

import os
import math
import numpy as np
import tensorflow.compat.v1 as tf
from data_reader import DataReader
from hyper_param import param_dict as pd


# ------------------------------- /layer_util.py -------------------------------
# The implementation of multi-head attention mechanism
# refers to Kyubyong/transformer(https://github.com/Kyubyong/transformer)
#
# April 2021
# modric10zhang@gmail.com


def layer_norm(inputs, scope='ln'):
    '''Applies layer normalization. See https://arxiv.org/abs/1607.06450.
    inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`.
    scope: Optional scope for `variable_scope`.

    Moments are taken over the last axis; a fixed epsilon of 1e-8 is added to
    the variance before the square root to avoid division by zero.

    Returns:
      A tensor with the same shape and data dtype as `inputs`.
    '''
    epsilon = 1e-8
    with tf.variable_scope(scope):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]
        # [-1] means last dimension
        mean, variance = tf.nn.moments(inputs, [-1], keepdims=True)
        beta = tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
        gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
        normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
        outputs = gamma * normalized + beta
    return outputs


def get_embeddings(dict_size, num_units, scope, zero_pad=True, partitioner=None):
    '''Constructs token embedding matrix.
    dict_size: scalar. V.
    num_units: embedding dimensionalty. E.
    scope: scope for `variable_scope`; the variable itself is named 'w'.
    zero_pad: Boolean. If True, the first row (id = 0) is replaced by constant zeros,
      which makes query/key masking easy.
    partitioner: forwarded to `tf.get_variable`.

    Returns
      weight variable: (V, E)
    '''
    with tf.variable_scope(scope):
        w_init = tf.truncated_normal_initializer(mean=0, stddev=0.1)
        embeddings = tf.get_variable('w',
                                     dtype=tf.float32,
                                     shape=(dict_size, num_units),
                                     initializer=w_init,
                                     partitioner=partitioner)
        if zero_pad:
            embeddings = tf.concat((tf.zeros(shape=[1, num_units]),
                                    embeddings[1:, :]), 0)
    return embeddings


def mask(inputs, queries=None, keys=None, type=None):
    """Masks paddings on keys or queries to inputs
    inputs: 3d tensor. (N, T_q, T_k)
    queries: 3d tensor. (N, T_q, d)
    keys: 3d tensor. (N, T_k, d)
    type: one of "k"/"key"/"keys", "q"/"query"/"queries", "f"/"future"/"right".

    Raises
      ValueError for any other `type` (the function used to print a hint and
      then fail on an unassigned local).

    e.g.,
    >> queries = tf.constant([[[1.],
                        [2.],
                        [0.]]], tf.float32) # (1, 3, 1)
    >> keys = tf.constant([[[4.],
                     [0.]]], tf.float32)  # (1, 2, 1)
    >> inputs = tf.constant([[[4., 0.],
                               [8., 0.],
                               [0., 0.]]], tf.float32)
    >> mask(inputs, queries, keys, "key")
    array([[[ 4.0000000e+00, -4.2949673e+09],
        [ 8.0000000e+00, -4.2949673e+09],
        [ 0.0000000e+00, -4.2949673e+09]]], dtype=float32)
    >> inputs = tf.constant([[[1., 0.],
                             [1., 0.],
                              [1., 0.]]], tf.float32)
    >> mask(inputs, queries, keys, "query")
    array([[[1., 0.],
        [1., 0.],
        [0., 0.]]], dtype=float32)
    """
    padding_num = -2 ** 32 + 1
    if type in ("k", "key", "keys"):
        # Generate masks
        masks = tf.sign(tf.reduce_sum(tf.abs(keys), axis=-1))  # (N, T_k)
        masks = tf.expand_dims(masks, 1)  # (N, 1, T_k)
        masks = tf.tile(masks, [1, tf.shape(queries)[1], 1])  # (N, T_q, T_k)

        # Apply masks to inputs
        paddings = tf.ones_like(inputs) * padding_num
        outputs = tf.where(tf.equal(masks, 0), paddings, inputs)  # (N, T_q, T_k)
    elif type in ("q", "query", "queries"):
        # Generate masks
        masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1))  # (N, T_q)
        masks = tf.expand_dims(masks, -1)  # (N, T_q, 1)
        masks = tf.tile(masks, [1, 1, tf.shape(keys)[1]])  # (N, T_q, T_k)

        # Apply masks to inputs
        outputs = inputs * masks
    elif type in ("f", "future", "right"):
        diag_vals = tf.ones_like(inputs[0, :, :])  # (T_q, T_k)
        tril = tf.linalg.band_part(diag_vals, -1, 0)  # lower triangle, (T_q, T_k)
        masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(inputs)[0], 1, 1])  # (N, T_q, T_k)

        paddings = tf.ones_like(masks) * padding_num
        outputs = tf.where(tf.equal(masks, 0), paddings, inputs)
    else:
        raise ValueError("Check if you entered type correctly! Got %r." % (type,))

    return outputs


def scaled_dot_product_attention(Q, K, V,
                                 causality=False, dropout_rate=0.,
                                 training=True,
                                 scope="sdpa"):
    '''See 3.2.1.
    Q: Packed queries. 3d tensor. [N, T_q, d_k].
    K: Packed keys. 3d tensor. [N, T_k, d_k].
    V: Packed values. 3d tensor. [N, T_k, d_v].
    causality: If True, applies masking for future blinding
    dropout_rate: A floating point number of [0, 1].
    training: boolean for controlling dropout
    scope: Optional scope for `variable_scope`.

    Returns
      A 3d tensor with shape of (N, T_q, d_v)
    '''
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        d_k = Q.get_shape().as_list()[-1]
        # dot product
        outputs = tf.matmul(Q, tf.transpose(K, [0, 2, 1]))  # (N, T_q, T_k)
        # scale
        outputs /= d_k ** 0.5
        # key masking
        outputs = mask(outputs, Q, K, type="key")
        # causality or future blinding masking
        if causality:
            outputs = mask(outputs, type="future")
        # softmax
        outputs = tf.nn.softmax(outputs)
        # image summary of the first example's attention map
        attention = tf.transpose(outputs, [0, 2, 1])
        tf.summary.image("attention", tf.expand_dims(attention[:1], -1))
        # query masking
        outputs = mask(outputs, Q, K, type="query")
        # dropout
        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=training)
        # weighted sum (context vectors)
        outputs = tf.matmul(outputs, V)  # (N, T_q, d_v)
    return outputs


def multihead_attention(queries, keys, values,
                        num_heads=4, dropout_rate=0.,
                        training=True,
                        causality=False,
                        scope="mha"):
    '''Applies multihead attention. See 3.2.2
    queries: A 3d tensor with shape of [N, T_q, d_model].
    keys: A 3d tensor with shape of [N, T_k, d_model].
    values: A 3d tensor with shape of [N, T_k, d_model].
    num_heads: An int. Number of heads; the last axis is split into this many parts.
    dropout_rate: dropout rate applied to the attention weights.
    training: Boolean. Controller of mechanism for dropout.
    causality: Boolean. If true, units that reference the future are masked.
    scope: Optional scope for `variable_scope`.

    Returns
      A 3d tensor with shape of (N, T_q, C)
    '''
    d_model = queries.get_shape().as_list()[-1]
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Linear projections
        Q = tf.layers.dense(queries, d_model, use_bias=False)  # (N, T_q, d_model)
        K = tf.layers.dense(keys, d_model, use_bias=False)  # (N, T_k, d_model)
        V = tf.layers.dense(values, d_model, use_bias=False)  # (N, T_k, d_model)
        # Split and concat
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, d_model/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, d_model/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, d_model/h)
        # Attention
        outputs = scaled_dot_product_attention(Q_, K_, V_, causality, dropout_rate, training)
        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, d_model)
        # Residual connection (layer normalization is intentionally not applied here)
        outputs += queries
    return outputs


def feed_forward(inputs, num_units, activation, scope="positionwise_feedforward"):
    '''position-wise feed forward net. See 3.3

    inputs: A 3d tensor with shape of [N, T, C].
    num_units: A list of two integers; num_units[1] must equal C for the residual add.
    activation: activation of the inner layer.
    scope: Optional scope for `variable_scope`.

    Returns:
      A 3d tensor with the same shape and dtype as inputs
    '''
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Inner layer
        outputs = tf.layers.dense(inputs, num_units[0], activation=activation)
        # Outer layer
        outputs = tf.layers.dense(outputs, num_units[1])
        # Residual connection
        outputs += inputs
        # Normalize
        outputs = layer_norm(outputs)
    return outputs


# ---------------------------- /item_recommender.py ----------------------------
# Item Recommender
#
# April 2021
# modric10zhang@gmail.com

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.disable_eager_execution()

###### global variable for local computation ######
g_loss_sum = 0.
g_loss_cnt = 0

g_working_mode = 'local_train'
g_training = False

g_dr = DataReader(pd['batch_size'])


class ItemRecommender(object):
    '''M3oE item recommender: three experts, several gates ("critics") and an
    attention layer over the critics' votes, regressing dwell time.'''

    def __init__(self):
        '''Create placeholders, the network, the MSE loss and the training op.'''
        # placeholder
        self.sph_user = tf.sparse_placeholder(tf.int32, name='sph_user')
        self.sph_doc = tf.sparse_placeholder(tf.int32, name='sph_doc')
        self.sph_con = tf.sparse_placeholder(tf.int32, name='sph_con')
        self.sph_seed = tf.sparse_placeholder(tf.int32, name='sph_seed')
        self.sph_ig = tf.sparse_placeholder(tf.int32, name='sph_ig')
        self.ph_dwell_time = tf.placeholder(tf.float32, name='ph_dwell_time')

        self.create_graph('m3oe')
        # mean squared error between observed and predicted dwell time
        diff = tf.reshape(self.ph_dwell_time, [-1]) - tf.reshape(self.output, [-1])
        self.loss = tf.reduce_mean(tf.square(diff))
        vs = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='m3oe')
        self.grads = tf.clip_by_global_norm(tf.gradients(self.loss, vs), pd['grad_clip'])[0]
        with tf.variable_scope('opt'):
            optimizer = tf.train.AdamOptimizer(pd['lr'])
            self.opt = optimizer.apply_gradients(zip(self.grads, vs))

    def field_interact(self, fields):
        '''Dropout followed by multi-head self-attention over `fields`.'''
        global g_training
        qkv = tf.layers.dropout(fields, rate=pd['dropout'], training=g_training)
        with tf.variable_scope('fi'):
            return multihead_attention(queries=qkv,
                                       keys=qkv,
                                       values=qkv,
                                       num_heads=pd['head_num'],
                                       dropout_rate=pd['dropout'],
                                       training=g_training,
                                       causality=False,
                                       scope='mha')

    def create_graph(self, scope):
        '''Build the network under `scope` and set `self.output` (one value per sample).

        The graph is built for exactly pd['batch_size'] samples: every reshape
        below uses that static batch size.
        '''
        global g_training, g_dr
        n_batch = pd['batch_size']
        n_user, n_doc, n_con = pd['user_field_num'], pd['doc_field_num'], pd['con_field_num']
        embed_dim = pd['feat_dim']
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            feat_dict = get_embeddings(g_dr.unique_feature_num(),
                                       pd['feat_dim'],
                                       scope='feat_embedding',
                                       zero_pad=False)

            def lookup(sp_ids, n_fields):
                # mean-pool the embeddings of each field, then regroup per sample
                pooled = tf.nn.embedding_lookup_sparse(feat_dict, sp_ids, sp_weights=None, combiner='mean')
                return tf.reshape(pooled, shape=[n_batch, n_fields, embed_dim])

            def flat(t):
                return tf.reshape(t, shape=[n_batch, -1])

            self.user = lookup(self.sph_user, n_user)
            self.doc = lookup(self.sph_doc, n_doc)
            self.con = lookup(self.sph_con, n_con)
            self.seed = lookup(self.sph_seed, n_doc)
            self.ig = lookup(self.sph_ig, n_doc)

            user_flat, con_flat = flat(self.user), flat(self.con)
            doc_flat, seed_flat = flat(self.doc), flat(self.seed)

            # feature interaction network
            fi_in = tf.concat([self.doc, self.seed], axis=1)
            fi_expert = flat(self.field_interact(fi_in))
            fi_expert = tf.concat([fi_expert, user_flat, con_flat], axis=1)
            fi_expert = tf.layers.dense(fi_expert, fi_expert.get_shape().as_list()[-1], activation=tf.nn.relu)
            fi_expert = tf.layers.dense(fi_expert, pd['expert_dim'], activation=tf.nn.relu)

            # similarity network: element-wise product plus dot product per doc/seed field
            edc = tf.reshape(self.doc, shape=[-1, embed_dim])
            esd = tf.reshape(self.seed, shape=[-1, embed_dim])
            smn0 = tf.multiply(edc, esd)
            smn1 = tf.reduce_sum(smn0, axis=1, keepdims=True)
            smn = flat(tf.concat([smn0, smn1], axis=1))
            sim_expert = tf.concat([smn, user_flat, con_flat], axis=1)
            sim_expert = tf.layers.dense(sim_expert, pd['expert_dim'], activation=tf.nn.relu)

            # information gain network
            ig_expert = tf.concat([flat(self.ig), user_flat, con_flat], axis=1)
            ig_expert = tf.layers.dense(ig_expert, pd['expert_dim'], activation=tf.nn.relu)

            # multi-critic: each gate produces a softmax mixture over the experts.
            # Three experts are stacked here, so pd['expert_num'] has to be 3.
            gate_in = tf.concat([user_flat, seed_flat, con_flat], axis=1)
            experts = tf.stack([fi_expert, sim_expert, ig_expert], axis=1)
            votes = []
            for _ in range(pd['critic_num']):
                gate = tf.nn.softmax(tf.layers.dense(gate_in, pd['expert_num']))
                gate = tf.reshape(gate, [n_batch, pd['expert_num'], 1])
                votes.append(tf.reduce_sum(gate * experts, axis=1))
            votes = tf.stack(votes, axis=1)

            # attention layer: weight each critic's vote by its affinity with att_x
            w_init = tf.truncated_normal_initializer(stddev=0.01)
            att_x = tf.concat([user_flat, doc_flat, seed_flat, con_flat], axis=1)
            att_w = tf.get_variable('att_w', (pd['expert_dim'], att_x.get_shape().as_list()[-1]), initializer=w_init)
            att_o = tf.tensordot(votes, att_w, [[2], [0]])
            att_x = tf.tile(tf.expand_dims(att_x, 1), [1, pd['critic_num'], 1])
            att_o = tf.expand_dims(tf.nn.softmax(tf.reduce_sum(att_o * att_x, 2)), -1)
            vote_ret = tf.reduce_sum(att_o * votes, axis=1)

            # `//` keeps the layer width an int ("/" yields a float in Python 3)
            fc = tf.layers.dropout(
                tf.layers.dense(vote_ret, vote_ret.get_shape().as_list()[-1] // 2, activation=tf.nn.relu),
                rate=pd['dropout'],
                training=g_training)
            self.output = tf.layers.dense(fc, 1, activation=tf.nn.relu)

    def _feed_dict(self, ph_dict):
        '''Map the batch dict built by `handle` onto the placeholders.'''
        return {self.sph_user: ph_dict['user'],
                self.sph_doc: ph_dict['doc'],
                self.sph_con: ph_dict['con'],
                self.sph_seed: ph_dict['seed'],
                self.sph_ig: ph_dict['ig'],
                self.ph_dwell_time: ph_dict['reward']}

    # call for evaluation
    def predict(self, sess, ph_dict):
        '''Return the network output for one batch.'''
        return sess.run(self.output, feed_dict=self._feed_dict(ph_dict))

    # call for learning from data
    def learn(self, sess, ph_dict):
        '''Run one optimisation step and add the batch loss to the global running totals.'''
        loss, _ = sess.run([self.loss, self.opt], feed_dict=self._feed_dict(ph_dict))
        global g_loss_sum, g_loss_cnt
        g_loss_sum += np.mean(loss)
        g_loss_cnt += 1


def sigmoid(x):
    '''Logistic function with the exponent clamped to [-100, 100] to avoid overflow.'''
    return 1.0 / (1.0 + math.exp(max(min(-x, 1e2), -1e2)))


def handle(sess, net, sess_data):
    '''Feed one batch to `net`: train when g_training is set, otherwise predict.

    sess_data: list of samples as stored by DataReader, i.e.
      [user_sets, doc_sets, context_sets, seed_sets, dwell_time].
    Batches whose size differs from pd['batch_size'] are silently skipped,
    because the graph is built for a fixed batch size.
    '''

    def gen_sparse_tensor(fs):
        # row i holds the ids of the i-th feature set; the column is the
        # position of the id inside that set
        global g_dr
        kk, vv = [], []
        for i, ff in enumerate(fs):
            assert (isinstance(ff, set))
            for k, fid in enumerate(list(ff)):
                kk.append(np.array([i, k], dtype=np.int32))
                vv.append(fid)
        return tf.SparseTensorValue(kk, vv, [len(fs), g_dr.unique_feature_num()])

    if len(sess_data) != pd['batch_size']:
        return
    user, doc, con, seed, dwell = [], [], [], [], []
    for sample in sess_data:
        user.append(sample[0])
        doc.append(sample[1])
        con.append(sample[2])
        seed.append(sample[3])
        dwell.append(sample[4])
    phd = {}
    # flatten (batch, field) into one row per field
    user = np.array(user).reshape(pd['batch_size'] * pd['user_field_num'])
    phd['user'] = gen_sparse_tensor(user)
    doc = np.array(doc).reshape(pd['batch_size'] * pd['doc_field_num'])
    phd['doc'] = gen_sparse_tensor(doc)
    seed = np.array(seed).reshape(pd['batch_size'] * pd['doc_field_num'])
    phd['seed'] = gen_sparse_tensor(seed)
    # information gain: doc features absent from the seed; {0} when there are none
    ig = []
    for i in range(doc.shape[0]):
        ig.append({0} if doc[i] <= seed[i] else doc[i] - seed[i])
    ig = np.array(ig).reshape(pd['batch_size'] * pd['doc_field_num'])
    phd['ig'] = gen_sparse_tensor(ig)
    con = np.array(con).reshape(pd['batch_size'] * pd['con_field_num'])
    phd['con'] = gen_sparse_tensor(con)
    phd['reward'] = dwell
    global g_training
    if g_training:
        # train network
        net.learn(sess, phd)
    else:
        # evaluate network
        qout = net.predict(sess, phd).reshape([-1])
        global g_working_mode
        if 'local_predict' == g_working_mode:
            for i in range(len(dwell)):
                print('%s %s' % (dwell[i], qout[i]))


def work():
    '''Build the model, restore or initialise it, then train or evaluate.

    The first pass consumes whatever is already loaded in g_dr; later epochs
    reload 'sample.data'.
    '''
    global g_loss_sum, g_loss_cnt, g_dr
    sess = tf.Session()
    # build networks
    net = ItemRecommender()
    saver = tf.train.Saver(max_to_keep=1)
    g_init_op = tf.global_variables_initializer()
    # latest_checkpoint() returns None when the directory holds no checkpoint,
    # which must not be passed to restore()
    model_file = tf.train.latest_checkpoint('./ckpt') if os.path.isdir('./ckpt') else None
    if model_file is not None:
        saver.restore(sess, model_file)
    else:
        sess.run(g_init_op)
        os.makedirs('ckpt', exist_ok=True)
    last_epoch_loss = 1e2
    for k in range(pd['num_epochs']):
        if k > 0:
            g_dr.load('sample.data')
        data = g_dr.next()
        batch_cnt = 0
        while data is not None:
            handle(sess, net, data)
            data = g_dr.next()
            batch_cnt += 1
            if g_training and batch_cnt % 10 == 0:
                print('>>>Average Loss --- epoch %d --- batch %d --- %f' % (
                    k, batch_cnt, g_loss_sum / (g_loss_cnt + 1e-6)))
        print('>>>Average Loss --- epoch %d --- batch %d --- %f' % (k, batch_cnt, g_loss_sum / (g_loss_cnt + 1e-6)))
        if not g_training or g_loss_cnt == 0:
            # Evaluation needs a single pass, and with no trained batch there is
            # no loss to compare; dividing by g_loss_cnt here used to raise
            # ZeroDivisionError.
            break
        # NOTE(review): g_loss_sum / g_loss_cnt accumulate over the whole run,
        # so this is a running average rather than a per-epoch loss - confirm
        # that is the intended early-stopping signal.
        avg_loss = g_loss_sum / g_loss_cnt
        if avg_loss > last_epoch_loss:
            print('Job Finished!')
            break
        last_epoch_loss = avg_loss
        # NOTE(review): the flattened listing does not show whether the save
        # belongs inside the epoch loop; it is done after every improving epoch.
        saver.save(sess, 'ckpt/m3oe.ckpt')


if __name__ == '__main__':
    g_dr.load('sample.data')
    if g_working_mode == 'local_train':
        g_training = True
    elif g_working_mode == 'local_predict':
        g_training = False
    else:
        raise Exception('invalid working mode')
    work()