├── LICENSE
├── README.md
├── data_reader.py
├── hyper_param.py
├── id_allocator.py
├── item_recommender.py
├── layer_util.py
└── sample.data


/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ## R3S
 3 | 
 4 | Users of recommendation systems usually focus on one topic at a time. After finishing an item, users may want to access further items related to the one they have just read, as extended reading.
 5 | 
 6 | However, it is hard for conventional recommendation systems to provide such continuous extended reading of relevant items, since the main recommendation results should be diversified.
 7 | 
 8 | In this paper, we propose a new task named recommendation suggestion, which aims to (1) predict whether users want extended reading, and (2) provide appropriate relevant items as suggestions.
 9 | 
10 | These recommended relevant items are arranged in a relevant box and instantly inserted below the clicked item in the main feed. 
11 | 
12 | The challenge of recommendation suggestion on relevant items is that it should further consider semantic relevance and information gain besides CTR-related factors. Moreover, the real-time relevant box insertion may also harm the overall performance when users do not want extended reading. 
13 | 
14 | To address these issues, we propose a novel Real-time relevant recommendation suggestion (R3S) framework, which consists of an Item recommender and a Box trigger. We extract features from multiple aspects including feature interaction, semantic similarity and information gain as different experts, and propose a new Multi-critic multi-gate mixture-of-experts (M3oE) strategy to jointly consider different experts with multi-head critics.
15 | 
16 | In experiments, we conduct both offline and online evaluations on a real-world recommendation system with detailed ablation tests. The significant improvements in item/box related metrics verify the effectiveness of R3S. Moreover, we have deployed R3S on WeChat Top Stories, which affects millions of users. 
17 | 
18 | ### Requirements:
19 | - Python 3.8
20 | - TensorFlow 2.4.1
21 | 
22 | ## Note
23 | 
24 | In the actual online system, R3S is a complex re-ranking framework implemented in C++. 
25 | All models are trained with a deeply customized version of distributed TensorFlow that supports large-scale sparse features.
26 | 
27 | Without massive data and machine resources, training R3S is not realistic.
28 | 
29 | Therefore, the open source code here only implements a simplified version for interested researchers. If there are any errors, please contact me. Thanks!
30 | 
31 | ## About
32 | 
33 | "Real-time Relevant Recommendation Suggestion" ([WSDM 2021](http://nlp.csai.tsinghua.edu.cn/~xrb/publications/WSDM-21_R3S.pdf))
34 | 


--------------------------------------------------------------------------------
/data_reader.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Data Reader
 3 | 
 4 | April 2021
 5 | modric10zhang@gmail.com
 6 | 
 7 | '''
 8 | import sys
 9 | from id_allocator import IdAllocator
10 | from hyper_param import param_dict as pd
11 | 
12 | 
class DataReader(object):
    '''
    Loads samples made of whitespace separated ``key:value`` tokens and hands
    them out in fixed-size batches.

    Every raw feature string is mapped to an integer ID by an ``IdAllocator``;
    ID 0 stands for "this field has no feature".
    '''

    def __init__(self, batch_num):
        '''
        batch_num: maximum number of samples returned by one ``next()`` call.
        '''
        self._id_tool = IdAllocator()
        self._data = []
        self._batch = batch_num

    def unique_feature_num(self):
        '''Return whatever the ID allocator reports via ``unique_id_num()``.'''
        return self._id_tool.unique_id_num()

    def parse_feature(self, raw_feature):
        '''
        Turn a comma separated feature string into a set of integer IDs.

        Empty tokens are ignored, so an empty field (``''``) or a stray comma
        (``'a,,b'``) never allocates an ID for the empty string.  A field with
        no usable token yields ``{0}``.
        '''
        feature = set()
        for f in raw_feature.split(','):
            # ''.split(',') is [''], so the emptiness check must be per token;
            # checking only the final set would never trigger.
            if f:
                feature.add(self._id_tool.allocate(f))
        if not feature:
            feature.add(0)
        return feature

    def load(self, sample_path):
        '''
        Append every sample found in ``sample_path`` to the internal buffer.

        One sample per line; tokens are separated by tabs and/or single
        spaces and look like ``key:value``.  Required keys are ``uf<i>``,
        ``rf<i>``, ``cf<i>``, ``sf<i>`` (``i`` below the matching field count
        in the hyper parameters) and ``dwell_time``.

        Blank lines and tokens without a ``:`` are skipped.
        Raises Exception when a required feature field is missing and
        KeyError when ``dwell_time`` is missing.
        '''
        # The field layout is the same for every line, so build it once.
        fields = [pd['user_field_num'], pd['doc_field_num'], pd['con_field_num'], pd['doc_field_num']]
        prefix = ['uf', 'rf', 'cf', 'sf']
        assert (len(fields) == len(prefix))
        with open(sample_path, 'r') as fp:
            for sinfo in fp:
                sinfo = sinfo.strip()
                if not sinfo:
                    # e.g. a trailing empty line at the end of the file
                    continue
                skv = {}
                for ii in sinfo.split('\t'):
                    for fi in ii.split(' '):
                        key, sep, value = fi.partition(':')
                        if not sep:
                            # no ':' in the token: nothing sensible to store
                            continue
                        skv[key] = value
                feats = [[] for _ in prefix]
                for k, field_num in enumerate(fields):
                    for i in range(field_num):
                        fk = '%s%s' % (prefix[k], i)
                        if fk not in skv:
                            raise Exception('field %s not exist.' % fk)
                        feats[k].append(self.parse_feature(skv[fk]))
                # dwell_time is stored as the raw string read from the file
                self._data.append([feats[0], feats[1], feats[2], feats[3], skv['dwell_time']])

    def next(self):
        '''
        Remove and return the next batch (at most ``batch_num`` samples), or
        None once the buffer is empty.
        '''
        if len(self._data) <= 0:
            return None
        nb = self._data[:self._batch]
        self._data = self._data[self._batch:]
        return nb
61 | 


--------------------------------------------------------------------------------
/hyper_param.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Model Hyper Parameter Dict
 3 | 
 4 | April 2021
 5 | modric10zhang@gmail.com
 6 | 
 7 | '''
 8 | 
 9 | param_dict = {
10 |     'feat_dim': 6,  # feature embedding dimension
11 |     'user_field_num': 5,  # number of user feature fields
12 |     'doc_field_num': 5,  # number of doc feature fields
13 |     'con_field_num': 5,  # number of context feature fields
14 |     'expert_dim': 90,  # expert subnetwork dimension
15 |     'expert_num': 3,  # number of experts
16 |     'critic_num': 4,  # number of critic networks(gates)
17 |     'num_epochs': 100,  # training epoch
18 |     'batch_size': 128,  # batch size
19 |     'lr': 0.0002,  # learning rate of network
20 |     'dropout': 0.3,  # dropout ratio
21 |     'grad_clip': 5.0,  # grad clip
22 |     'head_num': 3,  # head number for all self-attention units
23 | }
24 | 


--------------------------------------------------------------------------------
/id_allocator.py:
--------------------------------------------------------------------------------
 1 | class IdAllocator(object):
 2 |     def __init__(self):
 3 |         self._id = 1
 4 |         self._tbl = {}
 5 | 
 6 |     def allocate(self, x):
 7 |         if type(x) is not str:
 8 |             raise Exception('only str is supported in IdAllocator.')
 9 |         if x not in self._tbl:
10 |             self._id += 1
11 |             self._tbl[x] = self._id
12 |         return self._tbl[x]
13 | 
14 |     def unique_id_num(self):
15 |         return self._id
16 | 


--------------------------------------------------------------------------------
/item_recommender.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Item Recommender
  3 | 
  4 | April 2021
  5 | modric10zhang@gmail.com
  6 | 
  7 | '''
  8 | 
  9 | import os
 10 | import math
 11 | import numpy as np
 12 | import tensorflow.compat.v1 as tf
 13 | from layer_util import *
 14 | from data_reader import DataReader
 15 | from hyper_param import param_dict as pd
 16 | 
 17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 18 | tf.disable_eager_execution()
 19 | 
 20 | ###### global variable for local computation ######
 21 | g_loss_sum = 0.
 22 | g_loss_cnt = 0
 23 | 
 24 | g_working_mode = 'local_train'
 25 | g_training = False
 26 | 
 27 | g_dr = DataReader(pd['batch_size'])
 28 | 
 29 | 
 30 | class ItemRecommender(object):
 31 |     def __init__(self):
 32 |         # placeholder
 33 |         self.sph_user = tf.sparse_placeholder(tf.int32, name='sph_user')
 34 |         self.sph_doc = tf.sparse_placeholder(tf.int32, name='sph_doc')
 35 |         self.sph_con = tf.sparse_placeholder(tf.int32, name='sph_con')
 36 |         self.sph_seed = tf.sparse_placeholder(tf.int32, name='sph_seed')
 37 |         self.sph_ig = tf.sparse_placeholder(tf.int32, name='sph_ig')
 38 |         self.ph_dwell_time = tf.placeholder(tf.float32, name='ph_dwell_time')
 39 | 
 40 |         self.create_graph('m3oe')
 41 |         diff = tf.reshape(self.ph_dwell_time, [-1]) - tf.reshape(self.output, [-1])
 42 |         self.loss = tf.reduce_mean(tf.square(diff))
 43 |         vs = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='m3oe')
 44 |         self.grads = tf.clip_by_global_norm(tf.gradients(self.loss, vs), pd['grad_clip'])[0]
 45 |         with tf.variable_scope('opt'):
 46 |             optimizer = tf.train.AdamOptimizer(pd['lr'])
 47 |             self.opt = optimizer.apply_gradients(zip(self.grads, vs))
 48 | 
 49 |     def field_interact(self, fields):
 50 |         global g_training
 51 |         qkv = tf.layers.dropout(fields, rate=pd['dropout'], training=g_training)
 52 |         with tf.variable_scope('fi'):
 53 |             return multihead_attention(queries=qkv,
 54 |                                        keys=qkv,
 55 |                                        values=qkv,
 56 |                                        num_heads=pd['head_num'],
 57 |                                        dropout_rate=pd['dropout'],
 58 |                                        training=g_training,
 59 |                                        causality=False,
 60 |                                        scope='mha')
 61 | 
 62 |     def create_graph(self, scope):
 63 |         global g_training, g_dr
 64 |         with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
 65 |             feat_dict = get_embeddings(g_dr.unique_feature_num(),
 66 |                                        pd['feat_dim'],
 67 |                                        scope='feat_embedding',
 68 |                                        zero_pad=False)
 69 |             n_batch = pd['batch_size']
 70 |             n_user, n_doc, n_con = pd['user_field_num'], pd['doc_field_num'], pd['con_field_num']
 71 |             embed_dim = pd['feat_dim']
 72 |             user_embed = tf.nn.embedding_lookup_sparse(feat_dict, self.sph_user, sp_weights=None, combiner='mean')
 73 |             self.user = tf.reshape(user_embed, shape=[n_batch, n_user, embed_dim])
 74 |             doc_embed = tf.nn.embedding_lookup_sparse(feat_dict, self.sph_doc, sp_weights=None, combiner='mean')
 75 |             self.doc = tf.reshape(doc_embed, shape=[n_batch, n_doc, embed_dim])
 76 |             con_embed = tf.nn.embedding_lookup_sparse(feat_dict, self.sph_con, sp_weights=None, combiner='mean')
 77 |             self.con = tf.reshape(con_embed, shape=[n_batch, n_con, embed_dim])
 78 |             seed_embed = tf.nn.embedding_lookup_sparse(feat_dict, self.sph_seed, sp_weights=None, combiner='mean')
 79 |             self.seed = tf.reshape(seed_embed, shape=[n_batch, n_doc, embed_dim])
 80 |             ig_embed = tf.nn.embedding_lookup_sparse(feat_dict, self.sph_ig, sp_weights=None, combiner='mean')
 81 |             self.ig = tf.reshape(ig_embed, shape=[n_batch, n_doc, embed_dim])
 82 | 
 83 |             fi_in = tf.concat([self.doc, self.seed], axis=1)
 84 |             # feature interaction network
 85 |             fi_expert = tf.reshape(self.field_interact(fi_in), shape=[n_batch, -1])
 86 |             fi_expert = tf.concat([fi_expert,
 87 |                                    tf.reshape(self.user, shape=[n_batch, -1]),
 88 |                                    tf.reshape(self.con, shape=[n_batch, -1])], axis=1)
 89 |             fi_expert = tf.layers.dense(fi_expert, fi_expert.get_shape().as_list()[-1], activation=tf.nn.relu)
 90 |             fi_expert = tf.layers.dense(fi_expert, pd['expert_dim'], activation=tf.nn.relu)
 91 |             # sys.exit(0)
 92 |             edc = tf.reshape(self.doc, shape=[-1, embed_dim])
 93 |             esd = tf.reshape(self.seed, shape=[-1, embed_dim])
 94 |             # similarity network
 95 |             smn0 = tf.multiply(edc, esd)
 96 |             smn1 = tf.reduce_sum(tf.multiply(edc, esd), axis=1, keep_dims=True)
 97 |             smn = tf.reshape(tf.concat([smn0, smn1], axis=1), shape=[n_batch, -1])
 98 |             sim_expert = tf.concat([smn,
 99 |                                     tf.reshape(self.user, shape=[n_batch, -1]),
100 |                                     tf.reshape(self.con, shape=[n_batch, -1])], axis=1)
101 |             sim_expert = tf.layers.dense(sim_expert, pd['expert_dim'], activation=tf.nn.relu)
102 |             # information gain network
103 |             ig_expert = tf.concat([tf.reshape(self.ig, [n_batch, -1]),
104 |                                    tf.reshape(self.user, [n_batch, -1]),
105 |                                    tf.reshape(self.con, [n_batch, -1])], axis=1)
106 |             ig_expert = tf.layers.dense(ig_expert, pd['expert_dim'], activation=tf.nn.relu)
107 |             # multi-ciritic
108 |             gate_in = tf.concat([tf.reshape(self.user, [n_batch, -1]),
109 |                                  tf.reshape(self.seed, [n_batch, -1]),
110 |                                  tf.reshape(self.con, [n_batch, -1])], axis=1)
111 |             experts = tf.stack([fi_expert, sim_expert, ig_expert], axis=1)
112 |             gates, votes = [], []
113 |             for i in range(pd['critic_num']):
114 |                 gates.append(tf.nn.softmax(tf.layers.dense(gate_in, pd['expert_num'])))
115 |                 gates[i] = tf.reshape(gates[i], [n_batch, pd['expert_num'], 1])
116 |                 votes.append(tf.reduce_sum(gates[i] * experts, axis=1))
117 |             votes = tf.stack(votes, axis=1)
118 |             # attention layer
119 |             w_init = tf.truncated_normal_initializer(stddev=0.01)
120 |             att_x = tf.concat([tf.reshape(self.user, [n_batch, -1]),
121 |                                tf.reshape(self.doc, [n_batch, -1]),
122 |                                tf.reshape(self.seed, [n_batch, -1]),
123 |                                tf.reshape(self.con, [n_batch, -1])], axis=1)
124 |             att_w = tf.get_variable('att_w', (pd['expert_dim'], att_x.get_shape().as_list()[-1]), initializer=w_init)
125 |             att_o = tf.tensordot(votes, att_w, [[2], [0]])
126 |             att_x = tf.tile(tf.expand_dims(att_x, 1), [1, pd['critic_num'], 1])
127 |             att_o = tf.expand_dims(tf.nn.softmax(tf.reduce_sum(att_o * att_x, 2)), -1)
128 |             vote_ret = tf.reduce_sum(att_o * votes, axis=1)
129 |             fc = tf.layers.dropout(
130 |                 tf.layers.dense(vote_ret, vote_ret.get_shape().as_list()[-1] / 2, activation=tf.nn.relu),
131 |                 rate=pd['dropout'],
132 |                 training=g_training)
133 |             self.output = tf.layers.dense(fc, 1, activation=tf.nn.relu)
134 | 
135 |     # call for evaluation
136 |     def predict(self, sess, ph_dict):
137 |         return sess.run(self.output, feed_dict={self.sph_user: ph_dict['user'],
138 |                                                 self.sph_doc: ph_dict['doc'],
139 |                                                 self.sph_con: ph_dict['con'],
140 |                                                 self.sph_seed: ph_dict['seed'],
141 |                                                 self.sph_ig: ph_dict['ig'],
142 |                                                 self.ph_dwell_time: ph_dict['reward']})
143 | 
144 |     # call for learning from data
145 |     def learn(self, sess, ph_dict):
146 |         loss, _ = sess.run([self.loss, self.opt], feed_dict={self.sph_user: ph_dict['user'],
147 |                                                              self.sph_doc: ph_dict['doc'],
148 |                                                              self.sph_con: ph_dict['con'],
149 |                                                              self.sph_seed: ph_dict['seed'],
150 |                                                              self.sph_ig: ph_dict['ig'],
151 |                                                              self.ph_dwell_time: ph_dict['reward']})
152 |         global g_loss_sum, g_loss_cnt
153 |         g_loss_sum += np.mean(loss)
154 |         g_loss_cnt += 1
155 | 
156 | 
def sigmoid(x):
    '''Logistic function 1 / (1 + e^(-x)); the exponent is clamped to [-100, 100].'''
    exponent = min(-x, 1e2)
    exponent = max(exponent, -1e2)
    return 1.0 / (1.0 + math.exp(exponent))
159 | 
160 | 
def handle(sess, net, sess_data):
    '''
    Feed one batch to ``net``: a training step when ``g_training`` is set,
    otherwise a prediction whose results are printed in 'local_predict' mode.

    sess_data: list of samples, each ``[user, doc, con, seed, dwell_time]``
               where the first four entries are lists of ID sets (one set per
               field).  Batches whose size differs from ``pd['batch_size']``
               are ignored.
    '''
    global g_training, g_working_mode

    def gen_sparse_tensor(fs):
        # One sparse row per ID set; the column is just the position of the
        # ID inside its set, the value is the ID itself.
        global g_dr
        indices, values = [], []
        for row, id_set in enumerate(fs):
            assert (isinstance(id_set, set))
            for col, feat_id in enumerate(list(id_set)):
                indices.append(np.array([row, col], dtype=np.int32))
                values.append(feat_id)
        return tf.SparseTensorValue(indices, values, [len(fs), g_dr.unique_feature_num()])

    n_batch = pd['batch_size']
    if len(sess_data) != n_batch:
        return

    user = [sample[0] for sample in sess_data]
    doc = [sample[1] for sample in sess_data]
    con = [sample[2] for sample in sess_data]
    seed = [sample[3] for sample in sess_data]
    dwell = [sample[4] for sample in sess_data]

    # Flatten (sample, field) into a single axis of ID sets.
    user = np.array(user).reshape(n_batch * pd['user_field_num'])
    doc = np.array(doc).reshape(n_batch * pd['doc_field_num'])
    seed = np.array(seed).reshape(n_batch * pd['doc_field_num'])
    con = np.array(con).reshape(n_batch * pd['con_field_num'])

    # Per field: the doc IDs that the seed does not have, or {0} when the
    # doc's IDs are a subset of the seed's.
    ig = []
    for doc_ids, seed_ids in zip(doc, seed):
        if doc_ids <= seed_ids:
            ig.append({0})
        else:
            ig.append(doc_ids - seed_ids)
    ig = np.array(ig).reshape(n_batch * pd['doc_field_num'])

    phd = {
        'user': gen_sparse_tensor(user),
        'doc': gen_sparse_tensor(doc),
        'seed': gen_sparse_tensor(seed),
        'ig': gen_sparse_tensor(ig),
        'con': gen_sparse_tensor(con),
        'reward': dwell,
    }

    if g_training:
        # train network
        net.learn(sess, phd)
        return

    # evaluate network
    qout = net.predict(sess, phd).reshape([-1])
    if 'local_predict' == g_working_mode:
        for i, truth in enumerate(dwell):
            print('%s %s' % (truth, qout[i]))
210 | 
211 | 
def work():
    '''
    Build the network, restore the latest checkpoint from ./ckpt (or
    initialise fresh weights), then run over the data.

    Training mode: up to ``pd['num_epochs']`` passes; stops early as soon as
    the average loss rises above its value after the previous pass, then
    saves to ckpt/m3oe.ckpt.
    Prediction mode: exactly one pass, nothing is saved.
    '''
    global g_loss_sum, g_loss_cnt, g_dr
    with tf.Session() as sess:
        # build networks
        net = ItemRecommender()
        saver = tf.train.Saver(max_to_keep=1)
        g_init_op = tf.global_variables_initializer()
        # latest_checkpoint() is None when the directory holds no checkpoint
        model_file = tf.train.latest_checkpoint('./ckpt') if os.path.isdir('./ckpt') else None
        if model_file is not None:
            saver.restore(sess, model_file)
        else:
            sess.run(g_init_op)
            os.makedirs('ckpt', exist_ok=True)
        last_epoch_loss = 1e2
        for k in range(pd['num_epochs']):
            if k > 0:
                # next() consumes the buffer, so refill it for every further pass
                g_dr.load('sample.data')
            data = g_dr.next()
            batch_cnt = 0
            while data is not None:
                handle(sess, net, data)
                data = g_dr.next()
                batch_cnt += 1
                if g_training and batch_cnt % 10 == 0:
                    print('>>>Average Loss --- epoch %d --- batch %d --- %f' % (
                        k, batch_cnt, g_loss_sum / (g_loss_cnt + 1e-6)))
            print('>>>Average Loss --- epoch %d --- batch %d --- %f' % (k, batch_cnt, g_loss_sum / (g_loss_cnt + 1e-6)))
            if not g_training or g_loss_cnt == 0:
                # Prediction mode, or no full batch was trained: there is no
                # loss to compare, and dividing by g_loss_cnt would fail.
                break
            # The accumulators are not reset in this function, so this is the
            # average over every batch trained so far, not over this pass only.
            avg_loss = g_loss_sum / g_loss_cnt
            if avg_loss > last_epoch_loss:
                print('Job Finished!')
                break
            last_epoch_loss = avg_loss
        if g_training:
            saver.save(sess, 'ckpt/m3oe.ckpt')
245 | 
246 | 
if __name__ == '__main__':
    # Script entry point: load the sample data, derive the training flag from
    # the configured working mode, then start the job.
    g_dr.load('sample.data')
    if g_working_mode not in ('local_train', 'local_predict'):
        raise Exception('invalid working mode')
    # True for 'local_train', False for 'local_predict'.
    g_training = g_working_mode == 'local_train'
    work()
256 | 


--------------------------------------------------------------------------------
/layer_util.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | The implementation of multi-head attention mechanism
  3 | refers to Kyubyong/transformer(https://github.com/Kyubyong/transformer)
  4 | 
  5 | April 2021
  6 | modric10zhang@gmail.com
  7 | 
  8 | '''
  9 | 
 10 | import tensorflow.compat.v1 as tf
 11 | 
 12 | 
 13 | def layer_norm(inputs, scope='ln'):
 14 |     '''Applies layer normalization. See https://arxiv.org/abs/1607.06450.
 15 |     inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`.
 16 |     epsilon: A floating number. A very small number for preventing ZeroDivision Error.
 17 |     scope: Optional scope for `variable_scope`.
 18 |       
 19 |     Returns:
 20 |       A tensor with the same shape and data dtype as `inputs`.
 21 |     '''
 22 |     epsilon = 1e-8
 23 |     with tf.variable_scope(scope):
 24 |         inputs_shape = inputs.get_shape()
 25 |         params_shape = inputs_shape[-1:]
 26 |         # [-1] means last dimension
 27 |         mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
 28 |         beta = tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
 29 |         gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
 30 |         normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
 31 |         outputs = gamma * normalized + beta
 32 |     return outputs
 33 | 
 34 | 
 35 | def get_embeddings(dict_size, num_units, scope, zero_pad=True, partitioner=None):
 36 |     '''Constructs token embedding matrix.
 37 |     Note that the column of index 0's are set to zeros.
 38 |     dict_size: scalar. V.
 39 |     num_units: embedding dimensionalty. E.
 40 |     zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero
 41 |     To apply query/key masks easily, zero pad is turned on.
 42 | 
 43 |     Returns
 44 |     weight variable: (V, E)
 45 |     '''
 46 |     with tf.variable_scope(scope):
 47 |         w_init = tf.truncated_normal_initializer(mean=0, stddev=0.1)
 48 |         embeddings = tf.get_variable('w',
 49 |                                      dtype=tf.float32,
 50 |                                      shape=(dict_size, num_units),
 51 |                                      initializer=w_init,
 52 |                                      partitioner=partitioner)
 53 |         #                           initializer=tf.contrib.layers.xavier_initializer())
 54 |         if zero_pad:
 55 |             embeddings = tf.concat((tf.zeros(shape=[1, num_units]),
 56 |                                     embeddings[1:, :]), 0)
 57 |     return embeddings
 58 | 
 59 | 
 60 | def mask(inputs, queries=None, keys=None, type=None):
 61 |     """Masks paddings on keys or queries to inputs
 62 |     inputs: 3d tensor. (N, T_q, T_k)
 63 |     queries: 3d tensor. (N, T_q, d)
 64 |     keys: 3d tensor. (N, T_k, d)
 65 | 
 66 |     e.g.,
 67 |     >> queries = tf.constant([[[1.],
 68 |                         [2.],
 69 |                         [0.]]], tf.float32) # (1, 3, 1)
 70 |     >> keys = tf.constant([[[4.],
 71 |                      [0.]]], tf.float32)  # (1, 2, 1)
 72 |     >> inputs = tf.constant([[[4., 0.],
 73 |                                [8., 0.],
 74 |                                [0., 0.]]], tf.float32)
 75 |     >> mask(inputs, queries, keys, "key")
 76 |     array([[[ 4.0000000e+00, -4.2949673e+09],
 77 |         [ 8.0000000e+00, -4.2949673e+09],
 78 |         [ 0.0000000e+00, -4.2949673e+09]]], dtype=float32)
 79 |     >> inputs = tf.constant([[[1., 0.],
 80 |                              [1., 0.],
 81 |                               [1., 0.]]], tf.float32)
 82 |     >> mask(inputs, queries, keys, "query")
 83 |     array([[[1., 0.],
 84 |         [1., 0.],
 85 |         [0., 0.]]], dtype=float32)
 86 |     """
 87 |     padding_num = -2 ** 32 + 1
 88 |     if type in ("k", "key", "keys"):
 89 |         # Generate masks
 90 |         masks = tf.sign(tf.reduce_sum(tf.abs(keys), axis=-1))  # (N, T_k)
 91 |         masks = tf.expand_dims(masks, 1)  # (N, 1, T_k)
 92 |         masks = tf.tile(masks, [1, tf.shape(queries)[1], 1])  # (N, T_q, T_k)
 93 | 
 94 |         # Apply masks to inputs
 95 |         paddings = tf.ones_like(inputs) * padding_num
 96 |         outputs = tf.where(tf.equal(masks, 0), paddings, inputs)  # (N, T_q, T_k)
 97 |     elif type in ("q", "query", "queries"):
 98 |         # Generate masks
 99 |         masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1))  # (N, T_q)
100 |         masks = tf.expand_dims(masks, -1)  # (N, T_q, 1)
101 |         masks = tf.tile(masks, [1, 1, tf.shape(keys)[1]])  # (N, T_q, T_k)
102 | 
103 |         # Apply masks to inputs
104 |         outputs = inputs * masks
105 |     elif type in ("f", "future", "right"):
106 |         diag_vals = tf.ones_like(inputs[0, :, :])  # (T_q, T_k)
107 |         # tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()  # (T_q, T_k)
108 |         tril = tf.linalg.band_part(diag_vals, -1, 0)  # (T_q, T_k)
109 |         masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(inputs)[0], 1, 1])  # (N, T_q, T_k)
110 | 
111 |         paddings = tf.ones_like(masks) * padding_num
112 |         outputs = tf.where(tf.equal(masks, 0), paddings, inputs)
113 |     else:
114 |         print("Check if you entered type correctly!")
115 | 
116 |     return outputs
117 | 
118 | 
def scaled_dot_product_attention(Q, K, V,
                                 causality=False, dropout_rate=0.,
                                 training=True,
                                 scope="sdpa"):
    '''Scaled dot-product attention. See 3.2.1.

    Q: Packed queries. 3d tensor. [N, T_q, d_k].
    K: Packed keys. 3d tensor. [N, T_k, d_k].
    V: Packed values. 3d tensor. [N, T_k, d_v].
    causality: If True, the "future" mask is applied to the scores as well.
    dropout_rate: A floating point number of [0, 1].
    training: boolean for controlling dropout.
    scope: Optional scope for `variable_scope`.

    Returns the context vectors, a 3d tensor [N, T_q, d_v]. As a side effect an
    image summary named "attention" is registered for the first batch entry.
    '''
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        depth = Q.get_shape().as_list()[-1]
        # Similarity scores, scaled by sqrt(d_k): (N, T_q, T_k)
        scores = tf.matmul(Q, tf.transpose(K, [0, 2, 1])) / depth ** 0.5
        # Hide padded keys, and optionally the future positions, before softmax.
        scores = mask(scores, Q, K, type="key")
        if causality:
            scores = mask(scores, type="future")
        weights = tf.nn.softmax(scores)
        # Transposed copy of the weights, used only for the image summary.
        attention = tf.transpose(weights, [0, 2, 1])
        tf.summary.image("attention", tf.expand_dims(attention[:1], -1))
        # Zero the rows that belong to padded queries, then apply dropout.
        weights = mask(weights, Q, K, type="query")
        weights = tf.layers.dropout(weights, rate=dropout_rate, training=training)
        # Weighted sum of the values: (N, T_q, d_v)
        context = tf.matmul(weights, V)
    return context
154 | 
155 | 
def multihead_attention(queries, keys, values,
                        num_heads=4, dropout_rate=0.,
                        training=True,
                        causality=False,
                        scope="mha"):
    '''Multi-head attention with a residual connection. See 3.2.2

    queries: A 3d tensor with shape of [N, T_q, d_model].
    keys: A 3d tensor with shape of [N, T_k, d_model].
    values: A 3d tensor with shape of [N, T_k, d_model].
    num_heads: An int. Number of heads the last dimension is split into.
    dropout_rate: A floating point number, forwarded to the attention dropout.
    training: Boolean. Controller of mechanism for dropout.
    causality: Boolean. If true, units that reference the future are masked.
    scope: Optional scope for `variable_scope`.

    Returns
      A 3d tensor with shape of (N, T_q, d_model): the attention output plus
      `queries`. No layer normalization is applied to the result here.
    '''
    d_model = queries.get_shape().as_list()[-1]

    def _split_heads(tensor):
        # (N, T, d_model) -> (h*N, T, d_model/h)
        return tf.concat(tf.split(tensor, num_heads, axis=2), axis=0)

    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Bias-free linear projections, each of width d_model.
        Q = tf.layers.dense(queries, d_model, use_bias=False)
        K = tf.layers.dense(keys, d_model, use_bias=False)
        V = tf.layers.dense(values, d_model, use_bias=False)
        # Move the heads into the batch dimension.
        Q_ = _split_heads(Q)
        K_ = _split_heads(K)
        V_ = _split_heads(V)
        attended = scaled_dot_product_attention(
            Q_, K_, V_,
            causality=causality, dropout_rate=dropout_rate, training=training)
        # Bring the heads back to the feature dimension: (N, T_q, d_model)
        merged = tf.concat(tf.split(attended, num_heads, axis=0), axis=2)
        # Residual connection
        return merged + queries
192 | 
193 | 
def feed_forward(inputs, num_units, activation, scope="positionwise_feedforward"):
    '''Position-wise feed forward net with residual and layer norm. See 3.3

    inputs: A 3d tensor with shape of [N, T, C].
    num_units: A list of two integers: the widths of the inner and the outer
      dense layer. The outer output is added to `inputs`, so the second width
      is expected to match C.
    activation: activation applied to the inner layer only.
    scope: Optional scope for `variable_scope`.

    Returns:
      A 3d tensor with the same shape and dtype as inputs
    '''
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        hidden = tf.layers.dense(inputs, num_units[0], activation=activation)
        projected = tf.layers.dense(hidden, num_units[1])
        # Residual connection followed by layer normalization.
        return layer_norm(projected + inputs)
214 | 


--------------------------------------------------------------------------------